
Creation | pd.read_csv()

It is similar to the pd.read_table() method, which uses '\t' as its default separator.

Method:

pd.read_csv(filepath_or_buffer, *, sep=None, delimiter=None, header='infer', names=<no_default>, index_col=None, usecols=None, dtype=None, engine=None, converters=None, true_values=None, false_values=None, skipinitialspace=False, skiprows=None, skipfooter=0, nrows=None, na_values=None, keep_default_na=True, na_filter=True, verbose=<no_default>, skip_blank_lines=True, parse_dates=False, infer_datetime_format=<no_default>, keep_date_col=<no_default>, date_parser=<no_default>, date_format=None, dayfirst=False, cache_dates=True, iterator=False, chunksize=None, compression='infer', thousands=None, decimal='.', lineterminator=None, quotechar='"', quoting=0, doublequote=True, escapechar=None, comment=None, encoding=None, encoding_errors='strict', dialect=None, on_bad_lines='error', low_memory=True, memory_map=False, float_precision=None, storage_options=None, dtype_backend=<no_default>)

Reads a CSV file into a DataFrame.

Returns:

pandas.core.frame.DataFrame (or a TextFileReader when iterator=True or chunksize is set)

Parameters:

filepath_or_buffer: (file path or buffer), Required-

Path to the file or file-like object to read.

import pandas as pd

# Data to write to the file
data = {
    "ID": [1, 2, 3],
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [30, 25, 35],
    "Salary": [55000, 48000, 62000],
}

# Create a DataFrame
df_to_write = pd.DataFrame(data)

# Filepath for the tab-separated file
file_path = "data.csv"

# Writing the DataFrame to a tab-separated file
df_to_write.to_csv(file_path, sep="\t", index=False)

# Reading the file back into a DataFrame
df_read = pd.read_csv(filepath_or_buffer=file_path, sep="\t",)

# Display the DataFrame read from the file
print(df_read)
'''
Output:
   ID     Name  Age  Salary
0   1    Alice   30   55000
1   2      Bob   25   48000
2   3  Charlie   35   62000
'''

.to_csv() is used to write an object to a comma-separated values (CSV) file; here it creates the sample file that read_csv reads back.

sep: str, Optional-

Specifies the delimiter to use. Default ','.

import pandas as pd

# Data to write to the file
data = {
    "ID": [1, 2, 3],
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [30, 25, 35],
    "Salary": [55000, 48000, 62000],
}

# Create a DataFrame
df_to_write = pd.DataFrame(data)

# Filepath for the tab-separated file
file_path = "data.csv"

# Writing the DataFrame to a tab-separated file
df_to_write.to_csv(file_path, sep="\t", index=False)

# Reading the file back into a DataFrame
df_read = pd.read_csv(filepath_or_buffer=file_path, sep="\t",)

# Display the DataFrame read from the file
print(df_read)
'''
Output:
   ID     Name  Age  Salary
0   1    Alice   30   55000
1   2      Bob   25   48000
2   3  Charlie   35   62000
'''

delimiter: str, Optional-

An alias for sep.

import pandas as pd

# Data to write to the file
data = {
    "ID": [1, 2, 3],
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [30, 25, 35],
    "Salary": [55000, 48000, 62000],
}

# Create a DataFrame
df_to_write = pd.DataFrame(data)

# Filepath for the tab-separated file
file_path = "data.csv"

# Writing the DataFrame to a tab-separated file
df_to_write.to_csv(file_path, sep="\t", index=False)

# Reading the file back into a DataFrame
df_read = pd.read_csv(filepath_or_buffer=file_path, delimiter="\t",)

# Display the DataFrame read from the file
print(df_read)
'''
Output:
   ID     Name  Age  Salary
0   1    Alice   30   55000
1   2      Bob   25   48000
2   3  Charlie   35   62000
'''

sep and delimiter are aliases; specify only one of them, as passing both raises a ValueError.

names: array-like, Optional-

It allows you to manually specify the column names when reading a file into a DataFrame.

import pandas as pd

# Data to write to the file
data = {
    "ID": [1, 2, 3],
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [30, 25, 35],
    "Salary": [55000, 48000, 62000],
}

# Create a DataFrame
df_to_write = pd.DataFrame(data)

# Filepath for the tab-separated file
file_path = "data.csv"

# Writing the DataFrame to a tab-separated file
df_to_write.to_csv(file_path, sep="\t", index=False)

# Reading the file back into a DataFrame
df_read = pd.read_csv(filepath_or_buffer=file_path, delimiter="\t", header=0, names=["E_ID", "FName", "Age_in_Years", "Annual_Salary"])

# Display the DataFrame read from the file
print(df_read)
'''
Output:
   E_ID    FName  Age_in_Years  Annual_Salary
0     1    Alice            30          55000
1     2      Bob            25          48000
2     3  Charlie            35          62000
'''

When the file already contains a header row, pass header=0 together with names (as in the example above) so the existing header is replaced rather than read as data.

index_col: None, Optional-

It specifies which column should be used as the index of the resulting DataFrame.

import pandas as pd

# Data to write to the file
data = {
    "ID": [1, 2, 3],
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [30, 25, 35],
    "Salary": [55000, 48000, 62000],
}

# Create a DataFrame
df_to_write = pd.DataFrame(data)

# Filepath for the tab-separated file
file_path = "data.csv"

# Writing the DataFrame to a tab-separated file
df_to_write.to_csv(file_path, sep="\t", index=False)

# Reading the file back into a DataFrame
df_read = pd.read_csv(filepath_or_buffer=file_path, delimiter="\t", header=0, index_col='ID')

# Display the DataFrame read from the file
print(df_read)
'''
Output:
       Name  Age  Salary
ID                      
1     Alice   30   55000
2       Bob   25   48000
3   Charlie   35   62000
'''

usecols: None, Optional-

It allows you to select specific columns to read from the input file, rather than reading all columns.

import pandas as pd

# Data to write to the file
data = {
    "ID": [1, 2, 3],
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [30, 25, 35],
    "Salary": [55000, 48000, 62000],
}

# Create a DataFrame
df_to_write = pd.DataFrame(data)

# Filepath for the tab-separated file
file_path = "data.csv"

# Writing the DataFrame to a tab-separated file
df_to_write.to_csv(file_path, sep="\t", index=False)

# Reading the file back into a DataFrame
df_read = pd.read_csv(filepath_or_buffer=file_path, delimiter="\t", header=0, usecols=["Name", "Salary"])

# Display the DataFrame read from the file
print(df_read)
'''
Output:
      Name  Salary
0    Alice   55000
1      Bob   48000
2  Charlie   62000
'''

dtype: data-type, Optional-

Specifies the data type to apply to the whole DataFrame or, via a dict, to individual columns (as in the example below). If not provided, types are inferred from the input.

import pandas as pd

# Data to write to the file
data = {
    "ID": [1, 2, 3],
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [30, 25, 35],
    "Salary": [55000, 48000, 62000],
}

# Create a DataFrame
df_to_write = pd.DataFrame(data)

# Filepath for the tab-separated file
file_path = "data.csv"

# Writing the DataFrame to a tab-separated file
df_to_write.to_csv(file_path, sep="\t", index=False)

# Reading the file back into a DataFrame
df_read = pd.read_csv(filepath_or_buffer=file_path, delimiter="\t", header=0, dtype={"ID": 'int8', "Name": 'string', "Age": 'int8', "Salary": 'float32'})

# Display the DataFrame read from the file
print(df_read)
'''
Output:
   ID     Name  Age   Salary
0   1    Alice   30  55000.0
1   2      Bob   25  48000.0
2   3  Charlie   35  62000.0
'''


engine: ('c', 'python', or 'pyarrow'), Optional-

It specifies the underlying parsing engine to use when reading the file. Default 'c'.

import pandas as pd

# Data to write to the file
data = {
    "ID": [1, 2, 3],
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [30, 25, 35],
    "Salary": [55000, 48000, 62000],
}

# Create a DataFrame
df_to_write = pd.DataFrame(data)

# Filepath for the tab-separated file
file_path = "data.csv"

# Writing the DataFrame to a tab-separated file
df_to_write.to_csv(file_path, sep="\t", index=False)

# Reading the file back into a DataFrame
df_read = pd.read_csv(filepath_or_buffer=file_path, sep="\t", header=0, engine='c')

# Display the DataFrame read from the file
print(df_read)
'''
Output:
   ID     Name  Age  Salary
0   1    Alice   30   55000
1   2      Bob   25   48000
2   3  Charlie   35   62000
'''

When reading a file, pandas uses one of these engines to parse the content. The c engine is preferred for its speed, the python engine is more feature-complete and can be used if the c engine encounters issues, and the pyarrow engine (available since pandas 1.4) can parse in parallel.

converters: None, Optional-

It allows you to specify custom functions to convert or process values in certain columns while reading the file.

import pandas as pd

# Data to write to the file
data = {
    "ID": [1, 2, 3],
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [30, 20, 10],
    "Salary": [55000, 48000, 62000],
}

# Create a DataFrame
df_to_write = pd.DataFrame(data)

# Filepath for the tab-separated file
file_path = "data.csv"

# Writing the DataFrame to a tab-separated file
df_to_write.to_csv(file_path, sep="\t", index=False)

def half_the_age(age):
    return int(age)/2

# Reading the file back into a DataFrame
df_read = pd.read_csv(filepath_or_buffer=file_path, sep="\t", header=0, converters={"Age": half_the_age})

# Display the DataFrame read from the file
print(df_read)
'''
Output:
   ID     Name   Age  Salary
0   1    Alice  15.0   55000
1   2      Bob  10.0   48000
2   3  Charlie   5.0   62000
'''

true_values: None, Optional-

It is used to specify which string values should be interpreted as boolean True.

import pandas as pd

# Data to write to the file
data = {
    "ID": [1, 2, 3],
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [30, 20, 10],
    "Salary": [55000, 48000, 62000],
    "IsActive": ["no", "maybe", "yes"]
}

# Create a DataFrame
df_to_write = pd.DataFrame(data)

# Filepath for the tab-separated file
file_path = "data.csv"

# Writing the DataFrame to a tab-separated file
df_to_write.to_csv(file_path, sep="\t", index=False)

# Reading the file back into a DataFrame using true_values
df_read = pd.read_csv(filepath_or_buffer=file_path, sep="\t", header=0, true_values=["yes", "maybe"], false_values=["no"])

# Display the DataFrame read from the file
print(df_read)
'''
Output:
   ID     Name  Age  Salary  IsActive
0   1    Alice   30   55000     False
1   2      Bob   20   48000      True
2   3  Charlie   10   62000      True
'''

For true_values and false_values to take effect, every value in the column must be covered by one of the two lists; otherwise the column is left unconverted.

false_values: None, Optional-

It is used to specify which string values should be interpreted as boolean False.

import pandas as pd

# Data to write to the file
data = {
    "ID": [1, 2, 3],
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [30, 20, 10],
    "Salary": [55000, 48000, 62000],
    "IsActive": ["no", "maybe", "yes"]
}

# Create a DataFrame
df_to_write = pd.DataFrame(data)

# Filepath for the tab-separated file
file_path = "data.csv"

# Writing the DataFrame to a tab-separated file
df_to_write.to_csv(file_path, sep="\t", index=False)

# Reading the file back into a DataFrame using true_values
df_read = pd.read_csv(filepath_or_buffer=file_path, sep="\t", header=0, true_values=["yes"], false_values=["no", "maybe"])

# Display the DataFrame read from the file
print(df_read)
'''
Output:
   ID     Name  Age  Salary  IsActive
0   1    Alice   30   55000     False
1   2      Bob   20   48000     False
2   3  Charlie   10   62000      True
'''

For true_values and false_values to take effect, every value in the column must be covered by one of the two lists; otherwise the column is left unconverted.

skipinitialspace: (True or False), Optional-

It controls whether or not to skip spaces following the delimiter when parsing a file. Default False.

import pandas as pd

# Example data with extra spaces after the comma delimiter
data = """ID,   Name     ,    Age
1,   Alice   ,   30
2,   Bob     ,   25
3,   Charlie ,  35"""

# Write the data to a file
file_path = "example_skipspace.csv"
with open(file_path, "w") as f:
    f.write(data)

# skipinitialspace=False (No space skipping)
df_false = pd.read_csv(file_path, sep=',', skipinitialspace=False)

print("With skipinitialspace=False:")
print(df_false)

# skipinitialspace=True (Skipping spaces after the delimiter)
df_true = pd.read_csv(file_path, sep=',', skipinitialspace=True)

print("With skipinitialspace=True:")
print(df_true)
'''
Output:
With skipinitialspace=False:
   ID    Name           Age
0   1     Alice          30
1   2     Bob            25
2   3     Charlie        35

With skipinitialspace=True:
   ID Name       Age
0   1  Alice      30
1   2  Bob        25
2   3  Charlie    35
'''

skiprows: None, Optional-

It is used to skip a specified number of rows from the start of the file or from a list of specific row indices.

import pandas as pd

# Example data with extra rows before the actual data
data = """Header1, Header2, Header3
Some metadata line
More metadata line
ID, Name, Age
1, Alice, 30
2, Bob, 25
3, Charlie, 35"""

# Write the data to a file
file_path = "skiprows_example.csv"
with open(file_path, "w") as f:
    f.write(data)

# Reading the file while skipping the first 3 rows
df = pd.read_csv(file_path, skiprows=3, sep=",")
print(df)
'''
Output:
   ID      Name   Age
0   1     Alice    30
1   2       Bob    25
2   3   Charlie    35
'''

skipfooter: int, Optional-

It allows you to skip a specified number of rows from the end of the file when reading the data. Default 0. It is not supported by the c engine, so pass engine='python' to avoid a fallback warning.

import pandas as pd

# Example data with extra rows before the actual data
data = """
ID, Name, Age
1, Alice, 30
2, Bob, 25
3, Charlie, 35
Some metadata line
More metadata line"""

# Write the data to a file
file_path = "skiprows_example.csv"
with open(file_path, "w") as f:
    f.write(data)

df = pd.read_csv(file_path, skipfooter=2, sep=",", engine="python")
print(df)
'''
Output:
   ID      Name   Age
0   1     Alice    30
1   2       Bob    25
2   3   Charlie    35
'''

nrows: None, Optional-

It is used to limit the number of rows that are read from the file.

import pandas as pd

# Example data with extra rows before the actual data
data = """
ID, Name, Age
1, Alice, 30
2, Bob, 25
3, Charlie, 35
Some metadata line
More metadata line"""

# Write the data to a file
file_path = "skiprows_example.csv"
with open(file_path, "w") as f:
    f.write(data)

df = pd.read_csv(file_path, sep=",", nrows=2)
print(df)
'''
Output:
   ID      Name   Age
0   1     Alice    30
1   2       Bob    25
'''

na_values: None, Optional-

It is used to specify additional values that should be treated as NaN (Not a Number) while reading the data.

import pandas as pd

# Data to write to the file
data = {
    "ID": [1, 2, 3],
    "Name": ["Alice", "Bob", "na"],
    "Age": [30, "missing", 10],
    "Salary": [55000, 48000, 62000],
}

# Create a DataFrame
df_to_write = pd.DataFrame(data)

# Filepath for the tab-separated file
file_path = "data.csv"

# Writing the DataFrame to a tab-separated file
df_to_write.to_csv(file_path, sep="\t", index=False)

# Reading the file back into a DataFrame 
df_read = pd.read_csv(filepath_or_buffer=file_path, sep="\t", header=0, na_values=["missing", "na"])

# Display the DataFrame read from the file
print(df_read)
'''
Output:
   ID   Name   Age  Salary
0   1  Alice  30.0   55000
1   2    Bob   NaN   48000
2   3    NaN  10.0   62000
'''


keep_default_na: (True or False), Optional-

It controls whether or not the default missing values (i.e., NaN values) specified by pandas should be preserved when reading the file.

Values: keep_default_na = True (default), keep_default_na = False.

If keep_default_na is True, and na_values are specified, na_values is appended to the default NaN values used for parsing.

If keep_default_na is True, and na_values are not specified, only the default NaN values are used for parsing.

If keep_default_na is False, and na_values are specified, only the NaN values specified na_values are used for parsing.

If keep_default_na is False, and na_values are not specified, no strings will be parsed as NaN.

If na_filter is passed in as False, the keep_default_na and na_values parameters will be ignored.
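
To illustrate the rules above, here is a minimal sketch (the file name and sample values are made up for illustration):

import pandas as pd

# Data containing the string "NA", which pandas treats as missing by default
data = "ID,Name\n1,Alice\n2,NA\n3,Charlie"

file_path = "keep_default_na_example.csv"
with open(file_path, "w") as f:
    f.write(data)

# keep_default_na=True (default): "NA" is parsed as NaN
df_default = pd.read_csv(file_path)

# keep_default_na=False with no na_values: "NA" is kept as a literal string
df_no_default = pd.read_csv(file_path, keep_default_na=False)

print(df_default)
print(df_no_default)
'''
Output:
   ID     Name
0   1    Alice
1   2      NaN
2   3  Charlie
   ID     Name
0   1    Alice
1   2       NA
2   3  Charlie
'''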

na_filter: (True or False), Optional-

It controls whether pandas should check for missing values (e.g., NaN, NA, null, etc.) during the reading process.

Values: na_filter = True (default), na_filter = False.
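
A rough sketch of the difference (file name and values are illustrative): with na_filter=False, nothing is converted to NaN, which can also speed up reading large files that contain no missing data.

import pandas as pd

# Data with an empty field and the string "NA"
data = "ID,Name\n1,Alice\n2,\n3,NA"

file_path = "na_filter_example.csv"
with open(file_path, "w") as f:
    f.write(data)

# na_filter=True (default): the empty field and "NA" both become NaN
df_filtered = pd.read_csv(file_path)

# na_filter=False: values are kept as-is (the empty field becomes an empty string)
df_unfiltered = pd.read_csv(file_path, na_filter=False)

print(df_filtered["Name"].isna().sum())    # 2
print(df_unfiltered["Name"].isna().sum())  # 0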

verbose: (True or False), Optional-

It controls whether or not additional information about the parsing process (the number of NA values placed in non-numeric columns) is printed. Deprecated since version 2.2.0.

Values: verbose = False (default), verbose = True.

skip_blank_lines: (True or False), Optional-

It controls whether or not blank lines should be skipped during the reading of the file.

Values: skip_blank_lines = True (default), skip_blank_lines = False.
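
A minimal sketch (file name and data are illustrative): when blank lines are not skipped, they are interpreted as rows of NaN values.

import pandas as pd

# Data with a blank line between two records
data = "ID,Name\n1,Alice\n\n2,Bob"

file_path = "skip_blank_lines_example.csv"
with open(file_path, "w") as f:
    f.write(data)

# skip_blank_lines=True (default): the blank line is ignored
df_skip = pd.read_csv(file_path)

# skip_blank_lines=False: the blank line becomes a row of NaN values
df_keep = pd.read_csv(file_path, skip_blank_lines=False)

print(df_skip)
print(df_keep)
'''
Output:
   ID   Name
0   1  Alice
1   2    Bob
    ID   Name
0  1.0  Alice
1  NaN    NaN
2  2.0    Bob
'''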

parse_dates: (bool or list of column names/indices), Optional-

It is used to specify which columns should be parsed as dates during the reading of the file.

Values: parse_dates = False (default), parse_dates = True (try parsing the index), or a list of columns to parse as dates.
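
A brief sketch (file name and data are illustrative); the date_format example further below uses the same idea:

import pandas as pd

# Example data with a date column
data = "ID,Name,BirthDate\n1,Alice,2003-10-02\n2,Bob,2003-04-16"

file_path = "parse_dates_demo.csv"
with open(file_path, "w") as f:
    f.write(data)

# Without parse_dates the column is read as plain strings (dtype object)
df_plain = pd.read_csv(file_path)

# Passing a list of column names parses those columns as datetime64
df_dates = pd.read_csv(file_path, parse_dates=["BirthDate"])

print(df_plain["BirthDate"].dtype)   # object
print(df_dates["BirthDate"].dtype)   # datetime64[ns]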

infer_datetime_format: (True or False), Optional-

Deprecated since version 2.0.0: A strict version of this argument is now the default, passing it has no effect.

keep_date_col: (True or False), Optional-

If True and parse_dates specifies combining multiple columns, keep the original columns. Deprecated since version 2.2.0.

date_parser: Callable, Optional-

Deprecated since version 2.0.0: use date_format instead.

date_format: None, Optional-

It allows you to specify the format of the date columns being parsed (used together with parse_dates).

import pandas as pd

# Example data with date-like strings
data = """ID,Name,BirthDate
1,Alice,2003-10-02
2,Bob,2003-04-16
3,Charlie,2023-12-29"""

# Write the data to a file
file_path = "parse_dates_example.csv"
with open(file_path, "w") as f:
    f.write(data)

# Reading the data with parse_dates and an explicit date_format
df = pd.read_csv(file_path, sep=',', header=0, parse_dates=["BirthDate"], date_format="%Y-%m-%d")

# Display the DataFrame
print(df['BirthDate'])
'''
Output:
0   2003-10-02
1   2003-04-16
2   2023-12-29
Name: BirthDate, dtype: datetime64[ns]
'''

dayfirst: (True or False), Optional-

If True, parse dates with the day first (DD/MM format), as used in international and European date formats. Default False.
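
A minimal sketch with ambiguous DD/MM dates (file name and data are illustrative):

import pandas as pd

# Ambiguous day-first dates (DD/MM/YYYY)
data = "ID,Date\n1,02/03/2023\n2,10/04/2023"

file_path = "dayfirst_example.csv"
with open(file_path, "w") as f:
    f.write(data)

# dayfirst=False (default): 02/03/2023 is read as February 3rd
df_default = pd.read_csv(file_path, parse_dates=["Date"])

# dayfirst=True: 02/03/2023 is read as March 2nd
df_dayfirst = pd.read_csv(file_path, parse_dates=["Date"], dayfirst=True)

print(df_default["Date"])
print(df_dayfirst["Date"])
'''
Output:
0   2023-02-03
1   2023-10-04
Name: Date, dtype: datetime64[ns]
0   2023-03-02
1   2023-04-10
Name: Date, dtype: datetime64[ns]
'''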

cache_dates: (True or False), Optional-

If True, use a cache of unique, converted dates when parsing, which speeds up files with many duplicate date strings. Default True.

import pandas as pd

# Example data with dates
data = """ID,Name,BirthDate
1,Alice,2003-10-02
2,Bob,2003-04-16
3,Charlie,2023-12-29"""

# Write the data to a file
file_path = "example_cache_dates.csv"
with open(file_path, "w") as f:
    f.write(data)

# Read the data with cache_dates
dff = pd.read_csv(file_path, sep=',', parse_dates=["BirthDate"], cache_dates=False)
dft = pd.read_csv(file_path, sep=',', parse_dates=["BirthDate"], cache_dates=True)

print('cache_date=False',dff['BirthDate'],'cache_date=True',dft['BirthDate'],sep='\n')
'''
Output:
cache_date=False
0   2003-10-02
1   2003-04-16
2   2023-12-29
Name: BirthDate, dtype: datetime64[ns]
cache_date=True
0   2003-10-02
1   2003-04-16
2   2023-12-29
Name: BirthDate, dtype: datetime64[ns]
'''

When cache_dates=True: Pandas will store the parsed date results in memory. This can make date parsing faster in some cases, especially when the same date is encountered repeatedly in the dataset.

When cache_dates=False: Pandas will not cache the parsed dates. Each date will be parsed from scratch as it is encountered.

iterator: (True or False), Optional-

If True, return a TextFileReader object for iterating through the file or retrieving pieces with get_chunk(). Default False.

Values: iterator = False (default), iterator = True.
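
A minimal sketch (file name and data are illustrative); the chunksize example below shows the closely related chunked reading.

import pandas as pd

# Example data
data = "ID,Name\n1,Alice\n2,Bob\n3,Charlie\n4,David"

file_path = "iterator_example.csv"
with open(file_path, "w") as f:
    f.write(data)

# iterator=True returns a TextFileReader instead of a DataFrame
reader = pd.read_csv(file_path, iterator=True)

# Pull rows on demand with get_chunk()
print(reader.get_chunk(2))   # first two rows
print(reader.get_chunk(2))   # next two rows
reader.close()
'''
Output:
   ID   Name
0   1  Alice
1   2    Bob
   ID     Name
2   3  Charlie
3   4    David
'''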

chunksize: int, Optional-

Number of lines to read from the file per chunk.

import pandas as pd

# Example data with dates
data = """ID,Name,BirthDate
1,Alice,2020-01-01
2,Bob,2021-02-28
3,Charlie,2023-03-15
4,David,2019-11-12
5,Eve,2022-08-09"""

# Write the data to a file
file_path = "example_ite.csv"
with open(file_path, "w") as f:
    f.write(data)

# Read the data 
df = pd.read_csv(file_path, sep=',', chunksize=3)
# Read first 2
print(df.get_chunk(2))
# Read the *next* 2
print(df.get_chunk(2))
'''
Output:
   ID   Name   BirthDate
0   1  Alice  2020-01-01
1   2    Bob  2021-02-28

   ID     Name   BirthDate
2   3  Charlie  2023-03-15
3   4    David  2019-11-12
'''

compression: str, Optional-

It specifies the type of compression used for reading a file.

import pandas as pd

# Example data
data = {
    "ID": [1, 2, 3],
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [30, 25, 35]
}

# Create a DataFrame
df = pd.DataFrame(data)

# Write the DataFrame to a GZIP-compressed file
file_path = "example_compressed.gz"
df.to_csv(file_path, index=False, compression='gzip')

# Reading the GZIP compressed file directly using pandas
df_gzip = pd.read_csv(file_path, sep=',', compression='gzip')

print(df_gzip)
'''
Output:
   ID     Name  Age
0   1    Alice   30
1   2      Bob   25
2   3  Charlie   35
'''

thousands: single char, Optional-

Character acting as the thousands separator in numerical values.

import pandas as pd

# Example data with thousands separators (dot)
data = """ID,Amount
1,1,000
2,2,500.000
3,3,700"""

# Write the data to a file
file_path = "example_thousands.csv"
with open(file_path, "w") as f:
    f.write(data)

# Read the file and specify the thousands separator (dot)
df = pd.read_csv(file_path, sep=',', thousands='.')

print(df['Amount'])
'''
Output:
1         0
2    500000
3       700
Name: Amount, dtype: int64
'''

delimiter or sep cannot have the same value as thousands.

decimal: single char, Optional-

Character to recognize as decimal point (e.g., use ‘,’ for European data).

import pandas as pd

# Example data with decimal separators (|)
data = """ID,Amount
1,1,000|9
2,2,500|70
3,3,700"""

# Write the data to a file
file_path = "example_dec.csv"
with open(file_path, "w") as f:
    f.write(data)

# Read the file
df = pd.read_csv(file_path, sep=',', decimal='|')

print(df['Amount'])
'''
1      0.9
2    500.7
3    700.0
Name: Amount, dtype: float64
'''

delimiter or sep cannot have the same value as decimal.

lineterminator: single char, Optional-

It specifies the single character used to terminate each line in the input file (C parser only). By default, the standard line endings are recognized.

import pandas as pd

# Example data using \t as the line terminator
data = "ID,Name,Age\t1,Alice,30\t2,Bob,25\t3,Charlie,35"

# Write the data to a file
file_path = "example_lineterminator.csv"
with open(file_path, "w") as f:
    f.write(data)

# Read the file, specifying \t as the line terminator
df = pd.read_csv(file_path, sep=',', lineterminator="\t")
print(df)
'''
Output:
   ID     Name  Age
0   1    Alice   30
1   2      Bob   25
2   3  Charlie   35
'''

delimiter or sep should not have the same value as lineterminator.

quotechar: single char, Optional-

It specifies the character used to quote fields in the input file.

import pandas as pd

# Example data with quoted fields
data = """ID,Name,Age
1,"Alice, Smith",30
2,"Bob, Johnson",25
3,"Charlie, Brown",35"""

# Write the data to a file
file_path = "example_quotechar.csv"
with open(file_path, "w") as f:
    f.write(data)

# Read the file with quotechar as "
df = pd.read_csv(file_path, sep=',', quotechar='"')

print(df)
'''
Output:
   ID            Name  Age
0   1    Alice, Smith   30
1   2    Bob, Johnson   25
2   3  Charlie, Brown   35
'''

delimiter or sep should not have the same value as quotechar.

quoting: int, Optional-

Control field quoting behavior per csv.QUOTE_* constants. Default is csv.QUOTE_MINIMAL (i.e., 0), which implies that only fields containing special characters are quoted (e.g., characters defined in quotechar, delimiter, or lineterminator).
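
A short sketch contrasting csv.QUOTE_MINIMAL with csv.QUOTE_NONE (file name and data are illustrative):

import csv
import pandas as pd

# Data with quoted fields
data = '''ID,Name
1,"Alice"
2,"Bob"'''

file_path = "quoting_example.csv"
with open(file_path, "w") as f:
    f.write(data)

# csv.QUOTE_MINIMAL (default): quotes are honoured and stripped from the values
df_minimal = pd.read_csv(file_path, quoting=csv.QUOTE_MINIMAL)

# csv.QUOTE_NONE: quote characters are treated as ordinary characters
df_none = pd.read_csv(file_path, quoting=csv.QUOTE_NONE)

print(df_minimal)
print(df_none)
'''
Output:
   ID   Name
0   1  Alice
1   2    Bob
   ID     Name
0   1  "Alice"
1   2    "Bob"
'''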

doublequote: (True or False), Optional-

It is used to control how the parser handles quoted data that contains the quote character itself.

Values: doublequote = True (default), doublequote = False.
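
A minimal sketch (file name and data are illustrative): with doublequote=True, two consecutive quotechar characters inside a quoted field are read back as a single quote; with doublequote=False they are not collapsed.

import pandas as pd

# A quoted field containing a doubled quote character
data = '''ID,Quote
1,"She said ""hello"""
2,"Plain text"'''

file_path = "doublequote_example.csv"
with open(file_path, "w") as f:
    f.write(data)

# doublequote=True (default): "" inside a quoted field becomes a single "
df = pd.read_csv(file_path, doublequote=True)
print(df)
'''
Output:
   ID             Quote
0   1  She said "hello"
1   2        Plain text
'''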

escapechar: single char, Optional-

It is used to specify a character that will escape special characters in the file.

import pandas as pd

# Example data with escape characters
data = '''ID,Name
1,Alice
2,Bob
3,Charlie\\,The King
'''

# Write the data to a file
file_path = "escapechar_example.csv"
with open(file_path, "w") as f:
    f.write(data)

# Read the file with escapechar='\'
df = pd.read_csv(file_path, sep=',', escapechar='\\')
print(df)
'''
Output:
   ID              Name
0   1             Alice
1   2               Bob
2   3  Charlie,The King
'''

delimiter or sep should not have the same value as escapechar.

comment: single char, Optional-

It is used to specify a character that indicates the beginning of a comment in the data file.

import pandas as pd

# Example data with a comment line
data = '''# This is a comment line
ID,Name
1,Alice
2,Bob
3,Charlie
'''

# Write the data to a file
file_path = "comment_example.csv"
with open(file_path, "w") as f:
    f.write(data)

# Read the file and skip comment lines
df = pd.read_csv(file_path, sep=',', comment='#')
print(df)
'''
Output:
   ID     Name
0   1    Alice
1   2      Bob
2   3  Charlie
'''

encoding: str, Optional-

It is used to specify the character encoding of the input file.

import pandas as pd

# Example data with special characters
data = "ID,Name\n1,Alice\n2,Bob\n3,Chloé"

# Write the data to a file with ISO-8859-1 encoding
file_path = "encoding_example.csv"
with open(file_path, "w", encoding="ISO-8859-1") as f:
    f.write(data)

# Read the file with the correct encoding
df = pd.read_csv(file_path, sep=",", encoding="ISO-8859-1")
print(df)
'''
Output:
   ID   Name
0   1  Alice
1   2    Bob
2   3  Chloé
'''

encoding_errors: str, Optional-

It allows you to control how encoding errors are handled during the reading process.

import pandas as pd

# Write the file with explicit UTF-8 encoding
data = "ID,Name\n1,Alice\n2,Bob\n3,Chloé"
file_path = "encoding_errors_example.csv"

with open(file_path, "w", encoding="utf-8") as f:
    f.write(data)

# Read the file with incorrect ASCII encoding

try:
  df = pd.read_csv(file_path, sep=",", encoding="ascii", encoding_errors="strict")
except UnicodeDecodeError as e:
  print(e)

df_ignore = pd.read_csv(file_path, sep=",", encoding="ascii", encoding_errors="ignore")

df_replace = pd.read_csv(file_path, sep=",", encoding="ascii", encoding_errors="replace")

# Print the results
print("With encoding_errors='ignore':")
print(df_ignore)

print("\nWith encoding_errors='replace':")
print(df_replace)

'''
Output:
'ascii' codec can't decode byte 0xc3 in position 28: ordinal not in range(128)
With encoding_errors='ignore':
   ID   Name
0   1  Alice
1   2    Bob
2   3   Chlo

With encoding_errors='replace':
   ID    Name
0   1   Alice
1   2     Bob
2   3  Chlo��
'''

dialect: str, Optional-

It is used to specify a predefined set of CSV parsing rules.

import csv
import pandas as pd

# Register a custom dialect
csv.register_dialect('my_dialect', delimiter='|', quoting=csv.QUOTE_NONE)

# Create a sample file
file_path = 'sample_dialect.csv'
with open(file_path, 'w') as f:
    f.write("ID|Name\n1|Alice\n2|Bob\n3|Charlie")

# Use the custom dialect to read the file
df = pd.read_csv(file_path, dialect='my_dialect')
print(df)
'''
Output:
   ID     Name
0   1    Alice
1   2      Bob
2   3  Charlie
'''

on_bad_lines: ('skip' or 'warn' or 'error' or Callable), Optional-

Specifies what to do upon encountering a bad line (a line with too many fields).

Values: on_bad_lines = 'error' (default), on_bad_lines = 'warn', on_bad_lines = 'skip', or a Callable.

When a Callable is passed, engine must be 'python' or 'pyarrow'.
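
A minimal sketch (file name and data are illustrative):

import pandas as pd

# The second data row has an extra field
data = """ID,Name
1,Alice
2,Bob,extra
3,Charlie"""

file_path = "bad_lines_example.csv"
with open(file_path, "w") as f:
    f.write(data)

# on_bad_lines='skip' silently drops the malformed row
df_skip = pd.read_csv(file_path, on_bad_lines="skip")
print(df_skip)

# on_bad_lines='warn' returns the same DataFrame but emits a ParserWarning
df_warn = pd.read_csv(file_path, on_bad_lines="warn")
'''
Output:
   ID     Name
0   1    Alice
1   3  Charlie
'''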

low_memory: (True or False), Optional-

It is used to control memory use when reading large files. With low_memory=True (default), the file is processed internally in chunks, which lowers memory use while parsing but can result in mixed type inference for columns; set low_memory=False or pass dtype to ensure consistent types.

Values: low_memory = True (default), low_memory = False.
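
A small usage sketch (file name and data are illustrative; the effect of low_memory is only noticeable on large files):

import pandas as pd

# A column with mixed types
data = "ID,Value\n1,100\n2,abc\n3,300"

file_path = "low_memory_example.csv"
with open(file_path, "w") as f:
    f.write(data)

# low_memory=False reads the whole file at once, so the dtype of the mixed
# "Value" column is inferred from all rows together (here: object)
df = pd.read_csv(file_path, low_memory=False)
print(df.dtypes)
'''
Output:
ID        int64
Value    object
dtype: object
'''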

memory_map: (True or False), Optional-

It allows you to use memory-mapped file access to read a file. Memory-mapped files enable efficient handling of large files by using virtual memory instead of directly reading the file into RAM.

Values: memory_map = False (default), memory_map = True.
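
A small usage sketch (file name and data are illustrative); the resulting DataFrame is identical to a normal read, only the file access strategy changes.

import pandas as pd

# Example data
data = "ID,Name\n1,Alice\n2,Bob\n3,Charlie"

file_path = "memory_map_example.csv"
with open(file_path, "w") as f:
    f.write(data)

# memory_map=True maps the file into virtual memory before parsing,
# which can reduce I/O overhead for large local files
df = pd.read_csv(file_path, memory_map=True)
print(df)
'''
Output:
   ID     Name
0   1    Alice
1   2      Bob
2   3  Charlie
'''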

float_precision: ('legacy' or 'high' or 'round_trip'), Optional-

It specifies which converter the C engine should use for floating-point values: 'legacy' for the original lower-precision converter, 'high' for the higher-precision converter, and 'round_trip' to guarantee that values round-trip exactly when written back out.

import pandas as pd

# Create a file with high precision floating-point numbers
data = """ID,Value
1,3.141592653589793238462643383279502884197169399375105820974944
2,2.718281828459045235360287471352662497757247093699959574966967
3,1.618033988749894848204586834365638117900284550292151098084
"""

file_path = "high_precision_data.csv"

# Write the data to a CSV file
with open(file_path, "w") as f:
    f.write(data)

# Read the file with legacy float precision
df_legacy = pd.read_csv(file_path, sep=",", float_precision="legacy")
print("With float_precision='legacy':")
print(f"Low:        {df_legacy['Value'][0]:.30f}")

# Read the file with high float precision
df_high = pd.read_csv(file_path, sep=",", float_precision="high")
print("\nWith float_precision='high':")
print(f"High:       {df_high['Value'][0]:.30f}")

# Read the file with round_trip float precision
df_round = pd.read_csv(file_path, sep=",", float_precision="round_trip")
print("\nWith float_precision='round_trip':")
print(f"Round Trip: {df_round['Value'][0]:.30f}")
'''
Output:
With float_precision='legacy':
Low:        3.141592653589792227819543768419

With float_precision='high':
High:       3.141592653589792671908753618482

With float_precision='round_trip':
Round Trip: 3.141592653589793115997963468544
'''

storage_options: dict, Optional-

Dictionary of storage-specific options, such as credentials for cloud storage.
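
A hypothetical sketch of reading directly from cloud storage; the bucket name and credentials below are placeholders, and the optional s3fs package must be installed for s3:// URLs to work.

import pandas as pd

# Read a CSV straight from S3, passing credentials via storage_options
df = pd.read_csv(
    "s3://my-bucket/data.csv",
    storage_options={"key": "ACCESS_KEY", "secret": "SECRET_KEY"},
)
print(df.head())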

dtype_backend: ('numpy_nullable' or 'pyarrow'), Optional-

The dtype_backend parameter is new in pandas 2.0; it specifies which backend data types are applied to the resulting DataFrame: 'numpy_nullable' for nullable NumPy-backed dtypes or 'pyarrow' for pyarrow-backed dtypes.
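
A minimal sketch (requires pandas 2.0+; file name and data are illustrative): with the default NumPy backend a missing value forces an integer column to float, whereas the nullable backend keeps it as an integer type with pd.NA.

import pandas as pd

# Data with a missing value in an integer column
data = "ID,Age\n1,30\n2,\n3,35"

file_path = "dtype_backend_example.csv"
with open(file_path, "w") as f:
    f.write(data)

# Default NumPy backend: the missing value forces Age to float64
df_numpy = pd.read_csv(file_path)
print(df_numpy.dtypes)

# Nullable-dtype backend: Age stays an integer type (Int64) with pd.NA for the gap
df_nullable = pd.read_csv(file_path, dtype_backend="numpy_nullable")
print(df_nullable.dtypes)
'''
Output:
ID       int64
Age    float64
dtype: object
ID     Int64
Age    Int64
dtype: object
'''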

