It is similar to the pd.read_table() method.
Path to the file or file-like object to read.
import pandas as pd

# Sample records to persist.
data = {
    "ID": [1, 2, 3],
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [30, 25, 35],
    "Salary": [55000, 48000, 62000],
}

# Build the DataFrame that will be written out.
df_to_write = pd.DataFrame(data)

# Destination file (tab-separated despite the .csv extension).
file_path = "data.csv"

# Persist the DataFrame using a tab as the field separator.
df_to_write.to_csv(file_path, sep="\t", index=False)

# Round-trip: load the file back, telling read_csv about the tab separator.
df_read = pd.read_csv(file_path, sep="\t")

# Show what came back.
print(df_read)
'''
Output:
ID Name Age Salary
0 1 Alice 30 55000
1 2 Bob 25 48000
2 3 Charlie 35 62000
'''
.to_csv is used to write object to a comma-separated values (csv) file.
Specifies the delimiter.
import pandas as pd
# Data to write to the file
data = {
"ID": [1, 2, 3],
"Name": ["Alice", "Bob", "Charlie"],
"Age": [30, 25, 35],
"Salary": [55000, 48000, 62000],
}
# Create a DataFrame
df_to_write = pd.DataFrame(data)
# Filepath for the tab-separated file
file_path = "data.csv"
# Writing the DataFrame to a tab-separated file
df_to_write.to_csv(file_path, sep="\t", index=False)
# Reading the file back into a DataFrame
df_read = pd.read_csv(filepath_or_buffer=file_path, sep="\t",)
# Display the DataFrame read from the file
print(df_read)
'''
Output:
ID Name Age Salary
0 1 Alice 30 55000
1 2 Bob 25 48000
2 3 Charlie 35 62000
'''
It serves the same purpose as sep but works in the context of the csv engine.
import pandas as pd
# Data to write to the file
data = {
"ID": [1, 2, 3],
"Name": ["Alice", "Bob", "Charlie"],
"Age": [30, 25, 35],
"Salary": [55000, 48000, 62000],
}
# Create a DataFrame
df_to_write = pd.DataFrame(data)
# Filepath for the tab-separated file
file_path = "data.csv"
# Writing the DataFrame to a tab-separated file
df_to_write.to_csv(file_path, sep="\t", index=False)
# Reading the file back into a DataFrame
df_read = pd.read_csv(filepath_or_buffer=file_path, delimiter="\t",)
# Display the DataFrame read from the file
print(df_read)
'''
Output:
ID Name Age Salary
0 1 Alice 30 55000
1 2 Bob 25 48000
2 3 Charlie 35 62000
'''
If both sep and delimiter are explicitly provided, pandas raises a ValueError — only one of them may be specified.
delimiter is simply an alias for sep; both accept the same values (a separator longer than one character is treated as a regular expression and requires the Python engine).
It specifies the row of the file to use as column names for the resulting DataFrame.
import pandas as pd
# Data to write to the file
data = {
"ID": [1, 2, 3],
"Name": ["Alice", "Bob", "Charlie"],
"Age": [30, 25, 35],
"Salary": [55000, 48000, 62000],
}
# Create a DataFrame
df_to_write = pd.DataFrame(data)
# Filepath for the tab-separated file
file_path = "data.csv"
# Writing the DataFrame to a tab-separated file
df_to_write.to_csv(file_path, sep="\t", index=False)
# Reading the file back into a DataFrame
df_read = pd.read_csv(filepath_or_buffer=file_path, delimiter="\t", header=0)
# Display the DataFrame read from the file
print(df_read)
'''
Output:
ID Name Age Salary
0 1 Alice 30 55000
1 2 Bob 25 48000
2 3 Charlie 35 62000
'''
It allows you to manually specify the column names when reading a file into a DataFrame.
import pandas as pd
# Data to write to the file
data = {
"ID": [1, 2, 3],
"Name": ["Alice", "Bob", "Charlie"],
"Age": [30, 25, 35],
"Salary": [55000, 48000, 62000],
}
# Create a DataFrame
df_to_write = pd.DataFrame(data)
# Filepath for the tab-separated file
file_path = "data.csv"
# Writing the DataFrame to a tab-separated file
df_to_write.to_csv(file_path, sep="\t", index=False)
# Reading the file back into a DataFrame
df_read = pd.read_csv(filepath_or_buffer=file_path, delimiter="\t", header=0, names=["E_ID", "FName", "Age_in_Years", "Annual_Salary"])
# Display the DataFrame read from the file
print(df_read)
'''
Output:
E_ID FName Age_in_Years Annual_Salary
0 1 Alice 30 55000
1 2 Bob 25 48000
2 3 Charlie 35 62000
'''
Overrides existing headers for columns.
It specifies which column should be used as the index of the resulting DataFrame.
import pandas as pd
# Data to write to the file
data = {
"ID": [1, 2, 3],
"Name": ["Alice", "Bob", "Charlie"],
"Age": [30, 25, 35],
"Salary": [55000, 48000, 62000],
}
# Create a DataFrame
df_to_write = pd.DataFrame(data)
# Filepath for the tab-separated file
file_path = "data.csv"
# Writing the DataFrame to a tab-separated file
df_to_write.to_csv(file_path, sep="\t", index=False)
# Reading the file back into a DataFrame
df_read = pd.read_csv(filepath_or_buffer=file_path, delimiter="\t", header=0, index_col='ID')
# Display the DataFrame read from the file
print(df_read)
'''
Output:
Name Age Salary
ID
1 Alice 30 55000
2 Bob 25 48000
3 Charlie 35 62000
'''
It allows you to select specific columns to read from the input file, rather than reading all columns.
import pandas as pd
# Data to write to the file
data = {
"ID": [1, 2, 3],
"Name": ["Alice", "Bob", "Charlie"],
"Age": [30, 25, 35],
"Salary": [55000, 48000, 62000],
}
# Create a DataFrame
df_to_write = pd.DataFrame(data)
# Filepath for the tab-separated file
file_path = "data.csv"
# Writing the DataFrame to a tab-separated file
df_to_write.to_csv(file_path, sep="\t", index=False)
# Reading the file back into a DataFrame
df_read = pd.read_csv(filepath_or_buffer=file_path, delimiter="\t", header=0, usecols=["Name", "Salary"])
# Display the DataFrame read from the file
print(df_read)
'''
Output:
Name Salary
0 Alice 55000
1 Bob 48000
2 Charlie 62000
'''
Specifies the data-type of the DataFrame. If not provided, it’s inferred from the input.
import pandas as pd

# Records to round-trip through a tab-separated file.
data = {
    "ID": [1, 2, 3],
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [30, 25, 35],
    "Salary": [55000, 48000, 62000],
}

# DataFrame to be written out.
df_to_write = pd.DataFrame(data)

# Target path (tab-separated content).
file_path = "data.csv"

# Write the frame without its index column.
df_to_write.to_csv(file_path, sep="\t", index=False)

# Read it back, forcing an explicit dtype per column instead of letting
# pandas infer them from the data.
df_read = pd.read_csv(file_path, sep="\t", header=0,
                      dtype={"ID": "int8", "Name": "string", "Age": "int8", "Salary": "float32"})

# Show the typed result.
print(df_read)
'''
Output:
ID Name Age Salary
0 1 Alice 30 55000.0
1 2 Bob 25 48000.0
2 3 Charlie 35 62000.0
'''
int8: 8-bit signed integer (range: -128 to 127).
int16: 16-bit signed integer (range: -32,768 to 32,767).
int32: 32-bit signed integer (range: -2,147,483,648 to 2,147,483,647).
int64: 64-bit signed integer (large integer range).
uint8: 8-bit unsigned integer (range: 0 to 255).
uint16: 16-bit unsigned integer (range: 0 to 65,535).
uint32: 32-bit unsigned integer (range: 0 to 4,294,967,295).
uint64: 64-bit unsigned integer (large positive integer range).
float16: Half precision floating-point (16-bit, for low-precision computations).
float32: Single precision floating-point (32-bit).
float64: Double precision floating-point (64-bit, the default float in NumPy).
float128: Extended precision floating-point (128-bit, availability depends on system).
complex64: Complex number represented by two 32-bit floats (for real and imaginary parts).
complex128: Complex number represented by two 64-bit floats (default complex dtype).
complex256: Complex number represented by two 128-bit floats (system-dependent).
bool: Boolean type, can be either True or False (stored as 1-bit but takes up a full byte).
str: Fixed-length Unicode string, specified by S + length (e.g., S10 for a 10-character string).
unicode: Fixed-length Unicode string with support for multiple characters (uses U).
object: Allows storing any Python object, including mixed types, strings, or other arrays. Useful for heterogeneous data but slower than native NumPy types.
datetime64: Stores dates and times with varying precisions (e.g., Y, M, D, h, m, s, ms, us, ns, ps, fs, as). Example: datetime64('2003-10-02')
timedelta64: Represents time durations with units (same units as datetime64).
It specifies the underlying parsing engine to use when reading the file. Default c.
import pandas as pd
# Data to write to the file
data = {
"ID": [1, 2, 3],
"Name": ["Alice", "Bob", "Charlie"],
"Age": [30, 25, 35],
"Salary": [55000, 48000, 62000],
}
# Create a DataFrame
df_to_write = pd.DataFrame(data)
# Filepath for the tab-separated file
file_path = "data.csv"
# Writing the DataFrame to a tab-separated file
df_to_write.to_csv(file_path, sep="\t", index=False)
# Reading the file back into a DataFrame
df_read = pd.read_csv(filepath_or_buffer=file_path, sep="\t", header=0, engine='c')
# Display the DataFrame read from the file
print(df_read)
'''
Output:
ID Name Age Salary
0 1 Alice 30 55000
1 2 Bob 25 48000
2 3 Charlie 35 62000
'''
When reading a file, pandas uses one of these engines to parse the content. The c engine is preferred for its speed, but the python engine can be used if more control is needed or if the c engine encounters issues.
It allows you to specify custom functions to convert or process values in certain columns while reading the file.
import pandas as pd

# Records to round-trip through a tab-separated file.
data = {
    "ID": [1, 2, 3],
    "Name": ["Alice", "Bob", "Charlie"],
    "Age": [30, 20, 10],
    "Salary": [55000, 48000, 62000],
}

# DataFrame to be written out.
df_to_write = pd.DataFrame(data)

# Target path (tab-separated content).
file_path = "data.csv"

# Write the frame without its index column.
df_to_write.to_csv(file_path, sep="\t", index=False)


def half_the_age(age):
    """Convert a raw cell value to int and return half of it."""
    return int(age) / 2


# Read it back, passing every "Age" cell through the converter above.
df_read = pd.read_csv(file_path, sep="\t", header=0, converters={"Age": half_the_age})

# Show the converted result.
print(df_read)
'''
Output:
ID Name Age Salary
0 1 Alice 15.0 55000
1 2 Bob 10.0 48000
2 3 Charlie 5.0 62000
'''
It is used to specify which string values should be interpreted as boolean True.
import pandas as pd
# Data to write to the file
data = {
"ID": [1, 2, 3],
"Name": ["Alice", "Bob", "Charlie"],
"Age": [30, 20, 10],
"Salary": [55000, 48000, 62000],
"IsActive": ["no", "maybe", "yes"]
}
# Create a DataFrame
df_to_write = pd.DataFrame(data)
# Filepath for the tab-separated file
file_path = "data.csv"
# Writing the DataFrame to a tab-separated file
df_to_write.to_csv(file_path, sep="\t", index=False)
# Reading the file back into a DataFrame using true_values
df_read = pd.read_csv(filepath_or_buffer=file_path, sep="\t", header=0, true_values=["yes", "maybe"], false_values=["no"])
# Display the DataFrame read from the file
print(df_read)
'''
Output:
ID Name Age Salary IsActive
0 1 Alice 30 55000 False
1 2 Bob 20 48000 True
2 3 Charlie 10 62000 True
'''
For true_values/false_values to take effect, every value in the column must be covered by one of the two lists; otherwise the column is left unconverted.
It is used to specify which string values should be interpreted as boolean False.
import pandas as pd
# Data to write to the file
data = {
"ID": [1, 2, 3],
"Name": ["Alice", "Bob", "Charlie"],
"Age": [30, 20, 10],
"Salary": [55000, 48000, 62000],
"IsActive": ["no", "maybe", "yes"]
}
# Create a DataFrame
df_to_write = pd.DataFrame(data)
# Filepath for the tab-separated file
file_path = "data.csv"
# Writing the DataFrame to a tab-separated file
df_to_write.to_csv(file_path, sep="\t", index=False)
# Reading the file back into a DataFrame using true_values
df_read = pd.read_csv(filepath_or_buffer=file_path, sep="\t", header=0, true_values=["yes"], false_values=["no", "maybe"])
# Display the DataFrame read from the file
print(df_read)
'''
Output:
ID Name Age Salary IsActive
0 1 Alice 30 55000 False
1 2 Bob 20 48000 False
2 3 Charlie 10 62000 True
'''
For true_values/false_values to take effect, every value in the column must be covered by one of the two lists; otherwise the column is left unconverted.
It controls whether or not to skip spaces following the delimiter when parsing a file. Default False.
import pandas as pd
# Example data with extra spaces after the comma delimiter
data = """ID, Name , Age
1, Alice , 30
2, Bob , 25
3, Charlie , 35"""
# Write the data to a file
file_path = "example_skipspace.csv"
with open(file_path, "w") as f:
f.write(data)
# skipinitialspace=False (No space skipping)
df_false = pd.read_csv(file_path, sep=',', skipinitialspace=False)
print("With skipinitialspace=False:")
print(df_false)
# skipinitialspace=True (Skipping spaces after the delimiter)
df_true = pd.read_csv(file_path, sep=',', skipinitialspace=True)
print("With skipinitialspace=True:")
print(df_true)
'''
Output:
With skipinitialspace=False:
ID Name Age
0 1 Alice 30
1 2 Bob 25
2 3 Charlie 35
With skipinitialspace=True:
ID Name Age
0 1 Alice 30
1 2 Bob 25
2 3 Charlie 35
'''
It is used to skip a specified number of rows from the start of the file or from a list of specific row indices.
import pandas as pd
# Example data with extra rows before the actual data
data = """Header1, Header2, Header3
Some metadata line
More metadata line
ID, Name, Age
1, Alice, 30
2, Bob, 25
3, Charlie, 35"""
# Write the data to a file
file_path = "skiprows_example.csv"
with open(file_path, "w") as f:
f.write(data)
# Reading the file while skipping the first 3 rows
df = pd.read_csv(file_path, skiprows=3, sep=",")
print(df)
'''
Output:
ID Name Age
0 1 Alice 30
1 2 Bob 25
2 3 Charlie 35
'''
It allows you to skip a specified number of rows from the end of the file when reading the data.
import pandas as pd

# Example data with trailing metadata rows after the actual data
data = """
ID, Name, Age
1, Alice, 30
2, Bob, 25
3, Charlie, 35
Some metadata line
More metadata line"""
# Write the data to a file
file_path = "skiprows_example.csv"
with open(file_path, "w") as f:
    f.write(data)
# skipfooter is only supported by the Python engine; with the default C
# engine pandas falls back to it anyway and emits a ParserWarning, so
# request the Python engine explicitly.
df = pd.read_csv(file_path, skipfooter=2, sep=",", engine="python")
print(df)
'''
Output:
ID Name Age
0 1 Alice 30
1 2 Bob 25
2 3 Charlie 35
'''
It is used to limit the number of rows that are read from the file.
import pandas as pd
# Example data with extra rows before the actual data
data = """
ID, Name, Age
1, Alice, 30
2, Bob, 25
3, Charlie, 35
Some metadata line
More metadata line"""
# Write the data to a file
file_path = "skiprows_example.csv"
with open(file_path, "w") as f:
f.write(data)
df = pd.read_csv(file_path, sep=",", nrows=2)
print(df)
'''
Output:
ID Name Age
0 1 Alice 30
1 2 Bob 25
'''
It is used to specify additional values that should be treated as NaN (Not a Number) while reading the data.
import pandas as pd
# Data to write to the file
data = {
"ID": [1, 2, 3],
"Name": ["Alice", "Bob", "na"],
"Age": [30, "missing", 10],
"Salary": [55000, 48000, 62000],
}
# Create a DataFrame
df_to_write = pd.DataFrame(data)
# Filepath for the tab-separated file
file_path = "data.csv"
# Writing the DataFrame to a tab-separated file
df_to_write.to_csv(file_path, sep="\t", index=False)
# Reading the file back into a DataFrame
df_read = pd.read_csv(filepath_or_buffer=file_path, sep="\t", header=0, na_values=["missing", "na"])
# Display the DataFrame read from the file
print(df_read)
'''
Output:
ID Name Age Salary
0 1 Alice 30.0 55000
1 2 Bob NaN 48000
2 3 NaN 10.0 62000
'''
" ", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN", "<NA>", "N/A", "NA", "NULL", "NaN", "None", "n/a", "nan", "null"
It controls whether or not the default missing values (i.e., NaN values) specified by pandas should be preserved when reading the file.
Pandas will keep the default missing value strings in the dataset (like NaN, NA, null, etc.) and convert them to NaN.
import pandas as pd
# Data to write to the file
data = {
"ID": [1, 2, 3],
"Name": ["Alice", "Bob", "null"],
"Age": [30, "NA", 10],
"Salary": [55000, 48000, 62000],
}
# Create a DataFrame
df_to_write = pd.DataFrame(data)
# Filepath for the tab-separated file
file_path = "data.csv"
# Writing the DataFrame to a tab-separated file
df_to_write.to_csv(file_path, sep="\t", index=False)
# Reading the file back into a DataFrame
df_read = pd.read_csv(filepath_or_buffer=file_path, sep="\t", header=0, keep_default_na=True)
# Display the DataFrame read from the file
print(df_read)
'''
Output:
ID Name Age Salary
0 1 Alice 30.0 55000
1 2 Bob NaN 48000
2 3 NaN 10.0 62000
'''
Pandas does not interpret the default missing values (e.g., NA, null, NaN, etc.) as missing values. It will only treat the values specified in the na_values parameter as NaN.
import pandas as pd
# Data to write to the file
data = {
"ID": [1, 2, 3],
"Name": ["Alice", "Bob", "null"],
"Age": [30, "NA", 10],
"Salary": [55000, 48000, 62000],
}
# Create a DataFrame
df_to_write = pd.DataFrame(data)
# Filepath for the tab-separated file
file_path = "data.csv"
# Writing the DataFrame to a tab-separated file
df_to_write.to_csv(file_path, sep="\t", index=False)
# Reading the file back into a DataFrame
df_read = pd.read_csv(filepath_or_buffer=file_path, sep="\t", header=0, keep_default_na=False)
# Display the DataFrame read from the file
print(df_read)
'''
Output:
ID Name Age Salary
0 1 Alice 30 55000
1 2 Bob NA 48000
2 3 null 10 62000
'''
" ", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN", "<NA>", "N/A", "NA", "NULL", "NaN", "None", "n/a", "nan", "null"
If keep_default_na is True, and na_values are specified, na_values is appended to the default NaN values used for parsing.
If keep_default_na is True, and na_values are not specified, only the default NaN values are used for parsing.
If keep_default_na is False, and na_values are specified, only the NaN values specified na_values are used for parsing.
If keep_default_na is False, and na_values are not specified, no strings will be parsed as NaN.
If na_filter is passed in as False, the keep_default_na and na_values parameters will be ignored.
It controls whether pandas should check for missing values (e.g., NaN, NA, null, etc.) during the reading process.
Pandas will scan the entire dataset while reading and try to detect missing values (such as NaN, NA, null, etc.) and convert them into NaN.
import pandas as pd
# Data to write to the file
data = {
"ID": [1, 2, 3],
"Name": ["Alice", "Bob", "null"],
"Age": [30, "NA", 10],
"Salary": [55000, 48000, 62000],
}
# Create a DataFrame
df_to_write = pd.DataFrame(data)
# Filepath for the tab-separated file
file_path = "data.csv"
# Writing the DataFrame to a tab-separated file
df_to_write.to_csv(file_path, sep="\t", index=False)
# Reading the file back into a DataFrame
df_read = pd.read_csv(filepath_or_buffer=file_path, sep="\t", header=0, na_filter=True)
# Display the DataFrame read from the file
print(df_read)
'''
Output:
ID Name Age Salary
0 1 Alice 30.0 55000
1 2 Bob NaN 48000
2 3 NaN 10.0 62000
'''
Pandas will not check for missing values during the reading process.
import pandas as pd
# Data to write to the file
data = {
"ID": [1, 2, 3],
"Name": ["Alice", "Bob", "null"],
"Age": [30, "NA", 10],
"Salary": [55000, 48000, 62000],
}
# Create a DataFrame
df_to_write = pd.DataFrame(data)
# Filepath for the tab-separated file
file_path = "data.csv"
# Writing the DataFrame to a tab-separated file
df_to_write.to_csv(file_path, sep="\t", index=False)
# Reading the file back into a DataFrame
df_read = pd.read_csv(filepath_or_buffer=file_path, sep="\t", header=0, na_filter=False)
# Display the DataFrame read from the file
print(df_read)
'''
Output:
ID Name Age Salary
0 1 Alice 30 55000
1 2 Bob NA 48000
2 3 null 10 62000
'''
" ", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN", "<NA>", "N/A", "NA", "NULL", "NaN", "None", "n/a", "nan", "null"
It controls whether or not detailed information about the parsing process is displayed.
Pandas will not print additional information during the read operation.
import pandas as pd

# Example data
data = """ID, Name, Age
1, Alice, 30
2, Bob, 25
3, Charlie, 35"""
# Write the data to a file
file_path = "verbose_example.csv"
with open(file_path, "w") as f:
    f.write(data)
# Read the data with verbose=False (the default): no parser timing
# information is printed during the read
df = pd.read_csv(file_path, sep=',', verbose=False)
# Display the DataFrame
print(df)
'''
Output:
ID Name Age
0 1 Alice 30
1 2 Bob 25
2 3 Charlie 35
'''
Pandas will print out more information about the parsing process.
import pandas as pd
# Example data
data = """ID, Name, Age
1, Alice, 30
2, Bob, 25
3, Charlie, 35"""
# Write the data to a file
file_path = "verbose_example.csv"
with open(file_path, "w") as f:
f.write(data)
# Read the data with verbose=True
df = pd.read_csv(file_path, sep=',', verbose=True)
# Display the DataFrame
print(df)
'''
Output:
Tokenization took: 0.02 ms
Type conversion took: 0.93 ms
Parser memory cleanup took: 0.01 ms
ID Name Age
0 1 Alice 30
1 2 Bob 25
2 3 Charlie 35
'''
It controls whether or not blank lines should be skipped during the reading of the file.
Blank lines are skipped during the read process.
import pandas as pd

# Example data with blank lines between the records — without them this
# example has nothing for skip_blank_lines to skip
data = """ID, Name, Age
1, Alice, 30

2, Bob, 25

3, Charlie, 35"""
# Write the data to a file
file_path = "skip_blank_lines_example.csv"
with open(file_path, "w") as f:
    f.write(data)
# Reading with skip_blank_lines=True (default): the blank lines are dropped
df = pd.read_csv(file_path, sep=',', skip_blank_lines=True)
print(df)
'''
Output:
ID Name Age
0 1 Alice 30
1 2 Bob 25
2 3 Charlie 35
'''
Blank lines are not skipped. They will be treated as rows of NaN values, which might result in rows with missing data (i.e., NaN for every column).
import pandas as pd

# Example data with blank lines between the records — they are required to
# produce the all-NaN rows shown in the output below
data = """ID, Name, Age
1, Alice, 30

2, Bob, 25

3, Charlie, 35"""
# Write the data to a file
file_path = "skip_blank_lines_example.csv"
with open(file_path, "w") as f:
    f.write(data)
# Reading with skip_blank_lines=False: each blank line becomes a row of NaN
df = pd.read_csv(file_path, sep=',', skip_blank_lines=False)
print(df)
'''
Output:
ID Name Age
0 1.0 Alice 30.0
1 NaN NaN NaN
2 2.0 Bob 25.0
3 NaN NaN NaN
4 3.0 Charlie 35.0
'''
It is used to specify which columns should be parsed as dates during the reading of the file.
Pandas will not attempt to parse any columns as dates. If any date strings are present, they will be read as plain text.
import pandas as pd
# Example data with date-like strings
data = """ID,Name,BirthDate
1,Alice,2003-10-02
2,Bob,2003-04-16
3,Charlie,2023-12-29"""
# Write the data to a file
file_path = "parse_dates_example.csv"
with open(file_path, "w") as f:
f.write(data)
# Reading with parse_dates=False
df = pd.read_csv(file_path, sep=',', header=0, parse_dates=False)
print(df['BirthDate'])
'''
Output:
0 2003-10-02
1 2003-04-16
2 2023-12-29
Name: BirthDate, dtype: object
'''
Passing a list of column names (e.g. parse_dates=['BirthDate']) tells pandas to parse those columns into datetime objects; parse_dates=True, by contrast, only attempts to parse the index column.
import pandas as pd

# Example data with date-like strings
data = """ID,Name,BirthDate
1,Alice,2003-10-02
2,Bob,2003-04-16
3,Charlie,2023-12-29"""
# Write the data to a file
file_path = "parse_dates_example.csv"
with open(file_path, "w") as f:
    f.write(data)
# Reading with parse_dates=['BirthDate']: only the listed column is parsed
# into datetime64 values (parse_dates=True would only try to parse the index)
df = pd.read_csv(file_path, sep=',', header=0, parse_dates=['BirthDate'])
print(df['BirthDate'])
'''
Output:
0 2003-10-02
1 2003-04-16
2 2023-12-29
Name: BirthDate, dtype: datetime64[ns]
'''
Deprecated since version 2.0.0: A strict version of this argument is now the default, passing it has no effect.
Deprecated since version 2.0.0: A strict version of this argument is now the default, passing it has no effect.
Deprecated since version 2.0.0: A strict version of this argument is now the default, passing it has no effect.
It allows you to specify the input format of the date when reading.
import pandas as pd

# Example data with date-like strings
data = """ID,Name,BirthDate
1,Alice,2003-10-02
2,Bob,2003-04-16
3,Charlie,2023-12-29"""
# Write the data to a file
file_path = "parse_dates_example.csv"
with open(file_path, "w") as f:
    f.write(data)
# Reading with parse_dates=['BirthDate'] plus date_format="%Y-%m-%d":
# the listed column is parsed as dates using the explicit format string
df = pd.read_csv(file_path, sep=',', header=0, parse_dates=["BirthDate"], date_format="%Y-%m-%d")
# Display the parsed column
print(df['BirthDate'])
'''
Output:
0 2003-10-02
1 2003-04-16
2 2023-12-29
Name: BirthDate, dtype: datetime64[ns]
'''
DD/MM format dates, international and European format.
It controls whether pandas uses a cache of unique converted dates while parsing, so repeated date strings are only converted once.
import pandas as pd
# Example data with dates
data = """ID,Name,BirthDate
1,Alice,2003-10-02
2,Bob,2003-04-16
3,Charlie,2023-12-29"""
# Write the data to a file
file_path = "example_cache_dates.csv"
with open(file_path, "w") as f:
f.write(data)
# Read the data with cache_dates
dff = pd.read_csv(file_path, sep=',', parse_dates=["BirthDate"], cache_dates=False)
dft = pd.read_csv(file_path, sep=',', parse_dates=["BirthDate"], cache_dates=True)
print('cache_date=False',dff['BirthDate'],'cache_date=True',dft['BirthDate'],sep='\n')
'''
Output:
cache_date=False
0 2003-10-02
1 2003-04-16
2 2023-12-29
Name: BirthDate, dtype: datetime64[ns]
cache_date=True
0 2003-10-02
1 2003-04-16
2 2023-12-29
Name: BirthDate, dtype: datetime64[ns]
'''
When cache_dates=True: Pandas will store the parsed date results in memory. This can make date parsing faster in some cases, especially when the same date is encountered repeatedly in the dataset.
When cache_dates=False: Pandas will not cache the parsed dates. Each date will be parsed from scratch as it is encountered.
It is used to specify which columns should be parsed as dates during the reading of the file.
It allows you to return an iterator (a type of object that allows you to iterate over the data incrementally) instead of loading the entire dataset into memory all at once.
import pandas as pd
# Example data with dates
data = """ID,Name,BirthDate
1,Alice,2020-01-01
2,Bob,2021-02-28
3,Charlie,2023-03-15
4,David,2019-11-12
5,Eve,2022-08-09"""
# Write the data to a file
file_path = "example_ite.csv"
with open(file_path, "w") as f:
f.write(data)
# Read the data
df = pd.read_csv(file_path, sep=',', iterator=False)
print(df)
'''
Output:
ID Name BirthDate
0 1 Alice 2020-01-01
1 2 Bob 2021-02-28
2 3 Charlie 2023-03-15
3 4 David 2019-11-12
4 5 Eve 2022-08-09
'''
Returns an iterator object that can be used to read the data in chunks, which is useful for processing large datasets.
import pandas as pd
# Example data with dates
data = """ID,Name,BirthDate
1,Alice,2020-01-01
2,Bob,2021-02-28
3,Charlie,2023-03-15
4,David,2019-11-12
5,Eve,2022-08-09"""
# Write the data to a file
file_path = "example_ite.csv"
with open(file_path, "w") as f:
f.write(data)
# Read the data
df = pd.read_csv(file_path, sep=',', iterator=True)
# Read first 2
print(df.get_chunk(2))
print()
# Read the *next* 2
print(df.get_chunk(2))
'''
Output:
ID Name BirthDate
0 1 Alice 2020-01-01
1 2 Bob 2021-02-28
ID Name BirthDate
2 3 Charlie 2023-03-15
3 4 David 2019-11-12
'''
Number of lines to read from the file per chunk.
import pandas as pd

# Example data with dates
data = """ID,Name,BirthDate
1,Alice,2020-01-01
2,Bob,2021-02-28
3,Charlie,2023-03-15
4,David,2019-11-12
5,Eve,2022-08-09"""
# Write the data to a file
file_path = "example_ite.csv"
with open(file_path, "w") as f:
    f.write(data)
# With chunksize set, read_csv returns a TextFileReader (an iterator over
# DataFrames) instead of a single DataFrame
df = pd.read_csv(file_path, sep=',', chunksize=3)
# get_chunk(2) overrides the default chunk size and reads the first 2 rows
print(df.get_chunk(2))
# Read the *next* 2 rows
print(df.get_chunk(2))
'''
Output:
ID Name BirthDate
0 1 Alice 2020-01-01
1 2 Bob 2021-02-28
ID Name BirthDate
2 3 Charlie 2023-03-15
3 4 David 2019-11-12
'''
It specifies the type of compression used for reading a file.
import pandas as pd
# Example data
data = {
"ID": [1, 2, 3],
"Name": ["Alice", "Bob", "Charlie"],
"Age": [30, 25, 35]
}
# Create a DataFrame
df = pd.DataFrame(data)
# Write the DataFrame to a GZIP-compressed file
file_path = "example_compressed.gz"
df.to_csv(file_path, index=False, compression='gzip')
# Reading the GZIP compressed file directly using pandas
df_gzip = pd.read_csv(file_path, sep=',', compression='gzip')
print(df_gzip)
'''
Output:
ID Name Age
0 1 Alice 30
1 2 Bob 25
2 3 Charlie 35
'''
Character acting as the thousands separator in numerical values.
import pandas as pd
# Example data with thousands separators (dot)
data = """ID,Amount
1,1,000
2,2,500.000
3,3,700"""
# Write the data to a file
file_path = "example_thousands.csv"
with open(file_path, "w") as f:
f.write(data)
# Read the file and specify the thousands separator (dot)
df = pd.read_csv(file_path, sep=',', thousands='.')
print(df['Amount'])
'''
Output:
1 0
2 500000
3 700
Name: Amount, dtype: int64
'''
delimiter or sep cannot have the same value as thousands.
Character to recognize as decimal point (e.g., use ‘,’ for European data).
import pandas as pd
# Example data with decimal separators (|)
data = """ID,Amount
1,1,000|9
2,2,500|70
3,3,700"""
# Write the data to a file
file_path = "example_dec.csv"
with open(file_path, "w") as f:
f.write(data)
# Read the file
df = pd.read_csv(file_path, sep=',', decimal='|')
print(df['Amount'])
'''
1 0.9
2 500.7
3 700.0
Name: Amount, dtype: float64
'''
delimiter or sep cannot have the same value as decimal.
It specifies the character or string used to terminate each line in the input file. Default \n
import pandas as pd

# Example data using a tab (\t) as the line terminator instead of newline
data = "ID,Name,Age\t1,Alice,30\t2,Bob,25\t3,Charlie,35"
# Write the data to a file (rows are separated by \t, not \n)
file_path = "example_lineterminator.csv"
with open(file_path, "w") as f:
    f.write(data)
# Read the file, telling pandas that each row ends at a \t
df = pd.read_csv(file_path, sep=',', lineterminator="\t")
print(df)
'''
Output:
ID Name Age
0 1 Alice 30
1 2 Bob 25
2 3 Charlie 35
'''
delimiter or sep is preferred to not have the same value as lineterminator.
It specifies the character used to quote fields in the input file.
import pandas as pd
# Example data with quoted fields
data = """ID,Name,Age
1,"Alice, Smith",30
2,"Bob, Johnson",25
3,"Charlie, Brown",35"""
# Write the data to a file
file_path = "example_quotechar.csv"
with open(file_path, "w") as f:
f.write(data)
# Read the file with quotechar as "
df = pd.read_csv(file_path, sep=',', quotechar='"')
print(df)
'''
Output:
ID Name Age
0 1 Alice, Smith 30
1 2 Bob, Johnson 25
2 3 Charlie, Brown 35
'''
delimiter or sep is preferred to not have the same value as quotechar.
Control field quoting behavior per csv.QUOTE_* constants. Default is csv.QUOTE_MINIMAL (i.e., 0) which implies that only fields containing special characters are quoted (e.g., characters defined in quotechar, delimiter, or lineterminator.
It is used to control how the parser handles quoted data that contains the quote character itself.
If a field contains the quote character, it will be escaped by doubling the quote.
import pandas as pd
import csv
# Example data with quotes inside quoted fields
data = '''ID,Name
1,Alice
2,Bob
3,"Charlie ""The King"""
'''
# Write the data to a file
file_path = "doublequote_example.csv"
with open(file_path, "w") as f:
f.write(data)
# Read the file with doublequote=True (default)
df = pd.read_csv(file_path, quotechar='"', doublequote=True)
print(df)
'''
Output:
ID Name
0 1 Alice
1 2 Bob
2 3 Charlie "The King"
'''
Pandas will not escape the quote character by doubling it.
import pandas as pd
import csv
# Example data with quotes inside quoted fields
data = '''ID,Name
1,Alice
2,Bob
3,"Charlie ""The King"""
'''
# Write the data to a file
file_path = "doublequote_example.csv"
with open(file_path, "w") as f:
f.write(data)
# Read the file with doublequote=False
df = pd.read_csv(file_path, quotechar='"', doublequote=False)
print(df)
'''
Output:
ID Name
0 1 Alice
1 2 Bob
2 3 Charlie "The King"""
'''
It is used to specify a character that will escape special characters in the file.
import pandas as pd
# Example data with escape characters
data = '''ID,Name
1,Alice
2,Bob
3,Charlie\,The King
'''
# Write the data to a file
file_path = "escapechar_example.csv"
with open(file_path, "w") as f:
f.write(data)
# Read the file with escapechar='\'
df = pd.read_csv(file_path, sep=',', escapechar='\\')
print(df)
'''
Output:
ID Name
0 1 Alice
1 2 Bob
2 3 Charlie,The King
'''
delimiter or sep is preferred to not have the same value as escapechar.
It is used to specify a character that indicates the beginning of a comment in the data file.
import pandas as pd
# Sample data whose first line is a comment rather than data.
data = '''# This is a comment line
ID,Name
1,Alice
2,Bob
3,Charlie
'''
file_path = "comment_example.csv"
# Persist the sample to disk.
with open(file_path, "w") as fh:
    fh.write(data)
# comment='#' makes the parser ignore everything from '#' to the end
# of the line, so the leading comment line is skipped entirely.
df = pd.read_csv(file_path, sep=',', comment='#')
print(df)
'''
Output:
ID Name
0 1 Alice
1 2 Bob
2 3 Charlie
'''
It is used to specify the character encoding of the input file.
import pandas as pd
# Sample data containing a non-ASCII character (é).
data = "ID,Name\n1,Alice\n2,Bob\n3,Chloé"
file_path = "encoding_example.csv"
# Persist the sample using ISO-8859-1 (Latin-1) encoding.
with open(file_path, "w", encoding="ISO-8859-1") as fh:
    fh.write(data)
# The file must be read back with the same encoding it was written
# in, otherwise the accented character would be mis-decoded.
df = pd.read_csv(file_path, sep=",", encoding="ISO-8859-1")
print(df)
'''
Output:
ID Name
0 1 Alice
1 2 Bob
2 3 Chloé
'''
It allows you to control how encoding errors are handled during the reading process.
import pandas as pd
# Sample data containing a non-ASCII character (é), written as UTF-8.
data = "ID,Name\n1,Alice\n2,Bob\n3,Chloé"
file_path = "encoding_errors_example.csv"
with open(file_path, "w", encoding="utf-8") as fh:
    fh.write(data)
# Decoding the UTF-8 bytes as ASCII fails under the default 'strict'
# policy, raising UnicodeDecodeError.
try:
    df = pd.read_csv(file_path, sep=",", encoding="ascii", encoding_errors="strict")
except UnicodeDecodeError as e:
    print(e)
# 'ignore' silently drops undecodable bytes; 'replace' substitutes
# each of them with the U+FFFD replacement character.
df_ignore = pd.read_csv(file_path, sep=",", encoding="ascii", encoding_errors="ignore")
df_replace = pd.read_csv(file_path, sep=",", encoding="ascii", encoding_errors="replace")
# Show both results side by side.
print("With encoding_errors='ignore':")
print(df_ignore)
print("\nWith encoding_errors='replace':")
print(df_replace)
'''
Output:
'ascii' codec can't decode byte 0xc3 in position 28: ordinal not in range(128)
With encoding_errors='ignore':
ID Name
0 1 Alice
1 2 Bob
2 3 Chlo
With encoding_errors='replace':
ID Name
0 1 Alice
1 2 Bob
2 3 Chlo��
'''
It is used to specify a predefined set of CSV parsing rules.
import csv
import pandas as pd
# A dialect bundles several parsing options (delimiter, quoting, ...)
# under one reusable name.
csv.register_dialect('my_dialect', delimiter='|', quoting=csv.QUOTE_NONE)
# Persist a pipe-delimited sample to disk.
file_path = 'sample_dialect.csv'
with open(file_path, 'w') as fh:
    fh.write("ID|Name\n1|Alice\n2|Bob\n3|Charlie")
# Passing the dialect name applies all of its settings at once.
df = pd.read_csv(file_path, dialect='my_dialect')
print(df)
'''
Output:
ID Name
0 1 Alice
1 2 Bob
2 3 Charlie
'''
Specifies what to do upon encountering a bad line (a line with too many fields).
Raise an Exception when a bad line is encountered.
import pandas as pd
# Create a sample file with a bad line (line 4 has an extra field).
file_path = 'sample_bad_lines.csv'
with open(file_path, 'w') as f:
    f.write("ID,Name\n1,Alice\n2,Bob\n3,Charlie,ExtraField\n4,Diana")
# With on_bad_lines="error" (the default) a malformed line raises
# pandas.errors.ParserError. Catch it — as the encoding_errors example
# does with UnicodeDecodeError — so the script keeps running instead
# of terminating with an uncaught traceback.
df_err = None
try:
    df_err = pd.read_csv(file_path, sep=",", on_bad_lines="error")
except pd.errors.ParserError as e:
    print("With on_bad_lines='error':")
    print(e)
'''
Output:
With on_bad_lines='error':
Error tokenizing data. C error: Expected 2 fields in line 4, saw 3
'''
Raise a warning when a bad line is encountered and skip that line.
import pandas as pd
# Create a sample file with a bad line (line 4 has an extra field).
file_path = 'sample_bad_lines.csv'
with open(file_path, 'w') as fh:
    fh.write("ID,Name\n1,Alice\n2,Bob\n3,Charlie,ExtraField\n4,Diana")
# on_bad_lines="warn" emits a ParserWarning for the malformed line
# and drops it, keeping the remaining well-formed rows.
df_warn = pd.read_csv(file_path, sep=",", on_bad_lines="warn")
print("With on_bad_lines='warn':")
print(df_warn)
'''
Output:
With on_bad_lines='warn':
ID Name
0 1 Alice
1 2 Bob
2 4 Diana
<ipython-input-301-47d9fc19f966>:8: ParserWarning: Skipping line 4: expected 2 fields, saw 3
'''
Skip bad lines without raising or warning when they are encountered.
import pandas as pd
# Create a sample file with a bad line (line 4 has an extra field).
file_path = 'sample_bad_lines.csv'
with open(file_path, 'w') as fh:
    fh.write("ID,Name\n1,Alice\n2,Bob\n3,Charlie,ExtraField\n4,Diana")
# on_bad_lines="skip" silently drops the malformed line — no error,
# no warning — and keeps the remaining well-formed rows.
df = pd.read_csv(file_path, sep=",", on_bad_lines="skip")
print("With on_bad_lines='skip':")
print(df)
'''
Output:
With on_bad_lines='skip':
ID Name
0 1 Alice
1 2 Bob
2 4 Diana
'''
A user-defined function that processes bad lines. This function receives the bad line as input and can either return a processed version of the line or raise an error.
import pandas as pd
# Create a sample file with a bad line (line 4 has an extra field).
file_path = 'sample_bad_lines.csv'
with open(file_path, 'w') as fh:
    fh.write("ID,Name\n1,Alice\n2,Bob\n3,Charlie,ExtraField\n4,Diana")
def custom_handler(bad_line):
    # Called once per malformed line with its fields as a list.
    # Returning None drops the line; returning a list of the expected
    # length would keep it with those values instead.
    print(f"Processing bad line: {bad_line}")
    return None
# A callable for on_bad_lines requires the python (or pyarrow) engine.
df_custom = pd.read_csv(file_path, sep=",", on_bad_lines=custom_handler, engine='python')
print("With a custom handler:")
print(df_custom)
'''
Output:
Processing bad line: ['3', 'Charlie', 'ExtraField']
With a custom handler:
ID Name
0 1 Alice
1 2 Bob
2 4 Diana
'''
When on_bad_lines is a callable, engine must be 'python' (or 'pyarrow' in pandas 2.2+); the default C engine does not support callables.
It is used to control memory optimization when reading large files.
Pandas will attempt to optimize memory usage by reading the file in chunks.
import pandas as pd
import csv,sys
def generate_data(current_row, max_rows, file_path):
    # Append rows current_row..max_rows ("ID,NameID") to file_path.
    # An iterative loop with a single file handle replaces the original
    # per-row recursion, which required raising the interpreter
    # recursion limit (sys.setrecursionlimit) and reopened the file
    # once for every row.
    with open(file_path, mode='a', newline='', encoding="utf-8") as file:
        writer = csv.writer(file)
        for row_id in range(current_row, max_rows + 1):
            writer.writerow([row_id, f"Name{row_id}"])
# File path for the generated large CSV
file_path = "large_data.csv"
# Write headers to the file first
with open(file_path, mode='w', newline='', encoding="utf-8") as file:
    csv.writer(file).writerow(["ID", "Name"])
# Generate large data (10000 rows for this example)
generate_data(1, 10000, file_path)
# Read with low_memory=True (the default): pandas processes the file
# in internal chunks, which lowers peak memory but may force mixed
# dtype inference on large heterogeneous columns.
df_low_memory = pd.read_csv(file_path, sep=",", low_memory=True)
# Display the first few rows
print(df_low_memory.head())
'''
Output:
ID Name
0 1 Name1
1 2 Name2
2 3 Name3
3 4 Name4
4 5 Name5
'''
Pandas will read the entire file in one go, which might be more memory-intensive but will ensure that column types are inferred accurately.
import pandas as pd
import csv
def generate_data(current_row, max_rows, file_path):
    # Append rows current_row..max_rows ("ID,NameID") to file_path.
    # The original recursed once per row WITHOUT raising the recursion
    # limit, so 10000 rows exceeded Python's default limit (~1000) and
    # crashed with RecursionError; an iterative loop fixes that and
    # also opens the file only once.
    with open(file_path, mode='a', newline='', encoding="utf-8") as file:
        writer = csv.writer(file)
        for row_id in range(current_row, max_rows + 1):
            writer.writerow([row_id, f"Name{row_id}"])
# File path for the generated large CSV
file_path = "large_data.csv"
# Write headers to the file first
with open(file_path, mode='w', newline='', encoding="utf-8") as file:
    csv.writer(file).writerow(["ID", "Name"])
# Generate large data (10000 rows for this example)
generate_data(1, 10000, file_path)
# Read with low_memory=False: the whole file is processed in one pass,
# costing more memory but guaranteeing consistent dtype inference.
df = pd.read_csv(file_path, sep=",", low_memory=False)
# Display the first few rows
print(df.head())
'''
Output:
ID Name
0 1 Name1
1 2 Name2
2 3 Name3
3 4 Name4
4 5 Name5
'''
It allows you to use memory-mapped file access to read a file. Memory-mapped files enable efficient handling of large files by using virtual memory instead of directly reading the file into RAM.
It reads the file in the normal way, loading it into memory completely.
import pandas as pd
import csv,sys
def generate_data(current_row, max_rows, file_path):
    # Append rows current_row..max_rows ("ID,NameID") to file_path.
    # An iterative loop with a single file handle replaces the original
    # per-row recursion, which required raising the interpreter
    # recursion limit (sys.setrecursionlimit) and reopened the file
    # once for every row.
    with open(file_path, mode='a', newline='', encoding="utf-8") as file:
        writer = csv.writer(file)
        for row_id in range(current_row, max_rows + 1):
            writer.writerow([row_id, f"Name{row_id}"])
# File path for the generated large CSV
file_path = "large_data.csv"
# Write headers to the file first
with open(file_path, mode='w', newline='', encoding="utf-8") as file:
    csv.writer(file).writerow(["ID", "Name"])
# Generate large data (10000 rows for this example)
generate_data(1, 10000, file_path)
# memory_map=False (the default): the file is read through normal
# buffered I/O and loaded into memory.
df_mmap = pd.read_csv(file_path, memory_map=False)
print(df_mmap.head())
'''
Output:
ID Name
0 1 Name1
1 2 Name2
2 3 Name3
3 4 Name4
4 5 Name5
'''
It enables memory-mapped file reading, which uses the operating system's virtual memory to map the contents of the file directly into memory, improving performance for very large files.
import pandas as pd
import csv,sys
def generate_data(current_row, max_rows, file_path):
    # Append rows current_row..max_rows ("ID,NameID") to file_path.
    # An iterative loop with a single file handle replaces the original
    # per-row recursion, which required raising the interpreter
    # recursion limit (sys.setrecursionlimit) and reopened the file
    # once for every row.
    with open(file_path, mode='a', newline='', encoding="utf-8") as file:
        writer = csv.writer(file)
        for row_id in range(current_row, max_rows + 1):
            writer.writerow([row_id, f"Name{row_id}"])
# File path for the generated large CSV
file_path = "large_data.csv"
# Write headers to the file first
with open(file_path, mode='w', newline='', encoding="utf-8") as file:
    csv.writer(file).writerow(["ID", "Name"])
# Generate large data (10000 rows for this example)
generate_data(1, 10000, file_path)
# memory_map=True maps the file into virtual memory so the parser
# reads it without an extra I/O copy — useful for very large files.
df_mmap = pd.read_csv(file_path, memory_map=True)
print(df_mmap.head())
'''
Output:
ID Name
0 1 Name1
1 2 Name2
2 3 Name3
3 4 Name4
4 5 Name5
'''
It is used to specify which converter the C engine should use for floating-point values: 'legacy' (the original lower-precision converter), 'high', or 'round_trip'.
import pandas as pd
# Sample file whose values carry far more digits than a float holds.
data = """ID,Value
1,3.141592653589793238462643383279502884197169399375105820974944
2,2.718281828459045235360287471352662497757247093699959574966967
3,1.618033988749894848204586834365638117900284550292151098084
"""
file_path = "high_precision_data.csv"
# Persist the sample to disk.
with open(file_path, "w") as fh:
    fh.write(data)
# 'legacy' selects the original, lower-precision converter.
df_legacy = pd.read_csv(file_path, sep=",", float_precision="legacy")
print("With float_precision='legacy':")
print(f"Low: {df_legacy['Value'][0]:.30f}")
# 'high' selects the higher-precision converter.
df_high = pd.read_csv(file_path, sep=",", float_precision="high")
print("\nWith float_precision='high':")
print(f"High: {df_high['Value'][0]:.30f}")
# 'round_trip' guarantees the parsed float reproduces the text exactly
# when written back out.
df_round = pd.read_csv(file_path, sep=",", float_precision="round_trip")
print("\nWith float_precision='round_trip':")
print(f"Round Trip: {df_round['Value'][0]:.30f}")
'''
Output:
With float_precision='legacy':
Low: 3.141592653589792227819543768419
With float_precision='high':
High: 3.141592653589792671908753618482
With float_precision='round_trip':
Round Trip: 3.141592653589793115997963468544
'''
Dictionary of storage-specific options, such as credentials for cloud storage.
The dtype_backend parameter, new in pandas 2.0, specifies which backend ('numpy_nullable' or 'pyarrow') supplies the data types of the resulting DataFrame when reading a file.