Sheet name or index to parse; default is 0 (first sheet).
import pandas as pd
from openpyxl import Workbook
# Create an Excel file with multiple sheets
file_path = "example.xlsx"
# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "Salary"])
sheet1.append([1, "Alice", "2000"])
sheet1.append([2, "Bob", "10000"])
sheet1.append([2, "Charlie", "2000"])
# Save the workbook
wb.save(file_path)
# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1')
print(df)
'''
Output:
ID Name Salary
0 1 Alice 2000
1 2 Bob 10000
2 2 Charlie 2000
'''
Row number(s) to use as the column names.
import pandas as pd
from openpyxl import Workbook
# Create an Excel file with multiple sheets
file_path = "example.xlsx"
# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "Salary"])
sheet1.append([1, "Alice", "2000"])
sheet1.append([2, "Bob", "10000"])
sheet1.append([2, "Charlie", "2000"])
# Save the workbook
wb.save(file_path)
# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', header=0)
print(df)
'''
Output:
ID Name Salary
0 1 Alice 2000
1 2 Bob 10000
2 2 Charlie 2000
'''
Custom column names to use.
import pandas as pd
from openpyxl import Workbook
# Create an Excel file with multiple sheets
file_path = "example.xlsx"
# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "Salary"])
sheet1.append([1, "Alice", "2000"])
sheet1.append([2, "Bob", "10000"])
sheet1.append([2, "Charlie", "2000"])
# Save the workbook
wb.save(file_path)
# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', names=["EmpID", "FName", "Kids"])
print(df)
'''
Output:
EmpID FName Kids
0 1 Alice 2000
1 2 Bob 10000
2 2 Charlie 2000
'''
Overrides existing headers for columns.
It specifies which column should be used as the index of the resulting DataFrame.
import pandas as pd
from openpyxl import Workbook
# Create an Excel file with multiple sheets
file_path = "example.xlsx"
# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "Salary"])
sheet1.append([1, "Alice", "2000"])
sheet1.append([2, "Bob", "10000"])
sheet1.append([2, "Charlie", "2000"])
# Save the workbook
wb.save(file_path)
# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', index_col='ID')
print(df)
'''
Output:
Name Salary
ID
1 Alice 2000
2 Bob 10000
2 Charlie 2000
'''
It allows you to select specific columns to read from the input file, rather than reading all columns.
import pandas as pd
from openpyxl import Workbook
# Create an Excel file with multiple sheets
file_path = "example.xlsx"
# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "Salary"])
sheet1.append([1, "Alice", "2000"])
sheet1.append([2, "Bob", "10000"])
sheet1.append([2, "Charlie", "2000"])
# Save the workbook
wb.save(file_path)
# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', usecols=["ID", "Name"])
print(df)
'''
Output:
ID Name
0 1 Alice
1 2 Bob
2 2 Charlie
'''
It allows you to specify custom functions to convert or process values in certain columns while reading the file.
import pandas as pd
from openpyxl import Workbook
# Create an Excel file with multiple sheets
file_path = "example.xlsx"
# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "Age"])
sheet1.append([1, "Alice", 20])
sheet1.append([2, "Bob", 30])
sheet1.append([2, "Charlie", 10])
# Save the workbook
wb.save(file_path)
def half_the_age(age):
return int(age)/2
# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', converters={"Age": half_the_age})
print(df)
'''
Output:
ID Name Age
0 1 Alice 10.0
1 2 Bob 15.0
2 2 Charlie 5.0
'''
It is used to specify which string values should be interpreted as boolean True.
import pandas as pd
from openpyxl import Workbook
# Create an Excel file with multiple sheets
file_path = "example.xlsx"
# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "Salary", "isActive"])
sheet1.append([1, "Alice", "2000", "yes"])
sheet1.append([2, "Bob", "10000", "maybe"])
sheet1.append([2, "Charlie", "2000", "no"])
# Save the workbook
wb.save(file_path)
# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', true_values=["yes", "maybe"], false_values=["no"])
print(df)
'''
Output:
ID Name Salary isActive
0 1 Alice 2000 True
1 2 Bob 10000 True
2 2 Charlie 2000 False
'''
For true_values or false_values to work as intended, both of them must contain either of the values in a column.
It is used to specify which string values should be interpreted as boolean False.
import pandas as pd
from openpyxl import Workbook
# Create an Excel file with multiple sheets
file_path = "example.xlsx"
# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "Salary", "isActive"])
sheet1.append([1, "Alice", "2000", "yes"])
sheet1.append([2, "Bob", "10000", "maybe"])
sheet1.append([2, "Charlie", "2000", "no"])
# Save the workbook
wb.save(file_path)
# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', true_values=["yes"], false_values=["no", "maybe"])
print(df)
'''
Output:
ID Name Salary isActive
0 1 Alice 2000 True
1 2 Bob 10000 False
2 2 Charlie 2000 False
'''
For true_values or false_values to work as intended, both of them must contain either of the values in a column.
It is used to skip a specified number of rows from the start of the file or from a list of specific row indices.
import pandas as pd
from openpyxl import Workbook
# Create an Excel file with multiple sheets
file_path = "example.xlsx"
# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["random", "random", "random"])
sheet1.append(["ID", "Name", "Salary"])
sheet1.append([1, "Alice", "2000"])
sheet1.append([2, "Bob", "10000"])
sheet1.append([2, "Charlie", "2000"])
# Save the workbook
wb.save(file_path)
# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', skiprows=1)
print(df)
'''
Output:
ID Name Salary
0 1 Alice 2000
1 2 Bob 10000
2 2 Charlie 2000
'''
It is used to limit the number of rows that are read from the file.
import pandas as pd
from openpyxl import Workbook
# Create an Excel file with multiple sheets
file_path = "example.xlsx"
# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "Salary"])
sheet1.append([1, "Alice", "2000"])
sheet1.append([2, "Bob", "10000"])
sheet1.append([2, "Charlie", "2000"])
# Save the workbook
wb.save(file_path)
# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', nrows=2)
print(df)
'''
Output:
ID Name Salary
0 1 Alice 2000
1 2 Bob 10000
'''
It is used to specify additional values that should be treated as NaN (Not a Number) while reading the data.
import pandas as pd
from openpyxl import Workbook
# Create an Excel file with multiple sheets
file_path = "example.xlsx"
# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "Salary"])
sheet1.append([1, "Alice", "na"])
sheet1.append([2, "missing", "10000"])
sheet1.append([2, "Charlie", "2000"])
# Save the workbook
wb.save(file_path)
# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', na_values=["missing", "na"])
print(df)
'''
Output:
ID Name Salary
0 1 Alice NaN
1 2 NaN 10000.0
2 2 Charlie 2000.0
'''
" ", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN", "<NA>", "N/A", "NA", "NULL", "NaN", "None", "n/a", "nan", "null"
It is used to specify which columns should be parsed as dates during the reading of the file.
Pandas will not attempt to parse any columns as dates. If any date strings are present, they will be read as plain text.
import pandas as pd
from openpyxl import Workbook
# Create an Excel file with multiple sheets
file_path = "example.xlsx"
# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "BirthDate"])
sheet1.append([1, "Alice", "2003-10-02"])
sheet1.append([2, "Bob", "2003-04-16"])
sheet1.append([2, "Charlie", "2023-12-29"])
# Save the workbook
wb.save(file_path)
# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', parse_dates=False)
print(df['BirthDate'])
'''
Output:
0 2003-10-02
1 2003-04-16
2 2023-12-29
Name: BirthDate, dtype: object
'''
Pandas will attempt to parse all columns with date-like values (e.g., strings in the format "YYYY-MM-DD") into datetime objects.
import pandas as pd
from openpyxl import Workbook
# Create an Excel file with multiple sheets
file_path = "example.xlsx"
# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "BirthDate"])
sheet1.append([1, "Alice", "2003-10-02"])
sheet1.append([2, "Bob", "2003-04-16"])
sheet1.append([2, "Charlie", "2023-12-29"])
# Save the workbook
wb.save(file_path)
# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', parse_dates=['BirthDate'])
print(df['BirthDate'])
'''
Output:
0 2003-10-02
1 2003-04-16
2 2023-12-29
Name: BirthDate, dtype: datetime64[ns]
'''
Deprecated since version 2.0.0: A strict version of this argument is now the default, passing it has no effect.
It allows you specify the input format of the date when reading.
import pandas as pd
from openpyxl import Workbook
# Create an Excel file with multiple sheets
file_path = "example.xlsx"
# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "BirthDate"])
sheet1.append([1, "Alice", "2003-10-02"])
sheet1.append([2, "Bob", "2003-04-16"])
sheet1.append([2, "Charlie", "2023-12-29"])
# Save the workbook
wb.save(file_path)
# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', parse_dates=['BirthDate'], date_format='%Y-%m-%d')
print(df['BirthDate'])
'''
Output:
0 2003-10-02
1 2003-04-16
2 2023-12-29
Name: BirthDate, dtype: datetime64[ns]
'''
Character acting as the thousands separator in numerical values.
import pandas as pd
from openpyxl import Workbook
# Create an Excel file with multiple sheets
file_path = "example.xlsx"
# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "Salary"])
sheet1.append([1, "Alice", "2|000"])
sheet1.append([2, "Bob", "10|000"])
sheet1.append([2, "Charlie", "200"])
# Save the workbook
wb.save(file_path)
# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', thousands='|')
print(df)
'''
Output:
ID Name Salary
0 1 Alice 2000
1 2 Bob 10000
2 2 Charlie 200
'''
It is used to specify a character that indicates the beginning of a comment in the data file.
import pandas as pd
from openpyxl import Workbook
# Create an Excel file with multiple sheets
file_path = "example.xlsx"
# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "Salary"])
sheet1.append([1, "Alice", "2000 #commentttt hehe"])
sheet1.append([2, "Bob", "10050"])
sheet1.append([2, "Charlie", "232564"])
# Save the workbook
wb.save(file_path)
# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', comment='#')
print(df)
'''
Output:
ID Name Salary
0 1 Alice 2000
1 2 Bob 10050
2 2 Charlie 232564
'''
It allows you to skip a specified number of rows from the end of the file when reading the data.
import pandas as pd
from openpyxl import Workbook
# Create an Excel file with multiple sheets
file_path = "example.xlsx"
# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "Salary"])
sheet1.append([1, "Alice", "2000"])
sheet1.append([2, "Bob", "10050"])
sheet1.append([2, "Charlie", "232564"])
sheet1.append(["random", "random", "random"])
# Save the workbook
wb.save(file_path)
# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', skipfooter=1)
print(df)
'''
Output:
ID Name Salary
0 1 Alice 2000
1 2 Bob 10050
2 2 Charlie 232564
'''
The dtype_backend parameter is new in Pandas 2.0 which is used to specify the backend for handling the types of data when reading a file.
This parameter allows you to pass additional keyword arguments that are forwarded to the underlying pd.read_excel method.
import pandas as pd
from openpyxl import Workbook
# Create an Excel file with multiple sheets
file_path = "example.xlsx"
# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "Salary"])
sheet1.append([1, "Alice", "20.00"])
sheet1.append([2, "Bob", "100.50"])
sheet1.append([2, "Charlie", "2325.64"])
# Save the workbook
wb.save(file_path)
# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', decimal='.')
print(df)
'''
Output:
ID Name Salary
0 1 Alice 20.00
1 2 Bob 100.50
2 2 Charlie 2325.64
'''