BetterDocs
Home
Docs

Creation | pd.ExcelFile.parse()

Method:

pd.ExcelFile.parse(sheet_name=0, header=0, names=None, index_col=None, usecols=None, converters=None, true_values=None, false_values=None, skiprows=None, nrows=None, na_values=None, parse_dates=False, date_parser=<no_default>, date_format=None, thousands=None, comment=None, skipfooter=0, dtype_backend=<no_default>, **kwds)

Parses a specific sheet from an Excel file into a DataFrame.

Returns:

pandas.core.frame.DataFrame

Parameters:

sheet_name: (str or int)-

Sheet name or index to parse; default is 0 (first sheet).

import pandas as pd
from openpyxl import Workbook

# Create an Excel file with multiple sheets
file_path = "example.xlsx"

# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "Salary"])
sheet1.append([1, "Alice", "2000"])
sheet1.append([2, "Bob", "10000"])
sheet1.append([2, "Charlie", "2000"])

# Save the workbook
wb.save(file_path)

# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1')
print(df)
'''
Output:
   ID     Name  Salary
0   1    Alice    2000
1   2      Bob   10000
2   2  Charlie    2000
'''

names: array-like, Optional-

Custom column names to use.

import pandas as pd
from openpyxl import Workbook

# Create an Excel file with multiple sheets
file_path = "example.xlsx"

# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "Salary"])
sheet1.append([1, "Alice", "2000"])
sheet1.append([2, "Bob", "10000"])
sheet1.append([2, "Charlie", "2000"])

# Save the workbook
wb.save(file_path)

# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', names=["EmpID", "FName", "Kids"])
print(df)
'''
Output:
   EmpID    FName   Kids
0      1    Alice   2000
1      2      Bob  10000
2      2  Charlie   2000
'''

Overrides existing headers for columns.

index_col: None, Optional-

It specifies which column should be used as the index of the resulting DataFrame.

import pandas as pd
from openpyxl import Workbook

# Create an Excel file with multiple sheets
file_path = "example.xlsx"

# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "Salary"])
sheet1.append([1, "Alice", "2000"])
sheet1.append([2, "Bob", "10000"])
sheet1.append([2, "Charlie", "2000"])

# Save the workbook
wb.save(file_path)

# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', index_col='ID')
print(df)
'''
Output:
       Name  Salary
ID                 
1     Alice    2000
2       Bob   10000
2   Charlie    2000
'''

usecols: None, Optional-

It allows you to select specific columns to read from the input file, rather than reading all columns.

import pandas as pd
from openpyxl import Workbook

# Create an Excel file with multiple sheets
file_path = "example.xlsx"

# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "Salary"])
sheet1.append([1, "Alice", "2000"])
sheet1.append([2, "Bob", "10000"])
sheet1.append([2, "Charlie", "2000"])

# Save the workbook
wb.save(file_path)

# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', usecols=["ID", "Name"])
print(df)
'''
Output:
   ID     Name
0   1    Alice
1   2      Bob
2   2  Charlie
'''

converters: None, Optional-

It allows you to specify custom functions to convert or process values in certain columns while reading the file.

import pandas as pd
from openpyxl import Workbook

# Create an Excel file with multiple sheets
file_path = "example.xlsx"

# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "Age"])
sheet1.append([1, "Alice", 20])
sheet1.append([2, "Bob", 30])
sheet1.append([2, "Charlie", 10])

# Save the workbook
wb.save(file_path)

def half_the_age(age):
    return int(age)/2

# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', converters={"Age": half_the_age})
print(df)
'''
Output:
   ID     Name   Age
0   1    Alice  10.0
1   2      Bob  15.0
2   2  Charlie   5.0
'''

true_values: None, Optional-

It is used to specify which string values should be interpreted as boolean True.

import pandas as pd
from openpyxl import Workbook

# Create an Excel file with multiple sheets
file_path = "example.xlsx"

# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "Salary", "isActive"])
sheet1.append([1, "Alice", "2000", "yes"])
sheet1.append([2, "Bob", "10000", "maybe"])
sheet1.append([2, "Charlie", "2000", "no"])

# Save the workbook
wb.save(file_path)

# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', true_values=["yes", "maybe"], false_values=["no"])
print(df)
'''
Output:
   ID     Name  Salary  isActive
0   1    Alice    2000      True
1   2      Bob   10000      True
2   2  Charlie    2000     False
'''

For true_values or false_values to work as intended, both of them must contain either of the values in a column.

false_values: None, Optional-

It is used to specify which string values should be interpreted as boolean False.

import pandas as pd
from openpyxl import Workbook

# Create an Excel file with multiple sheets
file_path = "example.xlsx"

# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "Salary", "isActive"])
sheet1.append([1, "Alice", "2000", "yes"])
sheet1.append([2, "Bob", "10000", "maybe"])
sheet1.append([2, "Charlie", "2000", "no"])

# Save the workbook
wb.save(file_path)

# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', true_values=["yes"], false_values=["no", "maybe"])
print(df)
'''
Output:
   ID     Name  Salary  isActive
0   1    Alice    2000      True
1   2      Bob   10000     False
2   2  Charlie    2000     False
'''

For true_values or false_values to work as intended, both of them must contain either of the values in a column.

skiprows: None, Optional-

It is used to skip a specified number of rows from the start of the file or from a list of specific row indices.

import pandas as pd
from openpyxl import Workbook

# Create an Excel file with multiple sheets
file_path = "example.xlsx"

# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["random", "random", "random"])
sheet1.append(["ID", "Name", "Salary"])
sheet1.append([1, "Alice", "2000"])
sheet1.append([2, "Bob", "10000"])
sheet1.append([2, "Charlie", "2000"])

# Save the workbook
wb.save(file_path)

# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', skiprows=1)
print(df)
'''
Output:
   ID     Name  Salary
0   1    Alice    2000
1   2      Bob   10000
2   2  Charlie    2000
'''

nrows: None, Optional-

It is used to limit the number of rows that are read from the file.

import pandas as pd
from openpyxl import Workbook

# Create an Excel file with multiple sheets
file_path = "example.xlsx"

# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "Salary"])
sheet1.append([1, "Alice", "2000"])
sheet1.append([2, "Bob", "10000"])
sheet1.append([2, "Charlie", "2000"])

# Save the workbook
wb.save(file_path)

# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', nrows=2)
print(df)
'''
Output:
   ID     Name  Salary
0   1    Alice    2000
1   2      Bob   10000
'''

na_values: None, Optional-

It is used to specify additional values that should be treated as NaN (Not a Number) while reading the data.

import pandas as pd
from openpyxl import Workbook

# Create an Excel file with multiple sheets
file_path = "example.xlsx"

# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "Salary"])
sheet1.append([1, "Alice", "na"])
sheet1.append([2, "missing", "10000"])
sheet1.append([2, "Charlie", "2000"])

# Save the workbook
wb.save(file_path)

# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', na_values=["missing", "na"])
print(df)
'''
Output:
   ID     Name   Salary
0   1    Alice      NaN
1   2      NaN  10000.0
2   2  Charlie   2000.0
'''

Values: +

parse_dates: (True or False), Optional-

It is used to specify which columns should be parsed as dates during the reading of the file.

parse_dates = False (default) +

parse_dates = True +

date_parser: (True or False), Optional-

Deprecated since version 2.0.0: A strict version of this argument is now the default, passing it has no effect.

date_format: None, Optional-

It allows you specify the input format of the date when reading.

import pandas as pd
from openpyxl import Workbook

# Create an Excel file with multiple sheets
file_path = "example.xlsx"

# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "BirthDate"])
sheet1.append([1, "Alice", "2003-10-02"])
sheet1.append([2, "Bob", "2003-04-16"])
sheet1.append([2, "Charlie", "2023-12-29"])

# Save the workbook
wb.save(file_path)

# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', parse_dates=['BirthDate'], date_format='%Y-%m-%d')
print(df['BirthDate'])
'''
Output:
0   2003-10-02
1   2003-04-16
2   2023-12-29
Name: BirthDate, dtype: datetime64[ns]
'''

thousands: single char, Optional-

Character acting as the thousands separator in numerical values.

import pandas as pd
from openpyxl import Workbook

# Create an Excel file with multiple sheets
file_path = "example.xlsx"

# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "Salary"])
sheet1.append([1, "Alice", "2|000"])
sheet1.append([2, "Bob", "10|000"])
sheet1.append([2, "Charlie", "200"])

# Save the workbook
wb.save(file_path)

# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', thousands='|')
print(df)
'''
Output:
   ID     Name  Salary
0   1    Alice    2000
1   2      Bob   10000
2   2  Charlie     200
'''

comment: single char, Optional-

It is used to specify a character that indicates the beginning of a comment in the data file.

import pandas as pd
from openpyxl import Workbook

# Create an Excel file with multiple sheets
file_path = "example.xlsx"

# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "Salary"])
sheet1.append([1, "Alice", "2000 #commentttt hehe"])
sheet1.append([2, "Bob", "10050"])
sheet1.append([2, "Charlie", "232564"])

# Save the workbook
wb.save(file_path)

# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', comment='#')
print(df)
'''
Output:
   ID     Name  Salary
0   1    Alice    2000
1   2      Bob   10050
2   2  Charlie  232564
'''

skipfooter: None, Optional-

It allows you to skip a specified number of rows from the end of the file when reading the data.

import pandas as pd
from openpyxl import Workbook

# Create an Excel file with multiple sheets
file_path = "example.xlsx"

# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "Salary"])
sheet1.append([1, "Alice", "2000"])
sheet1.append([2, "Bob", "10050"])
sheet1.append([2, "Charlie", "232564"])
sheet1.append(["random", "random", "random"])

# Save the workbook
wb.save(file_path)

# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', skipfooter=1)
print(df)
'''
Output:
   ID     Name  Salary
0   1    Alice    2000
1   2      Bob   10050
2   2  Charlie  232564
'''

dtype_backend: None, Optional-

The dtype_backend parameter is new in Pandas 2.0 which is used to specify the backend for handling the types of data when reading a file.

**kwds Optional-

This parameter allows you to pass additional keyword arguments that are forwarded to the underlying pd.read_excel  method.

import pandas as pd
from openpyxl import Workbook

# Create an Excel file with multiple sheets
file_path = "example.xlsx"

# Create a workbook and add data
wb = Workbook()
sheet1 = wb.active
sheet1.title = "Sheet1"
sheet1.append(["ID", "Name", "Salary"])
sheet1.append([1, "Alice", "20.00"])
sheet1.append([2, "Bob", "100.50"])
sheet1.append([2, "Charlie", "2325.64"])

# Save the workbook
wb.save(file_path)

# Read the sheets
df = pd.ExcelFile(path_or_buffer=file_path).parse(sheet_name='Sheet1', decimal='.')
print(df)
'''
Output:
   ID     Name   Salary
0   1    Alice    20.00
1   2      Bob   100.50
2   2  Charlie  2325.64
'''

Logo

BetterDocs

Support

EmailDiscordForms

Documentations

Python

Company

AboutDocs

Policies

Terms of ServicePrivacy Policy