Creation | pd.read_hdf()

Previous Next

Method:

pd.read_hdf(path_or_buf, key=None, mode='r', errors='strict', where=None, start=None, stop=None, columns=None, iterator=False, chunksize=None, **kwargs)

Reads data from an HDF5 file into a DataFrame.

Returns:

pandas.core.frame.DataFrame

Parameters:

path_or_buf: (str or path)-

Any valid string path is acceptable. Only the local file system is supported; remote URLs and file-like objects are not supported.

import pandas as pd
import requests

# URL to the HDF5 file
url = "https://betterdocs.tech/global/python/pandas/data_large.h5"

# Download the file
response = requests.get(url)
with open("data.h5", "wb") as f:
    f.write(response.content)

# Read the HDF5 file locally
df = pd.read_hdf(path_or_buf="data.h5")
print(df.head())
'''
Output:
   id             name   birthdate  age  salary
0   1       Eric Arias  1979-05-22   45  102905
1   2      Billy Silva  1998-03-13   26   87949
2   3    David Goodwin  1974-09-07   50   34936
3   4  Wendy Jones DVM  1968-07-23   56   67408
4   5     Becky Butler  1995-11-30   29  134267
'''

key: str, Optional-

Specifies the group (key) in the HDF5 file from which the data should be read. It can be omitted if the file contains only a single pandas object.

import pandas as pd

# Create two DataFrames
df1 = pd.DataFrame({'id': [1, 2], 'name': ['Alice', 'Bob']})
df2 = pd.DataFrame({'id': [3, 4], 'name': ['Chloe', 'David']})

# Write DataFrames to an HDF5 file under different keys
df1.to_hdf('data.h5', key='group1', mode='w')
df2.to_hdf('data.h5', key='group2', mode='a')

# Read the dataset under 'group1'
df_group1 = pd.read_hdf('data.h5', key='group1')
print(df_group1)

# Read the dataset under 'group2'
df_group2 = pd.read_hdf('data.h5', key='group2')
print(df_group2)
'''
Output:
   id   name
0   1  Alice
1   2    Bob
   id   name
0   3  Chloe
1   4  David
'''

mode: ('r' or 'r+' or 'a'), Optional-

Mode to use when opening the file.

errors: str, Optional-

Specifies how encoding and decoding errors are to be handled.

where: str, Optional-

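Query for filtering rows based on conditions. The file must have been written with format='table', and only the index and columns stored as data columns can be queried.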

import pandas as pd

# Create sample DataFrame
data = {
    'id': [1, 2, 3, 4, 5],
    'name': ['Alice', 'Bob', 'Chloe', 'David', 'Eva'],
    'salary': [75000, 80000, 85000, 90000, 95000],
    'department': ['HR', 'Engineering', 'Marketing', 'Engineering', 'HR']
}

df = pd.DataFrame(data)
df.to_hdf('employees_table.h5', key='data', mode='w', data_columns=True, format='table')

# Use the 'where' parameter to select only the row where id equals 2
df_filtered = pd.read_hdf('employees_table.h5', key='data', where="id=2")

# Print the filtered DataFrame
print(df_filtered)
'''
Output:
   id name  salary   department
1   2  Bob   80000  Engineering
'''

start: int, Optional-

Row index to start reading from.

import pandas as pd
import requests

# URL to the HDF5 file
url = "https://betterdocs.tech/global/python/pandas/data_large.h5"

# Download the file
response = requests.get(url)
with open("data.h5", "wb") as f:
    f.write(response.content)

# Read the HDF5 file locally
df = pd.read_hdf(path_or_buf="data.h5", start=2)
print(df.head())
'''
Output:
   id             name   birthdate  age  salary
2   3    David Goodwin  1974-09-07   50   34936
3   4  Wendy Jones DVM  1968-07-23   56   67408
4   5     Becky Butler  1995-11-30   29  134267
5   6    Heather Frank  1985-12-13   39   48576
6   7       John Hicks  1978-12-03   46   49362
'''

stop: int, Optional-

Row index to stop reading at (exclusive, so stop=3 reads rows 0 through 2).

import pandas as pd
import requests

# URL to the HDF5 file
url = "https://betterdocs.tech/global/python/pandas/data_large.h5"

# Download the file
response = requests.get(url)
with open("data.h5", "wb") as f:
    f.write(response.content)

# Read the HDF5 file locally
df = pd.read_hdf(path_or_buf="data.h5", stop=3)
print(df)
'''
Output:
   id           name   birthdate  age  salary
0   1     Eric Arias  1979-05-22   45  102905
1   2    Billy Silva  1998-03-13   26   87949
2   3  David Goodwin  1974-09-07   50   34936
'''

columns: list, Optional-

Subset of columns to load.

import pandas as pd

# Create sample DataFrame
data = {
    'id': [1, 2, 3, 4, 5],
    'name': ['Alice', 'Bob', 'Chloe', 'David', 'Eva'],
    'salary': [75000, 80000, 85000, 90000, 95000],
    'department': ['HR', 'Engineering', 'Marketing', 'Engineering', 'HR']
}

df = pd.DataFrame(data)
df.to_hdf('employees_table.h5', key='data', mode='w', data_columns=True, format='table')
df_filtered = pd.read_hdf('employees_table.h5', columns=["id", "name"])

# Print the filtered DataFrame
print(df_filtered)
'''
Output:
   id   name
0   1  Alice
1   2    Bob
2   3  Chloe
3   4  David
4   5    Eva
'''

iterator: (True or False), Optional-

Return an iterator object.

iterator = False (default) +

Returns the entire DataFrame at once.

import pandas as pd

# Create sample DataFrame
data = {
    'id': [1, 2, 3, 4, 5],
    'name': ['Alice', 'Bob', 'Chloe', 'David', 'Eva'],
    'salary': [75000, 80000, 85000, 90000, 95000],
    'department': ['HR', 'Engineering', 'Marketing', 'Engineering', 'HR']
}

df = pd.DataFrame(data)

# Save to HDF5 in table format
df.to_hdf('bb.h5', key='data', mode='w', data_columns=True, format='table')

# Read HDF5 without iterator
df_filtered = pd.read_hdf('bb.h5', iterator=False)
print(df_filtered)
'''
Output:
   id   name  salary   department
0   1  Alice   75000           HR
1   2    Bob   80000  Engineering
2   3  Chloe   85000    Marketing
3   4  David   90000  Engineering
4   5    Eva   95000           HR
'''

iterator = True +

It indicates that the method will return an iterator object instead of a DataFrame.

import pandas as pd

# Create sample DataFrame
data = {
    'id': [1, 2, 3, 4, 5],
    'name': ['Alice', 'Bob', 'Chloe', 'David', 'Eva'],
    'salary': [75000, 80000, 85000, 90000, 95000],
    'department': ['HR', 'Engineering', 'Marketing', 'Engineering', 'HR']
}

df = pd.DataFrame(data)

# Save to HDF5 in table format
df.to_hdf('bb.h5', key='data', mode='w', data_columns=True, format='table')

# Read HDF5 with chunks (iterator=True, chunksize=3)
df_filtered = pd.read_hdf('bb.h5', iterator=True, chunksize=3)
for chunk in df_filtered:
  print(chunk)
'''
Output:
   id   name  salary   department
0   1  Alice   75000           HR
1   2    Bob   80000  Engineering
2   3  Chloe   85000    Marketing
   id   name  salary   department
3   4  David   90000  Engineering
4   5    Eva   95000           HR
'''

chunksize must be specified when `iterator=True`.

chunksize: int, Optional-

Number of rows to read per chunk.

import pandas as pd

# Create sample DataFrame
data = {
    'id': [1, 2, 3, 4, 5],
    'name': ['Alice', 'Bob', 'Chloe', 'David', 'Eva'],
    'salary': [75000, 80000, 85000, 90000, 95000],
    'department': ['HR', 'Engineering', 'Marketing', 'Engineering', 'HR']
}

df = pd.DataFrame(data)

# Save to HDF5 in table format
df.to_hdf('bb.h5', key='data', mode='w', data_columns=True, format='table')

# Read HDF5 with chunks (iterator=True, chunksize=4)
df_filtered = pd.read_hdf('bb.h5', iterator=True, chunksize=4)
for c in df_filtered:
  print(c)
'''
Output:
   id   name  salary   department
0   1  Alice   75000           HR
1   2    Bob   80000  Engineering
2   3  Chloe   85000    Marketing
3   4  David   90000  Engineering
   id name  salary department
4   5  Eva   95000         HR
'''

**kwargs: None, Optional-

Additional arguments for lower-level HDF5 functionality.

Previous Next

BetterDocs

Support

EmailDiscordForms

Documentations

Python

Company

AboutDocs

Policies

Terms of ServicePrivacy Policy