It specifies the JSON data to be flattened.
import pandas as pd
import json
# Sample data
nested_json = [
{
"id": 1,
"name": "Alice",
"info": {"age": 25, "city": "New York"},
"hobbies": ["reading", "hiking"],
},
{
"id": 2,
"name": "Bob",
"info": {"age": 30, "city": "San Francisco"},
"hobbies": ["cooking"],
},
]
# Read the JSON data
df = pd.json_normalize(data=nested_json)
print(df)
'''
Output:
id name hobbies info.age info.city
0 1 Alice [reading, hiking] 25 New York
1 2 Bob [cooking] 30 San Francisco
'''
It specifies the key or list of keys in the JSON data that identifies the nested list of records to normalize into rows of the resulting DataFrame.
import pandas as pd
import json
# Sample data
nested_json = [
{
"id": 1,
"name": "Alice",
"info": {
"age": 25,
"city": "New York"
},
"hobbies": [
{"type": "reading", "frequency": "daily"},
{"type": "hiking", "frequency": "weekly"}
]
},
{
"id": 2,
"name": "Bob",
"info": {
"age": 30,
"city": "San Francisco"
},
"hobbies": [
{"type": "cooking", "frequency": "monthly"}
]
}
]
# Normalize the JSON with record_path
df = pd.json_normalize(data=nested_json, record_path="hobbies")
print(df)
'''
Output:
type frequency
0 reading daily
1 hiking weekly
2 cooking monthly
'''
It specifies additional keys from the JSON data to include as columns in the resulting DataFrame, alongside the data extracted using record_path.
import pandas as pd
import json
# Sample data
nested_json = [
{
"id": 1,
"name": "Alice",
"info": {
"age": 25,
"city": "New York"
},
"hobbies": [
{"type": "reading", "frequency": "daily"},
{"type": "hiking", "frequency": "weekly"}
]
},
{
"id": 2,
"name": "Bob",
"info": {
"age": 30,
"city": "San Francisco"
},
"hobbies": [
{"type": "cooking", "frequency": "monthly"}
]
}
]
# Normalize the JSON
df = pd.json_normalize(data=nested_json, record_path="hobbies", meta=["id", "name", ["info", "age"], ["info", "city"]])
print(df)
'''
Output:
type frequency id name info.age info.city
0 reading daily 1 Alice 25 New York
1 hiking weekly 1 Alice 25 New York
2 cooking monthly 2 Bob 30 San Francisco
'''
Handy when used with record_path .
Prefix to add to metadata fields.
import pandas as pd
import json
# Sample data
nested_json = [
{
"id": 1,
"name": "Alice",
"info": {
"age": 25,
"city": "New York"
},
"hobbies": [
{"type": "reading", "frequency": "daily"},
{"type": "hiking", "frequency": "weekly"}
]
},
{
"id": 2,
"name": "Bob",
"info": {
"age": 30,
"city": "San Francisco"
},
"hobbies": [
{"type": "cooking", "frequency": "monthly"}
]
}
]
# Normalize the JSON
df = pd.json_normalize(data=nested_json, record_path="hobbies", meta=["id", "name", ["info", "age"], ["info", "city"]], meta_prefix='m-')
print(df)
'''
Output:
type frequency m-id m-name m-info.age m-info.city
0 reading daily 1 Alice 25 New York
1 hiking weekly 1 Alice 25 New York
2 cooking monthly 2 Bob 30 San Francisco
'''
Handy when used with meta .
Prefix to add to nested record fields.
import pandas as pd
import json
# Sample data
nested_json = [
{
"id": 1,
"name": "Alice",
"info": {
"age": 25,
"city": "New York"
},
"hobbies": [
{"type": "reading", "frequency": "daily"},
{"type": "hiking", "frequency": "weekly"}
]
},
{
"id": 2,
"name": "Bob",
"info": {
"age": 30,
"city": "San Francisco"
},
"hobbies": [
{"type": "cooking", "frequency": "monthly"}
]
}
]
# Normalize the JSON
df = pd.json_normalize(data=nested_json, record_path="hobbies", record_prefix='r-')
print(df)
'''
Output:
r-type r-frequency
0 reading daily
1 hiking weekly
2 cooking monthly
'''
Handy when used with record_path .
Error-handling strategy ('raise' or 'ignore').
import pandas as pd
# Sample data with missing 'info' field in Bob's record
nested_json = [
{
"id": 1,
"name": "Alice",
"info": {
"age": 25,
"city": "New York"
}
},
{
"id": 2,
"name": "Bob",
# Missing the entire 'info' field
},
{
"id": 3,
"name": "Chloe",
"info": {
"age": 22,
# Missing 'city' but 'info' exists
}
}
]
# Try to normalize the JSON with errors='ignore'
df = pd.json_normalize(
data=nested_json,
meta=["id", "name", ["info", "age"], ["info", "city"]],
errors="ignore"
)
print(df)
'''
Output:
id name info.age info.city
0 1 Alice 25.0 New York
1 2 Bob NaN NaN
2 3 Chloe 22.0 NaN
'''
'raise' is not working as intended in Pandas 2.0.
It is used to specify the separator for the nested keys in a JSON object when flattening it.
import pandas as pd
# Sample nested JSON
nested_json = [
{
"id": 1,
"name": "Alice",
"info": {
"age": 25,
"city": "New York"
}
},
{
"id": 2,
"name": "Bob",
"info": {
"age": 30,
"city": "Berlin"
}
}
]
# Normalize the JSON with the separator
df = pd.json_normalize(data=nested_json, sep='|')
print(df)
'''
Output:
id name info|age info|city
0 1 Alice 25 New York
1 2 Bob 30 Berlin
'''
It is used to limit the depth of normalization for nested JSON data.
import pandas as pd
# Sample nested JSON
nested_json = [
{
"id": 1,
"name": "Alice",
"info": {
"age": 25,
"city": "New York",
"address": {
"street": "5th Ave",
"zip": "10001"
}
}
},
{
"id": 2,
"name": "Bob",
"info": {
"age": 30,
"city": "Berlin",
"address": {
"street": "Alexanderplatz",
"zip": "10178"
}
}
}
]
# Normalize the JSON with max_level=1 (only flatten 1 level)
df = pd.json_normalize(nested_json, max_level=1)
print(df)
'''
Output:
id name info
0 1 Alice {'age': 25, 'city': 'New York', 'address': {'s...
1 2 Bob {'age': 30, 'city': 'Berlin', 'address': {'str...
'''
By default, it chooses the highest depth if not specified, i.e, 2 in this case.