Path or file-like object containing XML.
import pandas as pd

# Remote XML document that serves as the data source
xml_url = "https://betterdocs.tech/global/python/pandas/read_xml.xml"

# path_or_buffer accepts the URL string directly
df = pd.read_xml(path_or_buffer=xml_url)

# Show the parsed rows
print(df)
'''
Output:
id name hire_date salary
0 1 Alice 2003-10-02 75000
1 2 Bob 2003-04-16 80000
2 3 Chloe 2023-12-29 85000
'''
It is used to specify an XPath expression to filter the XML data and select a particular part of the XML document.
import pandas as pd

# Remote XML document that serves as the data source
xml_url = "https://betterdocs.tech/global/python/pandas/read_xml.xml"

# XPath expression selecting every <employee> node in the document
employee_xpath = "//employee"

# Each node matched by the XPath becomes one row of the DataFrame
df = pd.read_xml(path_or_buffer=xml_url, xpath=employee_xpath)

# Show the parsed rows
print(df)
'''
Output:
id name hire_date salary
0 1 Alice 2003-10-02 75000
1 2 Bob 2003-04-16 80000
2 3 Chloe 2023-12-29 85000
'''
Namespace mapping for XPath expressions.
import pandas as pd
from io import StringIO

# Namespaced XML: two <ns:book> records and one <ns:magazine> record
xml_data = """
<library xmlns:ns="http://example.com/ns">
<ns:book>
<ns:title>Python Programming</ns:title>
<ns:author>John Doe</ns:author>
<ns:price>29.99</ns:price>
</ns:book>
<ns:book>
<ns:title>Data Science with Python</ns:title>
<ns:author>Jane Smith</ns:author>
<ns:price>39.99</ns:price>
</ns:book>
<ns:magazine>
<ns:title>Tech Trends</ns:title>
<ns:issue>2023</ns:issue>
</ns:magazine>
</library>
"""

# Prefix -> URI mapping that lets the XPath resolve the "ns:" prefix
ns_map = {"ns": "http://example.com/ns"}

# Wrap the string in a file-like buffer and keep only the <ns:book> nodes
df = pd.read_xml(StringIO(xml_data), xpath=".//ns:book", namespaces=ns_map)
print(df)
'''
Output:
title author price
0 Python Programming John Doe 29.99
1 Data Science with Python Jane Smith 39.99
'''
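Note: the three parser options below are the flavors of pd.read_html, not of pd.read_xml; the parser parameter of pd.read_xml accepts only "lxml" and "etree".
"lxml" is best for performance, particularly for large files.
"bs4" is useful for handling poorly formatted or complex HTML.
"html5lib" is used when dealing with HTML5 documents and when compatibility with modern web standards is a priority.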
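If True, parse only the child elements of the nodes selected by xpath and ignore their attributes.
With elems_only=False (the default), both the attributes and the child elements of the selected nodes are returned.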
import pandas as pd
from io import StringIO

# Three <employee> records; "id" is an attribute, the rest are child elements
xml_data = """
<data>
<employee id="1">
<name>Alice</name>
<hire_date>2003-10-02</hire_date>
<salary>75000</salary>
</employee>
<employee id="2">
<name>Bob</name>
<hire_date>2003-04-16</hire_date>
<salary>80000</salary>
</employee>
<employee id="3">
<name>Chloe</name>
<hire_date>2023-12-29</hire_date>
<salary>85000</salary>
</employee>
</data>
"""

# elems_only=False keeps the "id" attribute alongside the child elements
df = pd.read_xml(StringIO(xml_data), xpath=".//employee", elems_only=False)
print(df)
'''
Output:
id name hire_date salary
0 1 Alice 2003-10-02 75000
1 2 Bob 2003-04-16 80000
2 3 Chloe 2023-12-29 85000
'''
Only the child elements of the nodes matched by the xpath query are included in the DataFrame; the attributes of those nodes (such as id) are ignored.
import pandas as pd
from io import StringIO

# Three <employee> records; "id" is an attribute, the rest are child elements
xml_data = """
<data>
<employee id="1">
<name>Alice</name>
<hire_date>2003-10-02</hire_date>
<salary>75000</salary>
</employee>
<employee id="2">
<name>Bob</name>
<hire_date>2003-04-16</hire_date>
<salary>80000</salary>
</employee>
<employee id="3">
<name>Chloe</name>
<hire_date>2023-12-29</hire_date>
<salary>85000</salary>
</employee>
</data>
"""

# elems_only=True drops the "id" attribute and keeps only the child elements
df = pd.read_xml(StringIO(xml_data), xpath=".//employee", elems_only=True)
print(df)
'''
Output:
name hire_date salary
0 Alice 2003-10-02 75000
1 Bob 2003-04-16 80000
2 Chloe 2023-12-29 85000
'''
Include only attributes if True.
The attributes and the text content of all child elements are included.
import pandas as pd
from io import StringIO

# Each <employee> carries two attributes (id, department) and three child elements
xml_data = """
<data>
<employee id="1" department="HR">
<name>Alice</name>
<hire_date>2003-10-02</hire_date>
<salary>75000</salary>
</employee>
<employee id="2" department="Finance">
<name>Bob</name>
<hire_date>2003-04-16</hire_date>
<salary>80000</salary>
</employee>
<employee id="3" department="IT">
<name>Chloe</name>
<hire_date>2023-12-29</hire_date>
<salary>85000</salary>
</employee>
</data>
"""

# attrs_only=False returns the attributes together with the child elements
df = pd.read_xml(StringIO(xml_data), xpath=".//employee", attrs_only=False)
print(df)
'''
Output:
id department name hire_date salary
0 1 HR Alice 2003-10-02 75000
1 2 Finance Bob 2003-04-16 80000
2 3 IT Chloe 2023-12-29 85000
'''
Only the attributes of the nodes matched by the xpath query (here id and department) are included in the DataFrame; their child elements are ignored.
import pandas as pd
from io import StringIO

# Each <employee> carries two attributes (id, department) and three child elements
xml_data = """
<data>
<employee id="1" department="HR">
<name>Alice</name>
<hire_date>2003-10-02</hire_date>
<salary>75000</salary>
</employee>
<employee id="2" department="Finance">
<name>Bob</name>
<hire_date>2003-04-16</hire_date>
<salary>80000</salary>
</employee>
<employee id="3" department="IT">
<name>Chloe</name>
<hire_date>2023-12-29</hire_date>
<salary>85000</salary>
</employee>
</data>
"""

# attrs_only=True keeps just the attributes of every matched <employee>
df = pd.read_xml(StringIO(xml_data), xpath=".//employee", attrs_only=True)
print(df)
'''
Output:
id department
0 1 HR
1 2 Finance
2 3 IT
'''
Custom column names for the DataFrame.
import pandas as pd

# Remote XML document that serves as the data source
xml_url = "https://betterdocs.tech/global/python/pandas/read_xml.xml"

# Replacement column labels, in the order the columns are parsed
column_labels = ["ID", "Names", "Join Date", "Salary"]

# Read the XML and rename the columns in one step
df = pd.read_xml(path_or_buffer=xml_url, names=column_labels)

# Show the parsed rows
print(df)
'''
Output:
ID Names Join Date Salary
0 1 Alice 2003-10-02 75000
1 2 Bob 2003-04-16 80000
2 3 Chloe 2023-12-29 85000
'''
Data type for DataFrame columns.
import pandas as pd

# Remote XML document that serves as the data source
xml_url = "https://betterdocs.tech/global/python/pandas/read_xml.xml"

# Per-column dtype overrides: store "salary" as 32-bit floats
column_types = {"salary": "float32"}

# Read the XML, casting the listed columns while parsing
df = pd.read_xml(path_or_buffer=xml_url, dtype=column_types)

# Show the parsed rows
print(df)
'''
Output:
id name hire_date salary
0 1 Alice 2003-10-02 75000.0
1 2 Bob 2003-04-16 80000.0
2 3 Chloe 2023-12-29 85000.0
'''
Functions for converting column values.
import pandas as pd

# URL pointing to an XML file
url = "https://betterdocs.tech/global/python/pandas/read_xml.xml"


def half_sal(sal):
    """Return half of the given salary.

    The value is converted with int() first, so it may be a numeric string.
    """
    return int(sal) / 2


# Read the XML data, passing every value of the "salary" column through half_sal
df = pd.read_xml(path_or_buffer=url, converters={'salary': half_sal})

# Display the DataFrame
print(df)
'''
Output:
id name hire_date salary
0 1 Alice 2003-10-02 37500.0
1 2 Bob 2003-04-16 40000.0
2 3 Chloe 2023-12-29 42500.0
'''
It is used to specify which columns should be parsed as dates while reading the XML data.
Pandas will not attempt to parse any columns as dates. If any date strings are present, they will be read as plain text.
import pandas as pd

# Remote XML document that serves as the data source
xml_url = "https://betterdocs.tech/global/python/pandas/read_xml.xml"

# parse_dates=False leaves date-like strings untouched
df = pd.read_xml(path_or_buffer=xml_url, parse_dates=False)

# Show the column together with its dtype
print(df["hire_date"])
'''
Output:
0 2003-10-02
1 2003-04-16
2 2023-12-29
Name: hire_date, dtype: object
'''
When a list of column names is given, Pandas parses each listed column (here hire_date) into datetime objects; columns that are not listed are left unchanged.
import pandas as pd

# Remote XML document that serves as the data source
xml_url = "https://betterdocs.tech/global/python/pandas/read_xml.xml"

# Columns that should be converted to datetime while reading
date_columns = ["hire_date"]

# Read the XML and parse the listed columns as dates
df = pd.read_xml(path_or_buffer=xml_url, parse_dates=date_columns)

# Show the column together with its dtype
print(df["hire_date"])
'''
Output:
0 2003-10-02
1 2003-04-16
2 2023-12-29
Name: hire_date, dtype: datetime64[ns]
'''
It specifies the character encoding used when reading the XML data.
import pandas as pd

# Remote XML document that serves as the data source
xml_url = "https://betterdocs.tech/global/python/pandas/read_xml.xml"

# Decode the document explicitly as UTF-8
df = pd.read_xml(path_or_buffer=xml_url, encoding="utf-8")

# Show the parsed rows
print(df)
'''
Output:
id name hire_date salary
0 1 Alice 2003-10-02 75000
1 2 Bob 2003-04-16 80000
2 3 Chloe 2023-12-29 85000
'''
XML parsing engine.
import pandas as pd

# URL pointing to an XML file
url = "https://betterdocs.tech/global/python/pandas/read_xml.xml"

# Read the XML data using the "etree" parser
# (the unused half_sal helper from the converters example was removed)
df = pd.read_xml(path_or_buffer=url, parser='etree')

# Display the DataFrame
print(df)
'''
Output:
id name hire_date salary
0 1 Alice 2003-10-02 75000
1 2 Bob 2003-04-16 80000
2 3 Chloe 2023-12-29 85000
'''
"lxml" - Faster for large XML
"etree" - Slower for large XML
It allows you to apply an XSLT stylesheet to transform the XML data before reading it into a DataFrame. This parameter requires the "lxml" parser.
import pandas as pd
from io import StringIO

# Source document: every <employee> has id, name and department children
xml_data = """<?xml version="1.0" encoding="UTF-8"?>
<data>
<employee>
<id>1</id>
<name>Alice</name>
<department>HR</department>
</employee>
<employee>
<id>2</id>
<name>Bob</name>
<department>IT</department>
</employee>
</data>"""

# XSLT that copies only <id> and <name> of each employee into <transformed_data>
xslt_data = """<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
<xsl:output method="xml" indent="yes"/>
<!-- Root element -->
<xsl:template match="/">
<transformed_data>
<xsl:apply-templates select="data/employee"/>
</transformed_data>
</xsl:template>
<!-- Template for employee -->
<xsl:template match="employee">
<employee>
<id><xsl:value-of select="id"/></id>
<name><xsl:value-of select="name"/></name>
</employee>
</xsl:template>
</xsl:stylesheet>"""

# Both inputs are handed over as in-memory text buffers; the stylesheet is
# applied first and the transformed document is what gets parsed
df = pd.read_xml(
    path_or_buffer=StringIO(xml_data),
    stylesheet=StringIO(xslt_data),
)
print(df)
'''
Output:
id name
0 1 Alice
1 2 Bob
'''
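It is used to parse large XML files iteratively instead of with xpath. It takes a dict whose key is the name of the repeating element and whose value is a list of the element or attribute names to retrieve from the descendants of that element.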
import pandas as pd
from io import BytesIO

# XML data as a string; salary and hire_date are attributes of <info>
xml_data = """<?xml version="1.0" encoding="UTF-8"?>
<data>
<employee>
<id>1</id>
<name>Alice</name>
<department>HR</department>
<info salary="75000" hire_date="2003-10-02"/>
</employee>
<employee>
<id>2</id>
<name>Bob</name>
<department>IT</department>
<info salary="80000" hire_date="2010-05-15"/>
</employee>
<employee>
<id>3</id>
<name>Charlie</name>
<info salary="85000" hire_date="2018-08-20"/>
</employee>
</data>"""

# Encode the string to bytes
xml_bytes = xml_data.encode('utf-8')

# Use BytesIO to wrap the bytes
xml_file = BytesIO(xml_bytes)

# Read XML with iterparse: the key is the repeating element, the list holds the
# names of descendant elements or attributes to collect. Attributes are given
# by their bare name ("salary"), not as "info@salary", which matches nothing.
df = pd.read_xml(
    xml_file,
    iterparse={"employee": ["id", "name", "salary", "hire_date"]},
)
print(df)
'''
Output:
   id     name  salary   hire_date
0   1    Alice   75000  2003-10-02
1   2      Bob   80000  2010-05-15
2   3  Charlie   85000  2018-08-20
'''
Strings interpreted as NaN by default: "", "#N/A", "#N/A N/A", "#NA", "-1.#IND", "-1.#QNAN", "-NaN", "-nan", "1.#IND", "1.#QNAN", "<NA>", "N/A", "NA", "NULL", "NaN", "None", "n/a", "nan", "null"
Compression type for the file.
import pandas as pd
import gzip

# Sample XML Data
xml_data = """<?xml version="1.0" encoding="UTF-8"?>
<data>
<employee>
<id>1</id>
<name>Alice</name>
<salary>75000</salary>
</employee>
<employee>
<id>2</id>
<name>Bob</name>
<salary>80000</salary>
</employee>
</data>"""

# Write XML data to a gzip-compressed file; the context manager closes the
# file before it is read back
with gzip.open("employees.xml.gz", "wt", encoding="utf-8") as f:
    f.write(xml_data)

# Read the compressed XML file, decompressing it as gzip
df = pd.read_xml("employees.xml.gz", compression="gzip")
print(df)
'''
Output:
id name salary
0 1 Alice 75000
1 2 Bob 80000
'''
Dictionary of storage-specific options, such as credentials for cloud storage.
The dtype_backend parameter, new in Pandas 2.0, specifies the back-end data type ("numpy_nullable" or "pyarrow") applied to the resulting DataFrame when reading a file.