BetterDocs

Home

Docs

Creation | pd.read_xml()

Previous Next

Method:

pd.read_xml(path_or_buffer, *, xpath='./*', namespaces=None, elems_only=False, attrs_only=False, names=None, dtype=None, converters=None, parse_dates=None, encoding='utf-8', parser='lxml', stylesheet=None, iterparse=None, compression='infer', storage_options=None, dtype_backend=<no_default>)

Reads an XML document into a DataFrame.

Returns:

pandas.core.frame.DataFrame

Parameters:

path_or_buffer: (str or path or file-like)-

Path or file-like object containing XML.

import pandas as pd

# URL pointing to an XML file
url = "https://betterdocs.tech/global/python/pandas/read_xml.xml"

# Read the XML data from the URL into a DataFrame
df = pd.read_xml(path_or_buffer=url)

# Display the DataFrame
print(df)
'''
Output:
   id   name   hire_date  salary
0   1  Alice  2003-10-02   75000
1   2    Bob  2003-04-16   80000
2   3  Chloe  2023-12-29   85000
'''

xpath: str, Optional-

It is used to specify an XPath expression to filter the XML data and select a particular part of the XML document.

import pandas as pd

# URL pointing to an XML file
url = "https://betterdocs.tech/global/python/pandas/read_xml.xml"

# Read the XML data from the URL into a DataFrame
df = pd.read_xml(path_or_buffer=url, xpath="//employee")

# Display the DataFrame
print(df)
'''
Output:
   id   name   hire_date  salary
0   1  Alice  2003-10-02   75000
1   2    Bob  2003-04-16   80000
2   3  Chloe  2023-12-29   85000
'''

namespaces: dict, Optional-

Namespace mapping for XPath expressions.

import pandas as pd
from io import StringIO

# XML data with namespaces
xml_data = """
<library xmlns:ns="http://example.com/ns">
    <ns:book>
        <ns:title>Python Programming</ns:title>
        <ns:author>John Doe</ns:author>
        <ns:price>29.99</ns:price>
    </ns:book>
    <ns:book>
        <ns:title>Data Science with Python</ns:title>
        <ns:author>Jane Smith</ns:author>
        <ns:price>39.99</ns:price>
    </ns:book>
    <ns:magazine>
        <ns:title>Tech Trends</ns:title>
        <ns:issue>2023</ns:issue>
    </ns:magazine>
</library>
"""

# Use StringIO to simulate reading from a file
xml_file = StringIO(xml_data)

# Define the namespaces
namespaces = {
    'ns': 'http://example.com/ns'
}

# Read only 'book' elements using the defined namespace and XPath
df = pd.read_xml(xml_file, xpath=".//ns:book", namespaces=namespaces)
print(df)
'''
Output:
                      title      author  price
0        Python Programming    John Doe  29.99
1  Data Science with Python  Jane Smith  39.99
'''

"lxml" is best for performance, particularly for large files.

"bs4" is useful for handling poorly formatted or complex HTML.

"html5lib" is used when dealing with HTML5 documents and when compatibility with modern web standards is a priority.

elems_only: (True or False), Optional-

Include only element data if True.

elems_only = False (default) +

Both the attributes and the child elements of the nodes matched by xpath are parsed into columns.

import pandas as pd
from io import StringIO

# Sample XML data
xml_data = """
<data>
    <employee id="1">
        <name>Alice</name>
        <hire_date>2003-10-02</hire_date>
        <salary>75000</salary>
    </employee>
    <employee id="2">
        <name>Bob</name>
        <hire_date>2003-04-16</hire_date>
        <salary>80000</salary>
    </employee>
    <employee id="3">
        <name>Chloe</name>
        <hire_date>2023-12-29</hire_date>
        <salary>85000</salary>
    </employee>
</data>
"""

# Use StringIO to simulate reading from a file
xml_file = StringIO(xml_data)

# Read XML data
df = pd.read_xml(xml_file, xpath=".//employee", elems_only=False)
print(df)
'''
Output:
   id   name   hire_date  salary
0   1  Alice  2003-10-02   75000
1   2    Bob  2003-04-16   80000
2   3  Chloe  2023-12-29   85000
'''

elems_only = True +

Only the child elements of the nodes matched by the xpath query are parsed into columns; attributes of those nodes (such as id) are ignored.

import pandas as pd
from io import StringIO

# Sample XML data
xml_data = """
<data>
    <employee id="1">
        <name>Alice</name>
        <hire_date>2003-10-02</hire_date>
        <salary>75000</salary>
    </employee>
    <employee id="2">
        <name>Bob</name>
        <hire_date>2003-04-16</hire_date>
        <salary>80000</salary>
    </employee>
    <employee id="3">
        <name>Chloe</name>
        <hire_date>2023-12-29</hire_date>
        <salary>85000</salary>
    </employee>
</data>
"""

# Use StringIO to simulate reading from a file
xml_file = StringIO(xml_data)

# Read XML data
df = pd.read_xml(xml_file, xpath=".//employee", elems_only=True)
print(df)
'''
Output:
    name   hire_date  salary
0  Alice  2003-10-02   75000
1    Bob  2003-04-16   80000
2  Chloe  2023-12-29   85000
'''

attrs_only: (True or False), Optional-

Include only attributes if True.

attrs_only = False (default) +

The attributes and the text content of all child elements are included.

import pandas as pd
from io import StringIO

# Sample XML data with attributes and text content
xml_data = """
<data>
    <employee id="1" department="HR">
        <name>Alice</name>
        <hire_date>2003-10-02</hire_date>
        <salary>75000</salary>
    </employee>
    <employee id="2" department="Finance">
        <name>Bob</name>
        <hire_date>2003-04-16</hire_date>
        <salary>80000</salary>
    </employee>
    <employee id="3" department="IT">
        <name>Chloe</name>
        <hire_date>2023-12-29</hire_date>
        <salary>85000</salary>
    </employee>
</data>
"""

# Use StringIO to simulate a file-like object
xml_file = StringIO(xml_data)

# Read XML data
df = pd.read_xml(xml_file, xpath=".//employee", attrs_only=False)
print(df)
'''
Output:
   id department   name   hire_date  salary
0   1         HR  Alice  2003-10-02   75000
1   2    Finance    Bob  2003-04-16   80000
2   3         IT  Chloe  2023-12-29   85000
'''

attrs_only = True +

Only the attributes of the nodes matched by the xpath query (such as id and department) are parsed into columns; the text content of their child elements is ignored.

import pandas as pd
from io import StringIO

# Sample XML data with attributes and text content
xml_data = """
<data>
    <employee id="1" department="HR">
        <name>Alice</name>
        <hire_date>2003-10-02</hire_date>
        <salary>75000</salary>
    </employee>
    <employee id="2" department="Finance">
        <name>Bob</name>
        <hire_date>2003-04-16</hire_date>
        <salary>80000</salary>
    </employee>
    <employee id="3" department="IT">
        <name>Chloe</name>
        <hire_date>2023-12-29</hire_date>
        <salary>85000</salary>
    </employee>
</data>
"""

# Use StringIO to simulate a file-like object
xml_file = StringIO(xml_data)

# Read XML data
df = pd.read_xml(xml_file, xpath=".//employee", attrs_only=True)
print(df)
'''
Output:
   id department
0   1         HR
1   2    Finance
2   3         IT
'''

names: array-like, Optional-

Custom column names for the DataFrame.

import pandas as pd

# URL pointing to an XML file
url = "https://betterdocs.tech/global/python/pandas/read_xml.xml"

# Read the XML data from the URL into a DataFrame
df = pd.read_xml(path_or_buffer=url, names=["ID", "Names", "Join Date", "Salary"])

# Display the DataFrame
print(df)
'''
Output:
   ID  Names   Join Date  Salary
0   1  Alice  2003-10-02   75000
1   2    Bob  2003-04-16   80000
2   3  Chloe  2023-12-29   85000
'''

dtype: None, Optional-

Data type for DataFrame columns.

import pandas as pd

# URL pointing to an XML file
url = "https://betterdocs.tech/global/python/pandas/read_xml.xml"

# Read the XML data from the URL into a DataFrame
df = pd.read_xml(path_or_buffer=url, dtype={'salary': 'float32'})

# Display the DataFrame
print(df)
'''
Output:
   id   name   hire_date   salary
0   1  Alice  2003-10-02  75000.0
1   2    Bob  2003-04-16  80000.0
2   3  Chloe  2023-12-29  85000.0
'''

converters: None, Optional-

Functions for converting column values.

import pandas as pd

# URL pointing to an XML file
url = "https://betterdocs.tech/global/python/pandas/read_xml.xml"

def half_sal(sal):
  return int(sal)/2

# Read the XML data from the URL into a DataFrame
df = pd.read_xml(path_or_buffer=url, converters={'salary': half_sal})

# Display the DataFrame
print(df)
'''
Output:
   id   name   hire_date   salary
0   1  Alice  2003-10-02  37500.0
1   2    Bob  2003-04-16  40000.0
2   3  Chloe  2023-12-29  42500.0
'''

parse_dates: (True or False), Optional-

It specifies which columns should be parsed as dates while reading the XML data.

parse_dates = False (default) +

Pandas will not attempt to parse any columns as dates. If any date strings are present, they will be read as plain text.

import pandas as pd

# URL pointing to an XML file
url = "https://betterdocs.tech/global/python/pandas/read_xml.xml"

# Read the XML data from the URL into a DataFrame
df = pd.read_xml(path_or_buffer=url, parse_dates=False)

# Display the DataFrame
print(df['hire_date'])
'''
Output:
0    2003-10-02
1    2003-04-16
2    2023-12-29
Name: hire_date, dtype: object
'''

parse_dates = True +

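With parse_dates=True, pandas tries to parse the index as dates. To convert specific columns with date-like values (e.g., strings in the format "YYYY-MM-DD") into datetime objects, pass a list of column names instead, as the example below does with parse_dates=['hire_date'].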

import pandas as pd

# URL pointing to an XML file
url = "https://betterdocs.tech/global/python/pandas/read_xml.xml"

# Read the XML data from the URL into a DataFrame
df = pd.read_xml(path_or_buffer=url, parse_dates=['hire_date'])

# Display the DataFrame
print(df['hire_date'])
'''
Output:
0   2003-10-02
1   2003-04-16
2   2023-12-29
Name: hire_date, dtype: datetime64[ns]
'''

encoding: str, Optional-

It specifies the character encoding used when reading the XML data.

import pandas as pd

# URL pointing to an XML file
url = "https://betterdocs.tech/global/python/pandas/read_xml.xml"

# Read the XML data from the URL into a DataFrame
df = pd.read_xml(path_or_buffer=url, encoding='utf-8')

# Display the DataFrame
print(df)
'''
Output:
   id   name   hire_date  salary
0   1  Alice  2003-10-02   75000
1   2    Bob  2003-04-16   80000
2   3  Chloe  2023-12-29   85000
'''

parser: str, Optional-

XML parsing engine.

import pandas as pd

# URL pointing to an XML file
url = "https://betterdocs.tech/global/python/pandas/read_xml.xml"

def half_sal(sal):
  return int(sal)/2

# Read the XML data from the URL into a DataFrame
df = pd.read_xml(path_or_buffer=url, parser='etree')

# Display the DataFrame
print(df)
'''
Output:
   id   name   hire_date  salary
0   1  Alice  2003-10-02   75000
1   2    Bob  2003-04-16   80000
2   3  Chloe  2023-12-29   85000
'''

"lxml" - Faster for large XML

"etree" - Slower for large XML

stylesheet: None, Optional-

It allows you to apply an XSLT stylesheet to transform the XML data before reading it into a DataFrame.

import pandas as pd
from io import StringIO

# Define the XML data as a string
xml_data = """<?xml version="1.0" encoding="UTF-8"?>
<data>
    <employee>
        <id>1</id>
        <name>Alice</name>
        <department>HR</department>
    </employee>
    <employee>
        <id>2</id>
        <name>Bob</name>
        <department>IT</department>
    </employee>
</data>"""

# Define the XSLT stylesheet as a string
xslt_data = """<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="xml" indent="yes"/>

    <!-- Root element -->
    <xsl:template match="/">
        <transformed_data>
            <xsl:apply-templates select="data/employee"/>
        </transformed_data>
    </xsl:template>

    <!-- Template for employee -->
    <xsl:template match="employee">
        <employee>
            <id><xsl:value-of select="id"/></id>
            <name><xsl:value-of select="name"/></name>
        </employee>
    </xsl:template>
</xsl:stylesheet>"""

xml_file = StringIO(xml_data)
xslt_file = StringIO(xslt_data)

# Read the XML data and apply the XSLT transformation
df = pd.read_xml(path_or_buffer=xml_file, stylesheet=xslt_file)
print(df)
'''
Output:
   id   name
0   1  Alice
1   2    Bob
'''

iterparse: None, Optional-

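It is a dictionary used to parse large XML files iteratively instead of loading the whole document into memory: the key is the name of the repeating element and the value is a list of the descendant element or attribute names to retrieve. When iterparse is given, it replaces xpath-based parsing.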

import pandas as pd
from io import BytesIO

# XML data as a string
xml_data = """<?xml version="1.0" encoding="UTF-8"?>
<data>
    <employee>
        <id>1</id>
        <name>Alice</name>
        <department>HR</department>
        <info salary="75000" hire_date="2003-10-02"/>
    </employee>
    <employee>
        <id>2</id>
        <name>Bob</name>
        <department>IT</department>
        <info salary="80000" hire_date="2010-05-15"/>
    </employee>
    <employee>
        <id>3</id>
        <name>Charlie</name>
        <info salary="85000" hire_date="2018-08-20"/>
    </employee>
</data>"""

# Encode the string to bytes
xml_bytes = xml_data.encode('utf-8')

# Use BytesIO to wrap the bytes
xml_file = BytesIO(xml_bytes)

# Read XML with iterparse
df = pd.read_xml(xml_file, iterparse={ "employee": ["id", "name", "info@salary", "info@hire_date"]})
print(df)
'''
Output:
   id     name
0   1    Alice
1   2      Bob
2   3  Charlie
'''

Values: +

compression: None, Optional-

Compression type for the file.

import pandas as pd
import gzip

# Sample XML Data
xml_data = """<?xml version="1.0" encoding="UTF-8"?>
<data>
    <employee>
        <id>1</id>
        <name>Alice</name>
        <salary>75000</salary>
    </employee>
    <employee>
        <id>2</id>
        <name>Bob</name>
        <salary>80000</salary>
    </employee>
</data>"""

# Write XML data to a gzip-compressed file
with gzip.open("employees.xml.gz", "wt", encoding="utf-8") as f:
    f.write(xml_data)

# Read the compressed XML file
df = pd.read_xml("employees.xml.gz", compression="gzip")
print(df)
'''
Output:
   id   name  salary
0   1  Alice   75000
1   2    Bob   80000
'''

storage_options: dict, Optional-

Dictionary of storage-specific options, such as credentials for cloud storage.

dtype_backend: None, Optional-

The dtype_backend parameter, new in pandas 2.0, specifies the back-end data type applied to the resulting DataFrame (for example 'numpy_nullable' or 'pyarrow').

Previous Next

BetterDocs

Support

EmailDiscordForms

Documentations

Python

Company

AboutDocs

Policies

Terms of ServicePrivacy Policy

Creation | pd.read_xml()

Method:

pd.read_xml(path_or_buffer, *, xpath='./*', namespaces=None, elems_only=False, attrs_only=False, names=None, dtype=None, converters=None, parse_dates=None, encoding='utf-8', parser='lxml', stylesheet=None, iterparse=None, compression='infer', storage_options=None, dtype_backend=<no_default>)

Reads an XML document into a DataFrame.

Returns:

pandas.core.frame.DataFrame

Parameters:

path_or_buffer: (str or path or file-like)-

xpath: str, Optional-

namespaces: dict, Optional-

elems_only: (True or False), Optional-

elems_only = False (default) +

elems_only = True +

attrs_only: (True or False), Optional-

attrs_only = False (default) +

attrs_only = True +

names: array-like, Optional-

dtype: None, Optional-

converters: None, Optional-

parse_dates: (True or False), Optional-

parse_dates = False (default) +

parse_dates = True +

encoding: str, Optional-

parser: str, Optional-

stylesheet: None, Optional-

iterparse: None, Optional-

Values: +

NaN Values

compression: None, Optional-

storage_options: dict, Optional-

dtype_backend: None, Optional-

BetterDocs

Support

Documentations

Company

Policies

pd.read_xml(path_or_buffer, *, xpath='./*', namespaces=None, elems_only=False, attrs_only=False, names=None, dtype=None, converters=None, parse_dates=None, encoding='utf-8', parser='lxml', stylesheet=None, iterparse=None, compression='infer', storage_options=None, dtype_backend=<no_default>)