BetterDocs
Home
Docs

Creation | pd.read_xml()

Method:

pd.read_xml(path_or_buffer, *, xpath='./*', namespaces=None, elems_only=False, attrs_only=False, names=None, dtype=None, converters=None, parse_dates=None, encoding='utf-8', parser='lxml', stylesheet=None, iterparse=None, compression='infer', storage_options=None, dtype_backend=<no_default>)

Reads an XML document into a DataFrame.

Returns:

pandas.core.frame.DataFrame

Parameters:

path_or_buffer: (str or path or file-like)-

Path or file-like object containing XML.

import pandas as pd

# URL pointing to an XML file
url = "https://betterdocs.tech/global/python/pandas/read_xml.xml"

# Read the XML data from the URL into a DataFrame
df = pd.read_xml(path_or_buffer=url)

# Display the DataFrame
print(df)
'''
Output:
   id   name   hire_date  salary
0   1  Alice  2003-10-02   75000
1   2    Bob  2003-04-16   80000
2   3  Chloe  2023-12-29   85000
'''

xpath: str, Optional-

It is used to specify an XPath expression to filter the XML data and select a particular part of the XML document.

import pandas as pd

# URL pointing to an XML file
url = "https://betterdocs.tech/global/python/pandas/read_xml.xml"

# Read the XML data from the URL into a DataFrame
df = pd.read_xml(path_or_buffer=url, xpath="//employee")

# Display the DataFrame
print(df)
'''
Output:
   id   name   hire_date  salary
0   1  Alice  2003-10-02   75000
1   2    Bob  2003-04-16   80000
2   3  Chloe  2023-12-29   85000
'''

namespaces: dict, Optional-

Namespace mapping for XPath expressions.

import pandas as pd
from io import StringIO

# XML data with namespaces
xml_data = """
<library xmlns:ns="http://example.com/ns">
    <ns:book>
        <ns:title>Python Programming</ns:title>
        <ns:author>John Doe</ns:author>
        <ns:price>29.99</ns:price>
    </ns:book>
    <ns:book>
        <ns:title>Data Science with Python</ns:title>
        <ns:author>Jane Smith</ns:author>
        <ns:price>39.99</ns:price>
    </ns:book>
    <ns:magazine>
        <ns:title>Tech Trends</ns:title>
        <ns:issue>2023</ns:issue>
    </ns:magazine>
</library>
"""

# Use StringIO to simulate reading from a file
xml_file = StringIO(xml_data)

# Define the namespaces
namespaces = {
    'ns': 'http://example.com/ns'
}

# Read only 'book' elements using the defined namespace and XPath
df = pd.read_xml(xml_file, xpath=".//ns:book", namespaces=namespaces)
print(df)
'''
Output:
                      title      author  price
0        Python Programming    John Doe  29.99
1  Data Science with Python  Jane Smith  39.99
'''

"lxml" is best for performance, particularly for large files.

"bs4" is useful for handling poorly formatted or complex HTML.

"html5lib" is used when dealing with HTML5 documents and when compatibility with modern web standards is a priority.

elems_only: (True or False), Optional-

Include only element data if True.

elems_only = False (default) +

elems_only = True +

attrs_only: (True or False), Optional-

Include only attributes if True.

attrs_only = False (default) +

attrs_only = True +

names: array-like, Optional-

Custom column names for the DataFrame.

import pandas as pd

# URL pointing to an XML file
url = "https://betterdocs.tech/global/python/pandas/read_xml.xml"

# Read the XML data from the URL into a DataFrame
df = pd.read_xml(path_or_buffer=url, names=["ID", "Names", "Join Date", "Salary"])

# Display the DataFrame
print(df)
'''
Output:
   ID  Names   Join Date  Salary
0   1  Alice  2003-10-02   75000
1   2    Bob  2003-04-16   80000
2   3  Chloe  2023-12-29   85000
'''

dtype: (type name or dict), Optional-

Data type for DataFrame columns.

import pandas as pd

# URL pointing to an XML file
url = "https://betterdocs.tech/global/python/pandas/read_xml.xml"

# Read the XML data from the URL into a DataFrame
df = pd.read_xml(path_or_buffer=url, dtype={'salary': 'float32'})

# Display the DataFrame
print(df)
'''
Output:
   id   name   hire_date   salary
0   1  Alice  2003-10-02  75000.0
1   2    Bob  2003-04-16  80000.0
2   3  Chloe  2023-12-29  85000.0
'''

converters: dict, Optional-

Functions for converting column values.

import pandas as pd

# URL pointing to an XML file
url = "https://betterdocs.tech/global/python/pandas/read_xml.xml"

def half_sal(sal):
  return int(sal)/2

# Read the XML data from the URL into a DataFrame
df = pd.read_xml(path_or_buffer=url, converters={'salary': half_sal})

# Display the DataFrame
print(df)
'''
Output:
   id   name   hire_date   salary
0   1  Alice  2003-10-02  37500.0
1   2    Bob  2003-04-16  40000.0
2   3  Chloe  2023-12-29  42500.0
'''

parse_dates: (bool, list or dict), Optional-

It is used to specify which columns should be parsed as dates while reading the XML data.

parse_dates = False (default) +

parse_dates = True +

encoding: str, Optional-

It specifies the character encoding used when reading the XML data.

import pandas as pd

# URL pointing to an XML file
url = "https://betterdocs.tech/global/python/pandas/read_xml.xml"

# Read the XML data from the URL into a DataFrame
df = pd.read_xml(path_or_buffer=url, encoding='utf-8')

# Display the DataFrame
print(df)
'''
Output:
   id   name   hire_date  salary
0   1  Alice  2003-10-02   75000
1   2    Bob  2003-04-16   80000
2   3  Chloe  2023-12-29   85000
'''

parser: str, Optional-

XML parsing engine.

import pandas as pd

# URL pointing to an XML file
url = "https://betterdocs.tech/global/python/pandas/read_xml.xml"

def half_sal(sal):
  return int(sal)/2

# Read the XML data from the URL into a DataFrame
df = pd.read_xml(path_or_buffer=url, parser='etree')

# Display the DataFrame
print(df)
'''
Output:
   id   name   hire_date  salary
0   1  Alice  2003-10-02   75000
1   2    Bob  2003-04-16   80000
2   3  Chloe  2023-12-29   85000
'''

"lxml" - Faster for large XML

"etree" - Slower for large XML

stylesheet: (str or path or file-like), Optional-

It allows you to apply an XSLT stylesheet to transform the XML data before reading it into a DataFrame.

import pandas as pd
from io import StringIO

# Define the XML data as a string
xml_data = """<?xml version="1.0" encoding="UTF-8"?>
<data>
    <employee>
        <id>1</id>
        <name>Alice</name>
        <department>HR</department>
    </employee>
    <employee>
        <id>2</id>
        <name>Bob</name>
        <department>IT</department>
    </employee>
</data>"""

# Define the XSLT stylesheet as a string
xslt_data = """<?xml version="1.0" encoding="UTF-8"?>
<xsl:stylesheet version="1.0" xmlns:xsl="http://www.w3.org/1999/XSL/Transform">
    <xsl:output method="xml" indent="yes"/>

    <!-- Root element -->
    <xsl:template match="/">
        <transformed_data>
            <xsl:apply-templates select="data/employee"/>
        </transformed_data>
    </xsl:template>

    <!-- Template for employee -->
    <xsl:template match="employee">
        <employee>
            <id><xsl:value-of select="id"/></id>
            <name><xsl:value-of select="name"/></name>
        </employee>
    </xsl:template>
</xsl:stylesheet>"""

xml_file = StringIO(xml_data)
xslt_file = StringIO(xslt_data)

# Read the XML data and apply the XSLT transformation
df = pd.read_xml(path_or_buffer=xml_file, stylesheet=xslt_file)
print(df)
'''
Output:
   id   name
0   1  Alice
1   2    Bob
'''

iterparse: None, Optional-

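It is used to specify the nodes or attributes to retrieve while parsing the XML document iteratively, which is memory-efficient for very large files. It is given as a dictionary whose key is the name of the repeating element and whose value is a list of its descendant elements or attributes to read.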

import pandas as pd
from io import BytesIO

# XML data as a string
xml_data = """<?xml version="1.0" encoding="UTF-8"?>
<data>
    <employee>
        <id>1</id>
        <name>Alice</name>
        <department>HR</department>
        <info salary="75000" hire_date="2003-10-02"/>
    </employee>
    <employee>
        <id>2</id>
        <name>Bob</name>
        <department>IT</department>
        <info salary="80000" hire_date="2010-05-15"/>
    </employee>
    <employee>
        <id>3</id>
        <name>Charlie</name>
        <info salary="85000" hire_date="2018-08-20"/>
    </employee>
</data>"""

# Encode the string to bytes
xml_bytes = xml_data.encode('utf-8')

# Use BytesIO to wrap the bytes
xml_file = BytesIO(xml_bytes)

# Read XML with iterparse
df = pd.read_xml(xml_file, iterparse={ "employee": ["id", "name", "info@salary", "info@hire_date"]})
print(df)
'''
Output:
   id     name
0   1    Alice
1   2      Bob
2   3  Charlie
'''

Values: +

compression: (str or dict), Optional-

Compression type for the file.

import pandas as pd
import gzip

# Sample XML Data
xml_data = """<?xml version="1.0" encoding="UTF-8"?>
<data>
    <employee>
        <id>1</id>
        <name>Alice</name>
        <salary>75000</salary>
    </employee>
    <employee>
        <id>2</id>
        <name>Bob</name>
        <salary>80000</salary>
    </employee>
</data>"""

# Write XML data to a gzip-compressed file
with gzip.open("employees.xml.gz", "wt", encoding="utf-8") as f:
    f.write(xml_data)

# Read the compressed XML file
df = pd.read_xml("employees.xml.gz", compression="gzip")
print(df)
'''
Output:
   id   name  salary
0   1  Alice   75000
1   2    Bob   80000
'''

storage_options: dict, Optional-

Dictionary of storage-specific options, such as credentials for cloud storage.

dtype_backend: ('numpy_nullable' or 'pyarrow'), Optional-

The dtype_backend parameter, new in pandas 2.0, specifies the backend used for the data types of the resulting DataFrame (NumPy-nullable or PyArrow-backed dtypes) when reading a file.


Logo

BetterDocs

Support

EmailDiscordForms

Documentations

Python

Company

AboutDocs

Policies

Terms of ServicePrivacy Policy