lxml is a fast XML/HTML parser built on libxml2. pip install lxml. Parse XML: from lxml import etree; tree = etree.parse("file.xml"); root = tree.getroot(). FromString: root = etree.fromstring(b"<root><child/></root>"). Element: e = etree.Element("item"); e.text = "hello"; e.set("id","1"). SubElement: child = etree.SubElement(root, "child", attrib={"k":"v"}). ToString: etree.tostring(root, pretty_print=True, encoding="unicode"). XPath: root.xpath("//item[@id='1']"). XPath with ns: root.xpath("//ns:item", namespaces={"ns":"http://example.com"}). Find: root.find("child"). FindAll: root.findall(".//item"). FindText: root.findtext("title"). HTML parse: from lxml import html; doc = html.fromstring(content). CSS: doc.cssselect("div.product h2") (requires pip install cssselect). HTML parse URL: html.parse(url). Clean HTML: from lxml.html.clean import Cleaner; Cleaner(javascript=True)(doc). Validate XMLSchema: schema = etree.XMLSchema(file="schema.xsd"); schema.validate(doc). XSLT: transform = etree.XSLT(etree.parse("style.xsl")); result = transform(tree). iterparse: for event, elem in etree.iterparse("large.xml", events=("start","end")): .... Objectify: from lxml import objectify; obj = objectify.fromstring(xml_bytes). E-factory: from lxml.builder import E; tree = E.root(E.item("hello")). Claude Code generates lxml XML parsers, XPath queries, HTML scrapers, and streaming large XML processors.
CLAUDE.md for lxml
## lxml Stack
- Version: lxml >= 5.0 | pip install lxml cssselect
- Parse: etree.parse("file.xml") | etree.fromstring(bytes) | html.fromstring(html_str)
- Query: root.xpath("//item[@id='1']") | root.cssselect("div.title") (needs cssselect)
- Build: etree.Element("tag") | etree.SubElement(parent, "tag") | E.root(E.child("text"))
- Output: etree.tostring(root, pretty_print=True, encoding="unicode")
- Stream: etree.iterparse("large.xml", events=("end",), tag="record")
lxml XML and HTML Processing Pipeline
# app/xml_parse.py — lxml parse, XPath, build, validate, HTML scrape, streaming
from __future__ import annotations

import io
import re
from pathlib import Path
from typing import Any, Generator, Iterator
from urllib.parse import urljoin

from lxml import etree, html
from lxml.builder import E, ElementMaker
# ─────────────────────────────────────────────────────────────────────────────
# 1. XML parsing helpers
# ─────────────────────────────────────────────────────────────────────────────
def parse_xml(
    source: str | Path | bytes | io.IOBase,
    remove_comments: bool = False,
    remove_pis: bool = False,
    resolve_entities: bool = True,
) -> etree._ElementTree:
    """
    Parse an XML document from a file path, raw bytes, or a file-like object.

    Returns an ElementTree; call .getroot() to obtain the root Element.

    NOTE(review): resolve_entities=True mirrors lxml's default but is unsafe
    for untrusted documents (entity expansion / XXE) — pass False when the
    XML comes from an external source.

    Example:
        tree = parse_xml("catalog.xml")
        root = tree.getroot()
    """
    xml_parser = etree.XMLParser(
        remove_comments=remove_comments,
        remove_pis=remove_pis,
        resolve_entities=resolve_entities,
        ns_clean=True,
        recover=False,
    )
    if isinstance(source, bytes):
        return etree.parse(io.BytesIO(source), xml_parser)
    if isinstance(source, (str, Path)):
        return etree.parse(str(source), xml_parser)
    return etree.parse(source, xml_parser)
def from_string(
    xml: str | bytes,
    recover: bool = False,
) -> etree._Element:
    """
    Parse XML text or bytes and return the root Element (not an ElementTree).

    recover=True asks lxml to salvage what it can from malformed input.

    Example:
        root = from_string(b"<catalog><item id='1'>Widget</item></catalog>")
        print(root.tag)  # "catalog"
    """
    data = xml.encode() if isinstance(xml, str) else xml
    xml_parser = etree.XMLParser(recover=recover, ns_clean=True)
    return etree.fromstring(data, xml_parser)
def to_string(
    element: etree._Element | etree._ElementTree,
    pretty: bool = True,
    encoding: str = "unicode",
    xml_declaration: bool = False,
) -> str:
    """
    Serialize an Element or ElementTree to a str.

    Args:
        element: node or tree to serialize.
        pretty: indent the output for readability.
        encoding: "unicode" (default) or a codec name such as "UTF-8".
        xml_declaration: prepend an <?xml ...?> declaration.

    Returns:
        The serialized document as str for *every* encoding.  (The previous
        body returned bytes whenever a byte encoding was passed, violating
        the annotated -> str, and raised for unicode + xml_declaration.)
    """
    if encoding.lower() == "unicode" and xml_declaration:
        # lxml rejects an XML declaration when serialising to unicode
        # (ValueError: "Serialisation to unicode must not request an XML
        # declaration") — serialize via UTF-8 and decode instead.
        return etree.tostring(
            element,
            pretty_print=pretty,
            encoding="UTF-8",
            xml_declaration=True,
        ).decode("utf-8")
    result = etree.tostring(
        element,
        pretty_print=pretty,
        encoding=encoding,
        xml_declaration=xml_declaration,
    )
    # Byte encodings make tostring() return bytes; decode to honour -> str.
    return result.decode(encoding) if isinstance(result, bytes) else result
def to_bytes(
    element: etree._Element | etree._ElementTree,
    pretty: bool = True,
    encoding: str = "UTF-8",
    xml_declaration: bool = True,
) -> bytes:
    """Serialize an Element/ElementTree to bytes, with an optional XML declaration."""
    options = {
        "pretty_print": pretty,
        "encoding": encoding,
        "xml_declaration": xml_declaration,
    }
    return etree.tostring(element, **options)
# ─────────────────────────────────────────────────────────────────────────────
# 2. XPath and querying
# ─────────────────────────────────────────────────────────────────────────────
def xpath(
    element: etree._Element | etree._ElementTree,
    expression: str,
    namespaces: dict[str, str] | None = None,
    smart_strings: bool = False,
) -> list:
    """
    Run an XPath expression against *element* and return the result list.

    namespaces maps the prefixes used in *expression* to namespace URIs;
    it is required for documents that declare namespaces.
    smart_strings=False yields plain str results instead of lxml's
    parent-aware "smart" strings.

    Example:
        items = xpath(root, "//item[@status='active']")
        names = xpath(root, "//item/name/text()")
        # With namespace:
        nodes = xpath(root, "//ns:product",
                      namespaces={"ns": "http://example.com/catalog"})
    """
    return element.xpath(
        expression,
        namespaces=namespaces,
        smart_strings=smart_strings,
    )
def find_text(element: etree._Element, path: str, default: str = "") -> str:
    """Return the stripped text of the first element at *path*, or *default*."""
    node = element.find(path)
    if node is None or not node.text:
        return default
    return node.text.strip()
def find_all_text(element: etree._Element, path: str) -> list[str]:
    """Collect the stripped text of every element at *path*, skipping empty ones."""
    texts: list[str] = []
    for node in element.findall(path):
        if node.text:
            texts.append(node.text.strip())
    return texts
def element_to_dict(element: etree._Element, strip_ns: bool = True) -> dict:
    """
    Recursively convert an Element into a nested dict.

    Attributes become "@name" keys, stripped text becomes "#text", and
    repeated child tags collapse into lists.  strip_ns=True drops the
    "{uri}" Clark-notation prefix from tag and attribute names.

    Example:
        d = element_to_dict(order_element)
        # {"@id": "42", "customer": {"name": {"#text": "Alice"}}}
    """
    def local_name(name: str) -> str:
        # Strip the "{uri}" prefix when namespace stripping is requested.
        if strip_ns and "{" in name:
            return name.split("}", 1)[1]
        return name

    # Attributes first, then text, then children — same key order as before.
    out: dict = {f"@{local_name(k)}": v for k, v in element.items()}
    text = (element.text or "").strip()
    if text:
        out["#text"] = text
    for child in element:
        key = local_name(child.tag)
        value = element_to_dict(child, strip_ns)
        if key not in out:
            out[key] = value
        elif isinstance(out[key], list):
            out[key].append(value)
        else:
            # Second occurrence of this tag: promote the entry to a list.
            out[key] = [out[key], value]
    return out
# ─────────────────────────────────────────────────────────────────────────────
# 3. Building XML
# ─────────────────────────────────────────────────────────────────────────────
def build_element(
    tag: str,
    text: str | None = None,
    attrib: dict[str, str] | None = None,
    children: list[etree._Element] | None = None,
    tail: str | None = None,
    nsmap: dict | None = None,
) -> etree._Element:
    """
    Construct a single Element with optional text, attributes, child
    elements, tail text, and namespace map.

    Example:
        item = build_element("item", text="Widget",
                             attrib={"id": "1", "status": "active"})
    """
    node = etree.Element(tag, attrib=attrib or {}, nsmap=nsmap)
    if text is not None:
        node.text = text
    if tail is not None:
        node.tail = tail
    if children:
        node.extend(children)
    return node
def dict_to_element(tag: str, data: dict, parent: etree._Element | None = None) -> etree._Element:
    """
    Build an XML element tree from a nested dict.

    - dict values   -> nested child elements
    - list values   -> one repeated child element per item
    - scalar values -> a child element whose text is str(value)

    Example:
        order = dict_to_element("order", {
            "id": "42",
            "customer": {"name": "Alice", "email": "[email protected]"},
            "items": [{"sku": "A1", "qty": "2"}, {"sku": "B3", "qty": "1"}],
        })
    """
    if parent is None:
        node = etree.Element(tag)
    else:
        node = etree.SubElement(parent, tag)

    def add_leaf(key: str, value) -> None:
        # Scalar: a child element carrying the stringified value as text.
        leaf = etree.SubElement(node, key)
        leaf.text = str(value)

    for key, value in data.items():
        if isinstance(value, dict):
            dict_to_element(key, value, node)
        elif isinstance(value, list):
            for entry in value:
                if isinstance(entry, dict):
                    dict_to_element(key, entry, node)
                else:
                    add_leaf(key, entry)
        else:
            add_leaf(key, value)
    return node
# ─────────────────────────────────────────────────────────────────────────────
# 4. Streaming large XML
# ─────────────────────────────────────────────────────────────────────────────
def stream_elements(
    source: str | Path | bytes,
    tag: str,
    namespaces: dict[str, str] | None = None,
) -> Generator[etree._Element, None, None]:
    """
    Stream a large XML document, yielding one matching element at a time.

    Each yielded element (and its fully-processed preceding siblings) is
    cleared after the consumer resumes, so memory stays bounded regardless
    of file size.

    Args:
        source: file path or raw XML bytes.
        tag: element tag to yield.  May be a prefixed name ("ns:record")
            when *namespaces* maps the prefix to a URI; iterparse itself
            only understands Clark notation ("{uri}record").
        namespaces: prefix -> URI map used to expand a prefixed *tag*.
            (Previously this parameter was accepted but silently ignored.)

    Example:
        for record in stream_elements("large_export.xml", "record"):
            process(record)
    """
    # Expand "prefix:local" into "{uri}local" for iterparse's tag filter.
    if namespaces and ":" in tag and not tag.startswith("{"):
        prefix, _, local = tag.partition(":")
        uri = namespaces.get(prefix)
        if uri:
            tag = f"{{{uri}}}{local}"
    stream = io.BytesIO(source) if isinstance(source, bytes) else source
    context = etree.iterparse(stream, events=("end",), tag=tag)
    for _event, element in context:
        yield element
        # Free memory: drop this element's content, then delete any
        # already-seen preceding siblings from the partially built tree.
        element.clear()
        while element.getprevious() is not None:
            del element.getparent()[0]
    del context
def count_elements(source: str | Path, tag: str) -> int:
    """Count occurrences of *tag* in a large XML file via a streaming parse."""
    return sum(1 for _ in stream_elements(source, tag))
# ─────────────────────────────────────────────────────────────────────────────
# 5. HTML parsing
# ─────────────────────────────────────────────────────────────────────────────
def parse_html(
    source: str | bytes,
    base_url: str | None = None,
) -> html.HtmlElement:
    """
    Parse HTML text or bytes and return the root HtmlElement.

    base_url is recorded on the document so relative links can later be
    resolved (e.g. via make_links_absolute).

    Example:
        doc = parse_html(response.text, base_url="https://example.com")
        links = [a.get("href") for a in doc.cssselect("a[href]")]
    """
    raw = source.encode() if isinstance(source, str) else source
    return html.fromstring(raw, base_url=base_url)
def css_select(document: html.HtmlElement, selector: str) -> list:
    """
    Query *document* with a CSS selector (requires the cssselect package).

    Example:
        titles = css_select(doc, "h1.product-title")
        texts = [t.text_content().strip() for t in titles]
    """
    return document.cssselect(selector)
def extract_links(
    document: html.HtmlElement,
    base_url: str | None = None,
    absolute: bool = True,
) -> list[str]:
    """
    Extract all <a href> link targets from an HTML document.

    Fragment-only links ("#section") and empty hrefs are always skipped.
    absolute=True resolves relative URLs against *base_url* (when given)
    without mutating *document*.

    Fixes two defects in the previous implementation: it mutated the
    caller's document via make_links_absolute(), and — because it resolved
    URLs *before* filtering — fragment links became "https://host#x" and
    slipped past the startswith("#") check.
    """
    links: list[str] = []
    for anchor in document.cssselect("a[href]"):
        href = anchor.get("href")
        # Filter BEFORE resolving so pure-fragment links are still
        # recognizable as such.
        if not href or href.startswith("#"):
            continue
        if absolute and base_url:
            href = urljoin(base_url, href)
        links.append(href)
    return links
def extract_table(document: html.HtmlElement, selector: str = "table") -> list[list[str]]:
    """
    Flatten the first table matching *selector* into rows of cell strings.

    Each row is the stripped text_content() of its <th>/<td> cells; rows
    with no cells are dropped.  Returns [] when no table matches.

    Example:
        rows = extract_table(doc, "table.data-table")
        headers = rows[0]
        data = rows[1:]
    """
    matches = document.cssselect(selector)
    if not matches:
        return []
    rows: list[list[str]] = []
    for tr in matches[0].cssselect("tr"):
        cells = [cell.text_content().strip() for cell in tr.cssselect("th, td")]
        if cells:
            rows.append(cells)
    return rows
def clean_html(
    source: str | bytes,
    remove_javascript: bool = True,
    remove_style_tags: bool = True,
    allow_tags: list[str] | None = None,
) -> str:
    """
    Sanitize HTML by removing scripts, styles, and other unsafe content.

    Args:
        source: HTML text or bytes.
        remove_javascript: strip <script> tags and JS attributes/URLs.
        remove_style_tags: strip <style> tags and style attributes.
        allow_tags: if given, only these tags survive the cleaning pass.

    Returns:
        The cleaned document serialized as an HTML string.

    Raises:
        ImportError: on lxml >= 5.2 the cleaner lives in the separate
            'lxml_html_clean' package; install it if the import fails.
    """
    try:
        from lxml.html.clean import Cleaner
    except ImportError as exc:  # lxml >= 5.2 split the cleaner into its own package
        raise ImportError(
            "lxml.html.clean requires the 'lxml_html_clean' package on "
            "lxml >= 5.2 — run: pip install lxml_html_clean"
        ) from exc
    cleaner = Cleaner(
        javascript=remove_javascript,
        style=remove_style_tags,
        allow_tags=allow_tags,
        remove_unknown_tags=False,
    )
    if isinstance(source, str):
        source = source.encode()
    clean_doc = cleaner.clean_html(html.fromstring(source))
    return html.tostring(clean_doc, encoding="unicode")
# ─────────────────────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # ── 1. Build a small catalog with the E-factory ──────────────────────
    print("=== Build XML ===")
    catalog = E.catalog(
        E.item(E.name("Widget"), E.price("9.99"), id="1", status="active"),
        E.item(E.name("Gadget"), E.price("24.99"), id="2", status="active"),
        E.item(E.name("Doohickey"), E.price("4.99"), id="3", status="discontinued"),
        version="1.0",
    )
    print(to_string(catalog))

    # ── 2. Query it with XPath ───────────────────────────────────────────
    print("=== XPath ===")
    active = xpath(catalog, "//item[@status='active']")
    print(f" Active items: {[e.find('name').text for e in active]}")
    prices = xpath(catalog, "//item/price/text()")
    print(f" Prices: {prices}")

    # ── 3. Build XML from a plain dict ───────────────────────────────────
    print("\n=== dict_to_element ===")
    order = dict_to_element("order", {
        "id": "42",
        "customer": {"name": "Alice", "email": "[email protected]"},
        "total": "34.98",
    })
    print(to_string(order))

    # ── 4. Stream the serialized catalog back in ─────────────────────────
    print("=== Streaming ===")
    xml_bytes = to_bytes(catalog)
    count = sum(1 for _ in stream_elements(xml_bytes, "item"))
    print(f" Streamed {count} <item> elements")

    # ── 5. Scrape a small HTML snippet ───────────────────────────────────
    print("\n=== HTML parsing ===")
    sample_html = b"""
<html><body>
<h1>Products</h1>
<table class="inventory">
<tr><th>Name</th><th>Price</th><th>Stock</th></tr>
<tr><td>Widget</td><td>$9.99</td><td>42</td></tr>
<tr><td>Gadget</td><td>$24.99</td><td>7</td></tr>
</table>
<a href="/products/widget">Widget page</a>
<a href="/products/gadget">Gadget page</a>
</body></html>
"""
    doc = parse_html(sample_html)
    rows = extract_table(doc)
    print(f" Table rows: {rows}")
    links = extract_links(doc, base_url="https://example.com")
    print(f" Links: {links}")
For the xml.etree.ElementTree (stdlib) alternative — Python’s built-in ElementTree is adequate for small XML files with basic find/findall queries; lxml is 5–30× faster, supports full XPath 1.0 (including predicates and axes), handles HTML recovery, validates against XSD schemas, applies XSLT transforms, and provides streaming iterparse for files too large to fit in memory — use lxml for any serious XML/HTML processing. For the BeautifulSoup4 alternative — BeautifulSoup4 wraps lxml (or html.parser) with a more forgiving API and excellent malformed-HTML recovery; lxml’s html module is faster for well-structured or scraping tasks where you want CSS selectors and XPath in the same tool — use BeautifulSoup4 for lenient HTML scraping, lxml when you need XPath queries, XML validation, or streaming parsers. The Claude Skills 360 bundle includes lxml skill sets covering parse_xml() with XMLParser options, from_string() with recover mode, to_string()/to_bytes() serializers, xpath() with namespace mapping, find_text()/find_all_text() helpers, element_to_dict() tree converter, build_element()/dict_to_element() constructors, stream_elements() iterparse memory-efficient streaming, parse_html()/css_select() HTML querying, extract_links()/extract_table() scraping helpers, and clean_html() sanitizer. Start with the free tier to try lxml XML and HTML parsing code generation.