lxml is a fast XML/HTML parser built on libxml2. pip install lxml. Parse XML: from lxml import etree; tree = etree.parse("file.xml"); root = tree.getroot(). FromString: root = etree.fromstring(b"<root><child/></root>"). Element: e = etree.Element("item"); e.text = "hello"; e.set("id","1"). SubElement: child = etree.SubElement(root, "child", attrib={"k":"v"}). ToString: etree.tostring(root, pretty_print=True, encoding="unicode"). XPath: root.xpath("//item[@id='1']"). XPath with ns: root.xpath("//ns:item", namespaces={"ns":"http://example.com"}). Find: root.find("child"). FindAll: root.findall(".//item"). FindText: root.findtext("title"). HTML parse: from lxml import html; doc = html.fromstring(content). CSS: doc.cssselect("div.product h2") (requires pip install cssselect). HTML parse URL: html.parse(url). Clean HTML: from lxml.html.clean import Cleaner; Cleaner(javascript=True)(doc). Validate XMLSchema: schema = etree.XMLSchema(file="schema.xsd"); schema.validate(doc). XSLT: transform = etree.XSLT(etree.parse("style.xsl")); result = transform(tree). iterparse: for event, elem in etree.iterparse("large.xml", events=("start","end")): .... Objectify: from lxml import objectify; obj = objectify.fromstring(xml_bytes). E-factory: from lxml.builder import E; tree = E.root(E.item("hello")). Claude Code generates lxml XML parsers, XPath queries, HTML scrapers, and streaming large XML processors.
CLAUDE.md for lxml
## lxml Stack
- Version: lxml >= 5.0 | pip install lxml cssselect
- Parse: etree.parse("file.xml") | etree.fromstring(bytes) | html.fromstring(html_str)
- Query: root.xpath("//item[@id='1']") | root.cssselect("div.title") (needs cssselect)
- Build: etree.Element("tag") | etree.SubElement(parent, "tag") | E.root(E.child("text"))
- Output: etree.tostring(root, pretty_print=True, encoding="unicode")
- Stream: etree.iterparse("large.xml", events=("end",), tag="record")
lxml XML and HTML Processing Pipeline
# app/xml_parse.py — lxml parse, XPath, build, validate, HTML scrape, streaming
from __future__ import annotations

import io
import re
from pathlib import Path
from typing import Any, Generator, Iterator
from urllib.parse import urljoin

from lxml import etree, html
from lxml.builder import E, ElementMaker
# ─────────────────────────────────────────────────────────────────────────────
# 1. XML parsing helpers
# ─────────────────────────────────────────────────────────────────────────────
def parse_xml(
    source: str | Path | bytes | io.IOBase,
    remove_comments: bool = False,
    remove_pis: bool = False,
    resolve_entities: bool = True,
) -> etree._ElementTree:
    """
    Parse an XML document from a file path, raw bytes, or a file-like object.

    Returns an ElementTree; call .getroot() to obtain the root Element.

    NOTE(review): resolve_entities=True mirrors lxml's default but is unsafe
    for untrusted documents (entity expansion / XXE) — pass False when the
    XML comes from an external source.

    Example:
        tree = parse_xml("catalog.xml")
        root = tree.getroot()
    """
    xml_parser = etree.XMLParser(
        remove_comments=remove_comments,
        remove_pis=remove_pis,
        resolve_entities=resolve_entities,
        ns_clean=True,
        recover=False,
    )
    if isinstance(source, bytes):
        return etree.parse(io.BytesIO(source), xml_parser)
    if isinstance(source, (str, Path)):
        return etree.parse(str(source), xml_parser)
    return etree.parse(source, xml_parser)
def from_string(
    xml: str | bytes,
    recover: bool = False,
) -> etree._Element:
    """
    Parse XML text or bytes and return the root Element (not an ElementTree).

    recover=True asks lxml to salvage what it can from malformed input.

    Example:
        root = from_string(b"<catalog><item id='1'>Widget</item></catalog>")
        print(root.tag)  # "catalog"
    """
    data = xml.encode() if isinstance(xml, str) else xml
    xml_parser = etree.XMLParser(recover=recover, ns_clean=True)
    return etree.fromstring(data, xml_parser)
def to_string(
    element: etree._Element | etree._ElementTree,
    pretty: bool = True,
    encoding: str = "unicode",
    xml_declaration: bool = False,
) -> str:
    """
    Serialize an Element or ElementTree to a str.

    Args:
        element: node or tree to serialize.
        pretty: indent the output for readability.
        encoding: "unicode" (default) or a codec name such as "UTF-8".
        xml_declaration: prepend an <?xml ...?> declaration.

    Returns:
        The serialized document as str for *every* encoding.  (The previous
        body returned bytes whenever a byte encoding was passed, violating
        the annotated -> str, and raised for unicode + xml_declaration.)
    """
    if encoding.lower() == "unicode" and xml_declaration:
        # lxml rejects an XML declaration when serialising to unicode
        # (ValueError: "Serialisation to unicode must not request an XML
        # declaration") — serialize via UTF-8 and decode instead.
        return etree.tostring(
            element,
            pretty_print=pretty,
            encoding="UTF-8",
            xml_declaration=True,
        ).decode("utf-8")
    result = etree.tostring(
        element,
        pretty_print=pretty,
        encoding=encoding,
        xml_declaration=xml_declaration,
    )
    # Byte encodings make tostring() return bytes; decode to honour -> str.
    return result.decode(encoding) if isinstance(result, bytes) else result
def to_bytes(
    element: etree._Element | etree._ElementTree,
    pretty: bool = True,
    encoding: str = "UTF-8",
    xml_declaration: bool = True,
) -> bytes:
    """Serialize an Element/ElementTree to bytes, with an optional XML declaration."""
    options = {
        "pretty_print": pretty,
        "encoding": encoding,
        "xml_declaration": xml_declaration,
    }
    return etree.tostring(element, **options)
# ─────────────────────────────────────────────────────────────────────────────
# 2. XPath and querying
# ─────────────────────────────────────────────────────────────────────────────
def xpath(
    element: etree._Element | etree._ElementTree,
    expression: str,
    namespaces: dict[str, str] | None = None,
    smart_strings: bool = False,
) -> list:
    """
    Run an XPath expression against *element* and return the result list.

    namespaces maps the prefixes used in *expression* to namespace URIs;
    it is required for documents that declare namespaces.
    smart_strings=False yields plain str results instead of lxml's
    parent-aware "smart" strings.

    Example:
        items = xpath(root, "//item[@status='active']")
        names = xpath(root, "//item/name/text()")
        # With namespace:
        nodes = xpath(root, "//ns:product",
                      namespaces={"ns": "http://example.com/catalog"})
    """
    return element.xpath(
        expression,
        namespaces=namespaces,
        smart_strings=smart_strings,
    )
def find_text(element: etree._Element, path: str, default: str = "") -> str:
    """Return the stripped text of the first element at *path*, or *default*."""
    node = element.find(path)
    if node is None or not node.text:
        return default
    return node.text.strip()
def find_all_text(element: etree._Element, path: str) -> list[str]:
    """Collect the stripped text of every element at *path*, skipping empty ones."""
    texts: list[str] = []
    for node in element.findall(path):
        if node.text:
            texts.append(node.text.strip())
    return texts
def element_to_dict(element: etree._Element, strip_ns: bool = True) -> dict:
    """
    Recursively convert an Element into a nested dict.

    Attributes become "@name" keys, stripped text becomes "#text", and
    repeated child tags collapse into lists.  strip_ns=True drops the
    "{uri}" Clark-notation prefix from tag and attribute names.

    Example:
        d = element_to_dict(order_element)
        # {"@id": "42", "customer": {"name": {"#text": "Alice"}}}
    """
    def local_name(name: str) -> str:
        # Strip the "{uri}" prefix when namespace stripping is requested.
        if strip_ns and "{" in name:
            return name.split("}", 1)[1]
        return name

    # Attributes first, then text, then children — same key order as before.
    out: dict = {f"@{local_name(k)}": v for k, v in element.items()}
    text = (element.text or "").strip()
    if text:
        out["#text"] = text
    for child in element:
        key = local_name(child.tag)
        value = element_to_dict(child, strip_ns)
        if key not in out:
            out[key] = value
        elif isinstance(out[key], list):
            out[key].append(value)
        else:
            # Second occurrence of this tag: promote the entry to a list.
            out[key] = [out[key], value]
    return out
# ─────────────────────────────────────────────────────────────────────────────
# 3. Building XML
# ─────────────────────────────────────────────────────────────────────────────
def build_element(
    tag: str,
    text: str | None = None,
    attrib: dict[str, str] | None = None,
    children: list[etree._Element] | None = None,
    tail: str | None = None,
    nsmap: dict | None = None,
) -> etree._Element:
    """
    Construct a single Element with optional text, attributes, child
    elements, tail text, and namespace map.

    Example:
        item = build_element("item", text="Widget",
                             attrib={"id": "1", "status": "active"})
    """
    node = etree.Element(tag, attrib=attrib or {}, nsmap=nsmap)
    if text is not None:
        node.text = text
    if tail is not None:
        node.tail = tail
    if children:
        node.extend(children)
    return node
def dict_to_element(tag: str, data: dict, parent: etree._Element | None = None) -> etree._Element:
    """
    Build an XML element tree from a nested dict.

    - dict values   -> nested child elements
    - list values   -> one repeated child element per item
    - scalar values -> a child element whose text is str(value)

    Example:
        order = dict_to_element("order", {
            "id": "42",
            "customer": {"name": "Alice", "email": "[email protected]"},
            "items": [{"sku": "A1", "qty": "2"}, {"sku": "B3", "qty": "1"}],
        })
    """
    if parent is None:
        node = etree.Element(tag)
    else:
        node = etree.SubElement(parent, tag)

    def add_leaf(key: str, value) -> None:
        # Scalar: a child element carrying the stringified value as text.
        leaf = etree.SubElement(node, key)
        leaf.text = str(value)

    for key, value in data.items():
        if isinstance(value, dict):
            dict_to_element(key, value, node)
        elif isinstance(value, list):
            for entry in value:
                if isinstance(entry, dict):
                    dict_to_element(key, entry, node)
                else:
                    add_leaf(key, entry)
        else:
            add_leaf(key, value)
    return node
# ─────────────────────────────────────────────────────────────────────────────
# 4. Streaming large XML
# ─────────────────────────────────────────────────────────────────────────────
def stream_elements(
    source: str | Path | bytes,
    tag: str,
    namespaces: dict[str, str] | None = None,
) -> Generator[etree._Element, None, None]:
    """
    Stream a large XML document, yielding one matching element at a time.

    Each yielded element (and its fully-processed preceding siblings) is
    cleared after the consumer resumes, so memory stays bounded regardless
    of file size.

    Args:
        source: file path or raw XML bytes.
        tag: element tag to yield.  May be a prefixed name ("ns:record")
            when *namespaces* maps the prefix to a URI; iterparse itself
            only understands Clark notation ("{uri}record").
        namespaces: prefix -> URI map used to expand a prefixed *tag*.
            (Previously this parameter was accepted but silently ignored.)

    Example:
        for record in stream_elements("large_export.xml", "record"):
            process(record)
    """
    # Expand "prefix:local" into "{uri}local" for iterparse's tag filter.
    if namespaces and ":" in tag and not tag.startswith("{"):
        prefix, _, local = tag.partition(":")
        uri = namespaces.get(prefix)
        if uri:
            tag = f"{{{uri}}}{local}"
    stream = io.BytesIO(source) if isinstance(source, bytes) else source
    context = etree.iterparse(stream, events=("end",), tag=tag)
    for _event, element in context:
        yield element
        # Free memory: drop this element's content, then delete any
        # already-seen preceding siblings from the partially built tree.
        element.clear()
        while element.getprevious() is not None:
            del element.getparent()[0]
    del context
def count_elements(source: str | Path, tag: str) -> int:
    """Count occurrences of *tag* in a large XML file via a streaming parse."""
    return sum(1 for _ in stream_elements(source, tag))
# ─────────────────────────────────────────────────────────────────────────────
# 5. HTML parsing
# ─────────────────────────────────────────────────────────────────────────────
def parse_html(
    source: str | bytes,
    base_url: str | None = None,
) -> html.HtmlElement:
    """
    Parse HTML text or bytes and return the root HtmlElement.

    base_url is recorded on the document so relative links can later be
    resolved (e.g. via make_links_absolute).

    Example:
        doc = parse_html(response.text, base_url="https://example.com")
        links = [a.get("href") for a in doc.cssselect("a[href]")]
    """
    raw = source.encode() if isinstance(source, str) else source
    return html.fromstring(raw, base_url=base_url)
def css_select(document: html.HtmlElement, selector: str) -> list:
    """
    Query *document* with a CSS selector (requires the cssselect package).

    Example:
        titles = css_select(doc, "h1.product-title")
        texts = [t.text_content().strip() for t in titles]
    """
    return document.cssselect(selector)
def extract_links(
    document: html.HtmlElement,
    base_url: str | None = None,
    absolute: bool = True,
) -> list[str]:
    """
    Extract all <a href> link targets from an HTML document.

    Fragment-only links ("#section") and empty hrefs are always skipped.
    absolute=True resolves relative URLs against *base_url* (when given)
    without mutating *document*.

    Fixes two defects in the previous implementation: it mutated the
    caller's document via make_links_absolute(), and — because it resolved
    URLs *before* filtering — fragment links became "https://host#x" and
    slipped past the startswith("#") check.
    """
    links: list[str] = []
    for anchor in document.cssselect("a[href]"):
        href = anchor.get("href")
        # Filter BEFORE resolving so pure-fragment links are still
        # recognizable as such.
        if not href or href.startswith("#"):
            continue
        if absolute and base_url:
            href = urljoin(base_url, href)
        links.append(href)
    return links
def extract_table(document: html.HtmlElement, selector: str = "table") -> list[list[str]]:
    """
    Flatten the first table matching *selector* into rows of cell strings.

    Each row is the stripped text_content() of its <th>/<td> cells; rows
    with no cells are dropped.  Returns [] when no table matches.

    Example:
        rows = extract_table(doc, "table.data-table")
        headers = rows[0]
        data = rows[1:]
    """
    matches = document.cssselect(selector)
    if not matches:
        return []
    rows: list[list[str]] = []
    for tr in matches[0].cssselect("tr"):
        cells = [cell.text_content().strip() for cell in tr.cssselect("th, td")]
        if cells:
            rows.append(cells)
    return rows
def clean_html(
    source: str | bytes,
    remove_javascript: bool = True,
    remove_style_tags: bool = True,
    allow_tags: list[str] | None = None,
) -> str:
    """
    Sanitize HTML by removing scripts, styles, and other unsafe content.

    Args:
        source: HTML text or bytes.
        remove_javascript: strip <script> tags and JS attributes/URLs.
        remove_style_tags: strip <style> tags and style attributes.
        allow_tags: if given, only these tags survive the cleaning pass.

    Returns:
        The cleaned document serialized as an HTML string.

    Raises:
        ImportError: on lxml >= 5.2 the cleaner lives in the separate
            'lxml_html_clean' package; install it if the import fails.
    """
    try:
        from lxml.html.clean import Cleaner
    except ImportError as exc:  # lxml >= 5.2 split the cleaner into its own package
        raise ImportError(
            "lxml.html.clean requires the 'lxml_html_clean' package on "
            "lxml >= 5.2 — run: pip install lxml_html_clean"
        ) from exc
    cleaner = Cleaner(
        javascript=remove_javascript,
        style=remove_style_tags,
        allow_tags=allow_tags,
        remove_unknown_tags=False,
    )
    if isinstance(source, str):
        source = source.encode()
    clean_doc = cleaner.clean_html(html.fromstring(source))
    return html.tostring(clean_doc, encoding="unicode")
# ─────────────────────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # ── 1. Build a small catalog with the E-factory ──────────────────────
    print("=== Build XML ===")
    catalog = E.catalog(
        E.item(E.name("Widget"), E.price("9.99"), id="1", status="active"),
        E.item(E.name("Gadget"), E.price("24.99"), id="2", status="active"),
        E.item(E.name("Doohickey"), E.price("4.99"), id="3", status="discontinued"),
        version="1.0",
    )
    print(to_string(catalog))

    # ── 2. Query it with XPath ───────────────────────────────────────────
    print("=== XPath ===")
    active = xpath(catalog, "//item[@status='active']")
    print(f" Active items: {[e.find('name').text for e in active]}")
    prices = xpath(catalog, "//item/price/text()")
    print(f" Prices: {prices}")

    # ── 3. Build XML from a plain dict ───────────────────────────────────
    print("\n=== dict_to_element ===")
    order = dict_to_element("order", {
        "id": "42",
        "customer": {"name": "Alice", "email": "[email protected]"},
        "total": "34.98",
    })
    print(to_string(order))

    # ── 4. Stream the serialized catalog back in ─────────────────────────
    print("=== Streaming ===")
    xml_bytes = to_bytes(catalog)
    count = sum(1 for _ in stream_elements(xml_bytes, "item"))
    print(f" Streamed {count} <item> elements")

    # ── 5. Scrape a small HTML snippet ───────────────────────────────────
    print("\n=== HTML parsing ===")
    sample_html = b"""
<html><body>
<h1>Products</h1>
<table class="inventory">
<tr><th>Name</th><th>Price</th><th>Stock</th></tr>
<tr><td>Widget</td><td>$9.99</td><td>42</td></tr>
<tr><td>Gadget</td><td>$24.99</td><td>7</td></tr>
</table>
<a href="/products/widget">Widget page</a>
<a href="/products/gadget">Gadget page</a>
</body></html>
"""
    doc = parse_html(sample_html)
    rows = extract_table(doc)
    print(f" Table rows: {rows}")
    links = extract_links(doc, base_url="https://example.com")
    print(f" Links: {links}")
For the xml.etree.ElementTree (stdlib) alternative — Python’s built-in ElementTree is adequate for small XML files with basic find/findall queries; lxml is 5–30× faster, supports full XPath 1.0 (including predicates and axes), handles HTML recovery, validates against XSD schemas, applies XSLT transforms, and provides streaming iterparse for files too large to fit in memory — use lxml for any serious XML/HTML processing. For the BeautifulSoup4 alternative — BeautifulSoup4 wraps lxml (or html.parser) with a more forgiving API and excellent malformed-HTML recovery; lxml’s html module is faster for well-structured or scraping tasks where you want CSS selectors and XPath in the same tool — use BeautifulSoup4 for lenient HTML scraping, lxml when you need XPath queries, XML validation, or streaming parsers. The Claude Skills 360 bundle includes lxml skill sets covering parse_xml() with XMLParser options, from_string() with recover mode, to_string()/to_bytes() serializers, xpath() with namespace mapping, find_text()/find_all_text() helpers, element_to_dict() tree converter, build_element()/dict_to_element() constructors, stream_elements() iterparse memory-efficient streaming, parse_html()/css_select() HTML querying, extract_links()/extract_table() scraping helpers, and clean_html() sanitizer. Start with the free tier to try lxml XML and HTML parsing code generation.