Python’s xml.parsers.expat module provides direct bindings to the Expat C library — the fastest XML parser in the stdlib. import xml.parsers.expat as expat. Create a parser: p = expat.ParserCreate(encoding="utf-8", namespace_separator="|"). Register handlers: p.StartElementHandler = fn(name, attrs), p.EndElementHandler = fn(name), p.CharacterDataHandler = fn(data), p.ProcessingInstructionHandler = fn(target, data), p.CommentHandler = fn(data), p.StartNamespaceDeclHandler = fn(prefix, uri). Feed data: p.Parse(chunk, False) — the isfinal flag is positional-only, so pass it positionally; p.ParseFile(fp). Position: p.CurrentLineNumber, p.CurrentColumnNumber, p.CurrentByteIndex. Errors: expat.ExpatError (subclass of Exception) has .lineno, .offset, .code; use expat.ErrorString(code) to get the message. Security: always set p.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_NEVER) to prevent XXE attacks; never parse untrusted XML with default settings. Namespace mode: ParserCreate(namespace_separator="|") makes tag names "uri|localname"; set separator to something unlikely to appear in URIs. Expat is approximately 2–5× faster than xml.sax and 3–10× faster than xml.dom.minidom for streaming workloads. Claude Code generates ultra-fast streaming XML processors, element counters, tag-frequency analyzers, namespace extractors, and large-file XML scanners.
CLAUDE.md for xml.parsers.expat
## xml.parsers.expat Stack
- Stdlib: import xml.parsers.expat as expat
- Create: p = expat.ParserCreate("utf-8")
- p.SetParamEntityParsing(expat.XML_PARAM_ENTITY_PARSING_NEVER) # XXE guard
- Handlers:
- p.StartElementHandler = fn(name, attrs) # attrs is dict
- p.EndElementHandler = fn(name)
- p.CharacterDataHandler = fn(data)
- p.ProcessingInstructionHandler = fn(target, data)
- p.CommentHandler = fn(data)
- p.StartNamespaceDeclHandler = fn(prefix, uri)
- Feed: p.Parse(chunk, False) / p.Parse(b"", True) # is_final
- p.ParseFile(fp)
- Pos: p.CurrentLineNumber / p.CurrentColumnNumber
- Error: expat.ExpatError .lineno .offset .code
- expat.ErrorString(code)
xml.parsers.expat Streaming Parser Pipeline
# app/xmlexpatutil.py — count, collect, namespaces, error, stream-large
from __future__ import annotations
import io
import xml.parsers.expat as _expat
from dataclasses import dataclass, field
from typing import Any, Callable
# ─────────────────────────────────────────────────────────────────────────────
# 1. Tag-frequency counter (fastest path — no text buffering)
# ─────────────────────────────────────────────────────────────────────────────
def count_elements(xml_source: "bytes | str",
tag: str | None = None) -> dict[str, int]:
"""
Count element occurrences using Expat (no DOM construction).
If tag is given, count only that tag; otherwise count all.
Example:
counts = count_elements(xml_bytes)
counts = count_elements(xml_bytes, "item")
"""
counts: dict[str, int] = {}
if isinstance(xml_source, str):
xml_source = xml_source.encode("utf-8")
p = _expat.ParserCreate("utf-8")
p.SetParamEntityParsing(_expat.XML_PARAM_ENTITY_PARSING_NEVER)
def start(name: str, attrs: dict) -> None:
if tag is None or name == tag:
counts[name] = counts.get(name, 0) + 1
p.StartElementHandler = start
p.Parse(xml_source, True)
return counts
# ─────────────────────────────────────────────────────────────────────────────
# 2. Element text extractor
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class ExtractedElement:
    """One matched element: tag name, attributes, and accumulated text."""
    tag: str                # element name as reported by Expat
    attrs: dict[str, str]   # attributes of the matched element
    text: str               # concatenated character data, stripped

def extract_elements(xml_source: "bytes | str",
                     target_tag: str,
                     max_items: int = 1000) -> list[ExtractedElement]:
    """
    Extract up to *max_items* elements with the given tag, including their
    attributes and the character data of the element and its descendants.

    Bug fixes over the previous version:
      * text inside nested child elements was silently dropped — each closing
        child now folds its text back into its parent's buffer;
      * a target tag nested inside an active match was appended to the
        results past max_items (and with empty attrs); the limit is now
        enforced in the end handler as well.

    Example:
        items = extract_elements(rss_bytes, "item", max_items=20)
        for item in items:
            print(item.attrs, item.text[:60])
    """
    if isinstance(xml_source, str):
        xml_source = xml_source.encode("utf-8")
    results: list[ExtractedElement] = []
    # Stack of (tag, attrs, text_parts) for every open element at or below a
    # matched target.  A non-empty stack means "currently inside a match",
    # which is why the separate _active flag of the old version was redundant.
    stack: list[tuple[str, dict[str, str], list[str]]] = []
    p = _expat.ParserCreate("utf-8")
    p.SetParamEntityParsing(_expat.XML_PARAM_ENTITY_PARSING_NEVER)  # XXE guard

    def start(name: str, attrs: dict) -> None:
        if stack or (name == target_tag and len(results) < max_items):
            # Only target elements need their real attributes; descendants
            # merely accumulate text.
            stack.append((name, dict(attrs) if name == target_tag else {}, []))

    def characters(data: str) -> None:
        if stack:
            stack[-1][2].append(data)

    def end(name: str) -> None:
        if not stack:
            return
        tag_n, tag_attrs, text_parts = stack.pop()
        if tag_n == target_tag and len(results) < max_items:
            results.append(ExtractedElement(
                tag=tag_n,
                attrs=tag_attrs,
                text="".join(text_parts).strip(),
            ))
        if stack:
            # Fold this element's text into its parent so descendant text is
            # not lost (bug fix).
            stack[-1][2].extend(text_parts)

    p.StartElementHandler = start
    p.CharacterDataHandler = characters
    p.EndElementHandler = end
    p.Parse(xml_source, True)
    return results
# ─────────────────────────────────────────────────────────────────────────────
# 3. Namespace extractor
# ─────────────────────────────────────────────────────────────────────────────
def extract_namespaces(xml_source: "bytes | str") -> dict[str, str]:
    """
    Collect every namespace prefix→URI declaration in an XML document.

    The default (unprefixed) namespace is reported under the key
    "(default)".  Parsing is best-effort: declarations seen before a
    well-formedness error are still returned.

    Example:
        ns = extract_namespaces(xml_bytes)
        print(ns)  # {"xsi": "http://www.w3.org/2001/XMLSchema-instance", ...}
    """
    data = xml_source.encode("utf-8") if isinstance(xml_source, str) else xml_source
    declared: dict[str, str] = {}
    # Namespace-aware mode requires a separator argument.
    parser = _expat.ParserCreate("utf-8", "|")
    parser.SetParamEntityParsing(_expat.XML_PARAM_ENTITY_PARSING_NEVER)  # XXE guard

    def on_ns_decl(prefix: str, uri: str) -> None:
        # Expat passes None for the default namespace prefix.
        declared[prefix or "(default)"] = uri

    parser.StartNamespaceDeclHandler = on_ns_decl
    try:
        parser.Parse(data, True)
    except _expat.ExpatError:
        # Deliberate best-effort: keep whatever was declared before the error.
        pass
    return declared
# ─────────────────────────────────────────────────────────────────────────────
# 4. Error collector
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class ParseError:
    """Location and description of the first well-formedness error."""
    message: str   # human-readable Expat error string
    line: int      # 1-based line number of the error
    column: int    # 0-based column offset within that line
    offset: int    # byte offset from the start of the document (-1 if unknown)
    code: int      # numeric Expat error code

def validate_xml(xml_source: "bytes | str") -> "ParseError | None":
    """
    Check an XML document for well-formedness.

    Returns None when the document parses cleanly, otherwise a ParseError
    describing the first error encountered.

    Example:
        err = validate_xml(xml_bytes)
        if err:
            print(f"Line {err.line}: {err.message}")
    """
    if isinstance(xml_source, str):
        xml_source = xml_source.encode("utf-8")
    p = _expat.ParserCreate("utf-8")
    p.SetParamEntityParsing(_expat.XML_PARAM_ENTITY_PARSING_NEVER)  # XXE guard
    try:
        p.Parse(xml_source, True)
        return None
    except _expat.ExpatError as e:
        return ParseError(
            message=_expat.ErrorString(e.code),
            line=e.lineno,
            column=e.offset,
            # BUG FIX: ExpatError has no 'byteoffset' attribute, so the old
            # getattr(e, "byteoffset", -1) always produced -1.  The parser
            # itself records the byte position in ErrorByteIndex.
            offset=p.ErrorByteIndex,
            code=e.code,
        )
# ─────────────────────────────────────────────────────────────────────────────
# 5. Streaming large-file parser
# ─────────────────────────────────────────────────────────────────────────────
def stream_parse(stream: "io.RawIOBase | io.BufferedIOBase",
                 start_handler: "Callable[[str, dict], None] | None" = None,
                 end_handler: "Callable[[str], None] | None" = None,
                 char_handler: "Callable[[str], None] | None" = None,
                 chunk_size: int = 65536) -> "ParseError | None":
    """
    Parse a large XML stream incrementally, reading chunk_size bytes at a
    time.  Handlers left as None are simply not registered, so those events
    are skipped at C speed.

    Returns None on success, ParseError on the first well-formedness error.

    Example:
        counts = {}
        def start(name, attrs): counts[name] = counts.get(name, 0) + 1
        with open("large.xml", "rb") as f:
            err = stream_parse(f, start_handler=start)
        print(counts)
    """
    p = _expat.ParserCreate("utf-8")
    p.SetParamEntityParsing(_expat.XML_PARAM_ENTITY_PARSING_NEVER)  # XXE guard
    if start_handler:
        p.StartElementHandler = start_handler
    if end_handler:
        p.EndElementHandler = end_handler
    if char_handler:
        p.CharacterDataHandler = char_handler
    try:
        while True:
            chunk = stream.read(chunk_size)
            if not chunk:
                # Final empty parse lets Expat flag truncated documents.
                p.Parse(b"", True)
                break
            p.Parse(chunk, False)
    except _expat.ExpatError as e:
        return ParseError(
            message=_expat.ErrorString(e.code),
            line=e.lineno,
            column=e.offset,
            # BUG FIX: previously hard-coded to -1; the parser tracks the
            # byte position of the error in ErrorByteIndex.
            offset=p.ErrorByteIndex,
            code=e.code,
        )
    return None
# ─────────────────────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Self-demo: exercises every helper in this module against one small
    # mixed-namespace document.
    print("=== xml.parsers.expat demo ===")
    sample = b"""<?xml version="1.0"?>
<catalog xmlns:dc="http://purl.org/dc/elements/1.1/">
<book id="b1" lang="en">
<dc:title>Python Cookbook</dc:title>
<author>David Beazley</author>
<price>39.99</price>
</book>
<book id="b2" lang="fr">
<dc:title>Apprendre Python</dc:title>
<author>Mark Lutz</author>
<price>29.99</price>
</book>
<magazine id="m1">
<dc:title>Python Weekly</dc:title>
</magazine>
</catalog>"""
    # ── count_elements ────────────────────────────────────────────────────
    # Tag-frequency tally over the whole document.
    print("\n--- count_elements ---")
    counts = count_elements(sample)
    for name, n in sorted(counts.items()):
        print(f" {name:20s}: {n}")
    # ── extract_elements ──────────────────────────────────────────────────
    # Pull each <book> with its attributes.
    print("\n--- extract_elements (book) ---")
    books = extract_elements(sample, "book")
    for b in books:
        print(f" id={b.attrs.get('id')} lang={b.attrs.get('lang')}")
    # ── extract_namespaces ────────────────────────────────────────────────
    # Expect the dc prefix declared on <catalog>.
    print("\n--- extract_namespaces ---")
    ns = extract_namespaces(sample)
    for prefix, uri in ns.items():
        print(f" {prefix:12s} → {uri}")
    # ── validate_xml ──────────────────────────────────────────────────────
    # Well-formed sample vs. a deliberately broken fragment.
    print("\n--- validate_xml ---")
    good_err = validate_xml(sample)
    bad_err = validate_xml(b"<root><unclosed></root>")
    print(f" good: {good_err}")
    print(f" bad : message={bad_err.message!r} line={bad_err.line}")
    # ── stream_parse ──────────────────────────────────────────────────────
    # Same document fed through the chunked streaming API (tiny chunks to
    # prove incremental parsing works).
    print("\n--- stream_parse ---")
    tag_counts: dict[str, int] = {}
    def on_start(name: str, attrs: dict) -> None:
        tag_counts[name] = tag_counts.get(name, 0) + 1
    stream_err = stream_parse(io.BytesIO(sample),
                              start_handler=on_start, chunk_size=128)
    print(f" error: {stream_err}")
    for name, n in sorted(tag_counts.items()):
        print(f" {name:20s}: {n}")
    print("\n=== done ===")
For the xml.sax stdlib alternative — xml.sax.parseString(data, handler) provides the same event-driven parsing as Expat but through the standard SAX2 interface with ContentHandler.startElement(), endElement(), and characters() callbacks plus ErrorHandler and EntityResolver — use xml.sax when you want a standardised SAX2 API with swappable parser backends; use xml.parsers.expat directly when you need maximum performance or access to Expat-specific features like CurrentByteIndex, namespace-prefix mode, or incremental Parse() control. For the lxml.etree (PyPI) alternative — lxml.etree.iterparse(source, events=("start","end")) provides SAX-speed streaming with a cleaner API, DTD/schema validation, XPath, and XSLT — use lxml for all production large-file XML work; use xml.parsers.expat for zero-dependency, maximum-speed stdlib-only XML streaming. The Claude Skills 360 bundle includes xml.parsers.expat skill sets covering count_elements() tag-frequency counter, ExtractedElement/extract_elements() text extractor, extract_namespaces() namespace mapper, ParseError/validate_xml() well-formedness validator, and stream_parse() large-file streaming parser. Start with the free tier to try Expat streaming patterns and xml.parsers.expat pipeline code generation.