Python’s xml.sax module provides a streaming, event-driven XML parser — ideal for large files where loading the entire tree into memory is impractical. Import it with import xml.sax. Define a handler: subclass xml.sax.handler.ContentHandler and override startElement(name, attrs), endElement(name), characters(content), startDocument(), endDocument(). Parse: xml.sax.parse("data.xml", MyHandler()) or xml.sax.parseString(b"<r/>", MyHandler()). Attributes: attrs.getValue("id"), attrs.getNames() → list[str], attrs.getLength(). Namespace mode: parser = xml.sax.make_parser(); parser.setFeature(xml.sax.handler.feature_namespaces, True) — the handler then receives startElementNS((ns, local), qname, attrs) and endElementNS((ns, local), qname) instead of startElement/endElement. Error handler: parser.setErrorHandler(xml.sax.handler.ErrorHandler()). Incremental feed: parser.setContentHandler(h); parser.feed(chunk_bytes); parser.close(). Note: xml.sax is vulnerable to billion-laughs and quadratic-blowup attacks on untrusted XML — use defusedxml.sax (PyPI) for external data. Claude Code generates streaming large-file parsers, element collectors, path-based extractors, namespace-aware processors, and SAX-to-dict converters.
CLAUDE.md for xml.sax
## xml.sax Stack
- Stdlib: import xml.sax, xml.sax.handler
- Handler: class MyHandler(xml.sax.handler.ContentHandler):
- def startElement(self, name, attrs): ...
- def endElement(self, name): ...
- def characters(self, content): ...
- Parse: xml.sax.parse("file.xml", MyHandler())
- xml.sax.parseString(xml_bytes, MyHandler())
- NS: parser.setFeature(xml.sax.handler.feature_namespaces, True)
- Note: use defusedxml.sax for untrusted XML input
xml.sax Streaming Parser Pipeline
# app/saxutil.py — element collector, path extractor, counter, dict builder, streamer
from __future__ import annotations
import io
import xml.sax
import xml.sax.handler
import xml.sax.xmlreader
from dataclasses import dataclass, field
from typing import Any, Callable
# ─────────────────────────────────────────────────────────────────────────────
# 1. Base handler helpers
# ─────────────────────────────────────────────────────────────────────────────
def attrs_to_dict(attrs: xml.sax.xmlreader.AttributesImpl) -> dict[str, str]:
    """
    Flatten a SAX Attributes object into an ordinary ``{qname: value}`` dict.

    Every name reported by ``attrs.getNames()`` is translated to its
    qualified name, and the value is then fetched by that qualified name.
    An object reporting a length of zero yields an empty dict.

    Example:
        def startElement(self, name, attrs):
            d = attrs_to_dict(attrs)
            print(d)  # {"id": "42", "lang": "en"}
    """
    if not attrs.getLength():
        return {}

    converted: dict[str, str] = {}
    for attr_name in attrs.getNames():
        qname = attrs.getQNameByName(attr_name)
        converted[qname] = attrs.getValueByQName(qname)
    return converted
def _safe_attrs_dict(attrs: Any) -> dict[str, str]:
    """
    Best-effort conversion of a SAX Attributes object to a plain dict.

    Names come from ``attrs.getNames()`` and values from ``attrs.getValue()``.
    If anything goes wrong while reading the attributes, an empty dict is
    returned instead of a partial result.
    """
    collected: dict[str, str] = {}
    try:
        for key in attrs.getNames():
            collected[key] = attrs.getValue(key)
    except Exception:
        # Deliberately lenient: an unusable attrs object means "no attributes".
        return {}
    return collected
# ─────────────────────────────────────────────────────────────────────────────
# 2. Element collector — collect all instances of one tag
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class SAXRecord:
    """
    Snapshot of a single element captured during a SAX parse.

    Fields, in constructor order:
        tag   -- name of the element
        attrs -- attribute name -> attribute value
        text  -- character data gathered for the element
    """

    tag: str
    attrs: dict[str, str]
    text: str
class ElementCollector(xml.sax.handler.ContentHandler):
    """
    SAX handler that collects all elements with a given tag name.

    Only the matching elements are kept, so large XML files can be processed
    without loading the whole tree.  A record's text is all character data
    seen between the element's start and end tags, including text that sits
    inside descendant elements.

    The target tag may nest inside itself (``<item><item/></item>``): every
    occurrence gets its own record, appended when its end tag is reached, so
    inner elements appear in ``records`` before the element enclosing them.

    Attributes:
        target_tag: Tag name to collect.
        records: Collected SAXRecord objects, in end-tag order.

    Example:
        collector = ElementCollector("book")
        xml.sax.parse("library.xml", collector)
        for record in collector.records:
            print(record.attrs.get("id"), record.text.strip())
    """

    def __init__(self, target_tag: str) -> None:
        super().__init__()
        self.target_tag = target_tag
        self.records: list[SAXRecord] = []
        # One (attributes, text chunks) frame per currently open target
        # element.  A stack rather than a single buffer, so that a nested
        # occurrence cannot clobber the state of the element enclosing it.
        self._open: list[tuple[dict[str, str], list[str]]] = []

    def startElement(self, name: str, attrs: Any) -> None:
        """Open a new frame when a target element starts."""
        if name == self.target_tag:
            self._open.append((_safe_attrs_dict(attrs), []))

    def endElement(self, name: str) -> None:
        """Close the innermost open target element and store its record."""
        if name != self.target_tag or not self._open:
            return
        attrs, chunks = self._open.pop()
        self.records.append(SAXRecord(
            tag=name,
            attrs=attrs,
            text="".join(chunks),
        ))

    def characters(self, content: str) -> None:
        """Append character data to every target element that is still open."""
        for _, chunks in self._open:
            chunks.append(content)
def collect_elements(source: "str | bytes | io.IOBase", tag: str) -> list[SAXRecord]:
    """
    Parse source and return all SAXRecords for the given tag.

    ``source`` is interpreted as follows:
      * ``bytes``: an in-memory XML document.
      * ``str`` whose first significant character is ``<``: in-memory XML
        text; it is encoded with the default codec and parsed.
      * any other ``str``: a file name or URL, handed to ``xml.sax.parse``.
        Such a string cannot be well-formed XML, so it is never treated
        as document content.
      * anything else (for example an open file object): handed to
        ``xml.sax.parse`` unchanged.

    Parse errors and errors from opening the source propagate to the caller.

    Example:
        records = collect_elements("library.xml", "book")
        records = collect_elements(xml_bytes, "item")
    """
    handler = ElementCollector(tag)
    if isinstance(source, bytes):
        xml.sax.parseString(source, handler)
    elif isinstance(source, str) and source.lstrip("\ufeff \t\r\n").startswith("<"):
        # Markup text: skip an optional BOM / leading whitespace when sniffing.
        xml.sax.parseString(source.encode(), handler)
    else:
        # File name, URL or file-like object: let the parser stream it.
        xml.sax.parse(source, handler)
    return handler.records
# ─────────────────────────────────────────────────────────────────────────────
# 3. Path extractor — extract text at a specific element path
# ─────────────────────────────────────────────────────────────────────────────
class PathExtractor(xml.sax.handler.ContentHandler):
    """
    SAX handler that collects text content at a specific element path
    (e.g. "catalog/book/title").

    The path is a slash-separated list of element names starting at the
    document root.  Leading and trailing slashes are ignored, so
    "/catalog/book/title" and "catalog/book/title/" select the same elements
    as "catalog/book/title".  For every element located exactly at that path,
    the character data between its start and end tags (including text inside
    descendants) is joined, stripped of surrounding whitespace and appended
    to ``values``.

    Example:
        extractor = PathExtractor("catalog/book/title")
        xml.sax.parseString(xml_bytes, extractor)
        print(extractor.values)  # all title texts
    """

    def __init__(self, path: str) -> None:
        super().__init__()
        # Strip outer slashes first: splitting "/a/b" directly would yield an
        # empty first segment that no element name can ever match.
        self._path_parts = path.strip("/").split("/")
        self._stack: list[str] = []   # names of the currently open elements
        self._collecting = False      # True while inside a matching element
        self._buf: list[str] = []
        self.values: list[str] = []

    def startElement(self, name: str, attrs: Any) -> None:
        """Track the open-element path and start buffering on a match."""
        self._stack.append(name)
        if self._stack == self._path_parts:
            self._collecting = True
            self._buf = []

    def endElement(self, name: str) -> None:
        """Emit the buffered text when the matching element itself closes."""
        # Descendants of the match also end while collecting; only the
        # element whose full path equals the target finishes the value.
        if self._collecting and self._stack == self._path_parts:
            self.values.append("".join(self._buf).strip())
            self._collecting = False
        if self._stack:
            self._stack.pop()

    def characters(self, content: str) -> None:
        """Buffer character data while inside a matching element."""
        if self._collecting:
            self._buf.append(content)
def extract_path(source: "str | bytes", path: str) -> list[str]:
    """
    Run a PathExtractor over an in-memory XML document.

    Text input is encoded with the default codec first; anything else is
    passed to the parser as-is.  Returns the extractor's ``values`` list.

    Example:
        titles = extract_path(xml_bytes, "catalog/book/title")
    """
    extractor = PathExtractor(path)
    if isinstance(source, str):
        payload = source.encode()
    else:
        payload = source
    xml.sax.parseString(payload, extractor)
    return extractor.values
# ─────────────────────────────────────────────────────────────────────────────
# 4. Tag counter
# ─────────────────────────────────────────────────────────────────────────────
class TagCounter(xml.sax.handler.ContentHandler):
    """
    SAX handler that tallies element start tags by name.

    After a parse, ``counts`` maps each tag name to the number of start tags
    seen for it, in order of first appearance.

    Example:
        counter = TagCounter()
        xml.sax.parseString(xml_bytes, counter)
        print(counter.counts)  # {"catalog": 1, "book": 5, "title": 5, ...}
    """

    def __init__(self) -> None:
        super().__init__()
        self.counts: dict[str, int] = {}

    def startElement(self, name: str, attrs: Any) -> None:
        """Bump the tally for ``name``; the attributes are ignored."""
        seen_so_far = self.counts.get(name, 0)
        self.counts[name] = seen_so_far + 1
# ─────────────────────────────────────────────────────────────────────────────
# 5. SAX-to-dict builder
# ─────────────────────────────────────────────────────────────────────────────
class DictBuilder(xml.sax.handler.ContentHandler):
    """
    SAX handler that builds a nested dict/list structure from XML.

    Conversion rules:
      * attributes are stored under the "@attrs" key of the element's dict;
      * an element with text but no attributes or children becomes that
        (whitespace-stripped) string;
      * an element with text plus attributes or children keeps the text
        under the "#text" key; in mixed content the text pieces around the
        child elements are concatenated;
      * an element with no text, attributes or children becomes ``{}``;
      * lists are created when the same tag appears multiple times under
        one parent.

    After parsing, ``result`` is ``{root_tag: converted_root}``.

    Example:
        builder = DictBuilder()
        xml.sax.parseString(xml_bytes, builder)
        print(builder.result)
    """

    def __init__(self) -> None:
        super().__init__()
        # Parallel stacks with one entry per currently open element.
        self._stack: list[dict[str, Any]] = []    # node dicts being built
        self._keys: list[str] = []                # tag names
        self._text_stack: list[list[str]] = []    # text chunks per element
        self.result: "dict[str, Any] | None" = None

    def startElement(self, name: str, attrs: Any) -> None:
        """Open a node for ``name`` and give it its own text buffer."""
        node: dict[str, Any] = {}
        a = _safe_attrs_dict(attrs)
        if a:
            node["@attrs"] = a
        self._stack.append(node)
        self._keys.append(name)
        # A private buffer per element keeps text that precedes a child
        # element from being thrown away when that child starts.
        self._text_stack.append([])

    def endElement(self, name: str) -> None:
        """Finish the innermost node and attach it to its parent (or result)."""
        node = self._stack.pop()
        # Popped together with the node so the list tracks only open
        # elements instead of growing for the whole document.
        self._keys.pop()
        text = "".join(self._text_stack.pop()).strip()

        value: Any = node
        if text:
            if node:
                node["#text"] = text
            else:
                value = text  # pure text node -> plain string

        if not self._stack:
            self.result = {name: value}
            return

        parent = self._stack[-1]
        if name not in parent:
            parent[name] = value
        elif isinstance(parent[name], list):
            parent[name].append(value)
        else:
            # Second occurrence of this tag: promote the entry to a list.
            parent[name] = [parent[name], value]

    def characters(self, content: str) -> None:
        """Append character data to the innermost open element's buffer."""
        if self._text_stack:
            self._text_stack[-1].append(content)
def xml_to_dict(source: "str | bytes") -> "dict[str, Any] | None":
    """
    Convert an in-memory XML document to nested dicts/lists via DictBuilder.

    Text input is encoded with the default codec first; anything else is
    passed to the parser as-is.  Returns the builder's ``result``.

    Example:
        d = xml_to_dict(xml_bytes)
        print(d["catalog"]["book"][0]["title"])
    """
    if isinstance(source, str):
        payload = source.encode()
    else:
        payload = source
    handler = DictBuilder()
    xml.sax.parseString(payload, handler)
    return handler.result
# ─────────────────────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Demo: run each helper of this module against a small catalog document.
    import json

    print("=== xml.sax demo ===")

    # The document contains a non-ASCII character ("é").  Bytes literals may
    # only hold ASCII, so it is written as text and encoded explicitly; UTF-8
    # is what an XML parser assumes when the declaration names no encoding.
    xml_src = """<?xml version="1.0"?>
<catalog>
<book id="1" lang="en">
<title>Python Cookbook</title>
<author>Beazley</author>
<year>2013</year>
</book>
<book id="2" lang="en">
<title>Fluent Python</title>
<author>Ramalho</author>
<year>2022</year>
</book>
<book id="3" lang="de">
<title>Python lernen</title>
<author>Ziadé</author>
<year>2021</year>
</book>
</catalog>""".encode("utf-8")

    # ── collect_elements ──────────────────────────────────────────────────────
    print("\n--- collect_elements('book') ---")
    books = collect_elements(xml_src, "book")
    print(f" found {len(books)} books")
    for book in books:
        print(f" id={book.attrs.get('id')} lang={book.attrs.get('lang')}")

    # ── extract_path ──────────────────────────────────────────────────────────
    print("\n--- extract_path ---")
    titles = extract_path(xml_src, "catalog/book/title")
    authors = extract_path(xml_src, "catalog/book/author")
    for t, a in zip(titles, authors):
        print(f" {t!r:30s} by {a!r}")

    # ── TagCounter ────────────────────────────────────────────────────────────
    print("\n--- TagCounter ---")
    counter = TagCounter()
    xml.sax.parseString(xml_src, counter)
    for tag, count in sorted(counter.counts.items()):
        print(f" {tag:15s}: {count}")

    # ── xml_to_dict ───────────────────────────────────────────────────────────
    print("\n--- xml_to_dict ---")
    d = xml_to_dict(xml_src)
    # Only the first 400 characters of the JSON dump are shown.
    print(json.dumps(d, indent=2)[:400])

    print("\n=== done ===")
For the xml.etree.ElementTree alternative — ET.parse() builds the whole tree, while ET.iterparse(source) yields (event, element) tuples as the file is read and is suitable for streaming large files without the boilerplate of writing a ContentHandler subclass (clear each element once it has been processed so memory stays bounded) — prefer ET.iterparse() over xml.sax when you want streaming without subclassing; use xml.sax when you need the full SAX2 handler interface or are integrating with systems that expect ContentHandler callbacks. For the lxml.etree.iterparse alternative — lxml.etree.iterparse(source, events=("start","end")) is 2–5× faster than stdlib SAX on large files and supports XPath and schema validation in the same pass — use lxml.etree.iterparse for performance-critical large-XML pipelines; use xml.sax for zero-dependency stdlib-only streaming. The Claude Skills 360 bundle includes xml.sax skill sets covering attrs_to_dict() / _safe_attrs_dict() helpers, SAXRecord dataclass + ElementCollector / collect_elements(), PathExtractor / extract_path(), TagCounter, and DictBuilder / xml_to_dict() SAX-to-nested-dict converter. Start with the free tier to try streaming XML patterns and xml.sax pipeline code generation.