Python’s xml.sax.handler module defines the base handler classes for SAX2 XML parsing. Import it with from xml.sax import handler. It provides four handler interfaces: ContentHandler — the main handler; override startElement(name, attrs), endElement(name), characters(content), startDocument(), endDocument(), startPrefixMapping(prefix, uri), endPrefixMapping(prefix), ignorableWhitespace(whitespace), processingInstruction(target, data). ErrorHandler — override warning(exc), error(exc), fatalError(exc); the default warning() prints the exception, while the default error() and fatalError() raise it. EntityResolver — override resolveEntity(publicId, systemId) to return a system identifier string or an InputSource; the default implementation returns systemId (since Python 3.7.1 the expat-based parser does not process external general entities unless that feature is explicitly enabled). DTDHandler — override notationDecl(name, publicId, systemId) and unparsedEntityDecl(name, publicId, systemId, ndata). Feature constants: handler.feature_namespaces — enable namespace processing; handler.feature_validation — request DTD validation (the bundled expat parser is non-validating and does not support it). Property constants: handler.property_lexical_handler — register a LexicalHandler for comments and CDATA events. Register handlers with parser.setContentHandler(h), parser.setErrorHandler(e). Claude Code generates content accumulator handlers, streaming element collectors, schema-like structure validators, event loggers, and multi-pass XML pipeline stages.
CLAUDE.md for xml.sax.handler
## xml.sax.handler Stack
- Stdlib: import xml.sax
- from xml.sax import handler, parseString, parse
- Content: class MyHandler(handler.ContentHandler):
- def startElement(self, name, attrs): ...
- def endElement(self, name): ...
- def characters(self, content): ...
- Error: class MyErrors(handler.ErrorHandler):
- def fatalError(self, exc): raise exc
- Parse: p = xml.sax.make_parser()
- p.setContentHandler(MyHandler())
- p.setErrorHandler(MyErrors())
- p.setFeature(handler.feature_namespaces, True)
- p.parse(io.BytesIO(xml_bytes))
xml.sax.handler SAX2 Pipeline
# app/xmlsaxhandlerutil.py — collect, count, validate, log, locate, namespace
from __future__ import annotations
import io
import xml.sax
import xml.sax.handler as _handler
from dataclasses import dataclass, field
from typing import Any
# ─────────────────────────────────────────────────────────────────────────────
# 1. Text-accumulator ContentHandler
# ─────────────────────────────────────────────────────────────────────────────
class TextAccumulator(_handler.ContentHandler):
    """
    SAX content handler that gathers the text of elements.

    When ``target_tags`` is None every element is recorded; otherwise only
    elements whose name is in ``target_tags``.  Character data is credited
    to the innermost element currently being tracked, so text inside a
    nested child is not folded into its parent's text.
    After parsing: .results is a list of (tag, text) tuples, appended as
    each recorded element closes, with surrounding whitespace stripped.
    Example:
        acc = TextAccumulator(target_tags={"title", "author"})
        xml.sax.parseString(xml_bytes, acc)
        for tag, text in acc.results:
            print(tag, text)
    """

    def __init__(self, target_tags: "set[str] | None" = None) -> None:
        super().__init__()
        self.target_tags = target_tags
        self.results: list[tuple[str, str]] = []
        # Open elements being tracked, innermost last: (name, text chunks).
        self._stack: list[tuple[str, list[str]]] = []
        # True while at least one tracked element is open.
        self._active = False

    def _wanted(self, name: str) -> bool:
        """Return True when *name* is one of the elements to record."""
        return self.target_tags is None or name in self.target_tags

    def startElement(self, name: str, attrs: Any) -> None:
        """Begin tracking *name* if it is wanted or nested in a tracked element."""
        wanted = self._wanted(name)
        if not (wanted or self._active):
            return
        self._stack.append((name, []))
        if wanted:
            self._active = True

    def characters(self, content: str) -> None:
        """Append *content* to the innermost tracked element, if any."""
        if not self._stack:
            return
        _, chunks = self._stack[-1]
        chunks.append(content)

    def endElement(self, name: str) -> None:
        """Close the innermost tracked element and record it when wanted."""
        if not self._stack:
            return
        open_name, chunks = self._stack[-1]
        if open_name == name:
            self._stack.pop()
            if self._wanted(name):
                self.results.append((name, "".join(chunks).strip()))
        self._active = bool(self._stack)
# ─────────────────────────────────────────────────────────────────────────────
# 2. Attribute collector
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class ElementRecord:
    """One element start event, as captured by AttributeCollector."""
    tag: str  # element name as passed to startElement
    attrs: dict[str, str]  # attribute name -> value, copied from the SAX attrs object
    depth: int  # nesting depth at which the element opened; the root element is 1
class AttributeCollector(_handler.ContentHandler):
    """
    SAX content handler that records elements together with their attributes.

    With ``target_tag`` set, only elements of that name are recorded;
    with None, every element is.  Each record also carries the nesting
    depth at which the element opened (the root element is depth 1).
    Example:
        col = AttributeCollector(target_tag="book")
        xml.sax.parseString(xml_bytes, col)
        for rec in col.records:
            print(rec.attrs)
    """

    def __init__(self, target_tag: str | None = None) -> None:
        super().__init__()
        self.target_tag = target_tag
        self.records: list[ElementRecord] = []
        self._depth = 0

    def startElement(self, name: str, attrs: Any) -> None:
        """Track depth and store a record for every matching element."""
        self._depth += 1
        if self.target_tag is not None and name != self.target_tag:
            return
        record = ElementRecord(
            tag=name,
            # Copy into a plain dict so the record does not keep the SAX attrs object.
            attrs=dict(attrs.items()),
            depth=self._depth,
        )
        self.records.append(record)

    def endElement(self, name: str) -> None:
        """Step back out one nesting level."""
        self._depth -= 1
# ─────────────────────────────────────────────────────────────────────────────
# 3. Structure validator
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class StructureReport:
    """Summary returned by StructureValidator.report."""
    valid: bool  # True when no messages were recorded and no required tag is missing
    element_count: int  # number of element start events seen
    max_depth: int  # deepest nesting level reached; the root element is depth 1
    found_tags: list[str]  # sorted distinct element names encountered
    missing_required: list[str]  # sorted required tags that never appeared
    errors: list[str] = field(default_factory=list)  # recorded "warning: ..."/"error: ..."/"fatal: ..." messages
class StructureValidator(_handler.ContentHandler, _handler.ErrorHandler):
    """
    Combined content and error handler that checks document structure.

    It counts elements, tracks the maximum nesting depth, remembers which
    element names occurred, and records every parser warning or error.
    Register the same instance as both content handler and error handler.
    Example:
        val = StructureValidator(required_tags=["title", "author"])
        xml.sax.parseString(xml_bytes, val, val)
        print(val.report)
    """

    def __init__(self, required_tags: "list[str] | None" = None) -> None:
        super().__init__()
        self.required = set(required_tags) if required_tags else set()
        self._found: set[str] = set()
        self._depth = 0
        self._max_depth = 0
        self._count = 0
        self._errors: list[str] = []

    def startElement(self, name: str, attrs: Any) -> None:
        """Count the element, note its name and update the depth high-water mark."""
        self._count += 1
        self._found.add(name)
        self._depth += 1
        if self._depth > self._max_depth:
            self._max_depth = self._depth

    def endElement(self, name: str) -> None:
        """Step back out one nesting level."""
        self._depth -= 1

    def warning(self, exc: Exception) -> None:
        """Record a parser warning; parsing continues."""
        self._errors.append(f"warning: {exc}")

    def error(self, exc: Exception) -> None:
        """Record a recoverable parser error; parsing continues."""
        self._errors.append(f"error: {exc}")

    def fatalError(self, exc: Exception) -> None:
        """Record a fatal parser error, then raise it to stop the parse."""
        self._errors.append(f"fatal: {exc}")
        raise exc

    @property
    def report(self) -> StructureReport:
        """Build a StructureReport from everything observed so far."""
        absent = sorted(self.required.difference(self._found))
        is_valid = not (self._errors or absent)
        return StructureReport(
            valid=is_valid,
            element_count=self._count,
            max_depth=self._max_depth,
            found_tags=sorted(self._found),
            missing_required=absent,
            errors=self._errors,
        )
# ─────────────────────────────────────────────────────────────────────────────
# 4. Namespace-aware handler
# ─────────────────────────────────────────────────────────────────────────────
class NamespaceLogger(_handler.ContentHandler):
    """
    Record namespace prefix mappings and namespace-qualified element names.

    The parser must run with the namespaces feature enabled so that it
    delivers startPrefixMapping() and startElementNS() events.
    Example:
        ns_log = NamespaceLogger()
        p = xml.sax.make_parser()
        p.setFeature(xml.sax.handler.feature_namespaces, True)
        p.setContentHandler(ns_log)
        p.parse(io.BytesIO(xml_bytes))
        print(ns_log.ns_map)
        print(ns_log.qualified_elements[:5])
    """

    def __init__(self) -> None:
        super().__init__()
        # prefix -> namespace URI; an empty/None prefix is stored as "(default)".
        self.ns_map: dict[str, str] = {}
        # Element names in document order, as "{uri}local" or plain "local".
        self.qualified_elements: list[str] = []

    def startPrefixMapping(self, prefix: str, uri: str) -> None:
        """Remember the URI bound to *prefix*."""
        key = prefix if prefix else "(default)"
        self.ns_map[key] = uri

    def startElementNS(self, name: Any, qname: Any, attrs: Any) -> None:
        """Log the element as "{uri}local", or just "local" without a namespace."""
        if isinstance(name, tuple):
            ns_uri, local = name
        else:
            ns_uri, local = None, name
        label = f"{{{ns_uri}}}{local}" if ns_uri else str(local)
        self.qualified_elements.append(label)
# ─────────────────────────────────────────────────────────────────────────────
# 5. Convenience parse wrappers
# ─────────────────────────────────────────────────────────────────────────────
def collect_text(xml_source: "bytes | str",
                 tags: "set[str] | None" = None) -> list[tuple[str, str]]:
    """
    Parse XML and return (tag, text) pairs for all or specific tags.

    Args:
        xml_source: The document, as bytes or as already-decoded text.
        tags: Element names to collect; None collects every element.

    Returns:
        The (tag, text) tuples gathered by TextAccumulator.

    Raises:
        xml.sax.SAXParseException: If the document is not well-formed.

    Example:
        texts = collect_text(xml_bytes, {"title", "author"})
    """
    accumulator = TextAccumulator(tags)
    # parseString() takes str as well as bytes. Text must NOT be re-encoded
    # to UTF-8 here: the XML declaration would still name the original
    # encoding (e.g. iso-8859-1) and the parser would mis-decode the bytes.
    xml.sax.parseString(xml_source, accumulator)
    return accumulator.results
def collect_attrs(xml_source: "bytes | str",
                  tag: str | None = None) -> list[ElementRecord]:
    """
    Parse XML and return ElementRecord list for all or specific tags.

    Args:
        xml_source: The document, as bytes or as already-decoded text.
        tag: Element name to collect; None collects every element.

    Returns:
        The records gathered by AttributeCollector, in document order.

    Raises:
        xml.sax.SAXParseException: If the document is not well-formed.

    Example:
        records = collect_attrs(xml_bytes, "book")
    """
    collector = AttributeCollector(tag)
    # parseString() takes str as well as bytes. Text must NOT be re-encoded
    # to UTF-8 here: the XML declaration would still name the original
    # encoding (e.g. iso-8859-1) and the parser would mis-decode the bytes.
    xml.sax.parseString(xml_source, collector)
    return collector.records
def validate_structure(xml_source: "bytes | str",
                       required: "list[str] | None" = None) -> StructureReport:
    """
    Validate XML structure. Returns a StructureReport.

    Parsing is best-effort: a failure never propagates out of the parse
    step, it is reflected in the report's ``errors`` (and ``valid`` is
    False) instead.

    Args:
        xml_source: The document, as bytes or as already-decoded text.
        required: Element names that must occur at least once.

    Returns:
        The StructureReport built by StructureValidator.

    Example:
        report = validate_structure(xml_bytes, required=["title", "author"])
        print(report.valid, report.missing_required)
    """
    val = StructureValidator(required)
    try:
        # parseString() takes str as well as bytes. Text must NOT be
        # re-encoded to UTF-8 here: the XML declaration would still name the
        # original encoding and the parser would mis-decode the bytes.
        xml.sax.parseString(xml_source, val, val)
    except Exception as exc:
        # fatalError() has already logged parser failures before raising
        # them. Anything that bypassed it (e.g. a TypeError from an
        # unsupported input type) is recorded here, so the report can never
        # claim validity for a document that did not finish parsing.
        entry = f"fatal: {exc}"
        if entry not in val._errors:
            val._errors.append(entry)
    return val.report
# ─────────────────────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    print("=== xml.sax.handler demo ===")

    # Fixtures: a small catalog, and a document that declares two prefixes.
    sample = b"""<?xml version="1.0"?>
<catalog>
<book id="b1" lang="en">
<title>Python Cookbook</title>
<author>David Beazley</author>
<price>39.99</price>
</book>
<book id="b2" lang="fr">
<title>Apprendre Python</title>
<author>Mark Lutz</author>
<price>29.99</price>
</book>
<magazine id="m1">
<title>Python Weekly</title>
</magazine>
</catalog>"""
    ns_xml = b"""<?xml version="1.0"?>
<catalog xmlns:dc="http://purl.org/dc/elements/1.1/"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
<dc:title>Namespace Demo</dc:title>
</catalog>"""

    # Text of every <title> and <author> element.
    print("\n--- collect_text (title, author) ---")
    wanted = {"title", "author"}
    for tag, text in collect_text(sample, wanted):
        print(f" {tag:10s}: {text!r}")

    # Attributes and depth of every <book> element.
    print("\n--- collect_attrs (book) ---")
    for rec in collect_attrs(sample, "book"):
        print(f" depth={rec.depth} attrs={rec.attrs}")

    # One run whose required tags all exist, one with a missing tag.
    print("\n--- validate_structure ---")
    good = validate_structure(sample, required=["title", "author"])
    bad = validate_structure(sample, required=["title", "isbn"])
    print(
        f" good: valid={good.valid} missing={good.missing_required}"
        f" tags={len(good.found_tags)}"
    )
    print(f" bad : valid={bad.valid} missing={bad.missing_required}")

    # Namespace events are only delivered by an explicitly configured parser.
    print("\n--- NamespaceLogger ---")
    ns_log = NamespaceLogger()
    parser = xml.sax.make_parser()
    parser.setContentHandler(ns_log)
    parser.setFeature(_handler.feature_namespaces, True)
    parser.parse(io.BytesIO(ns_xml))
    print(f" ns_map : {ns_log.ns_map}")
    print(f" qualified_elements: {ns_log.qualified_elements}")

    # Dump every feature_* / property_* constant the handler module exposes.
    print("\n--- handler constants ---")
    for attr in dir(_handler):
        if attr.startswith(("feature_", "property_")):
            print(f" {attr:35s} = {getattr(_handler, attr)!r}")

    print("\n=== done ===")
For the xml.sax stdlib entry point — xml.sax.parseString(data, contentHandler) / xml.sax.parse(source, contentHandler) automatically create a parser and call setContentHandler(), and both accept an optional third argument that is registered as the ErrorHandler, making them the most convenient way to parse when a ContentHandler (plus, optionally, an ErrorHandler) is all that’s needed; use xml.sax.make_parser() + explicit setContentHandler()/setErrorHandler()/setFeature()/setProperty() calls when you need namespace features, a lexical handler, or other parser configuration. For the lxml.sax (PyPI) alternative — lxml.sax.saxify(etree_element, content_handler) generates SAX2 events from an already-parsed lxml element tree, letting you drive existing ContentHandler code from lxml-parsed documents — use lxml.sax when you need lxml’s speed and validation on the parsing side but want to consume events through a ContentHandler interface. The Claude Skills 360 bundle includes xml.sax.handler skill sets covering TextAccumulator text-collecting handler, AttributeCollector/ElementRecord attribute handler, StructureValidator/StructureReport structure checker, NamespaceLogger namespace handler, and collect_text()/collect_attrs()/validate_structure() convenience wrappers. Start with the free tier to try SAX2 handler patterns and xml.sax.handler pipeline code generation.