Python’s xml.dom.minidom module provides a lightweight W3C DOM implementation for parsing and building XML documents. Import: from xml.dom import minidom. Parse: doc = minidom.parse("file.xml") or doc = minidom.parseString(b"<root/>"). Access: doc.documentElement → root Element; el.tagName → str; el.getAttribute("attr") → str; el.childNodes → NodeList; el.getElementsByTagName("item") → NodeList. Build: doc = minidom.Document(); root = doc.createElement("root"); doc.appendChild(root). Text: txt = doc.createTextNode("hello"); el.appendChild(txt). Serialise: doc.toxml(encoding=None) → str (bytes when an encoding is given); doc.toprettyxml(indent=" ") → indented str. Other node types: createComment(data), createProcessingInstruction(target, data), createCDATASection(data). Element helpers: el.normalize() — merge adjacent text nodes; el.hasAttribute(name) — bool; el.setAttribute(name, value) — set/overwrite. Namespace-aware: doc.createElementNS(ns, qname). Note: minidom loads the entire document into memory; use xml.dom.pulldom or xml.etree.ElementTree.iterparse() for large files. Claude Code generates DOM document builders, element finders, attribute extractors, XML template engines, element-to-dict converters, and round-trip XML processors.
CLAUDE.md for xml.dom.minidom
## xml.dom.minidom Stack
- Stdlib: from xml.dom import minidom
- Parse: doc = minidom.parse("file.xml")
- doc = minidom.parseString(xml_bytes)
- Root: root = doc.documentElement
- Find: els = root.getElementsByTagName("item") # NodeList
- el.getAttribute("id")
- Build: doc = minidom.Document()
- el = doc.createElement("tag")
- doc.appendChild(el)
- el.setAttribute("k", "v")
- el.appendChild(doc.createTextNode("text"))
- Serial: doc.toxml() # compact str
- doc.toprettyxml(indent=" ") # indented str
- Notes: loads full doc into memory; use xml.etree.ElementTree.iterparse (or xml.dom.pulldom) for large files
xml.dom.minidom DOM Pipeline
# app/xmlminidomutil.py — parse, traverse, convert, build, serialise
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Any, Iterator
from xml.dom import minidom
from xml.dom.minidom import Document, Element, Node, Text
# ─────────────────────────────────────────────────────────────────────────────
# 1. Parse helpers
# ─────────────────────────────────────────────────────────────────────────────
def parse_bytes(xml_bytes: bytes) -> Document:
    """
    Build a minidom Document from an in-memory XML byte string.

    The input is handed straight to minidom.parseString; any exception it
    raises propagates to the caller unchanged.

    Example:
        doc = parse_bytes(b"<catalog><book id='1'/></catalog>")
        print(doc.documentElement.tagName)
    """
    document = minidom.parseString(xml_bytes)
    return document
def parse_file(path: str) -> Document:
    """
    Load the XML file at *path* and return it as a minidom Document.

    The path is passed directly to minidom.parse; any exception it raises
    propagates to the caller unchanged.

    Example:
        doc = parse_file("catalog.xml")
    """
    document = minidom.parse(path)
    return document
def safe_parse(xml_source: "bytes | str") -> "tuple[Document | None, str]":
    """
    Parse XML without raising.

    Returns (Document, '') on success or (None, error_str) on failure.
    error_str is never empty on failure, so callers may test ``if err:``.

    Both bytes and str are accepted. str input is given to the parser as
    text rather than being re-encoded, so a document whose XML declaration
    names a non-UTF-8 encoding is still read correctly.

    Example:
        doc, err = safe_parse(b"<root/>")
        if err: print("Error:", err)
    """
    try:
        # Do NOT encode str to UTF-8 first: the parser would then honour an
        # encoding="..." declaration and mis-decode the UTF-8 bytes.
        document = minidom.parseString(xml_source)
    except Exception as exc:  # deliberate: this helper reports, never raises
        # Fall back to the exception class name if the message is empty.
        return None, str(exc) or type(exc).__name__
    return document, ""
# ─────────────────────────────────────────────────────────────────────────────
# 2. DOM traversal helpers
# ─────────────────────────────────────────────────────────────────────────────
def get_text(node: Node) -> str:
    """
    Collect all text content from a node and its descendants.

    Text nodes and CDATA sections contribute their data verbatim (no
    stripping); element children are descended into recursively; every
    other node type (comments, processing instructions, ...) is ignored.

    Example:
        text = get_text(el)  # concatenates all Text/CDATA children recursively
    """
    # CDATA sections carry character data just like Text nodes but report a
    # different nodeType, so both must be accepted.
    text_types = (Node.TEXT_NODE, Node.CDATA_SECTION_NODE)
    parts: list[str] = []
    for child in node.childNodes:
        if child.nodeType in text_types:
            parts.append(child.data)
        elif child.nodeType == Node.ELEMENT_NODE:
            parts.append(get_text(child))
    return "".join(parts)
def iter_elements(node: Node,
                  tag: str | None = None) -> Iterator[Element]:
    """
    Yield all Element nodes in the subtree rooted at *node* (pre-order DFS),
    optionally filtered by tagName.

    *node* itself is included when it is an Element (and matches *tag*).
    Non-element nodes are never yielded but their children are still visited,
    so a Document may be passed as the starting point.

    The walk uses an explicit stack instead of recursion, so arbitrarily
    deep documents do not hit the interpreter's recursion limit.

    Example:
        for el in iter_elements(doc.documentElement, "book"):
            print(el.getAttribute("id"))
    """
    pending: list[Node] = [node]
    while pending:
        current = pending.pop()
        if current.nodeType == Node.ELEMENT_NODE:
            el: Element = current  # type: ignore[assignment]
            if tag is None or el.tagName == tag:
                yield el
        # Push children reversed so the first child is popped (visited) first,
        # which keeps document order.
        pending.extend(reversed(current.childNodes))
def get_attrs(el: Element) -> dict[str, str]:
    """
    Return all attributes of an Element as a plain dict of name -> value.

    Returns an empty dict when the node has no attributes (including nodes
    whose ``attributes`` is None or empty).

    Example:
        attrs = get_attrs(book_el)
        print(attrs)  # {"id": "1", "lang": "en"}
    """
    attr_map = el.attributes
    if not attr_map:
        return {}
    # One pass over (name, value) pairs; indexing with item(i) in a loop
    # re-scans the attribute map on every call.
    return dict(attr_map.items())
# ─────────────────────────────────────────────────────────────────────────────
# 3. Element-to-dict converter
# ─────────────────────────────────────────────────────────────────────────────
def element_to_dict(el: Element) -> Any:
    """
    Recursively convert a minidom Element to a Python dict/list/str.

    Attributes are stored under "@attr", text content under "#text".
    Repeated sibling elements become a list.

    Text and CDATA children are stripped individually and concatenated;
    whitespace-only pieces are dropped. An element with only text collapses
    to that string, and an element with no attributes, text or children
    becomes "".

    Example:
        doc = parse_bytes(b"<book id='1'><title>Python</title></book>")
        d = element_to_dict(doc.documentElement)
        print(d)  # {"@id": "1", "title": "Python"}
    """
    # CDATA sections hold character data too; without this they are lost.
    text_types = (Node.TEXT_NODE, Node.CDATA_SECTION_NODE)

    result: dict[str, Any] = {
        f"@{name}": val for name, val in get_attrs(el).items()
    }

    for child in el.childNodes:
        if child.nodeType in text_types:
            text = child.data.strip()
            if text:
                result["#text"] = result.get("#text", "") + text
        elif child.nodeType == Node.ELEMENT_NODE:
            child_el: Element = child  # type: ignore[assignment]
            key = child_el.tagName
            child_val = element_to_dict(child_el)
            if key not in result:
                result[key] = child_val
            elif isinstance(result[key], list):
                result[key].append(child_val)
            else:
                # Second occurrence of this tag: promote the value to a list.
                result[key] = [result[key], child_val]

    # Simplify: if only #text, return the string
    if list(result) == ["#text"]:
        return result["#text"]
    return result or ""
def xml_to_dict(xml_source: "bytes | str",
                root_tag: str | None = None) -> Any:
    """
    Parse XML and convert the root element (or first matching tag) to a dict.

    When *root_tag* is given and differs from the document root's tag, the
    first element with that tag in document order is converted instead;
    if no such element exists, {} is returned.
    On a parse failure the result is {"__error__": <message>}.

    Example:
        d = xml_to_dict(b"<book id='1'><title>Python</title></book>")
        print(d["title"])
    """
    doc, err = safe_parse(xml_source)
    # Test the document itself rather than the message, so a failure with an
    # empty message cannot fall through to an attribute access on None.
    if doc is None:
        return {"__error__": err}

    root = doc.documentElement
    if not root_tag or root.tagName == root_tag:
        return element_to_dict(root)

    # Only the first match is needed, so stop the walk as soon as it is found.
    first_match = next(iter_elements(doc, root_tag), None)
    if first_match is None:
        return {}
    return element_to_dict(first_match)
# ─────────────────────────────────────────────────────────────────────────────
# 4. DOM document builder
# ─────────────────────────────────────────────────────────────────────────────
def build_document(root_tag: str,
                   root_attrs: "dict[str, str] | None" = None) -> tuple[Document, Element]:
    """
    Start a fresh minidom Document containing a single root element.

    The root is named *root_tag*, receives every key/value pair in
    *root_attrs* as an attribute, and is appended to the new document.
    Returns (doc, root_element).

    Example:
        doc, root = build_document("catalog", {"version": "1.0"})
        book = add_element(doc, root, "book", {"id": "1"})
        add_text(doc, book, "Python Cookbook")
        print(doc.toprettyxml(indent=" "))
    """
    document = minidom.Document()
    root_el = document.createElement(root_tag)
    if root_attrs:
        for attr_name, attr_value in root_attrs.items():
            root_el.setAttribute(attr_name, attr_value)
    document.appendChild(root_el)
    return document, root_el
def add_element(doc: Document,
                parent: Element,
                tag: str,
                attrs: "dict[str, str] | None" = None,
                text: str | None = None) -> Element:
    """
    Create a *tag* element, attach it under *parent*, and return it.

    Every pair in *attrs* is set as an attribute. When *text* is not None
    (an empty string counts), a text node holding it is added to the new
    element before the element is appended to *parent*.

    Example:
        el = add_element(doc, root, "book", {"id": "1"}, "Python Cookbook")
    """
    child = doc.createElement(tag)
    if attrs:
        for attr_name, attr_value in attrs.items():
            child.setAttribute(attr_name, attr_value)
    if text is not None:
        text_node = doc.createTextNode(text)
        child.appendChild(text_node)
    parent.appendChild(child)
    return child
def add_text(doc: Document, parent: Element, text: str) -> Text:
    """
    Create a text node holding *text*, append it to *parent*, and return it.

    Example:
        add_text(doc, title_el, "Hello World")
    """
    text_node = doc.createTextNode(text)
    parent.appendChild(text_node)
    return text_node
# ─────────────────────────────────────────────────────────────────────────────
# 5. Serialisation helpers
# ─────────────────────────────────────────────────────────────────────────────
def to_xml(doc: Document, *, pretty: bool = False, indent: str = " ") -> str:
    """
    Serialise a Document to an XML string.

    pretty=False returns doc.toxml() unchanged. pretty=True uses
    toprettyxml(indent=indent) and then drops every whitespace-only line,
    since toprettyxml emits blank lines for whitespace text nodes; the
    result has no trailing newline.

    Example:
        xml_str = to_xml(doc, pretty=True)
        print(xml_str)
    """
    if not pretty:
        return doc.toxml()
    raw = doc.toprettyxml(indent=indent)
    # Split on "\n" only. str.splitlines() also breaks on "\r", "\x0b",
    # "\x0c", "\u2028" and similar, and would rewrite those characters to
    # "\n" when they occur inside text content.
    return "\n".join(line for line in raw.split("\n") if line.strip())
def round_trip(xml_source: "bytes | str") -> str:
    """
    Parse XML and hand back a pretty-printed re-serialisation of it.

    The root element is normalize()d before serialising. If parsing fails,
    an XML comment of the form "<!-- error: ... -->" carrying the parser
    message is returned instead of raising.

    Example:
        normal = round_trip(b"<root> <a>1</a> </root>")
        print(normal)
    """
    doc, err = safe_parse(xml_source)
    if not err:
        root = doc.documentElement  # type: ignore[union-attr]
        root.normalize()
        return to_xml(doc, pretty=True)
    return f"<!-- error: {err} -->"
# ─────────────────────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────────────────────
# Demo script: exercises every helper group above against small inline
# documents and prints the results. Runs only when executed directly.
if __name__ == "__main__":
    print("=== xml.dom.minidom demo ===")

    sample_xml = b"""<?xml version="1.0"?>
<catalog>
<book id="b1" lang="en">
<title>Python Cookbook</title>
<author>David Beazley</author>
<price>39.99</price>
</book>
<book id="b2" lang="fr">
<title>Apprendre Python</title>
<author>Mark Lutz</author>
<price>29.99</price>
</book>
<magazine id="m1">
<title>Python Weekly</title>
</magazine>
</catalog>"""

    # ── parse & traverse ──────────────────────────────────────────────────
    print("\n--- parse & iter_elements ---")
    doc = parse_bytes(sample_xml)
    for book in iter_elements(doc.documentElement, "book"):
        bid = book.getAttribute("id")
        # Every sample <book> has a <title> and <price>, so next() is safe.
        title = get_text(next(iter_elements(book, "title")))
        price = get_text(next(iter_elements(book, "price")))
        print(f" {bid}: {title!r} price={price}")

    # ── element_to_dict ───────────────────────────────────────────────────
    print("\n--- element_to_dict (first book) ---")
    books = list(iter_elements(doc.documentElement, "book"))
    d = element_to_dict(books[0])
    print(f" {d}")

    # ── xml_to_dict ──────────────────────────────────────────────────────
    print("\n--- xml_to_dict ---")
    # The e-mail was a scraped obfuscation placeholder; use a real-looking
    # example address so the demo output is meaningful.
    small = b"<person id='42'><name>Alice</name><email>alice@example.com</email></person>"
    result = xml_to_dict(small)
    print(f" {result}")

    # ── build_document ────────────────────────────────────────────────────
    print("\n--- build_document ---")
    new_doc, root = build_document("library", {"version": "2.0"})
    b1 = add_element(new_doc, root, "book", {"id": "x1"})
    add_element(new_doc, b1, "title", text="Learning Python")
    add_element(new_doc, b1, "year", text="2029")
    print(to_xml(new_doc, pretty=True))

    # ── round_trip ────────────────────────────────────────────────────────
    print("\n--- round_trip (normalise) ---")
    messy = b"<root> <a> hello </a> <b/> </root>"
    print(round_trip(messy))

    print("\n=== done ===")
For most XML parsing tasks, prefer the stdlib alternative xml.etree.ElementTree: ET.fromstring(xml) / ET.parse(file) provide a faster, simpler tree API with el.find("tag"), el.findall("tag"), el.get("attr"), el.text, and XPath subset support; it is faster than minidom, has a cleaner API, and supports ET.iterparse() for streaming. For production XML workloads, large documents, or schemas, use the lxml.etree (PyPI) alternative: lxml.etree.fromstring(xml) can be 10–100× faster than stdlib parsers and adds full XPath 1.0, XSLT, RelaxNG/XSD validation, and lxml.objectify for attribute-style DOM access. Use xml.dom.minidom when you need W3C DOM node traversal, toxml()/toprettyxml() serialisation, or to create XML documents node-by-node with DOM methods. The Claude Skills 360 bundle includes xml.dom.minidom skill sets covering parse_bytes()/parse_file()/safe_parse() parsers, get_text()/iter_elements()/get_attrs() traversal helpers, element_to_dict()/xml_to_dict() converters, build_document()/add_element()/add_text() DOM builders, and to_xml()/round_trip() serialisers. Start with the free tier to try DOM XML patterns and xml.dom.minidom pipeline code generation.