Python’s html module provides safe HTML escaping and entity decoding utilities. import html. Escape: html.escape(s, quote=True) → converts & → &amp;amp;, < → &amp;lt;, > → &amp;gt;, and (when quote=True) " → &amp;quot; and ' → &amp;#x27;; pass quote=False to skip quote escaping for text content outside attributes. Unescape: html.unescape(s) → converts all HTML/XML character references (named like &amp;amp;, decimal like &amp;#65;, hex like &amp;#x41;) back to Unicode. Entity dicts: html.entities.html5 → {"amp;": "&", "lt;": "<", ...} (entity name with trailing semicolon → character(s)); html.entities.name2codepoint → {"amp": 38, "lt": 60, ...} (name without semicolon → int); html.entities.codepoint2name → {38: "amp", 60: "lt", ...} (int → name). Claude Code generates safe HTML renderers, attribute serializers, entity converters, template helpers, and XSS sanitizers.
CLAUDE.md for html
## html Stack
- Stdlib: import html, html.entities
- Escape: html.escape(s) # & < > " ' → entities
- html.escape(s, quote=False) # &<> → entities (text only)
- Unescape: html.unescape("&lt;b&gt;") # → "<b>"
- Entities: html.entities.name2codepoint["lt"] # → 60
- html.entities.codepoint2name[60] # → "lt"
- html.entities.html5["lt;"] # → "<"
- Note: For attribute values always use quote=True (default)
html Escaping and Entity Pipeline
# app/htmlutil.py — escaping, attributes, entities, tags, table, strip
from __future__ import annotations
import html
import html.entities
import re
from dataclasses import dataclass, field
from typing import Any
# ─────────────────────────────────────────────────────────────────────────────
# 1. Escaping helpers
# ─────────────────────────────────────────────────────────────────────────────
def escape_text(s: str) -> str:
    """
    Make a string safe to place between HTML tags as text content.
    Replaces & < > with entities; quote characters are left untouched,
    so the result must not be used inside an attribute value.
    Example:
        print(escape_text("<script>alert('xss')</script>"))
        # &lt;script&gt;alert('xss')&lt;/script&gt;
    """
    escaped = html.escape(s, quote=False)
    return escaped
def escape_attr(s: str) -> str:
    """
    Make a string safe to place inside a quoted HTML attribute value.
    Replaces & < > and both quote characters with entities.
    Example:
        url = 'https://example.com/?q=<foo>&bar="baz"'
        print(f'<a href="{escape_attr(url)}">link</a>')
        # <a href="https://example.com/?q=&lt;foo&gt;&amp;bar=&quot;baz&quot;">link</a>
    """
    # html.escape escapes quotes by default (quote=True).
    return html.escape(s)
def unescape_text(s: str) -> str:
    """
    Decode every HTML character reference in s (named, decimal, hex)
    into the Unicode text it stands for.
    Example:
        print(unescape_text("&lt;b&gt;Hello &amp; World!&lt;/b&gt;"))
        # <b>Hello & World!</b>
        print(unescape_text("&#128512;"))  # 😀
    """
    decoded = html.unescape(s)
    return decoded
# ─────────────────────────────────────────────────────────────────────────────
# 2. Entity lookups
# ─────────────────────────────────────────────────────────────────────────────
def html_entity_name(codepoint: int) -> "str | None":
    """
    Look up the entity name registered for a Unicode codepoint in
    html.entities.codepoint2name; None when the codepoint has no name.
    Example:
        print(html_entity_name(38))    # "amp"
        print(html_entity_name(60))    # "lt"
        print(html_entity_name(9829))  # "hearts"
    """
    names = html.entities.codepoint2name
    if codepoint in names:
        return names[codepoint]
    return None
def entity_to_char(name: str) -> "str | None":
    """
    Resolve an entity name (given without & or ;) to the character(s)
    it represents, or None when the name is unknown.
    The html5 table (keyed with a trailing semicolon) is consulted first,
    then name2codepoint (keyed without one).
    Example:
        print(entity_to_char("amp"))     # "&"
        print(entity_to_char("hearts"))  # "♥"
        print(entity_to_char("nbsp"))    # "\xa0"
    """
    try:
        # html5 keys carry the terminating semicolon.
        return html.entities.html5[name + ";"]
    except KeyError:
        pass
    codepoint = html.entities.name2codepoint.get(name)
    return None if codepoint is None else chr(codepoint)
def char_to_entity_ref(ch: str) -> str:
    """
    Build an HTML character reference for a single character: the named
    form when html.entities.codepoint2name knows the codepoint, otherwise
    a decimal numeric reference.
    Example:
        print(char_to_entity_ref("&"))   # "&amp;"
        print(char_to_entity_ref("♥"))   # "&hearts;"
        print(char_to_entity_ref("😀"))  # "&#128512;"
    """
    codepoint = ord(ch)
    entity = html.entities.codepoint2name.get(codepoint)
    return f"&{entity};" if entity else f"&#{codepoint};"
def find_unknown_entities(s: str) -> list[str]:
    """
    Collect, in order of appearance, every named reference of the form
    &name; in s whose name is missing from html.entities.html5.
    Numeric references are never reported.
    Example:
        find_unknown_entities("&lt; &foo; &amp;")  # ["&foo;"]
    """
    candidates = re.finditer(r"&([A-Za-z][A-Za-z0-9]*);", s)
    known = html.entities.html5
    return [hit.group(0) for hit in candidates if (hit.group(1) + ";") not in known]
# ─────────────────────────────────────────────────────────────────────────────
# 3. Tag and attribute builders
# ─────────────────────────────────────────────────────────────────────────────
# Characters that terminate or corrupt an HTML attribute name (whitespace,
# control characters, quotes, angle brackets, "/", "=" and "&"). Entity
# escaping cannot neutralize these inside a *name*, so they are removed.
_ATTR_NAME_UNSAFE = re.compile(r"[\s\x00-\x1f\x7f\"'<>/=&]")


def render_attrs(attrs: "dict[str, Any]") -> str:
    """
    Serialize a dict of HTML attributes to a safe attribute string.
    Boolean True values emit the attribute name without a value.
    None and False values are omitted.
    Attribute names are stripped of characters that could end the name
    and start a new attribute; a name that becomes empty is skipped.
    Values are converted with str() and escaped for a double-quoted
    attribute (same escaping as escape_attr).
    Example:
        print(render_attrs({"href": "/go?x=1&y=2", "class": "btn", "disabled": True}))
        # href="/go?x=1&amp;y=2" class="btn" disabled
        print(render_attrs({"x onclick=alert(1)": "v"}))
        # xonclickalert(1)="v"
    """
    parts = []
    for k, v in attrs.items():
        if v is None or v is False:
            continue
        # Escaping alone would leave spaces and "=" intact, letting a hostile
        # key smuggle in a second attribute; drop those characters instead.
        safe_k = _ATTR_NAME_UNSAFE.sub("", str(k))
        if not safe_k:
            continue
        if v is True:
            parts.append(safe_k)
        else:
            parts.append(f'{safe_k}="{html.escape(str(v), quote=True)}"')
    return " ".join(parts)
def build_tag(
    tag: str,
    content: "str | None" = None,
    attrs: "dict[str, Any] | None" = None,
    self_closing: bool = False,
) -> str:
    """
    Build an HTML tag string with safely escaped attributes and content.
    The tag name is reduced to ASCII letters and digits. Attributes are
    serialized with render_attrs and content is escaped with escape_text.
    When self_closing is true, content is ignored and "<tag ... />" is
    returned.
    Raises:
        ValueError: if no valid characters remain in the tag name
            (previously this produced the malformed markup "<>").
    Example:
        print(build_tag("a", "click & go", {"href": "/x?a=1&b=2", "class": "link"}))
        # <a href="/x?a=1&amp;b=2" class="link">click &amp; go</a>
        print(build_tag("input", attrs={"type": "text", "value": "<hi>"}, self_closing=True))
        # <input type="text" value="&lt;hi&gt;" />
    """
    safe_tag = re.sub(r"[^A-Za-z0-9]", "", tag)  # sanitize tag name
    if not safe_tag:
        raise ValueError(f"invalid tag name: {tag!r}")
    # render_attrs may return "" even for a non-empty dict (all values
    # None/False); only add the separating space when something was rendered.
    rendered = render_attrs(attrs) if attrs else ""
    attr_str = f" {rendered}" if rendered else ""
    if self_closing:
        return f"<{safe_tag}{attr_str} />"
    body = escape_text(content) if content is not None else ""
    return f"<{safe_tag}{attr_str}>{body}</{safe_tag}>"
# ─────────────────────────────────────────────────────────────────────────────
# 4. HTML table renderer
# ─────────────────────────────────────────────────────────────────────────────
def safe_html_table(
    headers: list[str],
    rows: "list[list[Any]]",
    table_attrs: "dict[str, Any] | None" = None,
    header_attrs: "dict[str, Any] | None" = None,
) -> str:
    """
    Produce an HTML <table> string from header labels and row data.
    Every header and cell is converted with str() and escaped with
    escape_text. When table_attrs is empty or None the table gets
    border="1" cellpadding="4"; header_attrs, if given, are applied to
    each <th>.
    Example:
        html_str = safe_html_table(
            headers=["Name", "Score", "Note"],
            rows=[["Alice", 42, "good & fast"], ["Bob<script>", 99, "careful"]],
        )
        print(html_str)
    """
    table_attr_text = render_attrs(table_attrs or {"border": "1", "cellpadding": "4"})
    th_attr_text = (" " + render_attrs(header_attrs)) if header_attrs else ""

    out: list[str] = [f"<table {table_attr_text}>", " <thead>", " <tr>"]
    out.extend(f" <th{th_attr_text}>{escape_text(str(label))}</th>" for label in headers)
    out.extend([" </tr>", " </thead>", " <tbody>"])
    for record in rows:
        out.append(" <tr>")
        out.extend(f" <td>{escape_text(str(value))}</td>" for value in record)
        out.append(" </tr>")
    out.extend([" </tbody>", "</table>"])
    return "\n".join(out)
# ─────────────────────────────────────────────────────────────────────────────
# 5. Tag stripper and sanitizer
# ─────────────────────────────────────────────────────────────────────────────
_TAG_RE = re.compile(r"<[^>]+>")
_MULTI_WS = re.compile(r"\s{2,}")


def strip_tags(s: str, collapse_whitespace: bool = True) -> str:
    """
    Turn HTML into plain text: each tag is replaced by one space, then
    character references are decoded.
    With collapse_whitespace (the default), runs of two or more whitespace
    characters become a single space and the ends are trimmed. Because
    tags turn into spaces, a space can appear where an inline tag sat
    next to punctuation.
    Example:
        strip_tags("<p>Hello &amp; <b>World</b>!</p>")
        # "Hello & World !"
        strip_tags("<ul>\\n <li>one</li>\\n <li>two</li>\\n</ul>")
        # "one two"
    """
    decoded = html.unescape(_TAG_RE.sub(" ", s))
    if not collapse_whitespace:
        return decoded
    return _MULTI_WS.sub(" ", decoded).strip()
@dataclass
class HtmlSanitizer:
    """
    Whitelist-based HTML sanitizer: strips tags not in allowed_tags
    and removes all attributes not in allowed_attrs.

    The output is rebuilt rather than patched: every kept tag is
    re-serialized from its lowercased name and its allowed, re-escaped
    attributes, and every "<" that is not part of a kept tag is emitted
    as "&lt;". In addition:
      * HTML comments are removed;
      * <script>/<style> elements are removed together with their
        content unless the tag is explicitly allowed;
      * URL-carrying attributes (href, src, ...) are dropped when their
        scheme is javascript:, vbscript: or data:;
      * only the first occurrence of a repeated attribute is kept;
      * semicolon-terminated character references in attribute values
        are decoded before re-escaping, so "&amp;" is not double-escaped.

    This is a regex-based best-effort cleaner, not a full HTML parser: a
    ">" inside a quoted attribute value ends the tag early and the rest of
    that tag is emitted as text.

    Example:
        san = HtmlSanitizer(allowed_tags={"b", "i", "a"}, allowed_attrs={"a": {"href"}})
        print(san.sanitize('<b>Hello</b> <script>evil()</script> <a href="/ok" onclick="x">link</a>'))
        # <b>Hello</b>  <a href="/ok">link</a>
    """

    allowed_tags: "set[str]" = field(
        default_factory=lambda: {"b", "i", "em", "strong", "a", "p", "br"}
    )
    allowed_attrs: "dict[str, set[str]]" = field(default_factory=dict)

    # The attributes below are un-annotated on purpose: they are shared
    # class-level constants, not dataclass fields.

    # A tag name runs until whitespace, "/" or ">", so forms such as
    # <svg/onload=...> and <x-y ...> are recognized as tags too.
    _TAG_OPEN = re.compile(r"<(/?)([A-Za-z][^\s/>]*)([^>]*?)(/?)>", re.DOTALL)
    _ATTR_RE = re.compile(r'([A-Za-z][A-Za-z0-9-]*)(?:\s*=\s*(?:"([^"]*)"|\'([^\']*)\'|(\S+)))?')
    # An unterminated comment or raw-text element swallows the rest of the input.
    _COMMENT_RE = re.compile(r"<!--.*?(?:-->|\Z)", re.DOTALL)
    _RAW_TEXT_RE = re.compile(
        r"<(script|style)(?=[\s/>])[^>]*>.*?(?:</\1(?=[\s/>])[^>]*>|\Z)",
        re.IGNORECASE | re.DOTALL,
    )
    # Only semicolon-terminated references are decoded, so a bare "&copy=" in
    # a query string is not mistaken for an entity.
    _CHARREF_RE = re.compile(r"&(?:#[0-9]+|#[xX][0-9A-Fa-f]+|[A-Za-z][A-Za-z0-9]*);")
    _URL_ATTRS = frozenset(
        {"href", "src", "action", "formaction", "poster", "cite",
         "background", "data", "longdesc", "manifest", "ping"}
    )
    _UNSAFE_SCHEMES = frozenset({"javascript", "vbscript", "data"})
    # Control characters and spaces are removed before the scheme check so
    # that " java\tscript:" is still recognized.
    _URL_IGNORED = re.compile(r"[\x00-\x20\x7f]+")
    _SCHEME_RE = re.compile(r"([a-z][a-z0-9+.\-]*):")

    def _has_unsafe_scheme(self, url: str) -> bool:
        """Return True when url starts with a scheme listed in _UNSAFE_SCHEMES."""
        compact = self._URL_IGNORED.sub("", url).lower()
        found = self._SCHEME_RE.match(compact)
        return found is not None and found.group(1) in self._UNSAFE_SCHEMES

    def _parse_attrs(self, attr_str: str, tag: str) -> str:
        """
        Re-serialize the attributes of one tag.

        attr_str is the raw text between the tag name and the closing ">";
        tag is the lowercased tag name. Returns 'name="value"' pairs
        joined by spaces (possibly empty), containing only attributes
        allowed for this tag.
        """
        allowed = self.allowed_attrs.get(tag, set())
        parts = []
        seen = set()
        for m in self._ATTR_RE.finditer(attr_str):
            name = m.group(1).lower()
            if name not in allowed or name in seen:
                continue
            raw = m.group(2) or m.group(3) or m.group(4) or ""
            # Decode first so that the value checked for a dangerous scheme
            # is exactly the value a browser will see after decoding our
            # re-escaped output.
            val = self._CHARREF_RE.sub(lambda ref: html.unescape(ref.group(0)), raw)
            if name in self._URL_ATTRS and self._has_unsafe_scheme(val):
                continue
            seen.add(name)
            # name matched [a-z][a-z0-9-]*, so it needs no escaping.
            parts.append(f'{name}="{html.escape(val, quote=True)}"')
        return " ".join(parts)

    def _drop_raw_text(self, m: "re.Match[str]") -> str:
        """Remove a whole script/style element unless its tag is allowed."""
        return m.group(0) if m.group(1).lower() in self.allowed_tags else ""

    def _render_tag(self, m: "re.Match[str]") -> str:
        """Return the sanitized form of one matched tag, or "" to drop it."""
        slash, name, attrs, selfcl = m.groups()
        tag = name.lower()
        if tag not in self.allowed_tags:
            return ""
        if slash:
            return f"</{tag}>"
        attr_str = self._parse_attrs(attrs, tag)
        attr_part = (" " + attr_str) if attr_str else ""
        if selfcl:
            return f"<{tag}{attr_part} />"
        return f"<{tag}{attr_part}>"

    def sanitize(self, s: str) -> str:
        """Sanitize HTML, keeping only whitelisted tags and attributes."""
        s = self._COMMENT_RE.sub("", s)
        s = self._RAW_TEXT_RE.sub(self._drop_raw_text, s)
        out = []
        pos = 0
        for m in self._TAG_OPEN.finditer(s):
            # Text between tags: neutralize any "<" the tag pattern did not
            # consume (unterminated tags, doctype, stray brackets).
            out.append(s[pos:m.start()].replace("<", "&lt;"))
            out.append(self._render_tag(m))
            pos = m.end()
        out.append(s[pos:].replace("<", "&lt;"))
        return "".join(out)
# ─────────────────────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Smoke-test demo: prints the result of each helper for a few inputs.
    print("=== html demo ===")

    # ── escape / unescape ──
    print("\n--- escape_text / escape_attr / unescape_text ---")
    raw = '<script>alert("XSS & fun")</script>'
    print(f" raw: {raw!r}")
    print(f" escape_text: {escape_text(raw)!r}")
    print(f" escape_attr: {escape_attr(raw)!r}")
    # The input must actually contain character references (named and
    # numeric), otherwise unescape_text has nothing to decode.
    encoded = "&lt;b&gt;Hello &amp; World! &#128512;&lt;/b&gt;"
    print(f" unescape: {unescape_text(encoded)!r}")

    # ── entity lookups ──
    print("\n--- entity lookups ---")
    for cp in [38, 60, 62, 9829, 9824]:
        name = html_entity_name(cp)
        ch = chr(cp)
        print(f" U+{cp:05X} {ch!r} → &{name};" if name else f" U+{cp:05X} {ch!r} → (no name)")

    print("\n--- entity_to_char ---")
    for name in ["amp", "lt", "hearts", "nbsp", "unknown_entity"]:
        ch = entity_to_char(name)
        print(f" &{name}; → {ch!r}")

    print("\n--- char_to_entity_ref ---")
    for ch in ["&", "<", "♥", "😀", "A"]:
        print(f" {ch!r} → {char_to_entity_ref(ch)}")

    # ── build_tag / render_attrs ──
    print("\n--- build_tag ---")
    a = build_tag("a", "click & go", {"href": "/search?q=<hi>&lang=en", "class": "link btn"})
    print(f" {a}")
    inp = build_tag("input", attrs={"type": "text", "value": '<script>evil()</script>'}, self_closing=True)
    print(f" {inp}")

    # ── safe_html_table ──
    print("\n--- safe_html_table ---")
    tbl = safe_html_table(
        headers=["Name", "Score", "Note"],
        rows=[
            ["Alice", 98, "good & fast"],
            ["Bob<script>alert(1)</script>", 42, "needs <review>"],
        ],
    )
    print(tbl)

    # ── strip_tags ──
    print("\n--- strip_tags ---")
    # Includes an entity so the demo shows decoding as well as tag removal.
    messy = "<p>Hello <b>&amp;</b> <em>World</em>!</p>"
    print(f" strip_tags: {strip_tags(messy)!r}")

    # ── HtmlSanitizer ──
    print("\n--- HtmlSanitizer ---")
    san = HtmlSanitizer(
        allowed_tags={"b", "i", "a", "em"},
        allowed_attrs={"a": {"href"}},
    )
    dirty = '<b>OK</b> <script>evil()</script> <a href="/safe" onclick="bad()">link</a>'
    print(f" input: {dirty!r}")
    print(f" output: {san.sanitize(dirty)!r}")

    print("\n=== done ===")
For the xml.sax.saxutils alternative — xml.sax.saxutils.escape(data) and xml.sax.saxutils.quoteattr(data) perform XML-safe escaping following XML rules (which are close but not identical to HTML escaping) — use xml.sax.saxutils when generating XML or XHTML where XML rules apply; use html.escape when generating HTML5 where the entity set and rules differ. For the markupsafe (PyPI) / bleach (PyPI) alternative — markupsafe.Markup is a string subclass that tracks “already escaped” state and prevents double-escaping in Jinja2 templates; bleach provides a configurable whitelist HTML sanitizer with link-detection — use markupsafe in Jinja2/Flask where template auto-escaping is managed for you; use bleach when you need proven, maintained sanitization with CSS and URL safety; use html.escape for zero-dependency single-value escaping or when building your own lightweight renderer. The Claude Skills 360 bundle includes html skill sets covering escape_text() / escape_attr() / unescape_text() wrappers, html_entity_name() / entity_to_char() / char_to_entity_ref() entity lookups, find_unknown_entities() linter, render_attrs() / build_tag() safe tag builders, safe_html_table() data renderer, strip_tags() with whitespace collapsing, and HtmlSanitizer whitelist-based tag and attribute cleaner. Start with the free tier to try HTML escaping patterns and html pipeline code generation.