Python’s html.entities module contains the HTML named entity lookup tables used by html.unescape() and html.parser. from html import entities. HTML5 map: entities.html5 — dict mapping "amp;" → "&", "copy;" → "©", "mdash;" → "—" (includes multi-character sequences and trailing semicolons). HTML4 maps: entities.name2codepoint — {"amp": 38, "lt": 60, "copy": 169, ...} (no semicolons, always integer). Reverse: entities.codepoint2name — {38: "amp", 60: "lt", ...}. Character lookup: entities.entitydefs — {"amp": "&", "lt": "<", ...} equivalent to {k: chr(v) for k, v in name2codepoint.items()}. Typical usage: resolve a named reference not handled by html.unescape(), enumerate all entity names, build custom entity encoders/decoders. For most decoding tasks use html.unescape(s) which internally consults html5. For encoding, use html.escape(s) for the 5 critical characters; for full Unicode-to-entity, iterate codepoint2name. Claude Code generates HTML entity encoders, HTML5 reference resolvers, smart-quote expanders, HTML-to-text converters, and entity sanitization pipelines.
CLAUDE.md for html.entities
## html.entities Stack
- Stdlib: from html import entities
- HTML5: entities.html5["amp;"] # → "&" (with trailing ;)
- entities.html5["mdash;"] # → "—"
- HTML4: entities.name2codepoint["copy"] # → 169
- entities.codepoint2name[169] # → "copy"
- entities.entitydefs["copy"] # → "©"
- Decode: html.unescape("&amp;&mdash;") # → "&—" (uses html5 table)
- Encode: html.escape(s) # & < > " '
- Count: len(entities.html5) # ~2231 entries (Python 3.12)
html.entities Pipeline
# app/htmlentitiesutil.py — resolve, encode, decode, stats, converter, sanitizer
from __future__ import annotations
import html
import re
import unicodedata
from html import entities as _ent
from typing import Iterator
# ─────────────────────────────────────────────────────────────────────────────
# 1. Entity lookup helpers
# ─────────────────────────────────────────────────────────────────────────────
def resolve_entity(name: str) -> str | None:
    """
    Look up a named HTML entity and return the text it stands for.

    The name may be given bare ("amp"), with a trailing semicolon
    ("mdash;"), or as a full reference ("&copy;"); surrounding whitespace
    is ignored.  The HTML5 table is consulted first, then the HTML4
    name-to-code-point table.  Returns None when neither table knows the
    name.

    Example:
        resolve_entity("amp")        # "&"
        resolve_entity("&copy;")     # "©"
        resolve_entity("mdash")      # "—"
        resolve_entity("&frac12;")   # "½"
    """
    # Reduce every accepted spelling to the bare name.
    bare = name.strip().lstrip("&").rstrip(";")

    # HTML5 keys carry the trailing semicolon, so add it back for the lookup.
    try:
        return _ent.html5[bare + ";"]
    except KeyError:
        pass

    # HTML4 table maps bare names to integer code points.
    codepoint = _ent.name2codepoint.get(bare)
    return None if codepoint is None else chr(codepoint)
def codepoint_to_entity(cp: int, prefer_named: bool = True) -> str:
    """
    Return an HTML-safe representation of a single Unicode code point.

    cp: the code point to represent.
    prefer_named: when True (default) use the HTML4 named entity if one
        exists; when False always use the decimal form ``&#N;`` for code
        points that need an entity.

    ASCII code points are returned as the plain character unless they have
    a named entity (``&``, ``<``, ``>``, ``"``); those are markup-significant
    and are always emitted as an entity, named or numeric.

    Example:
        codepoint_to_entity(169)                      # "&copy;"
        codepoint_to_entity(8212)                     # "&mdash;"
        codepoint_to_entity(0x2603)                   # "&#9731;"
        codepoint_to_entity(38, prefer_named=False)   # "&#38;"
    """
    name = _ent.codepoint2name.get(cp)
    if prefer_named and name:
        return f"&{name};"
    # Only ASCII without a named entity is safe to pass through verbatim;
    # & < > " must not leak out raw just because a numeric form was requested.
    if cp < 128 and name is None:
        return chr(cp)
    return f"&#{cp};"
def char_to_entity(char: str) -> str:
    """
    Return the HTML entity for a character (named if available, else numeric).

    Only the first character of ``char`` is examined; an empty string raises
    IndexError.  The work is delegated to codepoint_to_entity() with its
    default arguments.

    Example:
        char_to_entity("©")   # "&copy;"
        char_to_entity("—")   # "&mdash;"
        char_to_entity("A")   # "A"
    """
    first = char[0]
    return codepoint_to_entity(ord(first))
# ─────────────────────────────────────────────────────────────────────────────
# 2. Full encode / decode
# ─────────────────────────────────────────────────────────────────────────────
def encode_named(text: str, ascii_only: bool = True) -> str:
    """
    Replace every non-ASCII character with an HTML entity.

    A named entity from the HTML4 table is used where one exists; any other
    non-ASCII character becomes a decimal reference ``&#N;``.

    ascii_only=True (default): ASCII characters are left untouched.
    ascii_only=False: the text is first passed through
        html.escape(..., quote=True), so &, <, >, " and ' are escaped too.

    Example:
        encode_named("Hello © World — café")
        # "Hello &copy; World &mdash; caf&eacute;"
    """
    # Optionally escape the five markup-significant characters up front; the
    # entities this produces are pure ASCII and pass through the loop below.
    source = text if ascii_only else html.escape(text, quote=True)

    def _encode(ch: str) -> str:
        code = ord(ch)
        if code < 128:
            return ch
        entity_name = _ent.codepoint2name.get(code)
        return f"&{entity_name};" if entity_name else f"&#{code};"

    return "".join(_encode(ch) for ch in source)
def decode_all(text: str) -> str:
    """
    Decode every HTML character reference in ``text``.

    Named, decimal and hexadecimal references are all handled; this is a
    direct pass-through to html.unescape().

    Example:
        decode_all("&amp;&lt;&copy;&mdash;&#169;&#x2014;")
        # "&<©—©—"
    """
    decoded = html.unescape(text)
    return decoded
# ─────────────────────────────────────────────────────────────────────────────
# 3. Entity finder — locate all referenced entities in text
# ─────────────────────────────────────────────────────────────────────────────
# Matches a semicolon-terminated character reference: a named one (&copy;),
# a decimal one (&#169;) or a hexadecimal one.  The hex prefix is accepted in
# either case (&#xA9; / &#XA9;) because html.unescape() decodes both.
_ENTITY_RE = re.compile(r"&([a-zA-Z][a-zA-Z0-9]*|#[0-9]+|#[xX][0-9a-fA-F]+);")


def find_entities(text: str) -> list[tuple[str, str, int]]:
    """
    Find all HTML entity references in text.

    Returns a list of (raw_ref, resolved, offset) tuples in order of
    appearance, where ``raw_ref`` is the matched reference including ``&``
    and ``;``, ``resolved`` is html.unescape(raw_ref), and ``offset`` is the
    index of the ``&`` in ``text``.  A reference html.unescape() does not
    recognise comes back with ``resolved`` equal to ``raw_ref``.

    Example:
        refs = find_entities("Hello &amp; &copy; world")
        for raw, resolved, pos in refs:
            print(raw, "→", resolved, "at", pos)
    """
    return [
        (match.group(0), html.unescape(match.group(0)), match.start())
        for match in _ENTITY_RE.finditer(text)
    ]
def unknown_entities(text: str) -> list[str]:
    """
    List the entity references in ``text`` that html.unescape() leaves as-is.

    Each reference matched by the module's entity pattern is unescaped on
    its own; those whose output equals their input are returned, in order
    of appearance (duplicates are kept).

    Example:
        unknown_entities("&amp; &bogus; &nbsp;")   # ["&bogus;"]
    """
    candidates = (match.group(0) for match in _ENTITY_RE.finditer(text))
    return [ref for ref in candidates if html.unescape(ref) == ref]
# ─────────────────────────────────────────────────────────────────────────────
# 4. HTML entity statistics
# ─────────────────────────────────────────────────────────────────────────────
def entity_stats() -> dict[str, int]:
    """
    Return the sizes of the lookup tables exposed by html.entities.

    Keys of the returned dict:
        html5_entries    – number of keys in entities.html5
        name2codepoint   – number of HTML4 names (name → int code point)
        codepoint2name   – number of HTML4 code points (int → name)
        entitydefs       – number of HTML4 names (name → character)
        multi_char_html5 – html5 entries whose replacement text is longer
                           than one character

    Example:
        stats = entity_stats()
        print(stats)
    """
    return {
        "html5_entries": len(_ent.html5),
        "name2codepoint": len(_ent.name2codepoint),
        "codepoint2name": len(_ent.codepoint2name),
        # html.entities has no ``name2unichr``; the name → character table
        # is called ``entitydefs``.
        "entitydefs": len(_ent.entitydefs),
        "multi_char_html5": sum(1 for v in _ent.html5.values() if len(v) > 1),
    }
def entity_search(query: str) -> list[tuple[str, str]]:
    """
    Search the HTML5 entity table for names containing ``query``.

    Matching is a case-insensitive substring test against each table key.
    Returns (reference, replacement_text) pairs sorted by table key, where
    ``reference`` is ``&`` followed by the key exactly as stored — so it
    ends in ``;`` for most entries, while legacy keys stored without a
    semicolon come back without one.

    Example:
        entity_search("downarrow")
        # [..., ("&DownArrow;", "↓"), ...]
    """
    needle = query.lower()
    hits: list[tuple[str, str]] = []
    for key, replacement in sorted(_ent.html5.items()):
        if needle in key.lower():
            hits.append((f"&{key}", replacement))
    return hits
# ─────────────────────────────────────────────────────────────────────────────
# 5. Smart-quote / typography expander
# ─────────────────────────────────────────────────────────────────────────────
# ASCII shorthand → typographic character.  Application order is decided in
# expand_typography(), not by the order of this literal.
_TYPOGRAPHY = {
    "---": "\u2014",   # em dash
    "--": "\u2013",    # en dash
    "...": "\u2026",   # ellipsis
    "(c)": "\u00a9",   # copyright
    "(r)": "\u00ae",   # registered
    "(tm)": "\u2122",  # trademark
    "'": "\u2019",     # right single quote (simple apostrophe replacement)
}


def expand_typography(text: str) -> str:
    """
    Swap common ASCII typography shorthands for their Unicode characters.

    Every key of _TYPOGRAPHY is replaced wherever it occurs, using plain
    case-sensitive substring replacement.

    Example:
        expand_typography("Hello --- world (c) 2024 (tm)")
        # "Hello \u2014 world © 2024 ™"
    """
    # Longest shorthand first, so "---" is consumed before "--" can split it.
    ordered = sorted(_TYPOGRAPHY, key=len, reverse=True)
    for shorthand in ordered:
        text = text.replace(shorthand, _TYPOGRAPHY[shorthand])
    return text
# ─────────────────────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────────────────────
def main() -> None:
    """
    Print a short walkthrough of every helper in this module.

    Inputs that are meant to contain entity references are written as real
    references (``&copy;``, ``&pound;`` …) so the resolver and finder have
    something to work on.
    """
    print("=== html.entities demo ===")

    # ── entity stats ──────────────────────────────────────────────────────
    print("\n--- entity_stats ---")
    for k, v in entity_stats().items():
        print(f" {k:25s}: {v}")

    # ── resolve_entity ────────────────────────────────────────────────────
    # Mix of bare names, semicolon-terminated names and full references,
    # plus one unknown name.
    print("\n--- resolve_entity ---")
    for name in ["amp", "&copy;", "mdash;", "&frac12;", "hearts", "bogus"]:
        result = resolve_entity(name)
        print(f" {name:15s} → {result!r}")

    # ── encode_named / decode_all ─────────────────────────────────────────
    print("\n--- encode + decode ---")
    text = "Hello © World — café ½ ∞"
    encoded = encode_named(text)
    decoded = decode_all(encoded)
    print(f" original : {text!r}")
    print(f" encoded : {encoded!r}")
    print(f" decoded : {decoded!r}")
    print(f" round-trip: {decoded == text}")

    # ── find_entities ─────────────────────────────────────────────────────
    print("\n--- find_entities ---")
    html_str = "Price: &pound;10&mdash;expensive &amp; tax &bogus; included"
    for raw, resolved, pos in find_entities(html_str):
        print(f" pos={pos:3d} {raw:12s} → {resolved!r}")
    print(f" unknown: {unknown_entities(html_str)}")

    # ── codepoint_to_entity ───────────────────────────────────────────────
    print("\n--- codepoint_to_entity ---")
    for cp in [38, 60, 169, 8212, 9829, 0x2603]:
        ent = codepoint_to_entity(cp)
        print(f" U+{cp:04X} {chr(cp)!r} → {ent}")

    # ── entity_search ─────────────────────────────────────────────────────
    print("\n--- entity_search('arrow') (first 5) ---")
    for name, char in entity_search("arrow")[:5]:
        print(f" {name:20s} → {char!r}")

    # ── expand_typography ─────────────────────────────────────────────────
    print("\n--- expand_typography ---")
    raw = "Claude Code --- Python (c) 2024 (tm) ... it's great"
    expanded = expand_typography(raw)
    print(f" before: {raw!r}")
    print(f" after : {expanded!r}")

    print("\n=== done ===")


if __name__ == "__main__":
    main()
For the html stdlib companion — html.unescape(s) (decodes all entities including html5) and html.escape(s) (encodes &, <, >, ", ') are the recommended high-level API (html.unescape() consults html.entities.html5 internally) — use html.escape()/html.unescape() for all production encoding and decoding; use html.entities directly only when you need to enumerate entity tables, build custom encoders, or search for specific named references. For the markupsafe (PyPI) alternative — markupsafe.escape(s) and markupsafe.Markup(s) provide context-aware HTML escaping with a Markup string type that prevents double-escaping in Jinja2 and Flask templates — use markupsafe in template engines and web frameworks to avoid XSS; use html.escape/html.entities in scripts and data pipelines that don’t need template-level safety. The Claude Skills 360 bundle includes html.entities skill sets covering resolve_entity()/codepoint_to_entity()/char_to_entity() lookups, encode_named()/decode_all() round-trip helpers, find_entities()/unknown_entities() finders, entity_stats()/entity_search() table tools, and expand_typography() smart-quote expander. Start with the free tier to try HTML entity patterns and html.entities pipeline code generation.