Python’s html.parser module provides a simple, stdlib-only HTML parser based on subclassing HTMLParser and overriding event handler methods. from html.parser import HTMLParser. Subclass: override handle_starttag(tag, attrs), handle_endtag(tag), handle_data(data), handle_comment(data), handle_entityref(name), handle_charref(name), handle_decl(decl). Feed: parser.feed(html_string) — can be called multiple times with chunks; parser.close() flushes remaining data. Reset: parser.reset() — clears all state. Attributes: attrs in handle_starttag is a list of (name, value) tuples; value is None for boolean attributes (e.g. disabled). convert_charrefs=True (default since Python 3.5) — automatically converts character references such as &amp; and &#160; to Unicode in handle_data; when it is True, handle_entityref and handle_charref are not invoked. HTMLParser.getpos() → (line, col) of the current token — useful for error reporting. The module is not a full DOM; it is a streaming SAX-style event parser. Claude Code generates link extractors, text extractors, table scrapers, meta tag readers, and sitemap parsers.
CLAUDE.md for html.parser
## html.parser Stack
- Stdlib: from html.parser import HTMLParser
- Subclass: class MyParser(HTMLParser):
- def handle_starttag(self, tag, attrs): ...
- def handle_endtag(self, tag): ...
- def handle_data(self, data): ...
- Feed: p = MyParser(); p.feed(html); p.close()
- Attrs: dict(attrs) # (name, value) list → dict
- Note: convert_charrefs=True by default (handles &amp;, &#160;, etc.)
html.parser HTML Parsing Pipeline
# app/htmlparserutil.py — links, text, tables, meta, structured data
from __future__ import annotations
import html
import re
from collections import defaultdict
from dataclasses import dataclass, field
from html.parser import HTMLParser
from typing import Any
# ─────────────────────────────────────────────────────────────────────────────
# 1. Link extractor
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class LinkInfo:
    """One <a> anchor: destination URL, visible text, and key attributes."""
    href: str
    text: str
    rel: str  # e.g. "nofollow", "noopener", or ""
    title: str


class LinkExtractor(HTMLParser):
    """
    Collect every <a href="..."> anchor with its text, rel, and title.

    Example:
        ex = LinkExtractor()
        ex.feed(html_text)
        for link in ex.links:
            print(link.href, link.text)
    """

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self.links: list[LinkInfo] = []
        self._inside_anchor = False
        self._pending: dict[str, str] = {}
        self._text_chunks: list[str] = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        if tag != "a":
            return
        attr_map = dict(attrs)  # boolean attributes carry None values
        self._pending = {
            "href": attr_map.get("href") or "",
            "rel": attr_map.get("rel") or "",
            "title": attr_map.get("title") or "",
        }
        self._inside_anchor = True
        self._text_chunks = []

    def handle_endtag(self, tag: str) -> None:
        if tag != "a" or not self._inside_anchor:
            return
        # Collapse all internal whitespace runs to single spaces.
        anchor_text = " ".join("".join(self._text_chunks).split())
        self.links.append(LinkInfo(
            href=self._pending["href"],
            text=anchor_text,
            rel=self._pending["rel"],
            title=self._pending["title"],
        ))
        self._inside_anchor = False
        self._text_chunks = []

    def handle_data(self, data: str) -> None:
        if self._inside_anchor:
            self._text_chunks.append(data)
def extract_links(html_text: str) -> list[LinkInfo]:
    """
    Parse *html_text* and return every anchor as a LinkInfo.

    Example:
        links = extract_links(resp_body)
        hrefs = [l.href for l in links if l.href.startswith("https://")]
    """
    extractor = LinkExtractor()
    extractor.feed(html_text)
    extractor.close()
    return extractor.links
# ─────────────────────────────────────────────────────────────────────────────
# 2. Text extractor
# ─────────────────────────────────────────────────────────────────────────────
# Elements whose character data is never user-visible text.
_SKIP_TAGS = frozenset({"script", "style", "head", "noscript", "svg", "math"})


class TextExtractor(HTMLParser):
    """
    Accumulate the visible text of a document, ignoring everything inside
    script/style/head/noscript/svg/math elements.

    Example:
        ex = TextExtractor()
        ex.feed(html_text)
        print(ex.get_text())
    """

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self._hidden = 0  # nesting depth of currently-open skip tags
        self._chunks: list[str] = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        if tag in _SKIP_TAGS:
            self._hidden += 1

    def handle_endtag(self, tag: str) -> None:
        if tag in _SKIP_TAGS and self._hidden:
            self._hidden -= 1

    def handle_data(self, data: str) -> None:
        if self._hidden:
            return
        piece = data.strip()
        if piece:
            self._chunks.append(piece)

    def get_text(self, separator: str = " ") -> str:
        """Join all collected text fragments with *separator*."""
        return separator.join(self._chunks)
def extract_text(html_text: str, separator: str = " ") -> str:
    """
    Return the visible text of an HTML document as one string.

    Example:
        text = extract_text(page_html)
        words = len(text.split())
    """
    extractor = TextExtractor()
    extractor.feed(html_text)
    extractor.close()
    return extractor.get_text(separator)
# ─────────────────────────────────────────────────────────────────────────────
# 3. Meta tag reader
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class PageMeta:
    # Head metadata scraped from one HTML page; empty string means "absent".
    title: str = ""
    description: str = ""
    keywords: str = ""
    og_title: str = ""
    og_description: str = ""
    og_image: str = ""
    canonical: str = ""
    robots: str = ""
    charset: str = ""
    viewport: str = ""


class MetaExtractor(HTMLParser):
    """
    Extract <title>, <meta>, and <link rel="canonical"> from an HTML document.

    Fix: the ``rel`` attribute is an ASCII case-insensitive, space-separated
    token list (e.g. rel="Canonical alternate"), so canonical detection uses
    token membership instead of an exact string compare.

    Example:
        ex = MetaExtractor()
        ex.feed(html_text)
        print(ex.meta.title, ex.meta.description)
    """

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self.meta = PageMeta()
        self._in_title = False
        self._title_buf: list[str] = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        # Boolean attributes arrive with value None; normalize to "".
        a: dict[str, str] = {k: (v or "") for k, v in attrs}
        if tag == "title":
            self._in_title = True
            self._title_buf = []
        elif tag == "meta":
            self._handle_meta(a)
        elif tag == "link":
            # rel is a space-separated, case-insensitive token list.
            if "canonical" in a.get("rel", "").lower().split():
                self.meta.canonical = a.get("href", "")

    def _handle_meta(self, a: dict[str, str]) -> None:
        """Route one <meta> tag's attributes into the PageMeta fields."""
        name = a.get("name", "").lower()
        prop = a.get("property", "").lower()
        content = a.get("content", "")
        if name == "description":
            self.meta.description = content
        elif name == "keywords":
            self.meta.keywords = content
        elif name == "robots":
            self.meta.robots = content
        elif name == "viewport":
            self.meta.viewport = content
        elif prop == "og:title":
            self.meta.og_title = content
        elif prop == "og:description":
            self.meta.og_description = content
        elif prop == "og:image":
            self.meta.og_image = content
        elif a.get("charset"):
            # <meta charset="utf-8"> declares the document encoding.
            self.meta.charset = a["charset"]

    def handle_endtag(self, tag: str) -> None:
        if tag == "title" and self._in_title:
            # Collapse internal whitespace runs to single spaces.
            self.meta.title = " ".join("".join(self._title_buf).split())
            self._in_title = False

    def handle_data(self, data: str) -> None:
        if self._in_title:
            self._title_buf.append(data)
def extract_meta(html_text: str) -> PageMeta:
    """
    Parse *html_text* and return its head metadata as a PageMeta.

    Example:
        meta = extract_meta(page_html)
        print(meta.title, meta.og_image)
    """
    extractor = MetaExtractor()
    extractor.feed(html_text)
    extractor.close()
    return extractor.meta
# ─────────────────────────────────────────────────────────────────────────────
# 4. Simple table scraper
# ─────────────────────────────────────────────────────────────────────────────
class TableScraper(HTMLParser):
    """
    Extract every <table> as a list of rows, each row a list of cell strings.

    Robustness fixes over the naive version:
      * a stray </table> with no matching <table> no longer records a table;
      * implicitly closed cells/rows (HTML allows omitting </td>, </th> and
        </tr>) are flushed when the next <td>/<th>/<tr> starts and when the
        table ends, instead of being silently dropped;
      * per-table state is fully reset when a new <table> opens.

    Example:
        scraper = TableScraper()
        scraper.feed(html_text)
        for table in scraper.tables:
            for row in table:
                print(row)
    """

    def __init__(self) -> None:
        super().__init__(convert_charrefs=True)
        self.tables: list[list[list[str]]] = []
        self._in_table = False
        self._in_cell = False
        self._current_table: list[list[str]] = []
        self._current_row: list[str] = []
        self._cell_buf: list[str] = []

    def _flush_cell(self) -> None:
        """Close the open cell (if any): normalize whitespace, append to row."""
        if self._in_cell:
            self._current_row.append(" ".join("".join(self._cell_buf).split()))
            self._in_cell = False
            self._cell_buf = []

    def _flush_row(self) -> None:
        """Close the current row (if non-empty) and append it to the table."""
        self._flush_cell()
        if self._current_row:
            self._current_table.append(self._current_row)
            self._current_row = []

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        if tag == "table":
            self._in_table = True
            self._current_table = []
            self._current_row = []
            self._in_cell = False
            self._cell_buf = []
        elif tag == "tr" and self._in_table:
            self._flush_row()  # </tr> may have been omitted
        elif tag in ("td", "th") and self._in_table:
            self._flush_cell()  # </td>/</th> may have been omitted
            self._in_cell = True
            self._cell_buf = []

    def handle_endtag(self, tag: str) -> None:
        if tag == "table" and self._in_table:
            self._flush_row()  # capture a trailing row with no </tr>
            self.tables.append(self._current_table)
            self._in_table = False
            self._current_table = []
        elif tag == "tr" and self._in_table:
            self._flush_row()
        elif tag in ("td", "th") and self._in_cell:
            self._flush_cell()

    def handle_data(self, data: str) -> None:
        if self._in_cell:
            self._cell_buf.append(data)
def extract_tables(html_text: str) -> list[list[list[str]]]:
    """
    Return every table in *html_text* as [table[row[cell]]].

    Example:
        tables = extract_tables(page_html)
        first = tables[0]   # first table
        headers = first[0]  # header row
    """
    scraper = TableScraper()
    scraper.feed(html_text)
    scraper.close()
    return scraper.tables
# ─────────────────────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Smoke-test each extractor against one small, well-formed sample page.
    print("=== html.parser demo ===")
    SAMPLE_HTML = """<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<title>Demo Page & More</title>
<meta name="description" content="A demo page for html.parser">
<meta property="og:title" content="OG Demo">
<link rel="canonical" href="https://example.com/demo">
</head>
<body>
<h1>Hello html.parser</h1>
<p>Visit <a href="https://python.org" rel="nofollow" title="Python">Python.org</a>
or <a href="/docs">the docs</a>.</p>
<script>var x = "<not visible>";</script>
<table>
<tr><th>Name</th><th>Score</th></tr>
<tr><td>Alice</td><td>95</td></tr>
<tr><td>Bob</td><td>87</td></tr>
</table>
<!-- a comment -->
</body>
</html>"""
    # ── extract_links ─────────────────────────────────────────────────────────
    print("\n--- extract_links ---")
    for link in extract_links(SAMPLE_HTML):
        print(f" href={link.href!r} text={link.text!r} rel={link.rel!r}")
    # ── extract_text ──────────────────────────────────────────────────────────
    print("\n--- extract_text ---")
    text = extract_text(SAMPLE_HTML)
    # Only the first 100 characters — enough to eyeball the result.
    print(f" {text[:100]!r}")
    # ── extract_meta ──────────────────────────────────────────────────────────
    print("\n--- extract_meta ---")
    meta = extract_meta(SAMPLE_HTML)
    print(f" title: {meta.title!r}")
    print(f" description: {meta.description!r}")
    print(f" og_title: {meta.og_title!r}")
    print(f" canonical: {meta.canonical!r}")
    print(f" charset: {meta.charset!r}")
    # ── extract_tables ────────────────────────────────────────────────────────
    print("\n--- extract_tables ---")
    tables = extract_tables(SAMPLE_HTML)
    print(f" {len(tables)} table(s) found")
    for i, row in enumerate(tables[0]):
        print(f" row {i}: {row}")
    # ── html.escape / unescape ────────────────────────────────────────────────
    # Round-trip via the top-level ``html`` module (not the parser itself).
    print("\n--- html.escape / unescape ---")
    raw = '<script>alert("xss")</script>'
    escaped = html.escape(raw)
    print(f" escaped: {escaped}")
    print(f" unescaped: {html.unescape(escaped)[:40]}")
    print("\n=== done ===")
For the lxml / html5lib (PyPI) alternative — lxml.etree.HTML(text) and lxml.html.fromstring(text) parse HTML into a full XPath-queryable element tree; html5lib.parse(text) parses into a W3C-compliant DOM with full HTML5 error recovery — use lxml when you need XPath queries, CSS selectors (lxml.cssselect), or high-performance parsing on large documents; use html5lib when you need strict HTML5 compliance and handling of malformed markup exactly as browsers do; use html.parser for zero-dependency scripts on well-formed HTML where streaming SAX-style event handling is sufficient. For the BeautifulSoup (PyPI) alternative — bs4.BeautifulSoup(text, "html.parser") wraps html.parser (or lxml) in a convenient tree-search API with find(), find_all(), CSS selectors, and .text properties — use BeautifulSoup when extraction logic is complex and the fluent search API significantly reduces code; use html.parser directly when you want zero dependencies, maximum control over events, or are building a streaming parser for very large HTML documents. The Claude Skills 360 bundle includes html.parser skill sets covering LinkExtractor with extract_links() for href, anchor text, rel, and title, TextExtractor with extract_text() visible-text extractor with skip-tag support, MetaExtractor with extract_meta() for PageMeta title/description/og/canonical/charset, and TableScraper with extract_tables() returning list-of-list-of-strings. Start with the free tier to try HTML parsing patterns and html.parser pipeline code generation.