pyparsing builds text parsers from composable primitives. pip install pyparsing. Tokens: from pyparsing import Word, Keyword, Literal, Regex, Suppress, alphas, alphanums, nums. Word(alphas) — one or more alpha chars. Word(alphanums + "_") — identifier. Keyword("if") — whole-word match. Suppress("[") — match but discard. Regex(r"\d+\.\d*") — regex token. QuotedString('"') — double-quoted string. pyparsing_common.integer — int literal. pyparsing_common.real — float. Composition: And([a, b]) = a + b. MatchFirst([a, b]) = a | b (the first alternative that matches wins). Or([a, b]) = a ^ b (the longest match wins). ZeroOrMore(expr). OneOrMore(expr). Optional(expr). Group(expr) — wrap in sub-list. Suppress(literal) — parse, discard. Forward() — recursive grammar. nestedExpr("(",")"). Results: result = expr.parseString(text). result[0] index. result["name"] named. result.asDict(). result.asList(). Names: expr.setResultsName("key") or expr("key"). Scan: for tokens, start, end in expr.scanString(text): ... Transform: expr.transformString(text). Packrat: ParserElement.enablePackrat() — memoize repeated sub-expressions. Actions: expr.setParseAction(fn) — transform matched tokens. expr.addCondition(fn) — filter. Combine(a + b) — merge adjacent. commaSeparatedList shorthand. Claude Code generates pyparsing grammars, expression parsers, and config DSL readers.
CLAUDE.md for pyparsing
## pyparsing Stack
- Version: pyparsing >= 3.1 | pip install pyparsing
- Tokens: Word(alphas) | Keyword("if") | Regex(r"...") | QuotedString('"')
- Compose: a + b (And) | a | b (MatchFirst, first match wins) | a ^ b (Or, longest match wins) | ZeroOrMore(x) | Optional(x) | Group(x)
- Names: expr("key") or expr.setResultsName("key") — access as result["key"]
- Results: parseString(text)[0] | .asDict() | .asList()
- Scan: for tokens, s, e in expr.scanString(text): — find all matches
- Perf: ParserElement.enablePackrat() before complex grammars
pyparsing Parser Construction Pipeline
# app/parsers.py — pyparsing grammars for expressions, config, and DSL parsing
from __future__ import annotations
import operator
from typing import Any
from pyparsing import (
CaselessKeyword,
Combine,
Forward,
Group,
Keyword,
Literal,
OneOrMore,
OpAssoc,
Optional,
ParserElement,
ParseResults,
QuotedString,
Regex,
Suppress,
Word,
ZeroOrMore,
alphanums,
alphas,
infixNotation,
nums,
printables,
pyparsing_common,
rest_of_line,
)
# Packrat memoization caches sub-expression results, which pays off for
# grammars that backtrack heavily.
ParserElement.enablePackrat()

# Punctuation tokens: matched, then suppressed so they never show up in the
# parse results.
LPAR, RPAR = (Suppress(ch) for ch in "()")
LBRACKET, RBRACKET = (Suppress(ch) for ch in "[]")
LBRACE, RBRACE = (Suppress(ch) for ch in "{}")
COMMA, EQUALS, COLON, SEMI = (Suppress(ch) for ch in ",=:;")
# ─────────────────────────────────────────────────────────────────────────────
# 1. Arithmetic expression parser with operator precedence
# ─────────────────────────────────────────────────────────────────────────────
def _make_arith_parser():
    """
    Build the arithmetic expression grammar.

    Operands are numeric literals or identifiers. ``infixNotation`` layers the
    operators on top of them, tightest-binding level first; each tuple is
    ``(operator_expr, arity, associativity)``.

    ``infixNotation`` also supplies the parenthesised sub-expression rule
    itself (its ``lpar``/``rpar`` arguments default to suppressed "(" and
    ")"), so the operand expression must not define one of its own.

    Returns:
        The pyparsing expression matching a complete arithmetic expression.
    """
    integer = pyparsing_common.integer
    real = pyparsing_common.real
    # real goes first: integer would otherwise take the "3" of "3.14" and
    # leave ".14" behind.
    number = (real | integer)("number")
    ident = Word(alphas + "_", alphanums + "_")("name")

    # No "(" expr ")" alternative here. A hand-written one would need a
    # Forward that is later bound to the finished expression; an unbound
    # Forward() can never match, and infixNotation handles nesting anyway.
    atom = number | ident

    expr = infixNotation(
        atom,
        [
            (Literal("**"), 2, OpAssoc.RIGHT),  # power (right-assoc)
            (Literal("-"), 1, OpAssoc.RIGHT),  # unary minus
            (Literal("*") | Literal("/") | Literal("%"), 2, OpAssoc.LEFT),
            (Literal("+") | Literal("-"), 2, OpAssoc.LEFT),
        ],
    )
    return expr
# The grammar is built once at import time and reused by every call.
_arith_parser = _make_arith_parser()


def parse_expression(text: str) -> ParseResults:
    """Return the parse result for an arithmetic expression.

    Example input: '2 * (x + 3) ** 2'. The whole of ``text`` must match the
    grammar (``parseAll=True``).
    """
    parsed = _arith_parser.parseString(text, parseAll=True)
    return parsed
# ─────────────────────────────────────────────────────────────────────────────
# 2. INI / Key=value config file parser
# ─────────────────────────────────────────────────────────────────────────────
def make_config_parser():
    """
    Build a grammar for INI-style config text:

        [section]
        key = value
        key2 = "quoted value"
        # comment

    Each section parses to a group ``[name, items]``, where ``items`` is a
    group of key/value groups carrying the result names ``key`` and ``value``.
    Comments are suppressed. ``parse_config`` turns this structure into a
    ``{section: {key: value}}`` dict.

    Value conversion:
        * ``true`` / ``false`` (any case) become ``bool``;
        * ``pyparsing_common.real`` / ``integer`` literals become float / int;
        * quoted strings lose their quotes;
        * anything else is the raw text up to end of line or ``#``, stripped.

    A value is only converted to bool/number when that literal is the whole
    value, so ``30s`` or ``1.2.3`` stay strings instead of being cut short.

    Returns:
        The pyparsing expression for a whole config file.
    """
    comment = Suppress("#" + rest_of_line)
    identifier = Word(alphas + "_", alphanums + "_-.")

    # Unquoted fallback. leaveWhitespace() keeps pyparsing from skipping past
    # a newline to reach the value, so the match stays on the key's own line;
    # the parse action then trims the surrounding blanks.
    bare_str = (
        Regex(r"[^\n#]+")
        .leaveWhitespace()
        .setParseAction(lambda t: t[0].strip())
    )
    value_str = QuotedString('"') | QuotedString("'") | bare_str

    boolean = (CaselessKeyword("true") | CaselessKeyword("false")).setParseAction(
        lambda t: t[0].lower() == "true"
    )

    # Zero-width check that nothing but blanks, a comment, or the line end
    # follows. It must not skip whitespace itself, or it would jump to the
    # next line before looking ahead.
    value_ends_here = Suppress(Regex(r"(?=[ \t]*(?:#|\r|\n|\Z))").leaveWhitespace())
    typed_value = (
        boolean | pyparsing_common.real | pyparsing_common.integer
    ) + value_ends_here

    value = typed_value | value_str
    key_value = Group(identifier("key") + EQUALS + value("value"))
    section_head = LBRACKET + identifier("name") + RBRACKET
    section = Group(section_head + Group(ZeroOrMore(key_value | comment))("items"))
    config_file = ZeroOrMore(comment | section)
    return config_file
def parse_config(text: str) -> dict[str, dict[str, Any]]:
    """Parse an INI-style string into a nested ``{section: {key: value}}`` dict.

    Args:
        text: Config text made of ``[section]`` headers followed by
            ``key = value`` lines; ``#`` starts a comment.

    Returns:
        One inner dict per section. If a key repeats within a section, or a
        section name repeats, the later occurrence replaces the earlier one.

    Raises:
        pyparsing.ParseException: if any part of ``text`` does not match the
            grammar.
    """
    parser = make_config_parser()
    # parseAll=True: otherwise pyparsing stops at the first line it cannot
    # match and the rest of the file is dropped without any error.
    result = parser.parseString(text, parseAll=True)
    # Each section group is [name, items]; each item carries "key"/"value".
    return {
        section[0]: {kv["key"]: kv["value"] for kv in section[1]}
        for section in result
    }
# ─────────────────────────────────────────────────────────────────────────────
# 3. Simple SQL SELECT parser
# ─────────────────────────────────────────────────────────────────────────────
def make_select_parser():
    """
    Build a grammar for a simple SELECT statement, e.g.:

        SELECT col1, col2 FROM table WHERE col = 'val' LIMIT 100

    Result names set by the grammar:
        * ``columns`` - group of ``[column]`` / ``[column, alias]`` groups
          (``*`` is accepted as a column);
        * ``table``   - the table identifier;
        * ``where``   - optional group of ``[col, op, val]`` conditions with
          the AND/OR tokens between them;
        * ``limit``   - optional integer.

    Keywords are case-insensitive and cannot be used as identifiers.

    Returns:
        The pyparsing expression for one SELECT statement.
    """
    SELECT = Keyword("SELECT", caseless=True)
    FROM = Keyword("FROM", caseless=True)
    WHERE = Keyword("WHERE", caseless=True)
    LIMIT = Keyword("LIMIT", caseless=True)
    AND = Keyword("AND", caseless=True)
    OR = Keyword("OR", caseless=True)
    AS_KW = Keyword("AS", caseless=True)
    STAR = Literal("*")

    # Negative lookahead keeps reserved words out of identifier positions, so
    # "SELECT * FROM WHERE" is an error rather than a table named WHERE.
    reserved = SELECT | FROM | WHERE | LIMIT | AND | OR | AS_KW
    identifier = ~reserved + Word(alphas + "_", alphanums + "_")
    table_name = identifier

    # Combine joins "table.column" into one token and forbids inner spaces.
    column = Combine(identifier + Optional("." + identifier)) | STAR
    alias = Group(column + Optional(Suppress(AS_KW) + identifier("alias")))
    column_list = Group(alias + ZeroOrMore(COMMA + alias))("columns")

    string_val = QuotedString("'") | QuotedString('"')
    number_val = pyparsing_common.real | pyparsing_common.integer
    value = string_val | number_val | identifier

    # Only real comparison operators; two-character forms are listed first so
    # "<=" is not read as "<" followed by a stray "=".
    op = Regex(r"<=|>=|<>|!=|==|=|<|>")
    condition = Group(identifier("col") + op("op") + value("val"))
    where_expr = condition + ZeroOrMore((AND | OR) + condition)

    select_stmt = (
        SELECT
        + column_list
        + Suppress(FROM)
        + table_name("table")
        + Optional(Suppress(WHERE) + Group(where_expr)("where"))
        + Optional(Suppress(LIMIT) + pyparsing_common.integer("limit"))
    )
    return select_stmt
def parse_select(sql: str) -> dict[str, Any]:
    """Parse a SELECT statement into a plain dict.

    Args:
        sql: The statement text; surrounding whitespace is ignored.

    Returns:
        A dict with keys ``table`` (str), ``columns`` (nested list),
        ``where`` (nested list, empty when there is no WHERE clause) and
        ``limit`` (int, or None when there is no LIMIT clause).

    Raises:
        pyparsing.ParseException: if ``sql`` does not match the grammar.
    """
    parser = make_select_parser()
    result = parser.parseString(sql.strip(), parseAll=True)

    def _listed(name: str) -> list:
        # An absent optional clause has no ParseResults to call .asList() on.
        # Test against None rather than truthiness so a present-but-empty
        # result is still converted.
        found = result.get(name)
        return found.asList() if found is not None else []

    return {
        "table": result.get("table", ""),
        "columns": _listed("columns"),
        "where": _listed("where"),
        "limit": result.get("limit"),
    }
# ─────────────────────────────────────────────────────────────────────────────
# 4. Log line extractor (scanString)
# ─────────────────────────────────────────────────────────────────────────────
def make_log_parser():
    """
    Build a grammar for one log line of the form:

        2024-01-05 14:30:22 ERROR app.db Connection timeout after 30s

    Result names: ``date``, ``time``, ``level``, ``logger``, ``message``.
    Intended for use with scanString over multi-line text.
    """

    def digits(count):
        # Fixed-width run of digits.
        return Word(nums, exact=count)

    def name_part():
        # One dotted-name component of the logger.
        return Word(alphanums + "_")

    date = Combine(digits(4) + "-" + digits(2) + "-" + digits(2))
    time_ = Combine(digits(2) + ":" + digits(2) + ":" + digits(2))
    level = (
        Keyword("DEBUG")
        | Keyword("INFO")
        | Keyword("WARNING")
        | Keyword("ERROR")
        | Keyword("CRITICAL")
    )
    logger = Combine(name_part() + ZeroOrMore("." + name_part()))
    # "." does not match a newline, so the message ends with the line.
    msg = Regex(r".+")

    log_line = (
        date("date")
        + time_("time")
        + level("level")
        + logger("logger")
        + msg("message")
    )
    return log_line
def extract_log_entries(log_text: str) -> list[dict[str, str]]:
    """Collect every log line in ``log_text`` that matches the log grammar.

    Returns one dict per match with the keys date, time, level, logger and
    message; a field missing from a match defaults to "".
    """
    field_names = ("date", "time", "level", "logger", "message")
    log_line = make_log_parser()
    return [
        {field: match.get(field, "") for field in field_names}
        for match, _start, _end in log_line.scanString(log_text)
    ]
# ─────────────────────────────────────────────────────────────────────────────
# 5. Version string parser
# ─────────────────────────────────────────────────────────────────────────────
def make_version_parser():
    """
    Build a grammar for semantic version strings:

        1.2.3, 1.2.3-beta.1, 1.2.3+build.42, 1.0.0-rc-1+sha.abc123

    Result names: ``major``, ``minor``, ``patch`` (digit strings), plus the
    optional ``prerelease`` and ``build`` suffixes without their leading
    "-" / "+".

    Returns:
        The pyparsing expression for one version string.
    """
    number = Word(nums)
    dot = Suppress(".")
    # SemVer identifiers are dot-separated runs of [0-9A-Za-z-]; the hyphen
    # is part of the alphabet, so "rc-1" is one valid identifier.
    suffix = Word(alphanums + ".-")

    version = (
        number("major") + dot
        + number("minor") + dot
        + number("patch")
        # The first "-" introduces the pre-release; later hyphens belong to it.
        + Optional(Suppress("-") + suffix("prerelease"))
        + Optional(Suppress("+") + suffix("build"))
    )
    return version
def parse_version(v: str) -> dict[str, Any]:
    """Split a version string into its components.

    Returns a dict with integer ``major``/``minor``/``patch`` and the
    ``prerelease``/``build`` suffix strings (None when absent). The whole
    stripped input must match the version grammar.
    """
    parsed = make_version_parser().parseString(v.strip(), parseAll=True)
    info: dict[str, Any] = {
        part: int(parsed[part]) for part in ("major", "minor", "patch")
    }
    for suffix in ("prerelease", "build"):
        info[suffix] = parsed.get(suffix)
    return info
# ─────────────────────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────────────────────
CONFIG_SAMPLE = """
[database]
host = localhost
port = 5432
name = "myapp_db"
ssl = true
[server]
port = 8000
debug = false
workers = 4
"""
LOG_SAMPLE = """
2024-01-05 14:30:22 INFO app.server Server started on port 8000
2024-01-05 14:31:05 ERROR app.db Connection timeout after 30s
2024-01-05 14:31:06 WARNING app.cache Cache miss rate 45%
"""
def _run_demo() -> None:
    """Run each parser in this module on sample input and print the results."""
    print("=== Arithmetic expression ===")
    r = parse_expression("2 * (x + 3) ** 2")
    print(f" {r.asList()}")

    print("\n=== Config parser ===")
    for section, vals in parse_config(CONFIG_SAMPLE).items():
        print(f" [{section}]")
        for k, v in vals.items():
            print(f" {k} = {v!r}")

    print("\n=== SELECT parser ===")
    queries = (
        "SELECT id, name FROM users WHERE active = 1 LIMIT 10",
        "SELECT * FROM products WHERE category = 'books'",
        "SELECT user_id, count FROM stats",
    )
    for r in map(parse_select, queries):
        print(f" table={r['table']} cols={r['columns']} limit={r['limit']}")

    print("\n=== Log extraction ===")
    for e in extract_log_entries(LOG_SAMPLE):
        print(f" [{e['level']:8}] {e['logger']:15} {e['message']}")

    print("\n=== Version parser ===")
    versions = ("1.2.3", "2.0.0-beta.1", "1.0.0+build.42", "3.1.4-rc.2+sha.abc123")
    for v in versions:
        p = parse_version(v)
        print(f" {v:25} → {p}")


if __name__ == "__main__":
    _run_demo()
For the re (regex) alternative — Python’s re module is the right tool for fixed-pattern extraction from homogeneous text, but composing a grammar from regex primitives for hierarchical structures (nested brackets, operator precedence, recursive rules) requires writing a manual recursive-descent parser; pyparsing’s infixNotation() builds a correct operator-precedence grammar in 8 lines, nestedExpr("(",")") handles arbitrarily nested parentheses, and Group() / named results (expr("key")) give you a structured parse tree instead of raw string matches. For the lark alternative — lark takes a formal BNF or EBNF grammar string, written as a separate text block, and parses input with an Earley or LALR(1) algorithm, which handles ambiguous or left-recursive grammars and is faster for large inputs; pyparsing builds the grammar in pure Python using operator overloading (a + b, a | b) so the grammar lives in the same file as the parse actions, making it easier to prototype and debug smaller parsers and DSLs without context-switching to a separate grammar file. The Claude Skills 360 bundle includes pyparsing skill sets covering Word/Keyword/Regex/QuotedString primitives, And/MatchFirst/Or/ZeroOrMore/OneOrMore/Optional composition, Group and Suppress for structure, setResultsName and expr("key") syntax, infixNotation for operator-precedence grammars, Forward for recursive rules, scanString for multi-match extraction, parseString with parseAll, parse_config INI file reader, parse_select SQL SELECT parser, extract_log_entries log scanner, parse_version semantic version parser, and enablePackrat memoization. Start with the free tier to try parser grammar code generation.