The regex module extends Python’s re with Unicode properties, fuzzy matching, and more. pip install regex. Drop-in: import regex as re — all re functions work. Unicode properties: regex.findall(r"\p{Letter}+", text) — matches Unicode letters. \p{Script=Latin} \p{Script=Han} \p{Category=Lu} \p{Block=CJK_Unified_Ideographs}. \P{Digit} — negated Unicode property. Fuzzy: regex.search(r"(?:word){e<=1}", text) — match “word” with up to 1 error. {s<=1,i<=1,d<=1} — substitutions, insertions, deletions. {e<=2} — up to 2 errors. {i<=1,d<=1,e<=2} — combined constraints. match.fuzzy_counts → (subs, ins, dels). match.fuzzy_changes → positions of changes. Overlapping: regex.findall(r"(?=(\d+))", "123", overlapped=True). Variable lookbehind: regex.search(r"(?<=a+)b", "aaab") — works in regex, not re. Possessive: r"a++b" — no backtrack. Atomic group: r"(?>a+)b". Branch reset: r"(?|(\w+)|(\d+))" — same group numbers for both. \K — keep nothing to left; resets start of match. regex.sub(r"pattern", repl, text, count=1, flags=regex.MULTILINE). Full-string match: regex.fullmatch(). Timeout: recent regex versions accept a timeout= argument on matching functions (raises TimeoutError); on older versions fall back to signal or concurrent.futures. Claude Code generates regex Unicode extractors, fuzzy search pipelines, and text normalization patterns.
CLAUDE.md for regex
## regex Stack
- Version: regex >= 2023 | pip install regex | import regex as re (drop-in)
- Unicode: \p{Letter}, \p{Script=Latin}, \p{Category=Lu}, \P{Digit}
- Fuzzy: r"(?:word){e<=1}" — 1 error | fuzzy_counts → (subs, ins, dels)
- Overlapping: findall(pattern, text, overlapped=True)
- Variable lookbehind: (?<=a+)b — not supported in stdlib re
- Possessive: a++ | atomic group: (?>a+) — prevent catastrophic backtracking
- Branch reset: (?|(\w+)|(\d+)) — same group number in alternatives
regex Advanced Pattern Matching Pipeline
# app/advanced_re.py — regex Unicode properties, fuzzy matching, and extraction
from __future__ import annotations
from typing import Any
import regex
# ─────────────────────────────────────────────────────────────────────────────
# 1. Unicode property matching
# ─────────────────────────────────────────────────────────────────────────────
# Compiled patterns for common Unicode categories
# \p{...} property escapes span every script (Latin, Han, Cyrillic, Arabic,
# ...), so these patterns work on multilingual text without per-language
# special cases.  Compiled once at import time for reuse.

# Identifier-style "word": a letter followed by letters, digits, or connector
# punctuation (e.g. underscore).  NOTE(review): not referenced by any function
# in this module — confirm whether it is part of the public surface.
_UNICODE_WORD = regex.compile(r"\p{Letter}[\p{Letter}\p{Number}\p{Connector_Punctuation}]*")
# One or more Unicode letters in any script (used by extract_words).
_UNICODE_LETTER = regex.compile(r"\p{Letter}+")
# Decimal digits in any script: ASCII 0-9, Eastern Arabic, Devanagari, ...
# (used by extract_numbers).
_UNICODE_DIGIT = regex.compile(r"\p{Decimal_Digit_Number}+")
# Latin-script runs; \p{Mark} keeps combining accents attached to their base
# letter.  NOTE(review): unused here (split_by_script compiles its own copy).
_LATIN_WORD = regex.compile(r"[\p{Script=Latin}\p{Mark}]+")
# Han-script (CJK ideograph) runs (used by extract_cjk).
_CJK = regex.compile(r"\p{Script=Han}+")
# NOTE(review): the next two are unused in this module as written.
_ARABIC = regex.compile(r"\p{Script=Arabic}+")
_CYRILLIC = regex.compile(r"\p{Script=Cyrillic}+")
def extract_words(text: str) -> list[str]:
    """
    Return every run of Unicode letters found in *text*.

    Script-agnostic: "München", "北京", and "Москва" all match, because the
    pattern is \\p{Letter}+ rather than ASCII [a-zA-Z]+.
    """
    return [m.group() for m in _UNICODE_LETTER.finditer(text)]
def extract_numbers(text: str) -> list[str]:
    """
    Return every run of decimal digits in *text*, in any Unicode script
    (ASCII numerals, Eastern Arabic digits, Devanagari digits, ...).
    """
    return [m.group() for m in _UNICODE_DIGIT.finditer(text)]
def extract_cjk(text: str) -> list[str]:
    """Return each contiguous run of Han-script (CJK) characters in *text*."""
    matches = _CJK.finditer(text)
    return [m.group() for m in matches]
# Script-tagged patterns for split_by_script, compiled once at import time.
# (Previously these eight patterns were recompiled on every call.)
_SCRIPT_PATTERNS = {
    "Han": regex.compile(r"\p{Script=Han}+"),
    # \p{Mark} keeps combining accents inside the Latin segment.
    "Latin": regex.compile(r"[\p{Script=Latin}\p{Mark}]+"),
    "Cyrillic": regex.compile(r"\p{Script=Cyrillic}+"),
    "Arabic": regex.compile(r"\p{Script=Arabic}+"),
    "Greek": regex.compile(r"\p{Script=Greek}+"),
    "Hiragana": regex.compile(r"\p{Script=Hiragana}+"),
    "Katakana": regex.compile(r"\p{Script=Katakana}+"),
    "Hangul": regex.compile(r"\p{Script=Hangul}+"),
}


def split_by_script(text: str) -> list[tuple[str, str]]:
    """
    Split text into (script, segment) pairs, in document order:
    "Hello 北京 World" → [("Latin", "Hello"), ("Han", "北京"), ("Latin", "World")]

    Only the scripts in _SCRIPT_PATTERNS are recognized; characters outside
    them (spaces, punctuation, digits) are skipped entirely.

    Args:
        text: Input string in any mix of scripts.

    Returns:
        List of (script_name, segment) tuples ordered by position in *text*.
    """
    matches: list[tuple[int, str, str]] = []
    for script_name, pat in _SCRIPT_PATTERNS.items():
        for m in pat.finditer(text):
            # Runs of different scripts cannot overlap, so the start offset
            # alone is enough to restore document order.
            matches.append((m.start(), script_name, m.group()))
    matches.sort(key=lambda item: item[0])
    return [(script, segment) for _, script, segment in matches]
def strip_diacritics(text: str) -> str:
    """
    Drop combining diacritical marks (accents, tildes, ...) from *text*.

    "café" → "cafe" | "naïve" → "naive"

    The string is first NFD-normalized so precomposed characters split into
    a base character plus combining marks; substituting away \\p{Mark} then
    deletes just the marks.
    """
    import unicodedata

    decomposed = unicodedata.normalize("NFD", text)
    return regex.sub(r"\p{Mark}", "", decomposed)
def normalize_whitespace(text: str) -> str:
    """
    Collapse runs of Unicode whitespace and control characters (non-breaking
    spaces, tabs, newlines, etc.) into a single ASCII space, trimming the ends.

    Bug fixed: the old pattern ``\\p{Separator}+|\\p{Control}`` replaced each
    control character individually, so "a\\n\\nb" became "a  b" (two spaces),
    contradicting this docstring.  A single character class under one ``+``
    quantifier collapses mixed separator/control runs into exactly one space.

    NOTE(review): zero-width characters such as U+200B are category Cf
    (Format), not Separator/Control, and are left untouched — confirm
    whether they should also be normalized.
    """
    return regex.sub(r"[\p{Separator}\p{Control}]+", " ", text).strip()
# ─────────────────────────────────────────────────────────────────────────────
# 2. Fuzzy matching
# ─────────────────────────────────────────────────────────────────────────────
def fuzzy_search(
    pattern: str,
    text: str,
    max_errors: int = 1,
) -> list[dict[str, Any]]:
    """
    Locate every fuzzy occurrence of *pattern* in *text*.

    A hit may differ from the literal pattern by at most *max_errors* total
    edits (substitutions + insertions + deletions), expressed via the regex
    module's {e<=N} fuzzy quantifier.  Matching runs with overlapped=True,
    so overlapping candidate spans are all reported.

    Returns:
        [{"match", "start", "end", "errors", "substitutions",
          "insertions", "deletions"}] — one dict per hit.
    """
    fuzzy_pat = regex.compile(rf"(?:{regex.escape(pattern)}){{e<={max_errors}}}")
    hits: list[dict[str, Any]] = []
    for m in fuzzy_pat.finditer(text, overlapped=True):
        subs, ins, dels = m.fuzzy_counts
        hits.append(
            {
                "match": m.group(),
                "start": m.start(),
                "end": m.end(),
                "errors": subs + ins + dels,
                "substitutions": subs,
                "insertions": ins,
                "deletions": dels,
            }
        )
    return hits
def fuzzy_match(
    pattern: str,
    text: str,
    max_substitutions: int = 1,
    max_insertions: int = 1,
    max_deletions: int = 1,
) -> dict[str, Any] | None:
    """
    Return the first fuzzy match of *pattern* in *text* under separate
    per-error-type budgets, or None when nothing fits the budgets.

    Uses the regex module's constraint syntax: (?:word){s<=1,i<=1,d<=1}
    permits at most 1 substitution, 1 insertion, and 1 deletion.
    """
    constraints = f"s<={max_substitutions},i<={max_insertions},d<={max_deletions}"
    fuzzy_pat = regex.compile(rf"(?:{regex.escape(pattern)}){{{constraints}}}")
    found = fuzzy_pat.search(text)
    if found is None:
        return None
    subs, ins, dels = found.fuzzy_counts
    return {
        "match": found.group(),
        "start": found.start(),
        "end": found.end(),
        "substitutions": subs,
        "insertions": ins,
        "deletions": dels,
    }
def find_typos(
    word: str,
    text: str,
    max_errors: int = 1,
) -> list[str]:
    """
    Collect the misspelled variants of *word* that occur in *text*.

    Delegates to fuzzy_search and returns each distinct matched spelling
    that differs from the exact word, sorted alphabetically.
    """
    variants: set[str] = set()
    for hit in fuzzy_search(word, text, max_errors=max_errors):
        if hit["match"] != word:
            variants.add(hit["match"])
    return sorted(variants)
# ─────────────────────────────────────────────────────────────────────────────
# 3. Overlapping matches
# ─────────────────────────────────────────────────────────────────────────────
def find_overlapping(pattern: str, text: str, flags: int = 0) -> list[str]:
    """
    Return every overlapping match of *pattern* in *text*.

    Standard re.findall only yields non-overlapping matches; with the regex
    module's overlapped=True:
    find_overlapping(r"\\d\\d", "12345") → ["12", "23", "34", "45"]
    """
    compiled = regex.compile(pattern, flags=flags)
    return compiled.findall(text, overlapped=True)
def find_overlapping_spans(pattern: str, text: str) -> list[tuple[int, int, str]]:
    """Return (start, end, matched_text) for every overlapping match of *pattern*."""
    spans: list[tuple[int, int, str]] = []
    for m in regex.finditer(pattern, text, overlapped=True):
        spans.append((m.start(), m.end(), m.group()))
    return spans
# ─────────────────────────────────────────────────────────────────────────────
# 4. Variable-length lookbehind (not in stdlib re)
# ─────────────────────────────────────────────────────────────────────────────
def find_after_prefix(text: str, prefix_pattern: str, word_pattern: str) -> list[str]:
    """
    Find every occurrence of *word_pattern* immediately preceded by
    *prefix_pattern*, via a variable-length lookbehind.

    e.g. find_after_prefix(text, r"https?://", r"\\S+") grabs the part of a
    URL after its scheme.  Stdlib re rejects variable-length lookbehinds;
    the regex module supports them.
    """
    lookbehind = rf"(?<={prefix_pattern}){word_pattern}"
    return regex.compile(lookbehind).findall(text)
# ─────────────────────────────────────────────────────────────────────────────
# 5. Practical extraction patterns
# ─────────────────────────────────────────────────────────────────────────────
# Compiled for reuse
# Pragmatic (not RFC-5322-complete) email matcher; case-insensitive.
_EMAIL_RE = regex.compile(r"[\w.+-]+@[\w-]+\.[\w.-]+", regex.IGNORECASE)
# http/https URLs: scheme plus a run of URL-legal characters.
_URL_RE = regex.compile(r"https?://[\w\-._~:/?#\[\]@!$&'()*+,;=%]+", regex.IGNORECASE)
# Hashtag/mention: marker, then a letter, then letters/digits/underscores.
# \p{Letter} makes these work in any script (#北京, @Москва).
_HASHTAG = regex.compile(r"#\p{Letter}[\p{Letter}\p{Number}_]*")
_MENTION = regex.compile(r"@\p{Letter}[\p{Letter}\p{Number}_]*")
# Loose international phone matcher: optional "+", then 7-20 phone-ish
# characters, ending in a digit.
# NOTE(review): unused in this module — no extract_phones() wrapper exists.
_PHONE_RE = regex.compile(r"\+?[\d\s\-\(\)]{7,20}\d")
def extract_emails(text: str) -> list[str]:
    """Return all email-like substrings found in *text*."""
    return [m.group() for m in _EMAIL_RE.finditer(text)]
def extract_urls(text: str) -> list[str]:
    """Return all http/https URLs found in *text*."""
    return [m.group() for m in _URL_RE.finditer(text)]
def extract_hashtags(text: str) -> list[str]:
    """Return hashtags in any Unicode script: #python #北京 #Москва."""
    return [m.group() for m in _HASHTAG.finditer(text)]
def extract_mentions(text: str) -> list[str]:
    """Return @-mentions, allowing letters from any Unicode script."""
    return [m.group() for m in _MENTION.finditer(text)]
def remove_emoji(text: str) -> str:
    """
    Delete emoji and similar pictographic characters from *text*.

    Strips code points in the So (symbol, other), Cs (surrogate), and
    Co (private use) general categories.
    """
    emoji_like = r"\p{So}|\p{Cs}|\p{Co}"
    return regex.sub(emoji_like, "", text)
def keep_only_letters_digits(text: str) -> str:
    """
    Keep only Unicode letters, decimal digits, and whitespace; everything
    else (punctuation, symbols, ...) is removed.
    """
    disallowed = r"[^\p{Letter}\p{Decimal_Digit_Number}\s]"
    return regex.sub(disallowed, "", text)
# ─────────────────────────────────────────────────────────────────────────────
# 6. Pandas integration
# ─────────────────────────────────────────────────────────────────────────────
def extract_column(df, column: str, pattern: str, new_column: str | None = None, group: int = 0):
    """
    Add a column to *df* holding the first regex match found in *column*.

    Args:
        df: pandas DataFrame — mutated in place and also returned.
        column: Name of the source column; cells are coerced via str().
        pattern: Regex pattern to search each cell with.
        new_column: Destination column name; defaults to "<column>_extracted".
        group: 0 for the whole match, 1+ for a capture group.

    Returns:
        The same DataFrame, with the new column added (None where no match).
    """
    target = f"{column}_extracted" if new_column is None else new_column
    compiled = regex.compile(pattern)

    def _first_match(cell):
        # str() coercion lets non-string cells (numbers, NaN) be searched.
        found = compiled.search(str(cell))
        return found.group(group) if found is not None else None

    df[target] = df[column].apply(_first_match)
    return df
def filter_rows_matching(df, column: str, pattern: str, fuzzy_errors: int = 0):
    """
    Select the rows of *df* whose *column* matches *pattern*.

    With fuzzy_errors > 0 the pattern is wrapped in the regex module's
    {e<=N} fuzzy quantifier so near-matches also pass.  The pattern is used
    as-is (not escaped), so regex metacharacters remain active.
    """
    if fuzzy_errors > 0:
        compiled = regex.compile(rf"(?:{pattern}){{e<={fuzzy_errors}}}")
    else:
        compiled = regex.compile(pattern)
    keep = df[column].apply(lambda cell: compiled.search(str(cell)) is not None)
    return df[keep]
# ─────────────────────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Smoke-test demo: exercises each section of the module and prints results.
    print("=== Unicode property matching ===")
    multilingual = "Hello 世界 Привет мир مرحبا café"
    print(f" extract_words: {extract_words(multilingual)}")
    print(f" extract_cjk: {extract_cjk(multilingual)}")
    print(f" split_by_script: {split_by_script(multilingual)}")

    print("\n=== Unicode properties ===")
    property_cases = [
        (r"\p{Letter}+", "Unicode letters", "café München 北京"),
        (r"\p{Script=Latin}+", "Latin script", "café München"),
        (r"\p{Uppercase_Letter}", "Uppercase", "Hello World ABC"),
        (r"\p{Decimal_Digit_Number}+", "Digits", "abc 123 ۱۲۳"),
    ]
    for prop_pat, label, sample in property_cases:
        matches = regex.findall(prop_pat, sample)
        print(f" {label:20}: {matches}")

    print("\n=== Fuzzy matching ===")
    typo_text = "The woord Python is programing langgauge."
    for target in ["word", "Python", "programming", "language"]:
        hits = fuzzy_search(target, typo_text, max_errors=1)
        if hits:
            first = hits[0]
            print(f" {target!r:15} → matched {first['match']!r:15} "
                  f"(s={first['substitutions']},i={first['insertions']},d={first['deletions']})")

    print("\n=== find_typos ===")
    british_spellings = ["colour", "behaviour", "analyse"]
    american_text = "Use colors and behaviors in your analysis"
    for spelling in british_spellings:
        variants = find_typos(spelling, american_text, max_errors=2)
        print(f" {spelling!r:15} → variants: {variants}")

    print("\n=== Overlapping ===")
    overlap_cases = [
        (r"\d\d", "12345"),
        (r"aa", "aaaa"),
        (r"[aeiou]", "beautiful"),
    ]
    for ov_pat, ov_text in overlap_cases:
        overlapping = find_overlapping(ov_pat, ov_text)
        print(f" {ov_pat!r:15} in {ov_text!r:15} → {overlapping}")

    print("\n=== Extraction ===")
    sample_text = "Email me at [email protected] or [email protected]. See https://example.com."
    print(f" emails: {extract_emails(sample_text)}")
    print(f" urls: {extract_urls(sample_text)}")
    social = "Hey @Alice and #python fans — check #北京 updates from @Москва"
    print(f" hashtags: {extract_hashtags(social)}")
    print(f" mentions: {extract_mentions(social)}")

    print("\n=== Strip diacritics ===")
    for accented in ["café", "naïve", "München", "Ångström", "señor"]:
        print(f" {accented!r:15} → {strip_diacritics(accented)!r}")

    print("\n=== remove_emoji ===")
    emoji_text = "Hello 👋 World 🌍 Python 🐍"
    print(f" {emoji_text!r} → {remove_emoji(emoji_text)!r}")
For the stdlib re alternative — Python’s re module doesn’t support Unicode property escapes (\p{Letter}), fuzzy matching, overlapping matches, or variable-length lookbehind; regex is exactly a superset — it passes all re tests and adds these features. The import regex as re idiom is a safe drop-in once you have the package installed. For the pyparsing alternative — pyparsing constructs grammars from composable Python objects and is better for structured parsing of domain-specific languages where whitespace and precedence rules matter; regex is better for pattern-based text extraction and transformation where a single expression can express what you need — they’re complementary, with pyparsing handling grammar-level parsing and regex handling extraction and search within text. The Claude Skills 360 bundle includes regex skill sets covering \p{Letter}/\p{Script=Latin}/\p{Script=Han} Unicode properties, extract_words()/extract_cjk()/split_by_script() multilingual extraction, strip_diacritics() with NFD + \p{Mark} removal, normalize_whitespace() Unicode whitespace collapse, fuzzy_search() and fuzzy_match() with error budgets, find_typos() variant finder, find_overlapping() and find_overlapping_spans(), variable-length lookbehind, extract_emails()/extract_urls()/extract_hashtags()/extract_mentions(), remove_emoji() \p{So} cleanup, keep_only_letters_digits() filter, and pandas extract_column()/filter_rows_matching(). Start with the free tier to try advanced Unicode regex code generation.