rapidfuzz is a fast fuzzy string matching library replacing fuzzywuzzy. pip install rapidfuzz. Ratio: from rapidfuzz import fuzz; fuzz.ratio("hello world", "hello word") → 0–100. Partial: fuzz.partial_ratio("hello", "hello world") → checks if shorter string matches a substring. Token sort: fuzz.token_sort_ratio("john doe", "doe john") — sorts tokens before comparing. Token set: fuzz.token_set_ratio("the quick brown fox", "quick fox") — set intersection. WRatio: fuzz.WRatio(a, b) — weighted combination choosing best scorer. QRatio: fuzz.QRatio(a,b) — QuickRatio. Best match: from rapidfuzz import process; process.extractOne("query", choices) → (match, score, key). Top N: process.extract("query", choices, limit=5). Threshold: process.extractOne(q, choices, score_cutoff=80). CDist: from rapidfuzz.process import cdist; matrix = cdist(queries, choices). Levenshtein distance: from rapidfuzz.distance import Levenshtein; Levenshtein.distance("kitten","sitting"). Edit ops: Levenshtein.editops("abc","axc"). Jaro-Winkler: from rapidfuzz.distance import JaroWinkler; JaroWinkler.similarity("john","joan"). DamerauLevenshtein: handles transpositions. score_cutoff: short-circuit below threshold — much faster. processor: process.extractOne(q, choices, processor=str.lower). scorer=fuzz.token_sort_ratio. partial_scorer=fuzz.partial_ratio. Claude Code generates rapidfuzz deduplicators, name matchers, search suggestions, and address normalizers.
CLAUDE.md for rapidfuzz
## rapidfuzz Stack
- Version: rapidfuzz >= 3.0 | pip install rapidfuzz
- Ratio: fuzz.ratio(a, b) | fuzz.partial_ratio | fuzz.token_sort_ratio | fuzz.WRatio
- Best match: process.extractOne(query, choices, scorer=fuzz.WRatio, score_cutoff=80)
- Many matches: process.extract(query, choices, limit=5)
- Distance: Levenshtein.distance(a, b) | JaroWinkler.similarity(a, b)
- Bulk: cdist(queries, choices) → similarity matrix
rapidfuzz Fuzzy Matching Pipeline
# app/fuzzy.py — rapidfuzz ratio, process, dedup, name normalize, record linkage
from __future__ import annotations
import re
import unicodedata
from collections import defaultdict
from typing import Any
from rapidfuzz import fuzz, process
from rapidfuzz.distance import DamerauLevenshtein, JaroWinkler, Levenshtein
from rapidfuzz.process import cdist
# ─────────────────────────────────────────────────────────────────────────────
# 1. Single-pair scoring helpers
# ─────────────────────────────────────────────────────────────────────────────
def similarity(
    a: str,
    b: str,
    scorer: str = "wratio",
) -> float:
    """
    Compute a 0–100 similarity score between two strings.

    Args:
        a: First string.
        b: Second string.
        scorer: One of "ratio" | "partial" | "token_sort" | "token_set" |
            "wratio" | "qratio" | "jaro" | "jaro_winkler".
            Unknown names fall back to WRatio.

    Returns:
        Similarity score in the range [0, 100].

    Example:
        similarity("New York", "New York City")                  # ~84 (partial)
        similarity("john doe", "doe john", scorer="token_sort")  # 100
        similarity("café", "cafe", scorer="ratio")               # ~89
    """
    scorers = {
        "ratio": fuzz.ratio,
        "partial": fuzz.partial_ratio,
        "token_sort": fuzz.token_sort_ratio,
        "token_set": fuzz.token_set_ratio,
        "wratio": fuzz.WRatio,
        "qratio": fuzz.QRatio,
        # Bug fix: "jaro" previously used JaroWinkler.similarity with the
        # default prefix weight, making it identical to "jaro_winkler".
        # With prefix_weight=0 the Winkler prefix bonus vanishes and the
        # score reduces to plain Jaro similarity.
        "jaro": lambda a, b: JaroWinkler.similarity(a, b, prefix_weight=0.0) * 100,
        # Jaro-Winkler: boosts the score for strings sharing a common prefix.
        "jaro_winkler": lambda a, b: JaroWinkler.similarity(a, b, prefix_weight=0.1) * 100,
    }
    fn = scorers.get(scorer, fuzz.WRatio)
    return fn(a, b)
def edit_distance(a: str, b: str, method: str = "levenshtein") -> int:
    """
    Number of single-character edits required to turn one string into the other.

    method: "levenshtein" (insert/delete/substitute) or "damerau"
        (additionally counts adjacent transpositions as one edit).
        Any other value falls back to Levenshtein.

    Example:
        edit_distance("kitten", "sitting")            # 3
        edit_distance("ca", "abc", method="damerau")  # 2
    """
    metric = DamerauLevenshtein if method == "damerau" else Levenshtein
    return metric.distance(a, b)
def is_match(a: str, b: str, threshold: float = 80.0, scorer: str = "wratio") -> bool:
    """True when the chosen scorer rates a and b at or above threshold."""
    score = similarity(a, b, scorer)
    return score >= threshold
# ─────────────────────────────────────────────────────────────────────────────
# 2. Best-match lookup
# ─────────────────────────────────────────────────────────────────────────────
def best_match(
    query: str,
    choices: list[str] | dict,
    threshold: float = 0.0,
    scorer: str = "wratio",
    processor=str.lower,
) -> tuple[str, float, Any] | None:
    """
    Pick the single closest string out of choices.

    Returns a (match, score, key) tuple — key is the list index or dict
    key of the winning choice — or None when nothing scores at or above
    threshold.

    choices: list of strings or dict {key: string}.

    Example:
        best_match("Nw York", ["New York", "Los Angeles", "Chicago"])
        # ("New York", 91.0, 0)

        catalog = {"nyc": "New York City", "la": "Los Angeles"}
        best_match("new york", catalog)
        # ("New York City", 95.0, "nyc")
    """
    named_scorers = {
        "ratio": fuzz.ratio,
        "partial": fuzz.partial_ratio,
        "token_sort": fuzz.token_sort_ratio,
        "token_set": fuzz.token_set_ratio,
        "wratio": fuzz.WRatio,
    }
    # extractOne already returns (match, score, key) or None — pass through.
    return process.extractOne(
        query,
        choices,
        scorer=named_scorers.get(scorer, fuzz.WRatio),
        score_cutoff=threshold,
        processor=processor,
    )
def top_matches(
    query: str,
    choices: list[str] | dict,
    n: int = 5,
    threshold: float = 0.0,
    scorer: str = "wratio",
    processor=str.lower,
) -> list[tuple[str, float, Any]]:
    """
    Return up to n matches scoring at or above threshold, best first.

    Each result is a (match, score, key) tuple, where key is the list
    index or dict key of the matched choice.

    Example:
        top_matches("pythón", ["Python", "Cython", "PHP", "Julia"], n=3)
        # [("Python", 95, 0), ("Cython", 91, 1), ...]
    """
    by_name = {
        "ratio": fuzz.ratio,
        "partial": fuzz.partial_ratio,
        "token_sort": fuzz.token_sort_ratio,
        "token_set": fuzz.token_set_ratio,
        "wratio": fuzz.WRatio,
    }
    return process.extract(
        query,
        choices,
        scorer=by_name.get(scorer, fuzz.WRatio),
        limit=n,
        score_cutoff=threshold,
        processor=processor,
    )
# ─────────────────────────────────────────────────────────────────────────────
# 3. Deduplication
# ─────────────────────────────────────────────────────────────────────────────
def deduplicate(
    strings: list[str],
    threshold: float = 90.0,
    scorer: str = "token_sort",
    keep: str = "first",
) -> list[str]:
    """
    Collapse near-duplicate strings, keeping one representative per group.

    Pairs scoring >= threshold are merged into the same group
    (transitively, via union-find), so duplicates chain together.

    keep: "first" keeps the earliest occurrence in each group;
          "longest" keeps the longest string in each group.

    Example:
        names = ["Apple Inc.", "apple inc", "Apple Inc", "Google LLC", "google"]
        deduplicate(names, threshold=85)
        # ["Apple Inc.", "Google LLC"]
    """
    score = {
        "ratio": fuzz.ratio,
        "partial": fuzz.partial_ratio,
        "token_sort": fuzz.token_sort_ratio,
        "token_set": fuzz.token_set_ratio,
        "wratio": fuzz.WRatio,
    }.get(scorer, fuzz.token_sort_ratio)

    count = len(strings)
    root = list(range(count))

    def find(i: int) -> int:
        # Iterative root lookup with path halving.
        while root[i] != i:
            root[i] = root[root[i]]
            i = root[i]
        return i

    # Merge every pair that clears the threshold (O(n^2) comparisons).
    for left in range(count):
        for right in range(left + 1, count):
            if score(strings[left], strings[right]) >= threshold:
                root[find(left)] = find(right)

    members: dict[int, list[int]] = defaultdict(list)
    for idx in range(count):
        members[find(idx)].append(idx)

    picks: list[int] = []
    for indices in members.values():
        if keep == "longest":
            picks.append(max(indices, key=lambda i: len(strings[i])))
        else:  # "first"
            picks.append(min(indices))
    # Preserve original ordering of the chosen representatives.
    return [strings[i] for i in sorted(picks)]
def cluster_strings(
    strings: list[str],
    threshold: float = 85.0,
    scorer: str = "token_sort",
) -> list[list[str]]:
    """
    Group strings into clusters of near-duplicates (greedy, single pass).

    Each not-yet-assigned string seeds a new cluster and absorbs every
    later string scoring >= threshold against the seed.

    Args:
        strings: Strings to cluster.
        threshold: Minimum 0–100 score to join a cluster.
        scorer: "ratio" | "partial" | "token_sort" | "token_set" | "wratio"
            (same names as the other helpers in this module; unknown names
            fall back to token_sort_ratio).

    Returns:
        List of groups; each group is a list of similar strings.

    Example:
        cluster_strings(["New York", "new york", "NYC", "Los Angeles", "la"])
        # [["New York", "new york"], ["NYC"], ["Los Angeles"], ["la"]]
    """
    # Bug fix: the previous getattr(fuzz, scorer) lookup only resolved
    # "ratio" and "wratio" — names like "partial" or "token_set" are not
    # fuzz attributes (the attributes are partial_ratio, token_set_ratio,
    # ...), so those choices silently fell back to token_sort_ratio.
    # Use the same explicit mapping as the rest of the module.
    scorer_fn = {
        "ratio": fuzz.ratio,
        "partial": fuzz.partial_ratio,
        "token_sort": fuzz.token_sort_ratio,
        "token_set": fuzz.token_set_ratio,
        "wratio": fuzz.WRatio,
    }.get(scorer, fuzz.token_sort_ratio)
    clusters: list[list[int]] = []
    assigned = [False] * len(strings)
    for i, s in enumerate(strings):
        if assigned[i]:
            continue
        group = [i]
        assigned[i] = True
        for j in range(i + 1, len(strings)):
            if not assigned[j] and scorer_fn(s, strings[j]) >= threshold:
                group.append(j)
                assigned[j] = True
        clusters.append(group)
    return [[strings[i] for i in group] for group in clusters]
# ─────────────────────────────────────────────────────────────────────────────
# 4. Name normalization + matching
# ─────────────────────────────────────────────────────────────────────────────
# Business/filler tokens dropped during name normalization.
_STOPWORDS = frozenset(
    {"inc", "llc", "ltd", "corp", "co", "company", "the", "a", "an"}
)
def normalize_name(name: str, remove_stopwords: bool = True) -> str:
    """
    Canonicalize a company/person name for fuzzy comparison.

    Steps: strip accents (NFKD decompose, drop combining marks),
    lowercase, turn punctuation into spaces, collapse runs of
    whitespace, and optionally drop business stopwords (inc, llc, ...).

    Example:
        normalize_name("Apple Inc.")      # "apple"
        normalize_name("Café de Paris")   # "cafe de paris"
    """
    decomposed = unicodedata.normalize("NFKD", name)
    unaccented = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    # Punctuation becomes whitespace; split() then collapses the runs.
    cleaned = re.sub(r"[^\w\s]", " ", unaccented.lower())
    tokens = cleaned.split()
    if remove_stopwords:
        tokens = [t for t in tokens if t not in _STOPWORDS]
    return " ".join(tokens)
def match_names(
    query: str,
    catalog: dict[Any, str],
    threshold: float = 75.0,
    scorer: str = "token_set",
    limit: int = 10,
) -> list[tuple[Any, str, float]]:
    """
    Match a query name against a catalog of {id: name} pairs.

    Both the query and the catalog values are run through
    normalize_name() before scoring, so case, punctuation, accents,
    and business stopwords do not affect the match.

    Args:
        query: Name to look up.
        catalog: Mapping of id -> display name.
        threshold: Minimum score for a result to be included.
        scorer: Scorer name, as accepted by top_matches().
        limit: Maximum number of results (generalized from the previous
            hard-coded 10; default preserves old behavior).

    Returns:
        List of (id, original_name, score) sorted by score descending.

    Example:
        catalog = {1: "Apple Inc.", 2: "Google LLC", 3: "Apple Store"}
        match_names("appel store", catalog, threshold=70)
        # [(3, "Apple Store", 88.0), (1, "Apple Inc.", 72.0)]
    """
    norm_query = normalize_name(query)
    norm_catalog = {k: normalize_name(v) for k, v in catalog.items()}
    # top_matches yields (normalized_name, score, key); map keys back to
    # the original (un-normalized) catalog names for the caller.
    results = top_matches(norm_query, norm_catalog, n=limit, threshold=threshold, scorer=scorer)
    return [(key, catalog[key], score) for _, score, key in results]
# ─────────────────────────────────────────────────────────────────────────────
# 5. Bulk pairwise scoring
# ─────────────────────────────────────────────────────────────────────────────
def similarity_matrix(
    strings_a: list[str],
    strings_b: list[str] | None = None,
    scorer: str = "wratio",
    workers: int = 1,
) -> list[list[float]]:
    """
    Pairwise 0–100 scores between two string lists (or a list vs itself).

    Row i / column j holds the score between strings_a[i] and the j-th
    target, where targets are strings_b, or strings_a again when
    strings_b is None. workers is passed straight through to cdist.

    Example:
        queries = ["apple", "google", "amazon"]
        targets = ["Apple Inc.", "Alphabet", "Amazon.com"]
        matrix = similarity_matrix(queries, targets, scorer="token_set")
    """
    chosen = {
        "ratio": fuzz.ratio,
        "partial": fuzz.partial_ratio,
        "token_sort": fuzz.token_sort_ratio,
        "token_set": fuzz.token_set_ratio,
        "wratio": fuzz.WRatio,
    }.get(scorer, fuzz.WRatio)
    targets = strings_a if strings_b is None else strings_b
    # cdist returns a numpy array; convert to plain nested lists.
    scores = cdist(strings_a, targets, scorer=chosen, workers=workers)
    return scores.tolist()
# ─────────────────────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Smoke-test demo exercising every helper in this module.
    print("=== Similarity scores ===")
    demo_pairs = [
        ("hello world", "hello word", "ratio"),
        ("new york city", "New York", "partial"),
        ("john doe", "doe john", "token_sort"),
        ("the quick brown fox", "quick fox", "token_set"),
        ("Python", "Pyhton", "wratio"),  # typo
    ]
    for left, right, method in demo_pairs:
        score = similarity(left, right, method)
        print(f" {method:12s}: {left!r:25s} vs {right!r:20s} → {score:.1f}")

    print("\n=== Edit distance ===")
    print(f" Levenshtein(kitten, sitting) = {edit_distance('kitten', 'sitting')}")
    print(f" Damerau(ca, abc) = {edit_distance('ca', 'abc', 'damerau')}")

    print("\n=== Best match ===")
    cities = ["New York", "Los Angeles", "Chicago", "Houston", "Phoenix"]
    hit = best_match("nw york", cities, threshold=60)
    print(f" 'nw york' → {hit}")

    print("\n=== Top matches ===")
    languages = ["Python", "Cython", "PHP", "Julia", "Rust"]
    for match, score, idx in top_matches("pthon", languages, n=3):
        print(f" {match:10s}: {score:.1f}")

    print("\n=== Deduplication ===")
    company_names = ["Apple Inc.", "apple inc", "Apple Inc", "Google LLC",
                     "google llc", "Google", "Microsoft Corp", "microsoft"]
    unique = deduplicate(company_names, threshold=85)
    print(f" Input: {len(company_names)} names → {len(unique)} unique: {unique}")

    print("\n=== Clustering ===")
    people = ["John Smith", "J. Smith", "john smith", "Jane Doe", "jane doe", "J Doe"]
    groups = cluster_strings(people, threshold=80)
    for i, group in enumerate(groups):
        print(f" Cluster {i+1}: {group}")

    print("\n=== Name normalization ===")
    for raw in ["Apple Inc.", "café de París", "The Google LLC", "Amazon.com"]:
        print(f" {raw:25s} → {normalize_name(raw)!r}")

    print("\n=== Similarity matrix ===")
    queries = ["apple", "google", "amazon"]
    targets = ["Apple Inc.", "Alphabet", "Amazon.com"]
    matrix = similarity_matrix(queries, targets, scorer="token_set")
    for row_label, row in zip(queries, matrix):
        print(f" {row_label:8s}: {[round(v,1) for v in row]}")
For the fuzzywuzzy / thefuzz alternative — thefuzz (formerly fuzzywuzzy) is the original Python fuzzy matching library using a similar string-ratio API; rapidfuzz is a modern C++-accelerated reimplementation that is 10–100× faster with an identical API (from rapidfuzz import fuzz, process is a drop-in for from thefuzz import fuzz, process) and no dependency on python-Levenshtein — use rapidfuzz for any new project. For the difflib (stdlib) alternative — Python’s built-in difflib.SequenceMatcher computes similarity ratios but is ~10× slower and lacks token-based scorers; rapidfuzz adds token sort/set ratios, partial substring matching, and the process module for batch extraction with score_cutoff short-circuiting — use rapidfuzz whenever you need fuzzy matching beyond a simple one-off comparison. The Claude Skills 360 bundle includes rapidfuzz skill sets covering similarity() with 8 scorer types, edit_distance() Levenshtein/Damerau, is_match() threshold check, best_match() with dict/list choices, top_matches() top-N extraction, deduplicate() union-find grouping, cluster_strings() near-dup clustering, normalize_name() accent/stopword stripping, match_names() catalog lookup, and similarity_matrix() cdist bulk scoring. Start with the free tier to try fuzzy string matching and deduplication code generation.