spaCy provides industrial-strength NLP with pretrained pipelines. pip install spacy, python -m spacy download en_core_web_sm. import spacy. Load: nlp = spacy.load("en_core_web_sm") — CPU. Transformer: "en_core_web_trf" (requires pip install spacy-transformers). Process: doc = nlp("Apple is headquartered in Cupertino."). Tokens: for token in doc: token.text, token.lemma_, token.pos_, token.dep_, token.is_stop. Entities: for ent in doc.ents: ent.text, ent.label_, ent.start_char, ent.end_char. Batch: docs = list(nlp.pipe(texts, batch_size=64)). Sentences: for sent in doc.sents: sent.text. Noun chunks: doc.noun_chunks. Matcher: from spacy.matcher import Matcher, matcher = Matcher(nlp.vocab), matcher.add("PATTERN", [[{"LOWER":"iphone"},{"IS_DIGIT":True}]]), matches = matcher(doc). PhraseMatcher: from spacy.matcher import PhraseMatcher, pm = PhraseMatcher(nlp.vocab), pm.add("BRAND", [nlp.make_doc("Apple Inc")]). EntityRuler: ruler = nlp.add_pipe("entity_ruler"), ruler.add_patterns([{"label":"ORG","pattern":"OpenAI"}]). Custom component: @Language.component("my_comp"), def my_comp(doc): ... return doc. displacy: from spacy import displacy, displacy.render(doc, style="ent") — HTML. Serialize: doc.to_disk("/path"), DocBin for batches. Training: spacy train config.cfg --output ./model. Claude Code generates spaCy NLP pipelines, NER trainers, rule-based extractors, text classifiers, and custom component scripts.
CLAUDE.md for spaCy
## spaCy Stack
- Version: spacy >= 3.7
- Models: en_core_web_sm | en_core_web_md | en_core_web_lg | en_core_web_trf
- Load: nlp = spacy.load("en_core_web_sm") | spacy.blank("en")
- Process: doc = nlp(text) | docs = list(nlp.pipe(texts, batch_size=64))
- Tokens: token.text | .lemma_ | .pos_ | .dep_ | .is_stop | .ent_type_
- Entities: doc.ents → (ent.text, ent.label_, ent.start_char, ent.end_char)
- Rules: Matcher(vocab) | PhraseMatcher | nlp.add_pipe("entity_ruler")
- Custom: @Language.component("name") → nlp.add_pipe("name")
- Train: spacy train config.cfg --output ./output
spaCy NLP Pipeline
# nlp/spacy_pipeline.py — industrial NLP with spaCy
from __future__ import annotations
import re
from collections import Counter, defaultdict
from pathlib import Path
from typing import Iterator
import spacy
from spacy import displacy
from spacy.language import Language
from spacy.matcher import Matcher, PhraseMatcher
from spacy.tokens import Doc, Span, Token
from spacy.training import Example
# ── 1. Model loading ──────────────────────────────────────────────────────────
def load_nlp(
    model: str = "en_core_web_sm",
    disable: list[str] | None = None,
    exclude: list[str] | None = None,
) -> Language:
    """
    Load a spaCy pipeline.

    Args:
        model: installed pipeline package name. English options:
            - en_core_web_sm — 12MB, CPU, fast (NER+POS+DEP)
            - en_core_web_md — 43MB, + word vectors
            - en_core_web_lg — 741MB, + larger word vectors
            - en_core_web_trf — transformer (requires spacy-transformers)
        disable: skip components but keep them in the pipeline (e.g., ["ner"]).
        exclude: remove components entirely (e.g., ["parser"] for faster NER-only).

    Returns:
        The loaded Language pipeline (prints version/model/pipes as a side effect).
    """
    # `or []` normalizes None to spaCy's expected empty list.
    nlp = spacy.load(model, disable=disable or [], exclude=exclude or [])
    print(f"spaCy {spacy.__version__} | model={model} | pipes={nlp.pipe_names}")
    return nlp
def create_blank(lang: str = "en") -> Language:
    """Return an empty pipeline for *lang*, useful when building from scratch."""
    blank_pipeline = spacy.blank(lang)
    return blank_pipeline
# ── 2. Text processing ────────────────────────────────────────────────────────
def process(nlp: Language, text: str) -> Doc:
    """Run the pipeline on one text and return the resulting Doc."""
    doc = nlp(text)
    return doc
def process_batch(
    nlp: Language,
    texts: list[str],
    batch_size: int = 64,
    n_process: int = 1,  # Parallelism (1 = no multiprocessing)
) -> list[Doc]:
    """
    Process many texts efficiently with nlp.pipe.
    n_process > 1 uses multiprocessing (not compatible with GPU).
    """
    docs: list[Doc] = []
    for doc in nlp.pipe(texts, batch_size=batch_size, n_process=n_process):
        docs.append(doc)
    return docs
# ── 3. Token-level extraction ─────────────────────────────────────────────────
def extract_tokens(doc: Doc, include_punct: bool = False, include_stop: bool = True) -> list[dict]:
    """Return per-token features (text, lemma, POS tags, dep, flags) as dicts."""

    def _keep(t: Token) -> bool:
        # Punctuation is dropped by default; stop words are kept by default.
        if t.is_punct and not include_punct:
            return False
        return include_stop or not t.is_stop

    return [
        {
            "text": t.text,
            "lemma": t.lemma_,
            "pos": t.pos_,    # Universal POS
            "tag": t.tag_,    # Fine-grained POS
            "dep": t.dep_,    # Dependency label
            "is_stop": t.is_stop,
            "is_alpha": t.is_alpha,
        }
        for t in doc
        if _keep(t)
    ]
def lemmatize(nlp: Language, text: str) -> list[str]:
    """Return lemmas of alphabetic tokens, dropping stop words and punctuation."""
    lemmas: list[str] = []
    for token in nlp(text):
        if token.is_stop or token.is_punct or not token.is_alpha:
            continue
        lemmas.append(token.lemma_)
    return lemmas
def get_noun_phrases(doc: Doc) -> list[str]:
    """Return the text of every base noun phrase (noun chunk) in *doc*."""
    phrases = []
    for chunk in doc.noun_chunks:
        phrases.append(chunk.text)
    return phrases
def get_word_frequencies(
    nlp: Language,
    texts: list[str],
    min_count: int = 2,
) -> dict[str, int]:
    """Count lowercased lemma frequencies across a corpus (excluding stop words)."""
    # Tokens must be alphabetic, non-stop, and longer than 2 characters.
    counter: Counter = Counter(
        tok.lemma_.lower()
        for doc in nlp.pipe(texts, batch_size=64)
        for tok in doc
        if tok.is_alpha and not tok.is_stop and len(tok.text) > 2
    )
    return {word: count for word, count in counter.most_common() if count >= min_count}
# ── 4. Named entity recognition ───────────────────────────────────────────────
def extract_entities(doc: Doc) -> list[dict]:
    """Return each named entity with its label plus character and token offsets."""
    results = []
    for ent in doc.ents:
        results.append({
            "text": ent.text,
            "label": ent.label_,
            "start_char": ent.start_char,
            "end_char": ent.end_char,
            "start_tok": ent.start,
            "end_tok": ent.end,
        })
    return results
def extract_entities_by_type(doc: Doc, entity_type: str) -> list[str]:
    """Return surface text of entities whose label equals *entity_type* (e.g., 'ORG')."""
    matched = []
    for ent in doc.ents:
        if ent.label_ != entity_type:
            continue
        matched.append(ent.text)
    return matched
def entity_counts(
    nlp: Language,
    texts: list[str],
    types: list[str] | None = None,
) -> dict[str, Counter]:
    """
    Count entity occurrences across a corpus.

    Args:
        nlp: pipeline with an entity recognizer.
        texts: corpus to scan.
        types: optional whitelist of entity labels (e.g., ["ORG"]);
            None counts every label.

    Returns:
        Mapping of entity label -> Counter of entity surface forms.
    """
    counts: dict[str, Counter] = defaultdict(Counter)
    for doc in nlp.pipe(texts, batch_size=64):
        for ent in doc.ents:
            if types is None or ent.label_ in types:
                counts[ent.label_][ent.text] += 1
    # Plain dict so callers don't get defaultdict's auto-insert behavior.
    return dict(counts)
# ── 5. Rule-based matching ────────────────────────────────────────────────────
def build_token_matcher(
    nlp: Language,
    patterns: dict[str, list[list[dict]]],
) -> Matcher:
    """
    Build a token-pattern Matcher.
    patterns: {"LABEL": [[{token_attr: value}, ...]]}
    Token attributes: LOWER, TEXT, LEMMA, POS, TAG, DEP, IS_DIGIT, IS_ALPHA, ORTH
    """
    token_matcher = Matcher(nlp.vocab)
    for rule_name, rule_patterns in patterns.items():
        token_matcher.add(rule_name, rule_patterns)
    return token_matcher
def build_phrase_matcher(
    nlp: Language,
    phrases: dict[str, list[str]],
    attr: str = "LOWER",  # "LOWER" | "TEXT" | "LEMMA"
) -> PhraseMatcher:
    """
    Build a PhraseMatcher for bulk substring matching.
    phrases: {"LABEL": ["phrase1", "phrase2", ...]}
    """
    phrase_matcher = PhraseMatcher(nlp.vocab, attr=attr)
    for rule_name, terms in phrases.items():
        # make_doc tokenizes without running the full pipeline — cheap and sufficient.
        phrase_matcher.add(rule_name, [nlp.make_doc(term) for term in terms])
    return phrase_matcher
def match_text(
    doc: Doc,
    matcher: Matcher | PhraseMatcher,
) -> list[dict]:
    """Run *matcher* on *doc* and return one dict per hit (label + span offsets)."""
    hits = []
    for match_id, start, end in matcher(doc):
        span = doc[start:end]
        hits.append({
            # match_id is a hash; resolve it back to the label string.
            "label": doc.vocab.strings[match_id],
            "text": span.text,
            "start_char": span.start_char,
            "end_char": span.end_char,
        })
    return hits
# ── 6. Entity ruler (gazetteer) ───────────────────────────────────────────────
def add_entity_ruler(
    nlp: Language,
    patterns: list[dict],
    before: str = "ner",
    overwrite: bool = False,
) -> Language:
    """
    Add an EntityRuler for dictionary-based NER.
    patterns: [{"label": "ORG", "pattern": "OpenAI"}, {"label": "PRODUCT", "pattern": [{"LOWER": "iphone"}, {"IS_DIGIT": True}]}]
    The ruler runs before the statistical NER by default.
    """
    ruler = nlp.add_pipe(
        "entity_ruler",
        config={"overwrite_ents": overwrite},
        before=before,
    )
    ruler.add_patterns(patterns)
    return nlp
# ── 7. Custom pipeline component ─────────────────────────────────────────────
def add_sentence_stats_component(nlp: Language) -> Language:
    """
    Example custom component: adds sentence stats as Doc extension attributes.

    Registers three custom Doc extensions and appends a "sentence_stats" pipe
    that fills them on every processed Doc:
      - doc._.n_sentences    — number of sentences
      - doc._.avg_sent_len   — mean sentence length in tokens
      - doc._.entity_density — entities per token

    NOTE(review): @Language.component registers the "sentence_stats" factory
    at call time — spaCy may raise if this function runs twice in the same
    process; confirm before calling it on multiple pipelines.
    """
    # Guard each extension so repeated registration doesn't raise.
    if not Doc.has_extension("n_sentences"):
        Doc.set_extension("n_sentences", default=0)
    if not Doc.has_extension("avg_sent_len"):
        Doc.set_extension("avg_sent_len", default=0.0)
    if not Doc.has_extension("entity_density"):
        Doc.set_extension("entity_density", default=0.0)

    @Language.component("sentence_stats")
    def sentence_stats(doc: Doc) -> Doc:
        # doc.sents requires an upstream sentence-segmenting component
        # (parser or senter) — assumed present; TODO confirm for blank pipelines.
        sents = list(doc.sents)
        n_sents = len(sents)
        # max(..., 1) guards against division by zero on empty docs.
        avg_len = sum(len(s) for s in sents) / max(n_sents, 1)
        ent_density = len(doc.ents) / max(len(doc), 1)
        doc._.n_sentences = n_sents
        doc._.avg_sent_len = round(avg_len, 2)
        doc._.entity_density = round(ent_density, 4)
        return doc

    # Append only once; pipe_names check keeps this idempotent per pipeline.
    if "sentence_stats" not in nlp.pipe_names:
        nlp.add_pipe("sentence_stats", last=True)
    return nlp
# ── 8. Training data preparation ─────────────────────────────────────────────
def make_ner_examples(
    nlp: Language,
    data: list[tuple[str, dict]],
) -> list[Example]:
    """
    Convert (text, {"entities": [(start, end, label)]}) training data to Examples.
    Skips misaligned annotations automatically using alignment_mode="contract".
    """
    examples: list[Example] = []
    for text, annotations in data:
        doc = nlp.make_doc(text)
        spans = []
        for start, end, label in annotations.get("entities", []):
            # char_span returns None when the offsets don't align to token
            # boundaries; "contract" shrinks the span toward the nearest tokens.
            candidate = doc.char_span(start, end, label=label, alignment_mode="contract")
            if candidate is not None:
                spans.append(candidate)
        doc.set_ents(spans)
        gold = {"entities": [(s.start_char, s.end_char, s.label_) for s in spans]}
        examples.append(Example.from_dict(doc, gold))
    return examples
def train_ner(
    nlp: Language,
    examples: list[Example],
    n_iter: int = 30,
    drop: float = 0.2,
    save_path: str = "./ner-model",
) -> Language:
    """
    Train NER component from scratch on provided examples.
    nlp should be a blank model with NER added: nlp.add_pipe("ner").

    Args:
        nlp: pipeline containing a "ner" pipe.
        examples: gold-standard Examples (see make_ner_examples).
        n_iter: full passes over `examples`.
        drop: dropout rate passed to nlp.update.
        save_path: directory the trained pipeline is written to.

    Returns:
        The same `nlp`, trained in place and saved to `save_path`.
    """
    ner = nlp.get_pipe("ner")
    # Add labels
    # Every label present in the gold entities must be registered before training.
    for ex in examples:
        for ent in ex.reference.ents:
            ner.add_label(ent.label_)
    # Disable every other pipe so only NER weights are updated.
    # NOTE(review): disable_pipes is the spaCy v2-era name; v3 documents
    # nlp.select_pipes(disable=...) — confirm against the pinned spaCy version.
    other_pipes = [p for p in nlp.pipe_names if p != "ner"]
    with nlp.disable_pipes(*other_pipes):
        optimizer = nlp.initialize()
        for i in range(n_iter):
            losses: dict = {}
            # Whole dataset in one update per iteration — no shuffling or
            # minibatching here; fine for small demo datasets.
            nlp.update(examples, drop=drop, sgd=optimizer, losses=losses)
            if (i + 1) % 10 == 0:
                print(f" iter {i+1}/{n_iter} | NER loss: {losses.get('ner', 0):.3f}")
    nlp.to_disk(save_path)
    print(f"NER model saved: {save_path}")
    return nlp
# ── 9. Visualization ──────────────────────────────────────────────────────────
def visualize_entities(
    doc: Doc,
    style: str = "ent",
    return_html: bool = True,
) -> str | None:
    """Render entity ("ent") or dependency ("dep") visualization as an HTML string."""
    if not return_html:
        return None
    # jupyter=False makes displacy return markup instead of displaying inline.
    return displacy.render(doc, style=style, jupyter=False)
# ── Demo ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    nlp = load_nlp("en_core_web_sm")
    sample_texts = [
        "Apple Inc. was founded by Steve Jobs in Cupertino, California.",
        "Elon Musk's Tesla and SpaceX are headquartered in Austin, Texas.",
        "The European Union imposed a €1.49 billion fine on Google in 2023.",
    ]

    # Statistical NER over the sample corpus.
    print("Named Entity Recognition:")
    for sample in sample_texts:
        analyzed = nlp(sample)
        all_ents = extract_entities(analyzed)
        org_names = extract_entities_by_type(analyzed, "ORG")
        person_names = extract_entities_by_type(analyzed, "PERSON")
        print(f"\n '{sample[:60]}...'")
        print(f" ORG: {org_names}")
        print(f" PERSON: {person_names}")

    # Base noun phrases and lemmas from the first sample.
    print("\nNoun phrases:")
    first_doc = nlp(sample_texts[0])
    print(f" {get_noun_phrases(first_doc)}")

    print("\nLemmatized tokens:")
    print(f" {lemmatize(nlp, sample_texts[0])}")

    # Rule-based token matching.
    print("\nToken patterns — iPhone model numbers:")
    phone_matcher = build_token_matcher(nlp, {
        "IPHONE_MODEL": [[{"LOWER": "iphone"}, {"IS_DIGIT": True}]],
    })
    phone_doc = nlp("I have an iPhone 15 and my friend has an iPhone 14 Pro.")
    phone_hits = match_text(phone_doc, phone_matcher)
    print(f" {phone_hits}")

    # Gazetteer-style NER via the entity ruler (fresh pipeline to keep nlp clean).
    print("\nEntity ruler example:")
    nlp2 = load_nlp("en_core_web_sm")
    add_entity_ruler(nlp2, [
        {"label": "AI_COMPANY", "pattern": "OpenAI"},
        {"label": "AI_COMPANY", "pattern": "Anthropic"},
        {"label": "AI_MODEL", "pattern": "Claude"},
    ])
    ruled_doc = nlp2("Anthropic released Claude, which competes with OpenAI's ChatGPT.")
    print(f" {[(e.text, e.label_) for e in ruled_doc.ents]}")
Consider the NLTK alternative when performing classical NLP tasks such as n-gram language models, WordNet lookups, or corpus-driven linguistics research — NLTK’s breadth of corpora and algorithms is unmatched for research, while spaCy’s industrial design, with pre-built pipelines, fast nlp.pipe batch processing, and a Cython-optimized tokenizer, runs 10-100x faster on production text volumes, making it the clear choice for building APIs and microservices that process user-submitted text at scale. Consider the Hugging Face Transformers alternative when you need state-of-the-art transformer NER, zero-shot classification, or question answering with the full BERT/RoBERTa ecosystem — transformers provides SOTA accuracy, while spaCy’s en_core_web_trf pipeline wraps RoBERTa for transformer-quality NER behind spaCy’s unified Doc API, and the EntityRuler + Matcher combination for rule-based pattern extraction has no equivalent in transformers, making spaCy the right choice when you need both statistical and symbolic NLP in one pipeline. The Claude Skills 360 bundle includes spaCy skill sets covering model loading, batch processing, token and entity extraction, noun chunk and lemma analysis, Matcher and PhraseMatcher rule patterns, EntityRuler gazetteers, custom pipeline components, NER training, and displacy visualization. Start with the free tier to try NLP pipeline code generation.