Gensim handles large-scale topic modeling and word embeddings. pip install gensim. from gensim import corpora, models, similarities. Dictionary: dictionary = corpora.Dictionary(tokenized_docs), dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=50000). BoW corpus: corpus = [dictionary.doc2bow(tokens) for tokens in tokenized_docs]. LDA: lda = models.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=10, alpha="auto", eta="auto"). Topics: lda.print_topics(num_words=10). Inference: lda[new_bow] — returns [(topic_id, prob)]. Coherence: from gensim.models import CoherenceModel, cm = CoherenceModel(model=lda, texts=texts, dictionary=dictionary, coherence="c_v"), cm.get_coherence(). LdaMulticore: models.LdaMulticore(corpus, num_topics=20, workers=4). Word2Vec: from gensim.models import Word2Vec, w2v = Word2Vec(sentences, vector_size=300, window=5, min_count=5, sg=1) — sg=1 skip-gram, sg=0 CBOW. Vectors: w2v.wv["king"], w2v.wv.most_similar("king", topn=10), w2v.wv.doesnt_match(["king","queen","man","banana"]). Arithmetic: w2v.wv.most_similar(positive=["king","woman"], negative=["man"]). FastText: from gensim.models import FastText, ft = FastText(sentences, vector_size=100, window=5) — handles OOV via subwords. Doc2Vec: from gensim.models.doc2vec import Doc2Vec, TaggedDocument, docs = [TaggedDocument(tokens, [i]) for i, tokens in enumerate(corpus)], d2v = Doc2Vec(docs, vector_size=100, epochs=40). TF-IDF: tfidf = models.TfidfModel(corpus), tfidf_corpus = tfidf[corpus]. Similarity index: index = similarities.MatrixSimilarity(tfidf_corpus), sims = index[tfidf[query_bow]]. Claude Code generates LDA topic pipelines, Word2Vec trainers, document similarity engines, and topic coherence optimizers.
CLAUDE.md for Gensim
## Gensim Stack
- Version: gensim >= 4.3
- Dictionary: corpora.Dictionary(tokenized_docs) → filter_extremes(no_below, no_above)
- BoW: [dictionary.doc2bow(tokens) for tokens in docs]
- LDA: LdaModel(corpus, num_topics, id2word=dictionary, passes=10, alpha="auto")
- Coherence: CoherenceModel(model, texts, dictionary, coherence="c_v").get_coherence()
- Word2Vec: Word2Vec(sentences, vector_size=300, window=5, min_count=5, sg=1)
- Vectors: wv["word"] | wv.most_similar("word") | wv.doesnt_match([...])
- FastText: FastText(sentences, ...) — handles OOV via character n-grams
- Doc2Vec: Doc2Vec([TaggedDocument(tokens, [id])], vector_size=100, epochs=40)
Gensim Topic Modeling and Embeddings
# nlp/gensim_pipeline.py — topic modeling and word embeddings with Gensim
from __future__ import annotations
import re
import numpy as np
from pathlib import Path
from gensim import corpora, models, similarities
from gensim.models import (
LdaModel, LdaMulticore, Word2Vec, FastText,
CoherenceModel, TfidfModel,
)
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
# ── 1. Text preprocessing ─────────────────────────────────────────────────────
def simple_tokenize(text: str, min_len: int = 3) -> list[str]:
    """Lowercase *text* and return alphabetic tokens of at least min_len chars.

    A deliberately simple regex tokenizer — swap in spaCy/NLTK for production.
    """
    pattern = rf"\b[a-z]{{{min_len},}}\b"
    return re.findall(pattern, text.lower())
# Minimal English function-word list — extend (or swap for NLTK's list) as needed.
STOPWORDS = set(
    """
    the a an is it in on at to for of and or
    but not with this that are was were has have had
    be been being do does did will would could should
    may might can from by as i we you he she they
    """.split()
)


def remove_stopwords(tokens: list[str]) -> list[str]:
    """Return *tokens* with every STOPWORDS entry filtered out (order kept)."""
    return [tok for tok in tokens if tok not in STOPWORDS]
def preprocess_corpus(
    texts: list[str],
    min_len: int = 3,
    rm_stopwords: bool = True,
) -> list[list[str]]:
    """Tokenize each raw text; optionally drop stopwords from every document."""
    docs = []
    for raw in texts:
        tokens = simple_tokenize(raw, min_len)
        if rm_stopwords:
            tokens = remove_stopwords(tokens)
        docs.append(tokens)
    return docs
# ── 2. Dictionary and corpus ──────────────────────────────────────────────────
def build_dictionary(
    tokenized_docs: list[list[str]],
    no_below: int = 5,          # min document frequency
    no_above: float = 0.5,      # max document fraction
    keep_n: int = 50_000,
) -> corpora.Dictionary:
    """Build the token→id vocabulary and prune extreme-frequency terms.

    no_below: drop tokens appearing in fewer than N documents (too rare).
    no_above: drop tokens appearing in more than this fraction of documents
        (too common to discriminate).
    keep_n: cap the vocabulary at the N most frequent surviving tokens.
    """
    vocab = corpora.Dictionary(tokenized_docs)
    print(f"Before filtering: {len(vocab)} unique tokens")
    vocab.filter_extremes(no_below=no_below, no_above=no_above, keep_n=keep_n)
    print(f"After filtering: {len(vocab)} unique tokens")
    return vocab
def to_bow_corpus(
    dictionary: corpora.Dictionary,
    tokenized_docs: list[list[str]],
) -> list[list[tuple[int, int]]]:
    """Map each tokenized document to sparse (token_id, count) BoW pairs."""
    return list(map(dictionary.doc2bow, tokenized_docs))
def to_tfidf_corpus(
    bow_corpus: list,
    smartirs: str = "ntc",  # SMART weighting scheme code
) -> tuple[TfidfModel, list]:
    """Fit a TF-IDF model on a BoW corpus; return (model, weighted corpus)."""
    model = TfidfModel(bow_corpus, smartirs=smartirs)
    weighted = [model[doc] for doc in bow_corpus]
    return model, weighted
# ── 3. LDA topic modeling ─────────────────────────────────────────────────────
def train_lda(
    corpus: list,
    dictionary: corpora.Dictionary,
    num_topics: int = 10,
    passes: int = 15,
    alpha: str | float | list = "auto",  # "auto" | "symmetric" | "asymmetric" | float | list
    eta: str | float = "auto",
    random_state: int = 42,
    workers: int = 1,                    # >1 switches to LdaMulticore
) -> LdaModel:
    """Train an LDA topic model.

    alpha="auto" learns asymmetric document-topic priors from the data
    (usually better) but is only supported by the single-core LdaModel.
    With workers > 1 (LdaMulticore) an "auto" alpha falls back to the
    library default; any other alpha value (float, list, "symmetric",
    "asymmetric") is passed through, since LdaMulticore supports fixed
    priors. (The previous version dropped *every* user-supplied alpha in
    multicore mode, silently ignoring valid fixed priors.)

    Returns the trained model; LdaMulticore is a subclass of LdaModel.
    """
    use_multicore = workers > 1
    cls = LdaMulticore if use_multicore else LdaModel
    kwargs = dict(
        corpus=corpus,
        num_topics=num_topics,
        id2word=dictionary,
        passes=passes,
        alpha=alpha,
        eta=eta,
        random_state=random_state,
        per_word_topics=True,
    )
    if use_multicore:
        kwargs["workers"] = workers
        # LdaMulticore cannot auto-tune alpha — drop it only when "auto"
        # was requested, keeping explicit fixed priors intact.
        if kwargs["alpha"] == "auto":
            kwargs.pop("alpha")
    return cls(**kwargs)
def get_topics(
    model: LdaModel,
    num_words: int = 10,
) -> list[tuple[int, list[tuple[str, float]]]]:
    """Return every topic as (topic_id, [(word, probability), ...])."""
    return [
        (tid, model.show_topic(tid, topn=num_words))
        for tid in range(model.num_topics)
    ]
def print_topics(model: LdaModel, num_words: int = 10) -> None:
    """Print the top words (with probabilities) for every topic."""
    for tid, word_probs in get_topics(model, num_words):
        formatted = " | ".join(f"{w} ({p:.3f})" for w, p in word_probs)
        print(f"Topic {tid:2d}: {formatted}")
def infer_topics(
    model: LdaModel,
    dictionary: corpora.Dictionary,
    text: str | list[str],
    min_prob: float = 0.05,
) -> list[tuple[int, float]]:
    """Return the topic distribution for a raw string or pre-tokenized text.

    Topics with probability below min_prob are omitted; results are sorted
    most-probable first.
    """
    if isinstance(text, str):
        tokens = remove_stopwords(simple_tokenize(text))
    else:
        tokens = text
    bow = dictionary.doc2bow(tokens)
    dist = model.get_document_topics(bow, minimum_probability=min_prob)
    return sorted(dist, key=lambda pair: pair[1], reverse=True)
def find_optimal_topics(
    corpus: list,
    dictionary: corpora.Dictionary,
    texts: list[list[str]],
    topic_range: range = range(5, 30, 5),
    passes: int = 10,
) -> list[tuple[int, float]]:
    """Sweep candidate topic counts and report c_v coherence for each.

    Returns (num_topics, coherence) pairs so the caller can pick the elbow.
    """
    results: list[tuple[int, float]] = []
    for n in topic_range:
        candidate = train_lda(corpus, dictionary, num_topics=n,
                              passes=passes, workers=1)
        score = CoherenceModel(model=candidate, texts=texts,
                               dictionary=dictionary,
                               coherence="c_v").get_coherence()
        print(f" num_topics={n:3d} coherence={score:.4f}")
        results.append((n, score))
    return results
# ── 4. Word2Vec embeddings ────────────────────────────────────────────────────
def train_word2vec(
    sentences: list[list[str]],
    vector_size: int = 300,
    window: int = 5,
    min_count: int = 5,
    sg: int = 1,        # 1 = Skip-gram, 0 = CBOW
    workers: int = 4,
    epochs: int = 10,
    negative: int = 10,
) -> Word2Vec:
    """Train Word2Vec embeddings on tokenized sentences.

    sg=1 (skip-gram) favors rare words and analogy tasks; sg=0 (CBOW) trains
    faster and suits frequent words. negative is the negative-sampling count.
    NOTE(review): seed=42 is fixed, but with workers > 1 training order may
    vary — confirm if exact reproducibility is required.
    """
    params = dict(
        sentences=sentences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        sg=sg,
        workers=workers,
        epochs=epochs,
        negative=negative,
        seed=42,
    )
    model = Word2Vec(**params)
    print(f"Word2Vec trained: vocab={len(model.wv)} words, dim={vector_size}")
    return model
def word_analogies(
    wv,
    positive: list[str],
    negative: list[str],
    top_k: int = 5,
) -> list[tuple[str, float]]:
    """Solve analogies via vector arithmetic on a KeyedVectors instance.

    Example: positive=["king", "woman"], negative=["man"] ≈ "queen".
    """
    hits = wv.most_similar(positive=positive, negative=negative, topn=top_k)
    return hits
# ── 5. FastText (handles OOV) ─────────────────────────────────────────────────
def train_fasttext(
    sentences: list[list[str]],
    vector_size: int = 100,
    window: int = 5,
    min_count: int = 3,
    min_n: int = 3,     # min char n-gram length
    max_n: int = 6,     # max char n-gram length
    epochs: int = 10,
    workers: int = 4,
) -> FastText:
    """Train FastText embeddings.

    Subword (character n-gram) modeling lets FastText produce vectors for
    out-of-vocabulary words — useful for morphologically rich languages and
    noisy text with typos/abbreviations.
    """
    params = dict(
        sentences=sentences,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        min_n=min_n,
        max_n=max_n,
        epochs=epochs,
        workers=workers,
        seed=42,
    )
    model = FastText(**params)
    print(f"FastText trained: vocab={len(model.wv)}, dim={vector_size}")
    return model
# ── 6. Doc2Vec document embeddings ────────────────────────────────────────────
def train_doc2vec(
    tokenized_docs: list[list[str]],
    vector_size: int = 100,
    window: int = 5,
    min_count: int = 2,
    epochs: int = 40,
    dm: int = 1,        # 1 = PV-DM, 0 = PV-DBOW
    workers: int = 4,
) -> Doc2Vec:
    """Train Doc2Vec for document-level embeddings.

    dm=1 (PV-DM) tends to do better on semantic tasks; dm=0 (PV-DBOW) trains
    faster and often works well for downstream classification.
    Documents are tagged with their list index, so dv lookups use positions.
    """
    tagged_docs = [
        TaggedDocument(words=tokens, tags=[doc_id])
        for doc_id, tokens in enumerate(tokenized_docs)
    ]
    model = Doc2Vec(
        tagged_docs,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        epochs=epochs,
        dm=dm,
        workers=workers,
        seed=42,
    )
    print(f"Doc2Vec trained: {len(tagged_docs)} documents, dim={vector_size}")
    return model
def find_similar_documents(
    model: Doc2Vec,
    doc_id: int,
    top_k: int = 5,
) -> list[tuple[int, float]]:
    """Return the top_k (doc_id, similarity) neighbours of a training doc."""
    neighbours = model.dv.most_similar([doc_id], topn=top_k)
    return neighbours
def embed_new_document(
    model: Doc2Vec,
    tokens: list[str],
    epochs: int = 20,
) -> np.ndarray:
    """Infer an embedding for a document unseen during training.

    NOTE(review): inference is an iterative optimization, so repeated calls
    may yield slightly different vectors — confirm if determinism matters.
    """
    vector = model.infer_vector(tokens, epochs=epochs)
    return vector
# ── 7. Document similarity with TF-IDF ───────────────────────────────────────
def build_similarity_index(
    bow_corpus: list,
    tfidf_model: TfidfModel | None = None,
    num_features: int | None = None,
) -> similarities.MatrixSimilarity:
    """Build a dense MatrixSimilarity index for fast document retrieval.

    bow_corpus: documents in sparse (token_id, weight) format.
    tfidf_model: optional TF-IDF model applied to the corpus before indexing.
    num_features: vocabulary size (typically len(dictionary)); when given it
        is passed through so Gensim does not have to scan the corpus to infer
        the feature count. Defaults to None for backward compatibility.
    """
    # Explicit None check: truthiness of arbitrary model objects is not a
    # reliable "was a model passed?" signal.
    corpus_for_index = bow_corpus if tfidf_model is None else tfidf_model[bow_corpus]
    if num_features is not None:
        return similarities.MatrixSimilarity(corpus_for_index, num_features=num_features)
    return similarities.MatrixSimilarity(corpus_for_index)
def query_similarity_index(
    index: similarities.MatrixSimilarity,
    query_bow: list[tuple[int, int]],
    tfidf_model: TfidfModel | None = None,
    top_k: int = 5,
) -> list[tuple[int, float]]:
    """Retrieve the top-k most similar documents for a query BoW vector.

    tfidf_model: if given, the query is TF-IDF weighted first — it must be
        the same model used when building the index, or scores are skewed.
    Returns (doc_id, similarity) pairs, best first, rounded to 4 decimals.
    """
    # `is not None` instead of truthiness: model objects are not guaranteed
    # to have meaningful boolean semantics.
    vec = query_bow if tfidf_model is None else tfidf_model[query_bow]
    sims = index[vec]
    ranked = sorted(enumerate(sims), key=lambda pair: pair[1], reverse=True)[:top_k]
    return [(doc_id, round(float(score), 4)) for doc_id, score in ranked]
# ── Demo ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    print("Gensim Demo")
    print("=" * 50)
    # Tiny toy corpus for an end-to-end smoke run.
    raw_texts = [
        "Machine learning algorithms learn from training data",
        "Deep learning uses neural networks with many layers",
        "Natural language processing handles text and speech",
        "Computer vision recognizes objects in images",
        "Reinforcement learning trains agents through rewards",
        "Data science includes statistics and machine learning",
        "Python is popular for data science and machine learning",
        "Neural networks are inspired by the human brain",
        "Image recognition uses convolutional neural networks",
        "Text classification is a natural language processing task",
    ]
    tokenized = preprocess_corpus(raw_texts)
    vocab = build_dictionary(tokenized, no_below=2, no_above=0.9)
    bow = to_bow_corpus(vocab, tokenized)

    # Topic modeling
    print("\nLDA Topics:")
    lda_model = train_lda(bow, vocab, num_topics=3, passes=10)
    print_topics(lda_model, num_words=5)

    # Word embeddings
    print("\nWord2Vec:")
    w2v_model = train_word2vec(tokenized, vector_size=50, min_count=2, epochs=50)
    neighbours = w2v_model.wv.most_similar("learning", topn=3)
    print(f" Words similar to 'learning': {neighbours}")

    # TF-IDF weighted similarity retrieval
    tfidf_model, _weighted_corpus = to_tfidf_corpus(bow)
    sim_index = build_similarity_index(bow, tfidf_model)
    q_bow = vocab.doc2bow(simple_tokenize("neural network classification"))
    hits = query_similarity_index(sim_index, q_bow, tfidf_model, top_k=3)
    print("\nDocuments similar to 'neural network classification':")
    for idx, sim in hits:
        print(f" [{sim:.3f}] {raw_texts[idx]}")
For the scikit-learn TF-IDF + NMF alternative when fitting non-negative matrix factorization topics in an sklearn Pipeline — sklearn NMF trains faster than LDA on clean, shorter documents while Gensim’s LdaMulticore scales to millions of documents through streaming corpus iteration without loading all data in memory, and the CoherenceModel with c_v metric provides the standard automatic topic count selection that sklearn’s unsupervised NMF lacks. For the Hugging Face sentence-transformers alternative for document similarity — transformers achieve higher semantic similarity scores but Gensim’s Doc2Vec produces lightweight 100-300D vectors that are 50x faster to train on large corpora, and the similarities.MatrixSimilarity index gives sub-second retrieval across millions of documents with pure numpy dot products, making it practical for corpus analysis without GPU infrastructure. The Claude Skills 360 bundle includes Gensim skill sets covering Dictionary building and filtering, bag-of-words and TF-IDF corpus creation, LDA topic training and inference, coherence-based topic count selection, Word2Vec skip-gram and CBOW embeddings, FastText OOV handling, Doc2Vec document embeddings, and MatrixSimilarity document retrieval. Start with the free tier to try topic modeling code generation.