DSPy replaces hand-written prompts with declarative modules that compile into optimized prompt instructions or fine-tuned weights. A ChainOfThought module declares reasoning intent; the DSPy compiler finds the few-shot examples and instructions that maximize your metric on a dev set. Programs are modular: compose Retrieve, ChainOfThought, and custom Module classes into complex pipelines. Optimizers like BootstrapFewShot and MIPRO auto-generate demonstrations. dspy.Assert enforces output constraints at inference time with automatic retry. Claude Code generates DSPy modules, dataset annotation pipelines, optimizer configurations, and the evaluation harnesses for production LLM systems.
CLAUDE.md for DSPy Projects
## DSPy Stack
- Version: dspy-ai >= 2.5 (or dspy >= 2.5 — package renamed in 2024)
- LM: dspy.LM("anthropic/claude-sonnet-4-6") or dspy.LM("openai/gpt-4o")
- Retrieval: dspy.ColBERTv2 or custom dspy.Retrieve with embedding index
- Optimization: BootstrapFewShot (fast), MIPROv2 (best quality, slower)
- Evaluation: dspy.Evaluate with metric function returning 0.0-1.0
- Assertions: dspy.Assert / dspy.Suggest for output constraints
- Teleprompters: compile with trainset (100-500 examples), validate on devset (50-100)
Basic Modules and Signatures
# modules/signatures.py — DSPy type-safe signatures
import dspy
# Configure a process-wide default LM at import time; every DSPy predictor
# below resolves it via dspy.settings unless explicitly overridden.
lm = dspy.LM("anthropic/claude-sonnet-4-6", max_tokens=1024)
dspy.configure(lm=lm)
# Signature: declares input/output fields and their descriptions
class ClassifyIntent(dspy.Signature):
    """Classify the intent of a customer support message."""

    # Free-text customer message to classify.
    message: str = dspy.InputField(desc="Customer support message")
    # Closed label set; IntentClassifier coerces anything else to "general".
    intent: str = dspy.OutputField(
        desc="One of: billing, technical, shipping, returns, general"
    )
    # NOTE(review): this is the LM's self-reported score, not a calibrated
    # probability — treat it as a heuristic only.
    confidence: float = dspy.OutputField(
        desc="Confidence score from 0.0 to 1.0"
    )
class SummarizeAndExtract(dspy.Signature):
    """Summarize a document and extract key entities."""

    # Raw document text; no truncation happens here — that is the caller's
    # responsibility.
    document: str = dspy.InputField(desc="Document text to analyze")
    # Hint about what kind of document this is, to steer the summary.
    context: str = dspy.InputField(desc="Context about the document type")
    summary: str = dspy.OutputField(desc="2-3 sentence summary")
    entities: list[str] = dspy.OutputField(
        desc="List of named entities (people, orgs, products)"
    )
    # Coarse document-level sentiment label.
    sentiment: str = dspy.OutputField(desc="positive, negative, or neutral")
# Module: wraps a signature with a predictor
class IntentClassifier(dspy.Module):
    """Predict-based intent classifier with post-hoc label validation.

    Fix: the original validated `result.intent` case-insensitively but then
    returned the LM's original casing, so callers could receive "Billing",
    "BILLING", etc. The intent is now normalized to the canonical lowercase
    label before returning.
    """

    # Labels permitted by ClassifyIntent; anything else falls back to "general".
    VALID_INTENTS = frozenset({"billing", "technical", "shipping", "returns", "general"})

    def __init__(self):
        super().__init__()
        self.predict = dspy.Predict(ClassifyIntent)

    def forward(self, message: str):
        """Classify `message`, guaranteeing a canonical lowercase intent."""
        result = self.predict(message=message)
        # Normalize before validating so the returned label is always
        # lowercase and whitespace-trimmed.
        intent = result.intent.lower().strip()
        result.intent = intent if intent in self.VALID_INTENTS else "general"
        return result
# ChainOfThought: adds a reasoning step before the output
class ReasoningClassifier(dspy.Module):
    """Intent classifier that reasons step-by-step before committing."""

    def __init__(self):
        super().__init__()
        # ChainOfThought augments the ClassifyIntent signature with a
        # generated "reasoning" field emitted ahead of the declared outputs.
        self.classify = dspy.ChainOfThought(ClassifyIntent)

    def forward(self, message: str):
        """Run the chain-of-thought predictor on one message."""
        return self.classify(message=message)
RAG Pipeline with Retrieve
# modules/rag.py — retrieval-augmented generation
import dspy
from dspy.retrieve.chromadb_rm import ChromadbRM
import chromadb
class AnswerWithCitations(dspy.Signature):
    """Answer a question using retrieved context passages."""

    question: str = dspy.InputField()
    # Ordered passages; the citation indices below refer to positions in
    # this list.
    context: list[str] = dspy.InputField(desc="Retrieved passages from knowledge base")
    answer: str = dspy.OutputField(
        desc="Factual answer based only on the provided context"
    )
    # Checked downstream: RAGPipeline asserts each index is in range.
    citations: list[int] = dspy.OutputField(
        desc="Indices (0-based) of context passages used for the answer"
    )
    # Self-reported by the LM; not a calibrated probability.
    confidence: str = dspy.OutputField(desc="high, medium, or low")
class RAGPipeline(dspy.Module):
    """Single-hop RAG: retrieve top-k passages, then answer with citations."""

    def __init__(self, retriever, k: int = 5):
        super().__init__()
        self.retriever = retriever
        self.k = k
        self.generate = dspy.ChainOfThought(AnswerWithCitations)

    def forward(self, question: str):
        """Answer `question` from retrieved context; citations must index it."""
        docs = self.retriever(question, k=self.k).passages
        generated = self.generate(question=question, context=docs)
        # Hard constraint: every cited index must point at a real passage.
        # On failure dspy.Assert triggers an automatic retry of the predictor.
        in_range = all(0 <= idx < len(docs) for idx in generated.citations)
        dspy.Assert(
            in_range,
            "Citations must be valid passage indices"
        )
        return dspy.Prediction(
            answer=generated.answer,
            citations=generated.citations,
            passages=docs,
            confidence=generated.confidence,
        )
# Multi-hop RAG: retrieve, reason, retrieve again
class MultiHopRAG(dspy.Module):
    """Two-hop RAG: the first retrieval seeds a refined query for a second pass."""

    def __init__(self, retriever, k: int = 3):
        super().__init__()
        self.retriever = retriever
        self.k = k
        self.generate_query = dspy.ChainOfThought("question, context -> search_query")
        self.generate_answer = dspy.ChainOfThought(AnswerWithCitations)

    def forward(self, question: str):
        """Answer `question` using two rounds of retrieval."""
        # Hop 1: retrieve directly on the user's question.
        gathered = list(self.retriever(question, k=self.k).passages)
        # Hop 2: let the LM reformulate the query from what hop 1 found.
        hop2_query = self.generate_query(
            question=question,
            context=gathered,
        ).search_query
        for passage in self.retriever(hop2_query, k=self.k).passages:
            # Deduplicate: keep only passages not already gathered.
            if passage not in gathered:
                gathered.append(passage)
        return self.generate_answer(question=question, context=gathered)
def setup_chroma_retriever(collection_name: str, embedding_model: str):
    """Set up ChromaDB retriever for DSPy.

    Args:
        collection_name: Existing Chroma collection to query.
        embedding_model: Passed straight through to ChromadbRM's
            `embedding_function` argument — presumably a callable embedding
            function despite the `str` annotation; TODO confirm against the
            pinned chromadb/dspy versions.

    Returns:
        A configured ChromadbRM retriever (default k=5).
    """
    # Fix: the original also built an unused chromadb.PersistentClient here.
    # ChromadbRM opens ./chroma_db itself via persist_directory, so the dead
    # client handle was removed.
    return ChromadbRM(
        collection_name=collection_name,
        persist_directory="./chroma_db",
        embedding_function=embedding_model,
        k=5,
    )
Compilation with Optimizers
# optimization/compile.py — optimize prompts with DSPy optimizers
import dspy
from dspy.teleprompt import BootstrapFewShot, MIPROv2, BootstrapFewShotWithRandomSearch
from typing import Callable
def intent_accuracy_metric(example, prediction, trace=None) -> float:
    """Exact-match accuracy: 1.0 when the lowercased predicted intent equals
    the gold `example.intent`, else 0.0. (`trace` is accepted for DSPy's
    metric protocol but unused.)"""
    predicted = prediction.intent.lower()
    return 1.0 if example.intent == predicted else 0.0
def rag_faithfulness_metric(example, prediction, trace=None) -> float:
    """LM-judged faithfulness: 1.0 when the answer is grounded in the
    retrieved passages, else 0.0 (also 0.0 for empty answer/passages)."""
    answer = prediction.answer
    passages = prediction.passages
    # Guard clause: nothing to judge without both an answer and context.
    if not answer or not passages:
        return 0.0
    # Ask a lightweight LM judge; only the top-3 passages are shown to it.
    verdict = dspy.Predict("answer, context -> is_faithful: bool")(
        answer=answer,
        context="\n".join(passages[:3]),
    )
    return float(verdict.is_faithful)
def compile_with_bootstrap(
    module: dspy.Module,
    trainset: list,
    metric: Callable,
    max_bootstrapped_demos: int = 4,
    num_candidate_programs: int = 8,
) -> dspy.Module:
    """Fast optimization with bootstrapped few-shot demonstrations.

    Fix: `num_candidate_programs` was previously accepted but never used.
    It is now forwarded to BootstrapFewShotWithRandomSearch (already imported
    by this module), which bootstraps that many candidate programs and keeps
    the best one under `metric`. Pass num_candidate_programs=1 to get a
    single plain BootstrapFewShot run — the original effective behavior.
    """
    if num_candidate_programs <= 1:
        # Single candidate: plain bootstrap, no random-search overhead.
        optimizer = BootstrapFewShot(
            metric=metric,
            max_bootstrapped_demos=max_bootstrapped_demos,
            max_labeled_demos=16,
        )
    else:
        optimizer = BootstrapFewShotWithRandomSearch(
            metric=metric,
            max_bootstrapped_demos=max_bootstrapped_demos,
            max_labeled_demos=16,
            num_candidate_programs=num_candidate_programs,
        )
    return optimizer.compile(module, trainset=trainset)
def compile_with_mipro(
    module: dspy.Module,
    trainset: list,
    devset: list,
    metric: Callable,
    num_trials: int = 20,
) -> dspy.Module:
    """High-quality optimization with MIPROv2 (uses Bayesian search).

    NOTE(review): `auto="medium"` and an explicit `num_trials` may conflict —
    some DSPy releases derive the trial budget from `auto` alone; confirm
    against the pinned dspy version.
    """
    optimizer = MIPROv2(
        metric=metric,
        auto="medium",  # "light" / "medium" / "heavy" — controls trial budget
        num_threads=8,
    )
    return optimizer.compile(
        module,
        trainset=trainset,
        valset=devset,
        num_trials=num_trials,
        # Score candidates on 25-example minibatches, with a full devset
        # evaluation every 5 steps, instead of the whole devset per trial.
        minibatch=True,
        minibatch_size=25,
        minibatch_full_eval_steps=5,
        requires_permission_to_run=False,  # non-interactive / CI-friendly
    )
# Save and load compiled programs
def save_program(program: dspy.Module, path: str) -> None:
    """Persist a compiled program's state (demos, instructions) to `path`."""
    program.save(path)
    print(f"Saved compiled program to {path}")
def load_program(module_class: type, path: str) -> dspy.Module:
    """Instantiate `module_class` and restore compiled state from `path`.

    `module_class` must be constructible with no arguments.
    """
    instance = module_class()
    instance.load(path)
    return instance
Evaluation Harness
# evaluation/evaluate.py — structured DSPy evaluation
import dspy
from dspy.evaluate import Evaluate
def evaluate_program(
    program: dspy.Module,
    devset: list,
    metric,
    num_threads: int = 8,
    display_progress: bool = True,
) -> dict:
    """Run evaluation with DSPy's built-in parallel evaluator.

    Returns a dict with the percentage `score`, `num_examples`, and the
    derived count of examples that `passed` the metric.
    """
    runner = Evaluate(
        devset=devset,
        metric=metric,
        num_threads=num_threads,
        display_progress=display_progress,
        display_table=10,  # Show first 10 rows in console
    )
    pct = runner(program)
    # NOTE(review): assumes the evaluator returns a 0-100 percentage float;
    # newer DSPy releases may return a result object — confirm.
    return {
        "score": pct,
        "num_examples": len(devset),
        "passed": int(pct / 100 * len(devset)),
    }
def compare_programs(
    baseline: dspy.Module,
    optimized: dspy.Module,
    devset: list,
    metric,
) -> dict:
    """Compare baseline vs. optimized program performance.

    Prints both percentage scores plus the delta and returns them as a dict.
    """
    evaluator = Evaluate(devset=devset, metric=metric, num_threads=8)
    before = evaluator(baseline)
    after = evaluator(optimized)
    delta = after - before
    print(f"Baseline: {before:.1f}%")
    print(f"Optimized: {after:.1f}%")
    print(f"Delta: {delta:+.1f}%")
    return {
        "baseline": before,
        "optimized": after,
        "improvement": delta,
    }
Typed Predictors with Pydantic
# modules/typed.py — structured output with Pydantic + DSPy
import dspy
from pydantic import BaseModel, field_validator
from typing import Literal
class OrderAnalysis(BaseModel):
    """Structured analysis of a customer order message (LM output schema)."""

    # Closed vocabularies keep the LM output machine-checkable.
    intent: Literal["cancel", "modify", "track", "return", "complaint", "general"]
    sentiment: Literal["positive", "negative", "neutral"]
    urgency: Literal["high", "medium", "low"]
    # None when the message contains no recognizable order id.
    extracted_order_id: str | None
    summary: str
    suggested_action: str

    @field_validator("extracted_order_id")
    @classmethod
    def normalize_order_id(cls, v):
        # Canonicalize ids (uppercase, trimmed); pass None/"" through as-is.
        if v:
            return v.upper().strip()
        return v
class AnalyzeOrderMessage(dspy.Signature):
    """Analyze a customer order-related message comprehensively."""

    # Raw customer message (free text).
    message: str = dspy.InputField()
    # Full structured result; the OrderAnalysis schema is enforced by the
    # TypedPredictor that wraps this signature.
    order_analysis: OrderAnalysis = dspy.OutputField()
class OrderAnalyzer(dspy.Module):
    """Typed order-message analyzer that flags urgent complaints for escalation."""

    def __init__(self):
        super().__init__()
        # TypedPredictor enforces Pydantic model at output
        # NOTE(review): dspy.TypedPredictor was folded into plain dspy.Predict
        # in later DSPy releases — confirm against the pinned version.
        self.analyze = dspy.TypedPredictor(AnalyzeOrderMessage)

    def forward(self, message: str) -> OrderAnalysis:
        """Analyze one message and return the validated OrderAnalysis."""
        analysis = self.analyze(message=message).order_analysis
        # A high-urgency complaint always raises a soft suggestion (a retry
        # hint, not a hard failure) steering toward an escalation path.
        is_urgent_complaint = (
            analysis.urgency == "high" and analysis.intent == "complaint"
        )
        if is_urgent_complaint:
            dspy.Suggest(
                False,
                "High urgency complaints should have a specific suggested_action with escalation path",
            )
        return analysis
For the LangChain/LangGraph alternative that uses explicit prompt templates and agent graphs rather than DSPy’s compiler-based optimization, see the LangChain guide. For the LlamaIndex RAG pipeline that integrates with DSPy’s Retrieve module, the LlamaIndex guide covers ingestion pipelines and rerankers. The Claude Skills 360 bundle includes DSPy skill sets covering module design, optimizer configuration, and evaluation harnesses. Start with the free tier to try DSPy pipeline generation.