LLM applications are hard to test: outputs are stochastic, correctness is often subjective, and traditional unit tests don’t capture what matters. The solution is systematic evals — structured datasets of inputs and expected behaviors, automated scoring with LLM-as-judge patterns, and regression tracking over time. Claude Code builds eval frameworks, generates diverse test cases from edge cases in production logs, and sets up continuous eval pipelines that catch regressions before they ship.
CLAUDE.md for LLM Eval Projects
## LLM Eval Stack
- Eval framework: custom Python (flexibility) or Promptfoo (off-the-shelf)
- Model under test: Claude claude-sonnet-4-6 via Anthropic SDK
- Scoring: LLM-as-judge (claude-opus-4-6 as judge), regex for structured outputs, human eval for subjective quality
- Dataset format: JSONL, versioned in git
- CI: run evals on every PR that touches prompts or system prompts
- Regression threshold: 95% of previous score to pass (configurable per metric)
- Cost tracking: log token usage per eval run for budget visibility
Eval Dataset Format
# datasets/customer_support.jsonl — each line is one test case
# {"id": "cs-001", "input": {...}, "expected": {...}, "metadata": {...}}
# Example entry (pretty-printed here for readability — in the actual JSONL file each entry occupies a single line):
{
"id": "cs-001",
"category": "order_status",
"input": {
"customer_message": "Where is my order? It's been 5 days.",
"context": {
"order_id": "ORD-12345",
"status": "shipped",
"estimated_delivery": "2026-10-21",
"carrier": "FedEx",
"tracking": "774899289282"
}
},
"expected": {
"contains_tracking_number": true,
"contains_estimated_date": true,
"tone": "empathetic",
"escalate_to_human": false
},
"metadata": {
"difficulty": "easy",
"created_from": "production_log",
"date_added": "2026-09-01"
}
}
Eval Runner
# evals/runner.py
import json
import asyncio
from pathlib import Path
from dataclasses import dataclass, field
import anthropic
client = anthropic.Anthropic()
@dataclass
class EvalCase:
    """A single eval test case, parsed from one line of a JSONL dataset.

    Field names match the JSON keys so instances can be built with
    EvalCase(**json.loads(line)) — see load_dataset().
    """

    id: str                                       # unique case id, e.g. "cs-001"
    category: str                                 # grouping key for per-category stats
    input: dict                                   # payload sent to the model under test
    expected: dict                                # expected behaviors / score thresholds
    metadata: dict = field(default_factory=dict)  # provenance (difficulty, source, date added)
@dataclass
class EvalResult:
    """Outcome of running and scoring one EvalCase."""

    case_id: str              # id of the EvalCase this result belongs to
    pass_: bool               # True when every metric met its threshold (trailing _ avoids `pass` keyword)
    scores: dict[str, float]  # metric name -> normalized score
    output: str               # raw model response that was scored
    reasoning: str            # judge explanation (empty string when not captured)
    tokens_used: int          # input + output tokens, for cost tracking
async def run_eval_case(case: "EvalCase", system_prompt: str) -> "EvalResult":
    """Run a single eval case against the model under test and score it.

    Args:
        case: The test case (input payload plus expected behaviors).
        system_prompt: System prompt for the model under test.

    Returns:
        An EvalResult with per-metric scores, overall pass/fail, the raw
        model output, and token usage.
    """
    # The Anthropic SDK client here is synchronous; calling it inline would
    # block the event loop and defeat the Semaphore-bounded concurrency in
    # run_eval_suite. Run the blocking call in a worker thread instead.
    response = await asyncio.to_thread(
        client.messages.create,
        model="claude-sonnet-4-6",
        max_tokens=1024,
        system=system_prompt,
        messages=[{
            "role": "user",
            "content": build_user_message(case.input),
        }],
    )
    output = response.content[0].text
    tokens = response.usage.input_tokens + response.usage.output_tokens

    # Score the output. A case passes only if every metric meets its
    # per-metric threshold (expected["<metric>_threshold"], default 0.7).
    scores = await score_output(output, case)
    pass_ = all(
        score >= case.expected.get(f"{metric}_threshold", 0.7)
        for metric, score in scores.items()
    )
    return EvalResult(
        case_id=case.id,
        pass_=pass_,
        scores=scores,
        output=output,
        reasoning="",  # judge reasoning not propagated here yet
        tokens_used=tokens,
    )
async def run_eval_suite(
    dataset_path: str,
    system_prompt: str,
    concurrency: int = 5,
) -> dict:
    """Run every case in a dataset with bounded concurrency and aggregate.

    Args:
        dataset_path: Path to a JSONL eval dataset.
        system_prompt: System prompt for the model under test.
        concurrency: Maximum number of in-flight eval cases.

    Returns:
        Summary dict with totals, pass rate, per-category breakdown, token
        usage, and the individual EvalResult objects.
    """
    cases = load_dataset(dataset_path)
    if not cases:
        # Guard: an empty dataset would otherwise divide by zero in pass_rate.
        return {
            "total": 0,
            "passed": 0,
            "pass_rate": 0.0,
            "by_category": {},
            "total_tokens": 0,
            "results": [],
        }

    semaphore = asyncio.Semaphore(concurrency)

    async def run_with_limit(case):
        # Cap the number of concurrent API calls at `concurrency`.
        async with semaphore:
            return await run_eval_case(case, system_prompt)

    results = await asyncio.gather(*(run_with_limit(c) for c in cases))
    passed = sum(1 for r in results if r.pass_)  # count once, reuse below
    return {
        "total": len(results),
        "passed": passed,
        "pass_rate": passed / len(results),
        "by_category": compute_by_category(results, cases),
        "total_tokens": sum(r.tokens_used for r in results),
        "results": results,
    }
def load_dataset(path: str) -> list["EvalCase"]:
    """Load eval cases from a JSONL file (one JSON object per line).

    Blank or whitespace-only lines are skipped so hand-edited datasets with
    trailing or separator newlines don't abort the run. Malformed JSON still
    raises json.JSONDecodeError.
    """
    cases: list["EvalCase"] = []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue  # tolerate blank lines
            cases.append(EvalCase(**json.loads(line)))
    return cases
LLM-as-Judge Scoring
# evals/scorers.py — LLM judges the output quality
# Prompt template for the LLM judge; placeholders are filled via str.format
# in llm_judge_score(). The literal braces of the required JSON output are
# escaped as {{ and }} so .format() leaves them intact.
JUDGE_PROMPT = """You are an expert evaluator for customer support AI systems.
Evaluate the following AI response based on these criteria:
1. Accuracy: Does the response contain accurate information from the context?
2. Tone: Is the response empathetic and professional? (1-10)
3. Completeness: Does it address all aspects of the customer's question? (1-10)
4. Action clarity: Are next steps/actions clear? (1-10)
Customer message: {customer_message}
Context provided: {context}
AI response: {ai_response}
Expected behaviors: {expected}
Respond with JSON only:
{{
"accuracy_score": 0.0-1.0,
"tone_score": 1-10,
"completeness_score": 1-10,
"action_clarity_score": 1-10,
"escalate_recommended": true/false,
"reasoning": "brief explanation",
"pass": true/false
}}"""
async def llm_judge_score(output: str, case: "EvalCase") -> dict[str, float]:
    """Score `output` for `case` using a stronger model as LLM judge.

    Returns metric name -> score normalized to 0-1. On an unparseable judge
    response, returns all zeros (failing the case) rather than raising, so
    one flaky judge reply doesn't abort the whole suite.
    """
    # The Anthropic client is synchronous; run the call in a worker thread
    # so the event loop (and suite-level concurrency) is not blocked.
    judge_response = await asyncio.to_thread(
        client.messages.create,
        model="claude-opus-4-6",  # Use stronger model as judge
        max_tokens=512,
        messages=[{
            "role": "user",
            "content": JUDGE_PROMPT.format(
                customer_message=case.input["customer_message"],
                context=json.dumps(case.input.get("context", {})),
                ai_response=output,
                expected=json.dumps(case.expected),
            ),
        }],
    )
    raw = judge_response.content[0].text
    # Judges sometimes wrap the JSON in prose or markdown fences despite the
    # "JSON only" instruction; extract the outermost {...} span before parsing.
    start, end = raw.find("{"), raw.rfind("}")
    candidate = raw[start:end + 1] if start != -1 and end > start else raw
    try:
        scores = json.loads(candidate)
        return {
            "accuracy": scores["accuracy_score"],
            "tone": scores["tone_score"] / 10,  # judge scores 1-10; normalize to 0-1
            "completeness": scores["completeness_score"] / 10,
            "action_clarity": scores["action_clarity_score"] / 10,
        }
    except (json.JSONDecodeError, KeyError, TypeError):
        # TypeError covers a top-level JSON array/scalar instead of an object.
        return {"accuracy": 0, "tone": 0, "completeness": 0, "action_clarity": 0}
# Deterministic scorers for structured outputs
def regex_scorer(output: str, case: "EvalCase") -> dict[str, float]:
    """Deterministic scorers for structured expectations.

    Note: despite the name, these are exact substring checks, not regexes.
    A metric is only emitted when the expectation flag is set AND the case
    context supplies the reference value. Scores are 1.0 (found) or 0.0.
    """
    from datetime import date  # local import: only needed for date variants

    context = case.input.get("context", {})
    scores: dict[str, float] = {}

    if case.expected.get("contains_tracking_number") and context.get("tracking"):
        tracking = context["tracking"]
        scores["tracking_mentioned"] = 1.0 if tracking in output else 0.0

    if case.expected.get("contains_estimated_date") and context.get("estimated_delivery"):
        date_str = context["estimated_delivery"]
        # A raw ISO substring check ("2026-10-21" in output) marks correct
        # responses wrong when the model writes the date naturally. Accept
        # common human formats as well as the ISO form.
        candidates = {date_str}
        try:
            d = date.fromisoformat(date_str)
            candidates.add(d.strftime("%B %d, %Y"))                  # October 21, 2026
            candidates.add(d.strftime("%b %d, %Y"))                  # Oct 21, 2026
            candidates.add(f"{d.strftime('%B')} {d.day}, {d.year}")  # no zero-padded day
            candidates.add(f"{d.strftime('%b')} {d.day}, {d.year}")
        except ValueError:
            pass  # non-ISO reference value: fall back to exact match only
        scores["date_mentioned"] = 1.0 if any(c in output for c in candidates) else 0.0

    return scores
Regression Tracking
# evals/regression.py — compare against baseline
import json
from pathlib import Path
def check_regression(
    current_results: dict,
    baseline_path: str,
    threshold: float = 0.95,
) -> bool:
    """Compare current eval results against a stored baseline.

    Args:
        current_results: Summary dict from run_eval_suite (needs "pass_rate",
            optionally "by_category").
        baseline_path: Path to the baseline JSON; created from the current
            results if it does not exist yet.
        threshold: Current pass rate must be >= threshold * baseline rate.

    Returns:
        True if current results pass the regression check (or no baseline
        existed); False when the pass rate regressed.
    """
    if not Path(baseline_path).exists():
        print("No baseline found — saving current results as baseline")
        save_baseline(current_results, baseline_path)
        return True

    with open(baseline_path) as f:
        baseline = json.load(f)

    current_rate = current_results["pass_rate"]
    baseline_rate = baseline["pass_rate"]
    min_acceptable = baseline_rate * threshold

    if current_rate < min_acceptable:
        print("REGRESSION DETECTED:")  # plain string — no placeholders needed
        print(f"  Baseline: {baseline_rate:.1%}")
        print(f"  Current: {current_rate:.1%}")
        print(f"  Required: {min_acceptable:.1%}")
        # Show which categories regressed; tolerate results that omit the
        # per-category breakdown (same defensive .get as for the baseline).
        for category, stats in current_results.get("by_category", {}).items():
            baseline_cat = baseline.get("by_category", {}).get(category, {})
            if baseline_cat and stats["pass_rate"] < baseline_cat["pass_rate"] * threshold:
                print(
                    f"  Category '{category}' regressed: "
                    f"{baseline_cat['pass_rate']:.1%} → {stats['pass_rate']:.1%}"
                )
        return False

    print(f"Regression check passed: {current_rate:.1%} (baseline: {baseline_rate:.1%})")
    return True
CI Integration
# .github/workflows/evals.yml
name: LLM Evals

# Run on PRs that touch prompts or eval code (the CLAUDE.md policy) and on
# pushes to main (to refresh the baseline). With only a bare `push` trigger,
# the `github.event_name == 'pull_request'` guard below could never be true.
on:
  pull_request:
    paths:
      - 'prompts/**'
      - 'evals/**'
      - 'src/ai/**'
  push:
    branches: [main]
    paths:
      - 'prompts/**'
      - 'evals/**'
      - 'src/ai/**'

jobs:
  run-evals:
    runs-on: ubuntu-latest
    timeout-minutes: 30
    steps:
      - uses: actions/checkout@v4

      # Pin a Python version instead of relying on the runner image default.
      - uses: actions/setup-python@v5
        with:
          python-version: '3.12'

      - name: Install deps
        run: pip install anthropic pytest

      - name: Run eval suite
        env:
          ANTHROPIC_API_KEY: ${{ secrets.ANTHROPIC_API_KEY }}
        run: |
          python evals/runner.py \
            --dataset evals/datasets/customer_support.jsonl \
            --system-prompt prompts/customer_support.txt \
            --output results/eval_results.json

      - name: Check regression
        run: |
          python evals/regression.py \
            --results results/eval_results.json \
            --baseline evals/baselines/customer_support.json \
            --threshold 0.95

      - name: Comment on PR
        if: github.event_name == 'pull_request'
        uses: actions/github-script@v7
        with:
          script: |
            const results = require('./results/eval_results.json');
            const body = `## Eval Results
            **Pass rate:** ${(results.pass_rate * 100).toFixed(1)}%
            **Total cases:** ${results.total} | **Passed:** ${results.passed}
            **Tokens used:** ${results.total_tokens.toLocaleString()}`;
            github.rest.issues.createComment({ ...context.repo, issue_number: context.issue.number, body });

      - name: Update baseline on main
        if: github.ref == 'refs/heads/main'
        # A plain `cp` inside the runner is discarded when the job ends —
        # commit the new baseline back to the repo so it actually persists.
        run: |
          cp results/eval_results.json evals/baselines/customer_support.json
          git config user.name "github-actions[bot]"
          git config user.email "github-actions[bot]@users.noreply.github.com"
          git add evals/baselines/customer_support.json
          git diff --cached --quiet || git commit -m "chore: update eval baseline [skip ci]"
          git push
For the vLLM inference serving that the models under test run on, see the vLLM inference guide. For the MLflow experiment tracking that records eval results over time, the MLOps guide covers metrics logging and run comparison. The Claude Skills 360 bundle includes AI evaluation skill sets covering LLM-as-judge patterns, regression tracking, and CI eval pipelines. Start with the free tier to try eval framework generation.