Retrieval-Augmented Generation (RAG) extends LLMs with your private data: documentation, databases, customer records, codebases. Rather than fine-tuning, which is expensive and goes stale, RAG retrieves relevant context at query time and injects it into the prompt. The quality of your RAG system depends almost entirely on retrieval quality — getting the right chunks into the context window.
Claude Code builds RAG pipelines correctly — from chunking strategy through vector indexing, hybrid retrieval, and response generation with proper citation.
System Architecture
CLAUDE.md for RAG Projects
## RAG Stack
- Embeddings: text-embedding-3-small (1536 dims) via OpenAI API
- Vector store: pgvector (PostgreSQL extension) for < 1M documents, Pinecone for larger
- Generation: Claude (claude-sonnet-4-6) via the Anthropic API
- Chunking: semantic-boundary chunks with 20% overlap to preserve context across chunk boundaries
- Hybrid retrieval: keyword (BM25/full-text) + vector similarity, combined with RRF
- Reranking: Cohere rerank-v3.5 for top-20 → top-5 before generation
- Evaluation: RAGAS metrics (faithfulness, answer relevance, context recall)
Document Ingestion Pipeline
Build the document ingestion pipeline for our knowledge base.
We have markdown docs, PDFs, and HTML pages.
Chunk them appropriately and store in pgvector.
// src/rag/ingestion.ts
import Anthropic from '@anthropic-ai/sdk';
import OpenAI from 'openai';
import { db } from '../lib/db';
import { extractText } from './extractors';
const openai = new OpenAI();
// A raw source document prior to chunking (markdown, PDF text, or HTML text).
interface Document {
  id: string;                         // stable unique id; primary key of the `documents` table
  source: string;                     // origin path/URL, carried into chunk metadata for citation
  content: string;                    // full extracted text
  metadata: Record<string, unknown>;  // arbitrary extra fields, merged into each chunk's metadata
}
// One retrieval unit produced by chunkDocument; embedded and stored in pgvector.
interface Chunk {
  documentId: string;                 // references Document.id
  chunkIndex: number;                 // 0-based position of this chunk within the document
  content: string;                    // chunk text sent to the embedding model
  metadata: Record<string, unknown>;  // document metadata + source (+ startPosition for citation)
}
/**
 * Split a document into retrieval-sized chunks.
 *
 * Strategy: split on semantic boundaries first (markdown headers, blank-line
 * paragraphs), pack sections into chunks up to ~maxTokens, and carry ~overlap
 * of trailing text into the next chunk so context survives the boundary.
 *
 * Fixes over the previous version:
 * - a single section longer than the chunk budget is now hard-split into
 *   overlapping windows instead of becoming one oversized chunk;
 * - `startPosition` is located from the first real section of each chunk
 *   (the overlap-prefixed chunk text never occurs verbatim in the source,
 *   so indexOf on it returned -1);
 * - word-based overlap is capped by characters, so space-free text (URLs,
 *   minified code) cannot drag an entire chunk forward as one "word";
 * - passing a partial options object no longer discards the other default.
 *
 * @param doc      document to chunk
 * @param options  maxTokens: target chunk size in tokens (default 512);
 *                 overlap: fraction of a chunk carried into the next (default 0.2)
 */
function chunkDocument(
  doc: Document,
  options: { maxTokens?: number; overlap?: number } = {},
): Chunk[] {
  const maxTokens = options.maxTokens ?? 512;
  const overlap = options.overlap ?? 0.2;
  // Rough token estimate: 1 token ≈ 4 chars.
  const maxChars = maxTokens * 4;
  const overlapChars = Math.floor(maxChars * overlap);

  // 1) Split on natural boundaries (markdown headers, paragraph breaks).
  const rawSections = doc.content
    .split(/(?=#{1,3}\s)|(?:\n\n+)/)
    .map(s => s.trim())
    .filter(Boolean);

  // 2) Hard-split any section that alone exceeds the budget into
  //    overlapping fixed-size windows.
  const sections: string[] = [];
  const step = Math.max(1, maxChars - overlapChars);
  for (const section of rawSections) {
    if (section.length <= maxChars) {
      sections.push(section);
      continue;
    }
    for (let start = 0; start < section.length; start += step) {
      sections.push(section.slice(start, start + maxChars));
      if (start + maxChars >= section.length) break;
    }
  }

  // 3) Pack sections into chunks, carrying overlap across flushes.
  const chunks: Chunk[] = [];
  let currentChunk = '';
  let chunkIndex = 0;
  let currentStart = 0; // source offset of the current chunk's first real section
  let cursor = 0;       // moving search cursor so repeated sections resolve in order

  // Trailing text to prepend to the next chunk: word-based tail, capped by
  // characters so a single long token cannot exceed the overlap budget.
  const makeOverlap = (): string => {
    if (overlapChars <= 0) return '';
    const tail = currentChunk.split(' ').slice(-Math.floor(overlapChars / 5)).join(' ');
    return tail.length > overlapChars ? currentChunk.slice(-overlapChars) : tail;
  };

  for (const section of sections) {
    const sectionPos = doc.content.indexOf(section, cursor);
    // If adding this section would exceed the limit, flush the current chunk.
    if (currentChunk.length + section.length > maxChars && currentChunk.length > 0) {
      chunks.push({
        documentId: doc.id,
        chunkIndex: chunkIndex++,
        content: currentChunk.trim(),
        metadata: {
          ...doc.metadata,
          source: doc.source,
          startPosition: currentStart, // for citation back into the source text
        },
      });
      const overlapText = makeOverlap();
      currentChunk = overlapText ? overlapText + ' ' + section : section;
      if (sectionPos >= 0) currentStart = sectionPos;
    } else {
      if (!currentChunk && sectionPos >= 0) currentStart = sectionPos;
      currentChunk += (currentChunk ? '\n\n' : '') + section;
    }
    if (sectionPos >= 0) cursor = sectionPos + 1;
  }

  // Final chunk (now also carries startPosition, unlike before).
  if (currentChunk.trim()) {
    chunks.push({
      documentId: doc.id,
      chunkIndex,
      content: currentChunk.trim(),
      metadata: { ...doc.metadata, source: doc.source, startPosition: currentStart },
    });
  }
  return chunks;
}
// Embed chunk contents via OpenAI, batched well under the API's
// 2048-inputs-per-request limit. Embeddings come back in input order,
// so we can zip each batch with its response rows.
async function embedChunks(chunks: Chunk[]): Promise<Array<Chunk & { embedding: number[] }>> {
  const BATCH = 100;
  const embedded: Array<Chunk & { embedding: number[] }> = [];

  for (let offset = 0; offset < chunks.length; offset += BATCH) {
    const slice = chunks.slice(offset, offset + BATCH);
    const { data } = await openai.embeddings.create({
      model: 'text-embedding-3-small',
      input: slice.map(chunk => chunk.content),
      encoding_format: 'float',
    });
    slice.forEach((chunk, idx) => {
      embedded.push({ ...chunk, embedding: data[idx].embedding });
    });
    // Brief pause between batches to stay under the 1M tokens/min rate limit.
    if (offset + BATCH < chunks.length) {
      await new Promise(resolve => setTimeout(resolve, 100));
    }
  }
  return embedded;
}
// Upsert embedded chunks into pgvector in a single batched statement.
async function indexChunks(chunks: Array<Chunk & { embedding: number[] }>): Promise<void> {
  if (chunks.length === 0) return; // nothing to upsert

  // jsonb_to_recordset matches JSON keys to the declared column names, so the
  // payload must use snake_case keys. The previous version spread the camelCase
  // Chunk objects (documentId / chunkIndex), which mapped to NULL
  // document_id / chunk_index on every row.
  const payload = chunks.map(c => ({
    document_id: c.documentId,
    chunk_index: c.chunkIndex,
    content: c.content,
    metadata: c.metadata,
    // pgvector accepts a '[x,y,...]' text literal cast with ::vector
    embedding: `[${c.embedding.join(',')}]`,
  }));

  await db.raw(`
    INSERT INTO document_chunks
      (document_id, chunk_index, content, metadata, embedding)
    SELECT
      v.document_id,
      v.chunk_index,
      v.content,
      v.metadata::jsonb,
      v.embedding::vector
    FROM jsonb_to_recordset(?::jsonb) AS v(
      document_id text,
      chunk_index int,
      content text,
      metadata jsonb,
      embedding text
    )
    ON CONFLICT (document_id, chunk_index)
    DO UPDATE SET
      content = EXCLUDED.content,
      metadata = EXCLUDED.metadata,
      embedding = EXCLUDED.embedding,
      updated_at = NOW()
  `, [JSON.stringify(payload)]);
}
// Database schema (DDL) for the RAG store: `documents` holds full source
// texts; `document_chunks` holds embedded chunks with both a pgvector column
// (semantic search, HNSW-indexed) and a generated tsvector column
// (keyword search, GIN-indexed). Run once at setup/migration time.
const schema = `
CREATE EXTENSION IF NOT EXISTS vector;
CREATE TABLE documents (
id TEXT PRIMARY KEY,
source TEXT NOT NULL,
title TEXT,
content TEXT NOT NULL,
metadata JSONB DEFAULT '{}',
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW()
);
CREATE TABLE document_chunks (
id BIGSERIAL PRIMARY KEY,
document_id TEXT REFERENCES documents(id) ON DELETE CASCADE,
chunk_index INT NOT NULL,
content TEXT NOT NULL,
metadata JSONB DEFAULT '{}',
embedding vector(1536),
-- Full-text search
content_tsvector TSVECTOR GENERATED ALWAYS AS (to_tsvector('english', content)) STORED,
created_at TIMESTAMPTZ DEFAULT NOW(),
updated_at TIMESTAMPTZ DEFAULT NOW(),
UNIQUE(document_id, chunk_index)
);
-- HNSW index for fast approximate nearest neighbor search
CREATE INDEX ON document_chunks USING hnsw (embedding vector_cosine_ops)
WITH (m = 16, ef_construction = 64);
-- GIN index for full-text search
CREATE INDEX ON document_chunks USING GIN (content_tsvector);
`;
/**
 * Ingest one document end-to-end: store the source row, chunk, embed, and
 * upsert the chunks into pgvector.
 *
 * Fixes: re-ingesting a shorter document previously left stale chunks behind
 * (upsert never deletes rows with a chunk_index beyond the new count), and
 * the documents row kept its old updated_at on merge.
 */
export async function ingestDocument(doc: Document): Promise<void> {
  // Store (or refresh) the source document.
  // NOTE(review): the `title` column is never populated here — presumably it
  // lives in doc.metadata; confirm against the extractors.
  await db('documents').insert({
    id: doc.id,
    source: doc.source,
    content: doc.content,
    metadata: JSON.stringify(doc.metadata),
    updated_at: new Date(), // refresh timestamp on re-ingest
  }).onConflict('id').merge();

  // Chunk → embed → index.
  const chunks = chunkDocument(doc);
  const embeddedChunks = await embedChunks(chunks);
  if (embeddedChunks.length > 0) {
    await indexChunks(embeddedChunks);
  }

  // Drop chunks left over from a previous, longer version of this document.
  await db('document_chunks')
    .where({ document_id: doc.id })
    .where('chunk_index', '>=', chunks.length)
    .del();

  console.log(`Indexed ${chunks.length} chunks for document ${doc.id}`);
}
Hybrid Retrieval
Implement hybrid retrieval that combines vector similarity
with keyword search for better recall on technical terms.
// src/rag/retrieval.ts
// A chunk returned by hybrid retrieval, with per-method and fused scores.
interface RetrievalResult {
  chunkId: number;        // document_chunks.id
  documentId: string;     // owning document
  content: string;        // chunk text
  metadata: Record<string, unknown>;
  vectorScore: number;    // cosine similarity (1 - distance); 0 if not a vector hit
  keywordScore: number;   // ts_rank_cd score; 0 if not a keyword hit
  combinedScore: number;  // Reciprocal Rank Fusion score used for final ordering
}
/**
 * Reciprocal Rank Fusion: merge two ranked result lists into one
 * id → fused-score map. Each list contributes 1/(k + rank) per item,
 * with rank counted from 1. Raw scores are ignored by design — RRF
 * fuses on rank order only, which makes it robust to incomparable
 * score scales (cosine similarity vs. ts_rank_cd).
 */
function reciprocalRankFusion(
  vectorResults: Array<{ id: number; score: number }>,
  keywordResults: Array<{ id: number; score: number }>,
  k = 60,
): Map<number, number> {
  const fused = new Map<number, number>();
  const accumulate = (ranked: Array<{ id: number; score: number }>): void => {
    ranked.forEach(({ id }, position) => {
      fused.set(id, (fused.get(id) ?? 0) + 1 / (k + position + 1));
    });
  };
  accumulate(vectorResults);
  accumulate(keywordResults);
  return fused;
}
/**
 * Hybrid retrieval: pgvector cosine similarity + Postgres full-text search,
 * fused with Reciprocal Rank Fusion.
 *
 * Fixes: `minScore` was declared in the options but ignored (the SQL
 * hardcoded 0.3), and `vectorScore` / `keywordScore` were always 0 despite
 * being part of RetrievalResult. Partial options objects are now merged
 * with defaults instead of replacing them.
 *
 * @param query    natural-language query
 * @param options  topK: results to return (default 20);
 *                 minScore: minimum cosine similarity for vector hits (default 0.3)
 */
export async function hybridRetrieve(
  query: string,
  options: { topK?: number; minScore?: number } = {},
): Promise<RetrievalResult[]> {
  const topK = options.topK ?? 20;
  const minScore = options.minScore ?? 0.3;

  // Embed the query once; reuse the pgvector literal for every placeholder.
  const embeddingResponse = await openai.embeddings.create({
    model: 'text-embedding-3-small',
    input: query,
  });
  const queryEmbedding = embeddingResponse.data[0].embedding;
  const vectorLiteral = `[${queryEmbedding.join(',')}]`;

  // Parallel: vector search + keyword search.
  const [vectorResults, keywordResults] = await Promise.all([
    // Vector search — cosine similarity (<=> is cosine distance)
    db.raw(`
      SELECT
        id,
        document_id,
        content,
        metadata,
        1 - (embedding <=> ?::vector) AS score
      FROM document_chunks
      WHERE 1 - (embedding <=> ?::vector) > ?
      ORDER BY embedding <=> ?::vector
      LIMIT ?
    `, [vectorLiteral, vectorLiteral, minScore, vectorLiteral, topK]),
    // Full-text keyword search with ranking
    db.raw(`
      SELECT
        id,
        document_id,
        content,
        metadata,
        ts_rank_cd(content_tsvector, query) AS score
      FROM document_chunks,
           plainto_tsquery('english', ?) query
      WHERE content_tsvector @@ query
      ORDER BY score DESC
      LIMIT ?
    `, [query, topK]),
  ]);

  // Fuse the two rankings (RRF uses rank order only, not raw scores).
  const combinedScores = reciprocalRankFusion(
    vectorResults.rows.map((row: any) => ({ id: row.id, score: row.score })),
    keywordResults.rows.map((row: any) => ({ id: row.id, score: row.score })),
  );

  // Merge rows, preserving each method's raw score for downstream inspection.
  const merged = new Map<number, Omit<RetrievalResult, 'combinedScore'>>();
  for (const row of vectorResults.rows) {
    merged.set(row.id, {
      chunkId: row.id,
      documentId: row.document_id,
      content: row.content,
      metadata: row.metadata,
      vectorScore: row.score,
      keywordScore: 0,
    });
  }
  for (const row of keywordResults.rows) {
    const existing = merged.get(row.id);
    if (existing) {
      existing.keywordScore = row.score;
    } else {
      merged.set(row.id, {
        chunkId: row.id,
        documentId: row.document_id,
        content: row.content,
        metadata: row.metadata,
        vectorScore: 0,
        keywordScore: row.score,
      });
    }
  }

  // Return top K by combined RRF score.
  return Array.from(merged.entries())
    .map(([id, result]) => ({
      ...result,
      combinedScore: combinedScores.get(id) ?? 0,
    }))
    .sort((a, b) => b.combinedScore - a.combinedScore)
    .slice(0, topK);
}
Reranking and Generation
After retrieval, rerank with Cohere and generate
a response with citations.
// src/rag/generate.ts
import Anthropic from '@anthropic-ai/sdk';
import { CohereClient } from 'cohere-ai';
const anthropic = new Anthropic();
const cohere = new CohereClient({ token: process.env.COHERE_API_KEY });
/**
 * End-to-end RAG answer: hybrid retrieve → Cohere rerank → Claude generation.
 * Returns the generated answer plus short source previews for attribution.
 *
 * @param query    the user's question
 * @param options  topKRetrieve: candidate pool size; topKRerank: chunks kept
 *                 after reranking and placed in the prompt context
 */
export async function ragQuery(
  query: string,
  options = { topKRetrieve: 20, topKRerank: 5 },
): Promise<{ answer: string; sources: Array<{ content: string; source: string }> }> {
  // Stage 1: pull a wide candidate set via hybrid (vector + keyword) retrieval.
  const candidates = await hybridRetrieve(query, { topK: options.topKRetrieve });

  // Stage 2: narrow to the most relevant chunks with the reranker
  // (semantic relevance, not just embedding similarity).
  const rerankResponse = await cohere.rerank({
    model: 'rerank-v3.5',
    query,
    documents: candidates.map(c => c.content),
    topN: options.topKRerank,
  });
  const topChunks = rerankResponse.results.map(result => candidates[result.index]);

  // Stage 3: assemble the context block, labeling each chunk for citation.
  const context = topChunks
    .map((chunk, i) => `[Source ${i + 1}: ${chunk.metadata.source ?? chunk.documentId}]\n${chunk.content}`)
    .join('\n\n---\n\n');

  // Stage 4: generate a grounded answer with Claude.
  const response = await anthropic.messages.create({
    model: 'claude-sonnet-4-6',
    max_tokens: 1024,
    system: `You are a helpful assistant that answers questions based on the provided context.
Rules:
- Only answer based on the provided context. If the context doesn't contain enough information, say so.
- Cite your sources using [Source N] notation when making specific claims.
- If multiple sources say different things, acknowledge the discrepancy.
- Be concise and direct.`,
    messages: [
      {
        role: 'user',
        content: `Context:\n${context}\n\nQuestion: ${query}`,
      },
    ],
  });

  const firstBlock = response.content[0];
  const answer = firstBlock.type === 'text' ? firstBlock.text : '';

  return {
    answer,
    sources: topChunks.map(chunk => ({
      content: chunk.content.slice(0, 200) + '...', // Preview
      source: String(chunk.metadata.source ?? chunk.documentId),
    })),
  };
}
RAG Evaluation
How do I know if my RAG pipeline is actually working well?
Set up evaluation metrics.
// src/rag/evaluation.ts
// RAGAS-style metrics without the library dependency
// One evaluation example: the question asked, the generated answer, and the
// context passages that were given to the generator.
interface EvalSample {
  question: string;      // original user question
  answer: string;        // answer produced by the RAG pipeline
  contexts: string[];    // retrieved chunks that were in the prompt
  groundTruth?: string;  // optional reference answer (not used by the current metrics)
}
// Faithfulness: is the answer supported by the retrieved context?
// An LLM judge classifies each claim in the answer; the score is the
// fraction of claims that are SUPPORTED. Returns 0 when the judge's
// reply cannot be parsed or contains no claims.
async function evaluateFaithfulness(sample: EvalSample): Promise<number> {
  const response = await anthropic.messages.create({
    model: 'claude-haiku-4-5-20251001', // cheap model for eval
    max_tokens: 512,
    messages: [{
      role: 'user',
      content: `Given these context passages:
${sample.contexts.join('\n\n')}
And this answer:
${sample.answer}
Identify each claim in the answer. For each claim, determine if it is:
1. SUPPORTED - directly stated or clearly implied by the context
2. CONTRADICTED - contradicts information in the context
3. UNVERIFIABLE - cannot be verified from the context alone
Respond with JSON: {"supported": N, "contradicted": N, "unverifiable": N}`,
    }],
  });

  const firstBlock = response.content[0];
  const raw = firstBlock.type === 'text' ? firstBlock.text : '{}';
  try {
    // Extract the first {...} span in case the judge adds surrounding prose.
    const counts = JSON.parse(raw.match(/\{[^}]+\}/)?.[0] ?? '{}');
    const supported = counts.supported ?? 0;
    const total = supported + (counts.contradicted ?? 0) + (counts.unverifiable ?? 0);
    return total > 0 ? supported / total : 0;
  } catch {
    return 0;
  }
}
// Answer Relevance: does the answer address the question?
// Reverse-engineer candidate questions from the answer, then compare them to
// the original question (RAGAS-style; embedding similarity approximated here
// by word overlap). Returns 0 when the judge's reply cannot be parsed.
async function evaluateAnswerRelevance(sample: EvalSample): Promise<number> {
  const response = await anthropic.messages.create({
    model: 'claude-haiku-4-5-20251001',
    max_tokens: 256,
    messages: [{
      role: 'user',
      content: `Based on this answer: "${sample.answer}"
Generate 3 questions that this answer is responding to.
Respond with JSON: {"questions": ["q1", "q2", "q3"]}`,
    }],
  });

  const text = response.content[0].type === 'text' ? response.content[0].text : '{}';

  // Parse defensively — previously a malformed model reply threw out of this
  // function and aborted the whole Promise.all batch in evaluateRAG
  // (evaluateFaithfulness already guards its parse the same way).
  let questions: string[] = [];
  try {
    const parsed = JSON.parse(text.match(/\{[\s\S]+\}/)?.[0] ?? '{}');
    if (Array.isArray(parsed.questions)) {
      questions = parsed.questions.filter((q: unknown): q is string => typeof q === 'string');
    }
  } catch {
    return 0;
  }

  // Jaccard-style word overlap between the original question and each
  // generated question, averaged.
  const queryWords = new Set(sample.question.toLowerCase().split(/\W+/));
  const similarities = questions.map(q => {
    const qWords = new Set(q.toLowerCase().split(/\W+/));
    const intersection = [...queryWords].filter(w => qWords.has(w)).length;
    return intersection / Math.max(queryWords.size, qWords.size);
  });
  return similarities.reduce((a, b) => a + b, 0) / (similarities.length || 1);
}
/**
 * Run faithfulness + answer-relevance over a test set and report per-sample
 * results plus averages. All samples evaluate concurrently (two judge calls
 * per sample — mind rate limits for large test sets).
 *
 * Fix: an empty test set previously produced NaN averages (0 / 0).
 */
export async function evaluateRAG(testSet: EvalSample[]): Promise<{
  avgFaithfulness: number;
  avgRelevance: number;
  results: Array<{ question: string; faithfulness: number; relevance: number }>;
}> {
  if (testSet.length === 0) {
    return { avgFaithfulness: 0, avgRelevance: 0, results: [] };
  }
  const results = await Promise.all(
    testSet.map(async (sample) => ({
      question: sample.question,
      faithfulness: await evaluateFaithfulness(sample),
      relevance: await evaluateAnswerRelevance(sample),
    })),
  );
  // Averages over the per-sample scores.
  const mean = (xs: number[]): number => xs.reduce((s, x) => s + x, 0) / xs.length;
  return {
    avgFaithfulness: mean(results.map(r => r.faithfulness)),
    avgRelevance: mean(results.map(r => r.relevance)),
    results,
  };
}
For building more complex multi-step LLM agents beyond RAG, see the LLM agents guide. For storing vector embeddings at scale in PostgreSQL, the PostgreSQL advanced guide covers pgvector index tuning and query optimization. The Claude Skills 360 bundle includes RAG skill sets for chunking strategies, hybrid retrieval pipelines, and evaluation frameworks. Start with the free tier to try document ingestion and query generation prompts.