Vector databases power semantic search, RAG (retrieval-augmented generation), recommendation engines, and duplicate detection. The pattern: convert content to embeddings (dense float arrays), store them in a vector index, then find nearest neighbors by cosine or dot product similarity. Claude Code writes the embedding pipelines, vector store integrations, hybrid search logic, and chunking strategies that make retrieval reliable in production.
CLAUDE.md for Vector Database Projects
## Vector Stack
- Embeddings: text-embedding-3-small (1536 dims) or nomic-embed-text (768 dims)
- Vector stores: pgvector (Postgres — for existing RDS), Pinecone (managed, scale)
- Chunking: 512 tokens, 50-token overlap; semantic chunking for long docs
- Search: hybrid = dense vector (0.7 weight) + BM25 keyword (0.3 weight)
- Metadata filtering: always index category/tenant/date for pre-filtering
- Re-ranking: Cohere rerank API after top-20 retrieval before top-5 selection
- Batch embedding: process 100-500 texts per API call for efficiency
pgvector Setup and Queries
-- Enable pgvector extension
-- (provides the vector column type and the <=> / <-> distance operators)
CREATE EXTENSION IF NOT EXISTS vector;
-- Documents table with embedding column
CREATE TABLE documents (
id UUID PRIMARY KEY DEFAULT gen_random_uuid(),
content TEXT NOT NULL,
embedding vector(1536), -- OpenAI text-embedding-3-small
metadata JSONB NOT NULL DEFAULT '{}',
source_url TEXT,
created_at TIMESTAMPTZ NOT NULL DEFAULT NOW()
);
-- HNSW index for fast approximate nearest neighbor search
-- ef_construction=128 is a good default; higher = more accurate, slower to build
-- NOTE: vector_cosine_ops must match the <=> operator used in the query below,
-- otherwise the planner cannot use this index
CREATE INDEX documents_embedding_idx ON documents
USING hnsw (embedding vector_cosine_ops)
WITH (m=16, ef_construction=128);
-- Index JSONB metadata for filtering
CREATE INDEX documents_metadata_idx ON documents USING GIN (metadata);
-- Semantic search: find 10 most similar documents
-- $1 = query embedding as vector, $2 = metadata filter
-- <=> is cosine DISTANCE, so 1 - distance converts it to similarity;
-- ordering by the raw operator (not the alias) lets HNSW drive the scan
SELECT
id,
content,
metadata,
1 - (embedding <=> $1::vector) AS similarity
FROM documents
WHERE metadata->>'category' = $2
ORDER BY embedding <=> $1::vector
LIMIT 10;
Embedding Generation Pipeline
# embeddings/pipeline.py
from openai import OpenAI
from anthropic import Anthropic
import numpy as np
from typing import Sequence
# Shared module-level client reused by embed_texts for every batch call.
openai_client = OpenAI()
def embed_texts(texts: list[str], model: str = "text-embedding-3-small") -> list[list[float]]:
    """Embed a batch of texts. Handles the 2048-item limit per call."""
    results: list[list[float]] = []
    per_call = 100  # stay well within the API's per-request item limit
    for start in range(0, len(texts), per_call):
        # Newlines degrade embedding quality, so collapse them before sending.
        cleaned = [t.replace("\n", " ").strip() for t in texts[start:start + per_call]]
        response = openai_client.embeddings.create(
            input=cleaned,
            model=model,
        )
        results.extend(item.embedding for item in response.data)
    return results
def cosine_similarity(a: list[float], b: list[float]) -> float:
    """Compute cosine similarity between two vectors.

    Args:
        a: First vector.
        b: Second vector (same length as ``a``).

    Returns:
        Cosine similarity in [-1, 1]. Returns 0.0 when either vector has
        zero norm — the previous implementation divided by zero there and
        returned NaN, which silently poisoned downstream score math.
    """
    a_arr = np.asarray(a, dtype=float)
    b_arr = np.asarray(b, dtype=float)
    norm_product = np.linalg.norm(a_arr) * np.linalg.norm(b_arr)
    if norm_product == 0.0:
        # A zero vector has no direction; define its similarity as 0.
        return 0.0
    return float(np.dot(a_arr, b_arr) / norm_product)
# Chunk documents for embedding
def chunk_text(text: str, chunk_size: int = 512, overlap: int = 50) -> list[str]:
    """Split text into overlapping chunks by word count.

    Args:
        text: Source text; split on whitespace.
        chunk_size: Maximum words per chunk. Must be positive.
        overlap: Words shared between consecutive chunks. Must be
            smaller than ``chunk_size``.

    Returns:
        List of chunk strings; empty list for empty/whitespace-only text.

    Raises:
        ValueError: If ``chunk_size <= 0`` or ``overlap >= chunk_size``
            (the old code silently returned [] or raised a confusing
            range() error for those inputs).
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be positive")
    if overlap >= chunk_size:
        raise ValueError("overlap must be smaller than chunk_size")
    words = text.split()
    chunks = []
    step = chunk_size - overlap
    for i in range(0, len(words), step):
        chunk = " ".join(words[i:i + chunk_size])
        if chunk:
            chunks.append(chunk)
        # Stop once a chunk reaches the end of the text; without this the
        # old code emitted a redundant suffix chunk whenever
        # step < len(words) <= chunk_size (duplicate content in the index).
        if i + chunk_size >= len(words):
            break
    return chunks
Pinecone Integration
# embeddings/pinecone_store.py
from pinecone import Pinecone, ServerlessSpec
import hashlib
import os  # BUG FIX: os.environ was used below without importing os

# Module-level client; requires PINECONE_API_KEY in the environment.
pc = Pinecone(api_key=os.environ["PINECONE_API_KEY"])
def get_or_create_index(name: str, dimension: int = 1536) -> object:
    """Return the named Pinecone index, creating it first if it doesn't exist."""
    existing = pc.list_indexes().names()
    if name not in existing:
        # Serverless index on AWS us-east-1; cosine matches our embedding model.
        serverless = ServerlessSpec(cloud="aws", region="us-east-1")
        pc.create_index(
            name=name,
            dimension=dimension,
            metric="cosine",
            spec=serverless,
        )
    return pc.Index(name)
# Module-level index handle shared by upsert_documents and semantic_search.
index = get_or_create_index("documents")
def upsert_documents(documents: list[dict]):
    """Embed each document's content and upsert the vectors into Pinecone."""
    embeddings = embed_texts([doc["content"] for doc in documents])
    vectors = []
    for doc, vec in zip(documents, embeddings):
        # Content-addressed id: identical text always maps to the same vector,
        # so re-ingesting a document overwrites rather than duplicates it.
        digest = hashlib.sha256(doc["content"].encode()).hexdigest()[:16]
        metadata = {
            "content": doc["content"][:1000],  # truncated copy kept for retrieval
            "source": doc.get("source", ""),
            "category": doc.get("category", "general"),
        }
        vectors.append({"id": digest, "values": vec, "metadata": metadata})
    # Upsert in batches of 100 to keep request sizes modest.
    for start in range(0, len(vectors), 100):
        index.upsert(vectors=vectors[start:start + 100])
def semantic_search(
    query: str,
    top_k: int = 10,
    category: str | None = None,
) -> list[dict]:
    """Return the top_k stored documents most similar to ``query``.

    When ``category`` is provided, Pinecone pre-filters matches to that
    category before ranking by similarity.
    """
    [query_embedding] = embed_texts([query])
    # Pinecone expects either a filter expression or None (not an empty dict).
    metadata_filter = {"category": {"$eq": category}} if category else None
    response = index.query(
        vector=query_embedding,
        top_k=top_k,
        include_metadata=True,
        filter=metadata_filter,
    )
    hits = []
    for match in response.matches:
        hits.append({
            "id": match.id,
            "score": match.score,
            "content": match.metadata.get("content", ""),
            "source": match.metadata.get("source", ""),
        })
    return hits
Hybrid Search (Dense + BM25)
# embeddings/hybrid_search.py — combine vector and keyword search
from rank_bm25 import BM25Okapi
import numpy as np
class HybridSearcher:
    """Hybrid retriever that blends dense-vector and BM25 keyword scores.

    Scores from both retrievers are min-max normalized to [0, 1] and
    combined as ``alpha * dense + (1 - alpha) * bm25``.
    """

    def __init__(self, documents: list[dict], alpha: float = 0.7):
        """
        alpha: weight for dense vector search (1-alpha for BM25)
        """
        self.documents = documents
        self.alpha = alpha
        # Build BM25 index over naive lowercase whitespace tokens.
        tokenized = [doc["content"].lower().split() for doc in documents]
        self.bm25 = BM25Okapi(tokenized)
        # Embed all documents up front so search() only embeds the query.
        texts = [doc["content"] for doc in documents]
        self.embeddings = embed_texts(texts)

    @staticmethod
    def _minmax(scores: np.ndarray) -> np.ndarray:
        """Min-max normalize to [0, 1]; a constant array maps to all zeros.

        BUG FIX: the previous guard (``max() > 0``) divided by zero (NaN
        scores) when every score was equal, and skipped normalization
        entirely when all cosine scores were negative. Comparing max to
        min handles both cases; an empty array passes through unchanged.
        """
        if scores.size == 0:
            return scores.astype(float)
        lo = scores.min()
        hi = scores.max()
        if hi > lo:
            return (scores - lo) / (hi - lo)
        return np.zeros_like(scores, dtype=float)

    def search(self, query: str, top_k: int = 5) -> list[dict]:
        """Return up to top_k documents ranked by the weighted hybrid score."""
        # Dense scores: cosine similarity of the query vs. each stored embedding.
        [query_embedding] = embed_texts([query])
        dense = np.array([
            cosine_similarity(query_embedding, doc_emb)
            for doc_emb in self.embeddings
        ])
        # BM25 keyword scores over the same tokenization used at index time.
        bm25 = np.array(self.bm25.get_scores(query.lower().split()))
        # Normalize both to [0, 1] so the weighted sum is meaningful.
        dense = self._minmax(dense)
        bm25 = self._minmax(bm25)
        combined = self.alpha * dense + (1 - self.alpha) * bm25
        top_indices = combined.argsort()[-top_k:][::-1]
        return [
            {**self.documents[i], "score": float(combined[i])}
            for i in top_indices
        ]
RAG Chain with Claude
# rag/chain.py — retrieval-augmented generation
from anthropic import Anthropic
# Shared module-level client used by rag_answer below.
client = Anthropic()
def rag_answer(question: str, searcher: HybridSearcher) -> str:
    """Answer ``question`` grounded in documents retrieved by ``searcher``.

    Returns a canned fallback string when no retrieved document clears
    the relevance threshold.
    """
    # 1. Retrieve candidate documents via hybrid search.
    hits = searcher.search(question, top_k=5)
    # 2. Keep only sufficiently relevant hits and render them as context.
    relevant = (r for r in hits if r['score'] > 0.3)
    context = "\n\n---\n\n".join(
        f"Source: {r['source']}\n{r['content']}" for r in relevant
    )
    if not context:
        return "I don't have enough relevant information to answer that question."
    # 3. Ask Claude, constrained to the retrieved context only.
    system_prompt = """You are a helpful assistant. Answer questions using ONLY the provided context.
If the context doesn't contain the answer, say so explicitly.
Always cite which source your answer comes from."""
    response = client.messages.create(
        model="claude-sonnet-4-6",
        max_tokens=1024,
        system=system_prompt,
        messages=[{
            "role": "user",
            "content": f"Context:\n{context}\n\nQuestion: {question}"
        }],
    )
    return response.content[0].text
For the full RAG pipeline with document ingestion, chunking strategies, and evaluation, the RAG guide covers end-to-end retrieval architecture. For the LLM evaluation patterns that measure RAG answer quality, the LLM evals guide covers LLM-as-judge scoring for retrieval quality. The Claude Skills 360 bundle includes vector database skill sets covering pgvector setup, Pinecone integration, and hybrid search patterns. Start with the free tier to try semantic search pipeline generation.