Cohere specializes in enterprise NLP with best-in-class embeddings and reranking — new CohereClient({ token: apiKey }) initializes the client. cohere.chat({ model: "command-r-plus", message, chatHistory }) generates chat responses. cohere.chatStream({ ... }) streams with for await (const event of stream). Embeddings: cohere.embed({ texts, model: "embed-english-v3.0", inputType: "search_document" }) returns float embeddings. inputType: "search_query" for query embeddings. Rerank: cohere.rerank({ model: "rerank-english-v3.0", query, documents, topN: 5 }) reorders results by relevance. Classification: cohere.classify({ examples, inputs }) few-shot classifies texts. RAG with documents: cohere.chat({ message, documents: [{ id, title, snippet }] }) for grounded responses with citations. Tool use: tools: [{ name, description, parameterDefinitions }] with toolResults for the agentic loop. cohere.tokenize({ text, model }) counts tokens. Claude Code generates Cohere embedding pipelines, hybrid search with reranking, and classified document routing.
CLAUDE.md for Cohere
## Cohere Stack
- Version: cohere-ai >= 7.14
- Init: const cohere = new CohereClient({ token: process.env.COHERE_API_KEY! })
- Chat: const res = await cohere.chat({ model: "command-r-plus", message: prompt })
- Answer: res.text
- Stream: const stream = await cohere.chatStream({ model: "command-r-plus", message }); for await (const event of stream) if (event.eventType === "text-generation") console.log(event.text)
- Embed: const res = await cohere.embed({ texts, model: "embed-english-v3.0", inputType: "search_document" }); const vectors = res.embeddings.float
- Rerank: const res = await cohere.rerank({ model: "rerank-english-v3.0", query, documents, topN: 3 }); res.results[0].index, .relevanceScore
Cohere Client
// lib/cohere/client.ts — Cohere SDK with embeddings, reranking, and chat
import { CohereClient, type Message } from "cohere-ai"
const cohere = new CohereClient({ token: process.env.COHERE_API_KEY! })
// Model identifiers used throughout this module. `as const` keeps each
// value as a literal type so it can flow into narrowed string-union params.
export const MODELS = {
  // Chat models
  CHAT_PLUS: "command-r-plus", // most capable; best suited for RAG
  CHAT: "command-r", // faster and cheaper general-purpose chat
  // Embedding models (both produce 1024-dim vectors)
  EMBED_EN: "embed-english-v3.0",
  EMBED_ML: "embed-multilingual-v3.0", // covers 100+ languages
  // Rerank cross-encoders
  RERANK_EN: "rerank-english-v3.0",
  RERANK_ML: "rerank-multilingual-v3.0",
} as const
/** Embed documents for storage — use "search_document" */
/**
 * Embed documents for vector storage.
 *
 * Uses inputType "search_document" — Cohere v3 embeddings are asymmetric,
 * so documents and queries must be embedded with different input types
 * (see embedQuery below).
 *
 * @param texts Raw document texts to embed. An empty array short-circuits.
 * @returns One float vector per input text, in input order.
 * @throws Error if the API response does not include float embeddings.
 */
export async function embedDocuments(texts: string[]): Promise<number[][]> {
  // Avoid a pointless (and likely rejected) API call for empty input.
  if (texts.length === 0) return []
  const response = await cohere.embed({
    texts,
    model: MODELS.EMBED_EN,
    inputType: "search_document",
    embeddingTypes: ["float"],
  })
  // With embeddingTypes specified, embeddings come back keyed by type.
  // Narrow to the exact shape we read instead of a blanket `any` cast,
  // and fail loudly rather than returning undefined typed as number[][].
  const vectors = (response.embeddings as { float?: number[][] }).float
  if (!vectors) {
    throw new Error("Cohere embed response did not include float embeddings")
  }
  return vectors
}
/** Embed a search query — use "search_query" (different from document embedding) */
/**
 * Embed a search query — uses inputType "search_query", which is deliberately
 * different from the "search_document" type used at indexing time.
 *
 * @param query The natural-language query text.
 * @returns A single float vector for the query.
 * @throws Error if the API response does not include a float embedding.
 */
export async function embedQuery(query: string): Promise<number[]> {
  const response = await cohere.embed({
    texts: [query],
    model: MODELS.EMBED_EN,
    inputType: "search_query",
    embeddingTypes: ["float"],
  })
  // Narrow to the expected shape instead of `as any`, and guard against a
  // missing vector — the original could silently return undefined here.
  const vector = (response.embeddings as { float?: number[][] }).float?.[0]
  if (!vector) {
    throw new Error("Cohere embed response did not include a float embedding for the query")
  }
  return vector
}
/** Rerank a list of documents by relevance to a query */
export async function rerank(
query: string,
documents: Array<{ id?: string; text: string }>,
topN?: number,
): Promise<Array<{ index: number; score: number; document: { text: string } }>> {
const response = await cohere.rerank({
model: MODELS.RERANK_EN,
query,
documents: documents.map((d) => d.text),
topN: topN ?? documents.length,
returnDocuments: true,
})
return response.results.map((r) => ({
index: r.index,
score: r.relevanceScore,
document: { text: documents[r.index].text },
}))
}
/** Chat with conversation history */
export async function chat(
message: string,
options: {
chatHistory?: Message[]
systemPrompt?: string
documents?: Array<{ id: string; title: string; snippet: string }>
model?: "command-r-plus" | "command-r"
temperature?: number
} = {},
): Promise<{ text: string; citations?: any[]; searchResults?: any[] }> {
const { chatHistory = [], systemPrompt, documents, model = MODELS.CHAT, temperature = 0.7 } = options
const response = await cohere.chat({
model,
message,
chatHistory,
...(systemPrompt ? { preamble: systemPrompt } : {}),
...(documents ? { documents } : {}),
temperature,
citationQuality: "accurate",
})
return {
text: response.text,
citations: response.citations,
searchResults: (response as any).documents,
}
}
/** Stream chat response */
export async function* streamChat(
message: string,
chatHistory?: Message[],
model: "command-r-plus" | "command-r" = MODELS.CHAT,
): AsyncGenerator<string> {
const stream = await cohere.chatStream({
model,
message,
chatHistory: chatHistory ?? [],
})
for await (const event of stream) {
if (event.eventType === "text-generation") {
yield event.text
}
}
}
export { cohere }
Hybrid Search Pipeline with Reranking
// lib/cohere/search.ts — BM25 + vector + Cohere rerank pipeline
import { embedQuery, rerank } from "./client"
// A searchable document as stored in the vector index.
export type Document = {
  id: string
  title: string
  text: string
  metadata?: Record<string, unknown>
}
// A document augmented with its rerank relevance score and 1-based final rank.
export type SearchResult = Document & { score: number; rank: number }
/**
 * Two-stage search: broad vector retrieval → Cohere cross-encoder rerank.
 *
 * Stage 1: fetch a wide candidate pool (default 4x the final topK, matching
 *          the candidateK default below — the old comment said 5x, which
 *          disagreed with the code).
 * Stage 2: rerank candidates for precision; keep those above minScore.
 *
 * @param query Natural-language search query.
 * @param options.vectorSearch Caller-supplied retrieval against a vector store.
 * @param options.topK Final result count after reranking (default 8).
 * @param options.candidateK Candidate pool size before reranking (default topK * 4).
 * @param options.minScore Minimum rerank relevance to keep a result (default 0.3).
 * @returns Results with rerank score and 1-based rank, best first.
 */
export async function hybridSearchWithRerank(
  query: string,
  options: {
    vectorSearch: (queryVector: number[], topK: number) => Promise<Document[]>
    topK?: number // Final results after reranking
    candidateK?: number // Candidates to fetch before reranking
    minScore?: number
  },
): Promise<SearchResult[]> {
  const { vectorSearch, topK = 8, candidateK = topK * 4, minScore = 0.3 } = options
  // Stage 1: broad vector retrieval
  const queryVector = await embedQuery(query)
  const candidates = await vectorSearch(queryVector, candidateK)
  if (candidates.length === 0) return []
  // Stage 2: rerank with Cohere's cross-encoder. Title is prepended to the
  // body so the cross-encoder can use it as a relevance signal.
  const reranked = await rerank(
    query,
    candidates.map((d) => ({ id: d.id, text: `${d.title}\n\n${d.text}` })),
    topK,
  )
  // r.index points back into `candidates`; rank is assigned after the score
  // filter so ranks stay contiguous (1..n).
  return reranked
    .filter((r) => r.score >= minScore)
    .map((r, rank) => ({
      ...candidates[r.index],
      score: r.score,
      rank: rank + 1,
    }))
}
/** RAG: retrieve relevant docs and generate a grounded answer with citations */
export async function ragAnswer(
question: string,
documents: Document[],
systemPrompt?: string,
): Promise<{ answer: string; citations: Array<{ text: string; document: string }> }> {
const { chat } = await import("./client")
// Rerank documents by relevance to the question first
const reranked = await rerank(question, documents.map((d) => ({ id: d.id, text: d.text })), 5)
const topDocs = reranked.map((r) => documents[r.index])
const result = await chat(question, {
systemPrompt: systemPrompt ?? "You are a helpful assistant. Answer based only on the provided documents. Cite your sources.",
documents: topDocs.map((d) => ({
id: d.id,
title: d.title,
snippet: d.text.slice(0, 800),
})),
model: "command-r-plus",
})
return {
answer: result.text,
citations: (result.citations ?? []).map((c: any) => ({
text: c.text,
document: c.documentIds?.[0] ?? "unknown",
})),
}
}
Next.js Search API
// app/api/search/route.ts — hybrid search with Cohere reranking
import { NextResponse } from "next/server"
import { z } from "zod"
import { hybridSearchWithRerank } from "@/lib/cohere/search"
import { embedQuery } from "@/lib/cohere/client"
import { searchDocuments } from "@/lib/pinecone" // or any vector DB
// Request body contract: query is 2-500 chars; topK defaults to 8, capped at 20.
const SearchSchema = z.object({
query: z.string().min(2).max(500),
topK: z.number().int().min(1).max(20).default(8),
})
/**
 * POST /api/search — validate the request body, run hybrid search with
 * Cohere reranking, and return trimmed excerpts with relevance scores.
 *
 * Responds 400 with field-level issues on invalid or malformed input
 * instead of letting a ZodError / JSON parse error bubble up as a 500.
 */
export async function POST(req: Request) {
  // Malformed JSON resolves to null, which safeParse then rejects.
  const parsed = SearchSchema.safeParse(await req.json().catch(() => null))
  if (!parsed.success) {
    return NextResponse.json(
      { error: "Invalid request", issues: parsed.error.issues },
      { status: 400 },
    )
  }
  const { query, topK } = parsed.data
  const results = await hybridSearchWithRerank(query, {
    vectorSearch: async (vector, k) => searchDocuments(vector, k),
    topK,
    minScore: 0.4,
  })
  return NextResponse.json({
    results: results.map((r) => ({
      id: r.id,
      title: r.title,
      // Only append an ellipsis when the text was actually truncated.
      excerpt: r.text.length > 300 ? r.text.slice(0, 300) + "…" : r.text,
      score: r.score,
      rank: r.rank,
    })),
  })
}
Consider the Pinecone alternative when a fully managed vector database with namespace isolation, metadata filtering, and sub-millisecond query performance at scale is needed rather than just the embedding/rerank functions — Pinecone handles vector storage and retrieval, while Cohere’s strength is the embedding and reranking models themselves, which can be combined with any vector store; see the Pinecone guide. Consider the OpenAI Embeddings alternative when consolidating on a single OpenAI API key, using the text-embedding-3-small/large models, or fine-tuning embeddings on custom data with OpenAI’s fine-tuning API is preferred — OpenAI embeddings have broader community examples, while Cohere embed-v3 benchmarks competitively on retrieval tasks, especially for multilingual content; see the OpenAI guide. The Claude Skills 360 bundle includes Cohere skill sets covering embeddings, reranking, and RAG pipelines. Start with the free tier to try Cohere’s chat, embedding, and rerank endpoints.