LiteLLM calls 100+ LLMs with one unified API. pip install litellm. from litellm import completion. response = completion(model="gpt-4o", messages=[{"role":"user","content":"Hello"}]). Anthropic: completion(model="claude-sonnet-4-6", messages=...). Gemini: completion(model="gemini/gemini-1.5-pro", messages=...). Cohere: completion(model="command-r-plus", messages=...). Ollama: completion(model="ollama/llama3", messages=..., api_base="http://localhost:11434"). Access response: response.choices[0].message.content. response.usage.total_tokens. Streaming: for chunk in completion(model="gpt-4o", messages=messages, stream=True): print(chunk.choices[0].delta.content or ""). Async: from litellm import acompletion, response = await acompletion(model=..., messages=...). Cost: from litellm import completion_cost, cost = completion_cost(completion_response=response). Router with load balancing: from litellm import Router, router = Router(model_list=[{"model_name":"gpt-4","litellm_params":{"model":"gpt-4","api_key":"sk-..."}},{"model_name":"gpt-4","litellm_params":{"model":"azure/gpt-4","api_base":"...","api_key":"..."}}]). router.completion(model="gpt-4", messages=messages) — round-robins. Fallbacks: completion(model="gpt-4", messages=messages, fallbacks=["claude-sonnet-4-6","gemini/gemini-1.5-pro"]). Budget manager: from litellm import BudgetManager, bm = BudgetManager(project_name="myapp"), bm.create_budget(total_budget=10.0, user="user-123", duration="monthly"), bm.is_valid_user("user-123"). Proxy server: litellm --model claude-sonnet-4-6 --port 8000 starts OpenAI-compatible proxy. litellm --config litellm_config.yaml for multi-provider configuration. Claude Code generates LiteLLM routing logic, fallback chains, Router configs, budget management, and proxy YAML configs.
CLAUDE.md for LiteLLM
## LiteLLM Stack
- Version: litellm >= 1.40
- Unified: completion(model="provider/model-name", messages=[...]) — same API for all providers
- Models: gpt-4o, claude-sonnet-4-6, gemini/gemini-1.5-pro, ollama/llama3, vertex_ai/gemini-pro
- Streaming: completion(..., stream=True) → for chunk in response: chunk.choices[0].delta.content
- Async: await acompletion(model, messages)
- Router: Router(model_list=[{model_name, litellm_params}]) → router.completion(...)
- Fallback: completion(model, messages, fallbacks=["model2","model3"])
- Cost: completion_cost(completion_response=response) → float in USD
LiteLLM Routing and Proxy
# llm/litellm_router.py — multi-provider LLM routing with fallbacks and cost tracking
from __future__ import annotations
import asyncio
import os
from typing import Any, Generator
import litellm
from litellm import Router, acompletion, completion, completion_cost
# Configure LiteLLM global settings
litellm.set_verbose = False
litellm.drop_params = True # Drop unsupported params instead of raising errors
# ── 1. Simple unified completion ──────────────────────────────────────────────
def chat(
    prompt: str,
    model: str = "claude-sonnet-4-6",
    temperature: float = 0.7,
    max_tokens: int = 1024,
    system: str = "You are a helpful assistant.",
) -> str:
    """Issue one chat completion through LiteLLM's unified API.

    The ``model`` string selects the provider/backend; everything else
    is identical regardless of which provider serves the request.
    Returns the assistant message text.
    """
    response = completion(
        model=model,
        messages=[
            {"role": "system", "content": system},
            {"role": "user", "content": prompt},
        ],
        temperature=temperature,
        max_tokens=max_tokens,
    )
    return response.choices[0].message.content
def stream_chat(
    prompt: str,
    model: str = "gpt-4o",
) -> Generator[str, None, None]:
    """Yield response text incrementally from a streaming completion.

    Provider-agnostic: any LiteLLM-supported model string works.
    Empty/None deltas (e.g. role-only chunks) are skipped.
    """
    stream = completion(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        stream=True,
    )
    for chunk in stream:
        piece = chunk.choices[0].delta.content
        if piece:
            yield piece
# ── 2. Router with load balancing ─────────────────────────────────────────────
def build_production_router() -> Router:
    """Build the production Router over three named tiers.

    The Router provides:
    - Load balancing across same-named deployments (strategy below)
    - Health checks + automatic failover
    - Per-deployment rate limits (rpm/tpm) and retry handling

    Tiers: "fast" (Haiku / gpt-4o-mini), "quality" (Sonnet / gpt-4o),
    "local" (Ollama, no API cost).
    """
    anthropic_key = os.environ.get("ANTHROPIC_API_KEY", "")
    openai_key = os.environ.get("OPENAI_API_KEY", "")

    model_list = [
        # "fast" tier, deployment 1: Anthropic Claude Haiku (direct API)
        {
            "model_name": "fast",
            "litellm_params": {
                "model": "claude-haiku-4-5-20251001",
                "api_key": anthropic_key,
            },
        },
        # "fast" tier, deployment 2: OpenAI gpt-4o-mini
        {
            "model_name": "fast",
            "litellm_params": {
                "model": "gpt-4o-mini",
                "api_key": openai_key,
            },
        },
        # "quality" tier, deployment 1: Claude Sonnet with explicit rate limits
        {
            "model_name": "quality",
            "litellm_params": {
                "model": "claude-sonnet-4-6",
                "api_key": anthropic_key,
                "rpm": 500,        # requests per minute
                "tpm": 100_000,    # tokens per minute
            },
        },
        # "quality" tier, deployment 2: GPT-4o
        {
            "model_name": "quality",
            "litellm_params": {
                "model": "gpt-4o",
                "api_key": openai_key,
            },
        },
        # "local" tier: Ollama — free, runs on localhost
        {
            "model_name": "local",
            "litellm_params": {
                "model": "ollama/llama3",
                "api_base": "http://localhost:11434",
            },
        },
    ]

    return Router(
        model_list=model_list,
        routing_strategy="latency-based-routing",  # "least-busy" | "simple-shuffle" | "cost-based-routing"
        # Tier-level failover: fast → quality → local
        fallbacks=[{"fast": ["quality"]}, {"quality": ["local"]}],
        allowed_fails=3,
        retry_after=5,
        set_verbose=False,
    )
# Module-level singleton; built lazily on first use.
_router: Router | None = None


def get_router() -> Router:
    """Return the process-wide Router, constructing it on first call."""
    global _router
    if _router is None:
        _router = build_production_router()
    return _router
def routed_chat(prompt: str, tier: str = "fast") -> tuple[str, float]:
    """Send the prompt through the named router tier.

    Returns ``(response_text, cost_usd)`` where cost is computed by
    LiteLLM from the response's token usage.
    """
    response = get_router().completion(
        model=tier,
        messages=[{"role": "user", "content": prompt}],
        max_tokens=1024,
    )
    answer = response.choices[0].message.content
    spend = completion_cost(completion_response=response)
    return answer, spend
# ── 3. Fallback chain ─────────────────────────────────────────────────────────
def chat_with_fallback(prompt: str) -> tuple[str, str]:
    """Call the primary model, falling back through cheaper providers.

    Returns ``(response_text, model_used)`` so callers can see which
    model actually answered.
    """
    # Providers tried in order if the primary call fails.
    fallback_models = [
        "claude-sonnet-4-6",
        "gpt-4o",
        "gemini/gemini-1.5-pro",
    ]
    # Swap to a smaller model when the prompt exceeds the context window.
    long_context_swaps = {
        "claude-opus-4-6": "claude-sonnet-4-6",
        "gpt-4o": "gpt-4o-mini",
    }
    response = completion(
        model="claude-opus-4-6",
        messages=[{"role": "user", "content": prompt}],
        max_tokens=1024,
        fallbacks=fallback_models,
        context_window_fallback_dict=long_context_swaps,
        num_retries=2,
        timeout=30.0,
    )
    return response.choices[0].message.content, response.model
# ── 4. Cost tracking ──────────────────────────────────────────────────────────
class CostTracker:
    """Per-user spend tracking built on LiteLLM's cost utilities.

    NOTE(review): despite its name, ``budget_per_call`` is compared
    against a user's *cumulative* spend in ``complete``, so it behaves
    as a per-user total cap — confirm the intended semantics.
    """

    def __init__(self, budget_per_call: float = 0.10):
        # Spend ceiling checked before each call (see class note).
        self.budget = budget_per_call
        # Accumulated USD spend keyed by user id.
        self.totals: dict[str, float] = {}

    def complete(self, user_id: str, prompt: str, model: str = "claude-sonnet-4-6") -> str:
        """Run one completion for ``user_id`` and record its cost.

        Raises ValueError when the user's accumulated spend has already
        reached the budget; the LiteLLM ``user`` field is set for
        provider-side attribution.
        """
        spent_so_far = self.totals.get(user_id, 0.0)
        if spent_so_far >= self.budget:
            raise ValueError(f"User {user_id} has exceeded budget: ${spent_so_far:.4f}")
        response = completion(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            user=user_id,
        )
        call_cost = completion_cost(completion_response=response)
        self.totals[user_id] = spent_so_far + call_cost
        return response.choices[0].message.content

    def get_spend(self, user_id: str) -> float:
        """Return accumulated spend for ``user_id`` (0.0 if unseen)."""
        return self.totals.get(user_id, 0.0)

    def summary(self) -> dict[str, Any]:
        """Return aggregate spend, the per-user breakdown, and the budget."""
        return {
            "total_spend": sum(self.totals.values()),
            "per_user": dict(self.totals),
            "budget_per_call": self.budget,
        }
# ── 5. Async batch completions ────────────────────────────────────────────────
async def batch_completions(
    prompts: list[str],
    model: str = "claude-haiku-4-5-20251001",
    concurrency: int = 10,
) -> list[str]:
    """Run many completions concurrently with at most ``concurrency`` in flight.

    Results are returned in prompt order; a failed call is replaced with
    an "Error: ..." string instead of raising.
    """
    gate = asyncio.Semaphore(concurrency)

    async def one(prompt: str) -> str:
        # Semaphore caps concurrent in-flight API calls.
        async with gate:
            resp = await acompletion(
                model=model,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=512,
            )
            return resp.choices[0].message.content

    outcomes = await asyncio.gather(
        *(one(p) for p in prompts), return_exceptions=True
    )
    return [f"Error: {r}" if isinstance(r, Exception) else r for r in outcomes]
# ── 6. Proxy config (YAML for litellm proxy server) ──────────────────────────
# Example config for the standalone LiteLLM proxy. The `os.environ/NAME`
# syntax tells the proxy to read the value from an environment variable at
# startup rather than embedding secrets in the file. Kept as a module-level
# string so it can be written to disk or printed as a template.
PROXY_CONFIG = """
# litellm_config.yaml — start with: litellm --config litellm_config.yaml
model_list:
- model_name: claude-fast
litellm_params:
model: claude-haiku-4-5-20251001
api_key: os.environ/ANTHROPIC_API_KEY
- model_name: claude-quality
litellm_params:
model: claude-sonnet-4-6
api_key: os.environ/ANTHROPIC_API_KEY
- model_name: gpt-fallback
litellm_params:
model: gpt-4o-mini
api_key: os.environ/OPENAI_API_KEY
router_settings:
routing_strategy: latency-based-routing
fallbacks:
- {"claude-quality": ["gpt-fallback"]}
litellm_settings:
drop_params: true
success_callback: ["langfuse"] # Auto-log all calls to Langfuse
failure_callback: ["langfuse"]
general_settings:
master_key: os.environ/LITELLM_MASTER_KEY
database_url: os.environ/DATABASE_URL
# Access at http://localhost:4000/v1/chat/completions (OpenAI-compatible)
"""
if __name__ == "__main__":
    # Smoke test: one routed call, then a small concurrent batch.
    text, cost = routed_chat("What is the capital of France?", tier="fast")
    print(f"Response: {text}\nCost: ${cost:.6f}")

    topics = ["Python", "Go", "Rust"]
    prompts = [f"Sentence {i}: explain {topic}" for i, topic in enumerate(topics)]
    responses = asyncio.run(batch_completions(prompts))
    for p, r in zip(prompts, responses):
        print(f"Q: {p[:40]}...\nA: {r[:80]}...\n")
If you call only one LLM provider and want the thinnest possible dependency, using the anthropic or openai SDK directly has zero overhead; LiteLLM, by contrast, removes vendor lock-in and enables A/B testing across providers without changing application code, making it the right choice for any application that might switch providers or run evaluations comparing model quality. OpenRouter offers an alternative: a single API key that routes to multiple providers through a hosted third-party aggregator. LiteLLM instead runs in your own infrastructure and gives full control over keys, budgets, and routing logic without sending API keys to a third party. The Claude Skills 360 bundle includes LiteLLM skill sets covering unified completion, Router load balancing, fallback chains, cost tracking, async batch, and proxy YAML configs. Start with the free tier to try LLM gateway code generation.