Langfuse traces and monitors LLM applications in production. pip install langfuse. from langfuse import Langfuse, lf = Langfuse(public_key="pk-...", secret_key="sk-...", host="https://cloud.langfuse.com"). Trace: trace = lf.trace(name="rag-pipeline", user_id="user-123", session_id="sess-abc", input={"query": "..."}). Span: span = trace.span(name="retrieval", input={"query": q}), span.end(output={"docs": results}). Generation: gen = trace.generation(name="llm-call", model="claude-sonnet-4-6", model_parameters={"temperature": 0.7}, input=messages, usage={"input": prompt_tokens, "output": completion_tokens, "unit": "TOKENS"}), gen.end(output=response_text). @observe() decorator: from langfuse.decorators import observe, langfuse_context, @observe()\ndef my_fn(x): langfuse_context.update_current_observation(metadata={"x": x}). LangChain: from langfuse.callback import CallbackHandler, handler = CallbackHandler(), pass as config={"callbacks": [handler]} or to chain’s .invoke. LlamaIndex: from llama_index.callbacks.langfuse import LlamaIndexCallbackHandler. Prompt management: prompt = lf.get_prompt("rag-system-prompt"), compiled = prompt.compile(context=ctx). Scoring: lf.score(trace_id=trace.id, name="user_feedback", value=1.0, comment="helpful"). Datasets: dataset = lf.get_dataset("eval-set"), iterate for item in dataset.items: run_and_score(item). lf.flush() before shutdown. TypeScript SDK: import Langfuse from "langfuse". Claude Code generates Langfuse tracing wrappers, LangChain callback integration, prompt versioning, evaluation pipelines, and cost tracking dashboards.
CLAUDE.md for Langfuse
## Langfuse Stack
- Version: langfuse >= 2.0
- Init: Langfuse(public_key, secret_key, host) — or LANGFUSE_PUBLIC_KEY/SECRET_KEY env vars
- Trace → Span → Generation hierarchy; call .end(output=...) to close each
- @observe() decorator: wraps functions, auto-creates spans with I/O capture
- LangChain: CallbackHandler() passed in config={callbacks:[...]} to any chain/llm
- Prompts: lf.get_prompt(name) → prompt.compile(**variables) for versioned prompts
- Score: lf.score(trace_id, name, value) — custom evaluation metrics per trace
- Flush: lf.flush() at app shutdown or after batch jobs
## Langfuse Tracing Pipeline
# observability/langfuse_tracing.py — LLM observability with Langfuse
from __future__ import annotations
import os
import time
from typing import Any
from langfuse import Langfuse
from langfuse.decorators import langfuse_context, observe
# Init from environment variables (LANGFUSE_PUBLIC_KEY, LANGFUSE_SECRET_KEY, LANGFUSE_HOST)
lf = Langfuse()
# ── 1. Manual trace / span / generation API ───────────────────────────────────
def traced_rag_pipeline(
    query: str,
    user_id: str,
    session_id: str,
) -> str:
    """Run the RAG pipeline while recording a full Langfuse trace.

    Creates one trace containing a retrieval span, a rerank span, and an
    LLM generation, then flushes buffered events before returning.

    Args:
        query: User question to answer.
        user_id: Identifier attached to the trace for per-user analytics.
        session_id: Identifier grouping related traces into a session.

    Returns:
        The LLM response text.

    Raises:
        Re-raises any exception from the pipeline after marking the trace
        as ERROR in Langfuse.
    """
    trace = lf.trace(
        name="rag-pipeline",
        user_id=user_id,
        session_id=session_id,
        input={"query": query},
        tags=["rag", "production"],
        metadata={"app_version": "2.1.0"},
    )
    try:
        # Step 1: vector retrieval, wrapped in its own span.
        span_retrieval = trace.span(
            name="vector-retrieval",
            input={"query": query, "top_k": 5},
        )
        docs = _retrieve_documents(query)
        span_retrieval.end(
            output={"num_docs": len(docs), "doc_ids": [doc["id"] for doc in docs]},
            level="DEFAULT",
        )
        # Step 2: rerank the candidates.
        span_rank = trace.span(name="rerank")
        ranked = _rerank(docs, query)
        top_score = ranked[0]["score"] if ranked else 0
        span_rank.end(output={"top_doc_score": top_score})
        # Step 3: answer generation, recorded as a Langfuse generation
        # so model, parameters, and token usage are captured.
        context = "\n\n".join(doc["text"] for doc in ranked[:3])
        messages = [
            {"role": "system", "content": "Answer based on context only."},
            {"role": "user", "content": f"Context:\n{context}\n\nQuestion: {query}"},
        ]
        gen = trace.generation(
            name="answer-generation",
            model="claude-sonnet-4-6",
            model_parameters={"temperature": 0.3, "max_tokens": 512},
            input=messages,
        )
        response, usage = _call_llm(messages)
        gen.end(
            output=response,
            usage={
                "input": usage["input_tokens"],
                "output": usage["output_tokens"],
                "unit": "TOKENS",
            },
        )
        trace.update(output=response, level="DEFAULT")
        return response
    except Exception as exc:
        # Surface the failure in the Langfuse UI before re-raising.
        trace.update(level="ERROR", status_message=str(exc))
        raise
    finally:
        # Guarantee buffered events reach Langfuse even on failure.
        lf.flush()
# ── 2. Decorator-based tracing (@observe) ─────────────────────────────────────
@observe(name="classify-intent")
def classify_intent(text: str) -> str:
    """Classify user intent; the @observe decorator auto-traces the call."""
    # Record input size on the current observation before doing any work.
    langfuse_context.update_current_observation(
        metadata={"input_length": len(text)},
        tags=["classification"],
    )
    # Simulate classification
    if "?" in text:
        intent = "question"
    else:
        intent = "command"
    langfuse_context.update_current_observation(output={"intent": intent})
    return intent
@observe(name="rag-with-decorators")
def rag_with_decorators(query: str, user_id: str) -> str:
    """Full RAG pipeline where each helper is auto-traced by @observe."""
    # Attach user and tags to the root trace the decorator created.
    langfuse_context.update_current_trace(
        user_id=user_id,
        tags=["rag", "decorator"],
    )
    intent = classify_intent(query)  # nested span, created automatically
    documents = retrieve(query)      # nested span, created automatically
    return generate_answer(query, documents)
@observe(as_type="retrieval")
def retrieve(query: str) -> list[dict]:
    """Fetch candidate documents; traced by @observe.

    NOTE(review): the langfuse v2 @observe docs list "generation" as the
    supported as_type value — confirm this SDK version honors "retrieval"
    rather than silently treating it as a plain span.
    """
    results = _retrieve_documents(query)
    langfuse_context.update_current_observation(output={"count": len(results)})
    return results
@observe(as_type="generation")
def generate_answer(query: str, docs: list[dict]) -> str:
    """Generate an answer with the LLM; traced as a Langfuse generation."""
    snippet = docs[0]["text"][:500] if docs else ""
    messages = [
        {"role": "user", "content": f"{query}\n\nContext: {snippet}"},
    ]
    response, usage = _call_llm(messages)
    # Attach model name and token usage so Langfuse can compute cost.
    langfuse_context.update_current_observation(
        model="claude-sonnet-4-6",
        usage={"input": usage["input_tokens"], "output": usage["output_tokens"], "unit": "TOKENS"},
    )
    return response
# ── 3. LangChain integration ──────────────────────────────────────────────────
def langchain_with_tracing(question: str, user_id: str) -> str:
    """Answer a question via a LangChain chain traced by Langfuse callbacks."""
    from langchain_anthropic import ChatAnthropic
    from langchain_core.output_parsers import StrOutputParser
    from langchain_core.prompts import ChatPromptTemplate
    from langfuse.callback import CallbackHandler

    # One handler per request so user/session metadata is attached correctly.
    handler = CallbackHandler(
        user_id=user_id,
        session_id=f"lc-session-{user_id}",
        trace_name="langchain-qa",
        tags=["langchain"],
    )
    prompt = ChatPromptTemplate.from_messages(
        [
            ("system", "You are a helpful assistant."),
            ("human", "{question}"),
        ]
    )
    llm = ChatAnthropic(model="claude-sonnet-4-6")
    chain = prompt | llm | StrOutputParser()
    # The callback handler auto-traces every step of the chain.
    return chain.invoke({"question": question}, config={"callbacks": [handler]})
# ── 4. Prompt management ──────────────────────────────────────────────────────
def use_managed_prompt(query: str, context: str) -> str:
    """
    Fetch a versioned prompt from Langfuse prompt management.
    Allows non-technical users to update prompts without code deploys.
    """
    # Create prompt via Langfuse UI, then fetch by name:
    # lf.create_prompt(name="rag-answer", prompt="Answer: {{context}}\n\n{{query}}", labels=["production"])
    try:
        managed = lf.get_prompt("rag-answer", label="production")
        return managed.compile(context=context, query=query)
    except Exception:
        # Fallback to hardcoded prompt
        return f"Context: {context}\n\nQuestion: {query}"
# ── 5. Scoring and human feedback ─────────────────────────────────────────────
def log_user_feedback(
    trace_id: str,
    rating: float,  # expected range 0.0-1.0
    comment: str = "",
) -> None:
    """Record user-provided feedback as a numeric Langfuse score on a trace."""
    score_payload = {
        "trace_id": trace_id,
        "name": "user_rating",
        "value": rating,
        "comment": comment,
        "data_type": "NUMERIC",
    }
    lf.score(**score_payload)
def run_llm_eval(
    trace_id: str,
    expected: str,
    actual: str,
) -> float:
    """Auto-evaluate output quality and log it as a Langfuse score.

    Uses a case-insensitive substring check: scores 1.0 when the
    normalized expected answer appears inside the actual output, else 0.0.
    Replace with embedding similarity or LLM-as-judge for real evals.

    Args:
        trace_id: Langfuse trace to attach the score to.
        expected: Reference answer.
        actual: Model output to evaluate.

    Returns:
        The score (0.0 or 1.0) so callers can aggregate it.
    """
    needle = expected.strip().lower()
    # Bug fix: an empty expected string is a substring of everything, so the
    # old check always scored 1.0 — treat it as a failed match instead.
    score = 1.0 if needle and needle in actual.lower() else 0.0
    lf.score(
        trace_id=trace_id,
        name="exact_match",
        value=score,
        data_type="BOOLEAN",
    )
    return score
# ── 6. Dataset evaluation ─────────────────────────────────────────────────────
def run_eval_dataset(dataset_name: str = "qa-eval-set") -> dict[str, float]:
    """Run the evaluation pipeline over a Langfuse dataset and report the mean score."""
    dataset = lf.get_dataset(dataset_name)
    scores: list[float] = []
    for item in dataset.items:
        question = item.input.get("query", "")
        reference = item.expected_output or ""
        # item.observe links this run's trace back to the dataset item.
        with item.observe(run_name="eval-run-v2") as trace_id:
            produced = generate_answer(question, [])
            scores.append(run_llm_eval(trace_id, reference, produced))
    n = len(scores)
    avg = sum(scores) / n if n else 0.0
    print(f"Dataset: {dataset_name} | Items: {n} | Avg Score: {avg:.3f}")
    return {"avg_score": avg, "n_items": n}
# ── Stubs ─────────────────────────────────────────────────────────────────────
def _retrieve_documents(query: str) -> list[dict]:
return [{"id": f"doc-{i}", "text": f"Document {i} relevant to: {query}", "score": 0.9 - i*0.1}
for i in range(5)]
def _rerank(docs: list[dict], query: str) -> list[dict]:
return sorted(docs, key=lambda d: d["score"], reverse=True)
def _call_llm(messages: list[dict]) -> tuple[str, dict]:
return "Simulated LLM response.", {"input_tokens": 150, "output_tokens": 50}
Consider the LangSmith (LangChain’s native tracing) alternative when you are already deeply embedded in the LangChain ecosystem and want tight integration with LangChain Hub prompt versioning: LangSmith’s native integration with every LangChain component auto-traces without configuration. Langfuse, by contrast, is framework-agnostic and works identically with raw Anthropic/OpenAI calls, LlamaIndex, and custom pipelines, making it the better choice when building applications that mix LangChain with other frameworks or direct API calls. Consider the Helicone/Braintrust alternative when you prioritize a simple proxy-based integration that captures all LLM calls by just changing the base URL: proxy-based tools add zero SDK dependencies, whereas Langfuse’s SDK traces multi-step pipelines with spans, sessions, and user tracking that a proxy cannot capture from HTTP headers alone. The Claude Skills 360 bundle includes Langfuse skill sets covering manual trace/span/generation APIs, @observe decorator usage, LangChain callback integration, prompt management, scoring, and dataset evaluation runs. Start with the free tier to try LLM observability code generation.