TruLens evaluates and traces LLM applications with the RAG triad. Install with `pip install trulens trulens-providers-openai`. Create a session with `from trulens.core import TruSession; session = TruSession()`, and call `session.reset_database()` to clear previous runs. Providers: `from trulens.providers.openai import OpenAI as TruOpenAI; provider = TruOpenAI(model_engine="gpt-4o-mini")`. Feedback functions: `from trulens.core import Feedback`, then `f_relevance = Feedback(provider.relevance, name="Answer Relevance").on_input_output()`, `f_groundedness = Feedback(provider.groundedness_measure_with_cot_reasons, name="Groundedness").on(context).on_output()`, and `f_context_relevance = Feedback(provider.context_relevance_with_cot_reasons, name="Context Relevance").on_input().on(context)`. Wrap LangChain apps with `from trulens.apps.langchain import TruChain; tru_app = TruChain(chain, app_name="RAG-v1", feedbacks=[f_relevance, f_groundedness, f_context_relevance])`. Record runs with `with tru_app as recording: response = chain.invoke({"question": "What is RAG?"})`, then access the trace via `record = recording.get()`. LlamaIndex: `from trulens.apps.llamaindex import TruLlama; tru_app = TruLlama(query_engine, app_name="LlamaRAG", feedbacks=[...])`. Custom functions: `from trulens.apps.basic import TruBasicApp; tru_app = TruBasicApp(my_llm_fn, app_name="Custom", feedbacks=[...])`. Leaderboard: `session.get_leaderboard()` returns a DataFrame with average scores per app version. Dashboard: `from trulens.dashboard import run_dashboard; run_dashboard(session)` opens at http://localhost:8501. `session.get_records_and_feedback()[0]` returns the records DataFrame. Hugging Face provider: `from trulens.providers.huggingface import Huggingface`, e.g. `Feedback(provider.not_toxic)` for toxicity filtering. Claude Code generates TruLens feedback functions, instrumentation wrappers, RAG triad setups, leaderboard comparisons, and evaluation pipelines for RAG applications.
# CLAUDE.md for TruLens
## TruLens Stack
- Version: trulens >= 1.0, trulens-providers-openai
- Session: TruSession() + session.reset_database() for clean start
- Provider: OpenAI(model_engine="gpt-4o-mini") | Huggingface()
- Feedback: Feedback(provider.method, name="...").on_input_output() | .on(selector).on_output()
- RAG triad: context_relevance + groundedness + answer_relevance
- Wrap: TruChain(chain, app_name, feedbacks) | TruLlama(query_engine, ...) | TruBasicApp(fn, ...)
- Record: with tru_app as recording: response = app(input) → recording.get()
- Compare: session.get_leaderboard() → DataFrame sorted by aggregate score
- Dashboard: run_dashboard(session) → http://localhost:8501
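The stack above in one minimal sketch (a hedged example, not the full pipeline below; assumes `OPENAI_API_KEY` is set and `chain` is an existing LangChain runnable):

from trulens.core import TruSession, Feedback
from trulens.providers.openai import OpenAI as TruOpenAI
from trulens.apps.langchain import TruChain

session = TruSession()
session.reset_database()                                  # clean evaluation database
provider = TruOpenAI(model_engine="gpt-4o-mini")
f_relevance = Feedback(provider.relevance, name="Answer Relevance").on_input_output()
tru_app = TruChain(chain, app_name="RAG", app_version="v1", feedbacks=[f_relevance])
with tru_app as recording:
    chain.invoke({"question": "What is RAG?"})
print(session.get_leaderboard())                          # average scores per app version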
## TruLens RAG Triad Evaluation
# evaluation/trulens_eval.py — RAG triad evaluation with TruLens
from __future__ import annotations
import os
import numpy as np
import pandas as pd
from typing import Callable
from trulens.core import TruSession, Feedback, Select
from trulens.providers.openai import OpenAI as TruOpenAI
# ── 1. Initialize session and providers ───────────────────────────────────────
def init_session(reset: bool = True) -> TruSession:
"""Initialize TruLens session. Reset clears previous evaluation data."""
session = TruSession()
if reset:
session.reset_database()
print("TruLens session initialized (database cleared)")
return session
def get_provider(model: str = "gpt-4o-mini") -> TruOpenAI:
"""Get OpenAI feedback provider for LLM-as-judge evaluation."""
return TruOpenAI(
model_engine=model,
api_key=os.environ.get("OPENAI_API_KEY", ""),
)
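# Hedged alternative: the Hugging Face provider mentioned in the stack notes above.
# Assumes the companion package (likely `trulens-providers-huggingface`) is installed;
# the package name is inferred from the OpenAI provider's naming and may differ.
def get_hf_provider():
    """Return a Hugging Face feedback provider (classifier-based, no LLM judge)."""
    from trulens.providers.huggingface import Huggingface
    return Huggingface()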
# ── 2. RAG triad feedback functions ───────────────────────────────────────────
def build_rag_triad_feedbacks(provider: TruOpenAI) -> list[Feedback]:
"""
The RAG triad: three feedback functions measuring complete RAG quality.
- Answer Relevance: Is the response relevant to the question?
- Context Relevance: Are the retrieved chunks relevant to the question?
- Groundedness: Is the response supported by the retrieved context?
"""
# Select the context from the RAG call — path depends on app structure
context_selector = Select.RecordCalls.retrieve.rets[:]
# 1. Answer Relevance — input → output
f_answer_relevance = (
Feedback(provider.relevance_with_cot_reasons, name="Answer Relevance")
.on_input_output()
)
# 2. Context Relevance — question → each retrieved chunk
f_context_relevance = (
Feedback(provider.context_relevance_with_cot_reasons, name="Context Relevance")
.on_input()
.on(context_selector)
.aggregate(np.mean) # Average across all retrieved chunks
)
# 3. Groundedness — each context chunk → output
f_groundedness = (
Feedback(
provider.groundedness_measure_with_cot_reasons,
name="Groundedness",
)
.on(context_selector.collect()) # Pass all chunks together
.on_output()
)
return [f_answer_relevance, f_context_relevance, f_groundedness]
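# Selector note (hedged): Select.RecordCalls.retrieve.rets[:] only resolves if the app
# exposes an instrumented call named `retrieve`. For LangChain/LlamaIndex apps, TruLens
# can usually derive the selector from the app itself -- verify against your version:
#
#     from trulens.apps.langchain import TruChain
#     context = TruChain.select_context(chain)   # selector for the retriever's output
#     f_groundedness = (
#         Feedback(provider.groundedness_measure_with_cot_reasons, name="Groundedness")
#         .on(context.collect())
#         .on_output()
#     )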
def build_extra_feedbacks(provider: TruOpenAI) -> list[Feedback]:
"""Additional quality signals beyond the core RAG triad."""
f_coherence = Feedback(
provider.coherence_with_cot_reasons, name="Coherence"
).on_output()
f_conciseness = Feedback(
provider.conciseness, name="Conciseness"
).on_output()
    f_harmfulness = Feedback(
        provider.harmfulness, name="Harmfulness"  # lower is better for this score
    ).on_output()
    return [f_coherence, f_conciseness, f_harmfulness]
# ── 3. LangChain RAG app ──────────────────────────────────────────────────────
def build_langchain_rag() -> "Runnable":
"""Build a simple LangChain RAG chain for evaluation demo."""
from langchain_anthropic import ChatAnthropic
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
llm = ChatAnthropic(model="claude-haiku-4-5-20251001", max_tokens=512)
prompt = ChatPromptTemplate.from_template(
"Answer the following question based only on the provided context.\n\n"
"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
)
def retrieve(query: dict) -> dict:
"""Simulated retriever — replace with a real vector store."""
question = query["question"]
docs = [
f"Document 1: Context relevant to '{question}' — key facts and details.",
f"Document 2: Supporting evidence for '{question}' from authoritative source.",
]
return {"context": "\n".join(docs), "question": question}
chain = (
RunnablePassthrough()
| retrieve
| prompt
| llm
| StrOutputParser()
)
return chain
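# To evaluate a real pipeline, swap the simulated `retrieve` for a vector-store
# retriever. Hypothetical sketch (assumes a LangChain `vectorstore` already exists):
#
#     retriever = vectorstore.as_retriever(search_kwargs={"k": 4})
#     def retrieve(query: dict) -> dict:
#         docs = retriever.invoke(query["question"])
#         return {"context": "\n\n".join(d.page_content for d in docs),
#                 "question": query["question"]}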
def wrap_langchain_with_trulens(
chain,
session: TruSession,
provider: TruOpenAI,
app_name: str = "RAG-LangChain",
version: str = "v1",
) -> "TruChain":
"""Wrap LangChain chain with TruLens evaluation."""
from trulens.apps.langchain import TruChain
feedbacks = build_rag_triad_feedbacks(provider)
return TruChain(
chain,
app_name=app_name,
app_version=version,
feedbacks=feedbacks,
feedback_mode="deferred", # Compute feedback async after recording
)
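# Hedged companion to the TruChain wrapper: the TruLlama wrapper from the stack notes.
def wrap_llamaindex_with_trulens(
    query_engine,
    provider: TruOpenAI,
    app_name: str = "RAG-LlamaIndex",
    version: str = "v1",
):
    """Sketch: wrap a LlamaIndex query engine with the same RAG triad.
    Assumes the LlamaIndex extra is installed; the hard-coded `retrieve` selector in
    build_rag_triad_feedbacks may need TruLlama.select_context(query_engine) instead."""
    from trulens.apps.llamaindex import TruLlama
    return TruLlama(
        query_engine,
        app_name=app_name,
        app_version=version,
        feedbacks=build_rag_triad_feedbacks(provider),
    )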
# ── 4. Custom app wrapper ─────────────────────────────────────────────────────
def build_custom_rag_fn() -> Callable[[str], str]:
"""A custom RAG function (no framework) for TruBasicApp wrapping."""
import anthropic
client = anthropic.Anthropic()
def rag_fn(question: str) -> str:
# Simulated retrieval
context = (
f"Retrieved context for '{question}': "
"Transformers use self-attention to process sequences in parallel. "
"BERT uses masked language modeling for pre-training."
)
message = client.messages.create(
model="claude-haiku-4-5-20251001",
max_tokens=512,
messages=[{
"role": "user",
"content": f"Context: {context}\n\nQuestion: {question}\n\nAnswer:"
}],
)
return message.content[0].text
return rag_fn
def wrap_custom_fn_with_trulens(
rag_fn,
session: TruSession,
provider: TruOpenAI,
app_name: str = "RAG-Custom",
) -> "TruBasicApp":
"""Wrap a plain Python function with TruLens."""
from trulens.apps.basic import TruBasicApp
simple_feedbacks = [
Feedback(provider.relevance, name="Answer Relevance").on_input_output(),
Feedback(provider.coherence, name="Coherence").on_output(),
]
return TruBasicApp(
rag_fn,
app_name=app_name,
feedbacks=simple_feedbacks,
)
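# For custom apps where the retrieval step should be visible to the RAG triad,
# TruLens also ships a custom-app wrapper with an @instrument decorator.
# Sketch only -- import paths vary across TruLens versions, verify before use:
#
#     from trulens.apps.custom import TruCustomApp, instrument
#
#     class MyRAG:
#         @instrument
#         def retrieve(self, query: str) -> list[str]: ...
#         @instrument
#         def query(self, query: str) -> str: ...
#
#     tru_app = TruCustomApp(MyRAG(), app_name="Custom-Instrumented", feedbacks=[...])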
# ── 5. Run evaluation ──────────────────────────────────────────────────────────
EVAL_QUESTIONS = [
"What is a transformer model?",
"How does attention mechanism work in deep learning?",
"What is the difference between BERT and GPT?",
"Explain retrieval-augmented generation.",
"What are embeddings in machine learning?",
]
def run_evaluation(
tru_app,
questions: list[str] = EVAL_QUESTIONS,
) -> list:
"""Run evaluation questions through a TruLens-wrapped app."""
records = []
for question in questions:
print(f"Q: {question[:60]}...")
with tru_app as recording:
try:
# Invoke varies by app type (LangChain vs custom)
if hasattr(tru_app.app, "invoke"):
response = tru_app.app.invoke({"question": question})
else:
response = tru_app.app(question)
print(f"A: {str(response)[:80]}...")
except Exception as e:
print(f"Error: {e}")
record = recording.get()
records.append(record)
print(f"\nCompleted {len(records)} evaluation runs")
return records
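# Per-record inspection sketch (hedged -- assumes Record.wait_for_feedback_results()
# is available in your TruLens version; it maps feedback definitions to results):
#
#     for feedback, result in records[0].wait_for_feedback_results().items():
#         print(feedback.name, result.result)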
# ── 6. Compare and report ─────────────────────────────────────────────────────
def print_leaderboard(session: TruSession) -> pd.DataFrame:
"""Print aggregated scores per app version."""
leaderboard = session.get_leaderboard()
if leaderboard.empty:
print("No results yet — run evaluations first")
return leaderboard
print("\n=== TruLens Leaderboard ===")
display_cols = [
c for c in leaderboard.columns
if any(kw in c.lower() for kw in ["relevance", "groundedness", "coherence", "latency", "total"])
]
if display_cols:
print(leaderboard[display_cols].to_string(index=True))
else:
print(leaderboard.to_string(index=True))
return leaderboard
def get_record_details(session: TruSession) -> pd.DataFrame:
    """Get record-level details including per-record feedback scores."""
    # get_records_and_feedback returns the records DataFrame plus the names of
    # the feedback score columns within it.
    records_df, feedback_cols = session.get_records_and_feedback()
    if records_df.empty:
        print("No records found")
        return records_df
    print(f"\nTotal records: {len(records_df)}")
    score_cols = [c for c in feedback_cols if c in records_df.columns]
    if score_cols:
        print(f"Average scores:\n{records_df[score_cols].mean().to_string()}")
    return records_df
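# Reporting helper (plain pandas; feedback column names depend on the
# Feedback(name=...) labels registered above).
def export_results(session: TruSession, out_dir: str = "eval_results") -> None:
    """Persist the leaderboard and raw records to CSV for offline reporting."""
    os.makedirs(out_dir, exist_ok=True)
    session.get_leaderboard().to_csv(os.path.join(out_dir, "leaderboard.csv"))
    records_df, _ = session.get_records_and_feedback()
    records_df.to_csv(os.path.join(out_dir, "records.csv"), index=False)
    print(f"Wrote leaderboard.csv and records.csv to {out_dir}/")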
# ── 7. A/B comparison across app versions ────────────────────────────────────
def run_ab_comparison(session: TruSession, provider: TruOpenAI) -> pd.DataFrame:
"""
Compare two RAG configurations by running the same questions
through both and comparing their TruLens leaderboard scores.
"""
chain_v1 = build_langchain_rag()
chain_v2 = build_langchain_rag() # In practice: different retriever/prompt/model
tru_v1 = wrap_langchain_with_trulens(chain_v1, session, provider, version="v1-baseline")
tru_v2 = wrap_langchain_with_trulens(chain_v2, session, provider, version="v2-improved")
print("Evaluating baseline (v1)...")
run_evaluation(tru_v1, EVAL_QUESTIONS[:3])
print("\nEvaluating improved (v2)...")
run_evaluation(tru_v2, EVAL_QUESTIONS[:3])
    session.start_evaluator()            # Deferred feedbacks are computed by this background evaluator
    session.wait_for_feedback_results()  # Block until all queued feedback rows are filled in
return print_leaderboard(session)
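# Sketch for choosing between versions after an A/B run. Column names follow the
# Feedback(name=...) labels defined above; adjust if your leaderboard differs.
def pick_winner(leaderboard: pd.DataFrame, metric: str = "Groundedness") -> str | None:
    """Return the app version scoring highest on one triad metric."""
    if leaderboard.empty or metric not in leaderboard.columns:
        print(f"Metric '{metric}' not found in leaderboard")
        return None
    winner = leaderboard[metric].idxmax()
    print(f"Best app on {metric}: {winner} ({leaderboard[metric].max():.3f})")
    return str(winner)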
# ── Main ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
session = init_session(reset=True)
provider = get_provider("gpt-4o-mini")
# Evaluate custom function app
rag_fn = build_custom_rag_fn()
tru_app = wrap_custom_fn_with_trulens(rag_fn, session, provider)
run_evaluation(tru_app, EVAL_QUESTIONS[:3])
# Wait for async feedback computation
session.wait_for_feedback_results()
# Print results
leaderboard = print_leaderboard(session)
records_df = get_record_details(session)
# Launch dashboard (blocks — run in separate terminal for interactive use)
# from trulens.dashboard import run_dashboard
# run_dashboard(session) # → http://localhost:8501
print("\nDone. Run `run_dashboard(session)` to explore traces interactively.")
Reach for the Arize Phoenix alternative when you need OpenTelemetry-compatible distributed tracing across microservices with span-level visualization and custom evaluator templates: Phoenix excels at full distributed-trace observability, while TruLens's RAG triad (context relevance + groundedness + answer relevance) provides the most complete three-way quality measurement, with a visual leaderboard built specifically for comparing RAG pipeline versions. Reach for the RAGAS alternative when you need reference-free batch evaluation of a static dataset without a running application: RAGAS scores offline datasets, while TruLens wraps live application code with TruChain/TruLlama/TruBasicApp to capture real user interactions and evaluate them continuously, making it better suited to production monitoring and iterative RAG development. The Claude Skills 360 bundle includes TruLens skill sets covering RAG triad feedback setup, LangChain and LlamaIndex wrappers, custom app instrumentation, leaderboard comparison, dashboard visualization, and A/B evaluation pipelines. Start with the free tier to try RAG tracing and evaluation code generation.