Arize Phoenix traces LLM applications and runs automated evaluations. Install: pip install arize-phoenix openinference-instrumentation-anthropic. Start: import phoenix as px; px.launch_app() — opens the UI at http://localhost:6006. Instrument Anthropic: from openinference.instrumentation.anthropic import AnthropicInstrumentor; AnthropicInstrumentor().instrument() — all Anthropic calls are auto-traced. OpenAI: from openinference.instrumentation.openai import OpenAIInstrumentor; OpenAIInstrumentor().instrument(). LangChain: from openinference.instrumentation.langchain import LangChainInstrumentor; LangChainInstrumentor().instrument(). LlamaIndex: LlamaIndexInstrumentor().instrument(). Query traces: client = px.Client(); spans_df = client.get_spans_dataframe(). Evaluate hallucination: from phoenix.evals import HallucinationEvaluator, run_evals; evaluator = HallucinationEvaluator(OpenAIModel(model="gpt-4o")); results = run_evals(dataframe=spans_df, evaluators=[evaluator]). RAG relevance: from phoenix.evals import RelevanceEvaluator. QA correctness: from phoenix.evals import QAEvaluator. Custom eval: from phoenix.evals import llm_classify; template = ClassificationTemplate(rails=["correct","incorrect"], template="Is {response} correct for {question}?"); results = llm_classify(dataframe=df, template=template, model=OpenAIModel()). Datasets: from phoenix.datasets import Dataset; dataset = px.Client().upload_dataset(dataframe=df, dataset_name="rag-eval"). px.active_session() returns the current session handle. session.get_evaluations() lists all logged evals. Tracing integration: from phoenix.trace.openai import OpenAIInstrumentor — compatible with OpenTelemetry exporters. Claude Code generates Phoenix instrumentation, RAG eval pipelines, custom eval templates, and evaluation result analysis.
CLAUDE.md for Arize Phoenix
## Arize Phoenix Stack
- Version: arize-phoenix >= 4.0, openinference-instrumentation-* for auto-trace
- Launch: px.launch_app() → http://localhost:6006 (or connect to remote)
- Instrument: AnthropicInstrumentor/OpenAIInstrumentor/LangChainInstrumentor().instrument()
- Trace spans: SpanKind.LLM | RETRIEVER | CHAIN | EMBEDDING | RERANKER
- Query: px.Client().get_spans_dataframe() → pandas DataFrame with trace data
- Eval: run_evals(dataframe, evaluators=[HallucinationEvaluator, RelevanceEvaluator])
- Custom: llm_classify(df, template=ClassificationTemplate(rails=[...], template=...))
- Upload: px.Client().upload_dataset(df, dataset_name) for persistent eval datasets
Phoenix Tracing and Evaluation
# observability/phoenix_eval.py — LLM tracing and automated evaluation with Phoenix
from __future__ import annotations
import os
import pandas as pd
import phoenix as px
from phoenix.evals import (
HallucinationEvaluator,
QAEvaluator,
RelevanceEvaluator,
OpenAIModel,
run_evals,
llm_classify,
)
from phoenix.evals.templates import ClassificationTemplate, PromptTemplate
from phoenix.trace import SpanKind
# ── 1. Launch Phoenix and configure instrumentation ───────────────────────────
def _try_instrument(module_path: str, class_name: str, label: str, hint: str | None = None) -> None:
    """Best-effort auto-instrumentation of one SDK.

    Imports *module_path*, instantiates *class_name*, and calls
    ``.instrument()``. A missing package is non-fatal: we optionally print
    an install *hint* and continue, so partially-installed environments
    still get traces for whatever SDKs they do have.
    """
    import importlib

    try:
        module = importlib.import_module(module_path)
    except ImportError:
        if hint:
            print(hint)
        return
    getattr(module, class_name)().instrument()
    print(f"{label} instrumentation enabled")


def setup_phoenix(
    project_name: str = "my-rag-app",
    port: int = 6006,
    remote_url: str | None = None,
) -> px.Client:
    """
    Launch Phoenix UI (or connect to a remote collector) and instrument
    all installed LLM SDKs.

    Args:
        project_name: Logical project label (currently unused here;
            reserved for future project-scoped configuration).
        port: Local port for the Phoenix UI when launching locally.
        remote_url: If set, skip the local launch and route spans to a
            remote Phoenix instance (Arize Cloud or self-hosted).

    Returns:
        Phoenix client for querying traces.
    """
    if remote_url:
        # Must be set before px.Client() is constructed below so the client
        # targets the remote collector. (The original re-imported phoenix
        # here redundantly; px is already imported at module level.)
        os.environ["PHOENIX_COLLECTOR_ENDPOINT"] = remote_url
    else:
        # Launch the local Phoenix server and surface its UI URL.
        session = px.launch_app(port=port)
        print(f"Phoenix UI: {session.url}")

    # Auto-instrument every supported SDK that happens to be installed.
    _try_instrument(
        "openinference.instrumentation.anthropic",
        "AnthropicInstrumentor",
        "Anthropic",
        hint="pip install openinference-instrumentation-anthropic",
    )
    _try_instrument("openinference.instrumentation.openai", "OpenAIInstrumentor", "OpenAI")
    _try_instrument("openinference.instrumentation.langchain", "LangChainInstrumentor", "LangChain")
    _try_instrument("openinference.instrumentation.llama_index", "LlamaIndexInstrumentor", "LlamaIndex")

    return px.Client()
# ── 2. Manual span creation ───────────────────────────────────────────────────
def traced_rag_pipeline_with_spans(query: str) -> str:
    """
    Manually trace a toy RAG pipeline with OpenTelemetry spans.

    Emits one root span plus a RETRIEVER and an LLM child span, annotated
    with OpenInference attributes so Phoenix renders them correctly.
    """
    from opentelemetry import trace
    from opentelemetry.trace import SpanKind as OTSpanKind

    tracer = trace.get_tracer(__name__)
    with tracer.start_as_current_span("rag-pipeline", kind=OTSpanKind.INTERNAL) as root:
        root.set_attribute("input.value", query)
        root.set_attribute("session.id", "user-session-123")

        # Retrieval step: fabricate three documents and record them.
        with tracer.start_as_current_span("vector-retrieval", kind=OTSpanKind.CLIENT) as retrieval:
            retrieval.set_attribute("input.value", query)
            retrieval.set_attribute("openinference.span.kind", "RETRIEVER")
            documents = []
            for idx in range(3):
                documents.append({"doc_id": f"d{idx}", "text": f"Relevant doc {idx} for {query}"})
            retrieval.set_attribute("retrieval.documents", str(documents))
            retrieval.set_attribute("output.value", f"Retrieved {len(documents)} docs")

        # Generation step: join context and record a synthetic completion.
        joined_context = " ".join(doc["text"] for doc in documents)
        with tracer.start_as_current_span("llm-completion", kind=OTSpanKind.CLIENT) as generation:
            generation.set_attribute("openinference.span.kind", "LLM")
            generation.set_attribute("llm.model_name", "claude-sonnet-4-6")
            generation.set_attribute("input.value", f"Context: {joined_context}\nQ: {query}")
            answer = f"Answer to '{query}' based on context."
            generation.set_attribute("output.value", answer)
            # Rough token estimates: ~2 tokens per whitespace-separated word.
            generation.set_attribute("llm.token_count.prompt", len(joined_context.split()) * 2)
            generation.set_attribute("llm.token_count.completion", len(answer.split()) * 2)

        root.set_attribute("output.value", answer)
    return answer
# ── 3. Retrieve spans for evaluation ─────────────────────────────────────────
def get_rag_spans_for_eval(client: px.Client) -> pd.DataFrame:
"""
Query traced RAG spans and prepare DataFrame for evaluation.
Expected columns: input, output, reference (retrieved context).
"""
spans_df = client.get_spans_dataframe(
filter_condition="span_kind == 'LLM'",
start_time=pd.Timestamp.now() - pd.Timedelta(hours=24),
)
if spans_df.empty:
# Synthetic data if no traces yet
spans_df = pd.DataFrame({
"input": ["What is RAG?", "Explain embeddings", "How does attention work?"],
"output": [
"RAG combines retrieval with generation for grounded responses.",
"Embeddings are dense vector representations of text.",
"Attention weighs token importance within a sequence.",
],
"reference": [
"Retrieval-Augmented Generation (RAG) retrieves relevant documents...",
"Word embeddings map words to high-dimensional vectors...",
"The attention mechanism assigns weights to each token...",
],
"context": [
"Document: RAG overview. RAG combines information retrieval with LLMs...",
"Document: ML glossary. Embeddings are numerical representations...",
"Document: Transformer paper. Attention is all you need...",
],
})
return spans_df
# ── 4. Automated evaluation ────────────────────────────────────────────────────
def run_rag_evaluations(
    spans_df: pd.DataFrame,
    eval_model_name: str = "gpt-4o-mini",
) -> dict[str, pd.DataFrame]:
    """
    Run multiple automated evaluators on RAG traces.

    Args:
        spans_df: Trace rows with the columns each evaluator expects
            (input / output / reference — TODO confirm against the
            installed phoenix.evals version).
        eval_model_name: OpenAI model used as the LLM judge.

    Returns:
        Dict of evaluator_name -> per-row results DataFrame. An evaluator
        that raises is logged and skipped rather than aborting the run.
    """
    eval_model = OpenAIModel(
        model=eval_model_name,
        api_key=os.environ.get("OPENAI_API_KEY", ""),
    )
    evaluators = {
        "hallucination": HallucinationEvaluator(eval_model),
        "relevance": RelevanceEvaluator(eval_model),
        "qa_correctness": QAEvaluator(eval_model),
    }
    results: dict[str, pd.DataFrame] = {}
    for name, evaluator in evaluators.items():
        print(f"Running {name} evaluation...")
        try:
            # BUG FIX: run_evals returns a *list* of DataFrames, one per
            # evaluator passed in. The original stored the list itself, so
            # the .columns access below raised AttributeError and every
            # evaluator was reported as "failed". Unpack the single result.
            [result_df] = run_evals(
                dataframe=spans_df,
                evaluators=[evaluator],
                provide_explanation=True,
            )
            results[name] = result_df
            score_cols = [c for c in result_df.columns if "score" in c.lower()]
            if score_cols:
                avg = result_df[score_cols[0]].mean()
                print(f" {name}: avg_score={avg:.3f} over {len(result_df)} samples")
        except Exception as e:
            # Best-effort: one failing judge should not kill the other evals.
            print(f" {name} failed: {e}")
    return results
# ── 5. Custom LLM-as-judge evaluation ────────────────────────────────────────
def custom_conciseness_eval(
    spans_df: pd.DataFrame,
    eval_model_name: str = "gpt-4o-mini",
) -> pd.DataFrame:
    """Custom LLM-as-judge evaluation: is each response concise (under 50 words)?"""
    labels = ["concise", "verbose"]
    judge = OpenAIModel(model=eval_model_name)
    prompt = (
        "You are evaluating response conciseness.\n\n"
        "Question: {input}\n"
        "Response: {output}\n\n"
        "Is this response concise (under 50 words) or verbose (over 50 words)?\n"
        "Answer:"
    )
    classification_template = ClassificationTemplate(
        rails=labels,
        template=prompt,
        explanation_template="Explain in one sentence why this response is {label}.",
    )
    return llm_classify(
        dataframe=spans_df,
        template=classification_template,
        model=judge,
        rails=labels,
        provide_explanation=True,
    )
# ── 6. Upload evaluation datasets ────────────────────────────────────────────
def create_eval_dataset(
client: px.Client,
dataset_name: str = "rag-gold-standard",
) -> None:
"""Upload a golden dataset for repeatable benchmark evaluation."""
gold_df = pd.DataFrame({
"input": ["What is attention?", "Define RLHF"],
"expected_output":["A mechanism that weighs...", "RLHF trains models..."],
"context": ["Attention paper abstract...", "InstructGPT paper..."],
})
dataset = client.upload_dataset(
dataframe=gold_df,
dataset_name=dataset_name,
input_keys=["input", "context"],
output_keys=["expected_output"],
)
print(f"Dataset uploaded: {dataset_name} ({len(gold_df)} examples)")
# ── Main pipeline ─────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Boot Phoenix, trace a couple of requests, then evaluate the traces.
    phoenix_client = setup_phoenix()

    queries = ["What is a transformer?", "Explain gradient descent"]
    for q in queries:
        answer = traced_rag_pipeline_with_spans(q)
        print(f"Q: {q}\nA: {answer[:80]}...\n")

    eval_frame = get_rag_spans_for_eval(phoenix_client)
    for eval_name, result in run_rag_evaluations(eval_frame).items():
        print(f"\n{eval_name} results:\n{result.head(3)}")
Langfuse is the alternative when you need cost tracking, prompt versioning, and team-collaboration features baked into the same observability tool — Langfuse handles cost/budget management tightly, while Phoenix's automated LLM-as-judge evaluators (HallucinationEvaluator, RelevanceEvaluator, QAEvaluator) and built-in RAG evaluation metrics make it the stronger choice for teams running systematic quality-evaluation pipelines on retrieval-augmented generation applications. A custom LLM evaluation harness is the alternative when you want evaluation logic written from scratch in Python with pytest or a bespoke framework — a custom harness gives full control, but Phoenix's pre-built evaluators cover the most common RAG failure modes (hallucination, context relevance, answer faithfulness, QA correctness) with OpenAI-compatible LLM-as-judge templates, saving several days of implementation work. The Claude Skills 360 bundle includes Arize Phoenix skill sets covering auto-instrumentation, manual span tracing, RAG evaluation, custom LLM-as-judge templates, dataset uploads, and evaluation result analysis. Start with the free tier to try LLM evaluation pipeline generation.