Argilla creates and manages annotation datasets for NLP and LLMs. pip install argilla. rg.init(api_url="http://localhost:6900", api_key="argilla.apikey"). Feedback dataset: dataset = rg.FeedbackDataset(fields=[rg.TextField(name="instruction"), rg.TextField(name="response")], questions=[rg.RatingQuestion(name="quality", description="Rate quality", values=[1,2,3,4,5]), rg.LabelQuestion(name="category", labels=["helpful","incorrect","harmful"])]). dataset.push_to_argilla(name="llm-evals", workspace="default"). Add records: dataset.add_records([rg.FeedbackRecord(fields={"instruction": "...", "response": "..."}, suggestions=[rg.Suggestion(question_name="quality", value=4, score=0.9)])]). Retrieve dataset: dataset = rg.FeedbackDataset.from_argilla(name="llm-evals"). Filter: dataset.filter_by(response_status=["submitted"]). Export to HuggingFace: hf_dataset = dataset.format_as("datasets"), hf_dataset.push_to_hub("org/dataset"). Text classification: rg.TextClassificationRecord(text="...", prediction=[("positive", 0.9)], annotation="positive"). NER: rg.TokenClassificationRecord(text="...", tokens=[...], annotation=[("ORG", 4, 11)]). rg.log(records, name="my-dataset", workspace="default"). V2 SDK: client = rg.Argilla(api_url=..., api_key=...), dataset = rg.Dataset(name=..., settings=rg.Settings(fields=[...], questions=[...])), dataset.create(), dataset.records.log([rg.Record(fields={"text": "..."})]). Docker: docker run -d -p 6900:6900 argilla/argilla-server:latest. dataset.pull().to_datasets() exports. Claude Code generates Argilla dataset configs, record ingestion scripts, suggestion pre-labeling, HuggingFace export pipelines, and TypeScript clients.
CLAUDE.md for Argilla
## Argilla Stack
- Version: argilla >= 2.0 (V2 SDK: rg.Argilla client)
- Init: rg.Argilla(api_url, api_key) — or ARGILLA_API_URL + ARGILLA_API_KEY env vars
- Dataset: rg.Dataset(name, settings=rg.Settings(fields, questions)) → dataset.create()
- Records: dataset.records.log([rg.Record(fields={"col": value}, suggestions=[...])])
- Suggestions: rg.Suggestion(question_name, value, score) for model pre-labels
- Fetch: client.datasets(name, workspace=...) in the V2 SDK (rg.Dataset.from_argilla is the legacy V1 API); load suggestion data via dataset.records.with_suggestions()
- Export: dataset.records.to_datasets() → HuggingFace Dataset
- Filter: dataset.records.filter(response_status=["submitted"])
Dataset Creation and Record Ingestion
# annotation/argilla_setup.py — Argilla dataset and annotation workflow
from __future__ import annotations
import json
import os
from typing import Any
import argilla as rg
# Connection settings, overridable via environment variables. The defaults
# match a local dev server (docker run -p 6900:6900 argilla/argilla-server)
# and its well-known default API key.
# NOTE(review): `json` appears unused in this chunk — confirm before removing.
ARGILLA_URL = os.environ.get("ARGILLA_API_URL", "http://localhost:6900")
ARGILLA_KEY = os.environ.get("ARGILLA_API_KEY", "argilla.apikey")  # default dev key
WORKSPACE = os.environ.get("ARGILLA_WORKSPACE", "default")  # target workspace name
def get_client() -> rg.Argilla:
    """Return an Argilla API client built from the module-level settings."""
    client = rg.Argilla(api_url=ARGILLA_URL, api_key=ARGILLA_KEY)
    return client
# ── LLM Preference Dataset ────────────────────────────────────────────────────
def create_rlhf_dataset(dataset_name: str = "llm-preference") -> rg.Dataset:
    """Create a dataset for LLM response preference annotation (RLHF).

    Annotators see an instruction with two candidate responses side by
    side, pick a preference, rate each response, optionally mark problem
    spans in Response A, and may leave a free-text rationale.
    """
    client = get_client()

    # Record fields: the instruction plus the two responses under comparison.
    fields = [
        rg.TextField(name="instruction", title="User Instruction", use_markdown=True, required=True),
        rg.TextField(name="response_a", title="Response A", use_markdown=True, required=True),
        rg.TextField(name="response_b", title="Response B", use_markdown=True, required=True),
    ]

    # Annotation questions: one required preference pick, per-response
    # quality ratings, span marking for Response A, and an optional rationale.
    questions = [
        rg.LabelQuestion(
            name="preference",
            title="Which response is better?",
            labels=["A", "B", "tie", "both_bad"],
            required=True,
            description="Choose the response that is more helpful and accurate",
        ),
        rg.RatingQuestion(
            name="quality_a",
            title="Quality of Response A",
            values=[1, 2, 3, 4, 5],
            description="1=very poor, 5=excellent",
        ),
        rg.RatingQuestion(
            name="quality_b",
            title="Quality of Response B",
            values=[1, 2, 3, 4, 5],
        ),
        rg.SpanQuestion(
            name="issues_a",
            title="Mark problematic spans in Response A",
            field="response_a",
            labels=["factual_error", "harmful", "off_topic", "repetitive"],
            required=False,
            allow_overlapping=False,
        ),
        rg.TextQuestion(
            name="rationale",
            title="Explain your preference (optional)",
            required=False,
            use_markdown=False,
        ),
    ]

    guidelines = (
        "Compare the two responses to the given instruction. "
        "Choose the response that is more helpful, accurate, and safe. "
        "Rate each response from 1 (poor) to 5 (excellent)."
    )

    dataset = rg.Dataset(
        name=dataset_name,
        workspace=WORKSPACE,
        settings=rg.Settings(fields=fields, questions=questions, guidelines=guidelines),
        client=client,
    )
    dataset.create()
    print(f"Created RLHF dataset: {dataset_name}")
    return dataset
def create_text_classification_dataset(
    dataset_name: str = "customer-sentiment",
    labels: list[str] | None = None,
) -> rg.Dataset:
    """Create a text classification annotation dataset.

    Annotators classify customer comments by sentiment (required) and,
    optionally, by topic. A falsy/absent `labels` falls back to the
    default three-way sentiment set.
    """
    client = get_client()
    sentiment_labels = labels or ["positive", "neutral", "negative"]

    sentiment_question = rg.LabelQuestion(
        name="sentiment",
        title="Sentiment",
        labels=sentiment_labels,
        required=True,
    )
    topic_question = rg.LabelQuestion(
        name="topic",
        title="Topic",
        labels=["product_quality", "shipping", "support", "pricing", "other"],
        required=False,
    )
    settings = rg.Settings(
        fields=[
            rg.TextField(name="text", title="Customer Comment", required=True),
            rg.TextField(name="source", title="Source Channel", required=False),
        ],
        questions=[sentiment_question, topic_question],
        guidelines="Classify the sentiment of each customer comment.",
    )

    dataset = rg.Dataset(name=dataset_name, workspace=WORKSPACE, settings=settings, client=client)
    dataset.create()
    return dataset
# ── Ingest records with suggestions ──────────────────────────────────────────
def ingest_rlhf_records(
    dataset_name: str,
    pairs: list[dict[str, Any]],
    pre_label: bool = True,
) -> None:
    """Ingest (instruction, response_a, response_b) pairs with optional pre-labels.

    Args:
        dataset_name: Name of an existing Argilla dataset in WORKSPACE.
        pairs: Items shaped like
            {"instruction": str, "response_a": str, "response_b": str},
            optionally carrying a "metadata" dict attached to the record.
        pre_label: When True, attach a low-confidence, length-based
            preference suggestion (placeholder for a real preference model).

    Raises:
        ValueError: If the dataset does not exist in the workspace.
    """
    client = get_client()
    dataset = client.datasets(dataset_name, workspace=WORKSPACE)
    if dataset is None:
        # client.datasets() returns None for an unknown name; fail with a
        # clear error instead of an AttributeError on dataset.records below.
        raise ValueError(f"Dataset '{dataset_name}' not found in workspace '{WORKSPACE}'")
    records: list[rg.Record] = []
    for pair in pairs:
        suggestions: list[rg.Suggestion] = []
        if pre_label:
            # Auto pre-label using a simple heuristic (replace with real model):
            # prefer the response that is >20% longer, otherwise call it a tie.
            len_a = len(pair["response_a"])
            len_b = len(pair["response_b"])
            pref = "A" if len_a > len_b * 1.2 else "B" if len_b > len_a * 1.2 else "tie"
            suggestions = [
                rg.Suggestion(
                    question_name="preference",
                    value=pref,
                    score=0.6,  # Low confidence — needs human review
                    agent="heuristic-v1",
                ),
            ]
        records.append(rg.Record(
            fields={
                "instruction": pair["instruction"],
                "response_a": pair["response_a"],
                "response_b": pair["response_b"],
            },
            suggestions=suggestions,
            metadata=pair.get("metadata", {}),
        ))
    dataset.records.log(records)
    print(f"Logged {len(records)} records to '{dataset_name}'")
def ingest_with_model_suggestions(
    dataset_name: str,
    texts: list[str],
    classifier: Any,  # A predict_proba-style model
    label_map: dict[int, str],
) -> None:
    """Ingest text records with model pre-labels as suggestions.

    Args:
        dataset_name: Name of an existing Argilla dataset in WORKSPACE.
        texts: Raw texts to log, one record per text.
        classifier: Model exposing an sklearn-style predict_proba().
        label_map: Maps the classifier's class index to the question label.

    Raises:
        ValueError: If the dataset does not exist in the workspace.
    """
    client = get_client()
    dataset = client.datasets(dataset_name, workspace=WORKSPACE)
    if dataset is None:
        # client.datasets() returns None for an unknown name; fail with a
        # clear error instead of an AttributeError on dataset.records below.
        raise ValueError(f"Dataset '{dataset_name}' not found in workspace '{WORKSPACE}'")
    records: list[rg.Record] = []
    for text in texts:
        proba = classifier.predict_proba([text])[0]
        label = label_map[int(proba.argmax())]  # top class → human-readable label
        score = float(proba.max())  # model confidence surfaced in the UI
        records.append(rg.Record(
            fields={"text": text},
            suggestions=[
                rg.Suggestion(
                    question_name="sentiment",
                    value=label,
                    score=score,
                    agent="sklearn-v1",
                )
            ],
        ))
    dataset.records.log(records)
    print(f"Logged {len(records)} records with model suggestions")
# ── Export to HuggingFace ─────────────────────────────────────────────────────
def export_to_huggingface(
    dataset_name: str,
    hf_repo_id: str,
    status: list[str] | None = None,
) -> None:
    """Export submitted annotations to HuggingFace Hub.

    Args:
        dataset_name: Name of an existing Argilla dataset in WORKSPACE.
        hf_repo_id: Target Hub repo id, e.g. "org/dataset" (pushed private).
        status: Response statuses to export; defaults to ["submitted"].
            (A None sentinel replaces the original mutable default list,
            which would be shared across calls.)

    Raises:
        ValueError: If the dataset does not exist in the workspace.
    """
    import datasets  # noqa: F401 — fail fast if the HF `datasets` dependency is missing
    if status is None:
        status = ["submitted"]
    client = get_client()
    dataset = client.datasets(dataset_name, workspace=WORKSPACE)
    if dataset is None:
        raise ValueError(f"Dataset '{dataset_name}' not found in workspace '{WORKSPACE}'")
    # Pull submitted/reviewed annotations.
    # NOTE(review): with_suggestions(status=...) mirrors the original call;
    # confirm the installed argilla version accepts a status filter here.
    annotated = dataset.records.with_suggestions(
        status=status,
    ).to_datasets()
    print(f"Exporting {len(annotated)} annotated records to {hf_repo_id}")
    annotated.push_to_hub(hf_repo_id, private=True)
    print(f"Dataset pushed to HuggingFace: {hf_repo_id}")
# ── Stats ─────────────────────────────────────────────────────────────────────
def print_annotation_progress(dataset_name: str) -> None:
    """Print annotation progress statistics for a dataset.

    Raises:
        ValueError: If the dataset does not exist in the workspace.
    """
    client = get_client()
    dataset = client.datasets(dataset_name, workspace=WORKSPACE)
    if dataset is None:
        # client.datasets() returns None for an unknown name.
        raise ValueError(f"Dataset '{dataset_name}' not found in workspace '{WORKSPACE}'")
    # (Removed an unused `info = dataset.info` binding present in the original.)
    print(f"\nDataset: {dataset_name}")
    progress = dataset.progress()
    # NOTE(review): assumes the progress object exposes .total/.completed/.pending
    # attributes; some argilla versions return a dict here — confirm against the SDK.
    print(f" Total records: {progress.total}")
    print(f" Annotated: {progress.completed}")
    print(f" Pending: {progress.pending}")
    pct = 100 * progress.completed / max(progress.total, 1)
    print(f" Progress: {pct:.1f}%")
if __name__ == "__main__":
    # Demo: create the preference dataset, then seed it with one pre-labeled pair.
    create_rlhf_dataset("llm-preference-v1")
    demo_pairs = [
        {
            "instruction": "Explain quantum computing in simple terms",
            "response_a": "Quantum computing uses quantum bits. It is fast.",
            "response_b": "Quantum computers use qubits which can be 0 and 1 simultaneously...",
        },
    ]
    ingest_rlhf_records("llm-preference-v1", demo_pairs, pre_label=True)
TypeScript Client
// lib/argilla/client.ts — Argilla REST API client
// Connection settings; fall back to the local dev server and its default key.
const ARGILLA_URL = process.env.ARGILLA_API_URL ?? "http://localhost:6900"
const ARGILLA_KEY = process.env.ARGILLA_API_KEY ?? "argilla.apikey"
// Default headers sent with every request: API-key auth plus JSON content type.
const headers = {
  "X-Argilla-Api-Key": ARGILLA_KEY,
  "Content-Type": "application/json",
}
// Thin wrapper over fetch for the Argilla v1 REST API: prefixes the path,
// injects auth headers (caller-supplied headers win), and throws on non-2xx.
async function argillaFetch<T>(path: string, options?: RequestInit): Promise<T> {
  const mergedHeaders = { ...headers, ...options?.headers }
  const res = await fetch(`${ARGILLA_URL}/api/v1${path}`, { ...options, headers: mergedHeaders })
  if (!res.ok) {
    const detail = await res.text()
    throw new Error(`Argilla ${res.status}: ${detail}`)
  }
  return res.json()
}
export type ArgillaDataset = { id: string; name: string; workspace_id: string; status: string }
/**
 * List all datasets in a workspace.
 * The workspace name is URI-encoded so names containing spaces or reserved
 * characters cannot break (or inject into) the query string.
 */
export async function listDatasets(workspace: string = "default"): Promise<ArgillaDataset[]> {
  const data = await argillaFetch<{ items: ArgillaDataset[] }>(
    `/datasets?workspace=${encodeURIComponent(workspace)}`,
  )
  return data.items
}
/** Get record count and annotation progress for a dataset */
export async function getDatasetProgress(datasetId: string) {
  const path = `/datasets/${datasetId}/records/progress`
  return argillaFetch(path)
}
/** Submit an annotation for a record */
export async function submitAnnotation(
  recordId: string,
  responses: Array<{ question_name: string; value: unknown }>,
) {
  const payload = JSON.stringify({ responses })
  return argillaFetch(`/records/${recordId}/responses`, { method: "POST", body: payload })
}
Label Studio alternative: choose Label Studio when you need multi-modal annotation beyond text (images, audio, video, time series), a richer visual annotation UI, and an ML backend for pre-labeling across all data types. Label Studio is the more general tool, while Argilla is purpose-built for NLP and LLM data, with first-class support for LLM preference ranking, span annotations for instruction tuning, and direct HuggingFace dataset integration — making it the natural choice for teams building fine-tuning and RLHF datasets. Scale AI alternative: choose Scale AI when you need a managed labeling workforce with quality control, SLA-backed delivery, and enterprise contracts. Scale AI supplies the human workforce externally, whereas Argilla provides open-source, self-hosted infrastructure for teams with their own annotators who need structured workflows, model pre-labeling with Suggestions, and direct HuggingFace Hub export. The Claude Skills 360 bundle includes Argilla skill sets covering LLM preference datasets, text classification annotation, model pre-labeling, HuggingFace export, and TypeScript API clients. Start with the free tier to try NLP annotation workflow generation.