Label Studio is the open-source data labeling platform. pip install label-studio label-studio-sdk. label-studio start launches at localhost:8080. Create project via SDK: from label_studio_sdk import Client; ls = Client(url="http://localhost:8080", api_key="token"). project = ls.start_project(title="Churn Comments", label_config=xml_config). Labeling config XML for text classification: <View><Text name="text" value="$text"/><Choices name="sentiment" toName="text"><Choice value="positive"/><Choice value="negative"/></Choices></View>. For NER: <View><Labels name="label" toName="text"><Label value="PERSON" background="#FFA39E"/><Label value="ORG" background="#D4380D"/></Labels><Text name="text" value="$text"/></View>. For image classification: <View><Image name="image" value="$image"/><Choices name="category" toName="image"><Choice value="defect"/></Choices></View>. Import tasks: project.import_tasks([{"data": {"text": "..."}}]). From S3: project.connect_cloud_storage(s3_settings). Export annotations: annotations = project.export_tasks(export_type="JSON"). YOLO format: export_type="YOLO". COCO: export_type="COCO". ML Backend for pre-labeling: class extending LabelStudioMLBase, implement predict(tasks) returns [{"result": [{"from_name": "sentiment", "to_name": "text", "type": "choices", "value": {"choices": ["positive"]}}], "score": 0.9}]. Start backend: label-studio-ml start my_ml_backend/. Register: project.connect_ml_backend("http://localhost:9090"). Webhooks: project.set_params({"webhook_url": "https://myapp.com/webhook"}). Active learning: predict → human review low-confidence → retrain loop. Claude Code generates Label Studio project configs, XML label definitions, ML backend classes, import scripts, and TypeScript API clients.
CLAUDE.md for Label Studio
## Label Studio Stack
- Version: label-studio >= 1.10, label-studio-sdk >= 1.0
- Server: label-studio start --port 8080 (or Docker: heartexlabs/label-studio:latest)
- SDK: Client(url, api_key) → start_project(title, label_config=xml) or get_project(id)
- Import: project.import_tasks([{"data": {"text": "..."}}]) or import_tasks_from_url
- Export: project.export_tasks(export_type="JSON"/"YOLO"/"COCO"/"CSV")
- ML Backend: class extending LabelStudioMLBase → predict(tasks) → fit(completions)
- Webhook: POST to your URL on annotation events
Project Setup and Import
# labeling/setup_project.py — Label Studio project setup via SDK
from __future__ import annotations
import json
import os
from pathlib import Path
from typing import Any
from label_studio_sdk import Client
from label_studio_sdk.data_manager import Filters, Column, Operator, Type
# Server connection settings, read from the environment with local-dev defaults.
LS_URL = os.environ.get("LABEL_STUDIO_URL", "http://localhost:8080")
# API token for the Label Studio server; empty by default — SDK calls will be
# rejected until LABEL_STUDIO_API_KEY is set.
LS_API_KEY = os.environ.get("LABEL_STUDIO_API_KEY", "")
def get_client() -> Client:
    """Return a Label Studio SDK client configured from the environment."""
    return Client(url=LS_URL, api_key=LS_API_KEY)
# ── Label config templates ────────────────────────────────────────────────────
TEXT_CLASSIFICATION_CONFIG = """
<View>
<Header value="Classify customer feedback sentiment"/>
<Text name="text" value="$text"/>
<Choices name="sentiment" toName="text" choice="single" showInLine="true">
<Choice value="positive" background="#00AA00"/>
<Choice value="neutral" background="#CCBB00"/>
<Choice value="negative" background="#CC0000"/>
</Choices>
<Rating name="confidence" toName="text" maxRating="5" icon="star" size="medium"/>
</View>
"""
NER_CONFIG = """
<View>
<Labels name="label" toName="text">
<Label value="PERSON" background="#FFA39E"/>
<Label value="ORG" background="#D4380D"/>
<Label value="PRODUCT" background="#096DD9"/>
<Label value="LOCATION" background="#87D068"/>
<Label value="DATE" background="#722ED1"/>
</Labels>
<Text name="text" value="$text"/>
</View>
"""
IMAGE_CLASSIFICATION_CONFIG = """
<View>
<Image name="image" value="$image" zoom="true"/>
<Choices name="category" toName="image" choice="single">
<Choice value="defect"/>
<Choice value="normal"/>
<Choice value="uncertain"/>
</Choices>
<TextArea name="notes" toName="image" rows="3" placeholder="Optional notes"/>
</View>
"""
# ── Create projects ───────────────────────────────────────────────────────────
def create_text_classification_project(title: str = "Customer Sentiment") -> int:
    """Create a sentiment-classification project and return its numeric ID."""
    project = get_client().start_project(
        title=title,
        label_config=TEXT_CLASSIFICATION_CONFIG,
        description="Classify customer feedback comments by sentiment",
    )
    project_id = project.id
    print(f"Created project '{title}' — ID: {project_id}")
    return project_id
def create_ner_project(title: str = "Customer Comments NER") -> int:
    """Create an entity-annotation project and return its numeric ID."""
    client = get_client()
    ner_project = client.start_project(title=title, label_config=NER_CONFIG)
    print(f"Created NER project '{title}' — ID: {ner_project.id}")
    return ner_project.id
# ── Import tasks ──────────────────────────────────────────────────────────────
def import_text_tasks(project_id: int, texts: list[str]) -> None:
    """Wrap each text in a Label Studio task payload and import the batch."""
    client = get_client()
    target = client.get_project(project_id)
    payload = [{"data": {"text": entry}} for entry in texts]
    target.import_tasks(payload)
    print(f"Imported {len(payload)} tasks to project {project_id}")
def import_from_csv(project_id: int, csv_path: str, text_col: str = "text") -> None:
    """Import tasks from a CSV file.

    Each CSV row becomes one task whose ``data`` dict carries every column as
    a string.

    Args:
        project_id: Target Label Studio project.
        csv_path: Path to the CSV file.
        text_col: Column that the labeling config binds to via ``$text``.

    Raises:
        ValueError: If *text_col* is not present in the CSV — without this
            check the parameter was silently ignored and a mismatched CSV
            would import tasks the config could not display.
    """
    import pandas as pd
    ls = get_client()
    project = ls.get_project(project_id)
    df = pd.read_csv(csv_path)
    if text_col not in df.columns:
        raise ValueError(
            f"Column {text_col!r} not found in {csv_path}; available: {list(df.columns)}"
        )
    # fillna("") keeps missing cells empty instead of the literal string "nan"
    # that str() would produce from NaN.
    df = df.fillna("")
    tasks = [{"data": {col: str(row[col]) for col in df.columns}} for _, row in df.iterrows()]
    project.import_tasks(tasks)
    print(f"Imported {len(tasks)} tasks from {csv_path}")
def import_from_s3(project_id: int, bucket: str, prefix: str = "") -> None:
    """Attach an S3 bucket as cloud storage for the given project.

    AWS credentials are taken from the standard environment variables.
    """
    client = get_client()
    target = client.get_project(project_id)
    credentials = {
        "aws_access_key_id": os.environ.get("AWS_ACCESS_KEY_ID"),
        "aws_secret_access_key": os.environ.get("AWS_SECRET_ACCESS_KEY"),
    }
    target.connect_cloud_storage(
        type="s3",
        bucket=bucket,
        prefix=prefix,
        region_name="us-east-1",
        **credentials,
    )
    print(f"Connected S3 bucket {bucket}/{prefix} to project {project_id}")
# ── Export annotations ────────────────────────────────────────────────────────
def export_annotations(
    project_id: int,
    output_dir: str = "annotations",
    export_type: str = "JSON",
) -> list[dict]:
    """Export completed annotations in the specified format.

    JSON-style exports (a list of task dicts) are written to
    ``<output_dir>/annotations_<project_id>.json``; binary archive formats
    such as YOLO/COCO are written as ``.zip``.

    Args:
        project_id: Project to export from.
        output_dir: Directory for the output file; created if missing.
        export_type: "JSON", "YOLO", "COCO", "CSV", ...

    Returns:
        The exported task list for JSON exports, or an empty list for
        binary formats.
    """
    out_dir = Path(output_dir)
    # parents=True: previously a nested path like "out/labels" raised
    # FileNotFoundError because only exist_ok=True was passed.
    out_dir.mkdir(parents=True, exist_ok=True)
    ls = get_client()
    project = ls.get_project(project_id)
    annotations = project.export_tasks(export_type=export_type)
    if isinstance(annotations, list):
        output_path = out_dir / f"annotations_{project_id}.json"
        with open(output_path, "w", encoding="utf-8") as f:
            json.dump(annotations, f, indent=2, ensure_ascii=False)
        print(f"Exported {len(annotations)} tasks to {output_path}")
        return annotations
    # Binary formats like YOLO/COCO come back as raw archive bytes.
    output_path = out_dir / f"annotations_{project_id}.zip"
    with open(output_path, "wb") as f:
        f.write(annotations)
    print(f"Exported {export_type} archive to {output_path}")
    return []
ML Backend for Pre-labeling
# ml_backend/sentiment_backend.py — Label Studio ML Backend
from __future__ import annotations
from label_studio_ml import LabelStudioMLBase
class SentimentPredictor(LabelStudioMLBase):
    """
    Pre-labeling ML backend: predicts sentiment labels before human review.
    Register with: label-studio-ml start ml_backend/ --port 9090
    Then connect in Label Studio: Settings → Machine Learning → Add Model
    """

    # Maps model output labels to the Choice values in the labeling config.
    # Hoisted to class level — it was rebuilt on every loop iteration.
    LABEL_MAP = {
        "LABEL_0": "negative",
        "LABEL_1": "neutral",
        "LABEL_2": "positive",
    }
    MODEL_VERSION = "sentiment-v1"
    MAX_CHARS = 512  # truncate long texts to keep a single forward pass bounded

    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.classifier = None  # None until _load_model succeeds
        self._load_model()

    def _load_model(self):
        """Load the pre-trained sentiment classifier (best-effort)."""
        try:
            from transformers import pipeline
            self.classifier = pipeline(
                "text-classification",
                model="cardiffnlp/twitter-roberta-base-sentiment-latest",
            )
            print("SentimentPredictor model loaded")
        except Exception as e:
            # Keep the backend process alive even without a model;
            # predict() then returns empty results instead of crashing.
            print(f"Could not load model: {e}")
            self.classifier = None

    def predict(self, tasks: list[dict], **kwargs) -> list[dict]:
        """Generate pre-predictions for labeling tasks.

        Returns one prediction dict per task, each with a "result" list in
        Label Studio's choices format, a confidence "score", and the
        model version.
        """
        if not self.classifier:
            return [{"result": [], "score": 0.0} for _ in tasks]
        predictions = []
        for task in tasks:
            text = task["data"].get("text", "")
            result = self.classifier(text[:self.MAX_CHARS])[0]
            # Fall back to the raw (lowercased) label for unknown model outputs.
            label = self.LABEL_MAP.get(result["label"], result["label"].lower())
            score = float(result["score"])
            predictions.append({
                "result": [
                    {
                        "from_name": "sentiment",
                        "to_name": "text",
                        "type": "choices",
                        "score": score,
                        "value": {"choices": [label]},
                    }
                ],
                "score": score,
                "model_version": self.MODEL_VERSION,
            })
        return predictions

    def fit(self, annotations: list[dict], **kwargs) -> dict:
        """Fine-tune model on newly completed annotations (optional).

        Collects (text, label) pairs from the submitted annotations and
        returns a status dict with the number of usable examples.
        """
        labeled = []
        for ann in annotations:
            task = ann.get("task", {})
            text = task.get("data", {}).get("text", "")
            result = ann.get("result", [])
            if not (result and text):
                continue
            # BUG FIX: the first result item is not guaranteed to be a
            # choices result (the config also has a Rating control);
            # indexing ["value"]["choices"][0] raised KeyError for those.
            choices = result[0].get("value", {}).get("choices") or []
            if not choices:
                continue
            labeled.append({"text": text, "label": choices[0]})
        print(f"Received {len(labeled)} new annotations for fine-tuning")
        # Fine-tuning implementation would go here
        return {"status": "ok", "n_labeled": len(labeled)}
TypeScript REST Client
// lib/label-studio/client.ts — Label Studio REST API client
// Server location and API token, read from the environment at module load.
const LS_URL = process.env.LABEL_STUDIO_URL ?? "http://localhost:8080"
const LS_API_KEY = process.env.LABEL_STUDIO_API_KEY ?? ""
// Shared request headers: Label Studio uses "Token <key>" authorization.
const headers = {
  "Authorization": `Token ${LS_API_KEY}`,
  "Content-Type": "application/json",
}
// Minimal shapes of the API responses this client consumes.
export type LsProject = { id: number; title: string; task_count: number; num_tasks_with_annotations: number }
export type LsTask = { id: number; data: Record<string, string>; annotations: unknown[] }
/** Authenticated fetch against the Label Studio API; throws on non-2xx. */
async function lsFetch<T>(path: string, options?: RequestInit): Promise<T> {
  const url = `${LS_URL}/api${path}`
  const merged: RequestInit = { ...options, headers: { ...headers, ...options?.headers } }
  const res = await fetch(url, merged)
  if (res.ok) return res.json()
  const detail = await res.text()
  throw new Error(`Label Studio ${res.status}: ${detail}`)
}
/** List all projects */
export async function listProjects(): Promise<LsProject[]> {
  const { results } = await lsFetch<{ results: LsProject[] }>("/projects/")
  return results
}
/** Get tasks with no annotations (unlabeled) */
export async function getUnlabeledTasks(projectId: number, limit = 50): Promise<LsTask[]> {
  const query = `/tasks/?project=${projectId}&was_annotated=false&page_size=${limit}`
  const page = await lsFetch<{ results: LsTask[] }>(query)
  return page.results
}
/** Submit an annotation programmatically.
 *
 * BUG FIX: annotations are created through the per-task endpoint
 * (POST /api/tasks/{id}/annotations/); POST /api/annotations/ with a
 * `task` field in the body is not a create route in the Label Studio API.
 */
export async function submitAnnotation(
  taskId: number,
  result: Array<{ from_name: string; to_name: string; type: string; value: Record<string, unknown> }>,
): Promise<void> {
  await lsFetch(`/tasks/${taskId}/annotations/`, {
    method: "POST",
    body: JSON.stringify({ result }),
  })
}
For the Scale AI alternative when needing enterprise-grade managed labeling with human annotator teams, quality control workflows, SLA guarantees, and native integrations with AWS Rekognition and other cloud AI services — Scale AI handles the human workforce management externally while Label Studio is self-hosted and open-source with no per-annotation pricing, giving full control over the annotation pipeline including the ML backend for model-assisted pre-labeling. For the Prodigy alternative when working in the spaCy ecosystem and needing a programmer-friendly annotation tool with command-line recipes, active learning built in, and seamless spaCy model integration — Prodigy excels for NLP annotation in spaCy workflows while Label Studio supports more media types (images, audio, video, time series) and has a visual web UI suitable for non-technical domain experts. The Claude Skills 360 bundle includes Label Studio skill sets covering project setup, label config XML, ML backend pre-labeling, annotation export, and TypeScript API clients. Start with the free tier to try data labeling workflow generation.