Cleanlab automatically detects label errors and other data quality issues. Install it with `pip install cleanlab`. The core call is `find_label_issues(labels=y_train, pred_probs=pred_probs_cv)` from `cleanlab.filter`, which returns a boolean array where `True` marks a likely mislabeled sample; the `pred_probs` must come from out-of-fold cross-validation, never from in-sample predictions. `get_label_quality_scores(labels=y_train, pred_probs=pred_probs_cv)` from `cleanlab.rank` scores every sample (lower = more suspicious), and `X_clean, y_clean = X_train[~issues], y_train[~issues]` drops the flagged rows. For multi-issue detection, build `Datalab(data={"text": texts, "label": labels}, label_name="label")` from `cleanlab` and call `lab.find_issues(pred_probs=pred_probs, features=embeddings)`: `lab.get_issues()` returns a DataFrame with columns including `is_label_issue`, `is_near_duplicate`, and `is_outlier`; `lab.report()` prints a summary; `lab.get_info("label")` shows per-class error rates. `CleanLearning(sklearn_clf)` from `cleanlab.classification` automatically removes label issues during `cl.fit(X_train, y_train)`, and `cl.predict(X_test)` uses the cleaned model. To generate valid `pred_probs`, use scikit-learn's `cross_val_predict(clf, X, y, cv=5, method="predict_proba")`; inputs can be sanity-checked with `cleanlab.internal.validation.assert_valid_inputs`. For HuggingFace models, pass transformer embeddings as `features` to Datalab. To threshold the output, `find_label_issues(frac_noise=0.1)` limits flagging to the top 10% of suspected errors. Supported issue types include `"label"`, `"outlier"`, `"near_duplicate"`, `"non_iid"`, and `"underperforming_group"`. Claude Code generates Cleanlab data auditing scripts, CleanLearning training loops, Datalab multi-issue detection, and integrations with PyTorch and HuggingFace pipelines.
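A minimal quickstart, assuming `X_train` and `y_train` are already loaded as NumPy arrays (the logistic-regression probe is illustrative; any scikit-learn classifier with `predict_proba` works):

```python
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict
from cleanlab.filter import find_label_issues
from cleanlab.rank import get_label_quality_scores

# Out-of-fold probabilities: in-sample predictions would hide label errors
pred_probs_cv = cross_val_predict(
    LogisticRegression(max_iter=1000),
    X_train, y_train,
    cv=5, method="predict_proba",
)

issues = find_label_issues(labels=y_train, pred_probs=pred_probs_cv)  # bool mask
scores = get_label_quality_scores(labels=y_train, pred_probs=pred_probs_cv)

# Drop the flagged rows before retraining
X_clean, y_clean = X_train[~issues], y_train[~issues]
```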
## CLAUDE.md for Cleanlab

```markdown
## Cleanlab Stack
- Version: cleanlab >= 2.6
- Core: find_label_issues(labels, pred_probs) → bool array of suspected mislabeled samples
- Scores: get_label_quality_scores(labels, pred_probs) → float array (lower = more suspicious)
- Datalab: Datalab(data_dict, label_name) → find_issues(pred_probs, features) → get_issues()
- CleanLearning: CleanLearning(sklearn_clf).fit(X, y) — auto-cleans during training
- pred_probs: MUST be out-of-fold CV predictions, NOT in-sample
→ cross_val_predict(clf, X, y, cv=5, method="predict_proba")
- Threshold: find_label_issues(frac_noise=0.1) to limit to top N% suspect labels (see the sketch below)
```
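A quick sketch of the thresholding pattern from the last item, reusing `y_train` and the out-of-fold `pred_probs_cv` from the quickstart above; `frac_noise` and `return_indices_ranked_by` are standard `find_label_issues` parameters:

```python
from cleanlab.filter import find_label_issues

# Return only the top 10% of suspected label errors (frac_noise=0.1),
# as an index array ranked by severity rather than a boolean mask.
ranked_issue_idx = find_label_issues(
    labels=y_train,
    pred_probs=pred_probs_cv,  # must be out-of-fold probabilities
    frac_noise=0.1,
    return_indices_ranked_by="self_confidence",
)
print(ranked_issue_idx[:20])  # the 20 most suspect samples, worst first
```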
## Data Auditing with Cleanlab

```python
# data_quality/cleanlab_audit.py — comprehensive data quality audit
from __future__ import annotations
import pickle
from pathlib import Path
import numpy as np
import pandas as pd
from cleanlab import Datalab
from cleanlab.classification import CleanLearning
from cleanlab.filter import find_label_issues
from cleanlab.rank import get_label_quality_scores
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import StratifiedKFold, cross_val_predict
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
FEATURE_COLS = ["age", "tenure_days", "monthly_spend", "support_tickets", "last_login_days"]
TARGET_COL = "churned"
# ── Step 1: Get out-of-fold predictions ───────────────────────────────────────
def get_cv_pred_probs(
X: np.ndarray,
y: np.ndarray,
n_splits: int = 5,
) -> np.ndarray:
"""
Compute out-of-fold predicted probabilities for Cleanlab.
CRITICAL: Must be cross-validation predictions, not in-sample.
"""
clf = GradientBoostingClassifier(n_estimators=100, random_state=42)
pipeline = Pipeline([("scaler", StandardScaler()), ("clf", clf)])
pred_probs = cross_val_predict(
pipeline,
X,
y,
cv=StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42),
method="predict_proba",
)
print(f"Generated CV pred_probs: shape={pred_probs.shape}")
return pred_probs
# ── Step 2: Find label issues ─────────────────────────────────────────────────
def audit_labels(
df: pd.DataFrame,
pred_probs: np.ndarray,
output_dir: str = "data_quality",
) -> tuple[np.ndarray, pd.DataFrame]:
"""Detect mislabeled samples and return boolean issue mask."""
Path(output_dir).mkdir(exist_ok=True)
y = df[TARGET_COL].values
# Boolean mask: True = likely mislabeled
issue_mask = find_label_issues(labels=y, pred_probs=pred_probs)
# Per-sample quality scores (lower = more suspicious)
scores = get_label_quality_scores(labels=y, pred_probs=pred_probs)
# Build audit DataFrame
audit_df = df.copy()
audit_df["label_quality_score"] = scores
audit_df["is_label_issue"] = issue_mask
audit_df["given_label"] = y
audit_df["predicted_label"] = pred_probs.argmax(axis=1)
audit_df["predicted_prob"] = pred_probs.max(axis=1)
n_issues = int(issue_mask.sum())
issue_rate = n_issues / len(y) * 100
print(f"\nLabel audit results:")
print(f" Total samples: {len(y)}")
print(f" Label issues: {n_issues} ({issue_rate:.1f}%)")
# Per-class error rates
for label in np.unique(y):
mask = y == label
n_errors = issue_mask[mask].sum()
pct = 100 * n_errors / mask.sum()
print(f" Class {label}: {n_errors}/{mask.sum()} issues ({pct:.1f}%)")
audit_df.sort_values("label_quality_score").to_csv(
f"{output_dir}/label_audit.csv", index=False
)
print(f"\nAudit saved: {output_dir}/label_audit.csv")
return issue_mask, audit_df
# ── Step 3: Datalab multi-issue detection ─────────────────────────────────────
def run_datalab(
df: pd.DataFrame,
pred_probs: np.ndarray,
embeddings: np.ndarray | None = None,
output_dir: str = "data_quality",
) -> pd.DataFrame:
"""
Run Datalab for comprehensive data issue detection:
- Label issues (mislabeled samples)
- Outliers (unusual samples)
- Near-duplicates (redundant samples)
- Non-IID (distribution shift)
"""
Path(output_dir).mkdir(exist_ok=True)
y = df[TARGET_COL].values
# Prepare data dict for Datalab
data = {"label": y}
if embeddings is not None:
data["embeddings"] = embeddings
lab = Datalab(data=data, label_name="label")
# Run all issue detectors
lab.find_issues(
pred_probs=pred_probs,
features=embeddings,
)
# Print summary report
lab.report()
# Get full DataFrame of issue scores
issues_df = lab.get_issues()
issues_df.to_csv(f"{output_dir}/datalab_issues.csv")
# Summarize each issue type
for col in issues_df.columns:
if col.startswith("is_"):
n = int(issues_df[col].sum())
pct = 100 * n / len(issues_df)
print(f" {col}: {n} ({pct:.1f}%)")
# Near-duplicate clusters
if "near_duplicate" in lab.issue_summary["issue_type"].values:
        nd_info = lab.get_info("near_duplicate")
        # Info keys can differ across cleanlab versions; .get() keeps this safe.
        print(f"\nNear-duplicate sets: {len(nd_info.get('sets', []))}")
return issues_df
# ── Step 4: CleanLearning — auto-clean during training ───────────────────────
def train_with_cleanlearning(
df: pd.DataFrame,
output_dir: str = "data_quality",
) -> tuple[tuple[StandardScaler, CleanLearning], dict[str, float]]:
"""
Train using CleanLearning — automatically removes suspected label issues
during training, then evaluates on original labels.
"""
X = df[FEATURE_COLS].values
y = df[TARGET_COL].values
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
# CleanLearning wraps a sklearn classifier
base_clf = GradientBoostingClassifier(n_estimators=200, random_state=42)
cl = CleanLearning(clf=base_clf, seed=42)
# fit() internally does CV → finds label issues → retrains on clean subset
cl.fit(X_scaled, y)
# Evaluate on the full original dataset
y_proba = cl.predict_proba(X_scaled)[:, 1]
y_pred = cl.predict(X_scaled)
metrics = {
"train_auc": float(roc_auc_score(y, y_proba)),
}
print(f"\nCleanLearning metrics:")
print(f" Train AUC (original labels): {metrics['train_auc']:.4f}")
print(classification_report(y, y_pred, target_names=["no_churn", "churn"]))
    # Persist the fitted scaler together with the CleanLearning model
Path(output_dir).mkdir(exist_ok=True)
with open(f"{output_dir}/cl_model.pkl", "wb") as f:
pickle.dump((scaler, cl), f)
print(f"CleanLearning model saved: {output_dir}/cl_model.pkl")
    return (scaler, cl), metrics
# ── Full pipeline ─────────────────────────────────────────────────────────────
def run_full_audit(data_path: str = "data/train.csv") -> None:
"""End-to-end data quality audit and clean training."""
df = pd.read_csv(data_path)
X = df[FEATURE_COLS].values
y = df[TARGET_COL].values
print("=== Step 1: Compute CV pred_probs ===")
pred_probs = get_cv_pred_probs(X, y)
print("\n=== Step 2: Label Issue Detection ===")
issue_mask, audit_df = audit_labels(df, pred_probs)
print("\n=== Step 3: Datalab Multi-Issue Analysis ===")
scaler = StandardScaler()
embeddings = scaler.fit_transform(X) # Use scaled features as "embeddings"
issues_df = run_datalab(df, pred_probs, embeddings=embeddings)
print("\n=== Step 4: CleanLearning Training ===")
model, metrics = train_with_cleanlearning(df)
print(f"\n{'='*50}")
print("Data quality audit complete.")
print(f"Results saved to: data_quality/")
if __name__ == "__main__":
    run_full_audit()
```
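The script above reuses scaled tabular features as stand-in "embeddings" for Datalab. For text data, the same flow works with transformer embeddings as `features`, which is the HuggingFace integration mentioned earlier. A hedged sketch, assuming a hypothetical `data/comments.csv` with `text` and `label` columns (the file path and the sentence-transformers probe model are illustrative, not part of the audit script):

```python
# hf_datalab_sketch.py - illustrative; path and model choices are assumptions
import pandas as pd
from cleanlab import Datalab
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_predict

df = pd.read_csv("data/comments.csv")  # hypothetical text dataset
texts, labels = df["text"].tolist(), df["label"].to_numpy()

# Transformer embeddings double as Datalab features, enabling the
# outlier and near-duplicate detectors alongside the label checks.
embeddings = SentenceTransformer("all-MiniLM-L6-v2").encode(texts)

# Out-of-fold probabilities from a lightweight probe on the embeddings
pred_probs = cross_val_predict(
    LogisticRegression(max_iter=1000), embeddings, labels,
    cv=5, method="predict_proba",
)

lab = Datalab(data={"text": texts, "label": labels}, label_name="label")
lab.find_issues(pred_probs=pred_probs, features=embeddings)
lab.report()
```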
For the Evidently alternative: reach for Evidently when you need production ML monitoring dashboards, column drift detection between reference and current datasets, and continuous snapshot-based monitoring with Prometheus export. Evidently monitors models in production, while Cleanlab audits training data quality before you train, catching mislabeled samples that degrade models in ways that stay invisible until deployment.

For the Great Expectations alternative: reach for Great Expectations when you need data validation with rich assertion suites for ETL pipelines, checking schema, null rates, and value ranges on arriving data. Great Expectations validates data format; Cleanlab validates data correctness, using the model's own uncertainty to identify samples whose labels are likely wrong, which is a fundamentally different problem from schema validation.

The Claude Skills 360 bundle includes Cleanlab skill sets covering label issue detection, Datalab multi-issue analysis, CleanLearning training, and HuggingFace integration. Start with the free tier to try data quality generation.