TorchMetrics provides modular, distributed-safe ML evaluation metrics for PyTorch. Install with pip install torchmetrics and import with from torchmetrics import Accuracy, F1Score, AUROC, MeanSquaredError. Classification follows the stateful pattern acc = Accuracy(task="binary"); acc.update(preds, target); result = acc.compute(); acc.reset(). Multiclass: Accuracy(task="multiclass", num_classes=10). Multilabel: Accuracy(task="multilabel", num_labels=5, average="macro"). Precision/Recall: Precision(task="binary"), Recall(task="multiclass", num_classes=10, average="macro"). F1: F1Score(task="binary"), F1Score(task="multiclass", num_classes=10, average="weighted"). AUROC: AUROC(task="binary") expects probabilities, not logits. Confusion matrix: ConfusionMatrix(task="multiclass", num_classes=5). Regression: MeanSquaredError(), MeanAbsoluteError(), R2Score(); MeanSquaredError(squared=False) gives RMSE. Collections: metrics = MetricCollection([Accuracy(task="binary"), F1Score(task="binary"), AUROC(task="binary")]). Lightning: self.train_metrics = MetricCollection({...}); logging a metric object with self.log("val/acc", acc) lets Lightning handle compute and reset automatically. Detection mAP: from torchmetrics.detection import MeanAveragePrecision; map_metric = MeanAveragePrecision(); map_metric.update(preds, targets). IoU: IntersectionOverUnion(). BLEU: BLEUScore(n_gram=4). ROUGE: ROUGEScore(). Functional API: from torchmetrics.functional import accuracy; acc = accuracy(preds, target, task="binary"). Device: acc = acc.to(device), so the metric lives on the same device as its input tensors. Distributed: metric states are synced across GPUs automatically when compute() is called. Claude Code generates TorchMetrics training loops, MetricCollection validation reporters, and detection mAP evaluators.
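A hedged sketch of the Lightning pattern mentioned above, assuming lightning >= 2.0; LitClassifier, the tiny linear head, and the 32-feature input are illustrative stand-ins. clone(prefix=...) keeps separate metric state per stage, and logging the collection object lets Lightning run compute() and reset() at epoch boundaries.
# lightning_metrics_sketch.py — illustrative only; model and shapes are made up
import torch
import torch.nn as nn
import lightning.pytorch as pl
from torchmetrics import Accuracy, F1Score, AUROC, MetricCollection

class LitClassifier(pl.LightningModule):
    def __init__(self):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(32, 16), nn.ReLU(), nn.Linear(16, 1))
        base = MetricCollection({
            "acc": Accuracy(task="binary"),
            "f1": F1Score(task="binary"),
            "auroc": AUROC(task="binary"),
        })
        self.train_metrics = base.clone(prefix="train/")  # independent state per stage
        self.val_metrics = base.clone(prefix="val/")

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self.net(x).squeeze(-1)
        loss = nn.functional.binary_cross_entropy_with_logits(logits, y.float())
        self.train_metrics.update(torch.sigmoid(logits), y)
        # Logging the collection itself defers compute()/reset() to Lightning
        self.log_dict(self.train_metrics, on_step=False, on_epoch=True)
        self.log("train/loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        preds = torch.sigmoid(self.net(x).squeeze(-1))
        self.val_metrics.update(preds, y)
        self.log_dict(self.val_metrics, on_epoch=True)

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)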
CLAUDE.md for TorchMetrics
## TorchMetrics Stack
- Version: torchmetrics >= 1.4
- Pattern: metric.update(preds, target) → metric.compute() → metric.reset() (see the sketch after this list)
- Classification: task="binary"|"multiclass"|"multilabel", num_classes=N
- Average: "micro" | "macro" | "weighted" | "none" (per-class)
- Collection: MetricCollection([...]) — call update/compute/reset once
- Device: metrics must be on same device as predictions (metric.to(device))
- Functional: torchmetrics.functional.* — stateless single-batch ops
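A minimal quick-start sketch of the bullets above, using synthetic tensors and assuming torchmetrics >= 1.4: the stateful update → compute → reset loop on the right device, plus the stateless functional form for a single batch.
# quickstart_sketch.py — synthetic data, illustrative only
import torch
from torchmetrics import Accuracy, F1Score, AUROC, MetricCollection
from torchmetrics.functional import accuracy

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
metrics = MetricCollection({
    "acc": Accuracy(task="binary"),
    "f1": F1Score(task="binary"),
    "auroc": AUROC(task="binary"),
}).to(device)                                       # same device as the predictions

for _ in range(5):                                  # stand-in for a validation DataLoader
    preds = torch.rand(64, device=device)           # probabilities, not logits
    target = torch.randint(0, 2, (64,), device=device)
    metrics.update(preds, target)                   # accumulate per-batch state

epoch_scores = metrics.compute()                    # dict of scalar tensors
metrics.reset()                                     # clear state before the next epoch

batch_acc = accuracy(preds, target, task="binary")  # functional: stateless, one batch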
TorchMetrics Evaluation Pipeline
# ml/torchmetrics_pipeline.py — ML evaluation metrics with TorchMetrics
from __future__ import annotations
from typing import Any
import torch
import torch.nn as nn
from torchmetrics import (
Accuracy, Precision, Recall, F1Score, AUROC,
ConfusionMatrix, CohenKappa, MatthewsCorrCoef,
MeanSquaredError, MeanAbsoluteError, R2Score,
MeanAbsolutePercentageError,
MetricCollection,
)
from torchmetrics.classification import (
BinaryPrecisionRecallCurve, MulticlassROC,
BinaryCalibrationError, BinaryStatScores,
MultilabelAccuracy,
)
# ── 0. Classification metric suites ──────────────────────────────────────────
def binary_metric_suite(device: torch.device | None = None) -> MetricCollection:
"""
Complete binary classification metric collection.
Covers accuracy, F1, AUROC, precision, recall, and calibration.
"""
metrics = MetricCollection({
"accuracy": Accuracy(task="binary"),
"precision": Precision(task="binary"),
"recall": Recall(task="binary"),
"f1": F1Score(task="binary"),
"auroc": AUROC(task="binary"),
"kappa": CohenKappa(task="binary"),
"mcc": MatthewsCorrCoef(task="binary"),
"ece": BinaryCalibrationError(n_bins=15),
})
if device:
metrics = metrics.to(device)
return metrics
def multiclass_metric_suite(
num_classes: int,
    device: torch.device | None = None,
) -> MetricCollection:
"""
Multi-class classification metrics with macro and weighted averaging.
Pass integer class labels as targets; pass softmax probabilities as preds.
"""
metrics = MetricCollection({
"accuracy_macro": Accuracy(task="multiclass", num_classes=num_classes, average="macro"),
"accuracy_weighted": Accuracy(task="multiclass", num_classes=num_classes, average="weighted"),
"f1_macro": F1Score(task="multiclass", num_classes=num_classes, average="macro"),
"f1_weighted": F1Score(task="multiclass", num_classes=num_classes, average="weighted"),
"precision_macro": Precision(task="multiclass", num_classes=num_classes, average="macro"),
"recall_macro": Recall(task="multiclass", num_classes=num_classes, average="macro"),
"auroc_macro": AUROC(task="multiclass", num_classes=num_classes, average="macro"),
"kappa": CohenKappa(task="multiclass", num_classes=num_classes),
})
if device:
metrics = metrics.to(device)
return metrics
def per_class_metrics(
num_classes: int,
    device: torch.device | None = None,
) -> MetricCollection:
"""Per-class precision, recall, F1 (average='none')."""
metrics = MetricCollection({
"precision_per_class": Precision(task="multiclass", num_classes=num_classes, average="none"),
"recall_per_class": Recall(task="multiclass", num_classes=num_classes, average="none"),
"f1_per_class": F1Score(task="multiclass", num_classes=num_classes, average="none"),
})
if device:
metrics = metrics.to(device)
return metrics
# ── 1. Regression metric suite ────────────────────────────────────────────────
def regression_metric_suite(device: torch.device | None = None) -> MetricCollection:
"""
Standard regression metrics: MSE, RMSE, MAE, MAPE, R².
"""
metrics = MetricCollection({
"mse": MeanSquaredError(squared=True),
"rmse": MeanSquaredError(squared=False),
"mae": MeanAbsoluteError(),
"mape": MeanAbsolutePercentageError(),
"r2": R2Score(),
})
if device:
metrics = metrics.to(device)
return metrics
# ── 2. Training loop integration ──────────────────────────────────────────────
class MetricTracker:
"""
Wraps MetricCollection for use in a training/validation loop.
Handles update, compute, reset, and per-epoch logging in one place.
"""
def __init__(
self,
metrics: MetricCollection,
prefix: str = "val",
):
self.metrics = metrics
self.prefix = prefix
def update(self, preds: torch.Tensor, targets: torch.Tensor) -> None:
"""Accumulate batch predictions."""
self.metrics.update(preds, targets)
def compute(self) -> dict[str, float]:
"""Compute epoch-level metrics and reset state."""
results = self.metrics.compute()
self.metrics.reset()
# Flatten nested tensors to scalars
flat = {}
for k, v in results.items():
if v.ndim == 0:
flat[f"{self.prefix}/{k}"] = v.item()
else:
# per-class vector: log each class separately
for i, vi in enumerate(v.tolist()):
flat[f"{self.prefix}/{k}_class{i}"] = vi
return flat
def reset(self) -> None:
self.metrics.reset()
def run_validation_epoch(
model: nn.Module,
loader, # DataLoader
metrics: MetricTracker,
device: torch.device,
task: str = "binary", # "binary" | "multiclass"
) -> dict[str, float]:
"""
Standard validation loop with TorchMetrics accumulation.
Returns dict of metric_name → float for logging.
"""
model.eval()
with torch.no_grad():
for batch in loader:
x, y = batch
x, y = x.to(device), y.to(device)
logits = model(x)
if task == "binary":
preds = torch.sigmoid(logits).squeeze(-1)
else:
preds = torch.softmax(logits, dim=-1)
metrics.update(preds, y)
return metrics.compute()
# ── 3. Confusion matrix utilities ────────────────────────────────────────────
def compute_confusion_matrix(
preds: torch.Tensor,
targets: torch.Tensor,
num_classes: int,
normalize: str | None = None, # None | "true" | "pred" | "all"
) -> torch.Tensor:
"""
Compute confusion matrix.
normalize="true": row-normalized (recall per class)
normalize="pred": col-normalized (precision per class)
normalize="all": divide all cells by total count
"""
cm_metric = ConfusionMatrix(
task="multiclass",
num_classes=num_classes,
normalize=normalize,
)
cm_metric.update(preds, targets)
return cm_metric.compute()
def per_class_report(
preds: torch.Tensor,
targets: torch.Tensor,
num_classes: int,
    class_names: list[str] | None = None,
) -> list[dict]:
"""
Per-class precision, recall, F1 report as a list of dicts.
Analogous to sklearn's classification_report.
"""
metrics = per_class_metrics(num_classes)
metrics.update(preds, targets)
results = metrics.compute()
metrics.reset()
rows = []
for i in range(num_classes):
name = class_names[i] if class_names else f"class_{i}"
rows.append({
"class": name,
"precision": round(results["precision_per_class"][i].item(), 4),
"recall": round(results["recall_per_class"][i].item(), 4),
"f1": round(results["f1_per_class"][i].item(), 4),
})
return rows
# ── 4. Object detection metrics ───────────────────────────────────────────────
def compute_map(
preds: list[dict], # [{boxes:Tensor, scores:Tensor, labels:Tensor}, ...]
targets: list[dict], # [{boxes:Tensor, labels:Tensor}, ...]
    iou_thresholds: list[float] | None = None,
) -> dict[str, float]:
"""
Compute COCO-style mAP using TorchMetrics MeanAveragePrecision.
preds / targets follow the TorchMetrics dict format.
"""
from torchmetrics.detection import MeanAveragePrecision
metric = MeanAveragePrecision(
iou_thresholds=iou_thresholds, # None = COCO [0.5:0.05:0.95]
box_format="xyxy",
)
metric.update(preds, targets)
result = metric.compute()
return {k: round(float(v), 4) for k, v in result.items() if v.ndim == 0}
# ── 5. NLP metrics ────────────────────────────────────────────────────────────
def compute_bleu(
predictions: list[str],
references: list[list[str]],
n_gram: int = 4,
) -> float:
"""Compute corpus-level BLEU score."""
from torchmetrics.text import BLEUScore
metric = BLEUScore(n_gram=n_gram)
metric.update(predictions, references)
return round(float(metric.compute()), 4)
def compute_rouge(
predictions: list[str],
references: list[str],
) -> dict[str, float]:
"""Compute ROUGE-1, ROUGE-2, ROUGE-L F1 scores."""
from torchmetrics.text import ROUGEScore
metric = ROUGEScore()
metric.update(predictions, references)
result = metric.compute()
return {k: round(float(v), 4) for k, v in result.items()}
# ── Demo ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
print("TorchMetrics Evaluation Demo")
print("=" * 50)
torch.manual_seed(42)
n = 1000
# Binary classification
print("\n── Binary Classification ──")
probs = torch.sigmoid(torch.randn(n))
targets = torch.randint(0, 2, (n,))
suite = binary_metric_suite()
suite.update(probs, targets)
results = suite.compute()
suite.reset()
for name, val in results.items():
print(f" {name:<12} {float(val):.4f}")
# Multi-class
print("\n── Multi-class (5 classes) ──")
num_classes = 5
mc_probs = torch.softmax(torch.randn(n, num_classes), dim=1)
mc_targets = torch.randint(0, num_classes, (n,))
mc_suite = multiclass_metric_suite(num_classes)
mc_suite.update(mc_probs, mc_targets)
mc_results = mc_suite.compute()
mc_suite.reset()
for name, val in mc_results.items():
print(f" {name:<22} {float(val):.4f}")
# Regression
print("\n── Regression ──")
y_pred = torch.randn(n)
y_true = y_pred + 0.3 * torch.randn(n)
reg_suite = regression_metric_suite()
reg_suite.update(y_pred, y_true)
reg_results = reg_suite.compute()
reg_suite.reset()
for name, val in reg_results.items():
print(f" {name:<6} {float(val):.4f}")
# Per-class report
print("\n── Per-class Report (3 classes) ──")
report = per_class_report(
mc_probs[:200, :3].softmax(dim=1),
mc_targets[:200] % 3,
num_classes=3,
class_names=["cat", "dog", "bird"],
)
for row in report:
print(f" {row['class']:<6} P={row['precision']:.3f} R={row['recall']:.3f} F1={row['f1']:.3f}")
# BLEU
print("\n── BLEU Score ──")
preds = ["the quick brown fox", "hello world"]
refs = [["the quick brown fox jumps"], ["hello world today"]]
bleu = compute_bleu(preds, refs, n_gram=2)
print(f" BLEU-2: {bleu}")
Compared with the sklearn.metrics alternative: sklearn metrics are one-shot functions that need every prediction in memory at once, while TorchMetrics' stateful metric.update(batch_preds, batch_targets) accumulates intermediate statistics batch by batch, so you get correct epoch-level metrics without concatenating thousands of prediction arrays, and MetricCollection.update/compute/reset tracks ten metrics in one call with native GPU tensor support (a short comparison sketch closes this section).

Compared with manual Pandas/NumPy computation: hand-rolled precision/recall loops break on edge cases such as all-negative batches (division by zero), multi-GPU training (per-rank statistics must be reduced across processes), and low-precision or integer accumulators (overflow), while TorchMetrics handles all three internally; AUROC(task="multiclass", num_classes=N, average="macro") integrates the ROC curve correctly for imbalanced classes, and MatthewsCorrCoef gives a single balanced summary even under heavy class imbalance.

The Claude Skills 360 bundle includes TorchMetrics skill sets covering binary/multiclass/multilabel accuracy, precision/recall/F1/AUROC, MetricCollection, the MetricTracker training-loop wrapper, confusion matrix normalization, per-class reports, MeanAveragePrecision for detection, BLEU/ROUGE NLP metrics, regression MSE/RMSE/MAE/MAPE/R², and BinaryCalibrationError. Start with the free tier to try ML evaluation code generation.
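To make the sklearn comparison above concrete, a small sketch with synthetic tensors (scikit-learn assumed installed; shapes are illustrative): sklearn needs the full prediction array up front, while the TorchMetrics object streams batches and yields the same epoch-level number.
# accumulation_sketch.py — synthetic data, illustrative only
import torch
from sklearn.metrics import accuracy_score
from torchmetrics import Accuracy

torch.manual_seed(0)
batches = [(torch.rand(256), torch.randint(0, 2, (256,))) for _ in range(8)]

# sklearn: concatenate every prediction, then compute once
all_preds = torch.cat([p for p, _ in batches])
all_targets = torch.cat([t for _, t in batches])
sk_acc = accuracy_score(all_targets.numpy(), (all_preds > 0.5).int().numpy())

# TorchMetrics: stream batches through the stateful metric, never concatenating
tm_acc_metric = Accuracy(task="binary")             # thresholds probabilities at 0.5
for preds, target in batches:
    tm_acc_metric.update(preds, target)             # keeps running correct/total counts
tm_acc = tm_acc_metric.compute().item()

print(f"sklearn={sk_acc:.4f}  torchmetrics={tm_acc:.4f}")  # values should agree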