Weights & Biases tracks every experiment run — hyperparameters, metrics, system stats, model weights, and prediction tables — with three lines of code. wandb.init(config=config) starts a run. wandb.log({"loss": loss}) instruments the training loop. wandb.Artifact versions datasets and models. Sweep agents explore hyperparameter spaces with Bayesian optimization, running parallel trials across machines. W&B integrates with PyTorch, Hugging Face Trainer, and PyTorch Lightning with one-line callbacks. Claude Code generates W&B instrumented training scripts, sweep configurations, artifact pipelines, and the dashboard queries for production ML experiment management.
CLAUDE.md for W&B Projects
## Weights & Biases Stack
- wandb >= 0.17, initialized with wandb.init(project=PROJECT, entity=ENTITY)
- Config: pass hyperparams via config= dict — never hardcode in training loop
- Logging: wandb.log(metrics, step=epoch) — always include step for time-series plots
- Artifacts: log datasets as type="dataset", models as type="model"
- Sweeps: YAML sweep config + wandb agent — Bayesian for <15 params, grid for <5
- Integrations: WandbCallback for HF Trainer, WandbLogger for Lightning
- Team: set WANDB_ENTITY in env — never hardcode in scripts
Basic Experiment Tracking
# training/train_with_wandb.py — instrument a PyTorch training loop
import wandb
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import os
def train_model(config: dict):
    """Run a fully W&B-instrumented PyTorch training loop.

    Args:
        config: Hyperparameters (hidden_size, num_layers, dropout,
            learning_rate, weight_decay, num_epochs, batch_size, and
            optionally model_type). Logged to W&B and overridable by
            sweep agents via wandb.config.

    Returns:
        The best validation accuracy observed across all epochs.
    """
    # Initialize run — config is logged and appears in the W&B UI.
    run = wandb.init(
        project=os.environ.get("WANDB_PROJECT", "my-project"),
        entity=os.environ.get("WANDB_ENTITY"),
        config=config,
        tags=["baseline", config.get("model_type", "unknown")],
    )
    # Access config through wandb.config so sweep-injected overrides apply.
    cfg = wandb.config

    # Build model and optimizer.
    model = build_model(cfg.hidden_size, cfg.num_layers, cfg.dropout)
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=cfg.learning_rate,
        weight_decay=cfg.weight_decay,
    )
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, T_max=cfg.num_epochs
    )

    # Watch the model: log gradients and parameter histograms.
    wandb.watch(model, log="gradients", log_freq=100)

    train_loader, val_loader = get_dataloaders(cfg.batch_size)
    criterion = nn.CrossEntropyLoss()

    best_val_acc = 0.0
    # Single monotonic step counter shared by ALL wandb.log calls below.
    global_step = 0
    for epoch in range(cfg.num_epochs):
        # Training phase.
        model.train()
        train_losses, train_correct = [], 0
        for batch_idx, (inputs, labels) in enumerate(train_loader):
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            # Gradient clipping guards against exploding gradients.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            global_step += 1

            train_losses.append(loss.item())
            train_correct += (outputs.argmax(1) == labels).sum().item()

            # Log batch-level metrics every N steps.
            if batch_idx % 50 == 0:
                wandb.log({
                    "batch/loss": loss.item(),
                    "batch/lr": scheduler.get_last_lr()[0],
                }, step=global_step)
        scheduler.step()

        # Validation phase.
        val_loss, val_acc = evaluate(model, val_loader, criterion)

        # Log epoch-level metrics.
        # BUG FIX: the original logged these with step=epoch, but W&B requires
        # step to be monotonically increasing per run — once batch logging
        # advanced the step past the epoch index, every epoch-level log was
        # silently dropped. Logging at the current global step (with the epoch
        # number included as a plain metric) keeps both time series intact.
        epoch_metrics = {
            "epoch": epoch,
            "train/loss": sum(train_losses) / len(train_losses),
            "train/accuracy": train_correct / len(train_loader.dataset),
            "val/loss": val_loss,
            "val/accuracy": val_acc,
            "lr": scheduler.get_last_lr()[0],
        }
        wandb.log(epoch_metrics, step=global_step)

        # Save best model as a W&B artifact.
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            save_and_log_model(model, run, epoch, val_acc)

    # Summary metrics — shown in the run overview table. Use the run handle's
    # summary rather than the legacy module-level wandb.summary proxy.
    run.summary["best_val_accuracy"] = best_val_acc
    run.summary["total_epochs"] = cfg.num_epochs
    run.finish()
    return best_val_acc
def save_and_log_model(model, run, epoch: int, val_acc: float):
    """Persist a checkpoint to disk and attach it to the run as a model artifact.

    Args:
        model: The PyTorch model whose state_dict is saved.
        run: The active W&B run that will own the artifact.
        epoch: Epoch index, recorded in the filename and artifact metadata.
        val_acc: Validation accuracy, recorded alongside the epoch.
    """
    # Encode epoch and accuracy in the filename for easy local inspection.
    checkpoint_path = f"checkpoint_epoch{epoch}_acc{val_acc:.3f}.pt"
    torch.save(model.state_dict(), checkpoint_path)

    # Artifact metadata is queryable later via the W&B UI and API.
    checkpoint_artifact = wandb.Artifact(
        name="classifier-checkpoint",
        type="model",
        metadata={"epoch": epoch, "val_accuracy": val_acc},
    )
    checkpoint_artifact.add_file(checkpoint_path)
    run.log_artifact(checkpoint_artifact)
Dataset Versioning with Artifacts
# artifacts/dataset_artifact.py — version datasets with W&B Artifacts
import wandb
import pandas as pd
from pathlib import Path
def log_dataset_artifact(
    data_dir: str,
    artifact_name: str,
    metadata: dict,
    project: str,
) -> str:
    """Upload a dataset directory as a versioned W&B artifact.

    Args:
        data_dir: Directory whose contents become the artifact's files;
            must contain a train.csv for the preview table.
        artifact_name: Name under which versions accumulate (v0, v1, ...).
        metadata: Arbitrary dict stored on the artifact; "description"
            is also surfaced as the artifact description.
        project: W&B project the data-prep run is logged under.

    Returns:
        The string form of the logged artifact.
    """
    run = wandb.init(project=project, job_type="data-prep")

    dataset_artifact = wandb.Artifact(
        name=artifact_name,
        type="dataset",
        description=metadata.get("description", ""),
        metadata=metadata,
    )
    # Whole-directory upload; W&B deduplicates files by content hash.
    dataset_artifact.add_dir(data_dir)

    # Attach a small preview so the data can be eyeballed in the UI.
    preview = wandb.Table(dataframe=pd.read_csv(f"{data_dir}/train.csv").head(100))
    dataset_artifact.add(preview, "sample_preview")

    run.log_artifact(dataset_artifact)
    # NOTE(review): wait() blocks until upload completes but returns the
    # artifact object itself, not a URL — the name below overstates it.
    artifact_url = dataset_artifact.wait()
    run.finish()

    print(f"Dataset artifact: {artifact_url}")
    return str(artifact_url)
def download_dataset_artifact(
    artifact_path: str,  # "entity/project/artifact_name:version"
    download_dir: str = "./data",
) -> str:
    """Fetch one pinned version of a dataset artifact to local disk.

    Args:
        artifact_path: Fully-qualified reference, including the version or
            alias (e.g. ":latest").
        download_dir: Local root the artifact files are placed under.

    Returns:
        The local directory the artifact was downloaded into.
    """
    consumer_run = wandb.init(job_type="training")
    # use_artifact both records lineage on this run and yields a handle
    # for downloading and inspecting metadata.
    artifact = consumer_run.use_artifact(artifact_path, type="dataset")
    local_path = artifact.download(root=download_dir)
    print(f"Dataset downloaded to: {local_path}")
    print(f"Artifact metadata: {artifact.metadata}")
    consumer_run.finish()
    return local_path
def log_predictions_table(
    texts: list[str],
    true_labels: list[str],
    pred_labels: list[str],
    pred_scores: list[float],
    run=None,
) -> None:
    """Log a per-example prediction table for error analysis in the W&B UI.

    Args:
        texts: Input texts, index-aligned with the label/score lists.
        true_labels: Gold labels.
        pred_labels: Model-predicted labels.
        pred_scores: Confidence score for each prediction.
        run: Optional W&B run to log through; falls back to the global
            wandb module when absent.
    """
    columns = ["text", "true_label", "predicted_label", "score", "correct"]
    rows = []
    for text, gold, pred, score in zip(texts, true_labels, pred_labels, pred_scores):
        # "correct" lets the UI filter straight to misclassifications.
        rows.append([text, gold, pred, score, gold == pred])
    table = wandb.Table(columns=columns, data=rows)
    # Prefer the explicit run handle when given (same truthiness test as
    # the original `if run:`), otherwise log via the global run.
    logger = run or wandb
    logger.log({"predictions": table})
Hyperparameter Sweeps
# sweeps/run_sweep.py — Bayesian hyperparameter optimization
import wandb
import os
# Sweep configuration — defines the search space
SWEEP_CONFIG = {
    "name": "transformer-classifier-sweep",
    "method": "bayes",  # bayes, random, or grid
    # Objective the Bayesian optimizer maximizes; the training code must
    # log this exact metric name via wandb.log for the sweep to steer.
    "metric": {
        "name": "val/f1_macro",
        "goal": "maximize",
    },
    "parameters": {
        # Log-uniform sampling: learning rates spread evenly across
        # orders of magnitude rather than linearly.
        "learning_rate": {
            "distribution": "log_uniform_values",
            "min": 1e-5,
            "max": 1e-3,
        },
        # Discrete choices are sampled from "values" lists.
        "batch_size": {
            "values": [16, 32, 64],
        },
        "warmup_ratio": {
            "distribution": "uniform",
            "min": 0.0,
            "max": 0.15,
        },
        "weight_decay": {
            "distribution": "log_uniform_values",
            "min": 1e-4,
            "max": 1e-1,
        },
        # LoRA rank; alpha is derived in the trial as lora_r * ratio.
        "lora_r": {
            "values": [8, 16, 32, 64],
        },
        "lora_alpha_ratio": {
            "values": [1, 2, 4],  # lora_alpha = lora_r * ratio
        },
        "num_epochs": {
            "values": [3, 5, 8],
        },
    },
    # Hyperband early stopping: unpromising trials are killed after at
    # least min_iter iterations, with successive halving factor eta.
    "early_terminate": {
        "type": "hyperband",
        "min_iter": 2,
        "eta": 2,
    },
}
def train_sweep():
    """Run one sweep trial; invoked by each `wandb agent` process."""
    # No explicit config here: the sweep controller injects the sampled
    # hyperparameters into wandb.config at init time.
    run = wandb.init()
    cfg = wandb.config

    # Derived value — alpha scales with rank via the sampled ratio.
    lora_alpha = cfg.lora_r * cfg.lora_alpha_ratio

    # Deferred import keeps GPU memory untouched until after the agent forks.
    from training.lora_trainer import train as lora_train

    trial_metrics = lora_train(
        learning_rate=cfg.learning_rate,
        batch_size=cfg.batch_size,
        warmup_ratio=cfg.warmup_ratio,
        weight_decay=cfg.weight_decay,
        lora_r=cfg.lora_r,
        lora_alpha=lora_alpha,
        num_epochs=cfg.num_epochs,
    )
    wandb.log(trial_metrics)
    run.finish()
def launch_sweep(project: str, count: int = 30) -> str:
    """Register a sweep with the W&B backend and run trials locally.

    Args:
        project: W&B project the sweep belongs to.
        count: Number of trials this local agent executes.

    Returns:
        The sweep id, reusable by agents on other machines.
    """
    sweep_id = wandb.sweep(
        sweep=SWEEP_CONFIG,
        project=project,
        entity=os.environ.get("WANDB_ENTITY"),
    )
    print(f"Sweep created: {sweep_id}")
    print(f"View at: https://wandb.ai/{os.environ.get('WANDB_ENTITY')}/{project}/sweeps/{sweep_id}")

    # The agent pulls sampled configs from the controller and runs `count`
    # trials sequentially; for parallelism, start this same function on
    # multiple machines with the same sweep_id.
    wandb.agent(sweep_id, function=train_sweep, count=count)
    return sweep_id
Hugging Face Trainer Integration
# integrations/hf_trainer_wandb.py — W&B + Hugging Face Trainer
import os
from transformers import TrainingArguments, Trainer
import wandb
def build_training_args_with_wandb(
    output_dir: str,
    run_name: str,
    config: dict,
) -> TrainingArguments:
    """Build TrainingArguments that report metrics to W&B.

    Args:
        output_dir: Where the Trainer writes checkpoints.
        run_name: Display name for the W&B run.
        config: Must contain num_epochs, batch_size, and learning_rate;
            warmup_ratio and weight_decay are optional with defaults.

    Returns:
        TrainingArguments configured with report_to="wandb".
    """
    # Trainer reads WANDB_PROJECT from the environment. setdefault only
    # fills in the default when the caller hasn't set it — replaces the
    # original's redundant read-then-write round trip.
    os.environ.setdefault("WANDB_PROJECT", "hf-experiments")

    # NOTE(review): to also upload checkpoints as W&B model artifacts, set
    # WANDB_LOG_MODEL="checkpoint" in the environment. The original comment
    # claimed checkpoint-artifact logging happened here, but nothing in this
    # function enables it.
    return TrainingArguments(
        output_dir=output_dir,
        run_name=run_name,
        report_to="wandb",  # Enable W&B logging
        num_train_epochs=config["num_epochs"],
        per_device_train_batch_size=config["batch_size"],
        learning_rate=config["learning_rate"],
        warmup_ratio=config.get("warmup_ratio", 0.06),
        weight_decay=config.get("weight_decay", 0.01),
        eval_strategy="epoch",
        save_strategy="epoch",
        load_best_model_at_end=True,  # Restore best checkpoint by f1_macro
        metric_for_best_model="f1_macro",
        fp16=True,
        logging_steps=25,
        push_to_hub=False,
    )
def log_evaluation_report(trainer: Trainer, eval_dataset, label_names: list[str]):
    """Log per-class eval metrics and a confusion matrix to W&B.

    Args:
        trainer: A fitted Hugging Face Trainer.
        eval_dataset: Dataset to run prediction on.
        label_names: Class names, index-aligned with the model's label ids.
    """
    import numpy as np
    from sklearn.metrics import classification_report

    predictions = trainer.predict(eval_dataset)
    pred_labels = np.argmax(predictions.predictions, axis=-1)
    true_labels = predictions.label_ids

    report = classification_report(
        true_labels, pred_labels,
        target_names=label_names,
        output_dict=True,
    )

    # BUG FIX: the original issued one wandb.log() call per scalar, which
    # advances the W&B global step once per metric and scatters the values
    # across dozens of steps. Collect everything into a single payload and
    # log once so all eval metrics land on the same step. (Also drops the
    # original's unused sklearn confusion_matrix computation.)
    payload = {}
    for label, metrics in report.items():
        if isinstance(metrics, dict):  # skip scalar entries like "accuracy"
            for metric_name, value in metrics.items():
                payload[f"eval/{label}/{metric_name}"] = value

    # Confusion matrix rendered as an interactive W&B plot.
    payload["eval/confusion_matrix"] = wandb.plot.confusion_matrix(
        y_true=true_labels.tolist(),
        preds=pred_labels.tolist(),
        class_names=label_names,
    )
    wandb.log(payload)
For the MLflow alternative that stores experiment metadata locally or on your own server without a SaaS dependency, see the MLflow guide for experiment tracking and model registry. For the Hugging Face Trainer that integrates directly with W&B via report_to="wandb", the Transformers guide covers LoRA fine-tuning and TrainingArguments. The Claude Skills 360 bundle includes W&B skill sets covering experiment instrumentation, sweep configuration, and artifact pipelines. Start with the free tier to try W&B training script generation.