PyTorch is the dominant deep learning framework: dynamic computation graph, Python-native debugging, and a research-to-production path through TorchScript and ONNX. The training loop pattern is explicit: forward pass, loss computation, backward pass, optimizer step. Claude Code writes custom Dataset classes, training loops with gradient checkpointing, mixed-precision training for GPU efficiency, evaluation loops, early stopping, and the ONNX export that serves models in production without a PyTorch dependency.
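Stripped to its essentials, that explicit pattern is only a few lines; everything below elaborates on it. The names here (`model`, `loader`, `criterion`, `optimizer`) are generic placeholders:

```python
# The canonical PyTorch training step (placeholder names, no AMP yet)
for inputs, targets in loader:
    optimizer.zero_grad()                      # clear accumulated gradients
    loss = criterion(model(inputs), targets)   # forward pass + loss
    loss.backward()                            # backward pass
    optimizer.step()                           # parameter update
```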
## CLAUDE.md for PyTorch Projects
```markdown
## ML Stack
- PyTorch 2.x with CUDA 12.x for GPU training
- Mixed precision: torch.amp.autocast + GradScaler (roughly 2x faster on A100/H100)
- Checkpointing: save state_dict every N epochs + best model by validation metric
- Datasets: custom Dataset class; DataLoader with num_workers = CPU count // 2
- Logging: wandb or TensorBoard for loss curves and hyperparameter tracking
- Reproducibility: set seed for torch, numpy, random; cudnn.deterministic = True
- Export: TorchScript for C++ serving, ONNX for cross-framework deployment
```
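The export line above lists TorchScript alongside ONNX, but only ONNX is walked through below. Here is a minimal, hedged TorchScript sketch; it assumes a BERT-style classifier whose `forward(input_ids, attention_mask)` has no data-dependent control flow (if it does, `torch.jit.script` is the safer route):

```python
# Hypothetical TorchScript export via tracing (a trained `model` is assumed in scope)
import torch

model.eval()
example_inputs = (
    torch.randint(0, 30522, (1, 128), dtype=torch.long),  # input_ids (BERT vocab size)
    torch.ones(1, 128, dtype=torch.long),                  # attention_mask
)
traced = torch.jit.trace(model, example_inputs)
traced.save('checkpoints/model_traced.pt')
# C++ serving loads this with torch::jit::load("checkpoints/model_traced.pt")
```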
## Custom Dataset
```python
# ml/dataset.py
import json
import random

import torch
from torch.utils.data import Dataset, DataLoader


class OrderIntentDataset(Dataset):
    """Dataset for classifying customer support message intent."""

    LABELS = ['order_status', 'refund_request', 'product_question', 'complaint', 'other']
    LABEL_TO_IDX = {label: idx for idx, label in enumerate(LABELS)}

    def __init__(
        self,
        data_path: str,
        tokenizer,
        max_length: int = 128,
        split: str = 'train',
    ):
        self.tokenizer = tokenizer
        self.max_length = max_length
        with open(data_path) as f:
            all_data = json.load(f)
        # Shuffle with a local RNG so the split is deterministic
        # without mutating global random state
        random.Random(42).shuffle(all_data)
        n = len(all_data)
        splits = {
            'train': all_data[:int(n * 0.8)],
            'val': all_data[int(n * 0.8):int(n * 0.9)],
            'test': all_data[int(n * 0.9):],
        }
        self.data = splits[split]

    def __len__(self) -> int:
        return len(self.data)

    def __getitem__(self, idx: int) -> dict:
        item = self.data[idx]
        encoding = self.tokenizer(
            item['text'],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(self.LABEL_TO_IDX[item['label']], dtype=torch.long),
        }


def create_dataloaders(data_path: str, tokenizer, batch_size: int = 32):
    train_ds = OrderIntentDataset(data_path, tokenizer, split='train')
    val_ds = OrderIntentDataset(data_path, tokenizer, split='val')
    test_ds = OrderIntentDataset(data_path, tokenizer, split='test')
    return (
        DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True),
        DataLoader(val_ds, batch_size=batch_size * 2, shuffle=False, num_workers=4, pin_memory=True),
        DataLoader(test_ds, batch_size=batch_size * 2, shuffle=False, num_workers=4),
    )
```
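A quick usage sketch, assuming a Hugging Face tokenizer and a hypothetical `data/intents.json` of `{"text": ..., "label": ...}` records:

```python
# Hypothetical wiring: any callable tokenizer with the HF interface works
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
train_loader, val_loader, test_loader = create_dataloaders('data/intents.json', tokenizer)

batch = next(iter(train_loader))
print(batch['input_ids'].shape)  # torch.Size([32, 128])
print(batch['label'].shape)      # torch.Size([32])
```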
## Training Loop with Mixed Precision
```python
# ml/trainer.py
import random
from pathlib import Path

import numpy as np
import torch
import torch.nn as nn
from torch.amp import autocast, GradScaler


def set_seed(seed: int = 42):
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)
    # Trade speed for reproducible cuDNN kernel selection
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def train(
    model: nn.Module,
    train_loader,
    val_loader,
    config: dict,
    output_dir: str = './checkpoints',
):
    set_seed(config.get('seed', 42))
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = model.to(device)
    Path(output_dir).mkdir(parents=True, exist_ok=True)

    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=config['learning_rate'],
        weight_decay=config.get('weight_decay', 0.01),
    )
    # One-cycle schedule: 10% of steps as linear warmup (pct_start),
    # then cosine annealing down from max_lr
    total_steps = len(train_loader) * config['epochs']
    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        optimizer, max_lr=config['learning_rate'],
        total_steps=total_steps, pct_start=0.1,
    )
    scaler = GradScaler()  # Mixed-precision gradient scaler (self-disables without CUDA)
    criterion = nn.CrossEntropyLoss()

    best_val_acc = 0.0
    patience_counter = 0
    for epoch in range(config['epochs']):
        # Training phase
        model.train()
        train_loss = 0.0
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            optimizer.zero_grad(set_to_none=True)  # Cheaper than zeroing grads in place

            # Mixed-precision forward pass
            with autocast(device_type='cuda', dtype=torch.float16):
                logits = model(input_ids, attention_mask=attention_mask)
                loss = criterion(logits, labels)

            # Scaled backward pass
            scaler.scale(loss).backward()
            # Unscale first so clipping sees true gradient magnitudes
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            train_loss += loss.item()

        # Validation phase
        val_acc, val_loss = evaluate(model, val_loader, criterion, device)
        avg_train_loss = train_loss / len(train_loader)
        print(f"Epoch {epoch+1}: train_loss={avg_train_loss:.4f}, "
              f"val_loss={val_loss:.4f}, val_acc={val_acc:.4f}")

        # Save best model
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            patience_counter = 0
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'val_acc': val_acc,
                'config': config,
            }, f'{output_dir}/best_model.pt')
        else:
            patience_counter += 1

        # Early stopping
        if patience_counter >= config.get('patience', 5):
            print(f"Early stopping at epoch {epoch+1}")
            break

    return best_val_acc


@torch.no_grad()
def evaluate(model, loader, criterion, device):
    model.eval()
    correct, total, total_loss = 0, 0, 0.0
    for batch in loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        with autocast(device_type='cuda', dtype=torch.float16):
            logits = model(input_ids, attention_mask=attention_mask)
            loss = criterion(logits, labels)
        predictions = logits.argmax(dim=-1)
        correct += (predictions == labels).sum().item()
        total += labels.size(0)
        total_loss += loss.item()
    return correct / total, total_loss / len(loader)
```
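Restoring the best checkpoint later is symmetric. A minimal sketch, assuming a freshly constructed `model` of the same architecture:

```python
# Minimal restore sketch for the checkpoint format used by train() above
import torch

checkpoint = torch.load('./checkpoints/best_model.pt', map_location='cpu')
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()
print(f"Restored epoch {checkpoint['epoch']} (val_acc={checkpoint['val_acc']:.4f})")
```

To resume training rather than serve, also load `optimizer_state_dict` into a matching optimizer.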
## ONNX Export for Production
```python
# ml/export.py: export to ONNX for inference without the PyTorch runtime
import torch
import onnx


def export_to_onnx(model: torch.nn.Module, output_path: str, max_length: int = 128):
    model.eval()
    device = next(model.parameters()).device
    # Dummy input matching the real input shape (30522 = bert-base vocab size)
    dummy_input_ids = torch.randint(0, 30522, (1, max_length), dtype=torch.long).to(device)
    dummy_attention_mask = torch.ones(1, max_length, dtype=torch.long).to(device)
    torch.onnx.export(
        model,
        (dummy_input_ids, dummy_attention_mask),
        output_path,
        input_names=['input_ids', 'attention_mask'],
        output_names=['logits'],
        dynamic_axes={
            'input_ids': {0: 'batch_size', 1: 'sequence_length'},
            'attention_mask': {0: 'batch_size', 1: 'sequence_length'},
            'logits': {0: 'batch_size'},
        },
        opset_version=17,
        do_constant_folding=True,
    )
    # Verify the export
    onnx_model = onnx.load(output_path)
    onnx.checker.check_model(onnx_model)
    print(f"ONNX model exported and verified: {output_path}")
```
Inference then needs only `onnxruntime` and `numpy`; the PyTorch runtime stays out of the serving image:

```python
# ml/export.py (continued): inference with ONNX Runtime, no PyTorch required
import numpy as np
import onnxruntime as ort


class ONNXClassifier:
    def __init__(self, model_path: str):
        opts = ort.SessionOptions()
        opts.intra_op_num_threads = 4
        # Falls back to CPU when the CUDA provider is unavailable
        self.session = ort.InferenceSession(
            model_path, opts,
            providers=['CUDAExecutionProvider', 'CPUExecutionProvider'],
        )

    def predict(self, input_ids: np.ndarray, attention_mask: np.ndarray) -> np.ndarray:
        """Return predicted class indices for a batch."""
        logits = self.session.run(
            ['logits'],
            {'input_ids': input_ids.astype(np.int64), 'attention_mask': attention_mask.astype(np.int64)},
        )[0]
        return logits.argmax(axis=-1)
```
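A quick serving-side usage sketch, assuming a Hugging Face tokenizer that can return NumPy arrays (`return_tensors='np'`) and a hypothetical model path:

```python
# Hypothetical usage of ONNXClassifier (`tokenizer` assumed in scope)
enc = tokenizer(
    'Where is my order?', max_length=128,
    padding='max_length', truncation=True, return_tensors='np',
)
clf = ONNXClassifier('checkpoints/model.onnx')
pred = clf.predict(enc['input_ids'], enc['attention_mask'])
print(OrderIntentDataset.LABELS[int(pred[0])])  # e.g. 'order_status'
```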
For hyperparameter search that automates finding optimal training configs, the Ray distributed guide covers Ray Tune schedulers and search algorithms. For the MLOps infrastructure that tracks training runs and model versions, the MLOps guide covers MLflow experiment tracking and the model registry. The Claude Skills 360 bundle includes PyTorch skill sets covering custom datasets, mixed-precision training loops, evaluation metrics, and ONNX export. Start with the free tier to try PyTorch training-loop generation.