PyTorch Lightning eliminates boilerplate from PyTorch training. Install it with `pip install lightning`. A minimal module defines `forward`, `training_step`, and `configure_optimizers`:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from lightning import LightningModule, Trainer

class Model(LightningModule):
    def __init__(self):  # minimal net added so the snippet runs; any nn.Module works
        super().__init__()
        self.net = nn.Sequential(nn.Linear(10, 64), nn.ReLU(), nn.Linear(64, 2))

    def forward(self, x):
        return self.net(x)

    def training_step(self, batch, batch_idx):
        x, y = batch
        loss = F.cross_entropy(self(x), y)
        self.log("train_loss", loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=1e-3)
```

Run training with `Trainer(max_epochs=10, accelerator="gpu", devices=1).fit(model, train_dl)`. For 4-GPU DDP use `Trainer(accelerator="gpu", devices=4, strategy="ddp")`; pass `strategy="fsdp"` for FSDP, and `precision="16-mixed"` or `"bf16-mixed"` for mixed precision. A `LightningDataModule` separates data handling from the model:

```python
from lightning import LightningDataModule
from torch.utils.data import DataLoader

class Data(LightningDataModule):
    def setup(self, stage):
        self.train_ds, self.val_ds = ...  # build and split datasets here

    def train_dataloader(self):
        return DataLoader(self.train_ds, batch_size=32, num_workers=4)

    def val_dataloader(self):
        return DataLoader(self.val_ds, batch_size=64)
```

Log several metrics at once with `self.log_dict({"val_acc": acc, "val_loss": loss}, prog_bar=True)`. Common callbacks: `ModelCheckpoint(monitor="val_loss", mode="min", save_top_k=3)`, `EarlyStopping(monitor="val_loss", patience=5)`, and `LearningRateMonitor()`. To attach an LR scheduler, return a dict from `configure_optimizers`: `{"optimizer": opt, "lr_scheduler": {"scheduler": scheduler, "interval": "step", "monitor": "val_loss"}}`. Lightning Fabric offers low-level control: create `fabric = Fabric(accelerator="gpu", devices=2, precision="bf16-mixed")`, call `fabric.launch()`, wrap with `model, optimizer = fabric.setup(model, optimizer)`, and replace `loss.backward()` with `fabric.backward(loss)`. `LightningCLI(Model, Data)` reads `--config config.yaml`. Use `self.log("hp_metric", metric, on_epoch=True)` for hyperparameter logging, and `trainer.test(model, dataloaders=test_dl)` to evaluate. Claude Code generates Lightning modules, data modules, training scripts, callback configs, and multi-GPU distributed training setups.
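For the `LightningCLI` entry point, a minimal sketch (assuming the `Model` and `Data` classes from the snippets above are importable; the file name is illustrative):

```python
# cli.py (illustrative entry point). Run as:
#   python cli.py fit --config config.yaml
from lightning.pytorch.cli import LightningCLI

def main() -> None:
    # Wires Model + Data into a config-driven CLI; flags such as
    # --trainer.max_epochs=20 override values from the YAML file.
    LightningCLI(Model, Data)

if __name__ == "__main__":
    main()
```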
# CLAUDE.md for PyTorch Lightning
## Lightning Stack
- Version: lightning >= 2.3 (unified package — includes both pytorch-lightning and Fabric)
- Module: LightningModule — forward, training_step, validation_step, configure_optimizers
- Data: LightningDataModule — prepare_data, setup(stage), train/val/test_dataloader
- Trainer: Trainer(accelerator, devices, strategy, precision, max_epochs, callbacks)
- Strategies: ddp (default multi-GPU), fsdp (large models), deepspeed (ZeRO)
- Precision: "16-mixed", "bf16-mixed", "32-true" — passed to Trainer
- Log: self.log(key, value, on_step, on_epoch, prog_bar, sync_dist)
- Fabric: low-level — fabric.setup(model, opt) → fabric.backward(loss)
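A minimal sketch wiring the stack together (`MyModel` and `train_dl` are placeholders, not part of the stack above):

```python
import lightning as L

# 4-GPU DDP with bf16 autocast; swap strategy="fsdp" once the model
# no longer fits on a single device. In multi-GPU runs, log with
# self.log(..., sync_dist=True) so metrics are reduced across ranks.
trainer = L.Trainer(
    accelerator="gpu",
    devices=4,
    strategy="ddp",
    precision="bf16-mixed",
    max_epochs=10,
)
trainer.fit(MyModel(), train_dl)
```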
## Lightning Training Pipeline

```python
# train/lightning_train.py — full LightningModule with DataModule and callbacks
from __future__ import annotations

import math
from typing import Any

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchmetrics
from torch.utils.data import DataLoader, TensorDataset, random_split

import lightning as L
from lightning.pytorch.callbacks import (
EarlyStopping,
LearningRateMonitor,
ModelCheckpoint,
RichProgressBar,
)
from lightning.pytorch.loggers import TensorBoardLogger, WandbLogger
# ── 1. Model (LightningModule) ─────────────────────────────────────────────────
class ChurnClassifier(L.LightningModule):
"""Churn prediction classifier — self-contained with optimizer config."""
def __init__(
self,
input_dim: int = 10,
hidden_dim: int = 128,
dropout: float = 0.2,
lr: float = 1e-3,
weight_decay: float = 1e-4,
warmup_steps: int = 100,
):
super().__init__()
self.save_hyperparameters() # Saves all __init__ args to self.hparams
self.net = nn.Sequential(
nn.Linear(input_dim, hidden_dim),
nn.BatchNorm1d(hidden_dim),
nn.GELU(),
nn.Dropout(dropout),
nn.Linear(hidden_dim, hidden_dim),
nn.BatchNorm1d(hidden_dim),
nn.GELU(),
nn.Dropout(dropout),
nn.Linear(hidden_dim, 2),
)
        # torchmetrics handles device placement and DDP syncing automatically
        self.train_acc = torchmetrics.Accuracy(task="binary")
        self.val_acc = torchmetrics.Accuracy(task="binary")
        self.test_acc = torchmetrics.Accuracy(task="binary")
def forward(self, x: torch.Tensor) -> torch.Tensor:
return self.net(x)
def _shared_step(self, batch: tuple, stage: str) -> torch.Tensor:
x, y = batch
logits = self(x)
loss = F.cross_entropy(logits, y)
preds = logits.argmax(dim=-1)
        # Each stage gets its own metric so val/test accumulator states never mix
        acc_metric = {"train": self.train_acc, "val": self.val_acc, "test": self.test_acc}[stage]
        acc_metric(preds, y)
        self.log(f"{stage}_loss", loss, on_step=(stage == "train"), on_epoch=True, prog_bar=True)
        self.log(f"{stage}_acc", acc_metric, on_step=False, on_epoch=True, prog_bar=True)
return loss
def training_step(self, batch: tuple, batch_idx: int) -> torch.Tensor:
return self._shared_step(batch, "train")
def validation_step(self, batch: tuple, batch_idx: int) -> None:
self._shared_step(batch, "val")
def test_step(self, batch: tuple, batch_idx: int) -> None:
self._shared_step(batch, "test")
def predict_step(self, batch: tuple, batch_idx: int) -> torch.Tensor:
x = batch[0] if isinstance(batch, (list, tuple)) else batch
return torch.softmax(self(x), dim=-1)[:, 1] # Probability of class 1
def configure_optimizers(self) -> dict[str, Any]:
optimizer = torch.optim.AdamW(
self.parameters(),
lr=self.hparams.lr,
weight_decay=self.hparams.weight_decay,
)
# Cosine annealing with linear warmup
def lr_lambda(current_step):
if current_step < self.hparams.warmup_steps:
return float(current_step) / float(max(1, self.hparams.warmup_steps))
progress = float(current_step - self.hparams.warmup_steps) / float(
max(1, self.trainer.estimated_stepping_batches - self.hparams.warmup_steps)
)
            return max(0.0, 0.5 * (1.0 + math.cos(math.pi * progress)))
scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)
return {
"optimizer": optimizer,
"lr_scheduler": {"scheduler": scheduler, "interval": "step"},
}
# ── 2. Data (LightningDataModule) ─────────────────────────────────────────────
class ChurnDataModule(L.LightningDataModule):
"""Churn prediction data module — handles splitting and loading."""
def __init__(
self,
data_path: str = "data/train.pt",
batch_size: int = 256,
val_split: float = 0.15,
num_workers: int = 4,
pin_memory: bool = True,
):
super().__init__()
self.save_hyperparameters()
self.train_ds = None
self.val_ds = None
self.test_ds = None
def prepare_data(self) -> None:
"""Called on rank 0 only — use for downloads or one-time processing."""
pass
def setup(self, stage: str | None = None) -> None:
"""Called on every device — use for data loading and splitting."""
import numpy as np
# Synthetic data — replace with torch.load(self.hparams.data_path)
rng = np.random.default_rng(42)
n = 10000
X = torch.tensor(rng.standard_normal((n, 10)), dtype=torch.float32)
y = torch.tensor(rng.integers(0, 2, n), dtype=torch.long)
full_ds = TensorDataset(X, y)
n_val = int(len(full_ds) * self.hparams.val_split)
n_train = len(full_ds) - n_val
self.train_ds, self.val_ds = random_split(
full_ds, [n_train, n_val],
generator=torch.Generator().manual_seed(42),
)
if stage in ("test", None):
self.test_ds = self.val_ds # Use val as test for demo
def _make_dl(self, ds, shuffle: bool) -> DataLoader:
return DataLoader(
ds,
batch_size=self.hparams.batch_size,
shuffle=shuffle,
num_workers=self.hparams.num_workers,
pin_memory=self.hparams.pin_memory,
persistent_workers=self.hparams.num_workers > 0,
)
def train_dataloader(self) -> DataLoader:
return self._make_dl(self.train_ds, shuffle=True)
def val_dataloader(self) -> DataLoader:
return self._make_dl(self.val_ds, shuffle=False)
def test_dataloader(self) -> DataLoader:
return self._make_dl(self.test_ds, shuffle=False)
# ── 3. Training script ────────────────────────────────────────────────────────
def train(
max_epochs: int = 50,
accelerator: str = "auto",
devices: int = 1,
strategy: str = "auto", # "ddp" for multi-GPU
precision: str = "bf16-mixed",
use_wandb: bool = False,
fast_dev_run: bool = False,
) -> L.Trainer:
"""Train the churn classifier with full callback and logging setup."""
# Loggers
loggers = [TensorBoardLogger("logs/tb", name="churn")]
if use_wandb:
loggers.append(WandbLogger(project="churn-lightning", log_model="all"))
# Callbacks
    # Keep a named reference so we can read best_model_path after training
    ckpt_cb = ModelCheckpoint(
        dirpath="checkpoints/churn",
        filename="{epoch:02d}-{val_loss:.4f}",
        monitor="val_loss",
        mode="min",
        save_top_k=3,
        save_last=True,
    )
    callbacks = [
        ckpt_cb,
EarlyStopping(
monitor="val_loss",
patience=8,
mode="min",
verbose=True,
),
LearningRateMonitor(logging_interval="step"),
RichProgressBar(),
]
trainer = L.Trainer(
max_epochs=max_epochs,
accelerator=accelerator,
devices=devices,
strategy=strategy,
precision=precision,
gradient_clip_val=1.0,
accumulate_grad_batches=2,
log_every_n_steps=10,
val_check_interval=0.25,
logger=loggers,
callbacks=callbacks,
fast_dev_run=fast_dev_run,
deterministic=True,
)
model = ChurnClassifier()
dm = ChurnDataModule()
trainer.fit(model, datamodule=dm)
trainer.test(model, datamodule=dm)
    print(f"Best checkpoint: {ckpt_cb.best_model_path}")
return trainer
# ── 4. Inference ──────────────────────────────────────────────────────────────
def load_and_predict(checkpoint_path: str, X: torch.Tensor) -> torch.Tensor:
"""Load from checkpoint and run prediction."""
model = ChurnClassifier.load_from_checkpoint(checkpoint_path)
model.eval()
trainer = L.Trainer(accelerator="auto", devices=1, logger=False, enable_checkpointing=False)
dl = DataLoader(TensorDataset(X), batch_size=512)
predictions = trainer.predict(model, dl)
return torch.cat(predictions)
if __name__ == "__main__":
    train(max_epochs=5, fast_dev_run=False)
```
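The pipeline above stays inside `Trainer`; for the Fabric route listed in the stack, a hand-written loop might look like this (a minimal sketch, with the model, optimizer, loader, and epoch count supplied by the caller):

```python
# fabric_train.py (illustrative): the same training step, but with Fabric
# handling device placement, precision, and distributed setup.
import torch.nn.functional as F
from lightning.fabric import Fabric

def fabric_train(model, optimizer, train_dl, epochs: int = 5) -> None:
    fabric = Fabric(accelerator="auto", devices=1, precision="bf16-mixed")
    fabric.launch()
    model, optimizer = fabric.setup(model, optimizer)  # moves both to the device
    train_dl = fabric.setup_dataloaders(train_dl)      # device/strategy-aware loader
    for _ in range(epochs):
        for x, y in train_dl:
            optimizer.zero_grad()
            loss = F.cross_entropy(model(x), y)
            fabric.backward(loss)  # replaces loss.backward(); handles scaling/sync
            optimizer.step()
```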
Consider the raw PyTorch custom-training-loop alternative when you need full control over gradient accumulation timing, custom loss weighting across multiple task heads, or non-standard optimization algorithms that Lightning's abstraction does not support; writing the loop manually avoids the abstraction layer. Lightning's Trainer, in turn, eliminates the 200+ lines of boilerplate otherwise needed for DDP, mixed precision, gradient clipping, checkpoint management, and early stopping, reducing the surface area for training bugs.

Consider the Keras/TensorFlow alternative when operating in a TensorFlow-first organization with TFX pipelines and TensorFlow Serving deployment infrastructure; Keras compiles to TensorFlow, while Lightning is purpose-built for PyTorch, with native support for the broader PyTorch ecosystem including DeepSpeed, FSDP, TorchScript, and ONNX export.

The Claude Skills 360 bundle includes Lightning skill sets covering LightningModule definitions, DataModule pipelines, multi-GPU DDP/FSDP training, callbacks, LightningCLI configs, and Fabric low-level control. Start with the free tier to try scalable model training generation.