JAX compiles Python functions to XLA for TPU/GPU. Install: pip install jax flax optax. import jax.numpy as jnp — drop-in NumPy on devices. jax.grad(fn)(x) returns the gradient of a scalar-valued fn w.r.t. x. jax.jit(fn) JIT-compiles on first call. jax.vmap(fn)(batch) vectorizes over the batch axis. jax.pmap(fn)(xs) parallelizes over devices (TPU pods). Compose: jit(vmap(grad(fn))). PRNGs: key = jax.random.PRNGKey(42); key, subkey = jax.random.split(key); x = jax.random.normal(subkey, shape=(10,)). jnp.ones/zeros/arange match the NumPy API. lax.scan(fn, init, xs) — memory-efficient loop over xs, returns (carry, ys). Flax NNX (new API): from flax import nnx; class MLP(nnx.Module): def __init__(self, din, dhid, dout, rngs): self.linear1 = nnx.Linear(din, dhid, rngs=rngs); self.linear2 = nnx.Linear(dhid, dout, rngs=rngs); def __call__(self, x): return self.linear2(nnx.relu(self.linear1(x))). model = MLP(784, 256, 10, rngs=nnx.Rngs(0)). optimizer = nnx.Optimizer(model, optax.adam(1e-3)). Training step, decorated with @nnx.jit: def train_step(model, optimizer, x, y): loss, grads = nnx.value_and_grad(loss_fn)(model, x, y); optimizer.update(grads). Flax Linen (older API): class Net(nn.Module): features: int; @nn.compact def __call__(self, x): return nn.Dense(self.features)(x). params = Net(256).init(key, x)["params"]. y = Net(256).apply({"params": params}, x). TrainState: from flax.training import train_state; state = train_state.TrainState.create(apply_fn=model.apply, params=params, tx=optax.adam(lr)). Sharding: from jax.sharding import Mesh, PartitionSpec, NamedSharding; mesh = Mesh(devices, axis_names=("data","model")). Claude Code generates JAX training loops, Flax NNX modules, pmap multi-device training, and optax optimizer configurations.
CLAUDE.md for JAX/Flax
## JAX/Flax Stack
- Version: jax >= 0.4.25, flax >= 0.9 (NNX), optax >= 0.2
- JAX core: jit/vmap/grad/pmap — compose freely: jit(vmap(grad(fn)))
- RNG: PRNGKey(seed) → split(key) → consume subkey (pure/stateless)
- Flax NNX: nnx.Module with rngs=nnx.Rngs(seed) → nnx.jit/nnx.grad
- Flax Linen: nn.Module.init(key, x) → params dict; .apply({"params":p}, x)
- Optimizer: nnx.Optimizer(model, optax.adam(lr)) or train_state.TrainState
- Multi-device: pmap(fn) maps over a leading device axis, i.e. inputs shaped (n_devices, per_device_batch, ...); use mesh+PartitionSpec for tensor parallelism (TP)
Training Loop
# train/jax_train.py — JAX + Flax NNX training with optax
from __future__ import annotations
import functools
from typing import Any
import jax
import jax.numpy as jnp
import numpy as np
import optax
from flax import nnx
from flax.training import train_state
# ── 1. Define model with Flax NNX ─────────────────────────────────────────────
class MLP(nnx.Module):
    """Three-layer perceptron classifier head built from Flax NNX layers.

    Forward layout:
    Linear -> BatchNorm -> ReLU -> Dropout -> Linear -> BatchNorm -> ReLU -> Linear.
    Dropout is applied only after the first hidden block.
    """

    def __init__(
        self,
        in_features: int,
        hidden_dim: int,
        out_features: int,
        dropout_rate: float,
        rngs: nnx.Rngs,
    ):
        # NOTE: the construction order below is deliberate. Every layer draws
        # from the shared ``rngs`` object, so reordering these statements
        # would change the initial parameters.
        self.linear1 = nnx.Linear(in_features, hidden_dim, rngs=rngs)
        self.linear2 = nnx.Linear(hidden_dim, hidden_dim, rngs=rngs)
        self.linear3 = nnx.Linear(hidden_dim, out_features, rngs=rngs)
        self.bn1 = nnx.BatchNorm(hidden_dim, rngs=rngs)
        self.bn2 = nnx.BatchNorm(hidden_dim, rngs=rngs)
        self.dropout = nnx.Dropout(dropout_rate, rngs=rngs)

    def __call__(self, x: jnp.ndarray, training: bool = True) -> jnp.ndarray:
        """Return the output of the final linear layer for ``x``.

        Args:
            x: input batch fed to the first linear layer.
            training: when True, batch norm uses batch statistics and dropout
                is active; when False, batch norm uses its running averages
                and dropout is disabled.
        """
        inference = not training

        # First hidden block (the only one followed by dropout).
        hidden = self.linear1(x)
        hidden = self.bn1(hidden, use_running_average=inference)
        hidden = nnx.relu(hidden)
        hidden = self.dropout(hidden, deterministic=inference)

        # Second hidden block.
        hidden = self.linear2(hidden)
        hidden = self.bn2(hidden, use_running_average=inference)
        hidden = nnx.relu(hidden)

        return self.linear3(hidden)
class Transformer(nnx.Module):
    """Single-layer Transformer encoder block (pre-LayerNorm variant).

    Two sublayers, each wrapped in a residual connection:
    self-attention over ``ln1(x)``, then a GELU feed-forward network over
    ``ln2(x)``.
    """

    def __init__(self, d_model: int, num_heads: int, mlp_dim: int, rngs: nnx.Rngs):
        """Build the attention, normalisation and feed-forward layers.

        Args:
            d_model: feature size of the block's input and output.
            num_heads: number of attention heads.
            mlp_dim: width of the feed-forward hidden layer.
            rngs: NNX RNG container used for parameter initialisation.
        """
        # ``decode=False`` is required: NNX MultiHeadAttention raises at call
        # time when no decode flag is given on the constructor or the call.
        # This block is a plain encoder, so cached autoregressive decoding is
        # switched off explicitly.
        self.attn = nnx.MultiHeadAttention(
            num_heads=num_heads,
            in_features=d_model,
            decode=False,
            rngs=rngs,
        )
        self.ln1 = nnx.LayerNorm(d_model, rngs=rngs)
        self.ln2 = nnx.LayerNorm(d_model, rngs=rngs)
        self.ff1 = nnx.Linear(d_model, mlp_dim, rngs=rngs)
        self.ff2 = nnx.Linear(mlp_dim, d_model, rngs=rngs)

    def __call__(self, x: jnp.ndarray) -> jnp.ndarray:
        """Apply the block to ``x`` and return an array of the same feature size."""
        # Self-attention with residual (queries, keys and values all come
        # from the normalised input).
        x = x + self.attn(self.ln1(x))
        # Feed-forward with residual
        h = self.ff2(nnx.gelu(self.ff1(self.ln2(x))))
        return x + h
# ── 2. Loss and metrics ───────────────────────────────────────────────────────
def cross_entropy_loss(model: MLP, x: jnp.ndarray, y: jnp.ndarray) -> jnp.ndarray:
    """Return the batch-mean softmax cross-entropy of ``model`` on ``(x, y)``.

    The model is always called with ``training=True``, so this is a
    training-mode loss.

    Args:
        model: callable producing logits from ``x``.
        x: input batch.
        y: integer class labels, one per example.
    """
    train_logits = model(x, training=True)
    per_example = optax.softmax_cross_entropy_with_integer_labels(train_logits, y)
    return per_example.mean()
def accuracy(model: MLP, x: jnp.ndarray, y: jnp.ndarray) -> jnp.ndarray:
    """Return the fraction of examples whose arg-max prediction equals ``y``.

    The model is called with ``training=False``.

    Args:
        model: callable producing logits from ``x``.
        x: input batch.
        y: integer class labels, one per example.

    Returns:
        A 0-d JAX array, not a Python ``float``. Wrap the result in
        ``float(...)`` outside of jit if a Python scalar is needed.
    """
    logits = model(x, training=False)
    predictions = jnp.argmax(logits, axis=-1)
    return jnp.mean(predictions == y)
# ── 3. Train / eval steps (JIT-compiled) ─────────────────────────────────────
@nnx.jit
def train_step(
    model: MLP,
    optimizer: nnx.Optimizer,
    metrics: nnx.MultiMetric,
    x: jnp.ndarray,
    y: jnp.ndarray,
) -> None:
    """Run one optimisation step and record metrics for the batch.

    ``model``, ``optimizer`` and ``metrics`` are all updated in place;
    nothing is returned.

    NOTE(review): ``optimizer.update(grads)`` matches the Optimizer API of
    the Flax version this file was written for; newer Flax releases appear to
    expect the model as an extra argument — confirm against the pinned version.
    """
    grad_fn = nnx.value_and_grad(cross_entropy_loss)
    loss, grads = grad_fn(model, x, y)
    optimizer.update(grads)

    # The logits fed to the metrics come from a separate eval-mode forward
    # pass, executed after the parameter update above.
    eval_logits = model(x, training=False)
    metrics.update(loss=loss, logits=eval_logits, labels=y)
@nnx.jit
def eval_step(
    model: MLP,
    metrics: nnx.MultiMetric,
    x: jnp.ndarray,
    y: jnp.ndarray,
) -> None:
    """Accumulate evaluation loss and accuracy for one batch into ``metrics``.

    The model is run exactly once, in inference mode (``training=False``).
    The loss is computed from those logits here instead of through
    ``cross_entropy_loss``, because that helper always runs the model with
    ``training=True``. Training mode would apply dropout during evaluation
    and have batch norm use batch statistics instead of its running averages.

    Args:
        model: the network being evaluated.
        metrics: metric accumulator, updated in place.
        x: input batch.
        y: integer class labels, one per example.
    """
    logits = model(x, training=False)
    loss = optax.softmax_cross_entropy_with_integer_labels(logits, y).mean()
    metrics.update(loss=loss, logits=logits, labels=y)
# ── 4. Training loop ──────────────────────────────────────────────────────────
def train(
    in_features: int = 5,
    hidden_dim: int = 128,
    out_features: int = 2,
    lr: float = 1e-3,
    epochs: int = 10,
    batch_size: int = 256,
    n_train: int = 10000,
):
    """Train an ``MLP`` on a synthetic random dataset and return it.

    Inputs are standard-normal features with uniformly random integer labels.
    The data is reshuffled every epoch and consumed in ``n_train // batch_size``
    full batches, so the trailing ``n_train % batch_size`` samples of each
    shuffle are skipped. One line with the epoch's mean loss and accuracy is
    printed per epoch.

    Args:
        in_features: number of input features per example.
        hidden_dim: width of the MLP's hidden layers.
        out_features: number of classes.
        lr: learning rate passed to AdamW.
        epochs: number of passes over the dataset.
        batch_size: examples per optimisation step.
        n_train: number of synthetic training examples.

    Returns:
        The trained ``MLP`` instance.
    """
    # Synthetic dataset
    key = jax.random.PRNGKey(42)
    key, k1, k2 = jax.random.split(key, 3)
    X_train = jax.random.normal(k1, (n_train, in_features))
    y_train = jax.random.randint(k2, (n_train,), 0, out_features)

    # Model + optimizer (gradient clipping followed by AdamW)
    model = MLP(in_features, hidden_dim, out_features, 0.1, rngs=nnx.Rngs(0))
    optimizer = nnx.Optimizer(model, optax.chain(
        optax.clip_by_global_norm(1.0),
        optax.adamw(lr, weight_decay=1e-4),
    ))

    # Metrics
    metrics = nnx.MultiMetric(
        loss=nnx.metrics.Average("loss"),
        accuracy=nnx.metrics.Accuracy(),
    )

    n_batches = n_train // batch_size
    for epoch in range(epochs):
        # Shuffle. Split *before* sampling: a key that has been consumed by a
        # random function must not be split or reused afterwards, so the
        # permutation gets its own subkey and ``key`` is only ever split.
        key, perm_key = jax.random.split(key)
        perm = jax.random.permutation(perm_key, n_train)
        X_shuf = X_train[perm]
        y_shuf = y_train[perm]

        metrics.reset()
        for i in range(n_batches):
            start = i * batch_size
            stop = start + batch_size
            train_step(model, optimizer, metrics, X_shuf[start:stop], y_shuf[start:stop])

        computed = metrics.compute()
        print(f"Epoch {epoch+1:3d} | loss={computed['loss']:.4f} | acc={computed['accuracy']:.4f}")

    return model
# ── 5. Multi-device with pmap ─────────────────────────────────────────────────
def pmap_train_step_example():
    """
    Run a single data-parallel Adam step with ``jax.pmap`` and return the
    updated, per-device parameters.

    A small Flax Linen MLP is initialised once, its parameters are stacked
    along a new leading axis of length ``n_devices``, and one update is
    executed on a dummy batch. Gradients and loss are averaged across devices
    with ``pmean`` before the optimizer update is applied.
    """
    n_devices = jax.local_device_count()
    print(f"Training on {n_devices} device(s) with pmap")

    # For pmap, use Flax Linen (pure functions required)
    import flax.linen as nn

    class PmapMLP(nn.Module):
        features: int

        @nn.compact
        def __call__(self, x):
            x = nn.Dense(self.features)(x)
            x = nn.relu(x)
            return nn.Dense(2)(x)

    net = PmapMLP(128)

    # Initialise once on a dummy input, then give every device its own copy
    # by stacking along a new leading axis.
    init_key = jax.random.PRNGKey(0)
    dummy_input = jnp.ones((1, 5))
    params = net.init(init_key, dummy_input)["params"]

    def _replicate(leaf):
        return jnp.stack([leaf] * n_devices)

    replicated_params = jax.tree_util.tree_map(_replicate, params)

    tx = optax.adam(1e-3)
    opt_state = jax.pmap(tx.init)(replicated_params)

    def _update(params, opt_state, x_batch, y_batch):
        def loss_fn(p):
            logits = net.apply({"params": p}, x_batch)
            per_example = optax.softmax_cross_entropy_with_integer_labels(logits, y_batch)
            return per_example.mean()

        loss, grads = jax.value_and_grad(loss_fn)(params)
        # Average gradients (and the reported loss) across all devices
        grads = jax.lax.pmean(grads, axis_name="batch")
        loss = jax.lax.pmean(loss, axis_name="batch")
        updates, next_opt_state = tx.update(grads, opt_state, params)
        next_params = optax.apply_updates(params, updates)
        return next_params, next_opt_state, loss

    pmap_update = jax.pmap(_update, axis_name="batch")

    # Batch must have shape (n_devices, per_device_batch, ...)
    per_device_batch = 64
    x = jnp.ones((n_devices, per_device_batch, 5))
    y = jnp.zeros((n_devices, per_device_batch), dtype=jnp.int32)

    replicated_params, opt_state, loss = pmap_update(replicated_params, opt_state, x, y)
    print(f"pmap step done. Loss: {loss[0]:.4f}")
    return replicated_params
# ── 6. lax.scan for efficient RNN/LSTM ───────────────────────────────────────
@jax.jit
def simple_rnn_scan(params: dict, xs: jnp.ndarray) -> jnp.ndarray:
    """
    Run a tanh RNN over a sequence with ``jax.lax.scan`` instead of a Python loop.

    The recurrence is ``h_t = tanh(W_h @ h_{t-1} + W_x @ x_t + b)``, starting
    from an all-zero hidden state.

    Args:
        params: mapping with keys ``"W_h"`` (hidden, hidden),
            ``"W_x"`` (hidden, input_dim) and ``"b"`` (hidden,).
        xs: input sequence scanned along its leading axis,
            shape (seq_len, input_dim).

    Returns:
        The hidden state after every step, stacked to shape (seq_len, hidden).
    """
    recurrent_w = params["W_h"]
    input_w = params["W_x"]
    bias = params["b"]

    def cell(prev_h, x_t):
        next_h = jnp.tanh(recurrent_w @ prev_h + input_w @ x_t + bias)
        # The new state is both the carry for the next step and this step's output.
        return next_h, next_h

    hidden_size = recurrent_w.shape[0]
    initial_h = jnp.zeros(hidden_size)
    _final_h, hidden_states = jax.lax.scan(cell, initial_h, xs)
    return hidden_states
if __name__ == "__main__":
    # Script entry point: run the single-device NNX training loop first,
    # then the pmap multi-device example.
    model = train()
    pmap_train_step_example()
Consider PyTorch instead when you need a larger ecosystem of third-party libraries (TRL, DeepSpeed, vLLM, Flash Attention), easier debugging with Python-native stack traces, or a larger community for LLM fine-tuning: PyTorch’s imperative style is easier to debug, while JAX/Flax’s functional purity enables XLA compilation that can achieve 2-4x faster training on TPU pods and more predictable memory usage through JIT fusion. Consider TensorFlow/Keras instead when targeting TensorFlow Serving, TFLite mobile deployment, or enterprise deployments with existing TF infrastructure: TensorFlow’s deployment ecosystem is broader, while JAX runs on the same XLA backend and offers better research flexibility through its composable transforms (grad, jit, vmap, pmap), which work uniformly across functions, including nested calls. The Claude Skills 360 bundle includes JAX/Flax skill sets covering NNX module definitions, JIT training loops, pmap multi-device training, lax.scan recurrence, and optax optimizer pipelines. Start with the free tier to try functional ML code generation.