MLX runs ML models natively on Apple Silicon with unified memory. pip install mlx mlx-lm. import mlx.core as mx, import mlx.nn as nn. Arrays: a = mx.array([1.0, 2.0, 3.0]) — lives on GPU via Metal by default. mx.zeros((4, 4)), mx.random.normal((512,)). Operations lazy by default: mx.eval(result) triggers computation. Gradient: grad_fn = mx.grad(loss_fn), grads = grad_fn(model, x, y). loss, grads = mx.value_and_grad(loss_fn)(model, x, y). Module: class MLP(nn.Module): def __init__(self): super().__init__(); self.l1=nn.Linear(784,256); self.l2=nn.Linear(256,10). def __call__(self, x): return self.l2(nn.relu(self.l1(x))). Compile: mx.compile(fn) — JIT traces and caches the computation graph. Optimizer: opt = mlx.optimizers.AdamW(learning_rate=1e-3). opt.update(model, grads). mx.eval(model.parameters(), opt.state). LLM inference: from mlx_lm import load, generate. model, tokenizer = load("mlx-community/Meta-Llama-3.1-8B-Instruct-4bit"). response = generate(model, tokenizer, prompt="Hello!", max_tokens=200, verbose=True). Fine-tuning with LoRA: mlx_lm.lora --model mlx-community/Mistral-7B-v0.1-4bit --train --data data/ --batch-size 4 --lora-layers 4. Quantize: python -m mlx_lm.convert --hf-path meta-llama/Llama-3.2-3B-Instruct --mlx-path mlx-llama3 --quantize --q-bits 4. mx.quantize(layer, bits=4) quantizes Linear layer weights. Batch inference: mlx_lm.server --model mlx-community/Llama-3.2-3B-Instruct-4bit --port 8080 serves OpenAI-compatible HTTP endpoint. Claude Code generates MLX model definitions, training loops, LoRA fine-tuning scripts, quantization pipelines, and inference servers for Apple Silicon.
CLAUDE.md for Apple MLX
## Apple MLX Stack
- Version: mlx >= 0.18, mlx-lm >= 0.18 (Apple Silicon only — M1/M2/M3/M4)
- Arrays: mx.array / mx.zeros/ones/random.normal — lazy eval, call mx.eval() to materialize
- Grad: mx.grad(fn) or mx.value_and_grad(fn) — pure functions over module trees
- Module: nn.Module subclass with __call__; mx.compile(fn) for JIT
- Optimizer: mlx.optimizers.AdamW(learning_rate=lr); opt.update(model, grads); mx.eval(model.parameters(), opt.state)
- LLM: mlx_lm.load(model_id) → generate(model, tokenizer, prompt, max_tokens)
- Quantize: mlx_lm.convert --quantize --q-bits 4 (4-bit, ~4GB for a 7B model)
- LoRA: mlx_lm.lora --model path --train --data dir/ --lora-layers 4
MLX Neural Network Training
# models/mlx_train.py — MLX neural network training on Apple Silicon
from __future__ import annotations
import time
from functools import partial
from pathlib import Path
from typing import Generator
import mlx.core as mx
import mlx.nn as nn
import mlx.optimizers as optim
import numpy as np
# ── 1. Model definitions ──────────────────────────────────────────────────────
class MLP(nn.Module):
    """Multi-layer perceptron for tabular classification or regression.

    Hidden layers use ReLU followed by (optional) dropout; the final
    projection is linear with no activation.
    """

    def __init__(self, dims: list[int], dropout: float = 0.1):
        super().__init__()
        # One hidden Linear per consecutive (dims[i], dims[i+1]) pair,
        # reserving the last pair for the output projection below.
        hidden_pairs = zip(dims[:-2], dims[1:-1])
        self.layers = [nn.Linear(d_in, d_out) for d_in, d_out in hidden_pairs]
        self.out = nn.Linear(dims[-2], dims[-1])
        self.dropout = nn.Dropout(p=dropout)

    def __call__(self, x: mx.array, training: bool = True) -> mx.array:
        for hidden in self.layers:
            x = nn.relu(hidden(x))
            if training:
                x = self.dropout(x)
        return self.out(x)
class TransformerBlock(nn.Module):
    """Single Transformer block with pre-norm."""

    def __init__(self, d_model: int, num_heads: int, mlp_ratio: int = 4):
        super().__init__()
        hidden = d_model * mlp_ratio
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.attn = nn.MultiHeadAttention(d_model, num_heads)
        self.ff1 = nn.Linear(d_model, hidden)
        self.ff2 = nn.Linear(hidden, d_model)

    def __call__(self, x: mx.array, mask: mx.array | None = None) -> mx.array:
        # Pre-norm self-attention sublayer with residual connection.
        normed = self.norm1(x)
        x = x + self.attn(normed, normed, normed, mask=mask)
        # Pre-norm feed-forward sublayer (GELU MLP) with residual connection.
        normed = self.norm2(x)
        return x + self.ff2(nn.gelu(self.ff1(normed)))
class LoRALinear(nn.Module):
    """
    LoRA-augmented linear layer for efficient fine-tuning.

    Adds a trainable low-rank update ``scale * (B @ A)`` on top of a frozen
    base linear layer; only ``A`` and ``B`` receive gradients.

    Args:
        base_layer: Pre-trained ``nn.Linear`` to adapt; its weights are frozen.
        rank: Rank of the low-rank decomposition.
        alpha: LoRA scaling numerator; effective scale is ``alpha / rank``.
    """

    def __init__(self, base_layer: nn.Linear, rank: int = 16, alpha: float = 16.0):
        super().__init__()
        in_f = base_layer.weight.shape[1]
        out_f = base_layer.weight.shape[0]
        # BUGFIX: in MLX, any mx.array attribute of a Module *is* a parameter,
        # so the original `self.weight = base_layer.weight` left the base
        # weights trainable despite the "Frozen" comment.  Keep the base layer
        # as a submodule and freeze() it, which excludes its weights from
        # trainable_parameters() (same pattern as mlx-lm's LoRALinear).
        self.linear = base_layer
        self.linear.freeze()
        # LoRA matrices — the only trainable parameters of this module.
        # A starts small and B starts at zero, so the initial low-rank
        # update is exactly zero and the layer matches the base layer.
        self.scale = alpha / rank
        self.A = mx.random.normal((rank, in_f)) * 0.01
        self.B = mx.zeros((out_f, rank))

    def __call__(self, x: mx.array) -> mx.array:
        # Frozen base projection (includes bias) plus the scaled
        # low-rank correction.
        return self.linear(x) + self.scale * ((x @ self.A.T) @ self.B.T)
# ── 2. Loss functions ─────────────────────────────────────────────────────────
def cross_entropy_loss(model: nn.Module, x: mx.array, y: mx.array) -> mx.array:
    """Mean cross-entropy between model logits and integer class targets."""
    return nn.losses.cross_entropy(model(x, training=True), y, reduction="mean")
def mse_loss(model: nn.Module, x: mx.array, y: mx.array) -> mx.array:
    """Mean squared error between model predictions and targets."""
    diff = model(x, training=True) - y
    return mx.mean(diff * diff)
# ── 3. Training step (compiled for speed) ────────────────────────────────────
def make_train_step(loss_fn):
    """Factory — builds a single-gradient-step function for the given loss.

    BUGFIX (two issues in the original):

    1. ``@partial(mx.compile, inputs=nn.Module.trainable_parameters, ...)``
       passed *unbound methods* where ``mx.compile`` expects concrete state
       trees (e.g. ``[model.state, optimizer.state]``).  That state cannot
       exist at factory time, so the step is left uncompiled here; callers
       wanting compilation should wrap the returned function once the model
       and optimizer are constructed.
    2. ``mx.value_and_grad(loss_fn)`` differentiates w.r.t. the entire first
       positional argument as a plain pytree.  ``nn.value_and_grad(model,
       loss_fn)`` is the documented form that restricts gradients to the
       module's *trainable* parameters (respecting freeze()).

    Args:
        loss_fn: Callable ``(model, x, y) -> scalar mx.array``.

    Returns:
        ``train_step(model, optimizer, x, y) -> loss`` performing one
        in-place parameter update.
    """
    def train_step(
        model: nn.Module,
        optimizer: optim.Optimizer,
        x: mx.array,
        y: mx.array,
    ) -> mx.array:
        loss, grads = nn.value_and_grad(model, loss_fn)(model, x, y)
        optimizer.update(model, grads)
        return loss

    return train_step
# ── 4. Data generator ─────────────────────────────────────────────────────────
def numpy_to_mx_batches(
    X: np.ndarray,
    y: np.ndarray,
    batch_size: int = 256,
    shuffle: bool = True,
) -> Generator[tuple[mx.array, mx.array], None, None]:
    """Yield ``(features, targets)`` minibatches converted to MLX arrays.

    When ``shuffle`` is true, a fresh random permutation is drawn each call;
    the final batch may be smaller than ``batch_size``.
    """
    n = len(X)
    order = np.random.permutation(n) if shuffle else np.arange(n)
    for lo in range(0, n, batch_size):
        sel = order[lo : lo + batch_size]
        yield mx.array(X[sel]), mx.array(y[sel])
# ── 5. Full training loop ─────────────────────────────────────────────────────
def train(
    in_features: int = 10,
    out_features: int = 2,
    hidden_dim: int = 128,
    lr: float = 1e-3,
    epochs: int = 20,
    batch_size: int = 256,
):
    """Train an MLP classifier on synthetic data and return the model.

    Args:
        in_features: Number of input features.
        out_features: Number of target classes.
        hidden_dim: Width of the two hidden layers.
        lr: AdamW learning rate.
        epochs: Number of full passes over the data.
        batch_size: Minibatch size.

    Returns:
        The trained ``MLP`` instance.
    """
    # Synthetic data: Gaussian features, uniform random integer labels
    # (so the achievable loss floor is ~log(out_features)).
    rng = np.random.default_rng(42)
    X_np = rng.standard_normal((5000, in_features)).astype(np.float32)
    y_np = rng.integers(0, out_features, size=5000)
    # Model + optimizer
    model = MLP([in_features, hidden_dim, hidden_dim, out_features])
    mx.eval(model.parameters())  # Materialize lazily-initialized params
    optimizer = optim.AdamW(learning_rate=lr, weight_decay=1e-4)
    train_step = make_train_step(cross_entropy_loss)
    for epoch in range(epochs):
        t0 = time.perf_counter()
        losses: list[float] = []
        for x_b, y_b in numpy_to_mx_batches(X_np, y_np, batch_size):
            loss = train_step(model, optimizer, x_b, y_b)
            # BUGFIX: evaluate the updated parameters and optimizer state,
            # not just the loss.  Evaluating only `loss` leaves the lazy
            # parameter-update graph unevaluated, so it keeps growing
            # across iterations (standard MLX loop evals both each step).
            mx.eval(loss, model.parameters(), optimizer.state)
            losses.append(loss.item())
        elapsed = (time.perf_counter() - t0) * 1e3
        print(f"Epoch {epoch+1:3d} | loss={np.mean(losses):.4f} | {elapsed:.0f}ms")
    return model
# ── 6. LLM inference with mlx-lm ─────────────────────────────────────────────
def run_llm_inference(
    model_id: str = "mlx-community/Meta-Llama-3.1-8B-Instruct-4bit",
    prompt: str = "Explain gradient descent in one paragraph.",
    max_tokens: int = 300,
) -> str:
    """Load quantized LLM and generate text on Apple Silicon.

    Args:
        model_id: Hugging Face repo id of an MLX-converted model.
        prompt: Text prompt passed verbatim to the model.
        max_tokens: Maximum number of tokens to generate.

    Returns:
        The generated completion as a string.
    """
    # Imported lazily so the training path works without mlx-lm installed.
    from mlx_lm import load, generate
    print(f"Loading {model_id}...")
    # Downloads and caches the model from the HF hub on first use.
    model, tokenizer = load(model_id)
    # NOTE(review): recent mlx-lm releases removed the `temp=` kwarg from
    # generate() in favor of a sampler object (mlx_lm.sample_utils.make_sampler);
    # confirm this call signature against the pinned mlx-lm version.
    response = generate(
        model,
        tokenizer,
        prompt=prompt,
        max_tokens=max_tokens,
        temp=0.7,
        verbose=False,
    )
    return response
# ── 7. Save and load model weights ───────────────────────────────────────────
def save_model(model: nn.Module, path: str = "model.npz") -> None:
    """Save model weights to `path` (.npz or .safetensors by extension).

    BUGFIX: the original called ``mx.utils.tree_flatten`` — but ``mx`` is
    ``mlx.core``, which has no ``utils`` submodule (the tree helpers live in
    ``mlx.utils``), so the call raised AttributeError.  The built-in
    ``Module.save_weights`` flattens and serializes parameters itself,
    choosing the format from the file extension.
    """
    model.save_weights(path)
    print(f"Model saved: {path}")
def load_model(model: nn.Module, path: str = "model.npz") -> nn.Module:
    """Load weights from `path` into `model` and return it.

    BUGFIX: the original used ``mx.utils.tree_unflatten``; ``mx`` is
    ``mlx.core``, which has no ``utils`` submodule, so loading always
    failed.  ``Module.load_weights`` accepts a .npz/.safetensors path
    directly and checks that keys and shapes match the module.
    """
    model.load_weights(path)
    mx.eval(model.parameters())  # Materialize now so load errors surface here
    return model
if __name__ == "__main__":
    # Demo entry point: train the toy MLP, then run one LLM generation.
    print("Training MLP on Apple Silicon...")
    trained_model = train()
    print("\nRunning LLM inference...")
    llm_reply = run_llm_inference()
    print(f"Response: {llm_reply[:200]}...")
For the PyTorch MPS backend alternative when needing access to PyTorch’s full ecosystem (TRL, DeepSpeed, Flash Attention) while running on Apple Silicon — torch.device("mps") moves tensors to the M-chip GPU while MLX’s unified memory model avoids explicit copies between CPU and GPU entirely, and MLX’s LoRA fine-tuning via mlx_lm.lora is specifically optimized for Apple Silicon memory bandwidth in ways that PyTorch MPS cannot match. For the GGUF/Ollama alternative when wanting a completely no-code local LLM setup — Ollama downloads and runs GGUF-quantized models with a single command while MLX provides a Python API that allows customization of generation, evaluation pipelines, and fine-tuning that isn’t possible through Ollama’s black-box serving layer. The Claude Skills 360 bundle includes Apple MLX skill sets covering nn.Module definitions, training loops, LoRA fine-tuning, LLM inference with mlx-lm, quantization, and weight serialization. Start with the free tier to try Apple Silicon ML code generation.