einops provides readable tensor operations with named axes notation. pip install einops. from einops import rearrange, reduce, repeat, pack, unpack. from einops.layers.torch import Rearrange, Reduce. Rearrange: rearrange(x, "b c h w -> b (c h w)") — flatten C,H,W. rearrange(x, "b (h h2) (w w2) -> b h w (h2 w2)", h2=2, w2=2) — unfold patches. rearrange(x, "b c h w -> b h w c") — channel last. rearrange(x, "(b t) c -> b t c", b=batch) — unbatch. Merge: rearrange(x, "b h w c -> b (h w) c"). Split: rearrange(x, "b (n d) -> b n d", n=8). Transpose batch dims: rearrange(x, "b c h w -> c b h w"). Reduce: reduce(x, "b c h w -> b c", "mean") — global average pool. reduce(x, "b c h w -> b c 1 1", "mean") — GAP keeping dims. reduce(x, "b n d -> b d", "max") — max over sequence. Repeat: repeat(x, "b c -> b c h w", h=H, w=W) — broadcast. repeat(x, "b 1 d -> b n d", n=N) — expand sequence. Pack/Unpack: packed, ps = pack([img, seq], "b * d") — pack heterogeneous shapes. img, seq = unpack(packed, ps, "b * d"). Layer: Rearrange("b c h w -> b (h w) c") — as nn.Module. Reduce("b c h w -> b c", "mean"). EinMix: from einops.layers.torch import EinMix; EinMix("b t c -> b t d", weight_shape="c d", c=512, d=256) — learnable. Claude Code generates einops tensor transformations for vision transformers, attention mechanisms, and data pipeline reshaping.
CLAUDE.md for einops
## einops Stack
- Version: einops >= 0.7
- Core: rearrange | reduce | repeat | pack | unpack
- Backends: numpy, torch, tensorflow, jax, cupy — auto-detected
- Syntax: "input_pattern -> output_pattern" with named axes
- Merge axes: (a b) in output | Split: (a b) in input with size kwarg
- Layers: from einops.layers.torch import Rearrange, Reduce — as nn.Module
- EinMix: learnable linear projection with named axes
einops Tensor Operations Pipeline
# ml/einops_pipeline.py — readable tensor operations with einops
from __future__ import annotations
import math
from typing import Any
import numpy as np
import torch
import torch.nn as nn
from einops import rearrange, reduce, repeat, pack, unpack, asnumpy
from einops.layers.torch import Rearrange, Reduce, EinMix
# ── 0. Core operation patterns ────────────────────────────────────────────────
def demo_rearrange(x: torch.Tensor) -> dict[str, torch.Tensor]:
    """
    Showcase frequently used rearrange patterns.

    Args:
        x: feature map shaped (batch, channels, height, width).

    Returns:
        Mapping of pattern name -> transformed tensor.
    """
    _, channels, _, _ = x.shape
    out: dict[str, torch.Tensor] = {
        # Collapse the two spatial axes into one
        "flatten_hw": rearrange(x, "b c h w -> b c (h w)"),
        # Collapse everything except the batch axis
        "flatten_all": rearrange(x, "b c h w -> b (c h w)"),
        # NHWC layout (TensorFlow-style ops)
        "channel_last": rearrange(x, "b c h w -> b h w c"),
        # Swap batch and channel axes
        "swap_bc": rearrange(x, "b c h w -> c b h w"),
        # Spatial positions become a token sequence (transformer input)
        "to_seq": rearrange(x, "b c h w -> b (h w) c"),
        # Non-overlapping 2x2 patches, flattened per position
        "patches": rearrange(x, "b c (h h2) (w w2) -> b (h w) (c h2 w2)", h2=2, w2=2),
        # Insert a singleton axis after batch
        "add_dims": rearrange(x, "b c h w -> b 1 c h w"),
    }
    # Drop the channel axis when it is a singleton
    if channels == 1:
        out["squeeze_c"] = rearrange(x, "b 1 h w -> b h w")
    return out
def demo_reduce(x: torch.Tensor) -> dict[str, torch.Tensor]:
    """
    Pooling / reduction patterns over a (batch, channels, height, width) map.

    Returns:
        Mapping of pattern name -> reduced tensor.
    """
    out: dict[str, torch.Tensor] = {}
    # Global average pool → (b, c)
    out["gap"] = reduce(x, "b c h w -> b c", "mean")
    # Global max pool → (b, c)
    out["gmp"] = reduce(x, "b c h w -> b c", "max")
    # GAP keeping singleton spatial dims → (b, c, 1, 1), conv-compatible
    out["gap_keepdims"] = reduce(x, "b c h w -> b c 1 1", "mean")
    # Mean over the width axis only → (b, c, h)
    out["row_mean"] = reduce(x, "b c h w -> b c h", "mean")
    # Mean over everything except batch → (b,)
    out["total_mean"] = reduce(x, "b c h w -> b", "mean")
    return out
def demo_repeat(x: torch.Tensor, n_heads: int = 8) -> dict[str, torch.Tensor]:
    """
    Broadcasting / expansion patterns.

    Args:
        x: 2D tensor (batch, features) with at least 64 feature columns —
           the demo slices the first 64 so the output shapes are predictable.
        n_heads: number of attention heads to broadcast across.

    Returns:
        Mapping of pattern name -> expanded tensor.
    """
    # Fixed-width (b, 64) slice; a 1D input would fail here, so the contract
    # above requires 2D (the old docstring claimed 1D was accepted — it wasn't).
    vec = x[:, :64]
    return {
        # (b, d) → (b, n, d): same vector at every sequence position
        "seq_expand": repeat(vec, "b d -> b n d", n=16),
        # (b, d) → (b, h, d): broadcast across heads
        "head_expand": repeat(vec, "b d -> b h d", h=n_heads),
        # (b, d) → (b, d, h, w): broadcast to a spatial grid
        "spatial_expand": repeat(vec, "b d -> b d h w", h=8, w=8),
    }
# ── 1. Vision Transformer helpers ─────────────────────────────────────────────
def image_to_patches(
    images: torch.Tensor,  # (B, C, H, W)
    patch_size: int = 16,
) -> torch.Tensor:
    """
    Split an image batch into flattened, non-overlapping ViT patches.

    Returns:
        (B, num_patches, C * patch_size**2) tensor of patch vectors.
    """
    pattern = "b c (h p1) (w p2) -> b (h w) (c p1 p2)"
    return rearrange(images, pattern, p1=patch_size, p2=patch_size)
def patches_to_image(
    patches: torch.Tensor,  # (B, num_patches, patch_dim)
    H: int,
    W: int,
    patch_size: int = 16,
    C: int = 3,
) -> torch.Tensor:
    """Inverse of image_to_patches: rebuild (B, C, H, W) images."""
    grid_h, grid_w = H // patch_size, W // patch_size
    pattern = "b (h w) (c p1 p2) -> b c (h p1) (w p2)"
    return rearrange(
        patches, pattern,
        h=grid_h, w=grid_w, p1=patch_size, p2=patch_size, c=C,
    )
def multi_head_split(
    x: torch.Tensor,  # (B, T, D)
    n_heads: int,
) -> torch.Tensor:
    """
    Fan the embedding axis out into attention heads.

    (B, T, D) → (B, H, T, D // H); D must be divisible by n_heads.
    """
    return rearrange(x, "b t (h d) -> b h t d", h=n_heads)
def multi_head_merge(
    x: torch.Tensor,  # (B, H, T, D/H)
) -> torch.Tensor:
    """
    Inverse of multi_head_split: fold the head axis back into the embedding.

    (B, H, T, D/H) → (B, T, H * D/H).
    """
    return rearrange(x, "b h t d -> b t (h d)")
def compute_attention(
q: torch.Tensor, # (B, H, T, D)
k: torch.Tensor,
v: torch.Tensor,
scale: float = None,
) -> torch.Tensor:
"""Scaled dot-product attention using einops-style notation with torch."""
d = q.shape[-1]
scale = scale or math.sqrt(d)
# (B, H, T, T) attention weights
attn = torch.einsum("bhqd, bhkd -> bhqk", q, k) / scale
attn = attn.softmax(dim=-1)
# (B, H, T, D)
out = torch.einsum("bhqk, bhkd -> bhqd", attn, v)
return out
# ── 2. Temporal / sequence patterns ──────────────────────────────────────────
def batch_sequences(
    sequences: list[torch.Tensor],  # list of (T_i, D) tensors, variable T_i
    pad_value: float = 0.0,
) -> tuple[torch.Tensor, torch.Tensor]:
    """
    Pad variable-length sequences into a single batch tensor.

    Args:
        sequences: non-empty list of (T_i, D) tensors sharing D, dtype, device.
        pad_value: fill value for positions beyond each sequence's length.

    Returns:
        batch: (B, T_max, D) padded tensor on the inputs' device/dtype.
        lengths: (B,) long tensor of the original lengths (not a mask; build
            one via ``torch.arange(T_max) < lengths[:, None]`` if needed).

    Raises:
        ValueError: if `sequences` is empty.
    """
    if not sequences:
        raise ValueError("batch_sequences() requires at least one sequence")
    ref = sequences[0]
    max_t = max(s.shape[0] for s in sequences)
    d = ref.shape[-1]
    # Allocate on the same device as the inputs so the row-copies below
    # don't trigger implicit cross-device transfers.
    batch = torch.full(
        (len(sequences), max_t, d), pad_value, dtype=ref.dtype, device=ref.device
    )
    lengths = torch.zeros(len(sequences), dtype=torch.long)
    for i, seq in enumerate(sequences):
        batch[i, : len(seq)] = seq
        lengths[i] = len(seq)
    return batch, lengths
def unbatch_time(
    x: torch.Tensor,  # (B*T, D) — time folded into the batch axis
    batch: int,
) -> torch.Tensor:
    """
    Undo a (B, T, D) → (B*T, D) flatten, restoring the time axis.

    The leading dim of x must be divisible by `batch`.
    """
    return rearrange(x, "(b t) d -> b t d", b=batch)
def sliding_window(
    x: torch.Tensor,  # (B, T, D)
    win_size: int,
    stride: int = 1,
) -> torch.Tensor:
    """
    Create sliding windows along the time axis.

    Args:
        x: (B, T, D) sequence batch.
        win_size: window length along time.
        stride: step between consecutive window starts.

    Returns:
        (B, num_windows, win_size, D). Trailing timesteps that do not fill a
        complete window are dropped (standard unfold semantics).
    """
    # Fast path: non-overlapping windows that tile T exactly are a pure
    # reshape. Guard on divisibility — previously a non-divisible T raised
    # here while the unfold path below would simply have truncated.
    if stride == win_size and x.shape[1] % win_size == 0:
        return rearrange(x, "b (t w) d -> b t w d", w=win_size)
    # General case (overlap, or ragged tail): unfold truncates the remainder.
    windows = x.unfold(1, win_size, stride)  # (B, num_windows, D, win_size)
    return rearrange(windows, "b n d w -> b n w d")
# ── 3. Pack / Unpack for mixed modalities ─────────────────────────────────────
def pack_multimodal(
    image_feats: torch.Tensor,  # (B, H*W, D) image patches
    text_feats: torch.Tensor,   # (B, T, D) text tokens
    audio_feats: torch.Tensor,  # (B, A, D) audio frames
) -> tuple[torch.Tensor, list]:
    """
    Concatenate modality features along the sequence axis via einops pack.

    Returns:
        packed: (B, H*W + T + A, D) combined sequence.
        ps: packed-shape bookkeeping needed later by unpack.
    """
    return pack([image_feats, text_feats, audio_feats], "b * d")
def unpack_multimodal(
    packed: torch.Tensor,
    ps: list,
) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
    """
    Split a packed multimodal sequence back into its original parts.

    Returns (image, text, audio) feature tensors in their pre-pack shapes.
    """
    image_feats, text_feats, audio_feats = unpack(packed, ps, "b * d")
    return image_feats, text_feats, audio_feats
# ── 4. PyTorch module layers ──────────────────────────────────────────────────
def build_patch_embedding(
    image_size: int = 224,
    patch_size: int = 16,
    in_channels: int = 3,
    embed_dim: int = 768,
) -> nn.Sequential:
    """
    ViT patch embedding: patchify with Rearrange, then project with Linear.

    Equivalent to Conv2d(kernel=patch_size, stride=patch_size) but the
    reshape contract is explicit in the pattern string.

    Args:
        image_size: expected square input resolution; validated against
            patch_size here so misconfiguration fails at build time rather
            than inside Rearrange at forward time. (The old code computed
            num_patches from it and then never used the value.)
        patch_size: side length of each square patch.
        in_channels: number of input image channels.
        embed_dim: output embedding width per patch.

    Raises:
        ValueError: if image_size is not a multiple of patch_size.
    """
    if image_size % patch_size != 0:
        raise ValueError(
            f"image_size ({image_size}) must be divisible by patch_size ({patch_size})"
        )
    patch_dim = in_channels * patch_size * patch_size
    return nn.Sequential(
        Rearrange("b c (h p1) (w p2) -> b (h w) (c p1 p2)",
                  p1=patch_size, p2=patch_size),
        nn.LayerNorm(patch_dim),
        nn.Linear(patch_dim, embed_dim),
        nn.LayerNorm(embed_dim),
    )
def build_gap_classifier(
    feature_dim: int,
    num_classes: int,
) -> nn.Sequential:
    """
    Classification head: global-average-pool a (B, C, H, W) feature map,
    normalize, and project to class logits.
    """
    pool = Reduce("b c h w -> b c", "mean")  # GAP as an einops layer
    norm = nn.LayerNorm(feature_dim)
    head = nn.Linear(feature_dim, num_classes)
    return nn.Sequential(pool, norm, head)
def build_einmix_ffn(
    d_model: int,
    d_ff: int,
    dropout: float = 0.1,
) -> nn.Sequential:
    """
    Transformer feed-forward block built from EinMix projections.

    EinMix behaves like nn.Linear, but every axis is named in the pattern,
    so the d_model → d_ff → d_model shape flow is readable at a glance.
    """
    up_proj = EinMix("b t d_model -> b t d_ff",
                     weight_shape="d_model d_ff",
                     d_model=d_model, d_ff=d_ff)
    down_proj = EinMix("b t d_ff -> b t d_model",
                       weight_shape="d_ff d_model",
                       d_ff=d_ff, d_model=d_model)
    return nn.Sequential(up_proj, nn.GELU(), nn.Dropout(dropout), down_proj)
# ── Demo ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    print("einops Tensor Operations Demo")
    print("=" * 50)

    # Core rearrange / reduce patterns on a small random feature map.
    feat_map = torch.randn(4, 3, 32, 32)
    for name, t in demo_rearrange(feat_map).items():
        print(f" rearrange {name}: {tuple(t.shape)}")
    print()
    for name, t in demo_reduce(feat_map).items():
        print(f" reduce {name}: {tuple(t.shape)}")

    # ViT patchify round-trip.
    print()
    imgs = torch.randn(2, 3, 64, 64)
    patches = image_to_patches(imgs, patch_size=8)
    print(f" image_to_patches: {tuple(imgs.shape)} → {tuple(patches.shape)}")
    reconstructed = patches_to_image(patches, 64, 64, patch_size=8, C=3)
    print(f" patches_to_image: {tuple(patches.shape)} → {tuple(reconstructed.shape)}")

    # Multi-head split / merge round-trip.
    seq = torch.randn(2, 16, 128)
    q = multi_head_split(seq, n_heads=4)
    print(f"\n multi_head_split: {tuple(seq.shape)} → {tuple(q.shape)}")
    merged = multi_head_merge(q)
    print(f" multi_head_merge: {tuple(q.shape)} → {tuple(merged.shape)}")

    # Pack three modalities into one sequence, then unpack back.
    img_f = torch.randn(2, 49, 256)
    txt_f = torch.randn(2, 16, 256)
    aud_f = torch.randn(2, 8, 256)
    packed, ps = pack_multimodal(img_f, txt_f, aud_f)
    print(f"\n pack_multimodal: {tuple(packed.shape)} (49+16+8={49+16+8} tokens)")
    i2, t2, a2 = unpack_multimodal(packed, ps)
    print(f" unpack back: img={tuple(i2.shape)}, txt={tuple(t2.shape)}, aud={tuple(a2.shape)}")

    # nn.Module layers built from einops Rearrange.
    patch_embed = build_patch_embedding(64, patch_size=8, embed_dim=128)
    out = patch_embed(torch.randn(2, 3, 64, 64))
    print(f"\n patch_embedding output: {tuple(out.shape)}")
For the torch.view / torch.permute alternative — x.view(b, -1) loses all information about which dimensions were merged, requiring a mental model of the original shape at every call site, while rearrange(x, "b c h w -> b (c h w)") is self-documenting code where the pattern simultaneously specifies the input contract and the output shape, and rearrange(x, "b c (h p1) (w p2) -> b (h w) (c p1 p2)", p1=16, p2=16) replaces a four-line view/permute/view chain with a single expression that a new team member can read directly. For the einsum alternative for multi-head attention — torch.einsum("bhqd,bhkd->bhqk", q, k) is powerful but rearrange(x, "b t (h d) -> b h t d", h=8) prepares the heads-split and rearrange(out, "b h t d -> b t (h d)") merges them without a manual .view() or .transpose(), making the multi-head split/merge pattern reproducible from the operation string alone, and pack([img, txt, audio], "b * d") collapses variable-length multi-modal sequences into one tensor for self-attention then unpack restores original shapes with zero padding logic. The Claude Skills 360 bundle includes einops skill sets covering rearrange flatten/unfold/transpose/split patterns, reduce GAP/GMP/keepdims, repeat broadcasting, pack/unpack for multi-modal batching, image_to_patches for ViT, multi_head_split/merge for attention, sliding_window unfold, Rearrange/Reduce PyTorch layers, EinMix FFN, and patch embedding module. Start with the free tier to try tensor reshape code generation.