Flash Attention computes exact attention in O(n) memory instead of O(n²). pip install flash-attn --no-build-isolation. Core: from flash_attn import flash_attn_func. output = flash_attn_func(q, k, v, dropout_p=0.0, causal=True) — q/k/v shape (batch, seqlen, nheads, headdim), dtype float16 or bfloat16. Variable-length (packed sequences): from flash_attn import flash_attn_varlen_func. cu_seqlens_q = torch.cat([torch.tensor([0]), torch.cumsum(seqlens, 0)]).int(). flash_attn_varlen_func(q, k, v, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, max_seqlen_k, causal=True). HuggingFace integration: model = AutoModelForCausalLM.from_pretrained(model_id, attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16) — auto-replaces all attention layers. Sliding window: flash_attn_func(q, k, v, window_size=(512, 0), causal=True) — window_size=(-1,-1) disables windowing. Softmax scale: flash_attn_func(q, k, v, softmax_scale=1.0/math.sqrt(head_dim)). KV cache inference: from flash_attn import flash_attn_with_kvcache. flash_attn_with_kvcache(q, k_cache, v_cache, k, v, cache_seqlens=cache_lens) — updates cache in-place and returns output. ALiBi slopes: compute get_alibi_slopes(nheads) and pass as alibi_slopes parameter. Flash Attention 3: from flash_attn_interface import flash_attn_func as flash_attn3_func — H100/H800 only, 1.5-2x faster via warp specialization and FP8 support. Benchmarks: FA2 is 2-4x faster than standard PyTorch attention at seq_len≥512 and uses 5-20x less memory, enabling 4-8x longer context at the same batch size. Claude Code generates Flash Attention modules, HuggingFace integration, KV cache patterns, and custom attention variants.
CLAUDE.md for Flash Attention
## Flash Attention Stack
- Version: flash-attn >= 2.5 (FA3 via flash_attn_interface for H100/H800 Hopper GPUs only)
- Req: CUDA >= 11.6, PyTorch >= 2.0, bfloat16/float16 only (no float32)
- HF: AutoModelForCausalLM.from_pretrained(..., attn_implementation="flash_attention_2")
- Core: flash_attn_func(q, k, v, causal=True) — q/k/v: (batch, seqlen, nheads, headdim)
- Packed: flash_attn_varlen_func(q,k,v, cu_seqlens_q, cu_seqlens_k, max_sq, max_sk, causal=True)
- KV cache: flash_attn_with_kvcache(q, k_cache, v_cache, k, v, cache_seqlens)
- Sliding window: flash_attn_func(q, k, v, window_size=(left, right))
Flash Attention Module
# models/flash_attention.py — drop-in Flash Attention implementation
from __future__ import annotations
import math
from typing import Optional
import torch
import torch.nn as nn
import torch.nn.functional as F
# Probe for the fused Flash Attention kernels; the model falls back to
# PyTorch SDPA when the flash-attn package is not installed.
try:
    from flash_attn import flash_attn_func, flash_attn_varlen_func
    from flash_attn.bert_padding import pad_input, unpad_input
except ImportError:
    FLASH_ATTN_AVAILABLE = False
else:
    FLASH_ATTN_AVAILABLE = True
class FlashMultiHeadAttention(nn.Module):
    """
    Multi-head self-attention backed by the Flash Attention kernel.

    Falls back to PyTorch ``F.scaled_dot_product_attention`` (SDPA) when
    flash-attn is unavailable or the input dtype is not fp16/bf16.

    Args:
        hidden_dim: Model width; must be divisible by ``num_heads``.
        num_heads: Number of attention heads.
        dropout: Attention dropout probability (applied only in training mode).
        causal: Apply a causal (lower-triangular) mask.
        window_size: (left, right) sliding-window extent; (-1, -1) means full
            attention. NOTE: honored only on the flash path — the SDPA
            fallback ignores it, as in the original implementation.
        use_rotary: Attach rotary position embeddings to q/k.
    """

    def __init__(
        self,
        hidden_dim: int,
        num_heads: int,
        dropout: float = 0.0,
        causal: bool = True,
        window_size: tuple[int, int] = (-1, -1),  # (-1,-1) = full attention
        use_rotary: bool = True,
    ):
        super().__init__()
        assert hidden_dim % num_heads == 0, "hidden_dim must be divisible by num_heads"
        self.num_heads = num_heads
        self.head_dim = hidden_dim // num_heads
        self.scale = 1.0 / math.sqrt(self.head_dim)
        self.causal = causal
        self.dropout = dropout
        self.window_size = window_size
        self.qkv_proj = nn.Linear(hidden_dim, 3 * hidden_dim, bias=False)
        self.out_proj = nn.Linear(hidden_dim, hidden_dim, bias=False)
        self.rotary_emb = RotaryEmbedding(self.head_dim) if use_rotary else None

    def forward(
        self,
        x: torch.Tensor,  # (B, T, D)
        attention_mask: Optional[torch.Tensor] = None,  # (B, T) bool/0-1; True/1 = real token
        position_ids: Optional[torch.Tensor] = None,  # (B, T) int positions for RoPE
    ) -> torch.Tensor:
        """Self-attend over ``x``; returns a (B, T, D) tensor."""
        B, T, D = x.shape
        # QKV projection, then split into per-head tensors.
        qkv = self.qkv_proj(x)  # (B, T, 3D)
        qkv = qkv.view(B, T, 3, self.num_heads, self.head_dim)
        q, k, v = qkv.unbind(dim=2)  # each: (B, T, H, Hd) — flash-attn layout
        if self.rotary_emb is not None:
            cos, sin = self.rotary_emb(v, position_ids)
            q, k = apply_rotary_emb(q, k, cos, sin)
        # Flash kernels only support half precision; otherwise use SDPA.
        if FLASH_ATTN_AVAILABLE and x.dtype in (torch.float16, torch.bfloat16):
            output = self._flash_forward(q, k, v, attention_mask, B, T)
        else:
            output = self._sdpa_forward(q, k, v, attention_mask, B, T)
        output = output.reshape(B, T, D)
        return self.out_proj(output)

    def _flash_forward(
        self, q, k, v, attention_mask, B: int, T: int
    ) -> torch.Tensor:
        """Flash Attention path — unpads masked inputs into packed varlen form."""
        dropout_p = self.dropout if self.training else 0.0
        if attention_mask is None:
            return flash_attn_func(
                q, k, v,
                dropout_p=dropout_p,
                softmax_scale=self.scale,
                causal=self.causal,
                window_size=self.window_size,
            )
        # Remove padding tokens so the kernel only processes real positions.
        # NOTE: newer flash-attn releases return an extra element from
        # unpad_input (seqused); the trailing *_ absorbs it on any version.
        q_unpad, indices_q, cu_seqlens_q, max_seqlen_q, *_ = unpad_input(q, attention_mask)
        k_unpad, _, cu_seqlens_k, max_seqlen_k, *_ = unpad_input(k, attention_mask)
        v_unpad = unpad_input(v, attention_mask)[0]
        output_unpad = flash_attn_varlen_func(
            q_unpad, k_unpad, v_unpad,
            cu_seqlens_q=cu_seqlens_q,
            cu_seqlens_k=cu_seqlens_k,
            max_seqlen_q=max_seqlen_q,
            max_seqlen_k=max_seqlen_k,
            dropout_p=dropout_p,
            softmax_scale=self.scale,
            causal=self.causal,
            window_size=self.window_size,
        )
        # Scatter the packed output back to padded (B, T, H, Hd).
        return pad_input(output_unpad, indices_q, B, T)

    def _sdpa_forward(
        self, q, k, v, attention_mask, B: int, T: int
    ) -> torch.Tensor:
        """PyTorch scaled_dot_product_attention fallback (fp32 / no flash-attn)."""
        # SDPA expects (B, H, T, Hd).
        q, k, v = (t.transpose(1, 2) for t in (q, k, v))
        attn_mask = None
        if attention_mask is not None:
            # SDPA boolean masks use True = "may attend", so pass the padding
            # mask directly (1/True marks real tokens). The previous version
            # inverted it, which attended exclusively to padding positions.
            attn_mask = attention_mask.bool().reshape(B, 1, 1, -1)
            if self.causal:
                # is_causal cannot be combined with an explicit attn_mask, so
                # fold the lower-triangular constraint into the mask itself;
                # the original silently dropped causality for padded inputs.
                causal_mask = torch.ones(T, T, dtype=torch.bool, device=q.device).tril()
                attn_mask = attn_mask & causal_mask
        out = F.scaled_dot_product_attention(
            q, k, v,
            attn_mask=attn_mask,
            dropout_p=self.dropout if self.training else 0.0,
            is_causal=self.causal and attn_mask is None,
            scale=self.scale,
        )
        return out.transpose(1, 2)  # back to (B, T, H, Hd)
# ── Rotary Positional Embedding (RoPE) ────────────────────────────────────────
class RotaryEmbedding(nn.Module):
    """
    Rotary position embeddings (RoPE) for (B, T, H, Hd)-layout q/k.

    Returns a (cos, sin) pair shaped to broadcast against flash-attn's
    (batch, seqlen, nheads, headdim) tensors: (T, 1, Hd) for contiguous
    positions, (B, T, 1, Hd) when explicit ``position_ids`` are given.
    (The previous cache shape (1, 1, T, Hd) only broadcast against the
    (B, H, T, Hd) SDPA layout and failed on the flash layout unless
    H == T — fixed here.)

    The cos/sin tables are cached and grown lazily as longer sequences
    (or larger position ids) are seen.
    """

    def __init__(self, dim: int, base: int = 10000, max_position: int = 32768):
        super().__init__()
        # Standard RoPE inverse frequencies: base^(-2i/dim).
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self._cos_cached: Optional[torch.Tensor] = None
        self._sin_cached: Optional[torch.Tensor] = None
        self._seq_len_cached: int = 0
        self.max_position = max_position  # kept for API compatibility; not enforced

    def _ensure_cache(self, length: int, device: torch.device) -> None:
        """(Re)build the (T, Hd) cos/sin tables up to ``length`` positions."""
        if (
            self._cos_cached is not None
            and length <= self._seq_len_cached
            and self._cos_cached.device == device
        ):
            return
        self._seq_len_cached = max(length, self._seq_len_cached)
        t = torch.arange(self._seq_len_cached, device=device, dtype=self.inv_freq.dtype)
        freqs = torch.einsum("i,j->ij", t, self.inv_freq.to(device))
        emb = torch.cat([freqs, freqs], dim=-1)  # (T, Hd)
        self._cos_cached = emb.cos()
        self._sin_cached = emb.sin()

    def forward(
        self,
        x: torch.Tensor,  # (B, T, H, Hd) reference tensor (device/dtype/seq len)
        position_ids: Optional[torch.Tensor] = None,  # (B, T) or (T,) int positions
    ) -> tuple[torch.Tensor, torch.Tensor]:
        # Grow the cache to cover the largest requested position, so
        # arbitrary position_ids never index past the table.
        needed = x.shape[1] if position_ids is None else int(position_ids.max()) + 1
        self._ensure_cache(needed, x.device)
        if position_ids is not None:
            # Gather per-token rows, then add a head axis for broadcasting.
            cos = self._cos_cached[position_ids].unsqueeze(-2)  # (B, T, 1, Hd)
            sin = self._sin_cached[position_ids].unsqueeze(-2)
        else:
            cos = self._cos_cached[: x.shape[1]].unsqueeze(-2)  # (T, 1, Hd)
            sin = self._sin_cached[: x.shape[1]].unsqueeze(-2)
        return cos.to(x.dtype), sin.to(x.dtype)
def rotate_half(x: torch.Tensor) -> torch.Tensor:
    """Map the last dim (x1 | x2) to (-x2 | x1), split at the midpoint."""
    mid = x.shape[-1] // 2
    first, second = x[..., :mid], x[..., mid:]
    return torch.cat((-second, first), dim=-1)


def apply_rotary_emb(
    q: torch.Tensor,
    k: torch.Tensor,
    cos: torch.Tensor,
    sin: torch.Tensor,
) -> tuple[torch.Tensor, torch.Tensor]:
    """Rotate q and k by the (cos, sin) position tables; shapes must broadcast."""
    def _rotate(t: torch.Tensor) -> torch.Tensor:
        return t * cos + rotate_half(t) * sin

    return _rotate(q), _rotate(k)
# ── HuggingFace integration ───────────────────────────────────────────────────
def load_with_flash_attention(
    model_id: str,
    dtype: torch.dtype = torch.bfloat16,
    device: str = "auto",
):
    """
    Load a HuggingFace causal LM with its attention layers swapped for
    Flash Attention 2 kernels.

    Args:
        model_id: Hub id or local path of the checkpoint.
        dtype: Compute dtype; flash-attn requires float16 or bfloat16.
        device: Forwarded as ``device_map`` (e.g. "auto").

    Returns:
        A ``(model, tokenizer)`` pair.
    """
    from transformers import AutoModelForCausalLM, AutoTokenizer

    load_kwargs = dict(
        attn_implementation="flash_attention_2",  # drop-in kernel swap
        torch_dtype=dtype,
        device_map=device,
        use_cache=True,
    )
    model = AutoModelForCausalLM.from_pretrained(model_id, **load_kwargs)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    print(f"Loaded {model_id} with Flash Attention 2")
    print(f"Memory: {model.get_memory_footprint() / 1e9:.2f} GB")
    return model, tokenizer
# ── KV-cache inference with Flash Attention ───────────────────────────────────
def streaming_generation_example():
    """
    Demonstrate KV-cache decoding of a single token with Flash Attention.

    ``flash_attn_with_kvcache`` writes the new k/v into the pre-allocated
    cache in place (no copy) and returns the attention output for q.
    Requires a CUDA device and the flash-attn package.
    """
    from flash_attn import flash_attn_with_kvcache

    batch, n_heads, head_dim = 1, 32, 128
    max_len, cur_len = 4096, 512  # cache capacity / tokens already cached
    tensor_kwargs = dict(dtype=torch.bfloat16, device="cuda")

    # KV cache is allocated once and stays on-device for the whole generation.
    k_cache = torch.zeros(batch, max_len, n_heads, head_dim, **tensor_kwargs)
    v_cache = torch.zeros(batch, max_len, n_heads, head_dim, **tensor_kwargs)

    # Projections for the single token being decoded at position cur_len.
    q_step = torch.randn(batch, 1, n_heads, head_dim, **tensor_kwargs)
    k_step = torch.randn(batch, 1, n_heads, head_dim, **tensor_kwargs)
    v_step = torch.randn(batch, 1, n_heads, head_dim, **tensor_kwargs)
    lengths = torch.tensor([cur_len], device="cuda", dtype=torch.int32)

    # Appends k_step/v_step into the caches at offset cur_len, then attends.
    out = flash_attn_with_kvcache(
        q_step, k_cache, v_cache,
        k=k_step, v=v_step,
        cache_seqlens=lengths,
        causal=True,
    )
    return out  # (batch, 1, n_heads, head_dim)
For the PyTorch SDPA (scaled_dot_product_attention) alternative when targeting CPU inference, MPS (Apple Silicon), or environments where CUDA is unavailable — PyTorch’s built-in SDPA automatically selects Flash Attention under the hood on supported CUDA hardware, so using F.scaled_dot_product_attention enables portability while losing the packed variable-length sequence support and more aggressive IO-tiling optimizations that flash_attn_varlen_func provides for training. For the xFormers memory-efficient attention alternative when using architectures that require custom masking patterns not supported by Flash Attention’s causal or sliding-window options — xFormers is more flexible in attention mask shapes while Flash Attention 2’s fused CUDA kernels outperform xFormers by 20-40% on standard causal attention. The Claude Skills 360 bundle includes Flash Attention skill sets covering custom attention modules, HuggingFace integration, KV cache inference patterns, RoPE implementation, and variable-length sequence packing. Start with the free tier to try fast attention code generation.