Albumentations is a fast image augmentation library for computer vision pipelines. Install with pip install albumentations, then import albumentations as A and from albumentations.pytorch import ToTensorV2. Compose a pipeline with transform = A.Compose([A.HorizontalFlip(p=0.5), A.RandomBrightnessContrast(p=0.2)]) and apply it with result = transform(image=img_np); aug_img = result["image"]. Crops: A.RandomCrop(height=224, width=224), A.CenterCrop(256, 256), A.RandomResizedCrop(224, 224, scale=(0.08, 1.0)). Geometric: A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.2, rotate_limit=30, p=0.5), A.Affine(scale=(0.8, 1.2), rotate=(-15, 15), shear=(-10, 10)), A.ElasticTransform(alpha=1, sigma=50), A.GridDistortion(num_steps=5, distort_limit=0.3). Color: A.HueSaturationValue(hue_shift_limit=20, sat_shift_limit=30), A.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1), A.CLAHE(clip_limit=4.0, tile_grid_size=(8,8)), A.ToGray(p=0.1), A.RGBShift(). Noise/blur: A.GaussNoise(var_limit=(10,50)), A.GaussianBlur(blur_limit=3), A.MotionBlur(), A.ISONoise(). Dropout: A.CoarseDropout(max_holes=8, max_height=32, max_width=32), A.GridDropout(ratio=0.5). Normalize: A.Normalize(mean=[0.485,0.456,0.406], std=[0.229,0.224,0.225]). Boxes and keypoints: A.Compose([...], bbox_params=A.BboxParams(format="yolo", label_fields=["labels"])) and A.Compose([...], keypoint_params=A.KeypointParams(format="xy")). Random choice: A.OneOf([A.MotionBlur(), A.GaussianBlur()], p=0.5). Deterministic reuse: A.ReplayCompose([...]). Claude Code generates Albumentations classification/detection/segmentation pipelines, dataset wrappers, and augmentation policy search.
CLAUDE.md for Albumentations
## Albumentations Stack
- Version: albumentations >= 1.4
- Import: import albumentations as A | from albumentations.pytorch import ToTensorV2
- Input: numpy uint8 HWC | output same unless ToTensorV2 → CHW float32 tensor
- Compose: A.Compose([transforms], bbox_params=A.BboxParams(...))
- Bbox formats: "pascal_voc" [x1,y1,x2,y2] | "yolo" [cx,cy,w,h] | "coco" [x,y,w,h]
- Probabilities: set p= on each transform | p=1.0 on Compose itself
- Replay: A.ReplayCompose records sampled params in result["replay"] for re-applying to other inputs later | image+mask passed together through plain Compose already share spatial params
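Minimal usage sketch (shapes and probabilities are illustrative):

    import albumentations as A
    import numpy as np
    tfm = A.Compose([A.HorizontalFlip(p=0.5), A.RandomBrightnessContrast(p=0.2)])
    out = tfm(image=np.zeros((256, 256, 3), dtype=np.uint8))  # keyword targets only
    aug = out["image"]  # HWC uint8, same as input, since no ToTensorV2 here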
Albumentations Augmentation Pipeline
# vision/albumentations_pipeline.py — image augmentation with Albumentations
from __future__ import annotations
from pathlib import Path
from typing import Any
import cv2
import numpy as np
import albumentations as A
from albumentations.pytorch import ToTensorV2
# ── 0. Classification augmentation pipelines ─────────────────────────────────
def classification_train_transform(
image_size: int = 224,
mean: tuple = (0.485, 0.456, 0.406),
std: tuple = (0.229, 0.224, 0.225),
aggressive: bool = False,
) -> A.Compose:
"""
Standard ImageNet-style augmentation for classification training.
aggressive=True adds more color/noise augmentations for self-supervised or
contrastive pretraining contexts.
"""
color_jitter = A.OneOf([
A.RandomBrightnessContrast(brightness_limit=0.3, contrast_limit=0.3, p=1.0),
A.ColorJitter(brightness=0.4, contrast=0.4, saturation=0.4, hue=0.1, p=1.0),
A.HueSaturationValue(hue_shift_limit=20, sat_shift_limit=30, val_shift_limit=20, p=1.0),
], p=0.8)
blur_noise = A.OneOf([
A.GaussianBlur(blur_limit=(3, 7), p=1.0),
A.MotionBlur(blur_limit=5, p=1.0),
A.GaussNoise(var_limit=(10.0, 50.0), p=1.0),
A.ISONoise(color_shift=(0.01, 0.05), intensity=(0.1, 0.5), p=1.0),
], p=0.3 if not aggressive else 0.5)
spatial = A.OneOf([
A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.2, rotate_limit=30,
border_mode=cv2.BORDER_REFLECT_101, p=1.0),
A.Affine(scale=(0.85, 1.15), rotate=(-20, 20), shear=(-8, 8), p=1.0),
], p=0.5)
transforms = [
A.RandomResizedCrop(height=image_size, width=image_size,
scale=(0.08, 1.0), ratio=(0.75, 1.33)),
A.HorizontalFlip(p=0.5),
spatial,
color_jitter,
blur_noise,
A.CLAHE(clip_limit=4.0, tile_grid_size=(8, 8), p=0.2),
A.ToGray(p=0.05),
A.CoarseDropout(max_holes=8, max_height=image_size // 8,
max_width=image_size // 8, min_holes=1, fill_value=0, p=0.3),
]
if aggressive:
transforms += [
A.GridDistortion(num_steps=5, distort_limit=0.2, p=0.2),
A.RandomShadow(p=0.2),
A.RandomFog(fog_coef_lower=0.1, fog_coef_upper=0.3, p=0.1),
]
transforms += [
A.Normalize(mean=mean, std=std),
ToTensorV2(),
]
return A.Compose(transforms)
def classification_val_transform(
image_size: int = 224,
    crop_size: int | None = None,
mean: tuple = (0.485, 0.456, 0.406),
std: tuple = (0.229, 0.224, 0.225),
) -> A.Compose:
"""Deterministic center-crop pipeline for validation and inference."""
crop = crop_size or image_size
return A.Compose([
        A.SmallestMaxSize(max_size=int(image_size * 256 / 224)),  # shortest side -> 256 for a 224 crop, like torchvision Resize(256)
A.CenterCrop(height=crop, width=crop),
A.Normalize(mean=mean, std=std),
ToTensorV2(),
])
# ── 1. Object detection augmentation ─────────────────────────────────────────
def detection_train_transform(
image_size: int = 640,
bbox_format: str = "pascal_voc", # "pascal_voc" | "yolo" | "coco"
) -> A.Compose:
"""
Augmentation for object detection (bounding boxes preserved).
Compatible with YOLO, Faster-RCNN, and similar detectors.
min_area / min_visibility prune boxes that become too small after crop.
"""
return A.Compose(
[
A.RandomResizedCrop(height=image_size, width=image_size,
scale=(0.5, 1.0), ratio=(0.75, 1.33)),
A.HorizontalFlip(p=0.5),
A.ShiftScaleRotate(shift_limit=0.1, scale_limit=0.2,
rotate_limit=10, border_mode=cv2.BORDER_CONSTANT,
value=114, p=0.5),
A.OneOf([
A.Perspective(scale=(0.05, 0.1), p=1.0),
A.Affine(scale=(0.9, 1.1), shear=(-5, 5), p=1.0),
], p=0.3),
A.RandomBrightnessContrast(brightness_limit=0.3, contrast_limit=0.3, p=0.5),
A.HueSaturationValue(p=0.3),
A.GaussNoise(var_limit=(10, 40), p=0.2),
A.GaussianBlur(blur_limit=3, p=0.2),
A.CoarseDropout(max_holes=4, max_height=64, max_width=64, fill_value=114, p=0.2),
            A.Normalize(mean=(0.0, 0.0, 0.0), std=(1.0 / 255, 1.0 / 255, 1.0 / 255)),  # divides by std * max_pixel_value = 1.0, i.e. keeps raw 0-255 float32 values
ToTensorV2(),
],
bbox_params=A.BboxParams(
format=bbox_format,
label_fields=["labels"],
min_area=100, # drop boxes smaller than 100 px²
min_visibility=0.2, # drop boxes where < 20% visible after crop
),
)
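# Keypoints follow the same pattern as boxes; a minimal sketch (the transform
# mix here is illustrative, KeypointParams itself is the standard API):
def keypoint_train_transform(image_size: int = 256) -> A.Compose:
    """Crop/flip pipeline that keeps (x, y) keypoints aligned with the image."""
    return A.Compose(
        [
            A.RandomResizedCrop(height=image_size, width=image_size, scale=(0.8, 1.0)),
            A.HorizontalFlip(p=0.5),
            A.RandomBrightnessContrast(p=0.3),
        ],
        # remove_invisible drops keypoints pushed outside the crop
        keypoint_params=A.KeypointParams(format="xy", remove_invisible=True),
    )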
# ── 2. Segmentation augmentation ──────────────────────────────────────────────
def segmentation_train_transform(
image_size: int = 512,
n_classes: int = 2,
) -> A.ReplayCompose:
"""
Augmentation for semantic segmentation.
Uses ReplayCompose so the same spatial transform applies to image + mask.
"""
return A.ReplayCompose([
A.RandomResizedCrop(height=image_size, width=image_size, scale=(0.5, 1.0)),
A.HorizontalFlip(p=0.5),
A.VerticalFlip(p=0.2),
A.RandomRotate90(p=0.5),
A.OneOf([
            A.ElasticTransform(alpha=120, sigma=120 * 0.05, p=1.0),  # alpha_affine is deprecated/removed in newer Albumentations
A.GridDistortion(num_steps=5, distort_limit=0.3, p=1.0),
A.OpticalDistortion(distort_limit=0.2, shift_limit=0.2, p=1.0),
], p=0.3),
A.OneOf([
A.RandomBrightnessContrast(p=1.0),
A.HueSaturationValue(p=1.0),
A.CLAHE(p=1.0),
], p=0.5),
A.GaussNoise(p=0.2),
A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
ToTensorV2(),
])
def apply_segmentation_transform(
transform: A.ReplayCompose,
image: np.ndarray,
mask: np.ndarray,
) -> tuple[Any, Any]:
"""
Apply a ReplayCompose to both image and mask using the same random state.
mask is HW (no channel dim) with integer class indices.
"""
result = transform(image=image, mask=mask)
    aug_img = result["image"]   # CHW float32 tensor (Normalize + ToTensorV2)
    aug_mask = result["mask"]   # HW tensor; ToTensorV2 keeps the input dtype (uint8 here)
return aug_img, aug_mask
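# What ReplayCompose adds over Compose is re-applying the recorded parameters
# in a *separate* call later. A minimal sketch (the function name is illustrative):
def replay_on_second_image(
    transform: A.ReplayCompose,
    image_a: np.ndarray,
    image_b: np.ndarray,
) -> tuple[Any, Any]:
    """Augment image_a, then re-apply the identical transform to image_b."""
    first = transform(image=image_a)
    second = A.ReplayCompose.replay(first["replay"], image=image_b)
    return first["image"], second["image"]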
# ── 3. Medical imaging augmentation ──────────────────────────────────────────
def medical_train_transform(
image_size: int = 512,
grayscale: bool = True,
) -> A.Compose:
"""
Conservative augmentation for medical images (CT, MRI, X-Ray).
No color jitter — preserves Hounsfield scale or intensity semantics.
"""
transforms_list = [
A.RandomCrop(height=image_size, width=image_size),
A.HorizontalFlip(p=0.5),
A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.1,
rotate_limit=15, border_mode=cv2.BORDER_CONSTANT, p=0.5),
A.OneOf([
            A.ElasticTransform(alpha=80, sigma=80 * 0.05, p=1.0),  # alpha_affine is deprecated/removed in newer Albumentations
A.GridDistortion(distort_limit=0.1, p=1.0),
], p=0.3),
A.RandomBrightnessContrast(brightness_limit=0.15, contrast_limit=0.15, p=0.4),
A.GaussNoise(var_limit=(5.0, 20.0), p=0.3),
A.GaussianBlur(blur_limit=3, p=0.2),
A.CoarseDropout(max_holes=4, max_height=32, max_width=32,
fill_value=0, p=0.2),
        # No Normalize for grayscale medical: caller handles window/level manually
    ]
    if not grayscale:
        transforms_list.append(A.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)))  # 3-channel stats for color inputs
transforms_list.append(ToTensorV2())
return A.Compose(transforms_list)
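# The grayscale branch leaves intensity scaling to the caller. A window/level
# sketch for CT (hypothetical helper, not a library API; W=400 / L=40 HU is a
# common soft-tissue window):
def window_ct(hu: np.ndarray, level: float = 40.0, width: float = 400.0) -> np.ndarray:
    """Clip HU values to [level - width/2, level + width/2], then map to uint8."""
    lo, hi = level - width / 2, level + width / 2
    windowed = np.clip(hu.astype(np.float32), lo, hi)
    return ((windowed - lo) / (hi - lo) * 255.0).astype(np.uint8)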
# ── 4. PyTorch Dataset integration ───────────────────────────────────────────
class AugmentedImageDataset:
"""
Example PyTorch-compatible Dataset that applies Albumentations transforms.
Works with any PyTorch DataLoader.
"""
def __init__(
self,
image_paths: list[str],
labels: list[int],
transform: A.Compose | None = None,
):
self.image_paths = image_paths
self.labels = labels
self.transform = transform
def __len__(self) -> int:
return len(self.image_paths)
def __getitem__(self, idx: int) -> tuple[Any, int]:
        # cv2 reads BGR; convert to the RGB uint8 HWC layout Albumentations expects
        bgr = cv2.imread(self.image_paths[idx])
        if bgr is None:
            raise FileNotFoundError(f"cv2.imread failed for {self.image_paths[idx]}")
        image = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
if self.transform:
image = self.transform(image=image)["image"]
return image, self.labels[idx]
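# A hypothetical DataLoader factory (the name and loader settings are
# illustrative, not part of Albumentations; batching works because the
# fixed-size crop plus ToTensorV2 yields equal-sized CHW tensors):
def make_train_loader(image_paths: list[str], labels: list[int], batch_size: int = 32):
    from torch.utils.data import DataLoader  # albumentations.pytorch already requires torch
    ds = AugmentedImageDataset(
        image_paths, labels, transform=classification_train_transform(image_size=224)
    )
    return DataLoader(ds, batch_size=batch_size, shuffle=True,
                      num_workers=4, pin_memory=True, drop_last=True)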
# ── 5. Augmentation visualization helper ─────────────────────────────────────
def visualize_augmentations(
image_path: str,
transform: A.Compose,
n_samples: int = 8,
    save_path: str | None = None,
) -> np.ndarray:
"""
Apply a transform N times and tile the results.
    Returns a grid of augmented images as a uint8 numpy array.
    Assumes the transform yields fixed-size output (e.g. contains a crop)
    and n_samples >= 2.
    """
    bgr = cv2.imread(image_path)
    if bgr is None:
        raise FileNotFoundError(f"cv2.imread failed for {image_path}")
    image = cv2.cvtColor(bgr, cv2.COLOR_BGR2RGB)
    # Strip Normalize and ToTensorV2: keep outputs uint8 RGB so they can be tiled
    viz_transform = A.Compose([
        t for t in transform.transforms
        if not isinstance(t, (A.Normalize, ToTensorV2))
    ])
    samples = [viz_transform(image=image.copy())["image"] for _ in range(n_samples - 1)]
    # First tile is the raw image, resized to match the augmented tile size
    tile_h, tile_w = samples[0].shape[:2]
    augmented = [cv2.resize(image, (tile_w, tile_h))] + samples
    # Tile images: 2 rows
    cols = (n_samples + 1) // 2
    rows_list = []
    for i in range(0, n_samples, cols):
        row_imgs = augmented[i:i + cols]
        while len(row_imgs) < cols:
            row_imgs.append(np.zeros_like(augmented[0]))
        rows_list.append(np.concatenate(row_imgs, axis=1))
    grid = np.concatenate(rows_list, axis=0)
if save_path:
cv2.imwrite(save_path, cv2.cvtColor(grid, cv2.COLOR_RGB2BGR))
print(f"Saved augmentation grid to {save_path}")
return grid
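# Example call (the paths are placeholders):
# grid = visualize_augmentations("sample.jpg", classification_train_transform(224),
#                                n_samples=8, save_path="aug_grid.jpg")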
# ── Demo ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
print("Albumentations Image Augmentation Demo")
print("=" * 50)
# Synthetic image
image = np.random.randint(0, 255, (480, 640, 3), dtype=np.uint8)
# Classification pipeline
train_tfm = classification_train_transform(image_size=224)
result = train_tfm(image=image)
print(f"\nClassification aug output: {result['image'].shape} {result['image'].dtype}")
val_tfm = classification_val_transform(image_size=224)
val_result = val_tfm(image=image)
print(f"Val output: {val_result['image'].shape}")
# Detection pipeline
bboxes = [[100, 50, 300, 250], [400, 100, 580, 400]] # pascal_voc
labels = [0, 1]
det_tfm = detection_train_transform(image_size=640, bbox_format="pascal_voc")
det_result = det_tfm(image=image, bboxes=bboxes, labels=labels)
print(f"\nDetection aug: {len(det_result['bboxes'])} boxes remaining (was {len(bboxes)})")
# Segmentation pipeline with mask
mask = np.zeros((480, 640), dtype=np.uint8)
mask[100:300, 200:400] = 1 # rectangle of class 1
seg_tfm = segmentation_train_transform(image_size=256)
aug_img, aug_mask = apply_segmentation_transform(seg_tfm, image, mask)
print(f"\nSegmentation aug: image={aug_img.shape}, mask={aug_mask.shape}")
print("\nA.Compose([...]) — all transforms applied to image + labels atomically.")
print("Use bbox_params=A.BboxParams() for object detection coordinate transforms.")
    print("Use ReplayCompose to re-apply a recorded transform to extra inputs later.")
For the torchvision transforms alternative: torchvision transforms operate on PIL Images and run their augmentations in Python, while Albumentations uses OpenCV as its backend, making crop/flip/rotate 10–50× faster on large images. Albumentations also accepts arbitrary numpy arrays (float32 medical volumes, 16-bit satellite imagery) without PIL conversion, and BboxParams with min_area / min_visibility automatically prunes boxes that become invalid after a crop, work that torchvision leaves to manual post-processing.

For the imgaug alternative: imgaug has not been actively maintained since 2020, while Albumentations offers native A.Compose support for keypoint and bbox coordinate transforms, ReplayCompose for deterministic replay across image/mask/bbox triplets, and A.OneOf with weighted probability selection, all dispatching to OpenCV's optimized C++ routines rather than Python loops, which is how the full ImageNet training augmentation pipeline runs in under 2 ms per image on a single CPU core.

The Claude Skills 360 bundle includes Albumentations skill sets covering classification train/val pipelines, RandomResizedCrop and ShiftScaleRotate, color jitter and CLAHE, CoarseDropout, Normalize and ToTensorV2, BboxParams for detection, ReplayCompose for segmentation, conservative medical imaging augmentation, and PyTorch Dataset integration. Start with the free tier to try image augmentation code generation.
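To make the interface difference concrete, here is the same resize-crop-flip-normalize pipeline in both libraries (a sketch; the statistics are the usual ImageNet values):

    # torchvision: PIL image in, CHW float tensor out
    from torchvision import transforms as T
    tv = T.Compose([
        T.RandomResizedCrop(224), T.RandomHorizontalFlip(), T.ToTensor(),
        T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
    ])
    # Albumentations: numpy HWC uint8 in, CHW float tensor out, keyword targets
    import albumentations as A
    from albumentations.pytorch import ToTensorV2
    alb = A.Compose([
        A.RandomResizedCrop(height=224, width=224), A.HorizontalFlip(p=0.5),
        A.Normalize(mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)),
        ToTensorV2(),
    ])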