OpenCLIP provides open-source CLIP models with LAION pretrained weights. pip install open_clip_torch. import open_clip. List pretrained: open_clip.list_pretrained() — shows model+dataset combos. Load: model, _, preprocess = open_clip.create_model_and_transforms("ViT-B-32", pretrained="laion2b_s34b_b79k"). Large: "ViT-L-14" with "laion2b_s32b_b82k". EVA: "EVA02-E-14-plus" with "laion2b_s9b_b144k" — highest accuracy. Tokenize: tokenizer = open_clip.get_tokenizer("ViT-B-32"), text_tokens = tokenizer(["a photo of a cat", "a photo of a dog"]). Image: image_tensor = preprocess(PIL_image).unsqueeze(0). Encode: with torch.no_grad(): image_features = model.encode_image(image_tensor), text_features = model.encode_text(text_tokens). Normalize: image_features /= image_features.norm(dim=-1, keepdim=True). Similarity: similarity = (100.0 * image_features @ text_features.T).softmax(dim=-1). Zero-shot class names: classes = ["cat", "dog", "bird"], prompts = [f"a photo of a {c}" for c in classes]. Forward: logits_per_image, logits_per_text = model(image, text). Batch encode: features = model.encode_image(batch_images) — (N, 512) for ViT-B-32. CoCa: model = open_clip.create_model("coca_ViT-L-14", pretrained="mscoco_finetuned_laion2b_s13b_b90k"). CoCa caption: generated = model.generate(image). Retrieval: compute pairwise image-text similarity matrix. Claude Code generates OpenCLIP zero-shot classifiers, image-text retrieval systems, embedding extractors, and CLIP fine-tuning scripts.
CLAUDE.md for OpenCLIP
## OpenCLIP Stack
- Version: open_clip_torch >= 2.24
- Load: create_model_and_transforms(model_name, pretrained=checkpoint_name)
- Models: "ViT-B-32" | "ViT-L-14" | "ViT-H-14" | "EVA02-E-14-plus"
- Tokenize: get_tokenizer(model_name)(list_of_strings) → (N, 77) tokens
- Encode: model.encode_image(images) | model.encode_text(tokens) → L2-normalize after
- Classify: (image_features @ text_features.T).softmax(-1) → class probabilities
- Batch: encode_image(N, 3, H, W) → (N, D) | encode_text(N, 77) → (N, D)
- CoCa: create_model("coca_ViT-L-14", ...) → model.generate(image) for captioning
- Pretrained: list_pretrained() shows all available model+dataset combos
OpenCLIP Vision-Language Pipeline
# vision/openclip_pipeline.py — vision-language embeddings with OpenCLIP
from __future__ import annotations
import os
from pathlib import Path
from typing import Optional
import torch
import torch.nn.functional as F
import numpy as np
from PIL import Image
import open_clip
# ── 1. Model loading ──────────────────────────────────────────────────────────
def load_clip_model(
    model_name: str = "ViT-B-32",
    pretrained: str = "laion2b_s34b_b79k",
    device: str = "cpu",
) -> tuple:
    """
    Create an OpenCLIP model together with its eval transform and tokenizer.

    Recommended combinations:
      - Fast:     ViT-B-32        + laion2b_s34b_b79k
      - Balanced: ViT-L-14        + laion2b_s32b_b82k
      - Best:     EVA02-E-14-plus + laion2b_s9b_b144k (requires more VRAM)

    Returns (model, preprocess, tokenizer); the model is moved to `device`
    and put in eval mode.
    """
    created = open_clip.create_model_and_transforms(model_name, pretrained=pretrained)
    model, _train_preprocess, preprocess = created
    tok = open_clip.get_tokenizer(model_name)
    model = model.to(device).eval()
    print(f"OpenCLIP {model_name} ({pretrained}) ready on {device}")
    return model, preprocess, tok
def list_available_models(filter_str: str | None = None) -> list[tuple]:
    """
    List available pretrained (model, checkpoint) combinations.

    Parameters:
        filter_str: optional case-insensitive substring matched against the
            model name; None (the default) returns every combination.

    Returns:
        List of (model_name, checkpoint_name) tuples from
        open_clip.list_pretrained().
    """
    all_models = open_clip.list_pretrained()
    if filter_str:
        # Lower-case the needle once instead of per list entry.
        needle = filter_str.lower()
        all_models = [(m, d) for m, d in all_models if needle in m.lower()]
    return all_models
# ── 2. Embedding extraction ───────────────────────────────────────────────────
@torch.no_grad()
def embed_images(
    model,
    preprocess,
    images: list,  # list of PIL.Image or file paths
    batch_size: int = 64,
    device: str = "cpu",
    normalize: bool = True,
) -> torch.Tensor:
    """
    Compute image embeddings in mini-batches.

    Strings/paths are opened as RGB via PIL; anything else is passed to
    `preprocess` as-is. Embeddings are L2-normalized when `normalize` is
    True. Returns an (N, embed_dim) tensor on the CPU.
    """
    chunks: list[torch.Tensor] = []
    for start in range(0, len(images), batch_size):
        loaded = [
            Image.open(item).convert("RGB") if isinstance(item, (str, Path)) else item
            for item in images[start : start + batch_size]
        ]
        batch_tensor = torch.stack([preprocess(x) for x in loaded]).to(device)
        embeds = model.encode_image(batch_tensor)
        if normalize:
            embeds = F.normalize(embeds, dim=-1)
        chunks.append(embeds.cpu())
    return torch.cat(chunks)
@torch.no_grad()
def embed_texts(
    model,
    tokenizer,
    texts: list[str],
    batch_size: int = 256,
    device: str = "cpu",
    normalize: bool = True,
) -> torch.Tensor:
    """
    Compute text embeddings in mini-batches.

    Each batch is tokenized, encoded, and (optionally) L2-normalized.
    Returns an (N, embed_dim) tensor on the CPU.
    """
    chunks: list[torch.Tensor] = []
    for start in range(0, len(texts), batch_size):
        token_batch = tokenizer(texts[start : start + batch_size]).to(device)
        embeds = model.encode_text(token_batch)
        if normalize:
            embeds = F.normalize(embeds, dim=-1)
        chunks.append(embeds.cpu())
    return torch.cat(chunks)
# ── 3. Zero-shot image classification ────────────────────────────────────────
# Prompt templates for zero-shot classification; "{}" is filled with a class
# name. build_class_embeddings averages embeddings over several templates
# ("prompt ensembling") to make class prototypes more robust than a single
# phrasing would be.
IMAGENET_TEMPLATES = [
    "a photo of a {}.",
    "a good photo of a {}.",
    "a bad photo of a {}.",
    "a close-up photo of a {}.",
    "a photo of many {}.",
    "an image of a {}.",
    "a {} in a scene.",
]
def build_class_embeddings(
model,
tokenizer,
class_names: list[str],
templates: list[str] = None,
device: str = "cpu",
ensemble: bool = True,
) -> torch.Tensor:
"""
Build class prototype embeddings using prompt templates.
ensemble=True averages over all templates for robustness.
Returns (n_classes, embed_dim).
"""
templates = templates or IMAGENET_TEMPLATES[:3]
with torch.no_grad():
class_embeds = []
for cls_name in class_names:
prompts = [t.format(cls_name) for t in templates]
tokens = tokenizer(prompts).to(device)
features = model.encode_text(tokens) # (n_templates, D)
features = F.normalize(features, dim=-1)
if ensemble:
class_embed = features.mean(dim=0)
class_embed = F.normalize(class_embed, dim=-1)
else:
class_embed = features[0]
class_embeds.append(class_embed)
return torch.stack(class_embeds).cpu() # (n_classes, D)
def zero_shot_classify(
    model,
    preprocess,
    tokenizer,
    images: list,
    class_names: list[str],
    templates: list[str] | None = None,
    device: str = "cpu",
    top_k: int = 5,
) -> list[list[tuple[str, float]]]:
    """
    Zero-shot image classification without any training.

    Builds class prototypes from prompt templates, embeds the images, and
    turns cosine similarities into class probabilities with a softmax
    scaled by 100.0 (CLIP's conventional logit scale).

    Returns, for each image, up to top_k (class_name, probability) pairs
    sorted by descending probability.
    """
    # Fix: annotation was `list[str] = None`; default must be Optional.
    class_embeds = build_class_embeddings(
        model, tokenizer, class_names, templates, device
    ).to(device)  # (C, D)
    image_embeds = embed_images(model, preprocess, images, device=device).to(device)  # (N, D)
    logits = 100.0 * image_embeds @ class_embeds.T  # (N, C)
    probabilities = logits.softmax(dim=-1).cpu()
    k = min(top_k, len(class_names))  # hoisted: invariant across images
    results = []
    for probs in probabilities:
        top_indices = torch.topk(probs, k=k).indices
        results.append(
            [(class_names[i], round(float(probs[i]), 4)) for i in top_indices]
        )
    return results
# ── 4. Image-text retrieval ───────────────────────────────────────────────────
def build_image_index(
    model,
    preprocess,
    image_paths: list[str],
    device: str = "cpu",
) -> tuple[torch.Tensor, list[str]]:
    """Embed a collection of images and return (embeddings, paths) as a searchable index."""
    embeddings = embed_images(model, preprocess, image_paths, device=device)
    print(f"Image index: {len(image_paths)} images, {embeddings.shape[1]}D embeddings")
    return embeddings, image_paths
def search_images_by_text(
    query_text: str,
    image_embeddings: torch.Tensor,  # (N, D) normalized
    image_paths: list[str],
    model,
    tokenizer,
    top_k: int = 10,
    device: str = "cpu",
) -> list[tuple[str, float]]:
    """Rank indexed images by cosine similarity to a text query; return top_k (path, score) pairs."""
    query_vec = embed_texts(model, tokenizer, [query_text], device=device)[0].to(device)
    scores = image_embeddings.to(device) @ query_vec  # (N,)
    k = min(top_k, len(image_paths))
    best = torch.topk(scores, k=k).indices.cpu()
    return [(image_paths[i], round(float(scores[i]), 4)) for i in best]
def search_texts_by_image(
    query_image_path: str,
    text_embeddings: torch.Tensor,  # (N, D) normalized
    texts: list[str],
    model,
    preprocess,
    top_k: int = 5,
    device: str = "cpu",
) -> list[tuple[str, float]]:
    """Rank candidate texts by cosine similarity to an image query; return top_k (text, score) pairs."""
    query_vec = embed_images(model, preprocess, [query_image_path], device=device)[0].to(device)
    scores = text_embeddings.to(device) @ query_vec  # (N,)
    k = min(top_k, len(texts))
    best = torch.topk(scores, k=k).indices.cpu()
    return [(texts[i], round(float(scores[i]), 4)) for i in best]
# ── 5. Image similarity ───────────────────────────────────────────────────────
def find_similar_images_by_image(
    query_path: str,
    gallery_embeddings: torch.Tensor,
    gallery_paths: list[str],
    model,
    preprocess,
    top_k: int = 10,
    device: str = "cpu",
) -> list[tuple[str, float]]:
    """
    Find gallery images visually similar to a query image.

    The query image is embedded once, similarities against the gallery
    embeddings are computed in a single matrix-vector product, and the
    top_k matches are returned as (path, similarity) pairs with scores
    rounded to 4 decimals.

    Fixes over the previous version: removed the dead `if False` branches
    and the bogus `search_images_by_text.__wrapped__` reference, and the
    similarity product / torch.topk are now computed once instead of twice.
    """
    query_embed = embed_images(model, preprocess, [query_path], device=device)[0].to(device)
    similarities = gallery_embeddings.to(device) @ query_embed  # (N,)
    top = torch.topk(similarities, k=min(top_k, len(gallery_paths)))
    return [
        (gallery_paths[i], round(float(s), 4))
        for i, s in zip(top.indices.cpu().tolist(), top.values.cpu().tolist())
    ]
@torch.no_grad()
def compute_pairwise_similarity(
    embeddings_a: torch.Tensor,  # (N, D)
    embeddings_b: torch.Tensor,  # (M, D)
) -> torch.Tensor:
    """Return the (N, M) matrix of dot products between two embedding sets (cosine similarity when both are L2-normalized)."""
    return torch.matmul(embeddings_a, embeddings_b.transpose(0, 1))
# ── 6. CoCa image captioning ──────────────────────────────────────────────────
def load_coca_model(device: str = "cpu") -> tuple:
    """Load CoCa (Contrastive Captioners) for image captioning; returns (model, preprocess, tokenizer)."""
    created = open_clip.create_model_and_transforms(
        "coca_ViT-L-14",
        pretrained="mscoco_finetuned_laion2b_s13b_b90k",
    )
    model, _, preprocess = created
    model = model.to(device).eval()
    # NOTE(review): tokenizer is fetched for "ViT-L-14" rather than
    # "coca_ViT-L-14" — presumably they share a tokenizer; confirm.
    tokenizer = open_clip.get_tokenizer("ViT-L-14")
    print("CoCa ViT-L-14 ready for image captioning")
    return model, preprocess, tokenizer
@torch.no_grad()
def caption_image(
    model,
    preprocess,
    image_path: str,
    top_k: int = 1,
    max_seq_len: int = 30,
    device: str = "cpu",
) -> list[str]:
    """
    Generate captions for an image using CoCa.

    Parameters:
        model: a CoCa model (see load_coca_model) exposing .generate().
        preprocess: image transform returned alongside the model.
        image_path: path to the image file to caption.
        top_k: maximum number of captions to return. Previously accepted
            but ignored; now applied as a slice over the generated
            sequences (identical behavior at the default of 1).
        max_seq_len: passed to model.generate as seq_len.

    Returns decoded caption strings with the <start_of_text>/<end_of_text>
    markers stripped.
    """
    image = preprocess(Image.open(image_path).convert("RGB")).unsqueeze(0).to(device)
    generated = model.generate(image, seq_len=max_seq_len)
    # Fix: removed a dead local (a tokenizer was created and never used —
    # open_clip.decode handles detokenization directly).
    captions = [
        open_clip.decode(g).split("<end_of_text>")[0].replace("<start_of_text>", "").strip()
        for g in generated
    ]
    return captions[:top_k]
# ── Demo ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model, preprocess, tokenizer = load_clip_model("ViT-B-32", device=device)

    # Synthetic RGB image so the demo runs without any files on disk.
    pixels = np.random.randint(0, 255, (224, 224, 3), dtype=np.uint8)
    dummy_image = Image.fromarray(pixels)

    # Zero-shot classification over a small animal label set.
    animal_classes = ["cat", "dog", "bird", "fish", "rabbit", "hamster"]
    results = zero_shot_classify(
        model,
        preprocess,
        tokenizer,
        images=[dummy_image],
        class_names=animal_classes,
        device=device,
        top_k=3,
    )
    print("Zero-shot predictions:")
    for cls_name, prob in results[0]:
        print(f" {cls_name:<12}: {prob:.3f}")

    # Text embeddings and their pairwise similarity matrix.
    texts = ["a photo of a cat", "a dog running in a park", "a bird in the sky"]
    text_embeds = embed_texts(model, tokenizer, texts, device=device)
    print(f"\nText embeddings: {text_embeds.shape}")
    sim_matrix = compute_pairwise_similarity(text_embeds, text_embeds)
    print(f"Text-text similarity:\n{sim_matrix.numpy().round(3)}")

    # Show a few checkpoint options for the demo model.
    print("\nAvailable ViT-B-32 checkpoints:")
    for model_n, pretrained_n in list_available_models("ViT-B-32")[:5]:
        print(f" {model_n} + {pretrained_n}")
For the OpenAI CLIP alternative when wanting the original OpenAI paper–matching checkpoints and relying on OpenAI’s official implementation — the original CLIP provides reliable baselines while OpenCLIP’s LAION-trained variants (ViT-L-14 on LAION-2B, EVA02-E-14 on LAION) achieve higher zero-shot accuracy than OpenAI’s public checkpoints on ImageNet and most retrieval benchmarks, and the open training data allows fine-tuning without commercial licensing restrictions. For the BLIP/BLIP-2 alternative when needing visual question answering, image captioning, and grounded understanding beyond classification and retrieval — BLIP-2 with frozen large language model produces richer visual answers while OpenCLIP’s dual-encoder architecture with joint embedding space is faster for large-scale retrieval (millions of images), similarity search, and zero-shot classification where contrastive embeddings are more efficient than decoder-based models. The Claude Skills 360 bundle includes OpenCLIP skill sets covering model loading, image and text embedding, zero-shot classification with template ensembling, image-text retrieval, image similarity search, CoCa captioning, and pairwise similarity matrices. Start with the free tier to try vision-language embedding code generation.