LLaVA enables visual question answering and image understanding with LLMs. pip install transformers accelerate. from transformers import LlavaNextProcessor, LlavaNextForConditionalGeneration. Load: processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf"), model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch.float16, device_map="auto"). Chat template: conversation = [{"role":"user","content":[{"type":"image","image":"path.jpg"},{"type":"text","text":"Describe this image."}]}]. prompt = processor.apply_chat_template(conversation, add_generation_prompt=True). Process: inputs = processor(images=PIL_image, text=prompt, return_tensors="pt").to(model.device). Generate: output = model.generate(**inputs, max_new_tokens=256). Decode: processor.decode(output[0], skip_special_tokens=True). Models: "llava-hf/llava-1.5-7b-hf" (7B fast), "llava-hf/llava-v1.6-mistral-7b-hf" (7B best quality), "llava-hf/llava-v1.6-34b-hf" (34B), "llava-hf/LLaVA-NeXT-Video-7B-hf" (video). Pipeline: from transformers import pipeline, pipe = pipeline("image-to-text", model="llava-hf/llava-1.5-7b-hf"), result = pipe(image, prompt="USER: <image>\nDescribe this.\nASSISTANT:"). Moondream: from transformers import AutoModelForCausalLM, AutoTokenizer, model = AutoModelForCausalLM.from_pretrained("vikhyatk/moondream2") — tiny 2B model, fast on CPU. model.answer_question(image, "What is in this image?", tokenizer). Claude Code generates LLaVA image QA pipelines, chart analyzers, document OCR workflows, and multi-turn visual chat applications.
CLAUDE.md for LLaVA
## LLaVA Stack
- Version: transformers >= 4.45
- Models: llava-hf/llava-1.5-7b-hf | llava-v1.6-mistral-7b-hf | LLaVA-NeXT-Video-7B-hf
- Processor: LlavaNextProcessor.from_pretrained(model_id)
- Model: LlavaNextForConditionalGeneration.from_pretrained(id, torch_dtype=float16, device_map="auto")
- Prompt: processor.apply_chat_template([{role, content:[{type:image},{type:text}]}])
- Input: processor(images=PIL_image, text=prompt, return_tensors="pt")
- Generate: model.generate(**inputs, max_new_tokens=512)
- Lightweight: vikhyatk/moondream2 — 2B, fast CPU inference
- Pipeline: pipeline("image-to-text", model=...) for simple use cases
LLaVA Visual QA Pipeline
# vision/llava_pipeline.py — visual instruction following with LLaVA
from __future__ import annotations
import os
from pathlib import Path
from typing import Optional
import torch
from PIL import Image
# ── 1. Model loading ──────────────────────────────────────────────────────────
def load_llava(
    model_id: str = "llava-hf/llava-v1.6-mistral-7b-hf",
    device_map: str = "auto",
    dtype: str = "float16",
    load_in_4bit: bool = False,
) -> tuple:
    """
    Load a LLaVA-Next model together with its processor.

    Model choices:
    - llava-hf/llava-1.5-7b-hf — 7B, LLaMA2 backbone, fastest
    - llava-hf/llava-v1.6-mistral-7b-hf — 7B, Mistral backbone, best quality
    - llava-hf/llava-v1.6-34b-hf — 34B, highest quality (needs 2+ GPUs)
    - llava-hf/LLaVA-NeXT-Video-7B-hf — 7B with video support

    Args:
        model_id: Hugging Face model repo id.
        device_map: passed through to from_pretrained (e.g. "auto").
        dtype: torch dtype name, resolved via getattr(torch, dtype).
        load_in_4bit: enable bitsandbytes 4-bit quantization.

    Returns:
        (model, processor) tuple; the model is in eval mode.
    """
    from transformers import (
        BitsAndBytesConfig,
        LlavaNextForConditionalGeneration,
        LlavaNextProcessor,
    )

    processor = LlavaNextProcessor.from_pretrained(model_id)
    kwargs = {
        "device_map": device_map,
        "torch_dtype": getattr(torch, dtype),
        "low_cpu_mem_usage": True,
    }
    if load_in_4bit:
        # 4-bit weights with fp16 compute roughly halves VRAM vs fp16 weights.
        kwargs["quantization_config"] = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
        )
    model = LlavaNextForConditionalGeneration.from_pretrained(model_id, **kwargs)
    model.eval()
    print(f"LLaVA loaded: {model_id} ({dtype})")
    return model, processor
def load_moondream(device: str = "cpu") -> tuple:
    """
    Load Moondream2 — a ~2B-parameter VLM that is fast on CPU.

    Ideal for edge inference and embedded systems. The checkpoint is
    pinned to a fixed revision for reproducibility; fp32 is used on CPU
    and fp16 everywhere else.

    Returns:
        (model, tokenizer) tuple; the model is on *device*, in eval mode.
    """
    from transformers import AutoModelForCausalLM, AutoTokenizer

    model_id = "vikhyatk/moondream2"
    revision = "2024-07-23"
    dtype = torch.float32 if device == "cpu" else torch.float16
    tokenizer = AutoTokenizer.from_pretrained(model_id, revision=revision)
    # trust_remote_code is required: Moondream ships custom modeling code.
    model = AutoModelForCausalLM.from_pretrained(
        model_id,
        revision=revision,
        trust_remote_code=True,
        torch_dtype=dtype,
    )
    model = model.to(device).eval()
    print(f"Moondream2 loaded on {device}")
    return model, tokenizer
# ── 2. Image question answering ───────────────────────────────────────────────
def ask_llava(
    model,
    processor,
    image: str | Image.Image,
    question: str,
    max_tokens: int = 256,
    temperature: float = 0.2,
    do_sample: bool = False,
) -> str:
    """
    Ask a single question about a single image.

    *image* may be a file path or a PIL image (paths are opened and
    converted to RGB). With do_sample=False (default) decoding is greedy
    and *temperature* is ignored (1.0, the HF default, is passed instead).

    Returns:
        The model's text answer, with prompt tokens stripped.
    """
    if isinstance(image, str):
        image = Image.open(image).convert("RGB")
    # Single user turn: image placeholder followed by the question text.
    messages = [{
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": question},
        ],
    }]
    prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
    inputs = processor(
        images=image,
        text=prompt,
        return_tensors="pt",
    ).to(model.device)
    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_tokens,
            temperature=temperature if do_sample else 1.0,
            do_sample=do_sample,
            pad_token_id=processor.tokenizer.eos_token_id,
        )
    # Decode only the newly generated tokens, not the echoed prompt.
    prompt_len = inputs["input_ids"].shape[1]
    return processor.decode(output[0][prompt_len:], skip_special_tokens=True).strip()
def ask_moondream(
    model,
    tokenizer,
    image: str | Image.Image,
    question: str,
) -> str:
    """Answer *question* about *image* using Moondream2.

    Accepts a file path or an already-loaded PIL image; paths are opened
    and converted to RGB before encoding.
    """
    if isinstance(image, str):
        image = Image.open(image).convert("RGB")
    encoded = model.encode_image(image)
    return model.answer_question(encoded, question, tokenizer)
# ── 3. Multi-turn visual conversation ────────────────────────────────────────
class VisualChatSession:
    """Stateful multi-turn chat grounded in a single image.

    The image placeholder is attached to the first user turn only; later
    turns are text-only. The full history is re-run through the chat
    template on every call, and the pixel values are re-supplied each
    time so the model always sees the image context.
    """

    def __init__(self, model, processor, image: str | Image.Image):
        self.model = model
        self.processor = processor
        self.history = []  # alternating user/assistant turns, chat-template format
        self.image = (
            Image.open(image).convert("RGB") if isinstance(image, str) else image
        )

    def chat(self, message: str, max_tokens: int = 512) -> str:
        """Send a message and return the assistant's reply."""
        content = [{"type": "text", "text": message}]
        if not self.history:
            # First turn carries the image placeholder.
            content.insert(0, {"type": "image"})
        self.history.append({"role": "user", "content": content})
        prompt = self.processor.apply_chat_template(
            self.history, add_generation_prompt=True
        )
        inputs = self.processor(
            images=self.image,
            text=prompt,
            return_tensors="pt",
        ).to(self.model.device)
        with torch.no_grad():
            output = self.model.generate(
                **inputs,
                max_new_tokens=max_tokens,
                do_sample=False,
                pad_token_id=self.processor.tokenizer.eos_token_id,
            )
        # Keep only the newly generated tokens.
        new_tokens = output[0][inputs["input_ids"].shape[1]:]
        response = self.processor.decode(new_tokens, skip_special_tokens=True).strip()
        self.history.append(
            {"role": "assistant", "content": [{"type": "text", "text": response}]}
        )
        return response

    def reset(self):
        """Drop all turns; the image is kept for the next conversation."""
        self.history = []
# ── 4. Specialized tasks ──────────────────────────────────────────────────────
def describe_image(model, processor, image_path: str, detail: str = "detailed") -> str:
    """Generate an image description at the requested level of detail.

    detail: "brief", "detailed" (default), or "list"; any unknown value
    falls back to the detailed prompt.
    """
    prompt_by_detail = {
        "brief": "Describe this image in one or two sentences.",
        "detailed": "Provide a detailed description of everything you can see in this image.",
        "list": "List all the objects, people, and elements you can identify in this image.",
    }
    prompt = prompt_by_detail.get(detail, prompt_by_detail["detailed"])
    return ask_llava(model, processor, image_path, prompt)
def read_text_in_image(model, processor, image_path: str) -> str:
    """Extract text from an image (OCR-like capability via visual QA)."""
    ocr_prompt = (
        "Please read and transcribe all the text visible in this image. "
        "Preserve the layout as much as possible."
    )
    # Longer budget than the default: transcriptions can be lengthy.
    return ask_llava(model, processor, image_path, ocr_prompt, max_tokens=512)
def analyze_chart(model, processor, image_path: str) -> str:
    """Extract data and insights from charts and graphs."""
    chart_prompt = (
        "This is a chart or graph. Please: "
        "1. Identify the chart type. "
        "2. Describe what data it shows. "
        "3. List the key values and trends. "
        "4. Provide a one-sentence summary of the main insight."
    )
    return ask_llava(model, processor, image_path, chart_prompt, max_tokens=512)
def check_image_content(
    model,
    processor,
    image_path: str,
    criteria: list[str],
) -> dict[str, bool]:
    """
    Check whether specific elements are present in an image.

    Asks the model to answer yes/no (one line per criterion) and parses
    the response line by line, pairing answer lines with criteria in order.

    Args:
        model, processor: loaded LLaVA pair (see load_llava).
        image_path: path to the image file.
        criteria: natural-language statements to verify.

    Returns:
        {criterion: True/False}; criteria with no matching answer line
        default to False.
    """
    import re  # local import: only needed for answer parsing

    criteria_str = "\n".join(f"- {c}" for c in criteria)
    question = (
        "For each of the following criteria, answer only 'yes' or 'no' on a new line:\n"
        f"{criteria_str}\n\nOne answer per line:"
    )
    response = ask_llava(model, processor, image_path, question, max_tokens=100)
    lines = [l.strip().lower() for l in response.strip().split("\n") if l.strip()]
    # Match "yes" as a whole word: the previous substring test
    # ('"yes" in line') counted words that merely contain "yes"
    # (e.g. "eyes") as affirmative answers.
    yes_word = re.compile(r"\byes\b")
    results = {}
    for i, criterion in enumerate(criteria):
        if i < len(lines):
            results[criterion] = bool(yes_word.search(lines[i]))
        else:
            # Model produced fewer answer lines than criteria.
            results[criterion] = False
    return results
def batch_analyze(
    model,
    processor,
    image_paths: list[str],
    question: str,
    max_tokens: int = 256,
) -> list[str]:
    """Run the same question over every image, in order.

    Prints per-image progress as it goes; returns answers aligned with
    *image_paths*.
    """
    answers: list[str] = []
    total = len(image_paths)
    for idx, path in enumerate(image_paths, start=1):
        print(f"[{idx}/{total}] {Path(path).name}...")
        answers.append(
            ask_llava(model, processor, path, question, max_tokens=max_tokens)
        )
    return answers
# ── 5. Streaming generation ───────────────────────────────────────────────────
def ask_llava_stream(
    model,
    processor,
    image: str | Image.Image,
    question: str,
    max_tokens: int = 512,
):
    """Yield LLaVA's answer incrementally as decoded text chunks.

    Generation runs on a daemon thread; this generator yields each chunk
    as the TextIteratorStreamer produces it, then waits for the worker
    to finish.
    """
    from threading import Thread

    from transformers import TextIteratorStreamer

    if isinstance(image, str):
        image = Image.open(image).convert("RGB")
    conversation = [{"role": "user", "content": [
        {"type": "image"}, {"type": "text", "text": question}
    ]}]
    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
    inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device)
    streamer = TextIteratorStreamer(
        processor.tokenizer,
        skip_prompt=True,       # do not re-emit the prompt text
        skip_special_tokens=True,
    )
    worker = Thread(
        target=model.generate,
        kwargs={**inputs, "max_new_tokens": max_tokens, "streamer": streamer},
        daemon=True,
    )
    worker.start()
    yield from streamer
    worker.join()
# ── Demo ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    print("LLaVA Visual QA Demo")
    print("="*50)
    # Use Moondream for quick demo (2B, works on CPU).
    model, tokenizer = load_moondream(device="cpu")
    # Create a synthetic test image: light-gray canvas with optional text.
    import numpy as np
    img_array = np.ones((256, 256, 3), dtype=np.uint8) * 200
    # Fix: removed dead "cv2 = None" assignment — the name was immediately
    # rebound by the import on success and never read on ImportError.
    try:
        import cv2
        cv2.putText(img_array, "Hello World", (40, 130),
                    cv2.FONT_HERSHEY_SIMPLEX, 1.5, (0, 0, 0), 3)
    except ImportError:
        # OpenCV is optional; without it the demo image is a blank canvas.
        pass
    image = Image.fromarray(img_array)
    # Ask a few representative questions.
    questions = [
        "What do you see in this image?",
        "Is there any text in this image?",
        "Describe the colors present.",
    ]
    for q in questions:
        answer = ask_moondream(model, tokenizer, image, q)
        print(f"\nQ: {q}\nA: {answer}")
    print("\n" + "="*50)
    print("For full LLaVA-Next (7B+), load with load_llava().")
    print("Requires ~14GB VRAM for float16 or ~8GB with 4-bit quantization.")
For the GPT-4 Vision / Claude API alternative when needing production visual reasoning, complex document understanding, or multilingual scene analysis without hosting your own GPU server — cloud vision APIs handle infrastructure while LLaVA runs fully offline on your own hardware, enabling HIPAA-compliant medical image analysis, sensitive document processing, and industrial inspection applications where images cannot leave on-premises infrastructure. For the InternVL2 alternative when needing higher benchmark performance on document understanding, multi-image reasoning, and chart analysis than LLaVA at the same parameter count — InternVL2 achieves better OCR and reasoning scores, while LLaVA's larger open-source community, wider fine-tuning support via LLaVA-NeXT, and established training recipes make it the more accessible starting point for building custom visual instruction-following models on domain-specific datasets. The Claude Skills 360 bundle includes LLaVA skill sets covering model loading, image QA, multi-turn visual chat, OCR text extraction, chart analysis, streaming generation, batch processing, and Moondream lightweight inference. Start with the free tier to try visual language model code generation.