OpenVINO optimizes AI inference on Intel hardware. pip install openvino nncf optimum[openvino]. Load: import openvino as ov, core = ov.Core(). Convert PyTorch: import openvino.torch, ov_model = ov.convert_model(pt_model, example_input=torch.randn(1,3,224,224)). Convert ONNX: ov_model = core.read_model("model.onnx"). Compile for CPU: compiled = core.compile_model(ov_model, "CPU"). Infer: result = compiled(inputs), output = result[compiled.output(0)]. Devices: core.available_devices — CPU, GPU, NPU, AUTO. Performance hints: core.compile_model(model, "CPU", {"PERFORMANCE_HINT":"THROUGHPUT"}) for batch, "LATENCY" for realtime. Cache: core.set_property({"CACHE_DIR":"./ov_cache"}) avoids recompile. INT8 quantization: from nncf import quantize, quantized = quantize(ov_model, calibration_dataset). Async: infer_queue = ov.AsyncInferQueue(compiled, 4), infer_queue.set_callback(lambda request, userdata: ...), infer_queue.start_async(inputs), infer_queue.wait_all(). Optimum: from optimum.intel import OVModelForCausalLM, OVModelForSequenceClassification, model = OVModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B", export=True). GenAI: import openvino_genai, pipe = openvino_genai.LLMPipeline("./llama-ov", "CPU"), result = pipe.generate("What is OpenVINO?", max_new_tokens=256). Tokenizer: pipe.get_tokenizer(). Export OV model: ov.save_model(ov_model, "model.xml"). Claude Code generates OpenVINO conversion scripts, INT8 quantization pipelines, async inference queues, LLM GenAI pipelines, and optimum-intel optimization code.
CLAUDE.md for OpenVINO
## OpenVINO Stack
- Version: openvino >= 2024.4, nncf >= 2.13, optimum-intel >= 1.20
- Core: ov.Core().compile_model(model, device, config)
- Convert: ov.convert_model(pt_model, example_input=...) | core.read_model("model.onnx")
- Devices: "CPU" | "GPU" | "NPU" | "AUTO" | "MULTI:CPU,GPU"
- Hints: {"PERFORMANCE_HINT": "THROUGHPUT" | "LATENCY" | "CUMULATIVE_THROUGHPUT"}
- INT8: nncf.quantize(ov_model, calibration_dataset) → quantized model
- Async: ov.AsyncInferQueue(compiled, n_jobs) → start_async → wait_all
- LLM: openvino_genai.LLMPipeline(model_dir, "CPU") → pipe.generate(prompt, max_new_tokens)
- Optimum: OVModelForCausalLM.from_pretrained(hf_id, export=True) for HF models
OpenVINO Inference Pipeline
# inference/openvino_pipeline.py — Intel-optimized AI inference with OpenVINO
from __future__ import annotations
import os
import time
from pathlib import Path
from typing import Any, Callable
import numpy as np
import openvino as ov
# ── 1. Core setup ─────────────────────────────────────────────────────────────
def setup_core(cache_dir: str = "./ov_cache") -> ov.Core:
    """Initialize OpenVINO Core with model caching.

    Args:
        cache_dir: Directory OpenVINO uses to cache compiled models so later
            runs skip recompilation.

    Returns:
        A configured ``ov.Core`` with ``CACHE_DIR`` set; also prints the
        available inference devices.
    """
    core = ov.Core()
    # parents=True so a nested cache path (e.g. "./out/ov_cache") works;
    # the original mkdir(exist_ok=True) raised FileNotFoundError in that case.
    Path(cache_dir).mkdir(parents=True, exist_ok=True)
    core.set_property({"CACHE_DIR": cache_dir})
    print("Available devices:")
    for device in core.available_devices:
        props = core.get_property(device, "FULL_DEVICE_NAME")
        print(f" {device}: {props}")
    return core
# ── 2. Model conversion ───────────────────────────────────────────────────────
def convert_pytorch_model(
    pt_model,
    example_input,
    output_path: str = "model.xml",
    verbose: bool = False,
) -> ov.Model:
    """Convert a PyTorch model to OpenVINO IR format.

    Args:
        pt_model: The ``torch.nn.Module`` to convert.
        example_input: Example input tensor(s) used to trace the model.
        output_path: Destination for the IR; falsy value skips saving.
        verbose: If True, also print the converted model's input/output names.

    Returns:
        The in-memory ``ov.Model``.
    """
    ov_model = ov.convert_model(pt_model, example_input=example_input)
    if verbose:
        # `verbose` was previously accepted but ignored; it now surfaces
        # the converted graph's tensor names for debugging.
        print(f"Inputs: {[i.get_any_name() for i in ov_model.inputs]}")
        print(f"Outputs: {[o.get_any_name() for o in ov_model.outputs]}")
    if output_path:
        ov.save_model(ov_model, output_path)
        print(f"Saved: {output_path}")
    return ov_model
def convert_torchvision_classifier(
    model_name: str = "resnet50",
    output_path: str = "./resnet50.xml",
) -> ov.Model:
    """Convert a torchvision model to OpenVINO."""
    import torch
    import torchvision

    # Look up the model constructor by name and load pretrained weights.
    factory = getattr(torchvision.models, model_name)
    net = factory(weights="DEFAULT")
    net.eval()
    # Standard ImageNet-sized tracer input (N, C, H, W).
    tracer_input = torch.randn(1, 3, 224, 224)
    converted = ov.convert_model(net, example_input=tracer_input)
    ov.save_model(converted, output_path)
    print(f"Converted {model_name} → {output_path}")
    return converted
def load_onnx_model(onnx_path: str, core: ov.Core) -> ov.Model:
    """Load an ONNX model into OpenVINO and report its tensor names."""
    onnx_model = core.read_model(onnx_path)
    input_names = [port.get_any_name() for port in onnx_model.inputs]
    output_names = [port.get_any_name() for port in onnx_model.outputs]
    print(f"Loaded ONNX: {onnx_path}")
    print(f" Inputs: {input_names}")
    print(f" Outputs: {output_names}")
    return onnx_model
# ── 3. Compilation and device targeting ──────────────────────────────────────
def compile_for_throughput(
    core: ov.Core,
    model: ov.Model,
    device: str = "CPU",
) -> ov.CompiledModel:
    """
    Compile for maximum throughput — best for batch processing.
    AUTO device selects best available hardware automatically.
    """
    # THROUGHPUT hint lets the runtime pick stream/batch parallelism;
    # NUM_STREAMS=AUTO delegates stream count to the device plugin.
    throughput_config = {"PERFORMANCE_HINT": "THROUGHPUT", "NUM_STREAMS": "AUTO"}
    compiled_model = core.compile_model(model, device, throughput_config)
    print(f"Compiled for {device} (THROUGHPUT mode)")
    return compiled_model
def compile_for_latency(
    core: ov.Core,
    model: ov.Model,
    device: str = "CPU",
) -> ov.CompiledModel:
    """Compile for minimum latency — best for single-sample real-time inference."""
    # Fall back to 4 threads when cpu_count() is unavailable (returns None).
    thread_count = os.cpu_count() or 4
    latency_config = {
        "PERFORMANCE_HINT": "LATENCY",
        "INFERENCE_NUM_THREADS": str(thread_count),
    }
    return core.compile_model(model, device, latency_config)
def compile_multi_device(
    core: ov.Core,
    model: ov.Model,
) -> ov.CompiledModel:
    """Use AUTO plugin to automatically select CPU/GPU/NPU."""
    # CUMULATIVE_THROUGHPUT lets AUTO spread requests across all devices.
    auto_config = {"PERFORMANCE_HINT": "CUMULATIVE_THROUGHPUT"}
    return core.compile_model(model, "AUTO", auto_config)
# ── 4. Synchronous inference ──────────────────────────────────────────────────
def classify_image(
    compiled: ov.CompiledModel,
    image_array: np.ndarray,  # (H, W, C) uint8
    top_k: int = 5,
) -> list[tuple[int, float]]:
    """
    Run image classification inference.

    Preprocesses (resize to 224x224, scale to [0,1], ImageNet-normalize,
    HWC→NCHW with batch dim), runs the compiled model, softmaxes the logits.

    Returns list of (class_index, score) tuples, highest score first.
    """
    # Preprocess: resize, normalize, add batch dim
    import cv2
    img = cv2.resize(image_array, (224, 224)).astype(np.float32) / 255.0
    img = (img - np.array([0.485, 0.456, 0.406])) / np.array([0.229, 0.224, 0.225])
    img = np.transpose(img, (2, 0, 1))[np.newaxis]  # NCHW
    result = compiled({compiled.input(0): img})
    logits = result[compiled.output(0)][0]
    # Numerically stable softmax: subtracting the max prevents np.exp overflow
    # on large logits (the original exp(logits) could yield inf/nan scores).
    shifted = logits - np.max(logits)
    exp_logits = np.exp(shifted)
    scores = exp_logits / np.sum(exp_logits)
    top_ids = np.argsort(scores)[::-1][:top_k]
    return [(int(i), float(scores[i])) for i in top_ids]
def run_object_detection(
    compiled: ov.CompiledModel,
    image_array: np.ndarray,
    conf_thresh: float = 0.5,
) -> list[dict]:
    """Run object detection (SSD/YOLO-style output parsing).

    Args:
        compiled: Compiled detection model expecting a (1, 3, 640, 640) input.
        image_array: Source image as an (H, W, C) array.
        conf_thresh: Minimum confidence for a detection to be kept.

    Returns:
        List of detection dicts (parsing depends on model architecture).
    """
    # Fix: cv2 was never imported here — the `import cv2` in classify_image
    # is function-local, so this function raised NameError at cv2.resize.
    import cv2
    h, w = image_array.shape[:2]
    img = cv2.resize(image_array, (640, 640)).astype(np.float32) / 255.0
    img = np.transpose(img, (2, 0, 1))[np.newaxis]
    result = compiled({compiled.input(0): img})
    # Generic output — actual parsing depends on model architecture
    output = result[compiled.output(0)]
    detections = []
    # ... parse boxes, classes, scores from output ...
    return detections
def extract_embeddings(
    compiled: ov.CompiledModel,
    texts: list[str],
    tokenizer,
    max_length: int = 128,
) -> np.ndarray:
    """Extract L2-normalized sentence embeddings from a compiled encoder model.

    Args:
        compiled: Compiled transformer encoder whose first output is the
            last hidden state, shape (N, seq_len, hidden).
        texts: Sentences to embed.
        tokenizer: HF-style tokenizer supporting return_tensors="np".
        max_length: Padding/truncation length.

    Returns:
        (len(texts), hidden) array of unit-norm mean-pooled embeddings.
    """
    encoded = tokenizer(
        texts,
        padding="max_length",
        max_length=max_length,
        truncation=True,
        return_tensors="np",
    )
    inputs = {
        "input_ids": encoded["input_ids"].astype(np.int64),
        "attention_mask": encoded["attention_mask"].astype(np.int64),
    }
    if "token_type_ids" in encoded:
        inputs["token_type_ids"] = encoded["token_type_ids"].astype(np.int64)
    result = compiled(inputs)
    # Mean-pool last hidden state over non-padding tokens
    hidden = result[compiled.output(0)]  # (N, seq_len, hidden)
    mask = encoded["attention_mask"][..., np.newaxis]
    # Guard the token count: an all-zero attention mask previously caused a
    # divide-by-zero and produced NaN embeddings for that row.
    token_counts = np.maximum(mask.sum(axis=1), 1e-9)
    pooled = (hidden * mask).sum(axis=1) / token_counts
    norms = np.linalg.norm(pooled, axis=1, keepdims=True)
    return pooled / np.maximum(norms, 1e-9)
# ── 5. INT8 quantization with NNCF ───────────────────────────────────────────
def quantize_model_int8(
    ov_model: ov.Model,
    dataset: Any,  # Iterable of input dicts
    output_path: str = "model_int8.xml",
    subset_size: int = 300,
    original_path: str = "model.xml",
) -> ov.Model:
    """
    Post-training INT8 quantization with NNCF.

    Args:
        ov_model: FP32/FP16 OpenVINO model to quantize.
        dataset: yields dicts mapping input names to numpy arrays.
        output_path: Destination for the quantized IR.
        subset_size: Number of calibration samples NNCF uses.
        original_path: Path of the original IR, used only for the size
            report (previously hard-coded to "model.xml").

    Returns:
        The quantized ov.Model.
    """
    import nncf
    calibration_dataset = nncf.Dataset(dataset)
    quantized = nncf.quantize(
        ov_model,
        calibration_dataset,
        preset=nncf.QuantizationPreset.PERFORMANCE,
        subset_size=subset_size,
    )
    ov.save_model(quantized, output_path)
    print(f"INT8 model saved: {output_path}")

    def _ir_size(xml_path: str) -> int:
        # OpenVINO IR weights live in the companion .bin file; the .xml holds
        # only the graph, so the original .xml-only comparison understated
        # (or misreported) the real size reduction.
        total = 0
        for part in (Path(xml_path), Path(xml_path).with_suffix(".bin")):
            if part.exists():
                total += part.stat().st_size
        return total

    orig_size = _ir_size(original_path)
    quant_size = _ir_size(output_path)
    if orig_size and quant_size:
        print(f"Size reduction: {orig_size / quant_size:.1f}x")
    return quantized
# ── 6. Async inference queue ──────────────────────────────────────────────────
def batch_inference_async(
    compiled: ov.CompiledModel,
    inputs: list[dict],
    n_streams: int = 4,
) -> list[np.ndarray]:
    """
    High-throughput async inference with OpenVINO AsyncInferQueue.
    Saturates all CPU cores with concurrent inference requests.
    """
    # Pre-size the result list; each callback writes into its own slot.
    outputs: list[np.ndarray | None] = [None] * len(inputs)

    def collect(request: ov.InferRequest, slot: int):
        # Copy so the result survives reuse of the request's tensor buffer.
        outputs[slot] = request.get_output_tensor(0).data.copy()

    queue = ov.AsyncInferQueue(compiled, n_streams)
    queue.set_callback(collect)
    for slot, feed in enumerate(inputs):
        queue.start_async(feed, userdata=slot)
    queue.wait_all()
    return outputs
# ── 7. LLM with OpenVINO GenAI ───────────────────────────────────────────────
def run_llm_genai(
    model_dir: str = "./Llama-3.2-1B-ov",
    prompt: str = "What is OpenVINO?",
    max_tokens: int = 256,
    device: str = "CPU",
) -> str:
    """
    LLM inference with openvino_genai — simplest path for text generation.
    pip install openvino-genai
    """
    import openvino_genai as ov_genai

    pipeline = ov_genai.LLMPipeline(model_dir, device)
    gen_config = ov_genai.GenerationConfig()
    gen_config.max_new_tokens = max_tokens
    gen_config.do_sample = False  # greedy decoding → deterministic output
    return pipeline.generate(prompt, gen_config)
def export_hf_model_for_openvino(
    hf_model_id: str = "meta-llama/Llama-3.2-1B",
    output_dir: str = "./Llama-3.2-1B-ov",
    precision: str = "INT4",  # INT4 | INT8 | FP16
):
    """
    Export a Hugging Face LLM to OpenVINO format via optimum-intel.
    pip install optimum[openvino]
    """
    from optimum.intel import OVModelForCausalLM
    from transformers import AutoTokenizer

    print(f"Exporting {hf_model_id} → {output_dir} ({precision})...")
    # Weight-compression flags are mutually exclusive; FP16 leaves both off.
    use_int8 = precision == "INT8"
    use_int4 = precision == "INT4"
    ov_lm = OVModelForCausalLM.from_pretrained(
        hf_model_id,
        export=True,
        load_in_8bit=use_int8,
        load_in_4bit=use_int4,
    )
    ov_lm.save_pretrained(output_dir)
    # Ship the tokenizer alongside the model so the export dir is self-contained.
    hf_tokenizer = AutoTokenizer.from_pretrained(hf_model_id)
    hf_tokenizer.save_pretrained(output_dir)
    print(f"Model exported: {output_dir}")
# ── 8. Benchmark ──────────────────────────────────────────────────────────────
def benchmark_compiled_model(
    compiled: ov.CompiledModel,
    sample_input: dict,
    n_warmup: int = 5,
    n_runs: int = 50,
) -> dict:
    """Benchmark latency and throughput of a compiled model."""
    # Warmup runs let caches and lazy initialization settle before timing.
    for _ in range(n_warmup):
        compiled(sample_input)
    # Timed runs, recorded per-call in milliseconds.
    timings_ms: list[float] = []
    for _ in range(n_runs):
        start = time.perf_counter()
        compiled(sample_input)
        timings_ms.append((time.perf_counter() - start) * 1000)  # ms
    mean_latency = np.mean(timings_ms)
    stats = {
        "mean_ms": mean_latency,
        "p50_ms": np.percentile(timings_ms, 50),
        "p95_ms": np.percentile(timings_ms, 95),
        "p99_ms": np.percentile(timings_ms, 99),
        "fps": 1000 / mean_latency,
    }
    print(f"\n=== OpenVINO Latency Benchmark ({n_runs} runs) ===")
    for k, v in stats.items():
        print(f" {k:<12}: {v:>8.2f}")
    return stats
def _demo() -> None:
    """End-to-end demo: convert ResNet50, compile, benchmark, async batch."""
    core = setup_core()
    # Convert torchvision ResNet50 to OpenVINO IR
    print("\nConverting ResNet50...")
    resnet_ir = convert_torchvision_classifier("resnet50", "./resnet50.xml")
    # Compile for batch throughput on CPU
    cpu_model = compile_for_throughput(core, resnet_ir, "CPU")
    # Benchmark with a random NCHW input
    random_frame = np.random.randn(1, 3, 224, 224).astype(np.float32)
    feed = {cpu_model.input(0): random_frame}
    benchmark_compiled_model(cpu_model, feed, n_runs=100)
    # AsyncInferQueue batch example
    batch_feeds = [feed] * 20
    batch_outputs = batch_inference_async(cpu_model, batch_feeds, n_streams=4)
    print(f"\nAsync inference: {len(batch_outputs)} results")


if __name__ == "__main__":
    _demo()
For the ONNX Runtime alternative when needing cross-platform deployment across Windows, Linux, macOS, iOS, and Android with the broadest accelerator plugin ecosystem — ONNX Runtime covers the widest platform range while OpenVINO delivers consistently the highest throughput and lowest latency specifically on Intel CPUs, Intel Integrated GPUs, and Intel NPUs through device-specific kernel fusion and INT8 calibration that ONNX Runtime’s generic optimization cannot match on Intel silicon. For the TensorRT alternative when deploying on NVIDIA GPUs and needing FP8, dynamic shapes, and CUDA graph optimization — TensorRT is purpose-built for NVIDIA while OpenVINO’s MULTI:CPU,GPU device plugin and AUTO device selection enables transparent workload distribution across heterogeneous Intel hardware without code changes, and openvino-genai provides a single-call LLM interface for running quantized LLaMA and Qwen models on CPU-only servers. The Claude Skills 360 bundle includes OpenVINO skill sets covering model conversion, device targeting, INT8 NNCF quantization, async inference queues, LLM GenAI pipelines, optimum-intel HF model export, and throughput benchmarking. Start with the free tier to try Intel-optimized inference code generation.