ONNX (Open Neural Network Exchange) is a portable format for ML models: export once, run anywhere. PyTorch’s torch.onnx.export serializes computation graphs. ONNX Runtime’s graph optimizer folds constants, merges operations, and eliminates dead nodes — typically 2-4× faster than eager PyTorch. INT8 quantization reduces model size ~4× and CPU latency ~2× with <1% accuracy drop. ORT runs on CPU, CUDA, TensorRT, CoreML, and DirectML execution providers. Claude Code generates ONNX export scripts, ORT optimization pipelines, quantization configurations, and the runtime integration code for production model serving.
CLAUDE.md for ONNX Projects
## ONNX Stack
- Export: torch.onnx.export (PyTorch) or optimum.exporters (Hugging Face)
- Runtime: onnxruntime >= 1.19 (CPU), onnxruntime-gpu (CUDA/TensorRT)
- Optimization: onnxruntime.transformers for transformer-specific fusions
- Quantization: quantize_dynamic (dynamic INT8) or quantize_static (QDQ format) with calibration dataset
- Validation: compare onnx vs pytorch outputs with np.allclose(rtol=1e-3)
- Serving: ORT inside a Python FastAPI service, or compile to a C++ shared lib for embedded
Export PyTorch to ONNX
# export/pytorch_export.py — export PyTorch models to ONNX
import torch
import torch.nn as nn
import onnx
import onnxruntime as ort
import numpy as np
from pathlib import Path
def export_classification_model(
    model: nn.Module,
    input_shape: tuple,
    output_path: str,
    opset_version: int = 17,
    dynamic_axes: dict | None = None,
) -> str:
    """Export a PyTorch classification model to ONNX and validate the result.

    Args:
        model: Model to export; switched to eval mode in place.
        input_shape: Full shape of the dummy tracing input, including the
            batch dimension, e.g. (1, 3, 224, 224).
        output_path: Destination .onnx file path.
        opset_version: ONNX opset to target.
        dynamic_axes: Mapping of tensor name -> {dim index: dim name} for
            runtime-variable dimensions. Defaults to a dynamic batch
            dimension on "input" and "output".

    Returns:
        output_path, so calls can be chained.
    """
    model.eval()
    # Create the tracing input on the same device as the model's parameters;
    # a CPU dummy input against a GPU model would make export fail with a
    # device mismatch. Parameterless models fall back to CPU.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")
    dummy_input = torch.randn(*input_shape, device=device)
    # Dynamic axes: allow variable batch size (and other dims) at runtime
    if dynamic_axes is None:
        dynamic_axes = {
            "input": {0: "batch_size"},
            "output": {0: "batch_size"},
        }
    torch.onnx.export(
        model,
        dummy_input,
        output_path,
        export_params=True,
        opset_version=opset_version,
        do_constant_folding=True,  # Fold constant expressions at export time
        input_names=["input"],
        output_names=["output"],
        dynamic_axes=dynamic_axes,
    )
    # Validate the exported model structurally before declaring success
    onnx_model = onnx.load(output_path)
    onnx.checker.check_model(onnx_model)
    print(f"Model exported and validated: {output_path}")
    print(f"ONNX opset: {onnx_model.opset_import[0].version}")
    print(f"Inputs: {[i.name for i in onnx_model.graph.input]}")
    print(f"Outputs: {[o.name for o in onnx_model.graph.output]}")
    return output_path
def export_transformer_model(
    model_name_or_path: str,
    output_dir: str,
    task: str = "text-classification",
) -> str:
    """Export Hugging Face transformer using Optimum (recommended path)."""
    from optimum.onnxruntime import ORTModelForSequenceClassification
    from transformers import AutoTokenizer

    # Optimum picks the opset, dynamic axes, and graph optimizations itself,
    # so no manual torch.onnx.export configuration is needed here.
    ort_model = ORTModelForSequenceClassification.from_pretrained(
        model_name_or_path,
        export=True,  # Convert to ONNX while loading
    )
    tok = AutoTokenizer.from_pretrained(model_name_or_path)
    destination = Path(output_dir)
    destination.mkdir(parents=True, exist_ok=True)
    # Save the ONNX model and its tokenizer side by side for serving.
    ort_model.save_pretrained(output_dir)
    tok.save_pretrained(output_dir)
    print(f"Transformer exported to {output_dir}")
    return output_dir
def validate_onnx_matches_pytorch(
    pytorch_model: nn.Module,
    onnx_path: str,
    test_input: torch.Tensor,
    rtol: float = 1e-3,
    atol: float = 1e-5,
) -> bool:
    """Compare ONNX and PyTorch outputs for numerical equivalence."""
    # Reference output from the eager PyTorch model, gradients disabled.
    pytorch_model.eval()
    with torch.no_grad():
        reference = pytorch_model(test_input).numpy()
    # Candidate output from the exported graph via ONNX Runtime.
    session = ort.InferenceSession(onnx_path)
    candidate = session.run(None, {"input": test_input.numpy()})[0]
    is_close = np.allclose(reference, candidate, rtol=rtol, atol=atol)
    worst = np.max(np.abs(reference - candidate))
    print(f"Max absolute difference: {worst:.6f}")
    print(f"Outputs match (rtol={rtol}): {is_close}")
    return is_close
ONNX Runtime Optimization
# optimization/optimize_onnx.py — graph-level optimizations
import onnxruntime as ort
from onnxruntime.transformers import optimizer
from onnxruntime.transformers.fusion_options import FusionOptions
import onnx
from pathlib import Path
def optimize_graph(
    input_path: str,
    output_path: str,
    optimization_level: "ort.GraphOptimizationLevel" = ort.GraphOptimizationLevel.ORT_ENABLE_ALL,
) -> str:
    """Apply ONNX Runtime graph-level optimizations and save the result.

    Args:
        input_path: Path to the unoptimized .onnx model.
        output_path: Where the optimized graph is written.
        optimization_level: An ort.GraphOptimizationLevel enum value
            (annotation fixed — the previous ``int`` hint was wrong).

    Returns:
        output_path for chaining.
    """
    sess_options = ort.SessionOptions()
    sess_options.graph_optimization_level = optimization_level
    sess_options.optimized_model_filepath = output_path
    # Constructing the session is what applies the optimizations and writes
    # the optimized graph to optimized_model_filepath; the session object
    # itself is intentionally discarded.
    _ = ort.InferenceSession(
        input_path,
        sess_options=sess_options,
        providers=["CPUExecutionProvider"],
    )
    # Log node counts before/after to show the effect of the passes
    original = onnx.load(input_path)
    optimized = onnx.load(output_path)
    print(f"Nodes before: {len(original.graph.node)}")
    print(f"Nodes after: {len(optimized.graph.node)}")
    return output_path
def optimize_transformer(
    input_path: str,
    output_path: str,
    model_type: str = "bert",  # bert, roberta, gpt2, bart, t5
    use_gpu: bool = False,
    num_heads: int = 12,
    hidden_size: int = 768,
) -> str:
    """Apply transformer-specific fusions (attention, layernorm, etc).

    Args:
        input_path: Exported .onnx model to optimize.
        output_path: Where the fused model is written.
        model_type: Architecture family the fuser should assume.
        use_gpu: Emit GPU-oriented fused kernels when True.
        num_heads: Attention head count of the model (12 matches BERT-base).
        hidden_size: Hidden dimension of the model (768 matches BERT-base).

    Returns:
        output_path for chaining.
    """
    # Explicitly enable the fusions we rely on rather than trusting defaults.
    fusion_options = FusionOptions(model_type)
    fusion_options.enable_gelu = True
    fusion_options.enable_layer_norm = True
    fusion_options.enable_attention = True
    fusion_options.enable_skip_layer_norm = True
    fusion_options.enable_bias_gelu = True
    opt_model = optimizer.optimize_model(
        input_path,
        model_type=model_type,
        num_heads=num_heads,
        hidden_size=hidden_size,
        opt_level=99,  # 99 = apply all available optimization passes
        use_gpu=use_gpu,
        optimization_options=fusion_options,
    )
    opt_model.save_model_to_file(output_path)
    # Plain string — the original used an f-string with no placeholders.
    print("Transformer fusions applied:")
    print(f"  Attention fusions: {opt_model.get_fused_operator_statistics()}")
    return output_path
INT8 Quantization
# quantization/quantize.py — reduce model size and latency with INT8
import onnxruntime as ort
from onnxruntime.quantization import (
quantize_dynamic,
quantize_static,
QuantType,
QuantFormat,
CalibrationMethod,
)
from onnxruntime.quantization.calibrate import CalibrationDataReader
import numpy as np
def quantize_int8_dynamic(
    input_path: str,
    output_path: str,
    weight_type: QuantType = QuantType.QInt8,
) -> str:
    """Dynamic INT8 quantization — quantizes weights, activations at runtime.

    Args:
        input_path: Path to the FP32 .onnx model.
        output_path: Where the quantized model is written.
        weight_type: Integer type for the quantized weights.

    Returns:
        output_path for chaining.
    """
    import os

    # NOTE: the `optimize_model` kwarg was deprecated and then removed from
    # quantize_dynamic (this project pins onnxruntime >= 1.19, where passing
    # it raises TypeError). If graph optimization is wanted before
    # quantization, run `python -m onnxruntime.quantization.preprocess`
    # on the model first.
    quantize_dynamic(
        model_input=input_path,
        model_output=output_path,
        weight_type=weight_type,
    )
    # Report the size reduction — dynamic INT8 typically shrinks ~4x
    original_mb = os.path.getsize(input_path) / 1024 / 1024
    quantized_mb = os.path.getsize(output_path) / 1024 / 1024
    print(f"Original: {original_mb:.1f} MB → Quantized: {quantized_mb:.1f} MB ({(1 - quantized_mb/original_mb)*100:.0f}% reduction)")
    return output_path
class TextCalibrationReader(CalibrationDataReader):
    """Provides calibration batches for static INT8 quantization."""

    def __init__(self, tokenizer, texts: list[str], batch_size: int = 16):
        # Keep the raw inputs around; batches are tokenized eagerly up front.
        self.texts = texts
        self.tokenizer = tokenizer
        self.batch_size = batch_size
        self._index = 0
        self._batches = self._create_batches()

    def _create_batches(self):
        """Tokenize the texts into a list of ORT-ready int64 feed dicts."""
        prepared = []
        step = self.batch_size
        for start in range(0, len(self.texts), step):
            chunk = self.texts[start:start + step]
            enc = self.tokenizer(
                chunk,
                padding=True,
                truncation=True,
                max_length=128,
                return_tensors="np",
            )
            # Models without segment embeddings get all-zero token_type_ids.
            prepared.append({
                "input_ids": enc["input_ids"].astype(np.int64),
                "attention_mask": enc["attention_mask"].astype(np.int64),
                "token_type_ids": enc.get("token_type_ids", np.zeros_like(enc["input_ids"])).astype(np.int64),
            })
        return prepared

    def get_next(self):
        """Return the next calibration batch, or None when exhausted."""
        if self._index < len(self._batches):
            current = self._batches[self._index]
            self._index += 1
            return current
        return None
def quantize_int8_static(
    input_path: str,
    output_path: str,
    calibration_reader: CalibrationDataReader,
) -> str:
    """Static INT8 quantization with calibration dataset — best accuracy."""
    # Activations and weights are both quantized ahead of time; the
    # calibration reader supplies representative inputs for range estimation.
    quantize_static(
        model_input=input_path,
        model_output=output_path,
        calibration_data_reader=calibration_reader,
        # QDQ (QuantizeLinear/DequantizeLinear) keeps TensorRT compatibility
        quant_format=QuantFormat.QDQ,
        activation_type=QuantType.QInt8,
        weight_type=QuantType.QInt8,
        calibrate_method=CalibrationMethod.MinMax,
        # Per-channel weight scales preserve accuracy better than per-tensor
        per_channel=True,
    )
    return output_path
Production Inference Session
# inference/ort_session.py — optimized ORT inference
import onnxruntime as ort
import numpy as np
from transformers import AutoTokenizer
from typing import Any
import time
class ORTTextClassifier:
    """Production ONNX Runtime inference wrapper."""

    def __init__(
        self,
        model_path: str,
        tokenizer_path: str,
        max_length: int = 128,
        num_threads: int = 4,
        use_gpu: bool = False,
    ):
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)
        self.max_length = max_length
        # Threading / parallelism configuration for the ORT session
        opts = ort.SessionOptions()
        opts.intra_op_num_threads = num_threads
        opts.inter_op_num_threads = 2
        opts.execution_mode = ort.ExecutionMode.ORT_PARALLEL
        opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
        # Execution provider: prefer CUDA with a CPU fallback when requested
        if use_gpu:
            providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
        else:
            providers = ["CPUExecutionProvider"]
        self.session = ort.InferenceSession(
            model_path,
            sess_options=opts,
            providers=providers,
        )
        # Tensor names never change per call, so resolve them once
        self.input_names = [inp.name for inp in self.session.get_inputs()]
        self.output_names = [out.name for out in self.session.get_outputs()]

    def predict(self, texts: list[str]) -> list[dict]:
        """Run batch inference and return labels + scores."""
        enc = self.tokenizer(
            texts,
            padding=True,
            truncation=True,
            max_length=self.max_length,
            return_tensors="np",
        )
        feed = {
            "input_ids": enc["input_ids"].astype(np.int64),
            "attention_mask": enc["attention_mask"].astype(np.int64),
        }
        # BERT-style graphs also expect segment ids; synthesize zeros when
        # the tokenizer does not emit them.
        if "token_type_ids" in self.input_names:
            segments = enc.get(
                "token_type_ids",
                np.zeros_like(enc["input_ids"])
            )
            feed["token_type_ids"] = segments.astype(np.int64)
        logits = self.session.run(self.output_names, feed)[0]
        # Numerically stable softmax over the class axis
        shifted = np.exp(logits - logits.max(axis=-1, keepdims=True))
        probs = shifted / shifted.sum(axis=-1, keepdims=True)
        predictions = []
        for row in probs:
            predictions.append({
                "label": int(np.argmax(row)),
                "score": float(np.max(row)),
                "probabilities": row.tolist(),
            })
        return predictions

    def benchmark(self, texts: list[str], n_runs: int = 50) -> dict:
        """Measure inference latency."""
        # Warmup runs so one-time costs don't skew the measurement
        for _ in range(3):
            self.predict(texts[:1])
        latencies_ms = []
        for _ in range(n_runs):
            started = time.perf_counter()
            self.predict(texts)
            latencies_ms.append((time.perf_counter() - started) * 1000)
        mean_ms = np.mean(latencies_ms)
        return {
            "mean_ms": mean_ms,
            "p50_ms": np.percentile(latencies_ms, 50),
            "p95_ms": np.percentile(latencies_ms, 95),
            "p99_ms": np.percentile(latencies_ms, 99),
            "throughput_per_sec": len(texts) / (mean_ms / 1000),
        }
For the Hugging Face Transformers training pipeline that produces models you export to ONNX, see the Transformers guide for LoRA fine-tuning and Trainer API. For the vLLM inference server that handles large causal LMs with PagedAttention rather than ONNX, the vLLM guide covers high-throughput generative inference. The Claude Skills 360 bundle includes ONNX skill sets covering export pipelines, ORT optimization, and INT8 quantization. Start with the free tier to try ONNX export script generation.