TensorRT optimizes neural network inference on NVIDIA GPUs. pip install tensorrt. import tensorrt as trt. Build engine from ONNX: logger = trt.Logger(trt.Logger.WARNING), builder = trt.Builder(logger), network = builder.create_network(1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)), parser = trt.OnnxParser(network, logger), parser.parse_from_file("model.onnx"). Config: config = builder.create_builder_config(), config.set_memory_pool_limit(trt.MemoryPoolType.WORKSPACE, 4 << 30) (4GB), config.set_flag(trt.BuilderFlag.FP16) for fp16. Engine: serialized = builder.build_serialized_network(network, config). Save: open("model.trt","wb").write(serialized). Load and run: runtime = trt.Runtime(logger), engine = runtime.deserialize_cuda_engine(open("model.trt","rb").read()), context = engine.create_execution_context(). Input: context.set_input_shape("input", (batch, 3, 224, 224)). Allocate: input_mem = cuda.mem_alloc(input_tensor.nbytes), output_mem = cuda.mem_alloc(output_size). cuda.memcpy_htod_async(input_mem, input_np, stream). context.execute_async_v3(stream.handle). cuda.memcpy_dtoh_async(output_np, output_mem, stream). stream.synchronize(). INT8: config.set_flag(trt.BuilderFlag.INT8), config.int8_calibrator = MyCalibrator(calibration_data) — the calibrator subclasses trt.IInt8EntropyCalibrator2 and must implement get_batch/get_batch_size/read_calibration_cache/write_calibration_cache (get_algorithm is supplied by the base class). Dynamic shapes: profile = builder.create_optimization_profile(), profile.set_shape("input", min=(1,3,224,224), opt=(8,3,224,224), max=(32,3,224,224)), config.add_optimization_profile(profile). Torch conversion: from torch2trt import torch2trt, trt_model = torch2trt(model, [x_dummy], fp16_mode=True). Claude Code generates TensorRT build scripts, calibration pipelines, dynamic shape configs, and CUDA stream inference loops.
CLAUDE.md for TensorRT
## TensorRT Stack
- Version: tensorrt >= 10.0 + cuda >= 12.0 + pycuda
- Build: Builder → create_network(EXPLICIT_BATCH) → OnnxParser.parse_from_file → build_serialized_network
- Config: set_memory_pool_limit(WORKSPACE, N<<30), set_flag(FP16|INT8|TF32)
- Dynamic: create_optimization_profile().set_shape("input", min, opt, max)
- Run: Runtime.deserialize_cuda_engine → create_execution_context → set_input_shape → execute_async_v3
- INT8 calibration: IInt8EntropyCalibrator2 with get_batch returning device pointer
- Torch: torch2trt(model, [dummy], fp16_mode=True) — one-liner conversion
TensorRT Engine Pipeline
# inference/tensorrt_engine.py — build, save, and run TensorRT engines
from __future__ import annotations
import os
import pickle
from pathlib import Path
from typing import Optional
import numpy as np
import torch
try:
import pycuda.autoinit # noqa: F401 — initializes CUDA context
import pycuda.driver as cuda
import tensorrt as trt
TRT_AVAILABLE = True
except ImportError:
TRT_AVAILABLE = False
# ── ONNX export ───────────────────────────────────────────────────────────────
def export_to_onnx(
model: torch.nn.Module,
input_shape: tuple[int, ...],
onnx_path: str = "model.onnx",
opset: int = 17,
dynamic_batch: bool = True,
) -> str:
"""Export PyTorch model to ONNX with optional dynamic batch dimension."""
model.eval()
dummy = torch.randn(1, *input_shape[1:], device="cuda" if next(model.parameters()).is_cuda else "cpu")
dynamic_axes = {"input": {0: "batch"}, "output": {0: "batch"}} if dynamic_batch else None
torch.onnx.export(
model,
dummy,
onnx_path,
export_params=True,
opset_version=opset,
do_constant_folding=True,
input_names=["input"],
output_names=["output"],
dynamic_axes=dynamic_axes,
)
print(f"ONNX model exported: {onnx_path}")
return onnx_path
# ── INT8 Calibration ──────────────────────────────────────────────────────────
class Int8EntropyCalibrator(trt.IInt8EntropyCalibrator2 if TRT_AVAILABLE else object):
    """
    INT8 entropy calibrator.

    Streams representative batches to TensorRT so it can derive quantization
    scales, and caches the resulting calibration table on disk so repeated
    engine builds skip the (slow) calibration pass.
    """

    def __init__(
        self,
        calibration_data: np.ndarray,  # (N, C, H, W) float32 calibration images
        batch_size: int = 32,
        cache_file: str = "calibration.cache",
    ):
        if TRT_AVAILABLE:
            super().__init__()
        self.data = calibration_data.astype(np.float32)
        self.batch_size = batch_size
        self.cache_file = cache_file
        self.index = 0
        # Single device buffer sized for one full batch, reused on every copy.
        self._device_mem = cuda.mem_alloc(self.data[0:batch_size].nbytes) if TRT_AVAILABLE else None

    def get_batch_size(self) -> int:
        """Batch size TensorRT should expect from each get_batch() call."""
        return self.batch_size

    def get_batch(self, names: list[str]) -> Optional[list]:
        """Upload the next batch to the GPU; None signals calibration is done."""
        end = self.index + self.batch_size
        if end > len(self.data):
            return None
        chunk = np.ascontiguousarray(self.data[self.index:end])
        cuda.memcpy_htod(self._device_mem, chunk)
        self.index = end
        return [int(self._device_mem)]

    def read_calibration_cache(self) -> Optional[bytes]:
        """Return a previously written calibration table, or None if absent."""
        if not os.path.exists(self.cache_file):
            return None
        with open(self.cache_file, "rb") as f:
            return f.read()

    def write_calibration_cache(self, cache: bytes) -> None:
        """Persist the calibration table for reuse by later builds."""
        with open(self.cache_file, "wb") as f:
            f.write(cache)
        print(f"Calibration cache written: {self.cache_file}")
# ── Engine builder ────────────────────────────────────────────────────────────
class TRTEngineBuilder:
    """Build TensorRT engines from ONNX with fp16/int8 and dynamic shapes."""

    def __init__(self, verbose: bool = False):
        """Create the builder; `verbose` raises logger severity to VERBOSE."""
        severity = trt.Logger.VERBOSE if verbose else trt.Logger.WARNING
        self.logger = trt.Logger(severity)
        self.builder = trt.Builder(self.logger)

    def build_from_onnx(
        self,
        onnx_path: str,
        engine_path: str,
        precision: str = "fp16",  # "fp32", "fp16", "int8"
        workspace_gb: int = 4,
        # Dynamic shape: {tensor_name: (min, opt, max)} each is a shape tuple
        dynamic_shapes: Optional[dict] = None,
        calibrator=None,
    ) -> bytes:
        """Parse ONNX, build a serialized engine, and write it to disk.

        Args:
            onnx_path: ONNX model to parse.
            engine_path: destination for the serialized engine plan.
            precision: "fp32", "fp16", or "int8".
            workspace_gb: workspace memory-pool limit in GiB.
            dynamic_shapes: per-tensor (min_shape, opt_shape, max_shape).
            calibrator: IInt8Calibrator for int8 builds (ignored otherwise).

        Returns:
            Serialized engine bytes (also saved to `engine_path`).

        Raises:
            RuntimeError: if ONNX parsing or the engine build fails.
        """
        network = self.builder.create_network(
            1 << int(trt.NetworkDefinitionCreationFlag.EXPLICIT_BATCH)
        )
        parser = trt.OnnxParser(network, self.logger)
        # parse_from_file (rather than parse(bytes)) lets the parser resolve
        # external-weight files stored next to the .onnx model.
        if not parser.parse_from_file(onnx_path):
            for i in range(parser.num_errors):
                print(f"ONNX parse error: {parser.get_error(i)}")
            raise RuntimeError("Failed to parse ONNX model")
        config = self.builder.create_builder_config()
        config.set_memory_pool_limit(
            trt.MemoryPoolType.WORKSPACE, workspace_gb << 30
        )
        # Precision flags
        if precision == "fp16":
            config.set_flag(trt.BuilderFlag.FP16)
        elif precision == "int8":
            config.set_flag(trt.BuilderFlag.INT8)
            config.set_flag(trt.BuilderFlag.FP16)  # INT8 layers may fall back to FP16
            if calibrator:
                config.int8_calibrator = calibrator
        # Dynamic shape optimization profiles
        if dynamic_shapes:
            profile = self.builder.create_optimization_profile()
            for tensor_name, (min_s, opt_s, max_s) in dynamic_shapes.items():
                profile.set_shape(tensor_name, min=min_s, opt=opt_s, max=max_s)
            config.add_optimization_profile(profile)
            # INT8 calibration over dynamic inputs requires an explicit
            # calibration profile so the calibrator knows what shape to feed.
            if precision == "int8" and calibrator:
                config.set_calibration_profile(profile)
        print(f"Building TensorRT engine ({precision})...")
        serialized = self.builder.build_serialized_network(network, config)
        if serialized is None:
            raise RuntimeError("Engine build failed")
        Path(engine_path).parent.mkdir(parents=True, exist_ok=True)
        with open(engine_path, "wb") as f:
            f.write(serialized)
        print(f"Engine saved: {engine_path}")
        return serialized
# ── Inference runtime ─────────────────────────────────────────────────────────
class TRTInferencer:
    """Load a saved TensorRT engine and run batched inference.

    Device buffers are cached per tensor name and only reallocated when a
    request needs more bytes than the cached allocation holds, so
    steady-state inference performs no per-call cudaMalloc (the original
    pre-allocation was never reused by `infer`, which malloc'd every call).
    """

    def __init__(self, engine_path: str):
        logger = trt.Logger(trt.Logger.WARNING)
        runtime = trt.Runtime(logger)
        with open(engine_path, "rb") as f:
            self.engine = runtime.deserialize_cuda_engine(f.read())
        if self.engine is None:
            # deserialize_cuda_engine returns None on failure instead of raising.
            raise RuntimeError(f"Failed to deserialize engine: {engine_path}")
        self.context = self.engine.create_execution_context()
        self.stream = cuda.Stream()
        # name -> (DeviceAllocation, capacity_bytes); grown on demand.
        self._buffers: dict[str, tuple] = {}

    def _device_buffer(self, name: str, nbytes: int):
        """Return a device buffer of at least `nbytes` for tensor `name`.

        Reuses the cached allocation when large enough; otherwise replaces
        it (pycuda frees the old DeviceAllocation when it is GC'd).
        """
        cached = self._buffers.get(name)
        if cached is None or cached[1] < nbytes:
            cached = (cuda.mem_alloc(nbytes), nbytes)
            self._buffers[name] = cached
        return cached[0]

    def infer(
        self,
        inputs: dict[str, np.ndarray],
    ) -> dict[str, np.ndarray]:
        """Run synchronous inference. Returns output arrays keyed by tensor name."""
        # Keep contiguous host copies alive until the stream is synchronized —
        # memcpy_htod_async on a temporary that gets GC'd mid-copy is a
        # classic pycuda use-after-free.
        host_inputs = []
        for name, arr in inputs.items():
            contig = np.ascontiguousarray(arr)
            host_inputs.append(contig)
            self.context.set_input_shape(name, contig.shape)  # needed for dynamic dims
            buf = self._device_buffer(name, contig.nbytes)
            cuda.memcpy_htod_async(buf, contig, self.stream)
            self.context.set_tensor_address(name, int(buf))
        # Bind outputs using shapes the context infers from the input shapes.
        results = {}
        for i in range(self.engine.num_io_tensors):
            name = self.engine.get_tensor_name(i)
            if self.engine.get_tensor_mode(name) != trt.TensorIOMode.OUTPUT:
                continue
            shape = tuple(self.context.get_tensor_shape(name))
            dtype = trt.nptype(self.engine.get_tensor_dtype(name))
            out_arr = np.empty(shape, dtype=dtype)
            buf = self._device_buffer(name, out_arr.nbytes)
            self.context.set_tensor_address(name, int(buf))
            results[name] = (buf, out_arr)
        # Run inference
        self.context.execute_async_v3(stream_handle=self.stream.handle)
        # Copy outputs to host
        for name, (buf, out_arr) in results.items():
            cuda.memcpy_dtoh_async(out_arr, buf, self.stream)
        self.stream.synchronize()
        return {name: arr for name, (_, arr) in results.items()}

    def infer_torch(self, x: torch.Tensor) -> torch.Tensor:
        """Convenience wrapper: accepts and returns torch tensors (input tensor "input")."""
        out_dict = self.infer({"input": x.cpu().numpy()})
        return torch.from_numpy(next(iter(out_dict.values())))
# ── Full pipeline example ─────────────────────────────────────────────────────
def build_and_benchmark(
    model: torch.nn.Module,
    input_shape: tuple[int, ...] = (1, 3, 224, 224),
    engine_path: str = "outputs/model_fp16.trt",
    precision: str = "fp16",
):
    """Export → build → benchmark pipeline.

    Exports `model` to ONNX with a dynamic batch axis, builds a TensorRT
    engine profiled for batch 1-32, then benchmarks the engine against
    eager PyTorch under fp16 autocast at batch=8.

    Args:
        model: model to deploy; moved to CUDA for the PyTorch baseline.
        input_shape: (batch, C, H, W) — the batch dim is ignored (dynamic).
        engine_path: destination for the serialized engine.
        precision: "fp32", "fp16", or "int8", forwarded to the builder.
    """
    import time

    onnx_path = engine_path.replace(".trt", ".onnx")
    # Step 1: Export to ONNX
    export_to_onnx(model, input_shape, onnx_path, dynamic_batch=True)
    # Step 2: Build TRT engine with dynamic batch (1-32)
    C, H, W = input_shape[1:]
    builder = TRTEngineBuilder()
    builder.build_from_onnx(
        onnx_path,
        engine_path,
        precision=precision,
        dynamic_shapes={"input": ((1, C, H, W), (8, C, H, W), (32, C, H, W))},
    )
    # Step 3: Benchmark TRT vs eager PyTorch
    inferencer = TRTInferencer(engine_path)
    x = np.random.randn(8, *input_shape[1:]).astype(np.float32)
    N = 100
    # TRT warm-up + timing. infer() synchronizes its own stream, so the
    # wall-clock loop already measures completed work.
    for _ in range(10):
        inferencer.infer({"input": x})
    t0 = time.perf_counter()
    for _ in range(N):
        inferencer.infer({"input": x})
    trt_ms = (time.perf_counter() - t0) * 1e3 / N
    # PyTorch baseline.
    model.eval().cuda()
    x_t = torch.from_numpy(x).cuda()
    with torch.no_grad(), torch.amp.autocast("cuda"):
        for _ in range(10):
            model(x_t)
        # Drain the async warm-up kernels so they are not billed to the
        # timed loop (CUDA launches are asynchronous).
        torch.cuda.synchronize()
        t0 = time.perf_counter()
        for _ in range(N):
            model(x_t)
        torch.cuda.synchronize()
    torch_ms = (time.perf_counter() - t0) * 1e3 / N
    speedup = torch_ms / trt_ms
    print("\nBenchmark (batch=8):")
    print(f" PyTorch fp16: {torch_ms:.2f} ms")
    print(f" TensorRT {precision}: {trt_ms:.2f} ms")
    print(f" Speedup: {speedup:.2f}x")
Choose the ONNX Runtime alternative when you need CPU inference, cross-platform deployment (Linux/Windows/ARM/edge), or inference pipelines that must run without NVIDIA hardware — ONNX Runtime's execution providers span CPU/CUDA/TensorRT/CoreML, whereas TensorRT's tighter CUDA integration enables layer fusion and precision calibration that achieves 2-8x lower latency specifically on NVIDIA GPUs. Choose the TorchScript/torch.compile alternative when staying within the PyTorch ecosystem or running models that use dynamic control flow — torch.compile with mode="reduce-overhead" gives meaningful speedups without ONNX conversion, while TensorRT's engine builder performs kernel/layer fusion and INT8 calibration that can't be replicated by JIT compilation alone, making it the choice for production inference where latency SLAs require maximum GPU utilization. The Claude Skills 360 bundle includes TensorRT skill sets covering ONNX export, FP16/INT8 engine building, dynamic shape profiles, INT8 calibration, and CUDA stream inference. Start with the free tier to try GPU inference optimization generation.