Production ML inference has different requirements than training: minimize latency for interactive use cases, maximize throughput for batch jobs, and fit models into GPU memory efficiently. vLLM handles LLM serving with PagedAttention for KV cache management. TorchServe handles classification and embedding models. Claude Code configures both, writes the batching middleware, and sets up the monitoring for GPU utilization and inference latency.
CLAUDE.md for Inference Projects
## ML Inference Stack
- LLM serving: vLLM (OpenAI-compatible API on port 8000)
- Classification/embedding: TorchServe (gRPC on 7070, HTTP on 8080)
- GPU: A100 80GB for large models, T4 16GB for smaller ones
- Model storage: Hugging Face Hub (private org models) or local NFS
- Batching: vLLM handles dynamic batching automatically
- Quantization: AWQ 4-bit for memory-constrained deployments
- Monitoring: DCGM exporter for GPU metrics, OpenTelemetry for request tracing
Serving LLMs with vLLM
# Start vLLM with OpenAI-compatible API.
# NOTE: inline comments after a trailing backslash ("\ # ...") break the
# line continuation — the shell treats "\ " as an escaped-space argument and
# then runs every following --flag line as its own (failing) command.
# Flag explanations therefore live above the command:
#   --tensor-parallel-size    number of GPUs to split the model across
#   --gpu-memory-utilization  leave 15% for KV cache growth
#   --max-num-batched-tokens  total tokens across all concurrent requests
#   --enable-prefix-caching   cache KV for shared prefixes (system prompts)
python -m vllm.entrypoints.openai.api_server \
  --model meta-llama/Meta-Llama-3.1-8B-Instruct \
  --tensor-parallel-size 1 \
  --gpu-memory-utilization 0.85 \
  --max-model-len 8192 \
  --max-num-batched-tokens 32768 \
  --enable-prefix-caching \
  --port 8000

# For larger models: tensor parallelism across multiple GPUs (split across 4).
python -m vllm.entrypoints.openai.api_server \
  --model meta-llama/Meta-Llama-3.1-70B-Instruct \
  --tensor-parallel-size 4 \
  --dtype bfloat16 \
  --gpu-memory-utilization 0.90

# For memory-constrained deployments: AWQ 4-bit quantization.
python -m vllm.entrypoints.openai.api_server \
  --model TheBloke/Llama-2-7B-AWQ \
  --quantization awq \
  --dtype half
Client: OpenAI-Compatible API
// src/lib/llm.ts — vLLM has OpenAI-compatible API
import OpenAI from 'openai';

// Point to your vLLM server instead of OpenAI.
// VLLM_API_URL overrides the default local endpoint.
const client = new OpenAI({
  baseURL: process.env.VLLM_API_URL ?? 'http://localhost:8000/v1',
  apiKey: 'dummy', // vLLM doesn't require auth by default (add --api-key if needed)
});
/**
 * Generate schema-constrained JSON from the vLLM server.
 *
 * @param prompt  user prompt
 * @param schema  JSON Schema the output must conform to (vLLM guided decoding)
 * @returns the parsed JSON value
 * @throws Error if the model returns an empty completion
 */
export async function generateStructuredOutput(
  prompt: string,
  schema: object,
): Promise<unknown> {
  const response = await client.chat.completions.create({
    model: 'meta-llama/Meta-Llama-3.1-8B-Instruct',
    messages: [
      { role: 'system', content: 'You are a helpful assistant that outputs valid JSON.' },
      { role: 'user', content: prompt },
    ],
    // vLLM: guided decoding — constrains output to valid JSON matching schema.
    // NOTE(review): `extra_body` is a vLLM server extension and is not part of
    // the openai-node typed params — confirm the SDK forwards unknown fields,
    // or cast the params object.
    extra_body: {
      guided_json: schema, // vLLM-specific: guarantees valid JSON output
    },
    temperature: 0.1,
    max_tokens: 512,
  });
  const content = response.choices[0]?.message?.content;
  if (content == null) {
    // Previously `JSON.parse(content!)` silently yielded `null` for a missing
    // completion (JSON.parse(null) parses the string "null"). Fail loudly.
    throw new Error('vLLM returned an empty completion');
  }
  return JSON.parse(content);
}
// Streaming response: yields content tokens as the server produces them.
export async function* streamCompletion(messages: OpenAI.ChatCompletionMessageParam[]) {
  const stream = await client.chat.completions.create({
    model: 'meta-llama/Meta-Llama-3.1-8B-Instruct',
    messages,
    stream: true,
    max_tokens: 2048,
  });
  for await (const part of stream) {
    const delta = part.choices[0]?.delta?.content;
    if (delta) {
      yield delta;
    }
  }
}
Batching for Throughput
# For maximum throughput: send requests concurrently — vLLM batches them automatically
# vLLM's continuous batching handles this transparently
import asyncio
import json

from openai import AsyncOpenAI
# Async client pointed at the local vLLM server; auth is disabled by default,
# so any non-empty api_key placeholder works.
client = AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
async def classify_single(text: str) -> dict:
    """Classify the sentiment of one text via the vLLM server.

    Returns the parsed JSON dict, e.g. {"sentiment": "positive", "confidence": 0.9}.
    Raises ValueError if the model returns an empty completion.
    """
    response = await client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
        messages=[
            {"role": "system", "content": "Classify the sentiment. Reply with JSON: {sentiment: positive|negative|neutral, confidence: 0-1}"},
            {"role": "user", "content": text},
        ],
        # vLLM guided decoding: the output is constrained to this JSON schema.
        extra_body={"guided_json": {"type": "object", "properties": {"sentiment": {"type": "string"}, "confidence": {"type": "number"}}}},
        max_tokens=50,
        temperature=0,
    )
    content = response.choices[0].message.content
    if content is None:
        # json.loads(None) would raise an opaque TypeError; fail with context.
        raise ValueError("model returned no content")
    return json.loads(content)
async def classify_batch(texts: list[str], concurrency: int = 32) -> list[dict]:
    """Classify many texts at once; vLLM's continuous batching absorbs the load.

    At most `concurrency` requests are in flight simultaneously; results come
    back in the same order as `texts`.
    """
    gate = asyncio.Semaphore(concurrency)

    async def bounded(item: str) -> dict:
        async with gate:
            return await classify_single(item)

    pending = [bounded(t) for t in texts]
    return await asyncio.gather(*pending)
# Process 1000 texts in parallel.
# NOTE(review): assumes `texts` (a list[str]) is defined earlier in the
# calling script — it is not defined in this snippet.
results = asyncio.run(classify_batch(texts, concurrency=32))
TorchServe for Embedding Models
# mar_config.yaml — Model Archiver configuration
model_name: text_embedder                  # name the model is registered under
version: 1.0
serialized_file: sentence_transformer.pt   # model artifact packed into the archive
handler: custom_handler.py                 # implements initialize/preprocess/inference/postprocess (see below)
requirements_file: requirements.txt        # extra pip deps installed for the worker
# custom_handler.py — TorchServe custom handler
import torch
from transformers import AutoTokenizer, AutoModel
import json
class EmbeddingHandler:
    """TorchServe custom handler producing L2-normalized, mean-pooled embeddings."""

    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.device = None

    def initialize(self, context):
        """Load the tokenizer and model from the directory TorchServe unpacked."""
        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")
        model_dir = context.system_properties.get("model_dir")
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
        self.model = AutoModel.from_pretrained(model_dir).to(self.device)
        self.model.eval()

    def preprocess(self, requests):
        """Pull the text out of each request in the batch and tokenize together."""
        texts = []
        for req in requests:
            body = req["body"]
            # JSON requests carry {"text": ...}; raw bodies arrive as bytes.
            texts.append(body["text"] if isinstance(body, dict) else body.decode())
        batch = self.tokenizer(
            texts,
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors="pt",
        )
        return {name: tensor.to(self.device) for name, tensor in batch.items()}

    def inference(self, inputs):
        """Run the model and mean-pool token embeddings into sentence vectors."""
        with torch.no_grad():
            model_out = self.model(**inputs)
        hidden = model_out.last_hidden_state
        # Mean pooling: weight each token by its attention mask so padding
        # does not contribute, then divide by the (clamped) token count.
        mask = inputs["attention_mask"].unsqueeze(-1).expand(hidden.size()).float()
        summed = torch.sum(hidden * mask, 1)
        counts = torch.clamp(mask.sum(1), min=1e-9)
        pooled = summed / counts
        # Unit-normalize so downstream cosine similarity is a dot product.
        pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)
        return pooled.cpu().numpy().tolist()

    def postprocess(self, outputs):
        """Serialize one JSON bytes payload per request in the batch."""
        return [json.dumps({"embedding": vec}).encode() for vec in outputs]
# torchserve-config.properties
inference_address=http://0.0.0.0:8080
management_address=http://0.0.0.0:8081
metrics_address=http://0.0.0.0:8082
# Frontend I/O threads; tune to the host's core count under load.
number_of_netty_threads=32
# Requests queued per model before new ones are rejected.
job_queue_size=1000
default_workers_per_model=4
# Seconds a worker may take per request before it is failed.
default_response_timeout=120
enable_metrics_api=true
metrics_format=prometheus
GPU Memory Optimization
# Monitor GPU memory usage per model
import torch
def get_gpu_memory_info():
    """Return per-GPU memory stats in GB; empty dict when CUDA is unavailable."""
    if not torch.cuda.is_available():
        return {}
    info = {}
    for idx in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(idx)
        info[f"gpu_{idx}"] = {
            "allocated_gb": torch.cuda.memory_allocated(idx) / 1e9,
            "reserved_gb": torch.cuda.memory_reserved(idx) / 1e9,
            "total_gb": props.total_memory / 1e9,
        }
    return info
# Calculate model memory footprint before loading
def estimate_model_memory(model_name: str, dtype: str = "float16") -> float:
    """Estimate GPU memory needed in GB (rough transformer approximation).

    Fetches only the model config from the Hub. Unknown dtype strings fall
    back to 2 bytes/param (fp16). Adds ~20% headroom for activations and
    the KV cache on top of the raw weight footprint.
    """
    from transformers import AutoConfig

    cfg = AutoConfig.from_pretrained(model_name)
    dtype_bytes = {"float32": 4, "float16": 2, "bfloat16": 2, "int8": 1, "int4": 0.5}
    per_param = dtype_bytes.get(dtype, 2)
    # Rough parameter count: embedding table plus per-layer weights.
    embed_params = cfg.vocab_size * cfg.hidden_size
    layer_params = cfg.num_hidden_layers * (4 * cfg.hidden_size ** 2 + cfg.hidden_size)
    weights_gb = (embed_params + layer_params) * per_param / 1e9
    return weights_gb * 1.2  # ~20% overhead for activations and KV cache
Kubernetes Deployment
# deployment.yaml — vLLM server Deployment (2 replicas, 1 GPU each).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-llama
spec:
  replicas: 2
  selector:
    matchLabels:
      app: vllm-llama
  template:
    metadata:
      # FIX: the pod template must carry labels matching spec.selector,
      # otherwise the API server rejects the Deployment.
      labels:
        app: vllm-llama
    spec:
      containers:
        - name: vllm
          image: vllm/vllm-openai:latest
          args:
            - --model=meta-llama/Meta-Llama-3.1-8B-Instruct
            - --tensor-parallel-size=1
            - --gpu-memory-utilization=0.85
            - --max-model-len=8192
            - --enable-prefix-caching
          resources:
            limits:
              nvidia.com/gpu: "1"
              memory: "40Gi"
            requests:
              nvidia.com/gpu: "1"
              memory: "40Gi"
          env:
            # Needed to pull gated/private models from the Hugging Face Hub.
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token
                  key: token
          readinessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 120  # Model loading takes time
            periodSeconds: 10
            failureThreshold: 30
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-tesla-a100
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
For the MLflow experiment tracking and model promotion pipeline that feeds models into these serving systems, see the MLOps guide. For the RAG system that uses these embedding and LLM inference endpoints, the RAG guide covers hybrid retrieval and reranking. The Claude Skills 360 bundle includes ML inference skill sets covering vLLM configuration, TorchServe handlers, and GPU deployment patterns. Start with the free tier to try inference server configuration.