Production ML inference has different requirements than training: minimize latency for interactive use cases, maximize throughput for batch jobs, and fit models into GPU memory efficiently. vLLM handles LLM serving with PagedAttention for KV cache management. TorchServe handles classification and embedding models. Claude Code configures both, writes the batching middleware, and sets up the monitoring for GPU utilization and inference latency.
CLAUDE.md for Inference Projects
## ML Inference Stack
- LLM serving: vLLM (OpenAI-compatible API on port 8000)
- Classification/embedding: TorchServe (gRPC on 7070, HTTP on 8080)
- GPU: A100 80GB for large models, T4 16GB for smaller ones
- Model storage: Hugging Face Hub (private org models) or local NFS
- Batching: vLLM handles dynamic batching automatically
- Quantization: AWQ 4-bit for memory-constrained deployments
- Monitoring: DCGM exporter for GPU metrics, OpenTelemetry for request tracing
Serving LLMs with vLLM
# Start vLLM with OpenAI-compatible API.
# NOTE: inline comments after a trailing backslash ("\ # ...") break the
# line continuation — the shell treats "\ " as an escaped-space argument and
# then runs every following --flag line as its own (failing) command.
# Flag explanations therefore live above the command:
#   --tensor-parallel-size    number of GPUs to split the model across
#   --gpu-memory-utilization  leave 15% for KV cache growth
#   --max-num-batched-tokens  total tokens across all concurrent requests
#   --enable-prefix-caching   cache KV for shared prefixes (system prompts)
python -m vllm.entrypoints.openai.api_server \
  --model meta-llama/Meta-Llama-3.1-8B-Instruct \
  --tensor-parallel-size 1 \
  --gpu-memory-utilization 0.85 \
  --max-model-len 8192 \
  --max-num-batched-tokens 32768 \
  --enable-prefix-caching \
  --port 8000

# For larger models: tensor parallelism across multiple GPUs (split across 4).
python -m vllm.entrypoints.openai.api_server \
  --model meta-llama/Meta-Llama-3.1-70B-Instruct \
  --tensor-parallel-size 4 \
  --dtype bfloat16 \
  --gpu-memory-utilization 0.90

# For memory-constrained deployments: AWQ 4-bit quantization.
python -m vllm.entrypoints.openai.api_server \
  --model TheBloke/Llama-2-7B-AWQ \
  --quantization awq \
  --dtype half
Client: OpenAI-Compatible API
// src/lib/llm.ts — vLLM has OpenAI-compatible API
import OpenAI from 'openai';

// Point to your vLLM server instead of OpenAI.
// VLLM_API_URL overrides the default local endpoint.
const client = new OpenAI({
  baseURL: process.env.VLLM_API_URL ?? 'http://localhost:8000/v1',
  apiKey: 'dummy', // vLLM doesn't require auth by default (add --api-key if needed)
});
/**
 * Generate schema-constrained JSON from the vLLM server.
 *
 * @param prompt  user prompt
 * @param schema  JSON Schema the output must conform to (vLLM guided decoding)
 * @returns the parsed JSON value
 * @throws Error if the model returns an empty completion
 */
export async function generateStructuredOutput(
  prompt: string,
  schema: object,
): Promise<unknown> {
  const response = await client.chat.completions.create({
    model: 'meta-llama/Meta-Llama-3.1-8B-Instruct',
    messages: [
      { role: 'system', content: 'You are a helpful assistant that outputs valid JSON.' },
      { role: 'user', content: prompt },
    ],
    // vLLM: guided decoding — constrains output to valid JSON matching schema.
    // NOTE(review): `extra_body` is a vLLM server extension and is not part of
    // the openai-node typed params — confirm the SDK forwards unknown fields,
    // or cast the params object.
    extra_body: {
      guided_json: schema, // vLLM-specific: guarantees valid JSON output
    },
    temperature: 0.1,
    max_tokens: 512,
  });
  const content = response.choices[0]?.message?.content;
  if (content == null) {
    // Previously `JSON.parse(content!)` silently yielded `null` for a missing
    // completion (JSON.parse(null) parses the string "null"). Fail loudly.
    throw new Error('vLLM returned an empty completion');
  }
  return JSON.parse(content);
}
// Streaming response: yields content tokens as the server produces them.
export async function* streamCompletion(messages: OpenAI.ChatCompletionMessageParam[]) {
  const stream = await client.chat.completions.create({
    model: 'meta-llama/Meta-Llama-3.1-8B-Instruct',
    messages,
    stream: true,
    max_tokens: 2048,
  });
  for await (const part of stream) {
    const delta = part.choices[0]?.delta?.content;
    if (delta) {
      yield delta;
    }
  }
}
Batching for Throughput
# For maximum throughput: send requests concurrently — vLLM batches them automatically
# vLLM's continuous batching handles this transparently
import asyncio
import json

from openai import AsyncOpenAI
# Async client pointed at the local vLLM server; auth is disabled by default,
# so any non-empty api_key placeholder works.
client = AsyncOpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
async def classify_single(text: str) -> dict:
    """Classify the sentiment of one text via the vLLM server.

    Returns the parsed JSON dict, e.g. {"sentiment": "positive", "confidence": 0.9}.
    Raises ValueError if the model returns an empty completion.
    """
    response = await client.chat.completions.create(
        model="meta-llama/Meta-Llama-3.1-8B-Instruct",
        messages=[
            {"role": "system", "content": "Classify the sentiment. Reply with JSON: {sentiment: positive|negative|neutral, confidence: 0-1}"},
            {"role": "user", "content": text},
        ],
        # vLLM guided decoding: the output is constrained to this JSON schema.
        extra_body={"guided_json": {"type": "object", "properties": {"sentiment": {"type": "string"}, "confidence": {"type": "number"}}}},
        max_tokens=50,
        temperature=0,
    )
    content = response.choices[0].message.content
    if content is None:
        # json.loads(None) would raise an opaque TypeError; fail with context.
        raise ValueError("model returned no content")
    return json.loads(content)
async def classify_batch(texts: list[str], concurrency: int = 32) -> list[dict]:
    """Classify many texts at once; vLLM's continuous batching absorbs the load.

    At most `concurrency` requests are in flight simultaneously; results come
    back in the same order as `texts`.
    """
    gate = asyncio.Semaphore(concurrency)

    async def bounded(item: str) -> dict:
        async with gate:
            return await classify_single(item)

    pending = [bounded(t) for t in texts]
    return await asyncio.gather(*pending)
# Process 1000 texts in parallel.
# NOTE(review): assumes `texts` (a list[str]) is defined earlier in the
# calling script — it is not defined in this snippet.
results = asyncio.run(classify_batch(texts, concurrency=32))
TorchServe for Embedding Models
# mar_config.yaml — Model Archiver configuration
model_name: text_embedder                  # name the model is registered under
version: 1.0
serialized_file: sentence_transformer.pt   # model artifact packed into the archive
handler: custom_handler.py                 # implements initialize/preprocess/inference/postprocess (see below)
requirements_file: requirements.txt        # extra pip deps installed for the worker
# custom_handler.py — TorchServe custom handler
import torch
from transformers import AutoTokenizer, AutoModel
import json
class EmbeddingHandler:
    """TorchServe custom handler producing L2-normalized, mean-pooled embeddings."""

    def __init__(self):
        self.model = None
        self.tokenizer = None
        self.device = None

    def initialize(self, context):
        """Load the tokenizer and model from the directory TorchServe unpacked."""
        use_cuda = torch.cuda.is_available()
        self.device = torch.device("cuda" if use_cuda else "cpu")
        model_dir = context.system_properties.get("model_dir")
        self.tokenizer = AutoTokenizer.from_pretrained(model_dir)
        self.model = AutoModel.from_pretrained(model_dir).to(self.device)
        self.model.eval()

    def preprocess(self, requests):
        """Pull the text out of each request in the batch and tokenize together."""
        texts = []
        for req in requests:
            body = req["body"]
            # JSON requests carry {"text": ...}; raw bodies arrive as bytes.
            texts.append(body["text"] if isinstance(body, dict) else body.decode())
        batch = self.tokenizer(
            texts,
            max_length=512,
            truncation=True,
            padding=True,
            return_tensors="pt",
        )
        return {name: tensor.to(self.device) for name, tensor in batch.items()}

    def inference(self, inputs):
        """Run the model and mean-pool token embeddings into sentence vectors."""
        with torch.no_grad():
            model_out = self.model(**inputs)
        hidden = model_out.last_hidden_state
        # Mean pooling: weight each token by its attention mask so padding
        # does not contribute, then divide by the (clamped) token count.
        mask = inputs["attention_mask"].unsqueeze(-1).expand(hidden.size()).float()
        summed = torch.sum(hidden * mask, 1)
        counts = torch.clamp(mask.sum(1), min=1e-9)
        pooled = summed / counts
        # Unit-normalize so downstream cosine similarity is a dot product.
        pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)
        return pooled.cpu().numpy().tolist()

    def postprocess(self, outputs):
        """Serialize one JSON bytes payload per request in the batch."""
        return [json.dumps({"embedding": vec}).encode() for vec in outputs]
# torchserve-config.properties
inference_address=http://0.0.0.0:8080
management_address=http://0.0.0.0:8081
metrics_address=http://0.0.0.0:8082
# Frontend I/O threads; tune to the host's core count under load.
number_of_netty_threads=32
# Requests queued per model before new ones are rejected.
job_queue_size=1000
default_workers_per_model=4
# Seconds a worker may take per request before it is failed.
default_response_timeout=120
enable_metrics_api=true
metrics_format=prometheus
GPU Memory Optimization
# Monitor GPU memory usage per model
import torch
def get_gpu_memory_info():
    """Return per-GPU memory stats in GB; empty dict when CUDA is unavailable."""
    if not torch.cuda.is_available():
        return {}
    info = {}
    for idx in range(torch.cuda.device_count()):
        props = torch.cuda.get_device_properties(idx)
        info[f"gpu_{idx}"] = {
            "allocated_gb": torch.cuda.memory_allocated(idx) / 1e9,
            "reserved_gb": torch.cuda.memory_reserved(idx) / 1e9,
            "total_gb": props.total_memory / 1e9,
        }
    return info
# Calculate model memory footprint before loading
def estimate_model_memory(model_name: str, dtype: str = "float16") -> float:
    """Estimate GPU memory needed in GB (rough transformer approximation).

    Fetches only the model config from the Hub. Unknown dtype strings fall
    back to 2 bytes/param (fp16). Adds ~20% headroom for activations and
    the KV cache on top of the raw weight footprint.
    """
    from transformers import AutoConfig

    cfg = AutoConfig.from_pretrained(model_name)
    dtype_bytes = {"float32": 4, "float16": 2, "bfloat16": 2, "int8": 1, "int4": 0.5}
    per_param = dtype_bytes.get(dtype, 2)
    # Rough parameter count: embedding table plus per-layer weights.
    embed_params = cfg.vocab_size * cfg.hidden_size
    layer_params = cfg.num_hidden_layers * (4 * cfg.hidden_size ** 2 + cfg.hidden_size)
    weights_gb = (embed_params + layer_params) * per_param / 1e9
    return weights_gb * 1.2  # ~20% overhead for activations and KV cache
Kubernetes Deployment
# deployment.yaml — vLLM server Deployment (2 replicas, 1 GPU each).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: vllm-llama
spec:
  replicas: 2
  selector:
    matchLabels:
      app: vllm-llama
  template:
    metadata:
      # FIX: the pod template must carry labels matching spec.selector,
      # otherwise the API server rejects the Deployment.
      labels:
        app: vllm-llama
    spec:
      containers:
        - name: vllm
          image: vllm/vllm-openai:latest
          args:
            - --model=meta-llama/Meta-Llama-3.1-8B-Instruct
            - --tensor-parallel-size=1
            - --gpu-memory-utilization=0.85
            - --max-model-len=8192
            - --enable-prefix-caching
          resources:
            limits:
              nvidia.com/gpu: "1"
              memory: "40Gi"
            requests:
              nvidia.com/gpu: "1"
              memory: "40Gi"
          env:
            # Needed to pull gated/private models from the Hugging Face Hub.
            - name: HUGGING_FACE_HUB_TOKEN
              valueFrom:
                secretKeyRef:
                  name: hf-token
                  key: token
          readinessProbe:
            httpGet:
              path: /health
              port: 8000
            initialDelaySeconds: 120  # Model loading takes time
            periodSeconds: 10
            failureThreshold: 30
      nodeSelector:
        cloud.google.com/gke-accelerator: nvidia-tesla-a100
      tolerations:
        - key: nvidia.com/gpu
          operator: Exists
          effect: NoSchedule
For the MLflow experiment tracking and model promotion pipeline that feeds models into these serving systems, see the MLOps guide. For the RAG system that uses these embedding and LLM inference endpoints, the RAG guide covers hybrid retrieval and reranking. The Claude Skills 360 bundle includes ML inference skill sets covering vLLM configuration, TorchServe handlers, and GPU deployment patterns. Start with the free tier to try inference server configuration.