BentoML packages and serves ML models — from scikit-learn to LLMs. import bentoml. Save a model: bentoml.sklearn.save_model("sentiment", model, signatures={"predict": {"batchable": True}}). Or bentoml.transformers.save_model("llm", pipeline). @bentoml.service(resources={"cpu": "2", "memory": "4Gi"}, traffic={"timeout": 60}) decorates the Service class. @bentoml.get("/predict") and @bentoml.post("/batch") define endpoints. Input/output types via Python type hints and Pydantic: async def predict(self, text: str) -> dict. bentoml.models.get("sentiment:latest") loads from model store. Runners for hardware isolation: runner = bentoml.models.get("gpt2:latest").to_runner(), @bentoml.service(runners=[runner]). GPU: @bentoml.service(resources={"gpu": 1}). Adaptive batching: @bentoml.api(input=Text(), output=JSON(), batchable=True, max_latency_ms=100, max_batch_size=32). bentoml build packages into a Bento — a self-contained deployable artifact with bentofile.yaml. bentoml containerize sentiment_service:latest builds a Docker image. bentoml deploy to BentoML Cloud. bentoml serve sentiment_service:latest starts locally on 0.0.0.0:3000. Multi-model pipeline: multiple @bentoml.depends() services composed together. bentoml list shows model store. bentoml models export llm:latest model.bentomodel exports. Claude Code generates BentoML services, model packaging, bentofile configurations, and TypeScript API clients.
CLAUDE.md for BentoML
## BentoML Stack
- Version: bentoml >= 1.2
- Service: @bentoml.service(resources={}, traffic={}) class with @bentoml.api endpoints
- Model store: bentoml.sklearn.save_model / bentoml.transformers.save_model / bentoml.picklable_model.save_model
- Serve: bentoml serve service:svc (production-grade by default in 1.2+; add --reload for dev auto-restart — the old --production flag is removed)
- Build: bentoml build (creates Bento from bentofile.yaml)
- Container: bentoml containerize {service}:{version} → Docker image
- Deploy: bentoml deploy (BentoML Cloud) or kubectl apply (K8s with Yatai)
BentoML Service
# service.py — BentoML service with transformer model
from __future__ import annotations
import numpy as np
import bentoml
from bentoml.io import JSON, Text, NumpyNdarray
from pydantic import BaseModel
from typing import Optional, List
# ── Schema models ──────────────────────────────────────────────────────────
class TextInput(BaseModel):
    """Request schema for the single-text ``classify`` endpoint."""
    # Raw input text to classify.
    text: str
    # Character cap applied via slicing before inference (not token truncation).
    max_length: Optional[int] = 512
class SentimentOutput(BaseModel):
    """Response schema for a single classification result."""
    # Predicted class label as emitted by the pipeline (e.g. from the model's id2label map).
    label: str
    # Confidence score, rounded to 4 decimal places by the service.
    score: float
    # The original (untruncated) input text, echoed back for traceability.
    text: str
class BatchInput(BaseModel):
    """Request schema for the ``batch_classify`` endpoint."""
    # Texts to classify in one call; order is preserved in the response.
    texts: List[str]
class BatchOutput(BaseModel):
    """Response schema for ``batch_classify``: one result per input text, same order."""
    results: List[SentimentOutput]
# ── Service definition ─────────────────────────────────────────────────────
@bentoml.service(
    name="sentiment_service",
    resources={
        "cpu": "2",
        "memory": "4Gi",
        # "gpu": 1, # Uncomment for GPU
    },
    traffic={
        "timeout": 60,          # per-request timeout in seconds
        "max_concurrency": 32,  # cap on in-flight requests per replica
    },
)
class SentimentService:
    """Sentiment-analysis service backed by a transformers pipeline.

    The pipeline is resolved from the local BentoML model store under the tag
    ``sentiment_model:latest`` — see the registration script for how it is saved.
    """

    def __init__(self) -> None:
        # Load model from BentoML model store
        self.model_ref = bentoml.transformers.get("sentiment_model:latest")
        self.pipeline = self.model_ref.load_model()

    @bentoml.api(
        input_spec=TextInput,
        output_spec=SentimentOutput,
    )
    async def classify(self, input: TextInput) -> SentimentOutput:
        """Classify one text; returns label, score rounded to 4 dp, and the original text.

        NOTE(review): ``input.text[:input.max_length]`` truncates by
        *characters*, not tokens — confirm this matches the intended limit.
        Slicing with ``max_length=None`` yields the full string, so an explicit
        null is safe.
        """
        result = self.pipeline(input.text[:input.max_length])[0]
        return SentimentOutput(
            label=result["label"],
            score=round(result["score"], 4),
            text=input.text,
        )

    @bentoml.api(
        input_spec=BatchInput,
        output_spec=BatchOutput,
        batchable=True,        # let BentoML merge concurrent requests adaptively
        max_batch_size=64,
        max_latency_ms=200,    # max wait before a partial batch is flushed
    )
    async def batch_classify(self, input: BatchInput) -> BatchOutput:
        """Classify a list of texts in one call; results align with input order.

        The pipeline runs internally with ``batch_size=32`` and tokenizer
        truncation enabled. NOTE(review): confirm how the adaptive batcher
        merges a list-valued ``input_spec`` across concurrent requests.
        """
        results = self.pipeline(input.texts, batch_size=32, truncation=True)
        return BatchOutput(
            results=[
                SentimentOutput(label=r["label"], score=round(r["score"], 4), text=t)
                for r, t in zip(results, input.texts)
            ]
        )

    @bentoml.api()
    async def health(self) -> dict:
        # Lightweight liveness probe; also reports the tag name of the loaded model.
        return {"status": "ok", "model": self.model_ref.tag.name}
Model Registration Script
# scripts/register_model.py — save trained model to BentoML model store
import bentoml
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
def register_sentiment_model(model_name: str = "cardiffnlp/twitter-roberta-base-sentiment-latest"):
    """Download and register a HuggingFace model in BentoML.

    Builds a ``text-classification`` pipeline for *model_name* and saves it to
    the local BentoML model store as ``sentiment_model``, with a batchable
    ``__call__`` signature so BentoML can apply adaptive batching at serve time.

    Returns the resulting model store tag.
    """
    print(f"Loading model: {model_name}")

    clf = pipeline(
        "text-classification",
        model=model_name,
        tokenizer=model_name,
        return_all_scores=False,
    )

    # Descriptive metadata stored alongside the model for later inspection.
    model_metadata = {
        "hf_model": model_name,
        "task": "text-classification",
        "labels": ["negative", "neutral", "positive"],
    }
    # Mark __call__ as batchable along dim 0 so the runtime can merge requests.
    call_signatures = {
        "__call__": {
            "batchable": True,
            "batch_dim": 0,
            "max_batch_size": 64,
        }
    }

    saved_tag = bentoml.transformers.save_model(
        name="sentiment_model",
        model=clf,
        metadata=model_metadata,
        signatures=call_signatures,
    )
    print(f"Saved model: {saved_tag}")
    return saved_tag
def register_sklearn_model(model, feature_names: list[str]):
    """Register a scikit-learn model.

    Saves *model* to the BentoML store as ``churn_model`` with batchable
    ``predict`` / ``predict_proba`` signatures, recording *feature_names* both
    as metadata and as a custom object for reuse at inference time.

    Returns the resulting model store tag.
    """
    batchable = {"batchable": True}
    tag = bentoml.sklearn.save_model(
        "churn_model",
        model,
        signatures={"predict": batchable, "predict_proba": batchable},
        metadata={"features": feature_names, "framework": "sklearn"},
        custom_objects={"feature_names": feature_names},
    )
    print(f"Saved sklearn model: {tag}")
    return tag
if __name__ == "__main__":
    # Script entry point: download and register the default HF sentiment model.
    register_sentiment_model()
bentofile.yaml
# bentofile.yaml — Bento packaging configuration
# Consumed by `bentoml build` to produce a Bento, then `bentoml containerize`.
service: "service:SentimentService"  # import target: module "service", class SentimentService
name: sentiment-api
version: 1.0.0  # NOTE(review): Bento versions are normally auto-generated — confirm this key is honored by your BentoML version
description: "Sentiment analysis API with adaptive batching"
labels:
  team: ml-platform
  env: production
include:
  - "service.py"  # NOTE(review): redundant — already matched by the "*.py" glob below
  - "*.py"
python:
  packages:
    - transformers>=4.40.0
    - torch>=2.2.0
    - numpy>=1.26.0
    - pydantic>=2.0.0
  lock_packages: true  # pin resolved versions into the Bento for reproducible builds
docker:
  python_version: "3.11"
  cuda_version: "12.1.1" # Remove for CPU-only
  dockerfile_template: "Dockerfile.template" # Optional custom template
  env:
    - "BENTOML_HOME=/bento/.bentoml"
    - "TOKENIZERS_PARALLELISM=false"
TypeScript Client
// lib/bentoml/client.ts — TypeScript client for BentoML service
// Base URL of the BentoML service; override via the BENTOML_SERVICE_URL env var.
const BENTO_URL = process.env.BENTOML_SERVICE_URL ?? "http://localhost:3000"

// TypeScript mirrors of the service's Pydantic request/response schemas.
export type SentimentInput = { text: string; max_length?: number }
export type SentimentOutput = { label: string; score: number; text: string }
export type BatchOutput = { results: SentimentOutput[] }
/**
 * POST a JSON body to a service endpoint and parse the JSON response.
 * Throws on any non-2xx status, embedding the response text for debugging.
 */
async function bentoFetch<T>(endpoint: string, body: unknown): Promise<T> {
  const response = await fetch(`${BENTO_URL}${endpoint}`, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(body),
  })
  if (!response.ok) {
    throw new Error(`BentoML error ${response.status}: ${await response.text()}`)
  }
  return response.json()
}
export async function classifyText(text: string): Promise<SentimentOutput> {
return bentoFetch<SentimentOutput>("/classify", { text })
}
/** Classify many texts in one call via POST /batch_classify; results align with input order. */
export async function classifyBatch(texts: string[]): Promise<SentimentOutput[]> {
  const { results } = await bentoFetch<BatchOutput>("/batch_classify", { texts })
  return results
}
export async function healthCheck(): Promise<{ status: string; model: string }> {
const res = await fetch(`${BENTO_URL}/health`)
return res.json()
}
Consider the Ray Serve alternative when you need horizontal scaling across a Ray cluster with fine-grained actor-based deployment, composable deployments for multi-model pipelines, and Python-native traffic splitting — Ray Serve excels at large distributed inference workloads, while BentoML is friendlier for packaging individual models with clear Docker+K8s deployment patterns and a simpler operator experience. Consider the FastAPI alternative when building a simple model-serving endpoint without the BentoML packaging abstractions — FastAPI is a great direct option but requires manually handling the model loading, batching, versioning, and container packaging that BentoML provides out of the box with bentofile.yaml and adaptive batching. The Claude Skills 360 bundle includes BentoML skill sets covering service definition, model registration, batching, and TypeScript clients. Start with the free tier to try ML serving generation.