Hugging Face Transformers provides pretrained models for every NLP and vision task. AutoModelForSequenceClassification loads any classification model with one line. PEFT’s LoRA adapter adds trainable low-rank matrices to frozen base weights — typically cutting trainable parameters by around 99%. The Trainer API handles distributed training, gradient accumulation, and evaluation loops. The datasets library streams terabyte-scale corpora with lazy map/filter transforms. Claude Code generates Transformers training scripts, PEFT adapter configs, Datasets preprocessing pipelines, and the Hub publish workflow for production ML systems.
CLAUDE.md for Transformers Projects
## Hugging Face Stack
- transformers >= 4.44, datasets >= 2.20, peft >= 0.12, accelerate >= 0.33
- Training: Trainer API + TrainingArguments for most tasks; raw loop for custom losses
- Efficient fine-tuning: LoRA via peft.LoraConfig (not full fine-tuning unless <1B params)
- Quantization: bitsandbytes for 4-bit/8-bit inference on consumer GPUs
- Evaluation: compute_metrics callback with sklearn.metrics or evaluate library
- Hub: push_to_hub() after training; load from Hub with from_pretrained("org/model")
- Datasets: streaming=True for >10GB; use .map(batched=True, num_proc=8) for preprocessing
Loading and Tokenizing
# training/tokenize.py — prepare dataset for classification
from datasets import load_dataset, DatasetDict
from transformers import AutoTokenizer
# Tokenizer checkpoint for this preprocessing module.
MODEL_NAME = "distilbert-base-uncased"
# Module-level tokenizer: downloaded once at import time and shared by every
# helper in this file (load_and_tokenize, create_streaming_dataset).
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
def load_and_tokenize(
    dataset_name: str, text_col: str, label_col: str
) -> tuple[DatasetDict, dict, dict]:
    """Load a Hugging Face dataset and tokenize it for classification.

    Args:
        dataset_name: Hub dataset id passed to ``load_dataset``.
        text_col: Column holding the raw text.
        label_col: Column holding the labels; must expose ``.names``
            (a ClassLabel feature) for the id/name mappings below.

    Returns:
        ``(tokenized, id2label, label2id)``. The previous annotation said
        ``DatasetDict``, but the function returns a 3-tuple.
    """
    ds = load_dataset(dataset_name)
    label_names = ds["train"].features[label_col].names
    id2label = {i: name for i, name in enumerate(label_names)}
    label2id = {name: i for i, name in enumerate(label_names)}

    def tokenize_batch(batch):
        # padding="max_length" pads every row to 256 tokens up front,
        # trading memory for fixed shapes; truncation caps long texts.
        tokens = tokenizer(
            batch[text_col],
            padding="max_length",
            truncation=True,
            max_length=256,
        )
        tokens["labels"] = batch[label_col]
        return tokens

    tokenized = ds.map(
        tokenize_batch,
        batched=True,
        num_proc=8,
        remove_columns=ds["train"].column_names,  # keep only token fields
    )
    tokenized.set_format("torch")
    return tokenized, id2label, label2id
# Custom dataset from CSV files
def load_custom_dataset(train_csv: str, val_csv: str) -> DatasetDict:
    """Build a train/validation DatasetDict from local CSV files."""
    from datasets import Dataset
    import pandas as pd

    split_paths = {"train": train_csv, "validation": val_csv}
    return DatasetDict(
        {split: Dataset.from_pandas(pd.read_csv(path))
         for split, path in split_paths.items()}
    )
# Streaming for large datasets
def create_streaming_dataset(path: str):
    """Stream a Parquet dataset lazily — rows are never fully loaded into RAM."""
    stream = load_dataset("parquet", data_files={"train": path}, streaming=True)
    # Transforms are recorded here and executed on-the-fly during iteration.
    tokenized = stream.map(
        lambda row: tokenizer(row["text"], truncation=True, max_length=512)
    )
    # Drop near-empty documents (16 tokens or fewer).
    return tokenized.filter(lambda row: len(row["input_ids"]) > 16)
LoRA Fine-Tuning with PEFT
# training/lora_trainer.py — parameter-efficient fine-tuning
import numpy as np
from transformers import (
AutoModelForSequenceClassification,
AutoTokenizer,
Trainer,
TrainingArguments,
DataCollatorWithPadding,
EarlyStoppingCallback,
)
from peft import LoraConfig, get_peft_model, TaskType
from sklearn.metrics import accuracy_score, f1_score
import evaluate
# Base encoder checkpoint shared by build_lora_model() and train().
MODEL_NAME = "roberta-base"
def build_lora_model(num_labels: int, id2label: dict, label2id: dict):
    """Create a sequence-classification model with LoRA adapters attached."""
    base = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )

    # Inject trainable low-rank matrices into the attention projections;
    # get_peft_model freezes the base weights.
    adapter_cfg = LoraConfig(
        task_type=TaskType.SEQ_CLS,
        r=16,                # rank: capacity vs. parameter-count trade-off
        lora_alpha=32,       # effective scale is alpha / r
        lora_dropout=0.1,
        target_modules=["query", "value"],  # layers that receive adapters
        bias="none",
    )

    peft_model = get_peft_model(base, adapter_cfg)
    peft_model.print_trainable_parameters()  # ~0.5% of params trainable at r=16
    return peft_model
def compute_metrics(eval_pred):
    """Map Trainer (logits, labels) into accuracy, macro-F1, and weighted-F1."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    metrics = {"accuracy": accuracy_score(labels, preds)}
    for averaging in ("macro", "weighted"):
        metrics[f"f1_{averaging}"] = f1_score(labels, preds, average=averaging)
    return metrics
def train(
    tokenized_ds,
    num_labels: int,
    id2label: dict,
    label2id: dict,
    output_dir: str = "./lora-classifier",
):
    """Fine-tune the LoRA classifier on tokenized_ds and return the Trainer."""
    tok = AutoTokenizer.from_pretrained(MODEL_NAME)
    peft_model = build_lora_model(num_labels, id2label, label2id)

    # Hyperparameters gathered in one place; unpacked into TrainingArguments.
    hparams = dict(
        num_train_epochs=5,
        per_device_train_batch_size=32,
        per_device_eval_batch_size=64,
        learning_rate=2e-4,  # LoRA tolerates a higher LR than full fine-tuning
        warmup_ratio=0.06,
        lr_scheduler_type="cosine",
        weight_decay=0.01,
        eval_strategy="epoch",
        save_strategy="epoch",  # must match eval_strategy for best-model reload
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        fp16=True,  # mixed-precision training
        gradient_accumulation_steps=2,
        dataloader_num_workers=4,
        report_to="none",  # set to "wandb" for W&B logging
        push_to_hub=False,
    )
    training_args = TrainingArguments(output_dir=output_dir, **hparams)

    trainer = Trainer(
        model=peft_model,
        args=training_args,
        train_dataset=tokenized_ds["train"],
        eval_dataset=tokenized_ds["validation"],
        tokenizer=tok,
        data_collator=DataCollatorWithPadding(tok),  # dynamic per-batch padding
        compute_metrics=compute_metrics,
        callbacks=[EarlyStoppingCallback(early_stopping_patience=3)],
    )

    trainer.train()
    trainer.save_model(output_dir)
    return trainer
Causal LM Fine-Tuning (Instruction Tuning)
# training/sft_trainer.py — instruction fine-tuning with SFTTrainer
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig
import torch
# Base checkpoint fine-tuned by finetune_causal_lm().
BASE_MODEL = "meta-llama/Llama-3.2-3B-Instruct"
def load_quantized_model(model_name: str):
    """Load a causal LM with 4-bit NF4 quantization for consumer-GPU training."""
    quant_cfg = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",              # normal-float 4-bit
        bnb_4bit_compute_dtype=torch.bfloat16,  # compute runs in bf16
        bnb_4bit_use_double_quant=True,         # nested quantization
    )
    return AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=quant_cfg,
        device_map="auto",
        trust_remote_code=False,
    )
def format_instruction(sample: dict) -> str:
    """Render one dataset row into the Llama-3 chat-template string."""
    system_msg = "You are a helpful customer support assistant."
    segments = [
        "<|begin_of_text|><|start_header_id|>system<|end_header_id|>",
        f"{system_msg}<|eot_id|>",
        "<|start_header_id|>user<|end_header_id|>",
        f"{sample['question']}<|eot_id|>",
        "<|start_header_id|>assistant<|end_header_id|>",
        f"{sample['answer']}<|eot_id|>",
    ]
    return "\n".join(segments)
def finetune_causal_lm(dataset_name: str, output_dir: str = "./sft-model"):
    """Instruction-tune the 4-bit base model with LoRA, then merge adapters."""
    tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
    tokenizer.pad_token = tokenizer.eos_token  # reuse EOS for padding
    model = load_quantized_model(BASE_MODEL)

    adapter_cfg = LoraConfig(
        r=32,
        lora_alpha=64,
        target_modules=["q_proj", "v_proj", "k_proj", "o_proj"],
        lora_dropout=0.05,
        bias="none",
        task_type="CAUSAL_LM",
    )

    train_split = load_dataset(dataset_name, split="train")

    run_cfg = SFTConfig(
        output_dir=output_dir,
        num_train_epochs=2,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        learning_rate=2e-4,
        bf16=True,
        max_seq_length=2048,
        packing=True,  # concatenate short examples up to max_seq_length
        save_steps=100,
        logging_steps=25,
    )

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=train_split,
        formatting_func=format_instruction,
        peft_config=adapter_cfg,
        args=run_cfg,
    )
    trainer.train()
    trainer.save_model(output_dir)

    # Fold the LoRA deltas back into the base weights for standalone serving.
    merged = trainer.model.merge_and_unload()
    merged.save_pretrained(f"{output_dir}-merged")
    return trainer
Inference Pipeline
# inference/pipeline.py — production inference patterns
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
from functools import lru_cache
@lru_cache(maxsize=1)
def get_classifier(model_path: str, device: str = "auto"):
    """Build the classification pipeline once; later calls hit the cache."""
    options = {
        "model": model_path,
        "tokenizer": model_path,
        "device_map": device,
        "torch_dtype": torch.float16,
        "top_k": None,  # emit a score for every class, not just the argmax
    }
    return pipeline("text-classification", **options)
def classify_batch(texts: list[str], model_path: str) -> list[dict]:
    """Classify many texts in one call — cheaper than per-text inference."""
    clf = get_classifier(model_path)
    # The pipeline batches internally; truncation guards against long inputs.
    raw = clf(texts, batch_size=32, truncation=True)

    results = []
    for text, class_scores in zip(texts, raw):
        top = max(class_scores, key=lambda entry: entry["score"])
        results.append(
            {
                "text": text,
                "label": top["label"],
                "scores": {entry["label"]: entry["score"] for entry in class_scores},
            }
        )
    return results
# Generative inference
def generate_response(
    prompt: str,
    model_path: str,
    max_new_tokens: int = 512,
    temperature: float = 0.7,
) -> str:
    """Generate text with a causal LM.

    Args:
        prompt: Input text to continue.
        model_path: Hub id or local path of the model.
        max_new_tokens: Cap on newly generated tokens.
        temperature: Sampling temperature; <= 0 selects greedy decoding.

    Returns:
        The generated continuation (prompt excluded), stripped.
    """
    generator = pipeline(
        "text-generation",
        model=model_path,
        torch_dtype=torch.bfloat16,
        device_map="auto",
    )
    do_sample = temperature > 0
    gen_kwargs = {
        "max_new_tokens": max_new_tokens,
        "do_sample": do_sample,
        "return_full_text": False,  # only return the generated part
    }
    # Bug fix: only pass sampling parameters when sampling. Passing
    # temperature=0 raises in transformers (temperature must be strictly
    # positive), and temperature/top_p alongside do_sample=False triggers
    # "unused generation flags" warnings.
    if do_sample:
        gen_kwargs["temperature"] = temperature
        gen_kwargs["top_p"] = 0.9
    outputs = generator(prompt, **gen_kwargs)
    return outputs[0]["generated_text"].strip()
# Embeddings
def get_embeddings(texts: list[str], model_name: str = "sentence-transformers/all-MiniLM-L6-v2") -> list[list[float]]:
    """Generate sentence embeddings for semantic search via mean pooling."""
    embedder = pipeline(
        "feature-extraction",
        model=model_name,
        tokenizer=model_name,
    )
    raw_features = embedder(texts, batch_size=64)
    # Average over the token axis to collapse each text's token features into
    # a single vector (assumes a leading batch dim of 1 — hence .squeeze()).
    return [
        torch.tensor(features).mean(dim=1).squeeze().tolist()
        for features in raw_features
    ]
Push to Hub and Load
# hub/publish.py — publish model to Hugging Face Hub
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from huggingface_hub import HfApi
import os
def push_model_to_hub(
    local_path: str,
    repo_id: str,
    private: bool = True,
    commit_message: str = "Add fine-tuned model",
) -> str:
    """Upload a trained model and tokenizer to the Hub; return the repo URL."""
    model = AutoModelForSequenceClassification.from_pretrained(local_path)
    tokenizer = AutoTokenizer.from_pretrained(local_path)

    # Idempotent repo creation: exist_ok=True makes re-publishing safe.
    HfApi().create_repo(repo_id=repo_id, private=private, exist_ok=True)

    model.push_to_hub(repo_id, commit_message=commit_message)
    tokenizer.push_to_hub(repo_id)

    print(f"Model published: https://huggingface.co/{repo_id}")
    return f"https://huggingface.co/{repo_id}"
def load_from_hub(repo_id: str):
    """Load model and tokenizer from the Hub for inference.

    Reads HF_TOKEN from the environment when present (required for private
    repos). Public repos work with no token set.

    Returns:
        (model, tokenizer) tuple.
    """
    # Bug fix: os.environ["HF_TOKEN"] raised KeyError whenever the variable
    # was unset, breaking public-repo loads; .get() degrades to anonymous
    # access instead.
    token = os.environ.get("HF_TOKEN")
    model = AutoModelForSequenceClassification.from_pretrained(
        repo_id,
        token=token,
    )
    # Bug fix: the tokenizer needs the same token — a private repo's
    # tokenizer download fails without it.
    tokenizer = AutoTokenizer.from_pretrained(repo_id, token=token)
    return model, tokenizer
Evaluate with the evaluate Library
# evaluation/eval_model.py — structured evaluation
import evaluate
import numpy as np
from datasets import load_dataset
from transformers import pipeline
def evaluate_classifier(model_path: str, dataset_name: str, split: str = "test"):
    """Run structured evaluation of a text classifier with multiple metrics.

    Args:
        model_path: Hub id or local path of the classification model.
        dataset_name: Hub dataset with "text" and integer "label" columns.
        split: Dataset split to evaluate on.

    Returns:
        Dict with accuracy, f1, precision, and recall (macro-averaged).
    """
    accuracy = evaluate.load("accuracy")
    f1 = evaluate.load("f1")
    precision = evaluate.load("precision")
    recall = evaluate.load("recall")

    ds = load_dataset(dataset_name, split=split)
    classifier = pipeline("text-classification", model=model_path, device_map="auto")

    texts = ds["text"]
    true_labels = ds["label"]
    predictions = classifier(texts, batch_size=64, truncation=True)

    # Bug fix: predicted labels are the model's human-readable names when
    # id2label was set at training time (as train() in this project does),
    # not always the default "LABEL_<n>". Map through the model config
    # first; fall back to parsing the "LABEL_<n>" form.
    label2id = getattr(classifier.model.config, "label2id", None) or {}

    def _to_id(name: str) -> int:
        if name in label2id:
            return int(label2id[name])
        return int(name.split("_")[-1])

    pred_labels = [_to_id(p["label"]) for p in predictions]

    common = {"predictions": pred_labels, "references": true_labels}
    results = {
        **accuracy.compute(**common),
        **f1.compute(**common, average="macro"),
        **precision.compute(**common, average="macro"),
        **recall.compute(**common, average="macro"),
    }

    print(f"Evaluation results on {dataset_name}/{split}:")
    for metric, value in results.items():
        print(f" {metric}: {value:.4f}")
    return results
For MLflow experiment tracking that pairs with Transformers training runs — logging metrics, hyperparameters, and model artifacts — see the MLflow guide. For PyTorch custom training loops when Trainer doesn’t fit your architecture, the PyTorch guide covers gradient accumulation and DDP. The Claude Skills 360 bundle includes Hugging Face skill sets covering LoRA fine-tuning, Datasets pipelines, and Hub deployment. Start with the free tier to try Transformers training script generation.