Unsloth fine-tunes LLMs roughly 2x faster with about 60% less VRAM. Install with `pip install unsloth`, then load a 4-bit base model: `model, tokenizer = FastLanguageModel.from_pretrained(model_name="unsloth/llama-3-8b-Instruct-bnb-4bit", max_seq_length=2048, dtype=None, load_in_4bit=True)`. Apply LoRA with `FastLanguageModel.get_peft_model(model, r=16, target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], lora_alpha=16, lora_dropout=0, bias="none", use_gradient_checkpointing="unsloth", random_state=42)`. Chat templates come from `unsloth.chat_templates`: `tokenizer = get_chat_template(tokenizer, chat_template="llama-3")`, with "llama-3", "chatml", "mistral", "alpaca", "zephyr", and "gemma" among the supported templates. For dataset prep, `standardize_sharegpt(dataset)` normalizes ShareGPT-style data, after which `apply_chat_template` maps the conversations column to a `text` column.

Training uses TRL's `SFTTrainer` together with `UnslothTrainingArguments(output_dir="outputs", num_train_epochs=3, per_device_train_batch_size=2, gradient_accumulation_steps=4, learning_rate=2e-4, warmup_ratio=0.05, lr_scheduler_type="cosine", bf16=True, optim="adamw_8bit", logging_steps=10, save_steps=100)`, then `trainer.train()`. For inference, call `FastLanguageModel.for_inference(model)` and use `model.generate` as usual.

Export GGUF with `model.save_pretrained_gguf("model-gguf", tokenizer, quantization_method="q4_k_m")`, which writes `model-gguf/unsloth.Q4_K_M.gguf` for Ollama, or push to the Hub with `model.push_to_hub_gguf("hf-user/model-gguf", tokenizer, quantization_method=["q4_k_m", "q8_0"])`. `model.save_pretrained_merged("merged-model", tokenizer)` merges the LoRA weights into the base model. Claude Code generates Unsloth training scripts, chat template pipelines, GGUF exports, and Ollama Modelfiles.
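One step the overview does not spell out is reloading a saved adapter for inference in a later session. A minimal sketch, assuming the adapter was saved to an `outputs/unsloth-llama3` directory (the path used in the script below):

```python
# Minimal sketch: reload a previously saved LoRA adapter for inference.
# The adapter directory name is an assumption matching the training script below.
from unsloth import FastLanguageModel

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="outputs/unsloth-llama3",  # directory written by model.save_pretrained(...)
    max_seq_length=2048,
    load_in_4bit=True,
)
FastLanguageModel.for_inference(model)  # enable Unsloth's fast inference path
```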
## CLAUDE.md for Unsloth

```markdown
## Unsloth Stack
- Version: unsloth >= 2024.12 (nightly for latest models)
- Load: FastLanguageModel.from_pretrained(model_name, max_seq_length=2048, load_in_4bit=True)
- LoRA: FastLanguageModel.get_peft_model(model, r=16, use_gradient_checkpointing="unsloth")
- Chat template: get_chat_template(tokenizer, "llama-3"|"chatml"|"mistral"|"alpaca")
- Dataset: standardize_sharegpt(dataset) → apply_chat_template(dataset_text_field="text")
- Train: SFTTrainer(model, tokenizer, dataset, args=UnslothTrainingArguments(..., optim="adamw_8bit"))
- Export: model.save_pretrained_gguf("dir", tokenizer, quantization_method="q4_k_m")
```

## Fine-Tuning Script

```python
# finetune/unsloth_train.py — 2x faster LoRA fine-tuning with Unsloth
from __future__ import annotations
import os
from pathlib import Path
# Unsloth must be imported before transformers/trl so its optimizations patch them.
from unsloth import FastLanguageModel, is_bfloat16_supported
from unsloth import UnslothTrainingArguments  # Drop-in replacement for TrainingArguments
from unsloth.chat_templates import (
    get_chat_template,
    standardize_sharegpt,
    train_on_responses_only,
)

from datasets import load_dataset
from trl import SFTTrainer
MODEL_NAME = os.environ.get("BASE_MODEL", "unsloth/Meta-Llama-3.1-8B-Instruct-bnb-4bit")
OUTPUT_DIR = "outputs/unsloth-llama3"
MAX_SEQ_LEN = 2048
# ── 1. Load model ─────────────────────────────────────────────────────────────
def load_model():
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=MODEL_NAME,
        max_seq_length=MAX_SEQ_LEN,
        dtype=None,          # Auto-detect: bfloat16 on Ampere+, float16 otherwise
        load_in_4bit=True,   # QLoRA — saves ~75% VRAM vs full precision
    )
    # Apply LoRA — Unsloth patches attention for 2x speedup
    model = FastLanguageModel.get_peft_model(
        model,
        r=16,
        target_modules=[
            "q_proj", "k_proj", "v_proj", "o_proj",
            "gate_proj", "up_proj", "down_proj",
        ],
        lora_alpha=16,
        lora_dropout=0,  # Unsloth recommends 0 for performance
        bias="none",
        use_gradient_checkpointing="unsloth",  # 30% less VRAM than standard
        random_state=42,
        use_rslora=False,
        loftq_config=None,
    )
    model.print_trainable_parameters()
    return model, tokenizer
# ── 2. Prepare dataset ────────────────────────────────────────────────────────
def prepare_dataset(tokenizer, dataset_name: str = "mlabonne/FineTome-100k"):
    """
    Load a ShareGPT-format dataset and apply the Llama-3 chat template.

    ShareGPT format: {"conversations": [{"from": "human"|"gpt", "value": str}]}
    """
    # Apply the Llama-3 chat template to the tokenizer. No key mapping is needed:
    # standardize_sharegpt below converts ShareGPT "from"/"value" keys into the
    # canonical "role"/"content" format that the template expects.
    tokenizer = get_chat_template(tokenizer, chat_template="llama-3")

    dataset = load_dataset(dataset_name, split="train[:5000]")
    # Standardize various ShareGPT dialects to the canonical role/content format
    dataset = standardize_sharegpt(dataset)

    def apply_template(examples):
        convos = examples["conversations"]
        texts = [
            tokenizer.apply_chat_template(
                convo, tokenize=False, add_generation_prompt=False
            )
            for convo in convos
        ]
        return {"text": texts}

    return dataset.map(apply_template, batched=True), tokenizer
# ── 3. Train ──────────────────────────────────────────────────────────────────
def train(model, tokenizer, dataset) -> SFTTrainer:
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=dataset,
        dataset_text_field="text",
        max_seq_length=MAX_SEQ_LEN,
        dataset_num_proc=4,
        packing=False,  # Set True for short sequences to pack into one block
        args=UnslothTrainingArguments(
            output_dir=OUTPUT_DIR,
            num_train_epochs=1,
            per_device_train_batch_size=2,
            gradient_accumulation_steps=4,
            warmup_ratio=0.05,
            learning_rate=2e-4,
            lr_scheduler_type="cosine",
            optim="adamw_8bit",  # 8-bit AdamW — further reduces memory
            bf16=is_bfloat16_supported(),      # bfloat16 on Ampere+ GPUs
            fp16=not is_bfloat16_supported(),  # fall back to float16 elsewhere
            logging_steps=10,
            save_steps=200,
            save_total_limit=3,
            seed=42,
            report_to=["tensorboard"],
        ),
    )
    # Optional: train only on assistant responses (masks user/system tokens)
    trainer = train_on_responses_only(
        trainer,
        instruction_part="<|start_header_id|>user<|end_header_id|>\n\n",
        response_part="<|start_header_id|>assistant<|end_header_id|>\n\n",
    )
    print("Starting Unsloth training...")
    trainer_stats = trainer.train()
    print(f"Training complete. Runtime: {trainer_stats.metrics['train_runtime']:.0f}s")
    return trainer
# ── 4. Inference ──────────────────────────────────────────────────────────────
def run_inference(model, tokenizer, prompt: str) -> str:
    """Enable fast inference mode and generate a response."""
    FastLanguageModel.for_inference(model)  # Enables native 2x faster inference
    messages = [{"role": "user", "content": prompt}]
    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")
    output_ids = model.generate(
        input_ids,
        max_new_tokens=512,
        temperature=0.7,
        top_p=0.9,
        do_sample=True,
        use_cache=True,
    )
    # Decode only the new tokens (skip the prompt)
    new_tokens = output_ids[0][input_ids.shape[-1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True)
# ── 5. Export ─────────────────────────────────────────────────────────────────
def export_model(model, tokenizer, hub_username: str = "") -> None:
    Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)

    # Save LoRA adapter (lightweight, ~100MB)
    model.save_pretrained(OUTPUT_DIR)
    tokenizer.save_pretrained(OUTPUT_DIR)
    print(f"LoRA adapter saved: {OUTPUT_DIR}")

    # Save merged model (full weights, ~16GB for 8B)
    model.save_pretrained_merged(
        f"{OUTPUT_DIR}/merged",
        tokenizer,
        save_method="merged_16bit",
    )
    print(f"Merged model saved: {OUTPUT_DIR}/merged")

    # Export GGUF for Ollama / llama.cpp deployment
    # quantization_method: "q4_k_m" (best quality/size), "q8_0" (best quality),
    # "f16" (full precision), "q5_k_m", "q2_k"
    model.save_pretrained_gguf(
        f"{OUTPUT_DIR}/gguf",
        tokenizer,
        quantization_method="q4_k_m",
    )
    print(f"GGUF saved (q4_k_m): {OUTPUT_DIR}/gguf/unsloth.Q4_K_M.gguf")

    if hub_username:
        # Push the adapter and tokenizer, plus multiple GGUF quantizations, to the Hub
        model.push_to_hub(f"{hub_username}/llama3-finetuned-lora")
        tokenizer.push_to_hub(f"{hub_username}/llama3-finetuned-lora")
        model.push_to_hub_gguf(
            f"{hub_username}/llama3-finetuned-gguf",
            tokenizer,
            quantization_method=["q4_k_m", "q8_0", "f16"],
        )
        print(f"Pushed to Hub: {hub_username}/llama3-finetuned-gguf")


def generate_ollama_modelfile(gguf_path: str, system_prompt: str = "") -> str:
    """Generate an Ollama Modelfile to serve the fine-tuned GGUF locally."""
    modelfile = f'FROM {gguf_path}\n'
    if system_prompt:
        modelfile += f'\nSYSTEM """\n{system_prompt}\n"""\n'
    modelfile += '\nPARAMETER temperature 0.7\nPARAMETER top_p 0.9\nPARAMETER stop "<|eot_id|>"\n'
    return modelfile
# ── Main ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    model, tokenizer = load_model()
    dataset, tokenizer = prepare_dataset(tokenizer)
    trainer = train(model, tokenizer, dataset)

    response = run_inference(model, tokenizer, "Write a Python function to reverse a linked list.")
    print(f"\nSample response:\n{response}\n")

    export_model(model, tokenizer)

    gguf_path = f"{OUTPUT_DIR}/gguf/unsloth.Q4_K_M.gguf"
    modelfile = generate_ollama_modelfile(gguf_path, "You are an expert Python programmer.")
    with open("Modelfile", "w") as f:
        f.write(modelfile)
    print("Ollama Modelfile written. Run: ollama create my-model -f Modelfile")
```
Prefer the standard PEFT/TRL stack when you need more control over the training loop, custom data collators, or training methods Unsloth does not yet support (PPO, GRPO, reward modeling). PEFT and TRL expose the lower-level APIs directly, while Unsloth wraps the same SFT and DPO workflows with kernel-level optimizations (custom Triton kernels for RoPE, cross-entropy, and SwiGLU) that transparently deliver the 2x speedup and roughly 60% VRAM savings.

Prefer vLLM for serving after fine-tuning when deploying to production with high-throughput batched inference: merge the LoRA adapter with `model.save_pretrained_merged` and point vLLM at the merged weights (sketched below), or use `model.save_pretrained_gguf` for local Ollama deployment when compute resources are constrained.

The Claude Skills 360 bundle includes Unsloth skill sets covering fast LoRA fine-tuning, chat template setup, GGUF export, Ollama integration, and HuggingFace Hub upload. Start with the free tier to try accelerated fine-tuning script generation.
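As a rough sketch of the vLLM path, the snippet below serves the merged checkpoint produced by `export_model` with batched generation. The model path and sampling settings are illustrative assumptions, not part of the training script:

```python
# Sketch: high-throughput batched inference on the merged model with vLLM.
# Assumes vLLM is installed and export_model() wrote outputs/unsloth-llama3/merged.
from vllm import LLM, SamplingParams

llm = LLM(model="outputs/unsloth-llama3/merged", max_model_len=2048)
params = SamplingParams(temperature=0.7, top_p=0.9, max_tokens=256)

# Note: for chat-style prompts, apply the tokenizer's chat template before generate().
prompts = ["Write a Python function to reverse a linked list."]
for output in llm.generate(prompts, params):
    print(output.outputs[0].text)
```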