PEFT (Parameter-Efficient Fine-Tuning) fine-tunes LLMs with minimal parameters. pip install peft transformers bitsandbytes. LoRA: from peft import LoraConfig, get_peft_model. config = LoraConfig(r=16, lora_alpha=32, target_modules=["q_proj", "v_proj"], lora_dropout=0.05, bias="none", task_type="CAUSAL_LM"). model = get_peft_model(base_model, config). model.print_trainable_parameters() shows ~0.1% of params are trainable. QLoRA: bnb_config = BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_use_double_quant=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=torch.bfloat16). model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config, device_map="auto"). model = prepare_model_for_kbit_training(model). Then apply LoRA on top. Merge adapter: model = model.merge_and_unload() — bakes LoRA weights into base model for deployment. Save adapter only: model.save_pretrained("adapter-weights/"). Load adapter: model = PeftModel.from_pretrained(base_model, "adapter-weights/"). Prompt tuning: PromptTuningConfig(task_type="CAUSAL_LM", prompt_tuning_init="TEXT", num_virtual_tokens=20). Prefix tuning: PrefixTuningConfig(task_type="CAUSAL_LM", num_virtual_tokens=30). Training: Trainer(model=peft_model, args=TrainingArguments(per_device_train_batch_size=4, gradient_accumulation_steps=4, warmup_steps=100, num_train_epochs=3, learning_rate=2e-4, fp16=True, gradient_checkpointing=True)). model.enable_input_require_grads() needed with gradient checkpointing. Target modules by architecture: Llama: ["q_proj", "k_proj", "v_proj", "o_proj"], Mistral same, Phi-3: ["qkv_proj", "o_proj"]. Claude Code generates LoRA configs, QLoRA quantization setups, SFT training scripts, adapter merging, and TypeScript inference patterns.
CLAUDE.md for PEFT/LoRA
## PEFT/LoRA Stack
- Version: peft >= 0.10, transformers >= 4.40, bitsandbytes >= 0.43
- LoRA: LoraConfig(r=16, lora_alpha=32, target_modules=[...], lora_dropout=0.05, task_type="CAUSAL_LM")
- QLoRA: BitsAndBytesConfig(load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_compute_dtype=bfloat16) + prepare_model_for_kbit_training
- Wrap: get_peft_model(base_model, lora_config) → peft_model.print_trainable_parameters()
- Train: HuggingFace Trainer or SFTTrainer(model, dataset, peft_config)
- Merge: model.merge_and_unload() for deployment without PEFT overhead
- Save: model.save_pretrained("adapter/") or merged_model.push_to_hub(repo_id)
LoRA Fine-Tuning Script
# finetune/lora_finetune.py — LoRA / QLoRA fine-tuning for causal LM
from __future__ import annotations
import os
from dataclasses import dataclass, field
import torch
from datasets import Dataset, load_dataset
from peft import (
LoraConfig,
PeftModel,
TaskType,
get_peft_model,
prepare_model_for_kbit_training,
)
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
BitsAndBytesConfig,
DataCollatorForSeq2Seq,
TrainingArguments,
Trainer,
logging,
)
logging.set_verbosity_info()
# ── Configuration ─────────────────────────────────────────────────────────────
@dataclass
class FineTuneConfig:
    """All hyperparameters for QLoRA fine-tuning of a causal LM."""

    # -- Model and output locations --
    model_id: str = "meta-llama/Llama-3.2-3B-Instruct"
    output_dir: str = "outputs/lora-finetune"
    adapter_name: str = "churn-sentiment-adapter"
    # -- LoRA adapter shape --
    lora_r: int = 16                # rank of the low-rank update matrices
    lora_alpha: int = 32            # scaling factor (effective scale = alpha / r)
    lora_dropout: float = 0.05
    # Attention + MLP projections (Llama-family module names).
    target_modules: list[str] = field(default_factory=lambda: [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ])
    # -- 4-bit quantization (QLoRA) --
    use_4bit: bool = True
    bnb_4bit_quant: str = "nf4"     # NormalFloat4 quantization
    bnb_compute_dtype: str = "bfloat16"
    # -- Training schedule --
    num_train_epochs: int = 3
    per_device_train_batch_size: int = 4
    gradient_accumulation_steps: int = 4
    learning_rate: float = 2e-4
    warmup_ratio: float = 0.05
    lr_scheduler_type: str = "cosine"
    max_seq_length: int = 2048
    save_steps: int = 100
    logging_steps: int = 10
# ── Load model with quantization ─────────────────────────────────────────────
def load_quantized_model(cfg: FineTuneConfig):
    """Instantiate the base causal LM (optionally 4-bit quantized) plus tokenizer.

    Returns:
        A ``(model, tokenizer)`` pair ready for LoRA adapter injection.
    """
    compute_dtype = getattr(torch, cfg.bnb_compute_dtype)

    quant_config = None
    if cfg.use_4bit:
        quant_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,  # nested quantization of quant constants
            bnb_4bit_quant_type=cfg.bnb_4bit_quant,
            bnb_4bit_compute_dtype=compute_dtype,
        )

    model = AutoModelForCausalLM.from_pretrained(
        cfg.model_id,
        quantization_config=quant_config,
        device_map="auto",
        torch_dtype=None if cfg.use_4bit else compute_dtype,
        trust_remote_code=True,
        use_cache=False,  # KV cache is useless during training and fights checkpointing
    )
    if cfg.use_4bit:
        # Casts norm/embedding layers for stability and wires up checkpointing hooks.
        model = prepare_model_for_kbit_training(model, use_gradient_checkpointing=True)

    tokenizer = AutoTokenizer.from_pretrained(cfg.model_id, trust_remote_code=True)
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token  # Llama ships without a pad token
    tokenizer.padding_side = "right"  # right-pad for causal-LM training
    return model, tokenizer
# ── Apply LoRA adapters ───────────────────────────────────────────────────────
def apply_lora(model, cfg: FineTuneConfig):
    """Attach trainable LoRA adapters to ``model`` and report the trainable share."""
    peft_model = get_peft_model(
        model,
        LoraConfig(
            r=cfg.lora_r,
            lora_alpha=cfg.lora_alpha,
            target_modules=cfg.target_modules,
            lora_dropout=cfg.lora_dropout,
            bias="none",  # keep every bias term frozen
            task_type=TaskType.CAUSAL_LM,
        ),
    )
    # Without this, gradient checkpointing detaches inputs from the autograd graph.
    peft_model.enable_input_require_grads()
    peft_model.print_trainable_parameters()
    return peft_model
# ── Dataset preparation ───────────────────────────────────────────────────────
def format_instruction(example: dict) -> dict:
    """Render one instruction example into the Llama-3 Instruct chat template.

    Each turn is ``<|start_header_id|>{role}<|end_header_id|>`` followed by a
    blank line, the content, and ``<|eot_id|>`` — the official Llama-3 format.

    Fixes vs. the previous template:
    - No manual ``<|begin_of_text|>``: the Llama-3 tokenizer inserts the BOS
      token itself, so hard-coding it in the text produced a doubled BOS.
    - Double newline after ``<|end_header_id|>`` and no stray newline after
      ``<|eot_id|>``, matching the Instruct model's expected formatting.

    Args:
        example: dict with optional "instruction", "context", "response" keys.

    Returns:
        ``{"text": prompt}`` for downstream tokenization.
    """
    instruction = example.get("instruction", "")
    context = example.get("context", "")
    response = example.get("response", "")

    def turn(role: str, content: str) -> str:
        # One chat turn in Llama-3 Instruct markup.
        return f"<|start_header_id|>{role}<|end_header_id|>\n\n{content}<|eot_id|>"

    parts = []
    if context:
        parts.append(turn("system", "You are a helpful assistant."))
        parts.append(turn("user", f"Context: {context}\n\n{instruction}"))
    else:
        parts.append(turn("user", instruction))
    parts.append(turn("assistant", response))
    return {"text": "".join(parts)}
def load_and_prepare_dataset(
    tokenizer,
    dataset_name: str = "iamtarun/python_code_instructions_18k_alpaca",
    max_length: int = 2048,
    max_samples: int = 5000,
):
    """Load, template, and tokenize a Hub dataset for causal-LM training.

    Args:
        tokenizer: HF tokenizer matching the base model.
        dataset_name: Hub dataset id with instruction/context/response columns.
        max_length: truncation length in tokens.
        max_samples: cap on the number of training rows pulled from the split.

    Returns:
        A tokenized dataset with ``input_ids``, ``attention_mask``, ``labels``.
    """
    dataset = load_dataset(dataset_name, split=f"train[:{max_samples}]")
    dataset = dataset.map(format_instruction, remove_columns=dataset.column_names)

    def tokenize(examples):
        result = tokenizer(
            examples["text"],
            truncation=True,
            max_length=max_length,
            padding=False,  # dynamic per-batch padding is done by the collator
        )
        # Deep-copy each example's token list. With batched=True a shallow
        # .copy() only copies the outer list, leaving every labels list
        # aliased to its input_ids list — any in-place mutation of one
        # (e.g. label padding) would silently corrupt the other.
        result["labels"] = [list(ids) for ids in result["input_ids"]]
        return result

    return dataset.map(tokenize, batched=True, remove_columns=["text"])
# ── Train ─────────────────────────────────────────────────────────────────────
def train(cfg: FineTuneConfig | None = None):
    """End-to-end QLoRA pipeline: load model, add LoRA, tokenize data, fit, save."""
    cfg = cfg or FineTuneConfig()
    use_bf16 = cfg.bnb_compute_dtype == "bfloat16"

    print(f"Loading model: {cfg.model_id}")
    model, tokenizer = load_quantized_model(cfg)

    print("Applying LoRA adapters...")
    model = apply_lora(model, cfg)

    print("Preparing dataset...")
    train_dataset = load_and_prepare_dataset(tokenizer, max_length=cfg.max_seq_length)

    training_args = TrainingArguments(
        output_dir=cfg.output_dir,
        num_train_epochs=cfg.num_train_epochs,
        per_device_train_batch_size=cfg.per_device_train_batch_size,
        gradient_accumulation_steps=cfg.gradient_accumulation_steps,
        gradient_checkpointing=True,  # trade recompute for activation memory
        learning_rate=cfg.learning_rate,
        lr_scheduler_type=cfg.lr_scheduler_type,
        warmup_ratio=cfg.warmup_ratio,
        fp16=not use_bf16,
        bf16=use_bf16,
        logging_steps=cfg.logging_steps,
        save_steps=cfg.save_steps,
        save_total_limit=3,        # keep only the three newest checkpoints
        group_by_length=True,      # bucket similar lengths to reduce padding waste
        dataloader_pin_memory=False,
        report_to=["tensorboard"],
    )
    collator = DataCollatorForSeq2Seq(
        tokenizer, model=model, label_pad_token_id=-100, pad_to_multiple_of=8
    )
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        data_collator=collator,
    )

    print("Starting training...")
    trainer.train()
    trainer.save_model(cfg.output_dir)
    tokenizer.save_pretrained(cfg.output_dir)
    print(f"Adapter saved: {cfg.output_dir}")
# ── Merge and export ──────────────────────────────────────────────────────────
def merge_and_save(
    base_model_id: str = "meta-llama/Llama-3.2-3B-Instruct",
    adapter_path: str = "outputs/lora-finetune",
    merged_path: str = "outputs/merged-model",
    push_to_hub: bool = False,
    hub_repo_id: str = "",
) -> None:
    """Fold trained LoRA weights into the base model for adapter-free serving.

    Loads the bf16 base model, applies the saved adapter, merges it into the
    base weights, and writes a standalone safetensors checkpoint alongside the
    tokenizer. Optionally pushes the merged model to the Hugging Face Hub.
    """
    base = AutoModelForCausalLM.from_pretrained(
        base_model_id, torch_dtype=torch.bfloat16, device_map="auto"
    )
    merged = PeftModel.from_pretrained(base, adapter_path).merge_and_unload()
    merged.save_pretrained(merged_path, safe_serialization=True)

    tokenizer = AutoTokenizer.from_pretrained(adapter_path)
    tokenizer.save_pretrained(merged_path)
    print(f"Merged model saved: {merged_path}")

    if push_to_hub and hub_repo_id:
        merged.push_to_hub(hub_repo_id)
        print(f"Pushed to Hub: {hub_repo_id}")
if __name__ == "__main__":
    # CLI entry point: run the full QLoRA pipeline with default config.
    train()
For the Full Fine-Tuning alternative when having sufficient GPU memory (>80GB for 13B+ models) and a large, high-quality dataset that justifies updating all model weights rather than a small adapter — full fine-tuning gives potentially better performance on domain-specific tasks, while LoRA/QLoRA fine-tunes the same model with a fraction of the VRAM and in far less wall-clock time, making it the practical choice for most teams. For the vLLM/GGUF fine-tuning alternative when using Ollama or llama.cpp for local inference and needing a single GGUF file — convert the merged LoRA model with llama.cpp's convert_hf_to_gguf.py and then quantize with llama-quantize, as PEFT adapters must be merged into the base model before quantization. The Claude Skills 360 bundle includes PEFT/LoRA skill sets covering LoRA config, QLoRA 4-bit training, SFTTrainer setup, adapter merging, and deployment workflows. Start with the free tier to try LLM fine-tuning generation.