LightGBM builds gradient boosting trees with leaf-wise growth for maximum speed. pip install lightgbm. import lightgbm as lgb. Dataset: dtrain = lgb.Dataset(X_train, label=y_train, categorical_feature=[0,2]) or by column name. Params: params = {"objective":"binary","metric":"auc","num_leaves":63,"learning_rate":0.05,"feature_fraction":0.8,"bagging_fraction":0.8,"bagging_freq":5,"verbose":-1}. Train: model = lgb.train(params, dtrain, num_boost_round=1000, valid_sets=[dvalid], callbacks=[lgb.early_stopping(50), lgb.log_evaluation(100)]). Predict: probs = model.predict(X_test) — returns probabilities. Sklearn API: from lightgbm import LGBMClassifier, clf = LGBMClassifier(n_estimators=1000, num_leaves=63, learning_rate=0.05, early_stopping_rounds=50, verbose=-1), clf.fit(X_train, y_train, eval_set=[(X_val, y_val)]). Feature importance: model.feature_importance(importance_type="gain") — “gain” or “split”. Categorical: lgb.Dataset(X, categorical_feature=["cat_col1","cat_col2"]) — handles internally without one-hot encoding. CV: lgb.cv(params, dtrain, nfold=5, stratified=True, num_boost_round=1000, callbacks=[lgb.early_stopping(50)]). SHAP: model.predict(X, pred_contrib=True). Optuna: from optuna.integration import lightgbm as lgb_optuna, tuner = lgb_optuna.LightGBMTuner(params, dtrain, valid_sets=[dvalid]), tuner.run(). Save: model.save_model("model.txt"), load: lgb.Booster(model_file="model.txt"). Claude Code generates LightGBM pipelines, categorical feature workflows, Optuna tuners, and SHAP explainability scripts.
CLAUDE.md for LightGBM
## LightGBM Stack
- Version: lightgbm >= 4.0
- Format: lgb.Dataset(X, label=y, categorical_feature=["col"] or [idx])
- Train: lgb.train(params, dtrain, num_boost_round, valid_sets, callbacks)
- Callbacks: lgb.early_stopping(rounds) | lgb.log_evaluation(period)
- Sklearn: LGBMClassifier/LGBMRegressor(n_estimators, num_leaves, learning_rate)
- Objectives: binary | multiclass | regression | regression_l1 | poisson | rank_xendcg
- Metrics: auc | binary_logloss | rmse | mae | mape | ndcg
- Importance: model.feature_importance(importance_type="gain"/"split")
- Categorical: pass as-is to Dataset, set categorical_feature — no encoding needed
LightGBM Training Pipeline
# ml/lightgbm_pipeline.py — fast gradient boosting for tabular data
from __future__ import annotations
import numpy as np
import pandas as pd
from pathlib import Path
import lightgbm as lgb
from lightgbm import LGBMClassifier, LGBMRegressor
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
roc_auc_score, accuracy_score, f1_score,
mean_squared_error, mean_absolute_error, r2_score,
)
# ── 1. Data preparation ───────────────────────────────────────────────────────
def to_dataset(
    X: np.ndarray | pd.DataFrame,
    y: np.ndarray | pd.Series = None,
    weight: np.ndarray = None,
    categorical_cols: list[str | int] = None,
    reference: lgb.Dataset = None,
) -> lgb.Dataset:
    """
    Wrap features (and optional labels/weights) in a lgb.Dataset.

    categorical_cols may be column names (DataFrame input) or positional
    indices; when omitted, "auto" lets LightGBM detect pandas Categorical
    columns on its own. Pass the training Dataset via `reference` when
    building validation/test sets so all splits share the same bin edges.
    """
    label = y.values if isinstance(y, pd.Series) else y
    cats = categorical_cols if categorical_cols else "auto"
    return lgb.Dataset(
        X,
        label=label,
        weight=weight,
        categorical_feature=cats,
        reference=reference,
        free_raw_data=False,  # keep raw data so the Dataset can be reused
    )
# ── 2. Binary classification ──────────────────────────────────────────────────
def train_classifier(
    X_train: np.ndarray | pd.DataFrame,
    y_train: np.ndarray,
    X_val: np.ndarray | pd.DataFrame = None,
    y_val: np.ndarray = None,
    num_leaves: int = 63,
    max_depth: int = -1,  # -1 = no limit (leaf-wise growth controls depth)
    learning_rate: float = 0.05,
    n_estimators: int = 1000,
    feature_frac: float = 0.8,
    bagging_frac: float = 0.8,
    bagging_freq: int = 5,
    min_data_leaf: int = 20,
    lambda_l1: float = 0.0,
    lambda_l2: float = 0.0,
    scale_pos_weight: float = 1.0,
    early_stopping: int = 50,
    categorical_cols: list = None,
    device: str = "cpu",  # "cpu" | "gpu" | "cuda"
) -> LGBMClassifier:
    """
    Train a binary LightGBM classifier via the sklearn API.

    num_leaves is the primary complexity control (31 conservative, 127
    expressive). When no validation split is given, the training set is used
    for evaluation, so early stopping will rarely trigger.

    Fixes vs. the naive construction:
    - uses canonical sklearn-API parameter names (colsample_bytree /
      subsample / subsample_freq) instead of the core-API aliases, avoiding
      LightGBM's duplicate-alias warnings;
    - passes categorical_feature to fit(), its supported location in the
      sklearn API, rather than the constructor;
    - drives early stopping through the lgb.early_stopping callback, the
      documented mechanism in lightgbm >= 4.0.
    """
    eval_set = [(X_val, y_val)] if X_val is not None else [(X_train, y_train)]
    clf = LGBMClassifier(
        num_leaves=num_leaves,
        max_depth=max_depth,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        colsample_bytree=feature_frac,   # canonical name for feature_fraction
        subsample=bagging_frac,          # canonical name for bagging_fraction
        subsample_freq=bagging_freq,     # canonical name for bagging_freq
        min_child_samples=min_data_leaf,
        reg_alpha=lambda_l1,
        reg_lambda=lambda_l2,
        scale_pos_weight=scale_pos_weight,
        device=device,
        verbose=-1,
        random_state=42,
    )
    clf.fit(
        X_train, y_train,
        eval_set=eval_set,
        eval_metric="auc",
        categorical_feature=categorical_cols or "auto",
        callbacks=[
            lgb.early_stopping(early_stopping),
            lgb.log_evaluation(100),
        ],
    )
    print(f"Best iteration: {clf.best_iteration_}")
    return clf
def evaluate_classifier(
    model,
    X_test: np.ndarray,
    y_test: np.ndarray,
    threshold: float = 0.5,
) -> dict:
    """
    Score a fitted binary classifier on held-out data.

    Returns AUC (threshold-free) plus accuracy and F1 at the given
    probability threshold, each rounded to 4 decimal places.
    """
    pos_probs = model.predict_proba(X_test)[:, 1]
    hard_preds = (pos_probs >= threshold).astype(int)
    scores = {
        "auc": roc_auc_score(y_test, pos_probs),
        "accuracy": accuracy_score(y_test, hard_preds),
        "f1": f1_score(y_test, hard_preds),
    }
    return {name: round(value, 4) for name, value in scores.items()}
# ── 3. Multiclass classification ──────────────────────────────────────────────
def train_multiclass(
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_val: np.ndarray,
    y_val: np.ndarray,
    num_classes: int,
    num_leaves: int = 63,
    learning_rate: float = 0.05,
    n_estimators: int = 1000,
    early_stopping: int = 50,
) -> LGBMClassifier:
    """Fit a softmax-objective LGBMClassifier, early-stopping on (X_val, y_val)."""
    model = LGBMClassifier(
        objective="multiclass",
        num_class=num_classes,
        num_leaves=num_leaves,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        early_stopping_rounds=early_stopping,
        verbose=-1,
        random_state=42,
    )
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_val, y_val)],
        callbacks=[lgb.log_evaluation(100)],  # log metrics every 100 rounds
    )
    return model
# ── 4. Regression ─────────────────────────────────────────────────────────────
def train_regressor(
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_val: np.ndarray = None,
    y_val: np.ndarray = None,
    objective: str = "regression",  # "regression_l1" | "mape" | "poisson" | "quantile"
    num_leaves: int = 63,
    learning_rate: float = 0.05,
    n_estimators: int = 1000,
    early_stopping: int = 50,
) -> LGBMRegressor:
    """Fit an LGBMRegressor; evaluates on the validation split when provided."""
    # Fall back to the training data for evaluation when no split is supplied.
    if X_val is not None:
        eval_set = [(X_val, y_val)]
    else:
        eval_set = [(X_train, y_train)]
    # Match the eval metric to the objective; anything unlisted scores with rmse.
    metric_by_objective = {
        "regression": "rmse",
        "regression_l1": "mae",
        "mape": "mape",
    }
    metric = metric_by_objective.get(objective, "rmse")
    model = LGBMRegressor(
        objective=objective,
        num_leaves=num_leaves,
        learning_rate=learning_rate,
        n_estimators=n_estimators,
        early_stopping_rounds=early_stopping,
        verbose=-1,
        random_state=42,
    )
    model.fit(
        X_train,
        y_train,
        eval_set=eval_set,
        eval_metric=metric,
        callbacks=[lgb.log_evaluation(100)],
    )
    return model
def evaluate_regressor(model, X_test: np.ndarray, y_test: np.ndarray) -> dict:
    """Score a fitted regressor on held-out data: rmse, mae, r2 (4 dp each)."""
    y_hat = model.predict(X_test)
    metrics = {
        "rmse": np.sqrt(mean_squared_error(y_test, y_hat)),
        "mae": mean_absolute_error(y_test, y_hat),
        "r2": r2_score(y_test, y_hat),
    }
    return {name: round(value, 4) for name, value in metrics.items()}
# ── 5. Cross-validation ───────────────────────────────────────────────────────
def cross_validate_lgb(
    X: np.ndarray,
    y: np.ndarray,
    params: dict,
    n_folds: int = 5,
    num_rounds: int = 1000,
    early_stopping: int = 50,
    stratified: bool = True,
) -> dict:
    """
    Run lgb.cv and return the best boosting round index plus its mean metric.

    Fix: the original applied np.argmax unconditionally, which selects the
    WORST round for loss metrics (rmse, mae, binary_logloss, ...). The
    optimization direction is now inferred from the metric name: score-type
    metrics (auc, ndcg, map, average_precision) are maximized; everything
    else is minimized. The "-mean" series is also selected explicitly
    instead of trusting dict key order.
    """
    dtrain = to_dataset(X, y)
    cv_results = lgb.cv(
        params=params,
        train_set=dtrain,
        nfold=n_folds,
        stratified=stratified,
        num_boost_round=num_rounds,
        callbacks=[lgb.early_stopping(early_stopping), lgb.log_evaluation(0)],
        seed=42,
    )
    # Keys look like "valid auc-mean" / "valid auc-stdv"; we want the mean.
    metric_key = next(k for k in cv_results if k.endswith("-mean"))
    scores = cv_results[metric_key]
    higher_is_better = any(
        tag in metric_key for tag in ("auc", "ndcg", "map", "average_precision")
    )
    best_idx = int(np.argmax(scores)) if higher_is_better else int(np.argmin(scores))
    print(f"CV best round: {best_idx} | {metric_key}: {scores[best_idx]:.4f}")
    return {"best_round": best_idx, "metric": scores[best_idx]}
# ── 6. Feature importance ─────────────────────────────────────────────────────
def get_feature_importance(
    model,
    importance_type: str = "gain",  # "gain" | "split"
    feature_names: list[str] = None,
    top_n: int = 20,
) -> pd.DataFrame:
    """
    Return the top_n features ranked by importance as a two-column DataFrame.

    importance_type:
      - "gain":  total gain contributed by splits on the feature (prefer for
                 judging relevance)
      - "split": raw split counts (biased toward high-cardinality features)
    Accepts either a raw Booster or a fitted sklearn-API estimator.
    """
    # Unwrap sklearn estimators down to the underlying Booster.
    booster = getattr(model, "booster_", model)
    labels = feature_names if feature_names else booster.feature_name()
    scores = booster.feature_importance(importance_type=importance_type)
    ranking = pd.DataFrame({"feature": labels, "importance": scores})
    ranking = ranking.sort_values("importance", ascending=False)
    return ranking.head(top_n).reset_index(drop=True)
# ── 7. Hyperparameter tuning with Optuna ─────────────────────────────────────
def tune_with_optuna(
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_val: np.ndarray,
    y_val: np.ndarray,
    n_trials: int = 50,
    task: str = "classification",
) -> dict:
    """
    Search LightGBM hyperparameters with Optuna and return the best set.

    Maximizes validation AUC for task="classification"; for any other task,
    maximizes negative RMSE (equivalent to minimizing RMSE) on the
    validation split.
    """
    import optuna  # local import keeps optuna optional for the module

    optuna.logging.set_verbosity(optuna.logging.WARNING)

    def _objective(trial) -> float:
        search_space = {
            "n_estimators": 1000,
            "num_leaves": trial.suggest_int("num_leaves", 20, 300),
            "max_depth": trial.suggest_int("max_depth", 3, 12),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
            "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
            "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
            "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10, log=True),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10, log=True),
            "early_stopping_rounds": 30,
            "verbose": -1,
            "random_state": 42,
        }
        if task == "classification":
            model = LGBMClassifier(**search_space)
            model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
            return roc_auc_score(y_val, model.predict_proba(X_val)[:, 1])
        model = LGBMRegressor(**search_space)
        model.fit(X_train, y_train, eval_set=[(X_val, y_val)])
        # Negated so a single "maximize" study handles both tasks.
        rmse = mean_squared_error(y_val, model.predict(X_val)) ** 0.5
        return -rmse

    study = optuna.create_study(direction="maximize")
    study.optimize(_objective, n_trials=n_trials, show_progress_bar=True)
    print(f"Best: {study.best_value:.4f}")
    return study.best_params
# ── 8. Categorical feature example ───────────────────────────────────────────
def demo_categorical_features() -> None:
    """
    Demonstrate LightGBM's native categorical handling — no one-hot encoding.

    Columns cast to pandas Categorical dtype are picked up automatically by
    lgb.Dataset; column names or indices can also be given explicitly via
    categorical_feature.
    """
    rows = 1000
    frame = pd.DataFrame({
        "age": np.random.randint(18, 80, rows),
        "income": np.random.normal(50000, 20000, rows),
        "city": np.random.choice(["NYC", "LA", "Chicago", "Houston"], rows),
        "occupation": np.random.choice(["engineer", "teacher", "doctor", "other"], rows),
        "target": np.random.randint(0, 2, rows),
    })
    # Cast the string columns to Categorical so LightGBM treats them natively.
    for col in ("city", "occupation"):
        frame[col] = frame[col].astype("category")
    features = frame.drop("target", axis=1)
    labels = frame["target"].values
    # Categorical dtype columns are detected automatically — no encoding step.
    train_data = lgb.Dataset(features, label=labels)
    params = {"objective": "binary", "metric": "auc", "verbose": -1, "num_leaves": 31}
    booster = lgb.train(params, train_data, num_boost_round=50)
    print("Categorical demo: model trained with native categorical support")
    print(f"Feature names: {booster.feature_name()}")
# ── Demo ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    from sklearn.datasets import make_classification

    print("LightGBM Demo — Binary Classification")
    print("=" * 50)
    features, labels = make_classification(
        n_samples=10_000, n_features=20, n_informative=10, random_state=42,
    )
    # 80/20 train/test, then carve a validation split out of the train part.
    X_tr, X_te, y_tr, y_te = train_test_split(
        features, labels, test_size=0.2, random_state=42,
    )
    X_tr, X_val, y_tr, y_val = train_test_split(
        X_tr, y_tr, test_size=0.2, random_state=42,
    )
    clf = train_classifier(X_tr, y_tr, X_val, y_val,
                           num_leaves=31, n_estimators=300)
    metrics = evaluate_classifier(clf, X_te, y_te)
    print(f"\nTest metrics: {metrics}")
    imp = get_feature_importance(
        clf,
        importance_type="gain",
        feature_names=[f"f{i}" for i in range(20)],
        top_n=5,
    )
    print(f"\nTop-5 features:\n{imp}")
    demo_categorical_features()
For the XGBoost alternative when training on dense float features, needing stronger regularization controls, or requiring broad deployment (XGBoost JSON models are supported by AWS SageMaker, Azure ML, and Google Vertex AI natively) — XGBoost’s level-wise tree growth generalizes better on small datasets (< 10K rows) while LightGBM’s leaf-wise growth with num_leaves control trains 10-100x faster on large datasets (> 100K rows), uses 3-5x less memory via histogram binning, and handles high-cardinality categoricals natively without one-hot encoding (critical for user ID or product ID features). For the CatBoost alternative when working with many high-cardinality categorical features without manual preprocessing — CatBoost’s ordered boosting prevents target leakage from categoricals while LightGBM’s LightGBMTuner from Optuna automates hyperparameter search specifically for LGBM, achieving better validation AUC with less search budget, and LGBM’s GPU training with histogram bins runs faster than CatBoost’s GPU implementation for datasets with > 50 features. The Claude Skills 360 bundle includes LightGBM skill sets covering Dataset creation, binary/multiclass/regression training, early stopping callbacks, cross-validation, feature importance gain and split, categorical feature handling, Optuna tuning, and SHAP explainability. Start with the free tier to try gradient boosting code generation.