XGBoost is the most popular gradient boosting framework for tabular data. pip install xgboost. import xgboost as xgb. DMatrix: dtrain = xgb.DMatrix(X_train, label=y_train), dtest = xgb.DMatrix(X_test, label=y_test). Params: params = {"objective":"binary:logistic","max_depth":6,"eta":0.1,"subsample":0.8,"colsample_bytree":0.8,"eval_metric":"auc"}. Train: model = xgb.train(params, dtrain, num_boost_round=500, evals=[(dtrain,"train"),(dtest,"test")], early_stopping_rounds=50, verbose_eval=50). Predict: probs = model.predict(dtest). Sklearn API: from xgboost import XGBClassifier, clf = XGBClassifier(n_estimators=500, max_depth=6, learning_rate=0.1, early_stopping_rounds=50), clf.fit(X_train, y_train, eval_set=[(X_test, y_test)], verbose=False), clf.predict_proba(X_test). Feature importance: model.get_score(importance_type="gain") — “weight” count, “gain” avg gain, “cover” avg cover. Save: model.save_model("model.ubj"), load: model.load_model("model.ubj"). GPU: params["device"] = "cuda" or XGBClassifier(device="cuda"). CV: results = xgb.cv(params, dtrain, nfold=5, stratified=True, num_boost_round=500, early_stopping_rounds=50). SHAP: model.get_booster().predict(dtest, pred_contribs=True). Multiclass: "objective":"multi:softprob","num_class":3. Regression: "objective":"reg:squarederror". Ranking: "objective":"rank:pairwise". Claude Code generates XGBoost training pipelines, hyperparameter search scripts, feature importance analysis, and Optuna tuning loops.
CLAUDE.md for XGBoost
## XGBoost Stack
- Version: xgboost >= 2.0
- Format: xgb.DMatrix(X, label=y, weight=w, feature_names=cols)
- Train: xgb.train(params, dtrain, num_boost_round, evals=[], early_stopping_rounds)
- Sklearn: XGBClassifier/XGBRegressor(n_estimators, max_depth, learning_rate, device)
- Objectives: binary:logistic | multi:softprob | reg:squarederror | rank:pairwise
- Eval: auc | logloss | rmse | map | ndcg
- Importance: get_score(importance_type="gain") | "weight" | "cover"
- Save: model.save_model(path) | load_model(path) (.ubj binary, .json text)
- GPU: device="cuda" | tree_method="hist" (default, fastest)
XGBoost Training Pipeline
# ml/xgboost_pipeline.py — gradient boosting for tabular data
from __future__ import annotations
import json
import numpy as np
import pandas as pd
from pathlib import Path
from typing import Optional
import xgboost as xgb
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.metrics import (
roc_auc_score, accuracy_score, f1_score,
mean_squared_error, mean_absolute_error, r2_score,
)
# ── 1. Data preparation ───────────────────────────────────────────────────────
def to_dmatrix(
    X: np.ndarray | pd.DataFrame,
    y: np.ndarray | pd.Series | None = None,
    weight: np.ndarray | None = None,
    feature_names: list[str] | None = None,
) -> xgb.DMatrix:
    """Convert arrays/DataFrames to an XGBoost DMatrix.

    Args:
        X: Feature matrix. A DataFrame's columns become feature names
            unless ``feature_names`` is given explicitly.
        y: Optional labels; omit for inference-only matrices.
        weight: Optional per-row sample weights.
        feature_names: Explicit feature names to attach.

    Returns:
        An ``xgb.DMatrix`` ready for ``xgb.train`` / ``booster.predict``.
    """
    if isinstance(X, pd.DataFrame):
        # Prefer caller-supplied names; fall back to the DataFrame columns.
        feature_names = feature_names or list(X.columns)
        X = X.values
    if isinstance(y, pd.Series):
        y = y.values
    return xgb.DMatrix(
        X, label=y, weight=weight,
        feature_names=feature_names,
    )
def split_data(
    X: np.ndarray | pd.DataFrame,
    y: np.ndarray | pd.Series,
    test_size: float = 0.2,
    stratify: bool = True,
    random_state: int = 42,
) -> tuple:
    """Partition features/labels into train and test sets.

    Stratification is applied only when requested AND the target looks
    like a class label (fewer than 100 distinct values); continuous
    targets get a plain random split.
    """
    use_stratify = stratify and len(np.unique(y)) < 100
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=test_size,
        stratify=y if use_stratify else None,
        random_state=random_state,
    )
    return X_train, X_test, y_train, y_test
# ── 2. Binary classification ──────────────────────────────────────────────────
def train_classifier(
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_val: np.ndarray = None,
    y_val: np.ndarray = None,
    n_estimators: int = 1000,
    max_depth: int = 6,
    learning_rate: float = 0.05,
    subsample: float = 0.8,
    colsample: float = 0.8,
    scale_pos_weight: float = 1.0,  # sum(neg) / sum(pos) for imbalanced data
    early_stopping: int = 50,
    device: str = "cpu",
) -> XGBClassifier:
    """
    Train a binary XGBoost classifier with AUC early stopping.
    scale_pos_weight > 1 handles class imbalance (set to neg/pos ratio).
    """
    # Early-stop against the validation split when one is supplied;
    # otherwise fall back to monitoring the training data itself.
    if X_val is None:
        eval_set = [(X_train, y_train)]
    else:
        eval_set = [(X_val, y_val)]
    model = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        colsample_bytree=colsample,
        scale_pos_weight=scale_pos_weight,
        early_stopping_rounds=early_stopping,
        eval_metric="auc",
        device=device,
        random_state=42,
    )
    model.fit(X_train, y_train, eval_set=eval_set, verbose=False)
    print(f"Best iteration: {model.best_iteration} | Best AUC: {model.best_score:.4f}")
    return model
def evaluate_classifier(
    model,
    X_test: np.ndarray,
    y_test: np.ndarray,
    threshold: float = 0.5,
) -> dict:
    """Score a fitted binary classifier: AUC, accuracy, and F1.

    The positive-class probability is thresholded at ``threshold`` to
    produce hard labels for accuracy/F1; AUC uses the raw probabilities.
    """
    positive_probs = model.predict_proba(X_test)[:, 1]
    hard_labels = (positive_probs >= threshold).astype(int)
    raw_metrics = {
        "auc": roc_auc_score(y_test, positive_probs),
        "accuracy": accuracy_score(y_test, hard_labels),
        "f1": f1_score(y_test, hard_labels),
    }
    return {name: round(value, 4) for name, value in raw_metrics.items()}
# ── 3. Regression ─────────────────────────────────────────────────────────────
def train_regressor(
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_val: np.ndarray = None,
    y_val: np.ndarray = None,
    n_estimators: int = 1000,
    max_depth: int = 6,
    learning_rate: float = 0.05,
    objective: str = "reg:squarederror",  # or "reg:absoluteerror", "count:poisson"
    early_stopping: int = 50,
    device: str = "cpu",
) -> XGBRegressor:
    """Train an XGBoost regressor with early stopping."""
    # Validation split drives early stopping when available; otherwise
    # the training set is monitored. RMSE pairs with squared error,
    # MAE with everything else.
    monitor = [(X_train, y_train)] if X_val is None else [(X_val, y_val)]
    metric = "rmse" if "squarederror" in objective else "mae"
    model = XGBRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        objective=objective,
        early_stopping_rounds=early_stopping,
        eval_metric=metric,
        device=device,
        random_state=42,
    )
    model.fit(X_train, y_train, eval_set=monitor, verbose=False)
    print(f"Best iteration: {model.best_iteration}")
    return model
def evaluate_regressor(model, X_test: np.ndarray, y_test: np.ndarray) -> dict:
    """Score a fitted regressor: RMSE, MAE, and R^2 on held-out data."""
    y_hat = model.predict(X_test)
    mse = mean_squared_error(y_test, y_hat)
    return {
        "rmse": round(np.sqrt(mse), 4),
        "mae": round(mean_absolute_error(y_test, y_hat), 4),
        "r2": round(r2_score(y_test, y_hat), 4),
    }
# ── 4. Cross-validation ───────────────────────────────────────────────────────
def cross_validate(
    X: np.ndarray,
    y: np.ndarray,
    params: dict,
    n_folds: int = 5,
    num_rounds: int = 500,
    early_stopping: int = 50,
    stratified: bool = True,
) -> pd.DataFrame:
    """
    Run k-fold cross-validation using xgb.cv.

    Args:
        X, y: Full dataset (converted to one DMatrix internally).
        params: Booster params; ``params["eval_metric"]`` selects the
            reported test column (defaults to "rmse").
        n_folds: Number of CV folds.
        num_rounds: Maximum boosting rounds.
        early_stopping: Rounds without improvement before stopping.
        stratified: Use stratified folds (classification targets).

    Returns:
        DataFrame with per-round train/test metric means and stds.
    """
    dtrain = to_dmatrix(X, y)
    results = xgb.cv(
        params=params,
        dtrain=dtrain,
        nfold=n_folds,
        stratified=stratified,
        num_boost_round=num_rounds,
        early_stopping_rounds=early_stopping,
        verbose_eval=False,
        as_pandas=True,
        seed=42,
    )
    metric = params.get("eval_metric", "rmse")
    if isinstance(metric, (list, tuple)):
        metric = metric[-1]  # xgb.cv early-stops on the last metric listed
    col = f"test-{metric}-mean"
    # BUG FIX: the best round was previously always taken via idxmin(),
    # which is wrong for higher-is-better metrics (auc, aucpr, map, ndcg).
    higher_is_better = any(metric.startswith(m) for m in ("auc", "aucpr", "map", "ndcg"))
    best_round = results[col].idxmax() if higher_is_better else results[col].idxmin()
    best_score = results.loc[best_round, col]
    print(f"CV best round: {best_round} | test metric: {best_score:.4f}")
    return results
# ── 5. Feature importance ─────────────────────────────────────────────────────
def get_feature_importance(
    model,
    importance_type: str = "gain",  # "gain" | "weight" | "cover" | "total_gain"
    top_n: int = 20,
) -> pd.DataFrame:
    """
    Get feature importance scores, sorted descending.
    - gain: average information gain per split (best for feature selection)
    - weight: number of times feature used (can be biased for high-cardinality)
    - cover: average number of samples per split
    - total_gain: total gain (gain * weight)
    """
    # Accept either a sklearn-API estimator or a raw Booster.
    if hasattr(model, "get_booster"):
        booster = model.get_booster()
    else:
        booster = model
    score_map = booster.get_score(importance_type=importance_type)
    ranked = pd.DataFrame(
        {"feature": list(score_map), "importance": list(score_map.values())}
    )
    ranked = ranked.sort_values("importance", ascending=False).head(top_n)
    return ranked.reset_index(drop=True)
def select_important_features(
    model,
    feature_names: list[str],
    threshold: float = 0.0,
    importance_type: str = "gain",
) -> list[str]:
    """Return the names of features whose importance exceeds *threshold*."""
    ranking = get_feature_importance(
        model,
        importance_type=importance_type,
        top_n=len(feature_names),
    )
    above_cutoff = ranking["importance"] > threshold
    return ranking.loc[above_cutoff, "feature"].tolist()
# ── 6. Hyperparameter tuning with Optuna ─────────────────────────────────────
def tune_with_optuna(
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_val: np.ndarray,
    y_val: np.ndarray,
    n_trials: int = 50,
    task: str = "classification",  # "classification" | "regression"
) -> dict:
    """
    Tune XGBoost hyperparameters with Optuna.

    Classification maximizes validation AUC; regression minimizes
    validation RMSE. Returns the best parameter dict.
    """
    import optuna
    optuna.logging.set_verbosity(optuna.logging.WARNING)

    def objective(trial):
        params = {
            "n_estimators": 500,
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
            "subsample": trial.suggest_float("subsample", 0.5, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            "min_child_weight": trial.suggest_int("min_child_weight", 1, 10),
            "gamma": trial.suggest_float("gamma", 0, 5),
            "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 10, log=True),
            "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 10, log=True),
            "early_stopping_rounds": 30,
            "eval_metric": "auc" if task == "classification" else "rmse",
            "random_state": 42,
            "verbosity": 0,
        }
        if task == "classification":
            m = XGBClassifier(**params)
            m.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
            return roc_auc_score(y_val, m.predict_proba(X_val)[:, 1])
        m = XGBRegressor(**params)
        m.fit(X_train, y_train, eval_set=[(X_val, y_val)], verbose=False)
        # Plain positive RMSE — the study direction below handles minimization.
        return mean_squared_error(y_val, m.predict(X_val)) ** 0.5

    # BUG FIX: the original ternary evaluated to "maximize" for BOTH tasks
    # and relied on the regression objective returning a negated RMSE.
    # Make the optimization direction explicit per task instead.
    direction = "maximize" if task == "classification" else "minimize"
    study = optuna.create_study(direction=direction)
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
    print(f"Best trial: {study.best_value:.4f}")
    return study.best_params
# ── 7. SHAP explainability ────────────────────────────────────────────────────
def compute_shap_values(
    model,
    X: np.ndarray,
    feature_names: list[str] = None,
) -> np.ndarray:
    """Compute SHAP values via XGBoost's built-in pred_contribs support."""
    matrix = to_dmatrix(X, feature_names=feature_names)
    booster = model.get_booster() if hasattr(model, "get_booster") else model
    # pred_contribs yields shape (N, n_features + 1); the final column is
    # the bias term, which is stripped before returning.
    contribs = booster.predict(matrix, pred_contribs=True)
    return contribs[:, :-1]
def top_shap_features(
    shap_values: np.ndarray,
    feature_names: list[str],
    top_n: int = 10,
) -> pd.DataFrame:
    """Global SHAP feature importance ranked by mean |SHAP| per feature."""
    importance = pd.DataFrame({
        "feature": feature_names,
        "mean_abs_shap": np.abs(shap_values).mean(axis=0),
    })
    importance = importance.sort_values("mean_abs_shap", ascending=False)
    return importance.head(top_n).reset_index(drop=True)
# ── Demo ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    from sklearn.datasets import make_classification

    print("XGBoost Demo — Binary Classification")
    print("=" * 50)

    # Synthetic binary-classification dataset: 10k rows, 20 features,
    # half of them informative.
    X, y = make_classification(
        n_samples=10_000, n_features=20, n_informative=10, random_state=42
    )
    X_tr, X_te, y_tr, y_te = split_data(X, y)
    X_tr, X_val, y_tr, y_val = split_data(X_tr, y_tr, test_size=0.2)

    # Train with early stopping on the validation split.
    clf = train_classifier(
        X_tr, y_tr, X_val, y_val,
        n_estimators=500, max_depth=5, learning_rate=0.05,
    )

    # Held-out evaluation.
    metrics = evaluate_classifier(clf, X_te, y_te)
    print(f"\nTest metrics: {metrics}")

    # Attach readable names so importance reports use them.
    feature_names = [f"feature_{i}" for i in range(20)]
    clf.get_booster().feature_names = feature_names
    imp = get_feature_importance(clf, importance_type="gain", top_n=5)
    print(f"\nTop-5 features (gain):\n{imp}")

    # SHAP on a small slice keeps the demo fast.
    shap_vals = compute_shap_values(clf, X_te[:100], feature_names)
    shap_imp = top_shap_features(shap_vals, feature_names, top_n=5)
    print(f"\nTop-5 SHAP features:\n{shap_imp}")
For the LightGBM alternative when training on extremely large datasets (50M+ rows) or needing categorical feature support without encoding — LightGBM’s leaf-wise tree growth and histogram-based binning are faster on wide, high-cardinality datasets while XGBoost’s level-wise growth is more regularized against overfitting on smaller datasets, and XGBoost’s native GPU support (device="cuda") with optimized CUDA kernels achieves particularly strong speedups for dense float features common in financial and scientific applications. For the scikit-learn GradientBoostingClassifier alternative when needing an implementation that integrates seamlessly with sklearn Pipeline and GridSearchCV without any additional dependencies — sklearn’s implementation is easier to deploy in constrained environments while XGBoost is 10-100x faster, supports distributed training via Dask and Spark, and its pre-pruning via gamma (minimum split loss) together with the max_delta_step cap provides stronger regularization controls that matter for production models trained on noisy tabular data. The Claude Skills 360 bundle includes XGBoost skill sets covering DMatrix preparation, binary classification and regression training, early stopping, cross-validation, feature importance with gain/weight/cover, Optuna hyperparameter tuning, SHAP explainability, and GPU acceleration. Start with the free tier to try gradient boosting code generation.