CatBoost handles categorical features natively without preprocessing. pip install catboost. from catboost import CatBoostClassifier, CatBoostRegressor, Pool. Pool: train_pool = Pool(X_train, label=y_train, cat_features=cat_cols, feature_names=list(X_train.columns)). Classifier: model = CatBoostClassifier(iterations=1000, learning_rate=0.03, depth=6, l2_leaf_reg=3, loss_function="Logloss", eval_metric="AUC", random_seed=42, verbose=100). Train: model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50). Predict: model.predict(X_test), model.predict_proba(X_test)[:, 1]. Regressor: CatBoostRegressor(loss_function="RMSE"). Categorical: cat_features=["city","product","user_agent"] — pass column indices or column names. Feature importance: model.get_feature_importance(train_pool, type="FeatureImportance"). SHAP: model.get_feature_importance(train_pool, type="ShapValues") → (N, F+1) array. Plot tree: model.plot_tree(tree_idx=0, pool=train_pool). CV: from catboost import cv; cv_results = cv(pool=train_pool, params=params, fold_count=5). GPU: CatBoostClassifier(task_type="GPU", devices="0"). Ranking: CatBoostRanker(loss_function="YetiRank"). Save: model.save_model("model.cbm"). Load: model.load_model("model.cbm"). Categorical feature indices: model.get_cat_feature_indices(). Calibrate: model.set_scale_and_bias(scale, bias). Claude Code generates CatBoost pipelines, Optuna hyperparameter searches, SHAP explainability reports, and ranking systems.
CLAUDE.md for CatBoost
## CatBoost Stack
- Version: catboost >= 1.2
- Pool: Pool(X, label=y, cat_features=cat_cols, feature_names=cols)
- Classifier: CatBoostClassifier(iterations, learning_rate, depth, random_seed)
- Regressor: CatBoostRegressor(loss_function="RMSE"/"MAE"/"Huber")
- Train: model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50)
- Categorical: pass column names/indices to cat_features — no encoding needed
- SHAP: model.get_feature_importance(pool, type="ShapValues")
- GPU: task_type="GPU" | save/load: model.save_model / model.load_model
CatBoost Gradient Boosting Pipeline
# ml/catboost_pipeline.py — gradient boosting for tabular data with CatBoost
from __future__ import annotations
import warnings
import numpy as np
import pandas as pd
from pathlib import Path
from typing import Optional
from catboost import (
CatBoostClassifier, CatBoostRegressor, CatBoostRanker,
Pool, cv as catboost_cv,
)
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
roc_auc_score, average_precision_score,
mean_squared_error, mean_absolute_error, r2_score,
)
# Module-level side effect: installs a blanket "ignore" filter, so every warning
# category is suppressed for any code that runs after this module is imported.
warnings.filterwarnings("ignore")
# ── 1. Data preparation ───────────────────────────────────────────────────────
def make_pool(
    X: pd.DataFrame,
    y: np.ndarray | pd.Series | None = None,
    cat_features: list[str] | None = None,
    weight_col: pd.Series | None = None,
) -> Pool:
    """
    Create a CatBoost Pool from a Pandas DataFrame.

    CatBoost handles categorical strings natively — no encoding needed.
    Before the Pool is built, every listed categorical column is converted to
    ``str`` and its missing values are replaced by the literal ``"__missing__"``.
    The caller's DataFrame is never modified (a copy is cleaned instead).

    Args:
        X: Feature frame; its column names become the Pool's feature names.
        y: Optional labels (omit for inference-only pools).
        cat_features: Names of the categorical columns. Names that are not
            columns of ``X`` are skipped during cleaning but still forwarded
            to ``Pool`` unchanged.
        weight_col: Optional per-row sample weights.

    Returns:
        A ``Pool`` wrapping the cleaned features, labels and weights.
    """
    if cat_features:
        X = X.copy()
        for col in cat_features:
            if col not in X.columns:
                continue
            # Go through ``object`` first: calling fillna() directly on a pandas
            # ``category`` column raises when "__missing__" is not already one
            # of its categories.
            values = X[col].astype(object)
            X[col] = values.where(values.notna(), "__missing__").astype(str)
    return Pool(
        data=X,
        label=y,
        cat_features=cat_features,
        feature_names=list(X.columns),
        weight=weight_col,
    )
def prepare_train_val(
    df: pd.DataFrame,
    target: str,
    cat_features: list[str] | None = None,
    val_size: float = 0.2,
    random_state: int = 42,
) -> tuple[Pool, Pool, list[str]]:
    """
    Split a DataFrame into train/validation CatBoost Pools.

    Args:
        df: Full dataset including the target column.
        target: Name of the label column; all other columns are features.
        cat_features: Categorical column names. When empty or ``None``, every
            ``object``-dtype feature column is treated as categorical.
        val_size: Fraction of rows placed in the validation pool.
        random_state: Seed for the shuffle/split.

    Returns:
        ``(train_pool, val_pool, cat_cols)`` where ``cat_cols`` is the list of
        categorical columns actually used.
    """
    X = df.drop(columns=[target])
    y = df[target]

    # Stratify only when the target looks categorical (<= 20 distinct values)
    # AND every class has at least two rows; train_test_split raises
    # ValueError if asked to stratify on a class with a single member.
    class_counts = y.value_counts()
    can_stratify = len(class_counts) <= 20 and class_counts.min() >= 2

    X_tr, X_val, y_tr, y_val = train_test_split(
        X,
        y,
        test_size=val_size,
        random_state=random_state,
        stratify=y if can_stratify else None,
    )
    cat_cols = cat_features or [c for c in X.columns if X[c].dtype == "object"]
    train_pool = make_pool(X_tr, y_tr, cat_features=cat_cols)
    val_pool = make_pool(X_val, y_val, cat_features=cat_cols)
    return train_pool, val_pool, cat_cols
# ── 2. Classification ──────────────────────────────────────────────────────────
def train_classifier(
    train_pool: Pool,
    val_pool: Pool,
    iterations: int = 1000,
    lr: float = 0.03,
    depth: int = 6,
    l2_leaf_reg: float = 3.0,
    early_stop: int = 50,
    task_type: str = "CPU",  # "GPU" if CUDA available
    eval_metric: str = "AUC",
    loss_function: str = "Logloss",
    class_weights: list | None = None,
    verbose: int = 100,
    random_seed: int = 42,
) -> CatBoostClassifier:
    """
    Train a CatBoost classifier with early stopping on a validation pool.

    The defaults (``loss_function="Logloss"``, ``eval_metric="AUC"``) target
    binary classification; pass a multi-class loss such as ``"MultiClass"``
    for more than two classes.

    Args:
        train_pool: Training data.
        val_pool: Validation data monitored for early stopping.
        iterations: Maximum number of boosting rounds.
        lr: Learning rate.
        depth: Tree depth.
        l2_leaf_reg: L2 regularisation on leaf values.
        early_stop: Stop after this many rounds without ``eval_metric``
            improvement on ``val_pool``.
        task_type: ``"CPU"`` or ``"GPU"``.
        eval_metric: Metric tracked on ``val_pool``.
        loss_function: Training objective.
        class_weights: Optional per-class weights.
        verbose: Log every ``verbose`` iterations.
        random_seed: Seed for reproducible training (previously fixed at 42).

    Returns:
        The fitted model, truncated to its best iteration.
    """
    model = CatBoostClassifier(
        iterations=iterations,
        learning_rate=lr,
        depth=depth,
        l2_leaf_reg=l2_leaf_reg,
        loss_function=loss_function,
        eval_metric=eval_metric,
        task_type=task_type,
        random_seed=random_seed,
        # Bernoulli bootstrap is what makes the ``subsample`` option valid.
        bootstrap_type="Bernoulli",
        subsample=0.8,
        class_weights=class_weights,
        verbose=verbose,
    )
    model.fit(
        train_pool,
        eval_set=val_pool,
        early_stopping_rounds=early_stop,
        use_best_model=True,
    )
    return model
def evaluate_classifier(
    model: CatBoostClassifier,
    X_test: pd.DataFrame,
    y_test: np.ndarray | pd.Series,
    cat_features: list[str] | None = None,
    threshold: float = 0.5,
) -> dict:
    """
    Compute binary-classification metrics on a test set.

    Args:
        model: Fitted classifier; column 1 of ``predict_proba`` is used as the
            positive-class score.
        X_test: Test features.
        y_test: True labels aligned with ``X_test``.
        cat_features: Categorical column names, forwarded to ``make_pool``.
        threshold: Score at or above which a row is predicted positive.

    Returns:
        Dict with ``roc_auc``, ``avg_precision``, ``accuracy`` and
        ``positive_rate`` as plain Python floats rounded to 4 decimals, plus
        ``n_best_iteration`` taken from ``model.best_iteration_``.
    """
    pool = make_pool(X_test, cat_features=cat_features)
    proba = model.predict_proba(pool)[:, 1]
    pred = (proba >= threshold).astype(int)
    y_np = np.array(y_test)
    # Cast to built-in float so the result matches evaluate_regressor and can
    # be serialised (e.g. json.dumps) without NumPy scalar types leaking out.
    return {
        "roc_auc": round(float(roc_auc_score(y_np, proba)), 4),
        "avg_precision": round(float(average_precision_score(y_np, proba)), 4),
        "accuracy": round(float((pred == y_np).mean()), 4),
        "positive_rate": round(float(pred.mean()), 4),
        "n_best_iteration": model.best_iteration_,
    }
# ── 3. Regression ─────────────────────────────────────────────────────────────
def train_regressor(
    train_pool: Pool,
    val_pool: Pool,
    iterations: int = 1000,
    lr: float = 0.03,
    depth: int = 6,
    loss_function: str = "RMSE",  # "RMSE" | "MAE" | "Huber:delta=1"
    early_stop: int = 50,
    verbose: int = 100,
    random_seed: int = 42,
) -> CatBoostRegressor:
    """
    Train a CatBoost regression model with early stopping.

    Args:
        train_pool: Training data.
        val_pool: Validation data monitored for early stopping.
        iterations: Maximum number of boosting rounds.
        lr: Learning rate.
        depth: Tree depth.
        loss_function: Training objective; the same string is also used as the
            evaluation metric tracked on ``val_pool``.
        early_stop: Stop after this many rounds without improvement.
        verbose: Log every ``verbose`` iterations.
        random_seed: Seed for reproducible training (previously fixed at 42).

    Returns:
        The fitted model, truncated to its best iteration.
    """
    model = CatBoostRegressor(
        iterations=iterations,
        learning_rate=lr,
        depth=depth,
        loss_function=loss_function,
        eval_metric=loss_function,
        random_seed=random_seed,
        verbose=verbose,
    )
    model.fit(
        train_pool,
        eval_set=val_pool,
        early_stopping_rounds=early_stop,
        use_best_model=True,
    )
    return model
def evaluate_regressor(
    model: CatBoostRegressor,
    X_test: pd.DataFrame,
    y_test: np.ndarray | pd.Series,
    cat_features: list[str] = None,
) -> dict:
    """
    Score a fitted regressor on held-out data.

    Returns a dict with ``rmse``, ``mae`` and ``r2`` (floats rounded to four
    decimals) plus ``n_best_iteration`` read from ``model.best_iteration_``.
    """
    y_hat = model.predict(make_pool(X_test, cat_features=cat_features))
    y_true = np.array(y_test)

    raw_scores = {
        "rmse": float(np.sqrt(mean_squared_error(y_true, y_hat))),
        "mae": float(mean_absolute_error(y_true, y_hat)),
        "r2": float(r2_score(y_true, y_hat)),
    }
    report = {metric: round(score, 4) for metric, score in raw_scores.items()}
    report["n_best_iteration"] = model.best_iteration_
    return report
# ── 4. Feature importance and SHAP ────────────────────────────────────────────
def feature_importance(
    model,  # CatBoostClassifier or CatBoostRegressor
    pool: Pool,
    top_n: int = 20,
) -> pd.DataFrame:
    """
    Rank features by CatBoost's standard ``"FeatureImportance"`` score.

    Returns a DataFrame with columns ``feature`` and ``importance``, ordered
    from most to least important and limited to the first ``top_n`` rows.
    """
    scores = model.get_feature_importance(pool, type="FeatureImportance")
    table = pd.DataFrame(
        {"feature": pool.get_feature_names(), "importance": scores}
    )
    ranked = table.sort_values("importance", ascending=False)
    return ranked.head(top_n).reset_index(drop=True)
def shap_values(
    model,  # CatBoostClassifier or CatBoostRegressor
    pool: Pool,
) -> np.ndarray:
    """
    Return SHAP values from CatBoost's built-in ``"ShapValues"`` importance.

    The result has shape (n_samples, n_features + 1); the final column holds
    the bias (expected value). For binary classifiers the values live in
    log-odds space.
    """
    return model.get_feature_importance(pool, type="ShapValues")
def mean_abs_shap(
    model,
    pool: Pool,
    top_n: int = 20,
) -> pd.DataFrame:
    """
    Global SHAP importance (mean |SHAP|) for each feature.

    Works for both layouts CatBoost can return:
    2-D ``(n_samples, n_features + 1)`` for regression / binary models and
    3-D ``(n_samples, n_classes, n_features + 1)`` for multi-class models.
    In the 3-D case the mean is taken over samples and classes together.

    Args:
        model: Fitted CatBoost model.
        pool: Data to explain; also supplies the feature names.
        top_n: Number of rows to keep.

    Returns:
        DataFrame with columns ``feature`` and ``mean_abs_shap``, sorted
        descending and truncated to ``top_n`` rows.
    """
    sv = np.asarray(shap_values(model, pool))
    # The bias term is always the LAST entry of the LAST axis. Slicing with
    # ``[:, :-1]`` would instead drop a whole class on 3-D multi-class output.
    contributions = np.abs(sv[..., :-1])
    n_features = contributions.shape[-1]
    # Collapse every leading axis so one mean per feature remains.
    mean_abs = contributions.reshape(-1, n_features).mean(axis=0)
    names = pool.get_feature_names()
    df = pd.DataFrame({"feature": names, "mean_abs_shap": mean_abs})
    return (
        df.sort_values("mean_abs_shap", ascending=False)
        .head(top_n)
        .reset_index(drop=True)
    )
# ── 5. Cross-validation ───────────────────────────────────────────────────────
def cross_validate(
    pool: Pool,
    params: dict,
    fold_count: int = 5,
    early_stop: int = 50,
) -> pd.DataFrame:
    """
    Run k-fold cross-validation through CatBoost's own ``cv`` routine.

    The fold split is seeded with 42 and logging is silenced. Whatever ``cv``
    returns is wrapped in a DataFrame before being handed back.
    """
    cv_options = {
        "pool": pool,
        "params": params,
        "fold_count": fold_count,
        "early_stopping_rounds": early_stop,
        "verbose": False,
        "seed": 42,
    }
    history = catboost_cv(**cv_options)
    return pd.DataFrame(history)
# ── 6. Hyperparameter search (Optuna) ────────────────────────────────────────
def optuna_search(
    train_pool: Pool,
    val_pool: Pool,
    n_trials: int = 40,
    task: str = "classify",  # "classify" | "regress"
) -> dict:
    """
    Optuna hyperparameter search for CatBoost.

    Each trial fits a model on ``train_pool`` with early stopping on
    ``val_pool`` and is scored on ``val_pool``: negative ROC-AUC when
    ``task == "classify"``, RMSE for any other ``task`` value. The study
    minimises that score.

    Args:
        train_pool: Training data.
        val_pool: Validation data used for early stopping and scoring.
        n_trials: Number of Optuna trials.
        task: ``"classify"`` for a binary classifier; anything else trains a
            regressor.

    Returns:
        ``study.best_params`` — keyed by the Optuna trial names
        (``iterations``, ``lr``, ``depth``, ``l2``, ``bootstrap_type`` and
        either ``bagging_temp`` or ``subsample``), not by CatBoost argument
        names.
    """
    import optuna

    optuna.logging.set_verbosity(optuna.logging.WARNING)

    is_classification = task == "classify"
    # Labels never change between trials, so fetch them once.
    val_labels = val_pool.get_label()

    def objective(trial: optuna.Trial) -> float:
        params = {
            "iterations": trial.suggest_int("iterations", 200, 1500),
            "learning_rate": trial.suggest_float("lr", 0.01, 0.3, log=True),
            "depth": trial.suggest_int("depth", 4, 10),
            "l2_leaf_reg": trial.suggest_float("l2", 1, 10),
            "verbose": 0,
            "random_seed": 42,
            "early_stopping_rounds": 50,
        }
        # ``bagging_temperature`` is only accepted with Bayesian bootstrap and
        # ``subsample`` only with the sampling-based types; passing both at
        # once makes CatBoost reject the configuration, so sample the
        # bootstrap type and add just the option that belongs to it.
        bootstrap_type = trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli"]
        )
        params["bootstrap_type"] = bootstrap_type
        if bootstrap_type == "Bayesian":
            params["bagging_temperature"] = trial.suggest_float("bagging_temp", 0, 10)
        else:
            params["subsample"] = trial.suggest_float("subsample", 0.5, 1.0)

        if is_classification:
            model = CatBoostClassifier(
                **params, loss_function="Logloss", eval_metric="AUC"
            )
        else:
            model = CatBoostRegressor(**params, loss_function="RMSE")
        model.fit(train_pool, eval_set=val_pool, use_best_model=True)

        if is_classification:
            proba = model.predict_proba(val_pool)[:, 1]
            # Negated because the study minimises.
            return -roc_auc_score(val_labels, proba)
        preds = model.predict(val_pool)
        return float(np.sqrt(mean_squared_error(val_labels, preds)))

    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=n_trials, show_progress_bar=True)
    return study.best_params
# ── 7. Persistence ────────────────────────────────────────────────────────────
def save_model(model, path: str) -> str:
    """
    Write ``model`` to ``path`` via its ``save_model`` method.

    Missing parent directories are created first. Prints a confirmation line
    and returns ``path`` unchanged.
    """
    target_dir = Path(path).parent
    target_dir.mkdir(parents=True, exist_ok=True)
    model.save_model(path)
    print(f"Model saved: {path}")
    return path
def load_classifier(path: str) -> CatBoostClassifier:
    """Build an empty CatBoostClassifier and populate it from the file at ``path``."""
    restored = CatBoostClassifier()
    restored.load_model(path)
    return restored
# ── Demo ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Demo: synthetic churn data -> hold-out split -> train -> evaluate ->
    # importances -> save/load round-trip.
    import os
    import tempfile

    print("CatBoost Gradient Boosting Demo")
    print("=" * 50)

    # Generate synthetic dataset with mixed types
    np.random.seed(42)
    n = 5000
    df = pd.DataFrame({
        "age": np.random.randint(18, 75, n),
        "income": np.random.lognormal(10, 0.8, n),
        "score": np.random.normal(650, 100, n),
        "region": np.random.choice(["North", "South", "East", "West"], n),
        "product": np.random.choice(["Basic", "Premium", "Enterprise"], n),
        "channel": np.random.choice(["web", "app", "phone"], n),
        "has_promo": np.random.binomial(1, 0.3, n),
    })
    # Churn rule: (low income AND low score) OR (North region AND phone channel).
    # Parentheses spell out the precedence the original relied on (& before |).
    df["churn"] = (
        ((df["income"] < np.exp(10)) & (df["score"] < 650))
        | ((df["region"] == "North") & (df["channel"] == "phone"))
    ).astype(int)

    # Add noise: flip the label on roughly 5% of rows
    noise_mask = np.random.rand(n) < 0.05
    df.loc[noise_mask, "churn"] = 1 - df.loc[noise_mask, "churn"]

    cat_cols = ["region", "product", "channel"]
    print(f"\nDataset: {len(df):,} rows, churn rate: {df['churn'].mean():.2%}")

    # Hold out the test rows BEFORE building train/val pools. Sampling the
    # test set from the frame that was also used for training would leak
    # training rows into the evaluation and inflate every metric.
    test_df = df.sample(500, random_state=1)
    train_df = df.drop(index=test_df.index)

    # Prepare pools
    train_pool, val_pool, _ = prepare_train_val(train_df, "churn", cat_features=cat_cols)

    # Train
    print("\nTraining CatBoost classifier...")
    model = train_classifier(train_pool, val_pool, iterations=500, lr=0.05, verbose=100)

    # Evaluate on the untouched hold-out rows
    X_test = test_df.drop("churn", axis=1)
    y_test = test_df["churn"]
    metrics = evaluate_classifier(model, X_test, y_test, cat_features=cat_cols)
    print(f"\nTest metrics: {metrics}")

    # Feature importance
    imp = feature_importance(model, train_pool, top_n=5)
    print(f"\nTop-5 features:\n{imp}")

    # SHAP
    shap_imp = mean_abs_shap(model, train_pool, top_n=5)
    print(f"\nTop-5 SHAP importance:\n{shap_imp}")

    # Save / load round-trip: the reloaded model must reproduce the predictions
    with tempfile.TemporaryDirectory() as tmpdir:
        model_path = os.path.join(tmpdir, "model.cbm")
        save_model(model, model_path)
        loaded = load_classifier(model_path)
        preds_orig = model.predict_proba(train_pool)[:5, 1]
        preds_loaded = loaded.predict_proba(train_pool)[:5, 1]
        print(f"\nSave/load match: {np.allclose(preds_orig, preds_loaded)}")
For the XGBoost alternative — XGBoost traditionally requires one-hot or ordinal encoding of categorical columns, while CatBoost’s cat_features parameter uses target statistics and ordered boosting to handle string categoricals directly, avoiding cardinality explosion in high-cardinality columns like user IDs or ZIP codes. CatBoost’s default hyperparameters are also significantly better out of the box, often matching tuned XGBoost without any parameter search. For the LightGBM alternative for categorical features — LightGBM has native categorical support but requires integer encoding first, while CatBoost accepts raw string columns. In addition, get_feature_importance(type="ShapValues") returns exact SHAP values without an additional library, and CatBoostRanker supports learning-to-rank tasks with the YetiRank loss, which LightGBM does not offer. The Claude Skills 360 bundle includes CatBoost skill sets covering Pool creation with categorical strings, classifier and regressor training, early stopping, feature importance and SHAP values, Optuna hyperparameter search, k-fold cross-validation, model save and load, and GPU training with task_type=GPU. Start with the free tier to try gradient boosting code generation.