imbalanced-learn provides resampling techniques for class-imbalanced datasets. pip install imbalanced-learn. from imblearn.over_sampling import SMOTE, ADASYN, RandomOverSampler, BorderlineSMOTE. from imblearn.under_sampling import RandomUnderSampler, TomekLinks, ClusterCentroids, EditedNearestNeighbours. from imblearn.combine import SMOTEENN, SMOTETomek. Oversample: X_res, y_res = SMOTE(sampling_strategy=0.5, random_state=42).fit_resample(X_train, y_train). ADASYN: ADASYN(sampling_strategy="minority") — focuses on hard-to-classify samples. Undersample: RandomUnderSampler(sampling_strategy=1.0) — match majority to minority. TomekLinks — removes majority samples near decision boundary. Pipeline: from imblearn.pipeline import Pipeline, pipe = Pipeline([("smote", SMOTE()), ("clf", RandomForestClassifier())]). Ensemble: from imblearn.ensemble import BalancedRandomForestClassifier, BalancedRandomForestClassifier(n_estimators=100). EasyEnsemble: from imblearn.ensemble import EasyEnsembleClassifier. sampling_strategy: float = ratio of minority/majority after resampling, "minority" = resample only minority, "all" = resample all. Combined: SMOTEENN() — SMOTE then clean with ENN, SMOTETomek() — SMOTE then remove Tomek pairs. Metrics: from sklearn.metrics import classification_report, roc_auc_score, average_precision_score. ALWAYS use PR-AUC and F1, not accuracy, for imbalanced tasks. Check: Counter(y). Claude Code generates imbalanced-learn resampling pipelines, fraud detection models, and rare event classifiers.
CLAUDE.md for imbalanced-learn
## imbalanced-learn Stack
- Version: imbalanced-learn >= 0.12 (scikit-learn >= 1.3)
- Oversample: SMOTE | ADASYN | BorderlineSMOTE | RandomOverSampler
- Undersample: RandomUnderSampler | TomekLinks | ClusterCentroids | ENN
- Combined: SMOTEENN | SMOTETomek (often a strong default in practice)
- Pipeline: imblearn.pipeline.Pipeline — resampling only on training fold
- Ensemble: BalancedRandomForestClassifier | EasyEnsembleClassifier
- Metrics: PR-AUC, F1, G-mean — NOT accuracy for imbalanced tasks
- sampling_strategy: float (target ratio) | "minority" | "all" | dict
imbalanced-learn Resampling Pipeline
# ml/imbalanced_pipeline.py — class imbalance handling with imbalanced-learn
from __future__ import annotations
import warnings
from collections import Counter
import numpy as np
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.metrics import (
classification_report, roc_auc_score, average_precision_score,
f1_score, precision_score, recall_score, confusion_matrix,
)
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import (
SMOTE, ADASYN, RandomOverSampler, BorderlineSMOTE,
)
from imblearn.under_sampling import (
RandomUnderSampler, TomekLinks, ClusterCentroids,
EditedNearestNeighbours as ENN,
)
from imblearn.combine import SMOTEENN, SMOTETomek
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.ensemble import BalancedRandomForestClassifier, EasyEnsembleClassifier
warnings.filterwarnings("ignore")
# ── 1. Dataset diagnostics ────────────────────────────────────────────────────
def diagnose_imbalance(y: np.ndarray | pd.Series) -> dict:
"""
Characterize class distribution and imbalance ratio.
Recommends a strategy based on imbalance severity.
"""
counts = Counter(y)
total = len(y)
sorted_ = sorted(counts.items(), key=lambda x: x[1])
minority_class, minority_n = sorted_[0]
majority_class, majority_n = sorted_[-1]
ratio = minority_n / majority_n
pct = minority_n / total * 100
if ratio < 0.01:
recommendation = "SMOTEENN or EasyEnsembleClassifier — extreme imbalance"
elif ratio < 0.1:
recommendation = "SMOTETomek or BalancedRandomForestClassifier — severe"
elif ratio < 0.3:
recommendation = "SMOTE + class_weight='balanced' — moderate"
else:
recommendation = "class_weight='balanced' alone may suffice — mild"
return {
"counts": dict(counts),
"minority_class": minority_class,
"majority_class": majority_class,
"imbalance_ratio": round(ratio, 4),
"minority_pct": round(pct, 2),
"n_classes": len(counts),
"recommendation": recommendation,
}
# ── 2. Oversampling ───────────────────────────────────────────────────────────
def apply_smote(
    X: np.ndarray | pd.DataFrame,
    y: np.ndarray | pd.Series,
    sampling_strategy: float | str = "minority",
    k_neighbors: int = 5,
    random_state: int = 42,
    variant: str = "smote",  # "smote" | "adasyn" | "borderline"
) -> tuple[np.ndarray, np.ndarray]:
    """
    Oversample the minority class with a SMOTE-family sampler.

    Variants:
        smote      — synthetic interpolation between minority samples
        adasyn     — concentrate synthesis on the hard (misclassified) region
        borderline — synthesize only near decision-boundary minority samples
        random     — duplicate existing minority samples at random

    Unknown variants fall back to plain SMOTE.
    IMPORTANT: apply to training data only, never to test/validation.
    """
    if variant == "adasyn":
        resampler = ADASYN(sampling_strategy=sampling_strategy,
                           n_neighbors=k_neighbors, random_state=random_state)
    elif variant == "borderline":
        resampler = BorderlineSMOTE(sampling_strategy=sampling_strategy,
                                    k_neighbors=k_neighbors,
                                    random_state=random_state)
    elif variant == "random":
        resampler = RandomOverSampler(sampling_strategy=sampling_strategy,
                                      random_state=random_state)
    else:
        resampler = SMOTE(sampling_strategy=sampling_strategy,
                          k_neighbors=k_neighbors, random_state=random_state)
    X_res, y_res = resampler.fit_resample(X, y)
    print(f"After {variant}: {Counter(y_res)}")
    return X_res, y_res
# ── 3. Undersampling ──────────────────────────────────────────────────────────
def apply_undersampling(
    X: np.ndarray | pd.DataFrame,
    y: np.ndarray | pd.Series,
    sampling_strategy: float | str = "auto",
    variant: str = "random",  # "random" | "tomek" | "enn"
    random_state: int = 42,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Shrink the majority class with one of three undersampling schemes.

    Variants:
        random — drop majority samples uniformly at random
        tomek  — remove Tomek link pairs (ambiguous boundary samples)
        enn    — remove samples misclassified by 3-NN (noisy samples)

    Unknown variants fall back to random undersampling.
    """
    if variant == "tomek":
        resampler = TomekLinks(sampling_strategy="majority")
    elif variant == "enn":
        resampler = ENN(sampling_strategy="majority")
    else:
        resampler = RandomUnderSampler(sampling_strategy=sampling_strategy,
                                       random_state=random_state)
    X_res, y_res = resampler.fit_resample(X, y)
    print(f"After undersampling ({variant}): {Counter(y_res)}")
    return X_res, y_res
# ── 4. Combined resampling ────────────────────────────────────────────────────
def apply_smoteenn(
    X: np.ndarray | pd.DataFrame,
    y: np.ndarray | pd.Series,
    smote_ratio: float | str = "minority",
    random_state: int = 42,
) -> tuple[np.ndarray, np.ndarray]:
    """
    SMOTE oversampling followed by ENN cleaning applied to both classes.

    More conservative than SMOTE alone: the ENN pass removes noisy
    synthetic samples produced by the oversampling step.
    """
    smote_step = SMOTE(sampling_strategy=smote_ratio, random_state=random_state)
    combined = SMOTEENN(smote=smote_step, random_state=random_state)
    X_res, y_res = combined.fit_resample(X, y)
    print(f"After SMOTEENN: {Counter(y_res)}")
    return X_res, y_res
def apply_smotetomek(
    X: np.ndarray | pd.DataFrame,
    y: np.ndarray | pd.Series,
    smote_ratio: float | str = "minority",
    random_state: int = 42,
) -> tuple[np.ndarray, np.ndarray]:
    """SMOTE oversampling, then removal of Tomek link pairs."""
    combined = SMOTETomek(sampling_strategy=smote_ratio,
                          random_state=random_state)
    X_res, y_res = combined.fit_resample(X, y)
    print(f"After SMOTETomek: {Counter(y_res)}")
    return X_res, y_res
# ── 5. Imbalanced pipeline ────────────────────────────────────────────────────
def build_smote_pipeline(
    resampler,  # Any imblearn sampler
    classifier,  # Any sklearn classifier
    scaler=None,
) -> ImbPipeline:
    """
    Assemble an imblearn Pipeline whose resampling step runs only in fit().

    CRITICAL: this must be imblearn.pipeline.Pipeline, not sklearn's — it
    skips the resampler at predict/transform time, so evaluation data is
    never resampled.
    """
    stages = [] if scaler is None else [("scaler", scaler)]
    stages.extend([("resampler", resampler), ("classifier", classifier)])
    return ImbPipeline(stages)
def cross_val_imbalanced(
    X: np.ndarray | pd.DataFrame,
    y: np.ndarray | pd.Series,
    pipeline: ImbPipeline,
    folds: int = 5,
    scoring: list[str] | None = None,  # fixed: was annotated list[str] with default None
) -> dict:
    """
    Stratified K-fold cross-validation with an imblearn pipeline.

    Resampling is applied fresh inside each training fold, so held-out
    folds are never contaminated with synthetic samples.

    Args:
        X, y: full training data; splitting happens inside the CV loop.
        pipeline: imblearn Pipeline (resampler + classifier).
        folds: number of stratified folds.
        scoring: sklearn scorer names; defaults to
            ["roc_auc", "average_precision", "f1"].

    Returns:
        {metric: {"mean": ..., "std": ...}} aggregated over test folds.
    """
    scoring = scoring or ["roc_auc", "average_precision", "f1"]
    cv = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
    results = cross_validate(pipeline, X, y, cv=cv, scoring=scoring,
                             return_train_score=False, n_jobs=-1)
    return {
        metric: {"mean": round(results[f"test_{metric}"].mean(), 4),
                 "std": round(results[f"test_{metric}"].std(), 4)}
        for metric in scoring
    }
# ── 6. Ensemble methods ───────────────────────────────────────────────────────
def balanced_random_forest(
    X_train: np.ndarray | pd.DataFrame,
    y_train: np.ndarray | pd.Series,
    n_estimators: int = 200,
    max_depth: int | None = None,  # fixed: was annotated int with default None
    random_state: int = 42,
) -> BalancedRandomForestClassifier:
    """
    Fit a BalancedRandomForestClassifier.

    Each tree's bootstrap sample is rebalanced internally, so no separate
    resampling step is needed — a built-in replacement for
    RandomForestClassifier on imbalanced data.

    Args:
        n_estimators: number of trees in the forest.
        max_depth: per-tree depth limit; None lets trees grow fully.
        random_state: seed for reproducible bootstraps.
    """
    model = BalancedRandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        # NOTE(review): passed explicitly to pin behavior across imblearn
        # versions (these defaults changed around 0.13) — confirm against
        # the installed version.
        sampling_strategy="auto",
        replacement=True,
        random_state=random_state,
        n_jobs=-1,
    )
    model.fit(X_train, y_train)
    return model
def easy_ensemble(
    X_train: np.ndarray | pd.DataFrame,
    y_train: np.ndarray | pd.Series,
    n_estimators: int = 10,
    random_state: int = 42,
) -> EasyEnsembleClassifier:
    """
    Fit an EasyEnsembleClassifier: multiple classifiers, each trained on a
    balanced subset of the data.

    Best suited to extreme imbalance (< 1% minority).
    """
    clf = EasyEnsembleClassifier(n_estimators=n_estimators,
                                 random_state=random_state,
                                 n_jobs=-1)
    clf.fit(X_train, y_train)
    return clf
# ── 7. Evaluation helpers ─────────────────────────────────────────────────────
def evaluate_imbalanced(
    model,
    X_test: np.ndarray | pd.DataFrame,
    y_test: np.ndarray | pd.Series,
    threshold: float = 0.5,
) -> dict:
    """
    Comprehensive evaluation for imbalanced binary classification.

    Reports ROC-AUC, PR-AUC, F1, precision/recall, specificity, geometric
    mean, and the confusion matrix. Assumes a binary problem — the
    confusion matrix is unpacked as tn/fp/fn/tp.

    Args:
        model: fitted estimator exposing predict_proba or decision_function.
        threshold: positive-class probability cutoff; applied only when the
            model has predict_proba. decision_function margins are cut at 0
            (the decision boundary) instead — a probability threshold is
            meaningless in margin space.
    """
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
        y_pred = (y_score >= threshold).astype(int)
    else:
        # BUG FIX: the previous code compared raw decision_function margins
        # against the 0.5 probability threshold; margins are centred on 0,
        # so that mislabelled samples for margin-based models (e.g. SVMs).
        y_score = model.decision_function(X_test)
        y_pred = (y_score >= 0.0).astype(int)
    y_np = np.array(y_test)
    tn, fp, fn, tp = confusion_matrix(y_np, y_pred).ravel()
    # Epsilon guards against division by zero when a class is absent.
    specificity = tn / (tn + fp + 1e-9)
    sensitivity = tp / (tp + fn + 1e-9)
    g_mean = np.sqrt(sensitivity * specificity)
    return {
        "roc_auc": round(roc_auc_score(y_np, y_score), 4),
        "pr_auc": round(average_precision_score(y_np, y_score), 4),
        "f1": round(f1_score(y_np, y_pred), 4),
        "precision": round(precision_score(y_np, y_pred, zero_division=0), 4),
        "recall": round(recall_score(y_np, y_pred), 4),
        "specificity": round(specificity, 4),
        "g_mean": round(g_mean, 4),
        "confusion_matrix": {"tn": int(tn), "fp": int(fp), "fn": int(fn), "tp": int(tp)},
    }
def compare_strategies(
    X_train: np.ndarray,
    y_train: np.ndarray,
    X_test: np.ndarray,
    y_test: np.ndarray,
    base_clf=None,
) -> pd.DataFrame:
    """
    Compare multiple resampling strategies on the same classifier.

    Each sampler resamples only the training data; the untouched test set
    is scored with evaluate_imbalanced. Rows are sorted by PR-AUC
    (descending), the most informative ranking metric under imbalance.

    Args:
        base_clf: classifier template, cloned per strategy so fits do not
            interfere. Defaults to a balanced LogisticRegression.
    """
    # Hoisted out of the loop — was re-executed on every iteration.
    from sklearn.base import clone
    if base_clf is None:
        base_clf = LogisticRegression(class_weight="balanced", max_iter=500)
    strategies = {
        "No resampling": None,
        "RandomOverSampler": RandomOverSampler(random_state=42),
        "SMOTE": SMOTE(random_state=42),
        "ADASYN": ADASYN(random_state=42),
        "RandomUnderSampler": RandomUnderSampler(random_state=42),
        "SMOTEENN": SMOTEENN(random_state=42),
        "SMOTETomek": SMOTETomek(random_state=42),
    }
    rows = []
    for name, sampler in strategies.items():
        if sampler is not None:
            X_res, y_res = sampler.fit_resample(X_train, y_train)
        else:
            X_res, y_res = X_train, y_train
        clf = clone(base_clf)
        clf.fit(X_res, y_res)
        metrics = evaluate_imbalanced(clf, X_test, y_test)
        rows.append({"strategy": name, **metrics})
    return pd.DataFrame(rows).sort_values("pr_auc", ascending=False).reset_index(drop=True)
# ── Demo ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    print("imbalanced-learn Class Imbalance Demo")
    print("=" * 50)
    # Synthetic binary task: ~5% positives with 1% label noise.
    X, y = make_classification(
        n_samples=10000, n_features=20, n_informative=10,
        n_redundant=5, weights=[0.95, 0.05], flip_y=0.01,
        random_state=42,
    )
    print(f"\nClass distribution: {Counter(y)}")
    # Diagnose severity and report the suggested remediation.
    report = diagnose_imbalance(y)
    print(f"Imbalance ratio: {report['imbalance_ratio']} ({report['minority_pct']}% minority)")
    print(f"Recommendation: {report['recommendation']}")
    from sklearn.model_selection import train_test_split
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
    # Head-to-head sampler comparison on a fixed classifier.
    print("\nComparing resampling strategies (LogisticRegression):")
    table = compare_strategies(X_tr, y_tr, X_te, y_te)
    print(table[["strategy", "roc_auc", "pr_auc", "f1", "g_mean"]].to_string(index=False))
    # Leakage-safe CV: SMOTEENN is refit inside each training fold.
    print("\nSMOTEENN + Random Forest pipeline (5-fold CV):")
    pipe = build_smote_pipeline(
        SMOTEENN(random_state=42),
        RandomForestClassifier(n_estimators=100, class_weight="balanced", random_state=42),
    )
    for metric, stats in cross_val_imbalanced(X_tr, y_tr, pipe).items():
        print(f" {metric}: {stats['mean']:.4f} ± {stats['std']:.4f}")
    # Ensemble alternative with per-tree balancing.
    print("\nBalancedRandomForestClassifier:")
    brf = balanced_random_forest(X_tr, y_tr, n_estimators=100)
    scores = evaluate_imbalanced(brf, X_te, y_te)
    print(f" ROC-AUC: {scores['roc_auc']}, PR-AUC: {scores['pr_auc']}, "
          f"F1: {scores['f1']}, G-mean: {scores['g_mean']}")
For the class_weight='balanced' sklearn alternative — setting class_weight='balanced' adjusts loss weights without creating synthetic data, while imbalanced-learn's SMOTE generates new minority samples that change the data distribution, helping tree models find better splits; SMOTEENN then removes the noisy boundary samples that cause false positives, so the combined strategy typically outperforms class_weight alone for severe imbalance (< 5% minority). For the manual oversampling (.sample(replace=True)) alternative — naive replication duplicates identical samples, while SMOTE synthesizes new points by interpolating between a minority sample and its k nearest minority neighbors, adding geometric diversity that reduces overfitting to the exact minority samples; imblearn.pipeline.Pipeline ensures the resampler fits only on training folds so test data never leaks synthetic distribution information into cross-validation scores. The Claude Skills 360 bundle includes imbalanced-learn skill sets covering SMOTE, ADASYN, BorderlineSMOTE, RandomUnderSampler, TomekLinks, SMOTEENN, SMOTETomek, imblearn Pipeline for safe cross-validation, BalancedRandomForestClassifier, EasyEnsembleClassifier, imbalance diagnostics, and PR-AUC/G-mean evaluation. Start with the free tier to try class imbalance code generation.