Comet ML tracks ML experiments with rich visualizations. pip install comet_ml. from comet_ml import Experiment. experiment = Experiment(api_key="KEY", project_name="churn", workspace="myorg"). Log params: experiment.log_parameters({"lr": 0.05, "n_estimators": 200}). Log metrics: experiment.log_metric("auc", 0.87, step=epoch). Log multiple: experiment.log_metrics({"auc": 0.87, "ap": 0.72}, step=epoch). Images: experiment.log_image(fig, name="roc_curve"). Confusion matrix: experiment.log_confusion_matrix(y_true, y_pred, labels=["no_churn", "churn"]). Dataset hash: experiment.log_dataset_hash(df). Model artifact: experiment.log_model("churn-gbm", "model.pkl"). HTML: experiment.log_html_url("https://..."). Tags: experiment.add_tag("production"). End: experiment.end(). Context manager: with Experiment(...) as exp: .... Optimizer: from comet_ml import Optimizer, config = {"algorithm": "bayes", "name": "churn_sweep", "parameters": {"lr": {"type": "float", "min": 0.001, "max": 0.3, "scalingType": "loguniform"}}, "spec": {"maxCombo": 50, "objective": "maximize", "metric": "val_auc"}}. opt = Optimizer(config). for exp in opt.get_experiments(project_name="churn"): runs each trial. exp.log_parameter(...). Model Registry: from comet_ml.api import API, api = API(), api.registry_model_details("myorg", "churn-gbm"). api.update_registry_model_version(workspace="myorg", registry_name="churn-gbm", version="1.0.0", stages=["production"]). Comet LLM: import comet_llm; comet_llm.log_prompt(prompt, output, metadata). Claude Code generates Comet experiments, Optimizer sweep configs, model registry workflows, and TypeScript API clients.
CLAUDE.md for Comet ML
## Comet ML Stack
- Version: comet_ml >= 3.35
- Init: Experiment(api_key, project_name, workspace) — or COMET_API_KEY env var
- Params: experiment.log_parameters(dict) or log_parameter(key, value)
- Metrics: experiment.log_metric(name, value, step) or log_metrics(dict)
- Artifacts: experiment.log_model(name, file_path) for model registration
- Optimizer: Optimizer(config_dict) → for exp in opt.get_experiments(): ...
- Registry: API().update_registry_model_version(..., stages=["production"])
- LLM: comet_llm.log_prompt(prompt, output, metadata)
Training with Comet ML
# train_comet.py — training script with Comet ML tracking
from __future__ import annotations
import os
import pickle
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from comet_ml import Experiment
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import (
RocCurveDisplay,
average_precision_score,
classification_report,
confusion_matrix,
roc_auc_score,
)
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
# Input columns fed to the model, in the order used at training time.
FEATURE_COLS = ["age", "tenure_days", "monthly_spend", "support_tickets", "last_login_days"]
# Comet workspace/project every experiment in this module is logged under.
WORKSPACE = os.environ.get("COMET_WORKSPACE", "myorg")
PROJECT = "churn-prediction"
def _build_pipeline(n_estimators: int, learning_rate: float, max_depth: int) -> Pipeline:
    """Build the StandardScaler + GradientBoostingClassifier pipeline.

    Shared by the CV loop and the final fit so both use identical settings.
    """
    return Pipeline([
        ("scaler", StandardScaler()),
        ("clf", GradientBoostingClassifier(
            n_estimators=n_estimators, learning_rate=learning_rate,
            max_depth=max_depth, random_state=42,
        )),
    ])


def train_and_log(
    data_path: str = "data/train.csv",
    n_estimators: int = 200,
    learning_rate: float = 0.05,
    max_depth: int = 4,
    tags: list[str] | None = None,
) -> tuple[Pipeline, str]:
    """Train a churn model with full Comet ML experiment logging.

    Runs 5-fold stratified CV for an honest AUC estimate, refits on the whole
    dataset, logs params/metrics/figures/model artifact to Comet, and returns
    the fitted pipeline plus the Comet experiment key.

    Args:
        data_path: CSV containing FEATURE_COLS plus a binary "churned" column.
        n_estimators: number of boosting stages for the GBM.
        learning_rate: GBM shrinkage rate.
        max_depth: maximum depth of each boosting tree.
        tags: optional Comet tags attached to the experiment.

    Returns:
        Tuple of (fitted sklearn Pipeline, Comet experiment key).
    """
    experiment = Experiment(
        api_key=os.environ.get("COMET_API_KEY"),
        project_name=PROJECT,
        workspace=WORKSPACE,
        auto_param_logging=False,  # Log params manually for full control
        auto_metric_logging=False,
    )
    if tags:
        for tag in tags:
            experiment.add_tag(tag)
    experiment_key = experiment.get_key()
    print(f"Comet experiment: https://www.comet.com/{WORKSPACE}/{PROJECT}/{experiment_key}")
    # ── Log parameters ────────────────────────────────────────────────────
    experiment.log_parameters({
        "n_estimators": n_estimators,
        "learning_rate": learning_rate,
        "max_depth": max_depth,
        "features": FEATURE_COLS,
        "random_state": 42,
    })
    # ── Load and hash dataset ─────────────────────────────────────────────
    df = pd.read_csv(data_path)
    experiment.log_dataset_hash(df)  # Ties the run to the exact data version
    experiment.log_parameter("n_samples", len(df))
    experiment.log_parameter("target_rate", float(df["churned"].mean()))
    X = df[FEATURE_COLS].values
    y = df["churned"].values
    # ── Cross-validation ─────────────────────────────────────────────────
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    aucs: list[float] = []
    with experiment.context_manager("cross_validation"):
        for fold, (train_idx, val_idx) in enumerate(cv.split(X, y)):
            pipeline = _build_pipeline(n_estimators, learning_rate, max_depth)
            pipeline.fit(X[train_idx], y[train_idx])
            auc = roc_auc_score(y[val_idx], pipeline.predict_proba(X[val_idx])[:, 1])
            aucs.append(auc)
            experiment.log_metric("fold_auc", auc, step=fold)
    mean_auc = float(np.mean(aucs))
    experiment.log_metrics({"cv_auc_mean": mean_auc, "cv_auc_std": float(np.std(aucs))})
    # ── Final model (refit on all rows) ───────────────────────────────────
    final = _build_pipeline(n_estimators, learning_rate, max_depth)
    final.fit(X, y)
    y_pred = final.predict(X)
    y_proba = final.predict_proba(X)[:, 1]
    train_auc = float(roc_auc_score(y, y_proba))
    experiment.log_metrics({
        "train_auc": train_auc,
        "train_ap": float(average_precision_score(y, y_proba)),
    })
    # ── Confusion matrix ──────────────────────────────────────────────────
    experiment.log_confusion_matrix(
        y_true=y.tolist(),
        y_predicted=y_pred.tolist(),
        labels=["no_churn", "churn"],
        title="Confusion Matrix (Train)",
    )
    # ── ROC curve plot ────────────────────────────────────────────────────
    # The curve is computed from training-set predictions, so title it with
    # the training AUC. (Previously the title showed the CV mean AUC, which
    # mislabeled the figure.)
    fig, ax = plt.subplots(figsize=(6, 5))
    RocCurveDisplay.from_predictions(y, y_proba, ax=ax, name="GBM")
    ax.set_title(f"ROC Curve (train) — AUC={train_auc:.4f}")
    experiment.log_figure("ROC Curve", fig)
    plt.close(fig)  # Avoid accumulating open figures across runs
    # ── Feature importance ────────────────────────────────────────────────
    importances = final.named_steps["clf"].feature_importances_
    fig2, ax2 = plt.subplots(figsize=(6, 4))
    idxs = np.argsort(importances)[::-1]  # Most important first
    ax2.bar(range(len(FEATURE_COLS)), importances[idxs])
    ax2.set_xticks(range(len(FEATURE_COLS)))
    ax2.set_xticklabels([FEATURE_COLS[i] for i in idxs], rotation=30, ha="right")
    ax2.set_title("Feature Importance")
    experiment.log_figure("Feature Importance", fig2)
    plt.close(fig2)
    # ── Save and log model ────────────────────────────────────────────────
    Path("models").mkdir(exist_ok=True)
    model_path = "models/churn_model.pkl"
    with open(model_path, "wb") as f:
        pickle.dump(final, f)
    experiment.log_model("churn-gbm", model_path)  # Artifact for Model Registry
    experiment.end()
    print(f"\nCV AUC: {mean_auc:.4f} ± {np.std(aucs):.4f}")
    return final, experiment_key
# ── Optimizer (hyperparameter sweep) ─────────────────────────────────────────
def run_optimizer(n_trials: int = 20) -> str:
    """Run a Comet Optimizer Bayesian sweep over GBM hyperparameters.

    Args:
        n_trials: maximum number of parameter combinations ("maxCombo") the
            Bayesian search will evaluate.

    Returns:
        The Comet experiment key of the best trial by 3-fold CV AUC
        (empty string if no trial ran).
    """
    from comet_ml import Optimizer
    # Hoisted out of the trial loop: importing per-trial is wasted work.
    from sklearn.model_selection import cross_val_score

    config = {
        "algorithm": "bayes",
        "name": "churn_sweep",
        "spec": {"maxCombo": n_trials, "objective": "maximize", "metric": "cv_auc_mean"},
        "parameters": {
            "learning_rate": {"type": "float", "min": 0.005, "max": 0.3, "scalingType": "loguniform"},
            "n_estimators": {"type": "integer", "min": 50, "max": 600},
            "max_depth": {"type": "integer", "min": 2, "max": 8},
        },
    }
    optimizer = Optimizer(config, api_key=os.environ.get("COMET_API_KEY"))
    # Load the training data once — it is identical for every trial.
    # (Previously the CSV was re-read inside the loop on each trial.)
    df = pd.read_csv("data/train.csv")
    X, y = df[FEATURE_COLS].values, df["churned"].values
    best_auc = 0.0
    best_key = ""
    for experiment in optimizer.get_experiments(project_name=PROJECT, workspace=WORKSPACE):
        lr = experiment.get_parameter("learning_rate")
        n_est = experiment.get_parameter("n_estimators")
        depth = experiment.get_parameter("max_depth")
        pipeline = Pipeline([
            ("scaler", StandardScaler()),
            ("clf", GradientBoostingClassifier(
                n_estimators=int(n_est), learning_rate=lr,
                max_depth=int(depth), random_state=42,
            )),
        ])
        cv_auc = float(np.mean(cross_val_score(pipeline, X, y, cv=3, scoring="roc_auc")))
        experiment.log_metric("cv_auc_mean", cv_auc)  # The sweep's objective metric
        if cv_auc > best_auc:
            best_auc = cv_auc
            best_key = experiment.get_key()
        experiment.end()
    print(f"\nBest CV AUC: {best_auc:.4f} (experiment: {best_key})")
    return best_key
# ── Model Registry ────────────────────────────────────────────────────────────
def promote_to_production(registry_name: str = "churn-gbm", version: str = "1.0.0") -> None:
    """Promote a model version to the "production" stage in the Comet Model Registry.

    Args:
        registry_name: name of the registered model.
        version: semantic version of the model to promote.
    """
    from comet_ml.api import API

    client = API(api_key=os.environ.get("COMET_API_KEY"))
    client.update_registry_model_version(
        workspace=WORKSPACE,
        registry_name=registry_name,
        version=version,
        stages=["production"],
        comment="Promoted after A/B test validation",
    )
    print(f"Model {registry_name}:{version} → production")
if __name__ == "__main__":
    # Script entry point: run the baseline training experiment with the
    # default hyperparameters and tag it for easy filtering in the Comet UI.
    train_and_log(
        n_estimators=200,
        learning_rate=0.05,
        max_depth=4,
        tags=["baseline", "gbm"],
    )
TypeScript Client
// lib/cometml/client.ts — Comet ML REST API client
// Comet ML REST v2 base URL plus credentials/scope, read from the environment.
const COMET_API = "https://www.comet.com/api/rest/v2"
const API_KEY = process.env.COMET_API_KEY ?? ""
const WORKSPACE = process.env.COMET_WORKSPACE ?? "myorg"
const PROJECT = "churn-prediction"
/**
 * Thin wrapper around fetch for the Comet REST API: prefixes the base URL,
 * injects the Authorization and Content-Type headers (caller-supplied headers
 * win on conflict), and throws with status + body text on any non-2xx reply.
 */
async function cometFetch<T>(path: string, options?: RequestInit): Promise<T> {
  const headers = {
    Authorization: API_KEY,
    "Content-Type": "application/json",
    ...options?.headers,
  }
  const response = await fetch(`${COMET_API}${path}`, { ...options, headers })
  if (!response.ok) {
    const body = await response.text()
    throw new Error(`Comet ML ${response.status}: ${body}`)
  }
  return response.json()
}
/** Shape of an experiment record as returned by the Comet REST v2 API. */
export type CometExperiment = {
  experimentKey: string
  experimentName: string
  status: string
  tags: string[]
  // Latest value per metric, used for client-side sorting in listExperiments.
  metricsSummary: { metricName: string; valueCurrent: number }[]
}
/**
 * List experiments in the project, sorted descending by a summary metric.
 * Experiments missing the metric sort as 0.
 */
export async function listExperiments(
  sortMetric: string = "cv_auc_mean",
): Promise<CometExperiment[]> {
  // Build the query with URLSearchParams so workspace/project names with
  // reserved characters are URL-encoded (previously interpolated raw).
  const query = new URLSearchParams({
    workspaceName: WORKSPACE,
    projectName: PROJECT,
  })
  const data = await cometFetch<{ experiments: CometExperiment[] }>(
    `/experiments?${query.toString()}`
  )
  const metricOf = (e: CometExperiment) =>
    e.metricsSummary?.find(m => m.metricName === sortMetric)?.valueCurrent ?? 0
  return (data.experiments ?? []).sort((a, b) => metricOf(b) - metricOf(a))
}
/** Get the metric summary for a specific experiment. */
export async function getExperimentMetrics(experimentKey: string) {
  // Encode the key so unexpected characters cannot break the query string
  // (previously interpolated without encoding).
  return cometFetch(
    `/experiment/metrics/get-summary?experimentKey=${encodeURIComponent(experimentKey)}`
  )
}
For the Weights & Biases alternative when needing richer interactive visualizations, the W&B Tables for dataset comparison, Sweeps with ASHA/PBT schedulers, and a larger community with more integrations — W&B has a stronger ecosystem while Comet ML’s Optimizer supports similar Bayesian search and Comet’s confusion matrix and curve logging APIs are more opinionated and easier to use for standard classification metrics. For the Neptune.ai alternative when needing very large artifact storage, deeply nested custom metadata namespaces with the Neptune path hierarchy, advanced run filtering queries, and architecture designed for storing thousands of runs per day without performance degradation — Neptune handles massive scale while Comet ML’s LLM tracing with comet_llm provides unique capabilities for tracking LLM prompt chains alongside traditional ML experiments in a single unified platform. The Claude Skills 360 bundle includes Comet ML skill sets covering training experiments, Optimizer Bayesian sweeps, model registry promotion, LLM tracing, and TypeScript API clients. Start with the free tier to try ML experiment tracking generation.