whylogs profiles datasets with lightweight statistical summaries. pip install whylogs. import whylogs as why. result = why.log(df) profiles a pandas DataFrame. profile = result.profile(). profile.view().to_pandas() shows column statistics — count, null fraction, min, max, mean, stddev, quantiles, cardinality estimate. why.log({"text": "hello world"}) profiles a single record. Save: result.writer("local").option(base_dir="profiles").write() saves .bin and .json. Read: DatasetProfileView.read("profile.bin"). WhyLabs integration: why.init(whylabs_api_key="key", org_id="org", dataset_id="model-123"), then result.writer("whylabs").write() uploads to WhyLabs for monitoring. Logger: with why.logger(mode=LoggingMode.ROLLING, interval=10, when=TimeUnit.MINUTES) as logger: logger.log({"feature": value}) for streaming. Constraint checks: from whylogs.core.constraints.factories import no_missing_values, is_non_negative, greater_than_number. builder = ConstraintsBuilder(dataset_profile_view=profile.view()). builder.add_constraint(no_missing_values(col_name="user_id")). builder.add_constraint(is_non_negative("amount_usd")). constraints = builder.build(). report = constraints.report() — report is a list of (constraint_name, passed, metrics). Segments: why.log(df, schema=DatasetSchema(segments=segment_column("plan"))) profiles each segment separately. Column schema: DatasetSchema(resolvers=Resolver()) for custom metric configuration. Profile comparison: ProfileView.diff(profile_a, profile_b) detects drift. SummaryDriftAlgorithm computes standardized drift scores. Claude Code generates whylogs profiling scripts, constraint sets, streaming loggers, WhyLabs uploads, and CI pipeline checks.
CLAUDE.md for whylogs
## whylogs Stack
- Version: whylogs >= 1.4
- Profile: why.log(df) → result.profile() → profile.view().to_pandas()
- Save: result.writer("local").option(base_dir="profiles/").write()
- WhyLabs: why.init(whylabs_api_key, org_id, dataset_id) then result.writer("whylabs").write()
- Constraints: ConstraintsBuilder(profile.view()) + add_constraint / build / report
- Segments: DatasetSchema(segments=segment_column("col")) for per-segment profiles
- Logger: why.logger(mode=LoggingMode.ROLLING, interval=5, when=TimeUnit.MINUTES)
- Compare: ProfileView.diff(ref, current) for drift detection
## Profiling and Constraints
# monitoring/whylogs_profiler.py — data profiling with constraints
from __future__ import annotations

import json
import sys
from datetime import datetime
from pathlib import Path
from typing import Any

import pandas as pd
import whylogs as why
from whylogs.core import DatasetSchema
from whylogs.core.constraints import ConstraintsBuilder, MetricConstraint, MetricsSelector
from whylogs.core.constraints.factories import (
    greater_than_number,
    is_in_range,
    is_non_negative,
    no_missing_values,
    smaller_than_number,
)
from whylogs.core.segmentation_partition import segment_column
from whylogs.core.view.dataset_profile_view import DatasetProfileView
FEATURE_COLS = ["age", "tenure_days", "monthly_spend", "support_tickets", "last_login_days"]
TARGET_COL = "churned"
# ── Profile a DataFrame ───────────────────────────────────────────────────────
def profile_dataframe(
    df: pd.DataFrame,
    output_dir: str = "profiles",
    dataset_name: str = "churn_data",
) -> DatasetProfileView:
    """Profile a DataFrame with whylogs and persist the profile to disk.

    Writes a binary profile (``<dataset_name>_<timestamp>.bin``) and a CSV
    summary of per-column statistics into *output_dir*, prints the key
    statistics, and returns the in-memory profile view.

    Args:
        df: Data to profile.
        output_dir: Directory for the outputs; created if missing,
            including intermediate parents.
        dataset_name: Prefix for the generated file names.

    Returns:
        The ``DatasetProfileView`` of the logged data.
    """
    # parents=True so nested paths like "artifacts/profiles" work —
    # the original mkdir(exist_ok=True) raised FileNotFoundError there.
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    ts = datetime.now().strftime("%Y%m%d_%H%M%S")
    result = why.log(df)
    # Save binary profile (the local writer appends the .bin extension).
    result.writer("local") \
        .option(base_dir=output_dir) \
        .option(filename=f"{dataset_name}_{ts}") \
        .write()
    # Also export a human-readable CSV summary of the column metrics.
    view = result.profile().view()
    summary = view.to_pandas()
    summary.to_csv(f"{output_dir}/{dataset_name}_{ts}_summary.csv")
    print(f"Profile saved: {output_dir}/{dataset_name}_{ts}.bin")
    print("\nColumn statistics:")
    print(summary[["counts/n", "counts/null", "distribution/mean", "distribution/stddev",
                   "distribution/min", "distribution/max"]].to_string())
    return view
# ── Segment profiling ─────────────────────────────────────────────────────────
def profile_segments(
    df: pd.DataFrame,
    segment_col: str = "plan",
    output_dir: str = "profiles",
) -> dict[str, DatasetProfileView]:
    """Profile data broken down by a categorical segment column.

    Args:
        df: Data to profile.
        segment_col: Column whose distinct values define the segments.
        output_dir: Directory for the per-segment ``.bin`` profiles.

    Returns:
        Mapping of segment name -> profile view for that segment.
    """
    result = why.log(
        df,
        schema=DatasetSchema(
            segments=segment_column(segment_col),
        ),
    )
    # Create the output dir once, not per segment.
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    segment_views: dict[str, DatasetProfileView] = {}
    # SegmentedResultSet.segments() returns a list of Segment objects — it
    # is not a dict, so the original `.items()` call raised AttributeError.
    # Each segment's view is fetched via result.profile(segment).
    for segment in result.segments():
        # segment.key is a tuple of segment values; join for a flat file name.
        segment_name = "_".join(str(part) for part in segment.key)
        view = result.profile(segment).view()
        segment_views[segment_name] = view
        view.write(f"{output_dir}/segment_{segment_name}.bin")
        print(f"Segment '{segment_name}': {view.to_pandas()['counts/n'].values[0]} rows")
    return segment_views
# ── Build constraint set ──────────────────────────────────────────────────────
def build_constraints(view: DatasetProfileView) -> ConstraintsBuilder:
    """Define data quality constraints against a profile view.

    Args:
        view: Profile view the constraints are evaluated against.

    Returns:
        A ``ConstraintsBuilder`` ready for ``.build()``.
    """
    builder = ConstraintsBuilder(dataset_profile_view=view)
    # Completeness constraints
    builder.add_constraint(no_missing_values(col_name="age"))
    builder.add_constraint(no_missing_values(col_name="monthly_spend"))
    # Range constraints (observed min/max of each column)
    builder.add_constraint(is_in_range(col_name="age", lower=18, upper=120))
    builder.add_constraint(is_in_range(col_name="monthly_spend", lower=0, upper=100000))
    builder.add_constraint(is_in_range(col_name="tenure_days", lower=0, upper=3650))
    builder.add_constraint(is_non_negative("support_tickets"))
    builder.add_constraint(is_non_negative("last_login_days"))
    # Distribution constraint on the *mean* (warn if the average drifts).
    # is_in_range() has no metric_selector parameter — the original call
    # raised TypeError — so a custom MetricConstraint over the distribution
    # metric is required here.
    builder.add_constraint(
        MetricConstraint(
            name="monthly_spend mean in [10, 5000]",
            condition=lambda metric: 10 <= metric.avg <= 5000,
            metric_selector=MetricsSelector(
                metric_name="distribution", column_name="monthly_spend"
            ),
        )
    )
    return builder
def check_data_quality(
    df: pd.DataFrame,
    fail_fast: bool = False,
) -> tuple[bool, list[dict]]:
    """Profile *df* and evaluate the constraint suite against it.

    Args:
        df: Data to validate.
        fail_fast: When True, raise on the first failed constraint instead
            of collecting all failures.

    Returns:
        ``(all_passed, report)`` where report holds one dict per constraint
        with ``name``/``passed``/``value`` entries.

    Raises:
        ValueError: When *fail_fast* is set and a constraint fails.
    """
    profile_view = profile_dataframe(df)
    constraints = build_constraints(profile_view).build()
    results = constraints.report()
    failures: list[dict] = []
    for name, passed, value in results:
        print(f" {'PASS' if passed else 'FAIL'}: {name} (value={value})")
        if passed:
            continue
        failures.append({"constraint": name, "value": str(value)})
        if fail_fast:
            raise ValueError(f"Constraint failed: {name}")
    full_report = [{"name": n, "passed": p, "value": str(v)} for n, p, v in results]
    return not failures, full_report
# ── Drift detection ───────────────────────────────────────────────────────────
def detect_drift(
    reference_path: str = "profiles/reference.bin",
    current_path: str = "profiles/current.bin",
    threshold: float = 0.3,
    share_threshold: float = 0.3,
) -> dict[str, Any]:
    """Compare two saved profiles and flag drifted columns.

    Args:
        reference_path: Path to the baseline ``.bin`` profile.
        current_path: Path to the current ``.bin`` profile.
        threshold: Per-column drift score above which a column is drifted.
        share_threshold: Fraction of drifted columns above which the overall
            alert fires (previously hard-coded to 0.3 inside the function).

    Returns:
        Dict with ``drift_share``, ``drifted_columns``, ``column_scores``,
        ``threshold`` and ``alert`` keys.
    """
    # NOTE(review): the SummaryDriftAlgorithm import path should be confirmed
    # against the installed whylogs version — drift utilities have moved
    # between whylogs.core and whylogs.viz across releases.
    from whylogs.core.utils.summary_drift_calculations import SummaryDriftAlgorithm
    ref = DatasetProfileView.read(reference_path)
    curr = DatasetProfileView.read(current_path)
    drift_results: dict[str, float] = {}
    drifted_cols: list[str] = []
    # Only columns present in both profiles can be compared.
    shared = set(ref.to_pandas().index) & set(curr.to_pandas().index)
    for col in shared:
        try:
            score = SummaryDriftAlgorithm.compute(ref.get_column(col), curr.get_column(col))
            drift_results[col] = float(score) if score is not None else 0.0
        except Exception:
            # Best-effort: columns whose metrics cannot be compared score 0.
            drift_results[col] = 0.0
        if drift_results[col] > threshold:
            drifted_cols.append(col)
    drift_share = len(drifted_cols) / max(len(shared), 1)
    summary = {
        "drift_share": drift_share,
        "drifted_columns": drifted_cols,
        "column_scores": drift_results,
        "threshold": threshold,
        "alert": drift_share > share_threshold,
    }
    print(f"\nDrift summary: {len(drifted_cols)}/{len(shared)} columns drifted "
          f"(share={drift_share:.2%})")
    for col in drifted_cols:
        print(f" DRIFT: {col} (score={drift_results[col]:.4f})")
    return summary
# ── WhyLabs upload ────────────────────────────────────────────────────────────
def upload_to_whylabs(df: pd.DataFrame, api_key: str, org_id: str, dataset_id: str) -> None:
    """Profile *df* and push the resulting profile to WhyLabs for monitoring.

    Args:
        df: Data to profile.
        api_key: WhyLabs API key.
        org_id: WhyLabs organization id.
        dataset_id: WhyLabs dataset/model id (e.g. "model-123").
    """
    import os

    # The whylabs writer reads the org/dataset ids from the environment.
    os.environ["WHYLABS_DEFAULT_DATASET_ID"] = dataset_id
    os.environ["WHYLABS_DEFAULT_ORG_ID"] = org_id
    why.init(whylabs_api_key=api_key, default_dataset_id=dataset_id)
    why.log(df).writer("whylabs").write()
    print(f"Profile uploaded to WhyLabs: org={org_id}, dataset={dataset_id}")
# ── CI gate ───────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--data", required=True, help="CSV file to profile")
    parser.add_argument("--mode", choices=["profile", "check", "drift"], default="check")
    parser.add_argument("--reference", help="Reference profile for drift check")
    args = parser.parse_args()
    df = pd.read_csv(args.data)
    if args.mode == "profile":
        profile_dataframe(df)
    elif args.mode == "check":
        # Exit non-zero so CI pipelines fail on quality violations.
        passed, report = check_data_quality(df)
        print(f"\n{'All checks passed.' if passed else 'QUALITY CHECKS FAILED.'}")
        sys.exit(0 if passed else 1)
    elif args.mode == "drift":
        if not args.reference:
            print("--reference required for drift mode")
            sys.exit(1)
        # Write the current profile where detect_drift() expects it. The
        # original only saved a timestamped file, so detect_drift always read
        # a stale (or missing) profiles/current.bin instead of this data.
        view = profile_dataframe(df, dataset_name="current")
        view.write("profiles/current.bin")
        result = detect_drift(reference_path=args.reference)
        sys.exit(1 if result["alert"] else 0)
For the Evidently alternative when needing rich HTML reports, interactive dashboards, and a Testing Suite framework with pre-built test presets for data drift, data quality, and model performance that produce detailed pass/fail results with visual diffs — Evidently’s Report is more comprehensive for batch monitoring while whylogs is optimized for lightweight statistical sketches that work in streaming/real-time scenarios with minimal overhead, making it ideal for logging every row in a high-throughput inference pipeline. For the Great Expectations alternative when needing a Python-native assertion framework with detailed expectation suites, HTML validation reports, and deep integration with pandas/Spark/SQL — Great Expectations is richer for ETL validation while whylogs is purpose-built for ML data profiling with quantile sketches, cardinality estimation (HLL), and the WhyLabs platform for centralized monitoring dashboards. The Claude Skills 360 bundle includes whylogs skill sets covering DataFrame profiling, constraint sets, drift detection, segment profiling, and WhyLabs cloud upload. Start with the free tier to try ML data logging generation.