Python’s statistics module computes exact descriptive statistics without third-party dependencies. import statistics. mean: statistics.mean([1,2,3,4,5]) → 3 (exact). fmean: statistics.fmean(data) → float, faster. median: statistics.median(data) — middle value; .median_low, .median_high, .median_grouped. mode: statistics.mode(data) — most common; .multimode(data) → list of all modes. stdev: statistics.stdev(data) — sample standard deviation (N-1). variance: statistics.variance(data) — sample variance. pstdev / pvariance: population versions (N). geometric_mean: statistics.geometric_mean([1,2,4,8]) → 2.828… harmonic_mean: statistics.harmonic_mean([40,60]) — rate averages. quantiles: statistics.quantiles(data, n=4) → [Q1, Q2, Q3]. NormalDist: nd = statistics.NormalDist(mu=0, sigma=1); nd.pdf(0); nd.cdf(1.96); nd.inv_cdf(0.95); nd.zscore(x); nd.overlap(other); nd.samples(n). correlation: statistics.correlation(x, y) — Pearson r (Python 3.10+). covariance: statistics.covariance(x, y). linear_regression: slope, intercept = statistics.linear_regression(x, y). StatisticsError: raised on empty data (since Python 3.8, mode no longer raises on multimodal data — it returns the first mode encountered). Claude Code generates data summary reporters, A/B test analyzers, quality-control checkers, and metric dashboards.
CLAUDE.md for statistics
## statistics Stack
- Stdlib: import statistics
- Central: statistics.mean(data) | statistics.median(data) | statistics.mode(data)
- Spread: statistics.stdev(data) | statistics.variance(data) | statistics.quantiles(data, n=4)
- Dist: statistics.NormalDist(mu, sigma).cdf(x) | .inv_cdf(p) | .zscore(x)
- Bivariate: statistics.correlation(x, y) | statistics.linear_regression(x, y)
- Error: statistics.StatisticsError on empty data (since Python 3.8, mode no longer raises on ties — it returns the first mode encountered)
statistics Data Analysis Pipeline
# app/statsutil.py — mean, median, stdev, quantiles, NormalDist, regression
from __future__ import annotations
import statistics
from dataclasses import dataclass, field
from typing import Any, Sequence
# ─────────────────────────────────────────────────────────────────────────────
# 1. Descriptive summary
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class Summary:
"""
Descriptive statistics for a numeric dataset.
Example:
s = Summary.from_data([2, 4, 4, 6, 6, 6, 8])
print(s)
"""
n: int
mean: float
median: float
stdev: float | None
variance: float | None
minimum: float
maximum: float
q1: float
q3: float
iqr: float
modes: list[Any] = field(default_factory=list)
@classmethod
def from_data(cls, data: Sequence[float]) -> "Summary":
data = list(data)
n = len(data)
if n == 0:
raise statistics.StatisticsError("No data points")
mn = statistics.fmean(data)
med = statistics.median(data)
sd = statistics.stdev(data) if n > 1 else None
var = statistics.variance(data) if n > 1 else None
q1, _, q3 = statistics.quantiles(data, n=4) if n >= 2 else (data[0], data[0], data[0])
return cls(
n=n, mean=mn, median=med, stdev=sd, variance=var,
minimum=min(data), maximum=max(data),
q1=q1, q3=q3, iqr=q3 - q1,
modes=statistics.multimode(data),
)
def __str__(self) -> str:
lines = [
f"n={self.n} mean={self.mean:.4g} median={self.median:.4g}",
f"stdev={self.stdev:.4g if self.stdev is not None else 'N/A'} "
f"var={self.variance:.4g if self.variance is not None else 'N/A'}",
f"min={self.minimum:.4g} Q1={self.q1:.4g} Q3={self.q3:.4g} "
f"max={self.maximum:.4g} IQR={self.iqr:.4g}",
f"mode(s)={self.modes}",
]
return "\n".join(lines)
def describe(data: Sequence[float]) -> dict[str, Any]:
    """
    Summarize *data* as a plain dict of descriptive statistics.

    Thin adapter over Summary.from_data() for callers that prefer plain
    dicts (e.g. for JSON serialization) over dataclass instances.

    Example:
        d = describe([10, 20, 30, 40, 50])
        d["mean"]   # 30.0
        d["stdev"]  # 15.81...
    """
    summary = Summary.from_data(data)
    report = {
        "n": summary.n,
        "mean": summary.mean,
        "median": summary.median,
        "stdev": summary.stdev,
        "variance": summary.variance,
        "min": summary.minimum,
        "max": summary.maximum,
        "q1": summary.q1,
        "q3": summary.q3,
        "iqr": summary.iqr,
        "modes": summary.modes,
    }
    return report
def five_number_summary(data: Sequence[float]) -> dict[str, float]:
    """
    Tukey five-number summary: min, Q1, median, Q3, max.

    Example:
        five_number_summary([3,7,8,5,12,14,21,13,18])
    """
    ordered = sorted(data)
    first_q, mid, third_q = statistics.quantiles(ordered, n=4)
    return {
        "min": ordered[0],
        "q1": first_q,
        "median": mid,
        "q3": third_q,
        "max": ordered[-1],
    }
def outliers_iqr(
    data: Sequence[float],
    multiplier: float = 1.5,
) -> list[float]:
    """
    Return values outside [Q1 - k*IQR, Q3 + k*IQR] (Tukey fence method).

    Args:
        data: numeric sample; input order is preserved in the result.
        multiplier: fence factor k (1.5 = "outliers", 3.0 = "far out").

    Returns:
        Values of data falling outside the fences, or [] when data has
        fewer than two points (quartiles are undefined).

    Example:
        outliers_iqr([1, 2, 2, 3, 100])  # [100]
    """
    d = list(data)
    if len(d) < 2:
        # quantiles() raises on < 2 points; no meaningful fences anyway.
        return []
    # BUG FIX: the default method="exclusive" extrapolates quartiles beyond
    # the observed range on small samples, inflating the IQR so much that
    # obvious outliers were missed (the docstring example returned []).
    # method="inclusive" yields the conventional boxplot quartiles.
    q1, _, q3 = statistics.quantiles(d, n=4, method="inclusive")
    iqr = q3 - q1
    lo, hi = q1 - multiplier * iqr, q3 + multiplier * iqr
    return [v for v in d if v < lo or v > hi]
# ─────────────────────────────────────────────────────────────────────────────
# 2. Distribution and z-scores
# ─────────────────────────────────────────────────────────────────────────────
def z_scores(data: Sequence[float]) -> list[float]:
    """
    Standardize every value against the sample's own mean and
    standard deviation (fitted via NormalDist.from_samples).

    Example:
        z_scores([2, 4, 4, 4, 5, 5, 7, 9])
    """
    fitted = statistics.NormalDist.from_samples(data)
    return list(map(fitted.zscore, data))
def percentile_rank(data: Sequence[float], value: float) -> float:
    """
    Return the percentage of values in *data* that are <= value (0-100).

    Example:
        percentile_rank([1,2,3,4,5,6,7,8,9,10], 7)  # 70.0
    """
    # The count is order-independent, so no sorting is needed.
    count = sum(1 for v in data if v <= value)
    return count / len(data) * 100
def normal_ci(
    data: Sequence[float],
    confidence: float = 0.95,
) -> tuple[float, float]:
    """
    Confidence interval for the population mean, assuming normality.

    Uses the z-interval: mean ± z * s / sqrt(n).

    Args:
        data: sample with at least two values (stdev must be defined).
        confidence: coverage probability in (0, 1); default 0.95.

    Returns:
        (lower, upper) bounds of the interval.

    Example:
        lo, hi = normal_ci(measurements, confidence=0.95)
    """
    samples = list(data)
    nd = statistics.NormalDist.from_samples(samples)
    alpha = (1 - confidence) / 2
    # BUG FIX: the old margin was z * sigma — the spread of *individual*
    # observations, not of the sample mean.  A CI for the mean scales the
    # standard deviation by 1/sqrt(n) (the standard error).
    z = statistics.NormalDist().inv_cdf(1 - alpha)
    margin = z * nd.stdev / len(samples) ** 0.5
    return nd.mean - margin, nd.mean + margin
def distribution_overlap(
    sample_a: Sequence[float],
    sample_b: Sequence[float],
) -> float:
    """
    Overlap coefficient between two empirical normal distributions.

    Fits a NormalDist to each sample and returns the shared area of
    the two PDFs as a fraction in [0, 1].

    Example:
        overlap = distribution_overlap(control_times, treatment_times)
        print(f"Overlap: {overlap:.1%}")
    """
    fitted_a, fitted_b = (
        statistics.NormalDist.from_samples(s) for s in (sample_a, sample_b)
    )
    return fitted_a.overlap(fitted_b)
# ─────────────────────────────────────────────────────────────────────────────
# 3. Rates and weighted averages
# ─────────────────────────────────────────────────────────────────────────────
def geometric_mean(data: Sequence[float]) -> float:
    """
    Geometric mean — the appropriate average for growth rates and ratios.

    Example:
        geometric_mean([1.10, 1.25, 0.90, 1.05])  # compound growth factor
    """
    result = statistics.geometric_mean(data)
    return result
def harmonic_mean(data: Sequence[float]) -> float:
    """
    Harmonic mean — the appropriate average for rates (speed, throughput).

    Example:
        harmonic_mean([40, 60])  # avg speed when equal distances at 40 and 60 mph → 48
    """
    result = statistics.harmonic_mean(data)
    return result
def weighted_mean(values: Sequence[float], weights: Sequence[float]) -> float:
    """
    Weighted arithmetic mean of *values* under *weights*.

    Raises:
        ValueError: when the two sequences differ in length.
        statistics.StatisticsError: when the weights sum to zero.

    Example:
        weighted_mean([70, 80, 90], [1, 2, 3])  # 83.33...
    """
    if len(values) != len(weights):
        raise ValueError("values and weights must be the same length")
    denominator = sum(weights)
    if denominator == 0:
        raise statistics.StatisticsError("Total weight is zero")
    numerator = sum(v * w for v, w in zip(values, weights))
    return numerator / denominator
def moving_average(values: Sequence[float], window: int) -> list[float]:
    """
    Simple moving average over a sliding window.

    Returns one fmean per full window; fewer values than the window
    yields an empty list.

    Example:
        moving_average([1, 2, 3, 4, 5, 6, 7], window=3)
        # [2.0, 3.0, 4.0, 5.0, 6.0]
    """
    if window < 1:
        raise ValueError("window must be >= 1")
    return [
        statistics.fmean(values[start:start + window])
        for start in range(len(values) - window + 1)
    ]
# ─────────────────────────────────────────────────────────────────────────────
# 4. Bivariate analysis
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class RegressionResult:
slope: float
intercept: float
r: float # Pearson correlation
r_squared: float
def predict(self, x: float) -> float:
return self.slope * x + self.intercept
def __str__(self) -> str:
return (
f"y = {self.slope:.4g}x + {self.intercept:.4g} "
f"r={self.r:.4f} R²={self.r_squared:.4f}"
)
def linear_fit(x: Sequence[float], y: Sequence[float]) -> RegressionResult:
"""
Fit a simple linear regression y = mx + b.
Example:
result = linear_fit([1,2,3,4,5], [2,4,5,4,5])
print(result)
print(result.predict(6))
"""
slope, intercept = statistics.linear_regression(x, y)
r = statistics.correlation(x, y)
return RegressionResult(slope=slope, intercept=intercept, r=r, r_squared=r**2)
def correlation_strength(r: float) -> str:
    """
    Describe Pearson r magnitude informally.

    Example:
        correlation_strength(0.85)  # "strong positive"
    """
    sign = "negative" if r < 0 else "positive"
    # Descending cutoffs; first match wins.
    bands = (
        (0.9, "very strong"),
        (0.7, "strong"),
        (0.5, "moderate"),
        (0.3, "weak"),
    )
    magnitude = abs(r)
    for cutoff, label in bands:
        if magnitude >= cutoff:
            return f"{label} {sign}"
    return f"negligible {sign}"
# ─────────────────────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    print("=== statistics demo ===")
    # Last value (100) is a deliberate outlier.
    data = [15, 17, 19, 21, 22, 23, 23, 24, 25, 27, 30, 35, 100]

    print("\n--- Summary ---")
    s = Summary.from_data(data)
    print(s)

    print("\n--- five_number_summary ---")
    print(f" {five_number_summary(data)}")

    print("\n--- outliers_iqr ---")
    print(f" outliers: {outliers_iqr(data)}")

    # BUG FIX: the header said "(first 5)" but 12 z-scores are printed.
    print("\n--- z_scores (first 12) ---")
    zs = z_scores(data[:12])  # exclude outlier for readable z
    for v, z in zip(data[:12], zs):
        print(f" {v:5.1f} → z={z:+.2f}")

    print("\n--- percentile_rank ---")
    print(f" rank of 23: {percentile_rank(data, 23):.1f}%")
    print(f" rank of 30: {percentile_rank(data, 30):.1f}%")

    print("\n--- normal_ci ---")
    clean = data[:-1]  # drop outlier
    lo, hi = normal_ci(clean, 0.95)
    print(f" 95% CI: ({lo:.2f}, {hi:.2f})")

    print("\n--- distribution_overlap ---")
    import random
    random.seed(42)  # deterministic demo output
    control = [random.gauss(50, 10) for _ in range(200)]
    treatment = [random.gauss(55, 10) for _ in range(200)]
    ov = distribution_overlap(control, treatment)
    print(f" overlap: {ov:.1%}")

    print("\n--- geometric_mean / harmonic_mean ---")
    growth_rates = [1.10, 1.25, 0.90, 1.05, 1.15]
    print(f" geometric_mean(growth rates): {geometric_mean(growth_rates):.4f}")
    speeds = [40, 60, 80]
    print(f" harmonic_mean(speeds): {harmonic_mean(speeds):.4f}")

    print("\n--- weighted_mean ---")
    scores = [70, 80, 90]
    weights = [1, 2, 3]
    print(f" weighted_mean: {weighted_mean(scores, weights):.4f}")

    print("\n--- moving_average ---")
    series = [10, 12, 11, 14, 13, 16, 15, 18]
    ma = moving_average(series, window=3)
    print(f" series: {series}")
    print(f" MA(3): {[round(v, 2) for v in ma]}")

    print("\n--- linear_fit ---")
    x = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
    y = [2.1, 3.9, 6.2, 7.8, 10.1, 11.9, 14.0, 16.2, 18.1, 20.0]
    result = linear_fit(x, y)
    print(f" {result}")
    print(f" predict(11): {result.predict(11):.2f}")
    print(f" strength: {correlation_strength(result.r)}")
    print("\n=== done ===")
For the numpy alternative — numpy provides vectorized array operations with np.mean(), np.std(), np.percentile(), np.corrcoef(), and np.polyfit() at C speed across large arrays, but adds a ~20 MB dependency; statistics module uses exact rational arithmetic (no floating-point rounding for integer inputs) and requires no installation — use numpy for large datasets (>10k elements), matrix operations, or FFT pipelines where vectorization matters, statistics for small datasets, server-side summaries with zero dependencies, and anywhere exact integer arithmetic is preferable. For the scipy.stats alternative — scipy.stats extends beyond descriptive statistics to inferential testing: t-tests, chi-square, Mann-Whitney, ANOVA, KDE, confidence intervals, and distribution fitting with 100+ distributions; statistics.NormalDist covers z-scores, CDF, and overlap for the normal distribution only — use scipy.stats for hypothesis testing, goodness-of-fit, or multi-distribution modeling in data science notebooks, statistics for production services needing lightweight summaries without a SciPy install. The Claude Skills 360 bundle includes statistics skill sets covering Summary dataclass with from_data() factory, describe()/five_number_summary()/outliers_iqr() summary helpers, z_scores()/percentile_rank()/normal_ci()/distribution_overlap() NormalDist utilities, geometric_mean()/harmonic_mean()/weighted_mean()/moving_average() rate and average helpers, and linear_fit()/RegressionResult/correlation_strength() regression tools. Start with the free tier to try descriptive analytics and statistics pipeline code generation.