statsmodels provides rigorous statistical inference for regression and time series. pip install statsmodels. import statsmodels.api as sm. import statsmodels.formula.api as smf. OLS: model = sm.OLS(y, sm.add_constant(X)), results = model.fit(), print(results.summary()) — R² F-stat t-tests p-values CI. Formula API: results = smf.ols("y ~ x1 + x2 + C(category)", data=df).fit(). Robust SE: results = model.fit(cov_type="HC3") — heteroscedasticity-robust. Logit: smf.logit("bought ~ price + age + C(gender)", data=df).fit(), results.get_margeff(). ARIMA: from statsmodels.tsa.arima.model import ARIMA, model = ARIMA(y, order=(p,d,q)).fit(), forecast = model.forecast(steps=12). SARIMA: ARIMA(y, order=(1,1,1), seasonal_order=(1,1,1,12)). ACF/PACF: from statsmodels.graphics.tsaplots import plot_acf, plot_pacf. ADF test: from statsmodels.tsa.stattools import adfuller, stat, p, *_ = adfuller(series). Holt-Winters: from statsmodels.tsa.holtwinters import ExponentialSmoothing, model = ExponentialSmoothing(y, trend="add", seasonal="add", seasonal_periods=12).fit(). VAR: from statsmodels.tsa.vector_ar.var_model import VAR, var = VAR(df_multi).fit(maxlags=12). GLM Poisson: sm.GLM(y, X, family=sm.families.Poisson()).fit(). QuantReg: smf.quantreg("y ~ x", data=df).fit(q=0.5). Claude Code generates OLS regression analysis, ARIMA forecasters, logistic regression models, and time series diagnostic pipelines.
CLAUDE.md for statsmodels
## statsmodels Stack
- Version: statsmodels >= 0.14
- API: sm.OLS/GLM/Logit(y, X).fit() | smf.ols/logit("y ~ x + C(cat)", df).fit()
- Robust SE: model.fit(cov_type="HC3") for heteroscedasticity
- Summary: results.summary() → R², F-stat, t-tests, p-values, 95% CI
- ARIMA: ARIMA(y, order=(p,d,q), seasonal_order=(P,D,Q,m)).fit()
- HW: ExponentialSmoothing(y, trend, seasonal, seasonal_periods).fit()
- VAR: VAR(df_multivariate).fit(maxlags=12) → forecast, irf, fevd
- Tests: adfuller(y) | kpss(y) | het_breuschpagan(resid, X)
statsmodels Statistical Modeling Pipeline
# ml/statsmodels_pipeline.py — statistical inference and time series modeling
from __future__ import annotations
import warnings
import numpy as np
import pandas as pd
from pathlib import Path
import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.holtwinters import ExponentialSmoothing, SimpleExpSmoothing
from statsmodels.tsa.vector_ar.var_model import VAR
from statsmodels.tsa.stattools import adfuller, kpss, acf, pacf
from statsmodels.stats.stattools import durbin_watson, jarque_bera
from statsmodels.stats.diagnostic import het_breuschpagan, acorr_ljungbox
from statsmodels.stats.outliers_influence import variance_inflation_factor
warnings.filterwarnings("ignore")
# ── 1. OLS regression ─────────────────────────────────────────────────────────
def fit_ols(
    y: np.ndarray | pd.Series,
    X: np.ndarray | pd.DataFrame,
    add_const: bool = True,
    cov_type: str = "nonrobust",  # "HC3" for robust, "HAC" for autocorrelated
    missing: str = "drop",
) -> sm.regression.linear_model.RegressionResultsWrapper:
    """Fit an OLS regression and return the fitted results object.

    cov_type options:
      - "nonrobust": classical OLS SE (assumes homoscedastic errors)
      - "HC3": White's heteroscedasticity-robust SE (use by default)
      - "HAC": Newey-West, autocorrelation + heteroscedasticity robust
      - "cluster": cluster-robust SE (add groups=cluster_var)
    """
    # Prepend an intercept column unless the caller supplies their own design.
    design = sm.add_constant(X) if add_const else X
    return sm.OLS(y, design, missing=missing).fit(cov_type=cov_type)
def fit_ols_formula(
    formula: str,
    data: pd.DataFrame,
    cov_type: str = "HC3",
) -> object:
    """Fit an OLS model from an R-style formula.

    Examples:
      - "price ~ sqft + bedrooms + C(neighborhood) + sqft:bedrooms"
      - "y ~ np.log(x1) + x2 + I(x2**2)"
      - "sales ~ Q('a column with spaces') + time + time:campaign"
    """
    model = smf.ols(formula, data=data)
    return model.fit(cov_type=cov_type)
def regression_summary(results) -> dict:
    """Condense fitted-regression results into a plain dict of key statistics.

    F statistics are reported as None when statsmodels yields NaN for them
    (e.g. degenerate designs), so the dict stays JSON-serializable.
    """
    fval = results.fvalue
    fp = results.f_pvalue
    summary = {
        "r_squared": round(results.rsquared, 4),
        "adj_r2": round(results.rsquared_adj, 4),
        "f_stat": None if np.isnan(fval) else round(fval, 4),
        "f_pvalue": None if np.isnan(fp) else round(fp, 6),
        "aic": round(results.aic, 4),
        "bic": round(results.bic, 4),
        "n_obs": int(results.nobs),  # nobs is a float in statsmodels
        "coefs": results.params.to_dict(),
        "pvalues": results.pvalues.to_dict(),
        "conf_int_95": results.conf_int().to_dict(),
    }
    return summary
# ── 2. GLM ────────────────────────────────────────────────────────────────────
def fit_glm(
    y: np.ndarray | pd.Series,
    X: np.ndarray | pd.DataFrame,
    family: str = "poisson",  # "poisson" | "binomial" | "negativebinomial" | "gamma" | "gaussian"
    add_const: bool = True,
    cov_type: str = "HC3",
) -> object:
    """Fit a Generalized Linear Model.

    family presets:
      - poisson: count data (overdispersion -> use negativebinomial)
      - binomial: proportions / binary with known n (logit link default)
      - negativebinomial: overdispersed count data
      - gamma: positive continuous with multiplicative effects
      - gaussian: identity link, OLS-equivalent

    Raises:
      KeyError: when `family` is not one of the presets. Validated up front
        so the message names the valid choices instead of a bare missing key,
        and no family objects are constructed for a doomed call.
    """
    valid_families = ("poisson", "binomial", "negativebinomial", "gamma", "gaussian")
    if family not in valid_families:
        raise KeyError(f"unknown family {family!r}; choose one of {valid_families}")
    # Classes, not instances: only instantiate the family actually requested.
    family_map = {
        "poisson": sm.families.Poisson,
        "binomial": sm.families.Binomial,
        "negativebinomial": sm.families.NegativeBinomial,
        "gamma": sm.families.Gamma,
        "gaussian": sm.families.Gaussian,
    }
    if add_const:
        X = sm.add_constant(X)
    return sm.GLM(y, X, family=family_map[family]()).fit(cov_type=cov_type)
# ── 3. Logistic regression ────────────────────────────────────────────────────
def fit_logit(
    formula: str,
    data: pd.DataFrame,
    method: str = "bfgs",
) -> object:
    """Binary logistic regression via the formula API.

    Call .get_margeff() on the returned results for average marginal
    effects (AME); disp=False suppresses the optimizer convergence chatter.
    """
    model = smf.logit(formula, data=data)
    return model.fit(method=method, maxiter=200, disp=False)
def marginal_effects(logit_results, at: str = "overall") -> pd.DataFrame:
    """Compute marginal effects from a fitted logit/probit model.

    at: "overall" (average marginal effect), "mean" (at mean X), "median"
    """
    effects = logit_results.get_margeff(at=at)
    table = {"dy/dx": effects.margeff, "p_value": effects.pvalues}
    return pd.DataFrame(table)
# ── 4. Time series stationarity checks ───────────────────────────────────────
def check_stationarity(
    series: pd.Series | np.ndarray,
    alpha: float = 0.05,
    regression: str = "ct",  # "c" constant, "ct" const+trend, "n" none
) -> dict:
    """Test for a unit root with ADF and KPSS and interpret the pair jointly.

    ADF  H0: series has a unit root (non-stationary) -> reject if p < alpha
    KPSS H0: series is stationary                    -> reject if p < alpha

    Note: statsmodels >= 0.12 spells the no-deterministic-term ADF option
    "n"; the legacy "nc" spelling is normalized here for back-compat.
    """
    if regression == "nc":  # legacy alias removed from adfuller
        regression = "n"
    adf_stat, adf_p, adf_lags, *_ = adfuller(series, regression=regression, autolag="AIC")
    # KPSS only supports "c" (level) and "ct" (trend). Match the ADF spec:
    # include the trend term only when the ADF regression also has one.
    # (The previous mapping keyed on "c", wrongly sending ADF "c" to KPSS "ct".)
    kpss_reg = "ct" if "t" in regression else "c"
    kpss_stat, kpss_p, kpss_lags, _ = kpss(series, regression=kpss_reg)
    adf_reject = adf_p < alpha
    kpss_reject = kpss_p < alpha
    # Joint interpretation of the two complementary tests.
    if adf_reject and not kpss_reject:
        verdict = "STATIONARY (ADF rejects unit root, KPSS accepts)"
    elif not adf_reject and kpss_reject:
        verdict = "NON-STATIONARY (unit root present, difference required)"
    elif adf_reject and kpss_reject:
        verdict = "UNCERTAIN (possible structural break)"
    else:
        verdict = "WEAKLY STATIONARY (check visually)"
    return {
        "adf_stat": round(adf_stat, 4),
        "adf_p": round(adf_p, 4),
        "kpss_stat": round(kpss_stat, 4),
        "kpss_p": round(kpss_p, 4),
        "verdict": verdict,
    }
def select_arima_orders(
    series: pd.Series,
    max_p: int = 5,
    max_q: int = 5,
    d: int | None = None,
) -> tuple[int, int, int]:
    """Select ARIMA(p,d,q) orders via an AIC grid search.

    If d is None, the differencing order is chosen from an ADF test:
    d=0 when the level series already rejects a unit root, else d=1.
    Candidate models that fail to fit are skipped.
    """
    if d is None:
        adf_p = adfuller(series)[1]
        d = 0 if adf_p < 0.05 else 1
    best_aic = float("inf")
    best_order = (1, d, 1)  # sensible fallback if every candidate fails
    for p in range(max_p + 1):
        for q in range(max_q + 1):
            # Keep only the fit call inside try: the original also swallowed
            # errors from the AIC comparison itself.
            try:
                fitted = ARIMA(series, order=(p, d, q)).fit()
            except Exception:
                # Some (p, q) combinations are numerically unstable; skip.
                continue
            if fitted.aic < best_aic:
                best_aic = fitted.aic
                best_order = (p, d, q)
    print(f"Best AIC={best_aic:.2f} | order={best_order}")
    return best_order
# ── 5. ARIMA / SARIMA ─────────────────────────────────────────────────────────
def fit_arima(
    series: pd.Series,
    order: tuple[int, int, int] = (1, 1, 1),
    seasonal_order: tuple[int, int, int, int] = None,
    trend: str = "n",  # "n" none, "c" const, "t" time, "ct" both
) -> object:
    """Fit an ARIMA (or SARIMA, when seasonal_order is given) model.

    Stationarity/invertibility enforcement is disabled so the optimizer
    is free to explore near-unit-root parameterizations.
    """
    spec = dict(
        order=order,
        seasonal_order=seasonal_order or (0, 0, 0, 0),
        trend=trend,
        enforce_stationarity=False,
        enforce_invertibility=False,
    )
    return SARIMAX(series, **spec).fit(disp=False)
def forecast_arima(
    results,
    steps: int = 12,
    alpha: float = 0.05,
) -> pd.DataFrame:
    """Forecast `steps` ahead from fitted ARIMA results.

    Returns a DataFrame with columns forecast / lower / upper, where the
    bounds come from the (1 - alpha) confidence interval.
    """
    prediction = results.get_forecast(steps=steps)
    interval = prediction.conf_int(alpha=alpha)
    frame = pd.DataFrame()
    frame["forecast"] = prediction.predicted_mean.values
    frame["lower"] = interval.iloc[:, 0].values
    frame["upper"] = interval.iloc[:, 1].values
    return frame
# ── 6. Exponential smoothing (Holt-Winters) ───────────────────────────────────
def fit_holt_winters(
    series: pd.Series,
    trend: str = "add",  # "add" | "mul" | None
    seasonal: str = "add",  # "add" | "mul" | None
    seasonal_periods: int = 12,
    damped_trend: bool = True,
    optimized: bool = True,
) -> object:
    """Fit Holt-Winters triple exponential smoothing; return fitted results.

    damped_trend=True avoids over-extrapolating trends. It is ignored when
    trend is None, because statsmodels raises if damping is requested without
    a trend component. (Return annotation fixed: this returns the fitted
    results object, not the unfitted ExponentialSmoothing model.)
    """
    model = ExponentialSmoothing(
        series,
        trend=trend,
        seasonal=seasonal,
        seasonal_periods=seasonal_periods,
        # Damping is only valid when a trend component exists.
        damped_trend=damped_trend and trend is not None,
    )
    return model.fit(optimized=optimized)
def forecast_hw(results, steps: int = 12) -> np.ndarray:
    """Forecast `steps` future values from fitted Holt-Winters results."""
    predictions = results.forecast(steps=steps)
    return predictions.values
# ── 7. VAR (Vector Autoregression) ───────────────────────────────────────────
def fit_var(
    df: pd.DataFrame,
    maxlags: int = 12,
    ic: str = "aic",  # "aic" | "bic" | "hqic" | "fpe"
) -> object:
    """Fit a VAR model for multivariate time series.

    Each column of `df` is one series; the lag order is selected by the
    information criterion `ic`, searching up to `maxlags` lags.
    """
    fitted = VAR(df).fit(maxlags=maxlags, ic=ic)
    print(f"VAR selected lag order: {fitted.k_ar}")
    return fitted
def var_forecast(results, steps: int = 12) -> pd.DataFrame:
    """Generate an out-of-sample VAR forecast as a DataFrame.

    Seeds the forecast with the last k_ar rows of the modeled data via
    results.endog — the legacy `results.y` alias used previously is
    deprecated/removed in newer statsmodels releases.
    """
    lag_order = results.k_ar
    seed = np.asarray(results.endog)[-lag_order:]
    forecast = results.forecast(seed, steps=steps)
    return pd.DataFrame(forecast, columns=results.names)
# ── 8. Regression diagnostics ─────────────────────────────────────────────────
def compute_vif(X: pd.DataFrame) -> pd.DataFrame:
    """Variance Inflation Factor per feature, sorted worst-first.

    VIF > 10 is the usual rule of thumb for problematic multicollinearity.
    """
    matrix = X.values
    rows = [
        {"feature": col, "VIF": variance_inflation_factor(matrix, idx)}
        for idx, col in enumerate(X.columns)
    ]
    return pd.DataFrame(rows).sort_values("VIF", ascending=False)
def run_diagnostics(results) -> dict:
    """Run standard residual diagnostics on a fitted OLS regression.

    Returns Durbin-Watson (autocorrelation), Ljung-Box p-value,
    Breusch-Pagan (heteroscedasticity), and Jarque-Bera normality p-value.
    """
    resid = results.resid
    X = results.model.exog
    dw = durbin_watson(resid)
    lb = acorr_ljungbox(resid, lags=[10], return_df=True)
    bp = het_breuschpagan(resid, X)
    # BUG FIX: RegressionResults has no test_normality() — that method belongs
    # to state-space results, so the old code raised AttributeError. Run the
    # Jarque-Bera test on the residuals directly instead.
    jb_stat, jb_p, _skew, _kurt = jarque_bera(resid)
    return {
        "durbin_watson": round(dw, 4),  # ~2 = no autocorrelation
        "ljung_box_p": round(float(lb["lb_pvalue"].values[0]), 4),
        "breusch_pagan_lm": round(bp[0], 4),
        "breusch_pagan_p": round(bp[1], 4),  # < 0.05 → heteroscedastic
        "jarque_bera_p": round(float(jb_p), 4),  # < 0.05 → non-normal residuals
    }
# ── Demo ──────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    print("statsmodels Demo")
    print("=" * 50)

    # --- OLS regression on synthetic data --------------------------------
    np.random.seed(42)
    n_obs = 200
    feat_a = np.random.normal(0, 1, n_obs)
    feat_b = np.random.normal(0, 1, n_obs)
    target = 2 + 3 * feat_a - 1.5 * feat_b + np.random.normal(0, 0.5, n_obs)
    frame = pd.DataFrame({"y": target, "x1": feat_a, "x2": feat_b})
    ols_res = fit_ols_formula("y ~ x1 + x2", data=frame)
    print("\nOLS Results:")
    print(ols_res.summary().tables[1])
    summary = regression_summary(ols_res)
    print(f"\nR²={summary['r_squared']} AIC={summary['aic']}")
    checks = run_diagnostics(ols_res)
    print(f"Diagnostics: DW={checks['durbin_watson']} BP-p={checks['breusch_pagan_p']}")

    # --- Time series: stationarity, ARIMA, Holt-Winters ------------------
    idx = pd.date_range("2018-01", periods=60, freq="MS")
    ts = pd.Series(100 + 0.5 * np.arange(60) + np.random.normal(0, 3, 60), index=idx)
    print("\nStationarity check:")
    report = check_stationarity(ts)
    print(f" {report['verdict']}")
    print(f" ADF p={report['adf_p']:.4f} KPSS p={report['kpss_p']:.4f}")

    # ARIMA fit + 12-step forecast
    arima_order = (1, 1, 1)
    arima_res = fit_arima(ts, order=arima_order)
    horizon = forecast_arima(arima_res, steps=12)
    print(f"\nARIMA{arima_order} 12-step forecast: {horizon['forecast'].values.round(1)}")

    # Holt-Winters (trend only, no seasonality) 6-step forecast
    hw_res = fit_holt_winters(ts, trend="add", seasonal=None)
    hw_fc = forecast_hw(hw_res, steps=6)
    print(f"Holt-Winters 6-step: {hw_fc.round(1)}")
Choose the scikit-learn alternative when you need predictive performance rather than statistical inference — sklearn models optimize predictive metrics, while statsmodels' OLS.summary() outputs p-values, confidence intervals, and diagnostic tests critical for hypothesis-driven analysis, causal inference, econometric reporting, and academic research where coefficient interpretation matters as much as prediction accuracy. Choose the Prophet alternative when forecasting univariate business time series — Prophet handles complex multi-seasonality with holiday effects, while statsmodels' SARIMAX provides strict statistical tests (ADF, KPSS, Ljung-Box, Breusch-Pagan) for validating model assumptions, and its VAR model for multivariate series captures inter-variable lead–lag relationships that Prophet's additive decomposition misses by design. The Claude Skills 360 bundle includes statsmodels skill sets covering OLS and formula-API regression, robust standard errors, logit marginal effects, GLM families, ARIMA/SARIMA order selection, Holt-Winters smoothing, VAR multivariate forecasting, stationarity tests, and regression diagnostics. Start with the free tier to try statistical modeling code generation.