Python’s timeit module measures execution time of small code snippets, disabling garbage collection during runs for consistent results. import timeit. timeit: timeit.timeit(stmt, setup="", number=1000000, globals=None) → float total seconds; divide by number for per-call time. repeat: timeit.repeat(stmt, setup="", number=1000000, repeat=5) → list of floats (one per repeat); best practice: take min(). Timer: t = timeit.Timer(stmt, setup="", globals=globals()) — reusable timer; t.timeit(number), t.repeat(repeat, number), t.autorange() → (number, total_time). default_timer: timeit.default_timer() → current time via time.perf_counter. Disable GC: timeit temporarily disables GC during runs (call gc.enable() in setup if you want GC active). globals: pass globals=globals() to make module-level names available in statement strings. CLI: python -m timeit -n 10000 -r 5 "x = [i*2 for i in range(100)]". Best practice: always take min(results) — the minimum represents the run with least background noise; mean / stdev are useful for characterizing variance. Claude Code generates function comparison tables, optimization before-and-after reports, algorithm complexity estimators, and CI performance regression checks.
CLAUDE.md for timeit
## timeit Stack
- Stdlib: import timeit
- Quick: timeit.timeit(lambda: fn(), number=10_000)
- Best: min(timeit.repeat(lambda: fn(), number=1000, repeat=7))
- Timer: t = timeit.Timer(lambda: fn()); t.autorange()
- µs/op: time_s / number * 1e6 # convert to microseconds per call
- GC: Timer(stmt, setup="import gc; gc.enable()") # if GC matters
## timeit Micro-Benchmark Pipeline
# app/benchutil.py — benchmark, compare, sweep, regression, report
from __future__ import annotations
import gc
import math
import statistics
import timeit
from dataclasses import dataclass, field
from typing import Any, Callable
# ─────────────────────────────────────────────────────────────────────────────
# 1. Core timing helpers
# ─────────────────────────────────────────────────────────────────────────────
def time_fn(fn: Callable, number: int = 0, warmup: int = 3) -> float:
    """
    Time fn() and return seconds per call.
    If number=0, uses Timer.autorange() to choose automatically.
    Example:
        t = time_fn(lambda: sum(range(1000)))
        print(f"{t*1e6:.2f} µs per call")
    """
    # Prime bytecode/CPU/allocator caches before the measured runs.
    for _ in range(warmup):
        fn()
    clock = timeit.Timer(fn)
    if not number:
        # Let timeit choose a loop count big enough for a stable total.
        loops, elapsed = clock.autorange()
        return elapsed / loops
    return clock.timeit(number) / number
def time_µs(fn: Callable, number: int = 0) -> float:
    """
    Return microseconds per call for fn().
    Example:
        print(f"{time_µs(lambda: [x*2 for x in range(100)]):.2f} µs")
    """
    seconds_per_call = time_fn(fn, number=number)
    return seconds_per_call * 1e6
@dataclass
class TimingResult:
    """One benchmark measurement: raw repeat samples plus summary statistics."""

    label: str
    number: int            # loop count used for each repeat sample
    samples: list[float]   # one total-time per repeat
    per_call: float        # best (min) seconds per call
    mean_µs: float
    min_µs: float
    max_µs: float
    stdev_µs: float

    @classmethod
    def measure(
        cls,
        fn: Callable,
        label: str = "",
        number: int = 0,
        repeat: int = 7,
    ) -> "TimingResult":
        """
        Benchmark fn() using repeat runs; return a TimingResult.
        Example:
            r = TimingResult.measure(lambda: sorted(range(100)), label="sorted")
            print(r)
        """
        name = label or getattr(fn, "__name__", "fn")
        clock = timeit.Timer(fn)
        if not number:
            # Auto-choose a loop count, then round up to a power of 10
            # for clean per-call arithmetic.
            number, _ = clock.autorange()
            number = 10 ** math.ceil(math.log10(number)) if number > 1 else 10
        totals = clock.repeat(repeat, number)
        secs = [t / number for t in totals]
        spread_µs = statistics.stdev(secs) * 1e6 if len(secs) > 1 else 0.0
        return cls(
            label=name,
            number=number,
            samples=totals,
            per_call=min(secs),  # min is the least-noise estimate
            mean_µs=statistics.mean(secs) * 1e6,
            min_µs=min(secs) * 1e6,
            max_µs=max(secs) * 1e6,
            stdev_µs=spread_µs,
        )

    def __str__(self) -> str:
        pieces = [
            f"{self.label:30s}: ",
            f"best={self.min_µs:8.2f}µs ",
            f"mean={self.mean_µs:8.2f}µs ",
            f"stdev={self.stdev_µs:6.2f}µs ",
            f"n={self.number}",
        ]
        return "".join(pieces)
# ─────────────────────────────────────────────────────────────────────────────
# 2. Comparison and ranking
# ─────────────────────────────────────────────────────────────────────────────
def compare(
    fns: dict[str, Callable],
    number: int = 0,
    repeat: int = 7,
) -> list[TimingResult]:
    """
    Benchmark multiple callables and return results sorted fastest-first.
    Example:
        results = compare({
            "list comp": lambda: [x*2 for x in range(1000)],
            "map": lambda: list(map(lambda x: x*2, range(1000))),
            "for loop": lambda: [x.__mul__(2) for x in range(1000)],
        })
        for r in results:
            print(r)
    """
    measured = []
    for name, fn in fns.items():
        measured.append(
            TimingResult.measure(fn, label=name, number=number, repeat=repeat)
        )
    return sorted(measured, key=lambda r: r.per_call)
def print_comparison(results: list[TimingResult]) -> None:
"""
Print a comparison table with relative speedup vs the fastest.
Example:
print_comparison(compare(fns))
"""
if not results:
return
fastest = results[0].per_call
print(f"{'Function':30s} {'Best µs':>10} {'Mean µs':>10} {'vs fastest':>10}")
print("-" * 68)
for r in results:
ratio = r.per_call / fastest
mark = " ← fastest" if ratio == 1.0 else f" ({ratio:.2f}×)"
print(f"{r.label:30s} {r.min_µs:10.2f} {r.mean_µs:10.2f} {mark}")
# ─────────────────────────────────────────────────────────────────────────────
# 3. Complexity estimation
# ─────────────────────────────────────────────────────────────────────────────
def estimate_complexity(
    fn_factory: Callable[[int], Callable],
    sizes: list[int],
    repeat: int = 5,
) -> list[tuple[int, float]]:
    """
    Measure fn_factory(n)() for each n in sizes.
    Returns [(n, µs_per_call), ...] for plotting or log-log regression.
    Example:
        # test sorted() scaling
        results = estimate_complexity(lambda n: lambda: sorted(range(n)), [10, 100, 1000, 10000])
        for n, µs in results:
            print(f" n={n:6d}: {µs:8.2f} µs")
    """
    return [
        (n, TimingResult.measure(fn_factory(n), label=f"n={n}", repeat=repeat).min_µs)
        for n in sizes
    ]
def infer_big_o(sizes_times: list[tuple[int, float]]) -> str:
    """
    Estimate O(n), O(n log n), or O(n^2) from (size, time) measurements
    by checking which model gives the best log-log slope.
    Example:
        data = estimate_complexity(lambda n: lambda: sorted(range(n)), [100,1000,10000])
        print(infer_big_o(data))  # "O(n log n)"
    """
    if len(sizes_times) < 2:
        return "insufficient data"
    xs = [math.log(n) for n, _ in sizes_times]
    # Clamp times to avoid log(0) on degenerate measurements.
    ys = [math.log(max(t, 1e-9)) for _, t in sizes_times]
    # Least-squares slope on the log-log points.
    count = len(xs)
    numer = count * sum(x * y for x, y in zip(xs, ys)) - sum(xs) * sum(ys)
    denom = count * sum(x * x for x in xs) - sum(xs) ** 2
    slope = numer / denom if denom else 0
    # Walk the slope thresholds from sublinear upward.
    for limit, verdict in (
        (0.75, "O(1) or sublinear"),
        (1.2, "O(n)"),
        (1.65, "O(n log n)"),
        (2.3, "O(n²)"),
    ):
        if slope < limit:
            return verdict
    return f"O(n^{slope:.1f})"
# ─────────────────────────────────────────────────────────────────────────────
# 4. GC-aware benchmarking
# ─────────────────────────────────────────────────────────────────────────────
def time_with_gc(fn: Callable, number: int = 1000, repeat: int = 5) -> TimingResult:
    """
    Benchmark fn() with GC enabled (unlike timeit default which disables GC).

    timeit.Timer.timeit() calls gc.disable() around the timed loop, so merely
    enabling GC *before* measuring has no effect — GC is still off while fn()
    runs. The documented escape hatch is to re-enable GC from the Timer's
    setup, which executes inside the disabled region; passing ``gc.enable``
    as the setup callable does exactly that, so collections triggered by
    fn()'s allocations count toward the measurement.

    Example:
        r = time_with_gc(lambda: [object() for _ in range(100)])
        print(r)
    """
    label = getattr(fn, "__name__", "fn_gc")
    was_enabled = gc.isenabled()
    gc.collect()  # start from a clean heap so the first sample isn't skewed
    timer = timeit.Timer(fn, setup=gc.enable)
    try:
        samples = timer.repeat(repeat, number)
    finally:
        # timeit's own finally only re-enables GC if it was enabled before
        # the call; restore the caller's state unconditionally so the setup
        # trick can't leak an enabled-GC state.
        if was_enabled:
            gc.enable()
        else:
            gc.disable()
    per_sec = [s / number for s in samples]
    return TimingResult(
        label=label,
        number=number,
        samples=samples,
        per_call=min(per_sec),
        mean_µs=statistics.mean(per_sec) * 1e6,
        min_µs=min(per_sec) * 1e6,
        max_µs=max(per_sec) * 1e6,
        stdev_µs=statistics.stdev(per_sec) * 1e6 if len(per_sec) > 1 else 0.0,
    )
# ─────────────────────────────────────────────────────────────────────────────
# 5. Regression guard
# ─────────────────────────────────────────────────────────────────────────────
@dataclass
class PerformanceBaseline:
    """
    Store a performance baseline and check for regressions.
    Example:
        baseline = PerformanceBaseline.record("sum 1k", lambda: sum(range(1000)))
        # later: raise if 2× slower
        baseline.assert_not_regressed(lambda: sum(range(1000)), tolerance=2.0)
    """

    label: str
    baseline_µs: float   # best-of-repeats µs/call captured at record() time

    @classmethod
    def record(cls, label: str, fn: Callable, **kw: Any) -> "PerformanceBaseline":
        """Measure fn now and capture its best per-call time as the baseline."""
        result = TimingResult.measure(fn, label=label, **kw)
        return cls(label=label, baseline_µs=result.min_µs)

    def assert_not_regressed(
        self,
        fn: Callable,
        tolerance: float = 1.5,
        **kw: Any,
    ) -> None:
        """
        Raise AssertionError if fn is more than `tolerance`× slower than baseline.
        Example:
            baseline.assert_not_regressed(new_impl, tolerance=1.2)
        """
        current = TimingResult.measure(fn, label=self.label, **kw)
        ratio = current.min_µs / self.baseline_µs
        if ratio <= tolerance:
            return
        raise AssertionError(
            f"Performance regression in {self.label!r}: "
            f"{current.min_µs:.2f}µs vs baseline {self.baseline_µs:.2f}µs "
            f"({ratio:.2f}× — limit {tolerance}×)"
        )
# ─────────────────────────────────────────────────────────────────────────────
# Demo
# ─────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
print("=== timeit demo ===")
print("\n--- time_µs ---")
for expr, fn in [
("sum(range(1000))", lambda: sum(range(1000))),
("[x*2 for x in r(100)]", lambda: [x * 2 for x in range(100)]),
("sorted(range(1000))", lambda: sorted(range(1000))),
]:
print(f" {expr:35s}: {time_µs(fn):.2f} µs")
print("\n--- TimingResult.measure ---")
r = TimingResult.measure(lambda: sum(range(1000)), label="sum(range(1000))")
print(f" {r}")
print("\n--- compare (list building strategies) ---")
results = compare({
"list comprehension": lambda: [x * 2 for x in range(200)],
"map + list": lambda: list(map(lambda x: x * 2, range(200))),
"append loop": lambda: _append_loop(200),
})
print_comparison(results)
print("\n--- estimate_complexity ---")
data = estimate_complexity(
lambda n: lambda: sorted(range(n)),
sizes=[10, 100, 1000, 5000],
)
for n, µs in data:
print(f" n={n:5d}: {µs:8.2f} µs")
print(f" inferred complexity: {infer_big_o(data)}")
print("\n--- PerformanceBaseline ---")
baseline = PerformanceBaseline.record("sum_baseline", lambda: sum(range(100)))
print(f" baseline: {baseline.baseline_µs:.2f} µs")
try:
# Same function — should not regress
baseline.assert_not_regressed(lambda: sum(range(100)), tolerance=2.0)
print(" regression check: PASSED")
except AssertionError as e:
print(f" regression check: FAILED — {e}")
def _append_loop(n: int) -> list[int]:
result = []
for x in range(n):
result.append(x * 2)
return result
For the time.perf_counter alternative — time.perf_counter() is the underlying primitive that timeit.Timer uses internally; it gives you raw nanosecond-precision timestamps that you manage manually (start/stop/compute difference); timeit adds the loop, garbage-collection disabling, warmup convention, and autorange() on top — use time.perf_counter() for inline performance measurements in application code, lap timers, and rate monitors; use timeit in benchmarking scripts and notebooks where you need a statistically meaningful comparison of two or more implementations with minimal measurement bias. For the pytest-benchmark alternative — pytest-benchmark (PyPI) integrates benchmarking into pytest, generating JSON reports, histogram plots, and automatic regression detection with --benchmark-compare; it manages warmup, calibration, and CI artifact storage; stdlib timeit is zero-dependency and works in any context — use pytest-benchmark for production performance test suites integrated into CI; use timeit for ad-hoc one-off comparisons in scripts and notebooks. The Claude Skills 360 bundle includes timeit skill sets covering time_fn()/time_µs() quick helpers, TimingResult dataclass with measure()/autorange integration and stdev, compare()/print_comparison() multi-function ranking tables, estimate_complexity()/infer_big_o() big-O estimation, time_with_gc() GC-aware measurement, and PerformanceBaseline for regression guard assertions. Start with the free tier to try micro-benchmark patterns and timeit pipeline code generation.