pytest-benchmark measures Python performance in tests. pip install pytest-benchmark. Fixture: def test_sort(benchmark): result = benchmark(sorted, [3,1,2]); assert result == [1,2,3]. Lambda: benchmark(lambda: my_func(arg)). Setup: benchmark.pedantic(func, args=(a,b), setup=setup_fn, rounds=100, iterations=10). Group: @pytest.mark.benchmark(group="sorting"). Run: pytest --benchmark-only. Skip in normal runs: pytest --benchmark-skip. Storage: pytest --benchmark-autosave — saves JSON in .benchmarks/. Compare: pytest --benchmark-compare — compare with last saved. --benchmark-compare=0001 — compare with specific run. --benchmark-compare-fail=mean:5% — fail if mean regresses by >5%. Warmup: benchmark.warmup_rounds (default 1). rounds: number of rounds (default auto). timer: time.perf_counter (default). disable_gc=True — disable garbage collector during timing. calibrate_timer=True. Histogram: pytest --benchmark-histogram — generates PNG. JSON: pytest --benchmark-json=output.json. Parametrize: @pytest.mark.parametrize("fn", [fn1, fn2]). def test_compare(benchmark, fn): benchmark(fn, data). Min/mean/stddev/ops in output. benchmark.stats — access stats in test. benchmark.stats.mean, benchmark.stats.min, benchmark.stats.stddev. CI: export PYTHONHASHSEED=0 for reproducibility. Store .benchmarks/ in git. --benchmark-compare-fail gates CI. Claude Code generates pytest-benchmark fixtures, pedantic setups, and compare-fail thresholds for CI.
CLAUDE.md for pytest-benchmark
## pytest-benchmark Stack
- Version: pytest-benchmark >= 4.0 | pip install pytest-benchmark
- Fixture: def test_fn(benchmark): result = benchmark(func, *args, **kwargs)
- Pedantic: benchmark.pedantic(fn, args=(...), setup=setup_fn, rounds=N)
- Group: @pytest.mark.benchmark(group="name") — group related benchmarks
- CI: pytest --benchmark-autosave && pytest --benchmark-compare-fail=mean:5%
- Skip: pytest --benchmark-skip | pytest --benchmark-only for bench-only runs
- Stats: benchmark.stats.mean | .min | .max | .stddev after test runs
pytest-benchmark Performance Testing Pipeline
# tests/test_benchmarks.py — pytest-benchmark patterns
from __future__ import annotations

import json
import re
import time
from collections import defaultdict
from functools import lru_cache
from typing import Any

import pytest
# ─────────────────────────────────────────────────────────────────────────────
# Functions under benchmark
# ─────────────────────────────────────────────────────────────────────────────
# Sorting implementations
def bubble_sort(arr: list[int]) -> list[int]:
    """Return a sorted copy of *arr* via bubble sort.

    Quadratic on purpose — serves as the slow baseline in the sort benchmarks,
    so no early-exit optimisation is applied.
    """
    out = list(arr)
    size = len(out)
    for done in range(size):
        # After each pass the largest remaining value has bubbled to the end.
        for idx in range(size - done - 1):
            if out[idx] > out[idx + 1]:
                out[idx], out[idx + 1] = out[idx + 1], out[idx]
    return out
def insertion_sort(arr: list[int]) -> list[int]:
    """Return a sorted copy of *arr* via insertion sort.

    O(n^2) worst case but fast on small / nearly-sorted inputs — the middle
    contender in the sort benchmarks.
    """
    out = list(arr)
    for pos in range(1, len(out)):
        current = out[pos]
        gap = pos
        # Shift larger elements right until current's slot is found.
        while gap > 0 and out[gap - 1] > current:
            out[gap] = out[gap - 1]
            gap -= 1
        out[gap] = current
    return out
# Serialization implementations
def slow_json_build(records: list[dict]) -> str:
    """Serialise each record separately, then splice the pieces into an array.

    Deliberately inefficient baseline for the json-build benchmark group.
    """
    encoded = [json.dumps(record) for record in records]
    return "[" + ",".join(encoded) + "]"
def fast_json_build(records: list[dict]) -> str:
    """Serialise all records in one json.dumps call (fast contender)."""
    return json.dumps(records)
# Regex implementations
def slow_email_validate(emails: list[str]) -> list[bool]:
    """Validate addresses, handing the raw pattern string to re.match each time.

    Baseline for the email-validation benchmark: the pattern is looked up in
    re's internal cache on every call instead of being pre-compiled.
    """
    pattern = r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$"
    flags = []
    for address in emails:
        flags.append(re.match(pattern, address) is not None)
    return flags
# Compiled once at import time — the fast contender reuses this pattern.
_EMAIL_RE = re.compile(r"^[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+$")
def fast_email_validate(emails: list[str]) -> list[bool]:
    """Validate addresses with the pre-compiled module-level pattern."""
    matcher = _EMAIL_RE.match
    return [matcher(address) is not None for address in emails]
# Fibonacci implementations
def fib_recursive(n: int) -> int:
    """Naive recursive Fibonacci — exponential time, the benchmark worst case."""
    return n if n <= 1 else fib_recursive(n - 1) + fib_recursive(n - 2)
def fib_iterative(n: int) -> int:
    """Linear-time Fibonacci via a rolling pair of values."""
    prev, curr = 0, 1
    for _ in range(n):
        prev, curr = curr, prev + curr
    return prev
@lru_cache(maxsize=None)
def fib_memoized(n: int) -> int:
    """Recursive Fibonacci with an unbounded memo cache.

    First call is linear in n; repeat calls are O(1) cache hits — the
    benchmark clears the cache per round to measure the cold path.
    """
    return n if n <= 1 else fib_memoized(n - 1) + fib_memoized(n - 2)
# Data processing
def process_records_dict(records: list[dict]) -> dict[str, list]:
    """Group records by their "category" key using dict.setdefault.

    Records missing the key fall into the "unknown" bucket.
    """
    grouped: dict[str, list] = {}
    for record in records:
        grouped.setdefault(record.get("category", "unknown"), []).append(record)
    return grouped
def process_records_defaultdict(records: list[dict]) -> dict[str, list]:
    """Group records by their "category" key using collections.defaultdict.

    Records missing the key fall into the "unknown" bucket; the result is
    converted back to a plain dict so both group-by contenders return the
    same type.

    Note: the defaultdict import is hoisted to module level — importing
    inside a benchmarked function pays import-machinery overhead on every
    call, which would skew the comparison against process_records_dict.
    """
    result: dict[str, list] = defaultdict(list)
    for r in records:
        result[r.get("category", "unknown")].append(r)
    return dict(result)
# ─────────────────────────────────────────────────────────────────────────────
# Fixtures — shared test data
# ─────────────────────────────────────────────────────────────────────────────
@pytest.fixture
def small_list() -> list[int]:
    """100 pseudo-random ints with a fixed seed, so every run sorts identical data."""
    import random
    gen = random.Random(42)
    return [gen.randint(0, 1000) for _ in range(100)]
@pytest.fixture
def medium_list() -> list[int]:
    """1000 pseudo-random ints with a fixed seed (reproducible benchmark input)."""
    import random
    gen = random.Random(42)
    return [gen.randint(0, 1000) for _ in range(1_000)]
@pytest.fixture
def email_list() -> list[str]:
    """500 syntactically valid addresses — every one should pass validation."""
    return [f"user{n}@example.com" for n in range(500)]
@pytest.fixture
def records() -> list[dict]:
    """1000 product dicts cycling deterministically through five categories."""
    categories = ["Electronics", "Clothing", "Books", "Home", "Sports"]
    out: list[dict] = []
    for i in range(1_000):
        out.append(
            {"id": i, "name": f"Product {i}", "category": categories[i % 5], "price": i * 0.99}
        )
    return out
# ─────────────────────────────────────────────────────────────────────────────
# 1. Basic benchmark — callable with args
# ─────────────────────────────────────────────────────────────────────────────
class TestSortBenchmarks:
    """Head-to-head sort benchmarks over the same 100-element fixture list."""

    @pytest.mark.benchmark(group="sort-small")
    def test_builtin_sort(self, benchmark, small_list: list[int]) -> None:
        expected = sorted(small_list)
        assert benchmark(sorted, small_list) == expected

    @pytest.mark.benchmark(group="sort-small")
    def test_insertion_sort(self, benchmark, small_list: list[int]) -> None:
        expected = sorted(small_list)
        assert benchmark(insertion_sort, small_list) == expected

    @pytest.mark.benchmark(group="sort-small")
    def test_bubble_sort(self, benchmark, small_list: list[int]) -> None:
        """Expected to be slowest — demonstrates regression detection value."""
        expected = sorted(small_list)
        assert benchmark(bubble_sort, small_list) == expected
# ─────────────────────────────────────────────────────────────────────────────
# 2. benchmark.pedantic — fine-grained rounds and iterations
# ─────────────────────────────────────────────────────────────────────────────
class TestFibonacci:
    """Compare Fibonacci implementations with pedantic round/iteration control."""

    @pytest.mark.benchmark(group="fibonacci")
    def test_iterative(self, benchmark) -> None:
        result = benchmark.pedantic(fib_iterative, args=(30,), rounds=200, iterations=5)
        assert result == 832040

    @pytest.mark.benchmark(group="fibonacci")
    def test_memoized(self, benchmark) -> None:
        # setup= clears the memo cache before every round so each round times a
        # cold cache.  pytest-benchmark forbids iterations > 1 together with a
        # setup function (iterations would share one setup call), so the
        # original iterations=5 raised at runtime — it must be 1 here.
        result = benchmark.pedantic(fib_memoized, args=(30,), rounds=200, iterations=1,
                                    setup=fib_memoized.cache_clear)
        assert result == 832040

    @pytest.mark.benchmark(group="fibonacci-slow")
    @pytest.mark.slow
    def test_recursive(self, benchmark) -> None:
        """Recursive is exponential — skip in fast test runs with -m 'not slow'."""
        result = benchmark.pedantic(fib_recursive, args=(25,), rounds=5)
        assert result == 75025
# ─────────────────────────────────────────────────────────────────────────────
# 3. Comparing implementations via parametrize
# ─────────────────────────────────────────────────────────────────────────────
@pytest.mark.parametrize("validate_fn", [slow_email_validate, fast_email_validate],
                         ids=["slow", "fast"])
@pytest.mark.benchmark(group="email-validation")
def test_email_validation(benchmark, validate_fn, email_list: list[str]) -> None:
    """Benchmark both validators on the same addresses; all must be valid.

    The original parametrization carried an unused ``label`` value alongside
    each function — ids are supplied explicitly, so the dead parameter is
    dropped.
    """
    result = benchmark(validate_fn, email_list)
    assert len(result) == len(email_list)
    assert all(r is True for r in result)
@pytest.mark.parametrize("build_fn", [slow_json_build, fast_json_build],
                         ids=["slow", "fast"])
@pytest.mark.benchmark(group="json-build")
def test_json_build(benchmark, build_fn, records: list[dict]) -> None:
    """Both builders must emit parseable JSON covering all 100 records."""
    payload = benchmark(build_fn, records[:100])
    assert len(json.loads(payload)) == 100
# ─────────────────────────────────────────────────────────────────────────────
# 4. Data structure comparison
# ─────────────────────────────────────────────────────────────────────────────
@pytest.mark.parametrize("process_fn", [
    process_records_dict,
    process_records_defaultdict,
], ids=["dict-setdefault", "defaultdict"])
@pytest.mark.benchmark(group="groupby")
def test_group_by_category(benchmark, process_fn, records: list[dict]) -> None:
    """Grouping must include a known category and cover every record.

    Counts group sizes directly — the original ``sum(result.values(), [])``
    concatenated every group into a quadratic throwaway list just to take
    its length.
    """
    result = benchmark(process_fn, records)
    assert "Electronics" in result
    assert sum(len(group) for group in result.values()) == len(records)
# ─────────────────────────────────────────────────────────────────────────────
# 5. Accessing stats in assertions
# ─────────────────────────────────────────────────────────────────────────────
def test_sort_stats(benchmark, medium_list: list[int]) -> None:
    """Run sorted() under benchmark, then gate on the collected statistics."""
    benchmark(sorted, medium_list)
    stats = benchmark.stats
    # NOTE(review): assumes mean/stddev are reachable as benchmark.stats.mean —
    # some pytest-benchmark versions expose them at benchmark.stats.stats.*;
    # confirm against the installed version.
    # Sorting 1000 integers should average well under one millisecond.
    assert stats.mean < 0.001, f"Mean {stats.mean:.6f}s exceeds 1ms threshold"
    # Coefficient of variation (stddev / mean) below 50% => stable measurement.
    if stats.mean > 0:
        cv = stats.stddev / stats.mean
        assert cv < 0.5, f"High measurement variance: CV={cv:.2%}"
def test_json_dumps_stats(benchmark, records: list[dict]) -> None:
    """Serialising 50 small dicts must stay under a 500µs mean budget."""
    benchmark(json.dumps, records[:50])
    assert benchmark.stats.mean < 0.0005
For the timeit alternative — timeit.timeit("sorted([3,1,2])", number=100_000) gives you a raw number but no statistics — no standard deviation, no outlier detection, no warmup — and the result varies across machines and Python versions with no stored baseline for comparison, while pytest-benchmark automatically runs warmup rounds, computes mean/median/stddev/IQR, stores results as JSON in .benchmarks/, and --benchmark-compare-fail=mean:10% fails CI when a commit regresses the mean by more than 10% relative to the stored baseline. For the cProfile / line_profiler alternative — profilers tell you where time is spent in a single run, while pytest-benchmark measures how much time a specific function takes across many runs with statistical confidence — the two are complementary: use pytest-benchmark’s regression gate to detect regressions in CI, and use cProfile when a benchmark fails to identify the hot path to optimize. The Claude Skills 360 bundle includes pytest-benchmark skill sets covering benchmark fixture with callable, benchmark.pedantic for rounds/iterations/setup, @pytest.mark.benchmark group annotation, parametrize for head-to-head implementation comparisons, benchmark.stats.mean/stddev post-run assertions, --benchmark-autosave for baseline storage, --benchmark-compare and --benchmark-compare-fail for CI gating, disable_gc for stable measurements, JSON and histogram output, and pytest fixture integration for pre-built test data. Start with the free tier to try performance benchmarking code generation.