Chaos engineering is the practice of deliberately introducing failures into a system to verify it handles them gracefully — before they happen in production. The goal is to find resilience gaps: services that don’t retry, caches that don’t degrade, health checks that don’t catch the real failure. Claude Code writes chaos experiments, fault injection middleware, circuit breaker implementations, and the runbooks that guide controlled failure tests.
CLAUDE.md for Chaos Engineering Projects
## Chaos Engineering Stack
- Framework: custom Python experiments or Chaos Toolkit
- Fault injection: latency, error rate, resource exhaustion, network partitions
- Environments: staging first, limited production blast radius with feature flags
- Hypothesis format: "When X fails, Y should happen within Z seconds"
- Circuit breakers: resilience4j (JVM), pybreaker (Python), opossum (Node.js)
- Runbooks: every experiment has a documented rollback procedure
- Monitoring: confirm that the Datadog/Prometheus dashboards show the failure before the system recovers
Chaos Experiment Framework
# chaos/experiment.py — typed experiment definition and runner
from dataclasses import dataclass, field
from typing import Callable, Any
from contextlib import contextmanager
import time
import logging
logger = logging.getLogger(__name__)
@dataclass
class Hypothesis:
    """A steady-state expectation about the system, verified by a probe."""

    # Human-readable statement of what is expected to hold.
    description: str
    # Zero-argument check: True means the system is healthy, False degraded.
    probe: Callable[[], bool]
    # Seconds allowed for the system to recover once the fault is removed.
    recovery_timeout_seconds: float = field(default=30.0)
@dataclass
class Experiment:
    """Definition of a single chaos experiment: a fault plus its hypothesis."""

    # Identifier used in log lines and in the reported result.
    name: str
    # Steady-state expectation probed before and after the fault.
    hypothesis: Hypothesis
    # Callable that injects the fault.
    method: Callable
    # Callable that undoes the fault.
    rollback: Callable
    # How long the fault is held before it is rolled back.
    duration_seconds: float = field(default=60.0)
    # Free-form labels; each instance gets its own fresh list.
    tags: list[str] = field(default_factory=list)
@dataclass
class ExperimentResult:
    """Outcome record of one experiment run."""

    # Name of the experiment this result belongs to.
    experiment_name: str
    # Overall verdict for the hypothesis.
    hypothesis_met: bool
    # Probe outcome before the fault was injected.
    steady_state_before: bool
    # Probe outcome after the fault was removed.
    steady_state_after: bool
    # Elapsed run time in seconds.
    duration_seconds: float
    # Chronological notes collected during the run.
    observations: list[str]
def run_experiment(
    experiment: Experiment,
    poll_interval_seconds: float = 5.0,
) -> ExperimentResult:
    """Run one chaos experiment and report whether the system recovered.

    Sequence:
      1. Probe steady state; abort without injecting anything if unhealthy.
      2. Inject the fault and hold it for ``experiment.duration_seconds``.
      3. Roll the fault back, even when injection or the hold raised.
      4. Poll the probe until it reports healthy or the hypothesis'
         ``recovery_timeout_seconds`` elapses.

    The probe is not evaluated while the fault is active, so the verdict
    only reflects recovery after rollback.

    Args:
        experiment: The experiment definition to execute.
        poll_interval_seconds: Maximum pause between recovery probes.

    Returns:
        An ExperimentResult; ``hypothesis_met`` equals ``steady_state_after``.

    Raises:
        Any exception raised by ``experiment.method``, ``experiment.rollback``
        or the probe propagates to the caller (rollback is still attempted
        when ``method`` fails).
    """
    observations: list[str] = []
    probe = experiment.hypothesis.probe

    # 1. Verify steady state BEFORE fault injection
    steady_before = probe()
    observations.append(f"Steady state before: {steady_before}")
    if not steady_before:
        observations.append("System not in steady state — aborting experiment")
        return ExperimentResult(
            experiment_name=experiment.name,
            hypothesis_met=False,
            steady_state_before=False,
            steady_state_after=False,
            duration_seconds=0,
            observations=observations,
        )

    # 2. Inject fault. Elapsed time uses the monotonic clock so wall-clock
    #    adjustments cannot distort the duration or the recovery deadline.
    start = time.monotonic()
    logger.warning("[CHAOS] Starting experiment: %s", experiment.name)
    try:
        experiment.method()
        observations.append(
            f"Fault injected. Running for {experiment.duration_seconds}s..."
        )
        time.sleep(experiment.duration_seconds)
    finally:
        # 3. Always roll back, even if injection or the hold was interrupted
        experiment.rollback()
        observations.append("Fault removed. Waiting for recovery...")

    # 4. Wait for the system to recover. Probe first, sleep second: this
    #    guarantees at least one probe (even with a zero timeout) and a final
    #    probe right at the deadline instead of sleeping past it.
    deadline = time.monotonic() + experiment.hypothesis.recovery_timeout_seconds
    steady_after = False
    while True:
        if probe():
            steady_after = True
            break
        remaining = deadline - time.monotonic()
        if remaining <= 0:
            break
        time.sleep(max(0.0, min(poll_interval_seconds, remaining)))

    duration = time.monotonic() - start
    observations.append(f"Steady state after: {steady_after}")

    # 5. Hypothesis: system recovered to steady state
    hypothesis_met = steady_after
    logger.info(
        "[CHAOS] Experiment '%s' complete. Hypothesis met: %s. Duration: %.1fs",
        experiment.name,
        hypothesis_met,
        duration,
    )
    return ExperimentResult(
        experiment_name=experiment.name,
        hypothesis_met=hypothesis_met,
        steady_state_before=steady_before,
        steady_state_after=steady_after,
        duration_seconds=duration,
        observations=observations,
    )
Fault Injection Implementations
# chaos/faults.py — fault injection primitives
import subprocess
import threading
import random
import socket
from http.server import HTTPServer, BaseHTTPRequestHandler
from unittest.mock import patch
# Fault 1: Simulate high latency on a downstream service
class LatencyProxy:
    """Context manager that delays outgoing socket connections to one host.

    While the context is active, ``socket.socket.connect`` is replaced for the
    whole process by a wrapper that sleeps ``latency_ms`` milliseconds before
    connecting whenever the first element of the destination address equals
    ``target_host``. All other connections pass through without delay.

    NOTE(review): the comparison is against the address handed to
    ``connect()``; callers that resolve hostnames first will pass an IP
    string, so ``target_host`` presumably has to be given in that same form.

    Args:
        target_host: Host string to match against ``address[0]``.
        latency_ms: Delay added before each matching connect, in milliseconds.
    """

    def __init__(self, target_host: str, latency_ms: int = 2000):
        self.target_host = target_host
        self.latency_ms = latency_ms
        self._original_connect = None

    def __enter__(self):
        import socket as sock_module
        import time

        # Keep the real connect in the closure: inside the wrapper the first
        # argument is the *socket*, which has no ``_original_connect``.
        original_connect = sock_module.socket.connect
        self._original_connect = original_connect
        latency = self.latency_ms / 1000.0
        target = self.target_host

        def slow_connect(sock, address):
            # Only tuple addresses carry a host in slot 0; other address
            # forms (e.g. a path string) are never delayed.
            if isinstance(address, tuple) and address and address[0] == target:
                time.sleep(latency)
            return original_connect(sock, address)

        sock_module.socket.connect = slow_connect
        return self

    def __exit__(self, *args):
        import socket as sock_module

        # Restore only if __enter__ actually patched; makes exit idempotent.
        if self._original_connect is not None:
            sock_module.socket.connect = self._original_connect
            self._original_connect = None
# Fault 2: Random error injection middleware (ASGI)
class ErrorInjectionMiddleware:
    """ASGI middleware that randomly answers HTTP requests with an error.

    When ``active`` is True, each ``http`` scope is short-circuited with
    probability ``error_rate``: a JSON error response with ``status_code``
    (503 by default) is sent and the wrapped app is not called. Non-HTTP
    scopes and all requests while inactive are forwarded to the wrapped app
    untouched. The middleware starts inactive; set ``active = True`` to
    begin injecting.

    Args:
        app: The wrapped ASGI application.
        error_rate: Probability in [0, 1] that an HTTP request is failed.
        status_code: HTTP status sent for injected failures.
    """

    def __init__(self, app, error_rate: float = 0.1, status_code: int = 503):
        self.app = app
        self.error_rate = error_rate
        self.status_code = status_code
        self.active = False

    async def __call__(self, scope, receive, send):
        inject = (
            self.active
            and scope["type"] == "http"
            and random.random() < self.error_rate
        )
        if not inject:
            await self.app(scope, receive, send)
            return

        await send({
            "type": "http.response.start",
            "status": self.status_code,
            "headers": [[b"content-type", b"application/json"]],
        })
        await send({
            "type": "http.response.body",
            "body": b'{"error": "Service temporarily unavailable"}',
        })
# Fault 3: CPU spike
def inject_cpu_spike(duration_seconds: float, cores: int = 2):
    """Spin busy-loop threads to consume CPU for a bounded time.

    Starts ``cores`` threads that busy-wait until either
    ``duration_seconds`` has elapsed or the returned cleanup callable is
    invoked, whichever comes first. Passing ``None`` as the duration keeps
    the threads spinning until cleanup is called.

    NOTE: on CPython builds with the GIL, threads running Python bytecode do
    not execute in parallel, so the load may not scale with ``cores``.

    Args:
        duration_seconds: Upper bound on how long the threads spin.
        cores: Number of spinning threads to start.

    Returns:
        A zero-argument callable that stops the threads and joins them.
        It is safe to call more than once.
    """
    import time

    stop_event = threading.Event()
    deadline = (
        None if duration_seconds is None
        else time.monotonic() + duration_seconds
    )

    def spin():
        while not stop_event.is_set():
            if deadline is not None and time.monotonic() >= deadline:
                break

    # Daemon threads: a forgotten cleanup() must not keep the process alive.
    threads = [
        threading.Thread(target=spin, daemon=True) for _ in range(cores)
    ]
    for t in threads:
        t.start()

    def cleanup():
        stop_event.set()
        for t in threads:
            t.join()

    return cleanup
# Fault 4: Memory pressure
def inject_memory_pressure(mb: int = 512):
    """Allocate a buffer of ``mb`` MiB and keep it alive until released.

    Returns:
        A zero-argument callable that empties the buffer when called.
    """
    bytes_per_mib = 1024 * 1024
    held = bytearray(mb * bytes_per_mib)

    def release():
        # The closure keeps `held` referenced until this is invoked.
        return held.clear()

    return release
Circuit Breaker Pattern
# resilience/circuit_breaker.py
from enum import Enum
import threading
import time
from typing import Callable, TypeVar, Any
T = TypeVar('T')
class State(Enum):
    """The three states a circuit breaker can be in."""

    # Normal operation: calls are passed through.
    CLOSED = "closed"
    # Tripped: calls are rejected without being attempted.
    OPEN = "open"
    # Probing: trial calls are let through to test for recovery.
    HALF_OPEN = "half_open"
class CircuitBreaker:
    """Thread-locked circuit breaker wrapping zero-argument callables.

    State machine:
      * CLOSED: calls pass through. Each failure increments a counter, each
        success decrements it (never below zero); reaching
        ``failure_threshold`` opens the circuit.
      * OPEN: calls are rejected with ``CircuitOpenError`` until more than
        ``timeout_seconds`` have passed since the last recorded failure, at
        which point the breaker moves to HALF_OPEN and lets the call through.
      * HALF_OPEN: calls pass through. ``success_threshold`` successes close
        the circuit and reset the failure counter; any failure re-opens it.

    Elapsed time is measured with ``time.monotonic()`` so wall-clock
    adjustments cannot shorten or extend the open period.

    Args:
        failure_threshold: Failures before opening.
        success_threshold: Successes in half-open needed to close.
        timeout_seconds: How long to stay open before allowing a trial call.
    """

    def __init__(
        self,
        failure_threshold: int = 5,
        success_threshold: int = 2,
        timeout_seconds: float = 60.0,
    ):
        self.failure_threshold = failure_threshold
        self.success_threshold = success_threshold
        self.timeout_seconds = timeout_seconds
        self._state = State.CLOSED
        self._failure_count = 0
        self._success_count = 0
        # Monotonic timestamp of the most recent failure; None until one occurs.
        self._last_failure_time: float | None = None
        self._lock = threading.Lock()

    def call(self, fn: Callable[[], T]) -> T:
        """Invoke ``fn`` through the breaker and return its result.

        Raises:
            CircuitOpenError: The circuit is open and the timeout has not
                elapsed; ``fn`` is not called.
            Exception: Whatever ``fn`` raises is recorded as a failure and
                re-raised unchanged.
        """
        with self._lock:
            if self._state == State.OPEN:
                elapsed = time.monotonic() - self._last_failure_time
                if elapsed > self.timeout_seconds:
                    self._state = State.HALF_OPEN
                    self._success_count = 0
                else:
                    raise CircuitOpenError("Circuit breaker is OPEN")

        # The lock is released while fn runs so slow calls do not serialize.
        try:
            result = fn()
        except Exception:
            self._on_failure()
            raise
        self._on_success()
        return result

    def _on_success(self):
        """Record a successful call and close the circuit when warranted."""
        with self._lock:
            if self._state == State.HALF_OPEN:
                self._success_count += 1
                if self._success_count >= self.success_threshold:
                    self._state = State.CLOSED
                    self._failure_count = 0
            elif self._state == State.CLOSED:
                # Successes gradually forgive earlier failures.
                self._failure_count = max(0, self._failure_count - 1)

    def _on_failure(self):
        """Record a failed call and open the circuit when warranted."""
        with self._lock:
            self._failure_count += 1
            self._last_failure_time = time.monotonic()
            # A failed trial call re-opens immediately; stated explicitly
            # rather than relying on the counter still being at threshold.
            if (
                self._state == State.HALF_OPEN
                or self._failure_count >= self.failure_threshold
            ):
                self._state = State.OPEN
class CircuitOpenError(Exception):
    """Raised when a call is rejected because the circuit breaker is open."""
# Usage: one shared breaker guards every payment call
payment_breaker = CircuitBreaker(timeout_seconds=30, failure_threshold=3)


def charge_payment(amount: int, card_token: str) -> dict:
    """Charge ``card_token`` for ``amount`` through ``payment_breaker``."""

    def attempt_charge():
        return stripe_client.charge(amount, card_token)

    return payment_breaker.call(attempt_charge)
Defined Experiments
# chaos/experiments/database_experiments.py
import httpx
from ..experiment import Experiment, Hypothesis, run_experiment
from ..faults import ErrorInjectionMiddleware
def probe_api_health() -> bool:
    """Report whether the local health endpoint answers 200 with status "ok".

    Any exception (connection failure, timeout, unparsable body) counts as
    unhealthy and yields False.
    """
    try:
        response = httpx.get("http://localhost:8000/health", timeout=5)
        if response.status_code != 200:
            return False
        return response.json().get("status") == "ok"
    except Exception:
        return False
# Experiment: how does the API behave when the database is slow?
db_latency_experiment = Experiment(
    name="database-high-latency",
    tags=["database", "latency"],
    duration_seconds=30,
    hypothesis=Hypothesis(
        probe=probe_api_health,
        description="API health check remains healthy even when DB latency is 2s",
        recovery_timeout_seconds=30,
    ),
    # NOTE(review): inject_db_latency / remove_db_latency are not defined or
    # imported in the visible source; the lambdas defer the lookup until the
    # experiment actually runs — confirm where these helpers live.
    method=lambda: inject_db_latency(2000),
    rollback=lambda: remove_db_latency(),
)
# CLI entry point: run the experiment and print a summary
if __name__ == "__main__":
    result = run_experiment(db_latency_experiment)
    verdict = '✅ PASSED' if result.hypothesis_met else '❌ FAILED'
    print(f"\n{verdict}: {result.experiment_name}")
    for note in result.observations:
        print(f" - {note}")
For the SRE error budgets and SLO alerting that chaos experiments inform, the SRE patterns guide covers error budget burn rates and alert routing. For the load testing that complements chaos testing by verifying performance under stress, the load testing guide covers k6 and Gatling scenarios. The Claude Skills 360 bundle includes chaos engineering skill sets covering fault injection primitives, circuit breakers, and defined experiment patterns. Start with the free tier to try circuit breaker generation.