Skill
Circuit Breaker Patterns Skill
Metadata
name: circuit-breaker-patterns
version: 1.0.0
category: resilience
status: active
priority: P1
derived_from: Claude Operating Preferences v4.0/v5.0
When to Use This Skill
Use this skill when:
- Implementing error cascade prevention
- Building resilient API integrations
- Managing failure recovery with backoff
- Tracking service health across operations
Core Pattern: Circuit Breaker
The circuit breaker pattern prevents cascading failures by temporarily stopping requests to failing services.
import asyncio
import random
import time
from collections import deque
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, Optional
class CircuitState(Enum):
    """Lifecycle states of a circuit breaker."""
    CLOSED = "closed"        # Normal operation: all requests allowed
    OPEN = "open"            # Failing: reject requests until recovery timeout
    HALF_OPEN = "half_open"  # Testing recovery with a limited number of probes


class CircuitBreakerOpen(Exception):
    """Raised when a request is rejected because the circuit is open.

    Attributes:
        retry_after: Seconds the caller should wait before retrying.
    """

    def __init__(self, message: str, retry_after: float):
        super().__init__(message)
        self.retry_after = retry_after


@dataclass
class CircuitBreaker:
    """Circuit breaker for error cascade prevention.

    Tracks consecutive failures and transitions through
    CLOSED -> OPEN -> HALF_OPEN -> CLOSED, rejecting work while OPEN and
    probing with a limited number of calls while HALF_OPEN.
    """
    name: str
    failure_threshold: int = 5      # consecutive failures before opening
    recovery_timeout: float = 60.0  # seconds to stay OPEN before probing
    half_open_max_calls: int = 3    # successful probes required to close
    backoff_factor: float = 2.0     # base of the exponential backoff
    max_backoff: float = 300.0      # ceiling on a single backoff delay

    # State
    state: CircuitState = CircuitState.CLOSED
    failure_count: int = 0          # consecutive failures since last success
    last_failure_time: float = 0    # epoch seconds of the most recent failure
    half_open_calls: int = 0        # successful probes recorded in HALF_OPEN

    # Metrics (latencies keeps a sliding window of the last 100 samples)
    success_count: int = 0
    total_calls: int = 0
    latencies: deque = field(default_factory=lambda: deque(maxlen=100))

    def can_execute(self) -> bool:
        """Return True if a request may proceed under the current state.

        Side effect: an OPEN circuit whose recovery timeout has elapsed is
        moved to HALF_OPEN here, and its probe counter is reset.
        """
        if self.state == CircuitState.CLOSED:
            return True
        if self.state == CircuitState.OPEN:
            if time.time() - self.last_failure_time > self.recovery_timeout:
                # Timeout elapsed: start probing for recovery.
                self.state = CircuitState.HALF_OPEN
                self.half_open_calls = 0
                return True
            return False
        if self.state == CircuitState.HALF_OPEN:
            # Only allow a bounded number of test requests.
            return self.half_open_calls < self.half_open_max_calls
        return False

    def record_success(self, latency: float):
        """Record a successful call and its latency in seconds.

        In HALF_OPEN, enough successes close the circuit. In CLOSED, a
        success resets the consecutive-failure count so the threshold
        measures a streak, not failures accumulated over the lifetime.
        """
        self.success_count += 1
        self.total_calls += 1
        self.latencies.append(latency)
        if self.state == CircuitState.HALF_OPEN:
            self.half_open_calls += 1
            if self.half_open_calls >= self.half_open_max_calls:
                # Recovery successful
                self.state = CircuitState.CLOSED
                self.failure_count = 0
        elif self.state == CircuitState.CLOSED:
            # Fix: without this reset, sporadic failures spread over many
            # successful calls would eventually trip the breaker.
            self.failure_count = 0

    def record_failure(self):
        """Record a failed call; open the circuit when warranted."""
        self.failure_count += 1
        self.total_calls += 1
        self.last_failure_time = time.time()
        if self.state == CircuitState.HALF_OPEN:
            # Any failed probe means recovery failed: trip back open.
            self.state = CircuitState.OPEN
        elif self.failure_count >= self.failure_threshold:
            self.state = CircuitState.OPEN

    def calculate_backoff(self, attempt: int) -> float:
        """Return an exponential backoff delay with +/-20% jitter.

        Jitter desynchronizes retrying clients so a recovering service is
        not hit by a thundering herd. Result is floored at 0.1s and the
        base delay is capped at max_backoff.
        """
        delay = min(self.backoff_factor ** attempt, self.max_backoff)
        jitter = delay * 0.2 * (2 * random.random() - 1)  # uniform in +/-20%
        return max(0.1, delay + jitter)

    @staticmethod
    def _percentile(sorted_values: List[float], fraction: float) -> float:
        """Nearest-rank percentile of an ascending list (index clamped)."""
        index = min(int(len(sorted_values) * fraction), len(sorted_values) - 1)
        return sorted_values[index]

    def get_health_report(self) -> Dict[str, Any]:
        """Return state, counters, success rate, and latency percentiles.

        p50 is reported whenever samples exist, p95 once there are more
        than 20 samples, p99 once the 100-sample window is full.
        """
        latency_list = sorted(self.latencies)  # sort once for all percentiles
        n = len(latency_list)
        return {
            "name": self.name,
            "state": self.state.value,
            "failure_count": self.failure_count,
            "success_count": self.success_count,
            "total_calls": self.total_calls,
            "success_rate": (
                self.success_count / self.total_calls
                if self.total_calls > 0 else 0
            ),
            "latency_p50": self._percentile(latency_list, 0.50) if n > 0 else 0,
            "latency_p95": self._percentile(latency_list, 0.95) if n > 20 else 0,
            # Fix: the original guard was `> 100`, which is unreachable with
            # the deque's maxlen=100, so p99 was always reported as 0.
            "latency_p99": self._percentile(latency_list, 0.99) if n >= 100 else 0,
        }
Circuit Breaker Manager
class CircuitBreakerManager:
    """Registry handing out one circuit breaker per service name."""

    def __init__(self):
        self.breakers: Dict[str, CircuitBreaker] = {}

    def get_or_create(
        self,
        name: str,
        **kwargs
    ) -> CircuitBreaker:
        """Return the breaker registered under *name*, creating it on first use.

        Extra keyword arguments are forwarded to the CircuitBreaker
        constructor only when a new breaker is created.
        """
        breaker = self.breakers.get(name)
        if breaker is None:
            breaker = CircuitBreaker(name=name, **kwargs)
            self.breakers[name] = breaker
        return breaker

    def get_health_report(self) -> Dict[str, Any]:
        """Aggregate every circuit's health report plus an open-circuit summary."""
        reports = {}
        open_circuits = []
        # Single pass: collect per-circuit reports and flag open ones.
        for name, breaker in self.breakers.items():
            reports[name] = breaker.get_health_report()
            if breaker.state == CircuitState.OPEN:
                open_circuits.append(name)
        return {
            "circuit_breakers": reports,
            "open_circuits": open_circuits,
            "total_circuits": len(self.breakers),
            "healthy_circuits": len(self.breakers) - len(open_circuits),
        }
Error Mitigation
@dataclass
class ErrorMitigation:
    """Retry strategy with optional circuit-breaker integration.

    Attributes:
        retry_count: Maximum number of attempts; must be >= 1.
        fallback_enabled: Flag for callers; not consulted inside this class.
        circuit_breaker: Optional breaker checked before each attempt.
    """
    retry_count: int = 3
    fallback_enabled: bool = True
    circuit_breaker: Optional["CircuitBreaker"] = None

    async def execute_with_retry(
        self,
        operation,
        *args,
        **kwargs
    ):
        """Await *operation* with retries, backoff, and circuit gating.

        Returns the operation's result on the first success.

        Raises:
            ValueError: if retry_count < 1 (previously this fell through
                to `raise None`, a confusing TypeError).
            CircuitBreakerOpen: if the breaker rejects a request.
            Exception: the last error from *operation* once retries are
                exhausted.
        """
        if self.retry_count < 1:
            raise ValueError("retry_count must be >= 1")
        last_error = None
        for attempt in range(self.retry_count):
            # Check the circuit on every attempt, not just the first:
            # failures recorded below can open it mid-retry, and continuing
            # to hammer an open circuit defeats the breaker's purpose.
            if self.circuit_breaker and not self.circuit_breaker.can_execute():
                raise CircuitBreakerOpen(
                    f"Circuit {self.circuit_breaker.name} is open",
                    retry_after=self.circuit_breaker.recovery_timeout
                )
            try:
                start = time.time()
                result = await operation(*args, **kwargs)
                latency = time.time() - start
                if self.circuit_breaker:
                    self.circuit_breaker.record_success(latency)
                return result
            except Exception as e:
                last_error = e
                if self.circuit_breaker:
                    self.circuit_breaker.record_failure()
                if attempt < self.retry_count - 1:
                    backoff = (
                        self.circuit_breaker.calculate_backoff(attempt)
                        if self.circuit_breaker
                        else 2 ** attempt
                    )
                    # Requires `import asyncio` at module level (the file
                    # originally omitted it, making this line a NameError).
                    await asyncio.sleep(backoff)
        raise last_error
Checkpoint Integration
@dataclass
class Checkpoint:
    """Checkpoint with circuit breaker awareness."""
    timestamp: str                       # ISO-8601, 'Z' or explicit offset
    task_description: str
    completed_steps: List[str]
    remaining_steps: List[str]
    circuit_states: Dict[str, str]       # circuit name -> state value
    thinking_usage: Optional[int] = None
    thinking_summary: Optional[str] = None

    def is_stale(self, max_age_hours: float = 24) -> bool:
        """Return True when this checkpoint is older than *max_age_hours*."""
        from datetime import datetime, timezone

        # fromisoformat() rejects the 'Z' suffix on older Pythons, so
        # normalize it to an explicit UTC offset first.
        normalized = self.timestamp.replace('Z', '+00:00')
        taken_at = datetime.fromisoformat(normalized)
        elapsed = (datetime.now(timezone.utc) - taken_at).total_seconds()
        return elapsed > max_age_hours * 3600
Usage Example
# Usage example. NOTE(review): the `await` calls below only run inside an
# `async def` (e.g. an `async def main()` driven by `asyncio.run`); the
# snippet is shown flat for brevity.

# Create manager
manager = CircuitBreakerManager()

# Get circuit for API: trips after 3 failures, probes again after 30s
api_circuit = manager.get_or_create(
    "external_api",
    failure_threshold=3,
    recovery_timeout=30.0
)

# Use with error mitigation
mitigation = ErrorMitigation(
    retry_count=3,
    circuit_breaker=api_circuit
)

try:
    result = await mitigation.execute_with_retry(
        api_call,  # an async callable supplied by the application
        endpoint="/users"
    )
except CircuitBreakerOpen as e:
    print(f"Circuit open. Retry after {e.retry_after}s")
Success Output
When this skill completes successfully, output:
✅ SKILL COMPLETE: circuit-breaker-patterns
Completed:
- [x] Circuit breaker implementation created
- [x] Circuit breaker manager configured
- [x] Error mitigation patterns applied
- [x] Health monitoring integrated
- [x] Checkpoint integration verified
Outputs:
- Circuit breaker implementation with state management
- Manager for multiple circuit instances
- Error mitigation with retry logic
- Health report generation
- Checkpoint integration for recovery
Completion Checklist
Before marking this skill as complete, verify:
- CircuitBreaker class implements all three states (CLOSED, OPEN, HALF_OPEN)
- Failure threshold and recovery timeout configured appropriately
- CircuitBreakerManager tracks multiple circuits
- ErrorMitigation integrates with circuit breaker
- Health reports generated with latency metrics (p50, p95, p99)
- Checkpoint integration captures circuit states
- Unit tests cover state transitions
- Integration tests verify recovery behavior
Failure Indicators
This skill has FAILED if:
- ❌ Circuit breaker doesn't transition between states correctly
- ❌ Recovery timeout not respected (circuit stays open indefinitely)
- ❌ Failure count not reset after successful recovery
- ❌ Health metrics missing or incorrect (latency percentiles)
- ❌ Circuit breaker manager cannot create/retrieve circuits
- ❌ Error mitigation doesn't respect circuit state
- ❌ No backoff calculation or jitter applied
- ❌ Checkpoint doesn't capture circuit states
When NOT to Use
Do NOT use this skill when:
- Simple retry logic is sufficient (use basic retry instead)
- Errors are not cascade-prone (isolated failures)
- Service has no downstream dependencies
- Working with synchronous, non-critical operations
- Implementing rate limiting (use the rate-limiting-patterns skill instead)
- Building simple health checks (use the health-check-patterns skill)
- No need for failure tracking across requests
Use alternatives:
- Basic retry - For transient, isolated errors
- Rate limiting - For controlling request frequency
- Health checks - For simple availability monitoring
- Timeout patterns - For time-based failure detection
Anti-Patterns (Avoid)
| Anti-Pattern | Problem | Solution |
|---|---|---|
| No jitter in backoff | Thundering herd on recovery | Add jitter: delay + (delay * 0.2 * random) |
| Circuit per request | Too granular, high overhead | Circuit per service/endpoint |
| Ignoring half-open state | Binary open/closed, slow recovery | Use half-open for gradual recovery testing |
| No metrics collection | Cannot diagnose issues | Track latency, success rate, failure count |
| Shared circuit state | Race conditions | Use per-request or thread-safe state |
| Missing recovery timeout | Circuit stays open forever | Always configure recovery timeout |
| No circuit manager | Duplicate circuit instances | Use CircuitBreakerManager for singleton circuits |
| Hardcoded thresholds | Cannot adapt to different services | Make thresholds configurable per circuit |
Principles
This skill embodies these CODITECT principles:
- #1 Prevent Cascading Failures - Circuit breaker stops error propagation to healthy services
- #2 Graceful Degradation - Open circuit allows system to recover instead of crashing
- #3 Observable Recovery - Half-open state tests recovery with limited requests
- #5 Eliminate Ambiguity - Clear state machine (CLOSED → OPEN → HALF_OPEN)
- #6 Clear, Understandable - Explicit success/failure tracking with health reports
- #8 No Assumptions - Verify recovery through half-open testing, don't assume
Reference: CODITECT-STANDARD-AUTOMATION.md
State Transition Diagram
┌─────────────────────────────────────────┐
│ │
│ CLOSED │
│ (Normal Operation) │
│ │
│ • All requests allowed │
│ • Track failures │
│ • Reset failure count on success │
│ │
└──────────────┬──────────────────────────┘
│
│ failure_count >= threshold
│
▼
┌─────────────────────────────────────────┐
│ │
│ OPEN │
│ (Failing Fast) │
│ │
│ • Reject all requests immediately │
│ • Return CircuitBreakerOpen error │
│ • Wait for recovery_timeout │
│ │
└──────────────┬──────────────────────────┘
│
│ recovery_timeout elapsed
│
▼
┌─────────────────────────────────────────┐
│ │
│ HALF-OPEN │
│ (Testing Recovery) │
│ │
│ • Allow limited test requests │
│ • Success → transition to CLOSED │
│ • Failure → transition to OPEN │
│ │
└──────────────┬──────────────────────────┘
│
┌──────────────┼──────────────┐
│ │ │
success ×3 any failure
│ │
▼ ▼
CLOSED OPEN
State Transitions Summary:
| From | To | Trigger |
|---|---|---|
| CLOSED | OPEN | failure_count >= failure_threshold |
| OPEN | HALF-OPEN | recovery_timeout elapsed |
| HALF-OPEN | CLOSED | half_open_max_calls successes |
| HALF-OPEN | OPEN | Any failure during test |
Related Components
- agents/circuit-breaker-monitor.md - Monitoring agent
- commands/circuit-status.md - CLI command
- hooks/circuit-breaker-open - Event hook
- scripts/circuit-breaker-manager.py - CLI tool