Skip to main content

Skill

Circuit Breaker Patterns Skill

Metadata

name: circuit-breaker-patterns
version: 1.0.0
category: resilience
status: active
priority: P1
derived_from: Claude Operating Preferences v4.0/v5.0

When to Use This Skill

Use this skill when:

  • Implementing error cascade prevention
  • Building resilient API integrations
  • Managing failure recovery with backoff
  • Tracking service health across operations

Core Pattern: Circuit Breaker

The circuit breaker pattern prevents cascading failures by temporarily stopping requests to failing services.

import asyncio
import random
import time
from collections import deque
from dataclasses import dataclass, field
from enum import Enum
from typing import Any, Dict, List, Optional

class CircuitState(Enum):
    """States of the circuit-breaker state machine."""

    CLOSED = "closed"        # normal operation; requests flow through
    OPEN = "open"            # failing fast; requests are rejected
    HALF_OPEN = "half_open"  # probing whether the service has recovered

class CircuitBreakerOpen(Exception):
    """Raised when a request is rejected because the circuit is open.

    Attributes:
        retry_after: Seconds the caller should wait before retrying.
    """

    def __init__(self, message: str, retry_after: float):
        super().__init__(message)
        self.retry_after = retry_after

@dataclass
class CircuitBreaker:
    """Circuit breaker for error cascade prevention.

    State machine:
        CLOSED -> OPEN       when failure_count reaches failure_threshold
        OPEN -> HALF_OPEN    after recovery_timeout elapses
        HALF_OPEN -> CLOSED  after half_open_max_calls successful probes
        HALF_OPEN -> OPEN    on any failure during probing
    """

    name: str
    failure_threshold: int = 5       # consecutive failures that open the circuit
    recovery_timeout: float = 60.0   # seconds to stay OPEN before probing recovery
    half_open_max_calls: int = 3     # successful probes needed to close again
    backoff_factor: float = 2.0      # base of the exponential backoff
    max_backoff: float = 300.0       # ceiling on the backoff delay (seconds)

    # State
    state: CircuitState = CircuitState.CLOSED
    failure_count: int = 0
    last_failure_time: float = 0
    half_open_calls: int = 0

    # Metrics (latencies keeps only the most recent 100 samples)
    success_count: int = 0
    total_calls: int = 0
    latencies: deque = field(default_factory=lambda: deque(maxlen=100))

    def can_execute(self) -> bool:
        """Return True if a request may proceed in the current state.

        May mutate state: an OPEN circuit whose recovery timeout has
        elapsed transitions to HALF_OPEN as a side effect.
        """
        if self.state == CircuitState.CLOSED:
            return True

        if self.state == CircuitState.OPEN:
            if time.time() - self.last_failure_time > self.recovery_timeout:
                self.state = CircuitState.HALF_OPEN
                self.half_open_calls = 0
                return True
            return False

        if self.state == CircuitState.HALF_OPEN:
            # Allow only a limited number of probe calls while testing recovery.
            return self.half_open_calls < self.half_open_max_calls

        return False

    def record_success(self, latency: float):
        """Record a successful call and its latency (seconds)."""
        self.success_count += 1
        self.total_calls += 1
        self.latencies.append(latency)

        if self.state == CircuitState.HALF_OPEN:
            self.half_open_calls += 1
            if self.half_open_calls >= self.half_open_max_calls:
                # Recovery successful: close the circuit.
                self.state = CircuitState.CLOSED
                self.failure_count = 0
        elif self.state == CircuitState.CLOSED:
            # The threshold counts *consecutive* failures: a success while
            # CLOSED resets the count so sporadic errors never accumulate to
            # the threshold. (Previously the count was never reset while
            # CLOSED, contradicting the documented state diagram.)
            self.failure_count = 0

    def record_failure(self):
        """Record a failed call and open the circuit if warranted."""
        self.failure_count += 1
        self.total_calls += 1
        self.last_failure_time = time.time()

        if self.state == CircuitState.HALF_OPEN:
            # A failed probe means recovery failed: reopen immediately.
            self.state = CircuitState.OPEN

        elif self.failure_count >= self.failure_threshold:
            self.state = CircuitState.OPEN

    def calculate_backoff(self, attempt: int) -> float:
        """Return an exponential backoff delay with +/-20% jitter (seconds)."""
        delay = min(self.backoff_factor ** attempt, self.max_backoff)
        jitter = delay * 0.2 * (2 * random.random() - 1)
        return max(0.1, delay + jitter)

    def get_health_report(self) -> Dict[str, Any]:
        """Return a snapshot of state, success rate, and latency percentiles."""
        ordered = sorted(self.latencies)  # sort once for all percentiles
        n = len(ordered)
        return {
            "name": self.name,
            "state": self.state.value,
            "failure_count": self.failure_count,
            "success_count": self.success_count,
            "total_calls": self.total_calls,
            "success_rate": (
                self.success_count / self.total_calls
                if self.total_calls > 0 else 0
            ),
            "latency_p50": ordered[n // 2] if n else 0,
            # Percentiles need enough samples to be meaningful. The deque
            # caps at 100 samples, so the p99 gate uses >= 100 — the original
            # "> 100" condition was unreachable and always reported 0.
            "latency_p95": ordered[int(n * 0.95)] if n > 20 else 0,
            "latency_p99": ordered[int(n * 0.99)] if n >= 100 else 0,
        }

Circuit Breaker Manager

class CircuitBreakerManager:
    """Registry that lazily creates and tracks named circuit breakers."""

    def __init__(self):
        # Maps circuit name -> its CircuitBreaker (one per service/endpoint).
        self.breakers: Dict[str, CircuitBreaker] = {}

    def get_or_create(
        self,
        name: str,
        **kwargs
    ) -> CircuitBreaker:
        """Return the breaker registered under *name*, creating it on first use.

        Extra keyword arguments are forwarded to the CircuitBreaker
        constructor only when the breaker is first created.
        """
        breaker = self.breakers.get(name)
        if breaker is None:
            breaker = CircuitBreaker(name=name, **kwargs)
            self.breakers[name] = breaker
        return breaker

    def get_health_report(self) -> Dict[str, Any]:
        """Aggregate per-circuit health reports plus summary counts."""
        reports = {}
        open_circuits = []
        for name, breaker in self.breakers.items():
            reports[name] = breaker.get_health_report()
            if breaker.state == CircuitState.OPEN:
                open_circuits.append(name)

        return {
            "circuit_breakers": reports,
            "open_circuits": open_circuits,
            "total_circuits": len(self.breakers),
            "healthy_circuits": len(self.breakers) - len(open_circuits),
        }

Error Mitigation

@dataclass
class ErrorMitigation:
    """Retry strategy with optional circuit-breaker integration."""

    retry_count: int = 3            # total attempts, including the first call
    fallback_enabled: bool = True   # reserved; not used by execute_with_retry
    circuit_breaker: Optional[CircuitBreaker] = None

    async def execute_with_retry(
        self,
        operation,
        *args,
        **kwargs
    ):
        """Await *operation(*args, **kwargs)* with retry and circuit breaking.

        Returns:
            Whatever *operation* returns on the first successful attempt.

        Raises:
            CircuitBreakerOpen: if the circuit rejects the call up front.
            ValueError: if retry_count < 1 (no attempt could be made).
            Exception: the last error when all attempts fail.
        """
        if self.circuit_breaker and not self.circuit_breaker.can_execute():
            raise CircuitBreakerOpen(
                f"Circuit {self.circuit_breaker.name} is open",
                retry_after=self.circuit_breaker.recovery_timeout
            )

        last_error = None
        for attempt in range(self.retry_count):
            try:
                start = time.time()
                result = await operation(*args, **kwargs)
                latency = time.time() - start

                if self.circuit_breaker:
                    self.circuit_breaker.record_success(latency)

                return result

            except Exception as e:
                last_error = e
                if self.circuit_breaker:
                    self.circuit_breaker.record_failure()

                if attempt < self.retry_count - 1:
                    # Exponential backoff between attempts; use the breaker's
                    # jittered schedule when one is attached.
                    backoff = (
                        self.circuit_breaker.calculate_backoff(attempt)
                        if self.circuit_breaker
                        else 2 ** attempt
                    )
                    await asyncio.sleep(backoff)

        if last_error is None:
            # retry_count < 1 means the loop body never ran; fail loudly
            # instead of `raise None` (which is a TypeError).
            raise ValueError("retry_count must be >= 1")
        raise last_error

Checkpoint Integration

@dataclass
class Checkpoint:
    """Checkpoint with circuit breaker awareness."""

    timestamp: str                   # ISO-8601; may carry a trailing 'Z'
    task_description: str
    completed_steps: List[str]
    remaining_steps: List[str]
    circuit_states: Dict[str, str]   # circuit name -> state value
    thinking_usage: Optional[int] = None
    thinking_summary: Optional[str] = None

    def is_stale(self, max_age_hours: float = 24) -> bool:
        """Return True if this checkpoint is older than *max_age_hours*."""
        from datetime import datetime, timezone
        # fromisoformat() rejects a bare 'Z' suffix (before Python 3.11),
        # so normalize it to an explicit UTC offset first.
        saved_at = datetime.fromisoformat(
            self.timestamp.replace('Z', '+00:00')
        )
        elapsed = datetime.now(timezone.utc) - saved_at
        return elapsed.total_seconds() > max_age_hours * 3600

Usage Example

# Create manager
manager = CircuitBreakerManager()

# Get circuit for API
api_circuit = manager.get_or_create(
    "external_api",
    failure_threshold=3,
    recovery_timeout=30.0
)

# Use with error mitigation
mitigation = ErrorMitigation(
    retry_count=3,
    circuit_breaker=api_circuit
)

# NOTE: `await` requires an async context — run this inside an async function.
try:
    result = await mitigation.execute_with_retry(
        api_call,
        endpoint="/users"
    )
except CircuitBreakerOpen as e:
    print(f"Circuit open. Retry after {e.retry_after}s")

Success Output

When this skill completes successfully, output:

✅ SKILL COMPLETE: circuit-breaker-patterns

Completed:
- [x] Circuit breaker implementation created
- [x] Circuit breaker manager configured
- [x] Error mitigation patterns applied
- [x] Health monitoring integrated
- [x] Checkpoint integration verified

Outputs:
- Circuit breaker implementation with state management
- Manager for multiple circuit instances
- Error mitigation with retry logic
- Health report generation
- Checkpoint integration for recovery

Completion Checklist

Before marking this skill as complete, verify:

  • CircuitBreaker class implements all three states (CLOSED, OPEN, HALF_OPEN)
  • Failure threshold and recovery timeout configured appropriately
  • CircuitBreakerManager tracks multiple circuits
  • ErrorMitigation integrates with circuit breaker
  • Health reports generated with latency metrics (p50, p95, p99)
  • Checkpoint integration captures circuit states
  • Unit tests cover state transitions
  • Integration tests verify recovery behavior

Failure Indicators

This skill has FAILED if:

  • ❌ Circuit breaker doesn't transition between states correctly
  • ❌ Recovery timeout not respected (circuit stays open indefinitely)
  • ❌ Failure count not reset after successful recovery
  • ❌ Health metrics missing or incorrect (latency percentiles)
  • ❌ Circuit breaker manager cannot create/retrieve circuits
  • ❌ Error mitigation doesn't respect circuit state
  • ❌ No backoff calculation or jitter applied
  • ❌ Checkpoint doesn't capture circuit states

When NOT to Use

Do NOT use this skill when:

  • Simple retry logic is sufficient (use basic retry instead)
  • Errors are not cascade-prone (isolated failures)
  • Service has no downstream dependencies
  • Working with synchronous, non-critical operations
  • Implementing rate limiting (use rate-limiting-patterns skill instead)
  • Building simple health checks (use health-check-patterns skill)
  • No need for failure tracking across requests

Use alternatives:

  • Basic retry - For transient, isolated errors
  • Rate limiting - For controlling request frequency
  • Health checks - For simple availability monitoring
  • Timeout patterns - For time-based failure detection

Anti-Patterns (Avoid)

| Anti-Pattern | Problem | Solution |
|---|---|---|
| No jitter in backoff | Thundering herd on recovery | Add jitter: delay + (delay * 0.2 * random) |
| Circuit per request | Too granular, high overhead | Circuit per service/endpoint |
| Ignoring half-open state | Binary open/closed, slow recovery | Use half-open for gradual recovery testing |
| No metrics collection | Cannot diagnose issues | Track latency, success rate, failure count |
| Shared circuit state | Race conditions | Use per-request or thread-safe state |
| Missing recovery timeout | Circuit stays open forever | Always configure recovery timeout |
| No circuit manager | Duplicate circuit instances | Use CircuitBreakerManager for singleton circuits |
| Hardcoded thresholds | Cannot adapt to different services | Make thresholds configurable per circuit |

Principles

This skill embodies these CODITECT principles:

  • #1 Prevent Cascading Failures - Circuit breaker stops error propagation to healthy services
  • #2 Graceful Degradation - Open circuit allows system to recover instead of crashing
  • #3 Observable Recovery - Half-open state tests recovery with limited requests
  • #5 Eliminate Ambiguity - Clear state machine (CLOSED → OPEN → HALF_OPEN)
  • #6 Clear, Understandable - Explicit success/failure tracking with health reports
  • #8 No Assumptions - Verify recovery through half-open testing, don't assume

Reference: CODITECT-STANDARD-AUTOMATION.md


State Transition Diagram

    ┌─────────────────────────────────────────┐
    │                 CLOSED                  │
    │           (Normal Operation)            │
    │                                         │
    │  • All requests allowed                 │
    │  • Track failures                       │
    │  • Reset failure count on success       │
    └──────────────────┬──────────────────────┘
                       │ failure_count >= threshold
                       ▼
    ┌─────────────────────────────────────────┐
    │                  OPEN                   │
    │              (Failing Fast)             │
    │                                         │
    │  • Reject all requests immediately      │
    │  • Return CircuitBreakerOpen error      │
    │  • Wait for recovery_timeout            │
    └──────────────────┬──────────────────────┘
                       │ recovery_timeout elapsed
                       ▼
    ┌─────────────────────────────────────────┐
    │               HALF-OPEN                 │
    │           (Testing Recovery)            │
    │                                         │
    │  • Allow limited test requests          │
    │  • Success → transition to CLOSED       │
    │  • Failure → transition to OPEN         │
    └──────────┬────────────────┬─────────────┘
               │                │
          success ×3       any failure
               │                │
               ▼                ▼
            CLOSED            OPEN

State Transitions Summary:

| From | To | Trigger |
|---|---|---|
| CLOSED | OPEN | failure_count >= failure_threshold |
| OPEN | HALF-OPEN | recovery_timeout elapsed |
| HALF-OPEN | CLOSED | half_open_max_calls successes |
| HALF-OPEN | OPEN | Any failure during test |
  • agents/circuit-breaker-monitor.md - Monitoring agent
  • commands/circuit-status.md - CLI command
  • hooks/circuit-breaker-open - Event hook
  • scripts/circuit-breaker-manager.py - CLI tool