Skip to main content

Alert Configuration System

Alert Management Implementation

Context

The current situation requires a decision because:

  • Requirement 1
  • Constraint 2
  • Need 3

Status

Accepted | YYYY-MM-DD

A. Alert Configuration Service

import ast
import asyncio
import json
import logging
import re
from datetime import datetime, timedelta
from enum import Enum
from typing import Dict, Any, List, Optional, Callable

class AlertSeverity(Enum):
    """Severity levels that can be attached to an alert rule.

    The string values are what gets written when a rule or alert is
    serialized, so they must stay stable.
    """

    INFO = "info"
    WARNING = "warning"
    CRITICAL = "critical"
    EMERGENCY = "emergency"

class AlertState(Enum):
    """States an alert record can be in.

    The string value is what is stored in an alert's ``state`` field.
    """

    ACTIVE = "active"
    ACKNOWLEDGED = "acknowledged"
    RESOLVED = "resolved"

class AlertRule:
    """Alert rule definition with dictionary (de)serialization.

    A rule bundles a metric name, a comparison operator (``condition``), a
    numeric ``threshold``, a ``window`` and a severity, plus optional labels
    and a free-text description.
    """

    # Layout produced by ``str(timedelta)``: "[-D day[s], ]H:MM:SS[.ffffff]".
    _WINDOW_PATTERN = re.compile(
        r"^(?:(?P<days>-?\d+) days?, )?"
        r"(?P<hours>\d+):(?P<minutes>\d{2}):(?P<seconds>\d{2})"
        r"(?:\.(?P<fraction>\d{1,6}))?$"
    )

    def __init__(
        self,
        rule_id: str,
        name: str,
        metric_name: str,
        condition: str,
        threshold: float,
        window: timedelta,
        severity: "AlertSeverity",
        labels: Optional[Dict[str, str]] = None,
        description: Optional[str] = None
    ):
        """Store the rule's fields unchanged.

        Args:
            rule_id: Identifier of the rule.
            name: Display name of the rule.
            metric_name: Name of the metric the rule refers to.
            condition: Comparison operator as a string (e.g. ``'>'``).
            threshold: Value the metric is compared against.
            window: Time span associated with the rule.
            severity: Severity enum member for alerts of this rule.
            labels: Optional label mapping; ``None`` becomes an empty dict.
            description: Optional free-text description.
        """
        self.rule_id = rule_id
        self.name = name
        self.metric_name = metric_name
        self.condition = condition
        self.threshold = threshold
        self.window = window
        self.severity = severity
        self.labels = labels or {}
        self.description = description

    def to_dict(self) -> Dict[str, Any]:
        """Convert the rule to a JSON-serializable dictionary.

        ``window`` is written as ``str(timedelta)`` (e.g. ``"0:05:00"``) and
        ``severity`` as the enum's value; ``from_dict`` reads both back.
        """
        return {
            'rule_id': self.rule_id,
            'name': self.name,
            'metric_name': self.metric_name,
            'condition': self.condition,
            'threshold': self.threshold,
            'window': str(self.window),
            'severity': self.severity.value,
            # Copy so callers mutating the result cannot alter this rule.
            'labels': dict(self.labels),
            'description': self.description
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> 'AlertRule':
        """Create a rule from a dictionary such as the one ``to_dict`` returns.

        Args:
            data: Mapping with the keys written by ``to_dict``. ``labels``
                and ``description`` are optional. ``window`` may be a
                ``timedelta``, the ``str(timedelta)`` text, or the legacy
                ``"timedelta(seconds=300)"`` call form.

        Raises:
            KeyError: A required key is missing.
            ValueError: ``threshold``, ``window`` or ``severity`` is invalid.
        """
        return cls(
            rule_id=data['rule_id'],
            name=data['name'],
            metric_name=data['metric_name'],
            condition=data['condition'],
            threshold=float(data['threshold']),
            window=cls._parse_window(data['window']),
            severity=AlertSeverity(data['severity']),
            labels=dict(data.get('labels') or {}),
            description=data.get('description')
        )

    @staticmethod
    def _parse_window(value: Any) -> timedelta:
        """Turn a serialized window into a ``timedelta`` without using eval.

        Raises:
            ValueError: ``value`` is not in a recognised format.
        """
        if isinstance(value, timedelta):
            return value
        if not isinstance(value, str):
            raise ValueError(f"Unrecognised window value: {value!r}")

        text = value.strip()
        match = AlertRule._WINDOW_PATTERN.match(text)
        if match:
            # str(timedelta) prints microseconds as a 6-digit fraction; pad
            # shorter fractions so ".5" means 500000 microseconds.
            fraction = (match.group('fraction') or '').ljust(6, '0')
            return timedelta(
                days=int(match.group('days') or 0),
                hours=int(match.group('hours')),
                minutes=int(match.group('minutes')),
                seconds=int(match.group('seconds')),
                microseconds=int(fraction),
            )
        if '(' in text:
            return AlertRule._parse_legacy_window(text)
        raise ValueError(f"Unrecognised window value: {value!r}")

    @staticmethod
    def _parse_legacy_window(text: str) -> timedelta:
        """Parse the ``"...timedelta(days=1, seconds=5)"`` call form safely.

        Only literal arguments are accepted: the text is parsed to an AST and
        each argument goes through ``ast.literal_eval``, so stored data can
        never execute code.
        """
        # Whatever precedes the first "(" (e.g. "datetime.timedelta") is
        # ignored; only the argument list matters.
        call_source = "timedelta(" + text.split("(", 1)[1]
        try:
            call = ast.parse(call_source, mode="eval").body
            if not isinstance(call, ast.Call):
                raise ValueError("not a single call expression")
            if any(keyword.arg is None for keyword in call.keywords):
                raise ValueError("** unpacking is not allowed")
            args = [ast.literal_eval(arg) for arg in call.args]
            kwargs = {
                keyword.arg: ast.literal_eval(keyword.value)
                for keyword in call.keywords
            }
            return timedelta(*args, **kwargs)
        except (SyntaxError, TypeError, ValueError, OverflowError) as exc:
            raise ValueError(f"Unrecognised window value: {text!r}") from exc

class AlertConfiguration:
    """Alert configuration management service.

    Holds alert rules in memory, persists each one to Redis under the key
    ``alert_rule:<rule_id>`` and runs a background loop that evaluates every
    rule against the metrics collector, storing and notifying on each match.
    """

    # Seconds to sleep between two evaluation passes over all rules.
    EVALUATION_INTERVAL_SECONDS = 10

    # AlertRule attributes that update_rule() carries over or replaces.
    _RULE_FIELDS = (
        'rule_id', 'name', 'metric_name', 'condition', 'threshold',
        'window', 'severity', 'labels', 'description',
    )

    def __init__(
        self,
        redis_client,
        metrics_collector,
        notification_service
    ):
        """Store the collaborators and set up the comparison operators.

        Args:
            redis_client: Async client; ``set``, ``get``, ``delete`` and
                ``scan`` are awaited on it.
            metrics_collector: Object whose ``get_metric_statistics`` is
                awaited with ``(metric_name, window, labels)``.
            notification_service: Object whose ``send_alert`` is awaited
                with the alert dictionary.
        """
        self.redis = redis_client
        self.metrics = metrics_collector
        self.notifications = notification_service
        self.rules: Dict[str, AlertRule] = {}
        self.evaluators: Dict[str, Callable] = {
            '>': lambda x, y: x > y,
            '<': lambda x, y: x < y,
            '>=': lambda x, y: x >= y,
            '<=': lambda x, y: x <= y,
            # Floats are compared with a tolerance instead of exact equality.
            '==': lambda x, y: abs(x - y) < 1e-10
        }
        # Strong reference to the background loop; the event loop itself only
        # keeps weak references to tasks.
        self._evaluation_task = None

    async def initialize(self):
        """Load persisted rules and start the background evaluation loop."""
        # Load rules from Redis
        await self._load_rules()

        # Start evaluation loop (only once, even if initialize() is repeated)
        if self._evaluation_task is None or self._evaluation_task.done():
            self._evaluation_task = asyncio.create_task(
                self._evaluation_loop()
            )

    async def add_rule(
        self,
        rule: AlertRule
    ) -> str:
        """Validate, register and persist a rule.

        An existing rule with the same ``rule_id`` is replaced.

        Returns:
            The rule's ``rule_id``.

        Raises:
            ValueError: The rule fails ``_validate_rule``.
        """
        # Validate rule
        if not self._validate_rule(rule):
            raise ValueError("Invalid alert rule")

        # Store rule
        self.rules[rule.rule_id] = rule
        await self._save_rule(rule)

        return rule.rule_id

    async def update_rule(
        self,
        rule_id: str,
        updates: Dict[str, Any]
    ) -> AlertRule:
        """Apply ``updates`` to an existing rule and persist the result.

        Args:
            rule_id: Identifier of the rule to change.
            updates: New values keyed by rule field name. ``window`` may be
                a ``timedelta`` or a serialized string, ``severity`` an
                ``AlertSeverity`` or its string value. Unknown keys are
                ignored.

        Returns:
            The updated rule.

        Raises:
            KeyError: No rule with ``rule_id`` exists.
            ValueError: The update changes ``rule_id`` or yields an invalid
                rule.
        """
        if rule_id not in self.rules:
            raise KeyError(f"Rule {rule_id} not found")

        # The id doubles as the in-memory key and the Redis key; letting it
        # change here would store the rule under two different identities.
        if updates.get('rule_id', rule_id) != rule_id:
            raise ValueError("rule_id cannot be changed by an update")

        rule = self._merge_rule(self.rules[rule_id], updates)
        if not self._validate_rule(rule):
            raise ValueError("Invalid alert rule")

        self.rules[rule_id] = rule
        await self._save_rule(rule)

        return rule

    @classmethod
    def _merge_rule(
        cls,
        current: AlertRule,
        updates: Dict[str, Any]
    ) -> AlertRule:
        """Return a new rule: ``current`` with known ``updates`` applied."""
        # Work on the live attribute values so an untouched window stays a
        # timedelta and never has to survive a string round trip.
        fields = {name: getattr(current, name) for name in cls._RULE_FIELDS}
        fields.update(
            (name, value) for name, value in updates.items() if name in fields
        )

        try:
            if isinstance(fields['window'], str):
                # Serialized window: delegate parsing to AlertRule.from_dict.
                serialized = current.to_dict()
                serialized['window'] = fields['window']
                fields['window'] = AlertRule.from_dict(serialized).window
            fields['threshold'] = float(fields['threshold'])
            fields['severity'] = AlertSeverity(fields['severity'])
        except (IndexError, SyntaxError, TypeError, ValueError) as exc:
            raise ValueError("Invalid alert rule") from exc

        return AlertRule(**fields)

    async def delete_rule(
        self,
        rule_id: str
    ):
        """Remove a rule from memory and Redis; unknown ids are ignored."""
        if rule_id in self.rules:
            del self.rules[rule_id]
            await self.redis.delete(f"alert_rule:{rule_id}")

    async def get_rule(
        self,
        rule_id: str
    ) -> Optional[AlertRule]:
        """Return the rule with ``rule_id``, or ``None`` if there is none."""
        return self.rules.get(rule_id)

    async def list_rules(
        self,
        severity: Optional[AlertSeverity] = None,
        metric_name: Optional[str] = None
    ) -> List[AlertRule]:
        """List rules, optionally filtered by severity and/or metric name."""
        rules = list(self.rules.values())

        if severity:
            rules = [r for r in rules if r.severity == severity]

        if metric_name:
            rules = [r for r in rules if r.metric_name == metric_name]

        return rules

    def _validate_rule(
        self,
        rule: AlertRule
    ) -> bool:
        """Return True when condition, threshold and window are usable."""
        # Check condition
        if rule.condition not in self.evaluators:
            return False

        # Check threshold; float(None) raises TypeError, not ValueError
        try:
            float(rule.threshold)
        except (TypeError, ValueError):
            return False

        # Check window
        if not isinstance(rule.window, timedelta):
            return False

        return True

    async def _save_rule(
        self,
        rule: AlertRule
    ):
        """Write the rule as JSON to Redis under ``alert_rule:<rule_id>``."""
        await self.redis.set(
            f"alert_rule:{rule.rule_id}",
            json.dumps(rule.to_dict())
        )

    async def _load_rules(self):
        """Load every ``alert_rule:*`` entry from Redis into ``self.rules``.

        Entries that cannot be parsed are logged and skipped so one bad
        record does not prevent the others from loading.
        """
        pattern = "alert_rule:*"
        cursor = 0

        while True:
            cursor, keys = await self.redis.scan(
                cursor,
                match=pattern
            )

            for key in keys:
                rule_data = await self.redis.get(key)
                if rule_data:
                    try:
                        rule = AlertRule.from_dict(
                            json.loads(rule_data)
                        )
                        self.rules[rule.rule_id] = rule
                    except Exception as e:
                        logging.error("Error loading rule: %s", e)

            # SCAN reports a cursor of 0 once the iteration is complete.
            if cursor == 0:
                break

    async def _evaluation_loop(self):
        """Evaluate all rules forever, pausing between passes."""
        while True:
            try:
                # Iterate over a snapshot: add_rule/delete_rule can run while
                # this coroutine is suspended in an await, and resizing the
                # dict mid-iteration would abort the whole pass.
                for rule in list(self.rules.values()):
                    await self._evaluate_rule(rule)
            except Exception as e:
                logging.error("Error in alert evaluation: %s", e)

            await asyncio.sleep(self.EVALUATION_INTERVAL_SECONDS)

    async def _evaluate_rule(
        self,
        rule: AlertRule
    ):
        """Evaluate one rule; on a match store an alert and send it.

        Failures are logged rather than raised so that one failing rule does
        not stop the evaluation of the others.
        """
        try:
            # Get metric statistics
            stats = await self.metrics.get_metric_statistics(
                rule.metric_name,
                rule.window,
                rule.labels
            )

            if not stats:
                return

            value = stats.get('avg')
            if value is None:
                # No average reported: substituting 0 would make "<", "<="
                # and "== 0" rules fire on missing data.
                return

            # Evaluate condition
            evaluator = self.evaluators[rule.condition]
            if not evaluator(value, rule.threshold):
                return

            # Condition met, create alert. One timestamp is shared by the id
            # and the payload so the two always agree.
            timestamp = datetime.utcnow().isoformat()
            alert_id = f"{rule.rule_id}:{timestamp}"

            alert_data = {
                'alert_id': alert_id,
                'rule_id': rule.rule_id,
                'name': rule.name,
                'severity': rule.severity.value,
                'metric_name': rule.metric_name,
                'value': value,
                'threshold': rule.threshold,
                'condition': rule.condition,
                'labels': rule.labels,
                # Referenced by the email/webhook notification templates.
                'description': rule.description,
                'timestamp': timestamp,
                'state': AlertState.ACTIVE.value
            }

            # Store alert
            await self.redis.set(
                f"alert:{alert_id}",
                json.dumps(alert_data)
            )

            # Send notification
            await self.notifications.send_alert(alert_data)

        except Exception as e:
            logging.error("Error evaluating rule %s: %s", rule.rule_id, e)

B. Alert Notification Templates

class AlertTemplates:
    """Alert notification templates.

    Every template uses ``{placeholder}`` fields: name, severity, timestamp,
    metric_name, value, threshold, condition, labels and (email/webhook
    only) description.
    NOTE(review): presumably the fields are filled with ``str.format`` by
    the notification service; the Slack and webhook templates are nested
    structures, so each string leaf would need formatting separately --
    confirm against the caller.
    """

    # Plain-text body; starts and ends with a newline.
    EMAIL_TEMPLATE = (
        "\n"
        "Alert: {name}\n"
        "Severity: {severity}\n"
        "Time: {timestamp}\n"
        "\n"
        "Metric: {metric_name}\n"
        "Value: {value}\n"
        "Threshold: {threshold} {condition}\n"
        "\n"
        "Labels: {labels}\n"
        "\n"
        "Description: {description}\n"
    )

    # Block layout: header, two two-field sections, one text section.
    SLACK_TEMPLATE = {
        "blocks": [
            {
                "type": "header",
                "text": {
                    "type": "plain_text",
                    "text": "🚨 Alert: {name}"
                }
            },
            {
                "type": "section",
                "fields": [
                    {
                        "type": "mrkdwn",
                        "text": "*Severity:*\n{severity}"
                    },
                    {
                        "type": "mrkdwn",
                        "text": "*Time:*\n{timestamp}"
                    }
                ]
            },
            {
                "type": "section",
                "fields": [
                    {
                        "type": "mrkdwn",
                        "text": "*Metric:*\n{metric_name}"
                    },
                    {
                        "type": "mrkdwn",
                        "text": "*Value:*\n{value} {condition} {threshold}"
                    }
                ]
            },
            {
                "type": "section",
                "text": {
                    "type": "mrkdwn",
                    "text": "*Labels:*\n{labels}"
                }
            }
        ]
    }

    # Generic payload; every leaf is a string placeholder.
    WEBHOOK_TEMPLATE = {
        "alert": {
            "name": "{name}",
            "severity": "{severity}",
            "timestamp": "{timestamp}",
            "metric": {
                "name": "{metric_name}",
                "value": "{value}",
                "threshold": "{threshold}",
                "condition": "{condition}"
            },
            "labels": "{labels}",
            "description": "{description}"
        }
    }

Would you like me to continue with the Visualization Components next?