Observability and Monitoring Guide
Production Monitoring for Agentic AI Systems
Document ID: C2-OBSERVABILITY | Version: 1.0 | Category: P3 - Technical Deep Dives
Executive Summary
Agentic systems require comprehensive observability beyond traditional application monitoring. This guide covers metrics, traces, logs, and alerting specifically designed for multi-agent architectures.
Observability Pillars
The Four Pillars for Agents
| Pillar | Traditional | Agent-Specific |
|---|---|---|
| Metrics | Latency, throughput | Token usage, success rate, cost |
| Traces | Request flow | Agent reasoning chain |
| Logs | Events | Decisions, tool calls |
| Alerts | Thresholds | Anomaly detection, drift |
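The "anomaly detection" entry above deserves a concrete shape. As a minimal sketch (all names here are hypothetical, not part of any alerting library), a rolling z-score detector flags a sample that deviates sharply from a recent baseline, which is what threshold alerts miss when "normal" drifts over time:

```python
import statistics
from collections import deque

class RollingAnomalyDetector:
    """Flag a metric sample as anomalous when it deviates more than
    `z_threshold` standard deviations from a rolling baseline."""

    def __init__(self, window: int = 50, z_threshold: float = 3.0):
        self.samples = deque(maxlen=window)
        self.z_threshold = z_threshold

    def observe(self, value: float) -> bool:
        """Return True if `value` is anomalous relative to the window."""
        anomalous = False
        if len(self.samples) >= 10:  # need a minimal baseline first
            mean = statistics.fmean(self.samples)
            stdev = statistics.pstdev(self.samples)
            if stdev > 0 and abs(value - mean) / stdev > self.z_threshold:
                anomalous = True
        self.samples.append(value)
        return anomalous

detector = RollingAnomalyDetector()
for latency in [1.0, 1.1, 0.9, 1.0, 1.2, 0.8, 1.0, 1.1, 0.9, 1.0]:
    detector.observe(latency)
print(detector.observe(9.0))   # True: large spike flagged
print(detector.observe(1.05))  # False: normal sample passes
```

In production the same idea is usually delegated to the monitoring backend; this only illustrates the contrast with fixed thresholds.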
Metrics Framework
Core Agent Metrics
```python
from prometheus_client import Counter, Histogram, Gauge

# Request metrics
agent_requests = Counter(
    'agent_requests_total',
    'Total agent requests',
    ['agent_type', 'paradigm', 'status']
)

agent_latency = Histogram(
    'agent_latency_seconds',
    'Agent request latency',
    ['agent_type', 'paradigm'],
    buckets=[0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0]
)

# Token metrics
tokens_used = Counter(
    'agent_tokens_total',
    'Total tokens used',
    ['agent_type', 'model', 'token_type']
)

# Cost metrics
cost_incurred = Counter(
    'agent_cost_dollars',
    'Cost incurred in US dollars',
    ['agent_type', 'model']
)

# Tool metrics
tool_calls = Counter(
    'agent_tool_calls_total',
    'Tool calls made',
    ['agent_type', 'tool_name', 'status']
)

# Quality metrics
success_rate = Gauge(
    'agent_success_rate',
    'Rolling success rate over recent requests',
    ['agent_type', 'paradigm']
)
```
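The `buckets` on `agent_latency` matter because Prometheus estimates percentiles from cumulative bucket counts rather than raw samples. As a stdlib-only sketch of that estimation logic (the function name and sample counts are illustrative, not a Prometheus API), `histogram_quantile`-style interpolation works like this:

```python
def quantile_from_buckets(q, buckets):
    """Estimate a quantile from cumulative histogram buckets, the way
    Prometheus's histogram_quantile does: find the bucket containing the
    target rank, then interpolate linearly inside it.

    `buckets` is a list of (upper_bound, cumulative_count) pairs in
    ascending bound order, ending with (float('inf'), total).
    """
    total = buckets[-1][1]
    rank = q * total
    prev_bound, prev_count = 0.0, 0
    for bound, count in buckets:
        if count >= rank:
            if bound == float('inf'):
                return prev_bound  # quantile beyond the last finite bucket
            # Linear interpolation within the bucket
            fraction = (rank - prev_count) / (count - prev_count)
            return prev_bound + (bound - prev_bound) * fraction
        prev_bound, prev_count = bound, count
    return prev_bound

# Hypothetical cumulative counts for 100 requests, using the same bucket
# bounds as agent_latency above
buckets = [(0.1, 10), (0.5, 40), (1.0, 70), (2.0, 85),
           (5.0, 95), (10.0, 99), (30.0, 100), (60.0, 100),
           (float('inf'), 100)]
print(quantile_from_buckets(0.95, buckets))  # → 5.0
```

The practical consequence: pick bucket bounds near the latencies you care about, because the estimate can only resolve to within one bucket's width.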
Metrics Collection
```python
from collections import defaultdict, deque

class AgentMetricsCollector:
    """Collect and report agent metrics."""

    def __init__(self, window_size: int = 100):
        # One rolling window per (agent_type, paradigm), so the
        # success-rate gauge for one label set is not polluted by
        # requests from another.
        self.window_size = window_size
        self.request_windows = defaultdict(lambda: deque(maxlen=window_size))

    def record_request(
        self,
        agent_type: str,
        paradigm: str,
        success: bool,
        latency: float,
        tokens: dict,
        cost: float,
        tools_used: list,
        model: str = "claude-sonnet"
    ):
        # Counters
        status = "success" if success else "failure"
        agent_requests.labels(agent_type, paradigm, status).inc()

        # Histogram
        agent_latency.labels(agent_type, paradigm).observe(latency)

        # Tokens, e.g. {"input": 1200, "output": 350}
        for token_type, count in tokens.items():
            tokens_used.labels(agent_type, model, token_type).inc(count)

        # Cost
        cost_incurred.labels(agent_type, model).inc(cost)

        # Tools
        for tool in tools_used:
            tool_calls.labels(agent_type, tool["name"], tool["status"]).inc()

        # Rolling success rate: a bounded deque evicts the oldest
        # outcome automatically once the window is full.
        window = self.request_windows[(agent_type, paradigm)]
        window.append(success)
        success_rate.labels(agent_type, paradigm).set(sum(window) / len(window))
```
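The rolling-window logic is the only stateful part of the collector, and it is worth understanding in isolation. This stdlib-only sketch (class name hypothetical, no Prometheus dependency) shows why a bounded deque gives a true "last N requests" rate:

```python
from collections import deque

class RollingSuccessRate:
    """Rolling success rate over the last `window_size` outcomes.
    A bounded deque evicts the oldest outcome automatically, so the
    rate always reflects only recent requests."""

    def __init__(self, window_size: int = 100):
        self.window = deque(maxlen=window_size)

    def record(self, success: bool) -> float:
        self.window.append(success)
        # True counts as 1, False as 0
        return sum(self.window) / len(self.window)

tracker = RollingSuccessRate(window_size=4)
for outcome in [True, True, False, True]:
    rate = tracker.record(outcome)
print(rate)                  # 0.75 over the full window
print(tracker.record(True))  # oldest True evicted: still 0.75
```

Compared with appending to a list and calling `pop(0)`, `deque(maxlen=...)` makes eviction O(1) and impossible to forget.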
Distributed Tracing
Agent Trace Structure
```python
from opentelemetry import trace
from opentelemetry.trace import SpanKind

tracer = trace.get_tracer("agentic-system")

class AgentTracer:
    """Trace agent execution flow."""

    async def trace_execution(self, agent_id: str, task: str):
        with tracer.start_as_current_span(
            "agent_execution",
            kind=SpanKind.SERVER,
            attributes={
                "agent.id": agent_id,
                "agent.task": task[:100]
            }
        ) as span:
            # Planning phase. Note: use start_as_current_span, not
            # start_span — start_span does not activate the context,
            # so nested spans would not be parented correctly.
            with tracer.start_as_current_span("planning") as planning_span:
                plan = await self.plan(task)
                planning_span.set_attribute("plan.steps", len(plan.steps))

            # Execution loop
            for i, step in enumerate(plan.steps):
                with tracer.start_as_current_span(f"step_{i}") as step_span:
                    step_span.set_attribute("step.action", step.action)

                    # Tool call
                    if step.requires_tool:
                        with tracer.start_as_current_span("tool_call") as tool_span:
                            tool_span.set_attribute("tool.name", step.tool)
                            result = await self.call_tool(step.tool)
                            tool_span.set_attribute("tool.success", result.success)

                    # LLM call
                    with tracer.start_as_current_span("llm_call") as llm_span:
                        llm_span.set_attribute("llm.model", "claude-sonnet")
                        response = await self.llm_generate(step)
                        llm_span.set_attribute("llm.tokens", response.tokens)

            span.set_attribute("execution.success", True)
```
Trace Visualization
```
agent_execution (15.2s)
├── planning (1.2s)
│   └── llm_call: claude-sonnet (1.1s)
├── step_0: research (5.3s)
│   ├── tool_call: web_search (2.1s)
│   └── llm_call: synthesis (3.0s)
├── step_1: analyze (4.8s)
│   ├── tool_call: database_query (1.5s)
│   └── llm_call: analysis (3.2s)
└── step_2: respond (3.9s)
    └── llm_call: response (3.8s)
```
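A useful sanity check on exported span data is that top-level child durations roll up to the root span, and the per-phase shares show where optimization effort should go. A minimal sketch using the durations from the trace above:

```python
# Top-level span durations (seconds) from the trace above
phases = {
    "planning": 1.2,
    "step_0": 5.3,  # research
    "step_1": 4.8,  # analyze
    "step_2": 3.9,  # respond
}

# Children should account for (nearly all of) the root span's 15.2s;
# a large gap indicates untraced work between spans.
total = sum(phases.values())
print(round(total, 1))  # 15.2

# Share of wall-clock time per phase identifies the dominant cost
shares = {name: round(d / total, 2) for name, d in phases.items()}
print(shares["step_0"])  # research dominates at 0.35
```

Here the research step consumes about a third of the request, so caching web-search results would pay off more than tuning the response step.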
Structured Logging
Log Schema
```python
import time

import structlog

logger = structlog.get_logger()

class AgentLogger:
    """Structured logging for agents."""

    def log_request(self, request_id: str, agent_type: str, task: str):
        logger.info(
            "agent_request_started",
            request_id=request_id,
            agent_type=agent_type,
            task_preview=task[:100],
            timestamp=time.time()
        )

    def log_decision(
        self,
        request_id: str,
        decision_type: str,
        options: list,
        selected: str,
        rationale: str
    ):
        logger.info(
            "agent_decision",
            request_id=request_id,
            decision_type=decision_type,
            options_count=len(options),
            selected=selected,
            rationale=rationale[:200]
        )

    def log_tool_call(
        self,
        request_id: str,
        tool_name: str,
        input_summary: str,
        success: bool,
        latency_ms: float
    ):
        logger.info(
            "agent_tool_call",
            request_id=request_id,
            tool_name=tool_name,
            input_summary=input_summary[:100],
            success=success,
            latency_ms=latency_ms
        )

    def log_completion(
        self,
        request_id: str,
        success: bool,
        total_latency: float,
        tokens_used: int,
        cost: float
    ):
        logger.info(
            "agent_request_completed",
            request_id=request_id,
            success=success,
            total_latency_seconds=total_latency,
            tokens_used=tokens_used,
            cost_dollars=cost
        )
```
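The essential property of the schema above is one JSON object per event, with the event name and every field as top-level keys, so log pipelines can filter on any of them. If structlog is unavailable, the same shape can be approximated with the stdlib; this is a minimal sketch, not a replacement for structlog's processors:

```python
import json
import time

def log_event(event: str, **fields) -> str:
    """Stdlib stand-in for the structlog calls above: emit one JSON
    object per event with the event name and fields as top-level keys."""
    record = {"event": event, "timestamp": time.time(), **fields}
    line = json.dumps(record, sort_keys=True)
    print(line)
    return line

line = log_event(
    "agent_tool_call",
    request_id="req-123",
    tool_name="web_search",
    success=True,
    latency_ms=2100.0,
)
parsed = json.loads(line)
print(parsed["tool_name"])  # web_search
```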
Alerting Framework
Alert Definitions
```yaml
# Prometheus alerting rules
groups:
  - name: agent_alerts
    rules:
      - alert: AgentHighErrorRate
        expr: |
          sum(rate(agent_requests_total{status="failure"}[5m])) /
          sum(rate(agent_requests_total[5m])) > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Agent error rate above 10%"

      - alert: AgentHighLatency
        expr: |
          histogram_quantile(0.95, rate(agent_latency_seconds_bucket[5m])) > 30
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Agent P95 latency above 30s"

      - alert: AgentCostSpike
        expr: |
          rate(agent_cost_dollars[1h]) > 10
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "Agent cost spike detected"

      - alert: AgentToolFailures
        expr: |
          sum(rate(agent_tool_calls_total{status="failure"}[5m])) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High tool failure rate"
```
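The threshold logic behind `AgentHighErrorRate` is easy to get wrong at the edges (most notably when there is no traffic, where the PromQL ratio evaluates to NaN and the alert stays silent). A hedged Python sketch of the same decision, useful for unit-testing the threshold before shipping the rule:

```python
def error_rate_alert(failure_rate: float, total_rate: float,
                     threshold: float = 0.1) -> bool:
    """Mirror of the AgentHighErrorRate expression: failures as a
    share of all requests, compared against the 10% threshold.
    Guard the zero-traffic case explicitly rather than dividing by zero."""
    if total_rate == 0:
        return False
    return failure_rate / total_rate > threshold

print(error_rate_alert(failure_rate=0.6, total_rate=5.0))  # True (12%)
print(error_rate_alert(failure_rate=0.3, total_rate=5.0))  # False (6%)
```

The same pattern applies to the tool-failure rule: alert on a ratio, not an absolute failure rate, if your traffic volume varies widely.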
Dashboard Components
Key Visualizations
| Panel | Metric | Purpose |
|---|---|---|
| Request Rate | agent_requests_total | Traffic volume |
| Success Rate | success/total | Quality indicator |
| Latency Distribution | agent_latency_seconds | Performance |
| Token Usage | agent_tokens_total | Cost driver |
| Cost Trend | agent_cost_dollars | Budget tracking |
| Tool Performance | agent_tool_calls | Integration health |
Quick Reference
| Metric Type | Key Metrics | Alert Threshold |
|---|---|---|
| Availability | Success rate | <95% |
| Performance | P95 latency | >30s |
| Cost | Hourly spend | >$10/hr |
| Quality | Tool success | <90% |
| Volume | Request rate | Anomaly |
Document maintained by CODITECT SRE Team