Skip to main content

Observability and Monitoring Guide

Production Monitoring for Agentic AI Systems

Document ID: C2-OBSERVABILITY | Version: 1.0 | Category: P3 - Technical Deep Dives


Executive Summary

Agentic systems require comprehensive observability beyond traditional application monitoring. This guide covers metrics, traces, logs, and alerting specifically designed for multi-agent architectures.


Observability Pillars

The Four Pillars for Agents

| Pillar  | Traditional         | Agent-Specific                  |
|---------|---------------------|---------------------------------|
| Metrics | Latency, throughput | Token usage, success rate, cost |
| Traces  | Request flow        | Agent reasoning chain           |
| Logs    | Events              | Decisions, tool calls           |
| Alerts  | Thresholds          | Anomaly detection, drift        |

Metrics Framework

Core Agent Metrics

from dataclasses import dataclass
from prometheus_client import Counter, Histogram, Gauge

# ---------------------------------------------------------------------------
# Prometheus metric families for agent workloads. Each is labelled so that
# dashboards and alerts can slice by agent type, paradigm, model, or tool.
# ---------------------------------------------------------------------------

# Request volume, partitioned by outcome ("success" / "failure").
agent_requests = Counter(
    'agent_requests_total',
    'Total agent requests',
    ['agent_type', 'paradigm', 'status'],
)

# End-to-end request latency. Buckets span sub-second responses up to the
# minute-long multi-step executions typical of agent loops.
agent_latency = Histogram(
    'agent_latency_seconds',
    'Agent request latency',
    ['agent_type', 'paradigm'],
    buckets=[0.1, 0.5, 1.0, 2.0, 5.0, 10.0, 30.0, 60.0],
)

# Token consumption, split by token_type (e.g. input vs. output).
tokens_used = Counter(
    'agent_tokens_total',
    'Total tokens used',
    ['agent_type', 'model', 'token_type'],
)

# Dollar cost attributed per agent type and model.
cost_incurred = Counter(
    'agent_cost_dollars',
    'Cost incurred',
    ['agent_type', 'model'],
)

# Tool invocations, partitioned by outcome for integration-health alerting.
tool_calls = Counter(
    'agent_tool_calls_total',
    'Tool calls made',
    ['agent_type', 'tool_name', 'status'],
)

# Rolling success rate published as a gauge (windowed by the collector).
success_rate = Gauge(
    'agent_success_rate',
    'Rolling success rate',
    ['agent_type', 'paradigm'],
)

Metrics Collection

class AgentMetricsCollector:
    """Collect per-request agent telemetry and publish it to Prometheus.

    Keeps a bounded rolling window of outcomes *per (agent_type, paradigm)*
    so the exported ``success_rate`` gauge reflects each labelled population
    independently. (Bug fix: the original kept one shared buffer, so one
    agent type's failures skewed every other agent type's gauge.)
    """

    def __init__(self, window_size: int = 100):
        from collections import defaultdict, deque

        # Samples retained per (agent_type, paradigm) for the rolling rate.
        self.window_size = window_size
        # deque(maxlen=...) evicts the oldest sample in O(1);
        # the original list.pop(0) was O(n) per request.
        self.request_buffer = defaultdict(lambda: deque(maxlen=window_size))

    def record_request(
        self,
        agent_type: str,
        paradigm: str,
        success: bool,
        latency: float,
        tokens: dict,
        cost: float,
        tools_used: list,
        model: str = "claude-sonnet",
    ):
        """Record one completed agent request across all metric families.

        Args:
            agent_type: Logical agent category (metric label).
            paradigm: Orchestration paradigm (metric label).
            success: Whether the request succeeded end to end.
            latency: End-to-end latency in seconds.
            tokens: Mapping of token_type -> count (e.g. input/output).
            cost: Dollar cost incurred by this request.
            tools_used: Iterable of ``{"name": ..., "status": ...}`` dicts.
            model: Model label for token/cost metrics. Defaulted to the
                previously hard-coded value, so existing callers see no
                behavior change.
        """
        # Request counter, partitioned by outcome.
        status = "success" if success else "failure"
        agent_requests.labels(agent_type, paradigm, status).inc()

        # Latency histogram.
        agent_latency.labels(agent_type, paradigm).observe(latency)

        # Token counters, one series per token type.
        for token_type, count in tokens.items():
            tokens_used.labels(agent_type, model, token_type).inc(count)

        # Cost counter.
        cost_incurred.labels(agent_type, model).inc(cost)

        # Tool-call counters.
        for tool in tools_used:
            tool_calls.labels(agent_type, tool["name"], tool["status"]).inc()

        # Rolling success rate, windowed per (agent_type, paradigm).
        window = self.request_buffer[(agent_type, paradigm)]
        window.append(success)
        success_rate.labels(agent_type, paradigm).set(sum(window) / len(window))

Distributed Tracing

Agent Trace Structure

from opentelemetry import trace
from opentelemetry.trace import SpanKind

tracer = trace.get_tracer("agentic-system")


class AgentTracer:
    """Trace agent execution flow with properly nested OpenTelemetry spans."""

    async def trace_execution(self, agent_id: str, task: str):
        """Run the plan/execute loop for ``task`` under one root span.

        Bug fix: the original used ``tracer.start_span`` for child spans,
        which starts a span *without* activating it in the current context,
        so the ``tool_call``/``llm_call`` spans were parented to the root
        span instead of their enclosing step span. ``start_as_current_span``
        both starts and activates each span, producing the nested trace
        shown in the visualization below.
        """
        with tracer.start_as_current_span(
            "agent_execution",
            kind=SpanKind.SERVER,
            attributes={
                "agent.id": agent_id,
                # Truncated: tasks can be arbitrarily long free text.
                "agent.task": task[:100],
            },
        ) as span:
            # Planning phase.
            with tracer.start_as_current_span("planning") as planning_span:
                plan = await self.plan(task)
                planning_span.set_attribute("plan.steps", len(plan.steps))

            # Execution loop: one span per plan step.
            for i, step in enumerate(plan.steps):
                with tracer.start_as_current_span(f"step_{i}") as step_span:
                    step_span.set_attribute("step.action", step.action)

                    # Optional tool call, nested under the step span.
                    if step.requires_tool:
                        with tracer.start_as_current_span("tool_call") as tool_span:
                            tool_span.set_attribute("tool.name", step.tool)
                            result = await self.call_tool(step.tool)
                            tool_span.set_attribute("tool.success", result.success)

                    # LLM call for this step, nested under the step span.
                    with tracer.start_as_current_span("llm_call") as llm_span:
                        llm_span.set_attribute("llm.model", "claude-sonnet")
                        response = await self.llm_generate(step)
                        llm_span.set_attribute("llm.tokens", response.tokens)

            span.set_attribute("execution.success", True)

Trace Visualization

agent_execution (15.2s)
├── planning (1.2s)
│   └── llm_call: claude-sonnet (1.1s)
├── step_0: research (5.3s)
│   ├── tool_call: web_search (2.1s)
│   └── llm_call: synthesis (3.0s)
├── step_1: analyze (4.8s)
│   ├── tool_call: database_query (1.5s)
│   └── llm_call: analysis (3.2s)
└── step_2: respond (3.9s)
    └── llm_call: response (3.8s)

Structured Logging

Log Schema

import time

import structlog

logger = structlog.get_logger()


class AgentLogger:
    """Structured logging for agents.

    Emits one structured event per lifecycle milestone (request start,
    decision, tool call, completion), all keyed by ``request_id`` so the
    events of a single request can be correlated downstream.
    """

    def log_request(self, request_id: str, agent_type: str, task: str):
        """Log the start of an agent request."""
        logger.info(
            "agent_request_started",
            request_id=request_id,
            agent_type=agent_type,
            # Truncated: the full task can be arbitrarily long user text.
            task_preview=task[:100],
            # Bug fix: `time` was used here but never imported.
            timestamp=time.time(),
        )

    def log_decision(
        self,
        request_id: str,
        decision_type: str,
        options: list,
        selected: str,
        rationale: str,
    ):
        """Log an agent decision: what was chosen, out of how many, and why."""
        logger.info(
            "agent_decision",
            request_id=request_id,
            decision_type=decision_type,
            # Only the count is logged; the options themselves may be large.
            options_count=len(options),
            selected=selected,
            rationale=rationale[:200],
        )

    def log_tool_call(
        self,
        request_id: str,
        tool_name: str,
        input_summary: str,
        success: bool,
        latency_ms: float,
    ):
        """Log one tool invocation with its outcome and latency."""
        logger.info(
            "agent_tool_call",
            request_id=request_id,
            tool_name=tool_name,
            input_summary=input_summary[:100],
            success=success,
            latency_ms=latency_ms,
        )

    def log_completion(
        self,
        request_id: str,
        success: bool,
        total_latency: float,
        tokens_used: int,
        cost: float,
    ):
        """Log the end of a request with its aggregate outcome and cost."""
        logger.info(
            "agent_request_completed",
            request_id=request_id,
            success=success,
            total_latency_seconds=total_latency,
            tokens_used=tokens_used,
            cost_dollars=cost,
        )

Alerting Framework

Alert Definitions

# Prometheus alerting rules
groups:
  - name: agent_alerts
    rules:
      # Fire when more than 10% of agent requests fail over 5 minutes.
      - alert: AgentHighErrorRate
        expr: |
          sum(rate(agent_requests_total{status="failure"}[5m])) /
          sum(rate(agent_requests_total[5m])) > 0.1
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Agent error rate above 10%"

      # P95 end-to-end latency exceeds 30 seconds.
      - alert: AgentHighLatency
        expr: |
          histogram_quantile(0.95, rate(agent_latency_seconds_bucket[5m])) > 30
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Agent P95 latency above 30s"

      # Bug fix: rate() is per-second, so `rate(...[1h]) > 10` meant
      # $10/second (~$36,000/hr). increase() over the 1h window matches
      # the documented ">$10/hr" threshold.
      - alert: AgentCostSpike
        expr: |
          sum(increase(agent_cost_dollars[1h])) > 10
        for: 15m
        labels:
          severity: warning
        annotations:
          summary: "Agent cost spike detected"

      # More than 5 tool failures per second sustained for 5 minutes.
      - alert: AgentToolFailures
        expr: |
          sum(rate(agent_tool_calls_total{status="failure"}[5m])) > 5
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High tool failure rate"

Dashboard Components

Key Visualizations

| Panel                | Metric                  | Purpose            |
|----------------------|-------------------------|--------------------|
| Request Rate         | agent_requests_total    | Traffic volume     |
| Success Rate         | success/total           | Quality indicator  |
| Latency Distribution | agent_latency_seconds   | Performance        |
| Token Usage          | agent_tokens_total      | Cost driver        |
| Cost Trend           | agent_cost_dollars      | Budget tracking    |
| Tool Performance     | agent_tool_calls        | Integration health |

Quick Reference

| Metric Type  | Key Metrics  | Alert Threshold |
|--------------|--------------|-----------------|
| Availability | Success rate | <95%            |
| Performance  | P95 latency  | >30s            |
| Cost         | Hourly spend | >$10/hr         |
| Quality      | Tool success | <90%            |
| Volume       | Request rate | Anomaly         |

Document maintained by CODITECT SRE Team