Skip to main content

#!/usr/bin/env python3 """ CODITECT Agent Health Status Command (H.8.3.7)

CLI wrapper for the HealthMonitoringService (ADR-110) enabling agent health monitoring for autonomous agent loops.

Usage: # Show health for specific agent python3 health-status-command.py --agent-id claude-20260127-143000

# Show all agent health
python3 health-status-command.py --all

# Show health for task's agents
python3 health-status-command.py --task-id H.8.3.7

# Register new agent
python3 health-status-command.py --register --task-id H.8.3.7

# Record heartbeat
python3 health-status-command.py --heartbeat --agent-id claude-20260127-143000

Author: CODITECT Framework Version: 1.0.0 Created: January 27, 2026 Task Reference: H.8.3.7 ADR Reference: ADR-110-health-monitoring-recovery-protocol.md """

import argparse import asyncio import json import os import sys from datetime import datetime, timezone, timedelta from pathlib import Path

# Add parent directories to path for imports.
# SCRIPT_DIR is the directory holding this script; CORE_DIR is two levels up.
# Both are pushed to the front of sys.path so the project imports that follow
# resolve regardless of the current working directory.
SCRIPT_DIR = Path(__file__).resolve().parent
CORE_DIR = SCRIPT_DIR.parent.parent
sys.path.insert(0, str(CORE_DIR))
sys.path.insert(0, str(SCRIPT_DIR))

try: from ralph_wiggum.health_monitoring import ( HealthMonitoringService, HealthState, AgentHealth, HeartbeatPayload, CircuitBreaker, CircuitBreakerState, AgentNotFoundError, ) except ImportError as e: print(f"Error importing health_monitoring: {e}") print("Ensure ralph_wiggum/health_monitoring.py exists in scripts/core/") sys.exit(1)

def _percentage(value):
    """Convert *value* to an int and require it to lie in the range 0-100."""
    try:
        number = int(value)
    except ValueError:
        raise argparse.ArgumentTypeError(f"invalid int value: {value!r}") from None
    if not 0 <= number <= 100:
        raise argparse.ArgumentTypeError(f"must be between 0 and 100 (got {number})")
    return number


def parse_args(argv=None):
    """Parse command line arguments.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default)
            argparse falls back to ``sys.argv[1:]``, so existing callers that
            pass nothing behave as before.

    Returns:
        argparse.Namespace holding the parsed options.

    Raises:
        SystemExit: Raised by argparse on invalid input, including a
            ``--progress`` value outside 0-100 or more than one of the
            mutually exclusive query options.
    """
    parser = argparse.ArgumentParser(
        description="CODITECT Agent Health Status Command (ADR-110)",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Show health for specific agent
  %(prog)s --agent-id claude-20260127-143000

  # Show all agent health
  %(prog)s --all

  # Show health for task
  %(prog)s --task-id H.8.3.7

  # Register new agent
  %(prog)s --register --task-id H.8.5.4

  # Record heartbeat
  %(prog)s --heartbeat --agent-id claude-20260127-143000 --phase implementing

  # Show circuit breakers
  %(prog)s --circuit-breakers
""",
    )

    # Query options (mutually exclusive group).
    # --agent-id and --task-id live in this group, so they cannot be combined
    # on one command line; the registration example above reflects that.
    query_group = parser.add_mutually_exclusive_group()
    query_group.add_argument(
        "--agent-id",
        help="Show health for specific agent",
    )
    query_group.add_argument(
        "--task-id",
        help="Show health for all agents on task",
    )
    query_group.add_argument(
        "--all",
        action="store_true",
        help="Show health for all registered agents",
    )
    query_group.add_argument(
        "--circuit-breakers",
        action="store_true",
        help="Show circuit breaker status",
    )

    # Action options
    parser.add_argument(
        "--register",
        action="store_true",
        help="Register new agent for monitoring",
    )
    parser.add_argument(
        "--heartbeat",
        action="store_true",
        help="Record agent heartbeat",
    )
    parser.add_argument(
        "--transition",
        choices=["healthy", "degraded", "stuck", "failing", "terminated"],
        help="Manually transition agent state",
    )

    # Heartbeat/registration parameters
    parser.add_argument(
        "--phase",
        choices=["planning", "implementing", "testing", "reviewing", "handoff", "complete"],
        default="implementing",
        help="Execution phase (default: implementing)",
    )
    parser.add_argument(
        "--progress",
        type=_percentage,  # enforce the documented 0-100 range at parse time
        default=0,
        help="Progress percentage (0-100)",
    )
    parser.add_argument(
        "--message",
        help="Status message for heartbeat",
    )

    # History options
    parser.add_argument(
        "--history",
        action="store_true",
        help="Show intervention history",
    )
    parser.add_argument(
        "--history-limit",
        type=int,
        default=10,
        help="Limit history entries (default: 10)",
    )

    # Output options
    parser.add_argument(
        "--json",
        action="store_true",
        help="Output in JSON format",
    )
    parser.add_argument(
        "--quiet",
        action="store_true",
        help="Minimal output",
    )

    # ADR-159: Project scoping
    parser.add_argument(
        "--project",
        default=None,
        help="Project scope for health diagnostics (ADR-159). Auto-detected from $CODITECT_PROJECT.",
    )

    return parser.parse_args(argv)

def generate_agent_id():
    """Return an agent identifier of the form ``claude-YYYYMMDD-HHMMSS``.

    The suffix is the current UTC time at one-second resolution, so two calls
    within the same second produce the same identifier.
    """
    now_utc = datetime.now(timezone.utc)
    return "claude-" + now_utc.strftime("%Y%m%d-%H%M%S")

def format_time_ago(timestamp_str):
    """Format a timestamp as a short relative time such as ``"5m ago"``.

    Args:
        timestamp_str: An ISO-8601 string (a trailing ``Z`` is accepted) or a
            ``datetime``. Falsy values mean "no timestamp".

    Returns:
        ``"Never"`` for a falsy input; otherwise ``"<n>s ago"``, ``"<n>m ago"``,
        ``"<n>h ago"`` or ``"<n>d ago"``. Inputs that cannot be interpreted as
        a datetime are returned as ``str(timestamp_str)[:19]``.
    """
    if not timestamp_str:
        return "Never"

    try:
        if isinstance(timestamp_str, str):
            ts = datetime.fromisoformat(timestamp_str.replace("Z", "+00:00"))
        else:
            ts = timestamp_str

        # Subtracting a naive datetime from an aware one raises TypeError, which
        # previously dropped naive timestamps to the raw-string fallback.
        # NOTE(review): naive values are interpreted as UTC here -- confirm the
        # producers of these timestamps write UTC.
        if ts.tzinfo is None:
            ts = ts.replace(tzinfo=timezone.utc)

        elapsed = (datetime.now(timezone.utc) - ts).total_seconds()
    except (AttributeError, OverflowError, TypeError, ValueError):
        # Not a parseable string / not datetime-like: show the raw value, truncated.
        return str(timestamp_str)[:19]

    # A timestamp slightly in the future (clock skew) would otherwise render as
    # a negative duration such as "-120s ago".
    elapsed = max(elapsed, 0.0)

    if elapsed < 60:
        return f"{int(elapsed)}s ago"
    if elapsed < 3600:
        return f"{int(elapsed / 60)}m ago"
    if elapsed < 86400:
        return f"{int(elapsed / 3600)}h ago"
    return f"{int(elapsed / 86400)}d ago"

def state_icon(state):
    """Return the display label for a health state.

    Accepts either a ``HealthState`` member (its ``.value`` is used) or any
    other object (its ``str()`` is used). Known states map to fixed upper-case
    labels; anything else is upper-cased as-is.
    """
    # Handle both enum and string values
    state_val = state.value if isinstance(state, HealthState) else str(state)

    known_states = ("healthy", "degraded", "stuck", "failing", "terminated")
    icons = {label: label.upper() for label in known_states}
    return icons.get(state_val, state_val.upper())

def format_agent_health(health, include_history=False, history_limit=10):
    """Render one agent's health record as a multi-line text summary.

    Args:
        health: Object exposing ``agent_id``, ``state``, ``task_id``,
            ``last_heartbeat``, ``nudge_count`` and ``escalation_count``;
            ``metrics`` and ``circuit_breakers`` are rendered when present
            and truthy.
        include_history: Accepted for callers; not used by this function.
        history_limit: Accepted for callers; not used by this function.

    Returns:
        The summary as a single newline-joined string.
    """
    report = []
    report.append(f"Agent Health: {health.agent_id}")
    report.append("=" * 50)
    report.append(f" State: {state_icon(health.state)}")
    report.append(f" Task: {health.task_id}")
    report.append(f" Last Heartbeat: {format_time_ago(health.last_heartbeat)}")
    report.append(f" Nudges: {health.nudge_count}")
    report.append(f" Escalations: {health.escalation_count}")

    # Show metrics if available
    metrics = getattr(health, "metrics", None)
    if metrics:
        report.extend(
            [
                "",
                "Metrics:",
                f" Token Count: {metrics.token_count}",
                f" Error Count: {metrics.error_count}",
                f" Tool Calls: {metrics.tool_call_count}",
                f" Interventions: {metrics.intervention_count}",
            ]
        )

    # Show circuit breakers if available
    circuit_breakers = getattr(health, "circuit_breakers", None)
    if circuit_breakers:
        report.extend(["", "Circuit Breakers:"])
        for breaker in circuit_breakers:
            # Keep only the part after the last ':' of a qualified breaker name.
            short_name = breaker.name.split(":")[-1] if ":" in breaker.name else breaker.name
            if isinstance(breaker.state, str):
                state_text = breaker.state
            else:
                state_text = breaker.state.value
            report.append(f" {short_name}: {state_text} (failures: {breaker.failure_count})")

    return "\n".join(report)

def format_all_agents(agents):
    """Format all agents as a dashboard table.

    Args:
        agents: Sized iterable of health records exposing ``agent_id``,
            ``task_id``, ``state`` (string or enum with ``.value``) and
            ``last_heartbeat``.

    Returns:
        Newline-joined table: a header, one row per agent, and a summary line
        with the total and the per-state counts. The summary includes the
        ``terminated`` count, which was previously tallied but never printed.
    """
    lines = [
        "Agent Health Dashboard",
        "=" * 70,
        f"{'Agent ID':<28} | {'Task':<8} | {'State':<10} | Last HB",
        "-" * 28 + "-+-" + "-" * 8 + "-+-" + "-" * 10 + "-+-" + "-" * 10,
    ]

    # Per-state tallies; states outside this set are listed but not counted.
    tally = dict.fromkeys(("healthy", "degraded", "stuck", "failing", "terminated"), 0)

    for health in agents:
        # Handle both enum and string states
        state_val = health.state if isinstance(health.state, str) else health.state.value
        if state_val in tally:
            tally[state_val] += 1

        # Truncate over-long IDs so the column stays 28 characters wide.
        agent_id = health.agent_id[:26] + ".." if len(health.agent_id) > 28 else health.agent_id
        task_id = health.task_id[:8] if health.task_id else "N/A"
        state_str = state_icon(health.state)[:10]
        last_hb = format_time_ago(health.last_heartbeat)

        lines.append(f"{agent_id:<28} | {task_id:<8} | {state_str:<10} | {last_hb}")

    lines.append("=" * 70)
    total = len(agents)
    lines.append(
        f"Total: {total} agents | {tally['healthy']} healthy | {tally['degraded']} degraded"
        f" | {tally['stuck']} stuck | {tally['failing']} failing"
        f" | {tally['terminated']} terminated"
    )

    return "\n".join(lines)

def format_circuit_breakers(breakers):
    """Format circuit breakers as a table.

    Args:
        breakers: Mapping of breaker name to an object whose ``get_status()``
            returns something with ``state``, ``failure_count`` and
            ``last_failure_time`` attributes.

    Returns:
        Newline-joined table with one row per breaker.
    """
    lines = [
        "Circuit Breaker Status",
        "=" * 70,
        f"{'Circuit':<30} | {'State':<10} | {'Failures':<8} | Last Trip",
        "-" * 30 + "-+-" + "-" * 10 + "-+-" + "-" * 8 + "-+-" + "-" * 15,
    ]

    for name, breaker in breakers.items():
        status = breaker.get_status()

        # Accept both plain strings and enum-like states, mirroring how
        # format_agent_health renders breaker state, so an enum member is shown
        # by its value rather than its qualified name.
        raw_state = status.state
        if isinstance(raw_state, str):
            state = raw_state
        else:
            state = str(getattr(raw_state, "value", raw_state))

        failures = status.failure_count
        last_trip = format_time_ago(status.last_failure_time) if status.last_failure_time else "Never"

        # Shorten name if needed
        display_name = name[:28] + ".." if len(name) > 30 else name

        lines.append(f"{display_name:<30} | {state:<10} | {failures:<8} | {last_trip}")

    lines.append("=" * 70)
    return "\n".join(lines)

async def show_agent_health(args, service):
    """Fetch the health record for ``args.agent_id``.

    Returns whatever ``service.get_agent_health`` yields. When the service
    raises ``AgentNotFoundError`` a "not found" message is printed (as a JSON
    object when ``args.json`` is set) and ``None`` is returned.
    """
    try:
        return await service.get_agent_health(args.agent_id)
    except AgentNotFoundError:
        if not args.json:
            print(f"Agent {args.agent_id} not found")
        else:
            print(json.dumps({"error": f"Agent {args.agent_id} not found"}))
        return None

async def show_task_agents(args, service):
    """Return the health records whose ``task_id`` equals ``args.task_id``.

    All records are fetched via ``service.get_all_agent_health()`` and filtered
    here. When nothing matches, a message is printed (as a JSON object when
    ``args.json`` is set) and an empty list is returned.
    """
    matching = []
    for record in await service.get_all_agent_health():
        if record.task_id == args.task_id:
            matching.append(record)

    if matching:
        return matching

    if args.json:
        print(json.dumps({"error": f"No agents found for task {args.task_id}"}))
    else:
        print(f"No agents found for task {args.task_id}")
    return []

async def show_all_agents(args, service):
    """Return every health record reported by ``service.get_all_agent_health()``.

    When the service returns nothing, a "No agents registered" message is
    printed (as a JSON object when ``args.json`` is set) and an empty list is
    returned.
    """
    registered = await service.get_all_agent_health()
    if registered:
        return registered

    notice = "No agents registered"
    print(json.dumps({"error": notice}) if args.json else notice)
    return []

async def show_circuit_breakers(args, service):
    """Return the service's circuit breaker mapping.

    Reads the service's private ``_circuit_breakers`` attribute directly. When
    it is empty, a notice is printed (``{"circuit_breakers": {}}`` in JSON
    mode) and an empty dict is returned.
    """
    registry = service._circuit_breakers  # Access internal dict
    if registry:
        return registry

    if args.json:
        print(json.dumps({"circuit_breakers": {}}))
    else:
        print("No circuit breakers configured")
    return {}

async def register_agent(args, service):
    """Register a new agent for monitoring.

    Args:
        args: Parsed arguments. ``task_id`` is required. ``agent_id`` is used
            when present and truthy; otherwise a fresh identifier is generated.
        service: Object providing an awaitable
            ``register_agent(agent_id=..., task_id=...)``.

    Returns:
        Whatever ``service.register_agent`` returns, or ``None`` when no
        ``task_id`` was supplied (an error is printed in that case, as a JSON
        object when ``args.json`` is set, matching the query helpers).
    """
    task_id = args.task_id
    if not task_id:
        # Validate before generating an ID so a rejected call does no work.
        if getattr(args, "json", False):
            print(json.dumps({"error": "--task-id required for registration"}))
        else:
            print("Error: --task-id required for registration")
        return None

    # The CLI parser currently puts --agent-id and --task-id in one mutually
    # exclusive group, so agent_id is normally absent here. An explicit ID is
    # honoured when a caller supplies one rather than being silently discarded.
    agent_id = getattr(args, "agent_id", None) or generate_agent_id()

    return await service.register_agent(
        agent_id=agent_id,
        task_id=task_id,
    )

async def record_heartbeat(args, service):
    """Send a heartbeat for ``args.agent_id`` and return the agent's health.

    Builds a ``HeartbeatPayload`` from ``agent_id``, ``task_id`` (empty string
    when unset), ``phase`` and ``progress`` (rendered as ``"<n>%"``, or an
    empty string when progress is 0/unset). ``args.message`` is not forwarded
    by this function.

    Returns:
        The health record fetched after recording, or ``None`` when no
        ``agent_id`` was given or the service raises ``AgentNotFoundError``.
    """
    if not args.agent_id:
        print("Error: --agent-id required for heartbeat")
        return None

    # Create HeartbeatPayload
    payload_fields = {
        "agent_id": args.agent_id,
        "task_id": args.task_id or "",
        "phase": args.phase,
        "progress_indicator": f"{args.progress}%" if args.progress else "",
    }
    heartbeat = HeartbeatPayload(**payload_fields)

    await service.record_heartbeat(heartbeat)

    # Get updated health
    try:
        return await service.get_agent_health(args.agent_id)
    except AgentNotFoundError:
        return None

async def send_nudge(args, service):
    """Send a nudge carrying ``args.message`` to ``args.agent_id``.

    Returns:
        The agent's health record when ``service.send_nudge`` reports success;
        ``None`` when no ``agent_id`` was given, when the service reports
        failure (a "max nudge attempts" message is printed), or when the
        service raises ``AgentNotFoundError``.
    """
    target = args.agent_id
    if not target:
        print("Error: --agent-id required for nudge")
        return None

    try:
        delivered = await service.send_nudge(args.agent_id, args.message)
        if not delivered:
            print(f"Max nudge attempts reached for agent {args.agent_id}")
            return None
        return await service.get_agent_health(args.agent_id)
    except AgentNotFoundError:
        print(f"Agent {args.agent_id} not found")
        return None

async def async_main():
    """Async main entry point.

    Parses the command line, resolves the project scope (ADR-159), creates a
    ``HealthMonitoringService`` and dispatches to the first matching action in
    this order: register, heartbeat, circuit breakers, all agents, task
    agents, single agent. When nothing matches, a usage hint is printed.

    NOTE(review): ``--transition``, ``--message`` and the nudge helper are
    defined in this file but no branch below acts on them.
    """
    args = parse_args()

    # ADR-159: Resolve project scope
    project_id = getattr(args, 'project', None) or os.environ.get('CODITECT_PROJECT')
    if not project_id:
        try:
            from scope import resolve_scope
            scope = resolve_scope()
            project_id = scope.project
        except ImportError:
            # Best effort: the scope module is optional, so run unscoped.
            pass
    if project_id:
        os.environ['CODITECT_PROJECT'] = project_id

    # Initialize service
    service = HealthMonitoringService()

    # Handle registration
    if args.register:
        health = await register_agent(args, service)
        if health:
            if args.json:
                print(json.dumps(health.to_dict(), indent=2, default=str))
            elif args.quiet:
                print(f"Registered: {health.agent_id}")
            else:
                print("Agent Registered")
                print(format_agent_health(health))
        return

    # Handle heartbeat
    if args.heartbeat:
        health = await record_heartbeat(args, service)
        if health:
            if args.json:
                print(json.dumps(health.to_dict(), indent=2, default=str))
            elif args.quiet:
                print(f"Heartbeat: {health.agent_id} - {health.state}")
            else:
                print("Heartbeat Recorded")
                print(f" Agent: {health.agent_id}")
                print(f" Task: {health.task_id}")
                print(f" State: {state_icon(health.state)}")
        return

    # Handle circuit breakers query
    if args.circuit_breakers:
        breakers = await show_circuit_breakers(args, service)
        if breakers:
            if args.json:
                result = {}
                for name, b in breakers.items():
                    status = b.get_status()
                    result[name] = {
                        # Enum-like states are reduced to their value so they
                        # serialise the same way the text output renders them.
                        "state": getattr(status.state, "value", status.state),
                        "failure_count": status.failure_count,
                        "last_failure_time": status.last_failure_time,
                    }
                # default=str matches every other JSON path in this function and
                # keeps non-JSON-native values (e.g. datetimes) from raising
                # TypeError.
                print(json.dumps(result, indent=2, default=str))
            else:
                print(format_circuit_breakers(breakers))
        return

    # Handle all agents query
    if args.all:
        agents = await show_all_agents(args, service)
        if agents:
            if args.json:
                print(json.dumps([a.to_dict() for a in agents], indent=2, default=str))
            else:
                print(format_all_agents(agents))
        return

    # Handle task agents query
    if args.task_id and not args.register:
        agents = await show_task_agents(args, service)
        if agents:
            if args.json:
                print(json.dumps([a.to_dict() for a in agents], indent=2, default=str))
            else:
                print(f"Agents for task {args.task_id}:")
                print(format_all_agents(agents))
        return

    # Handle specific agent query
    if args.agent_id and not args.heartbeat:
        health = await show_agent_health(args, service)
        if health:
            if args.json:
                print(json.dumps(health.to_dict(), indent=2, default=str))
            else:
                print(format_agent_health(health, args.history, args.history_limit))
        return

    # No action specified - show usage
    print("Error: Specify one of --agent-id, --task-id, --all, --circuit-breakers, --register, or --heartbeat")
    print("Use --help for usage information")

def main():
    """Synchronous entry point: run ``async_main`` to completion on a new event loop."""
    entry_coroutine = async_main()
    asyncio.run(entry_coroutine)

# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()