Agentic AI Benchmarking Framework
Standardized Evaluation Methodology
Document ID: C10-BENCHMARKING-FRAMEWORK
Version: 1.0
Category: Technical Deep Dive
Evaluation Dimensions
┌──────────────────────────────────────────────────────────────┐
│               AGENTIC AI EVALUATION FRAMEWORK                │
├──────────────────────────────────────────────────────────────┤
│                                                              │
│  QUALITY            EFFICIENCY         RELIABILITY           │
│  ────────           ──────────         ───────────           │
│  • Accuracy         • Token usage      • Error rate          │
│  • Completeness     • Latency          • Recovery rate       │
│  • Relevance        • Cost per task    • Consistency         │
│  • Citation quality • Throughput       • Graceful degradation│
│                                                              │
│  SAFETY             COMPLIANCE         USER EXPERIENCE       │
│  ──────             ──────────         ───────────────       │
│  • Hallucination    • Audit coverage   • Satisfaction        │
│  • Harmful output   • Policy adherence • Task completion     │
│  • Data leakage     • Documentation    • Trust indicators    │
│                                                              │
└──────────────────────────────────────────────────────────────┘
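One way to carry these dimensions through the harness is a plain mapping from dimension to metric keys, so that reports can group scores by dimension. The sketch below is illustrative only; the DIMENSION_METRICS name and the metric identifiers are not part of any fixed API.

# Illustrative only: the evaluation dimensions from the diagram as data,
# so downstream reports can group per-criterion scores by dimension.
DIMENSION_METRICS = {
    "quality": ["accuracy", "completeness", "relevance", "citation_quality"],
    "efficiency": ["token_usage", "latency", "cost_per_task", "throughput"],
    "reliability": ["error_rate", "recovery_rate", "consistency", "graceful_degradation"],
    "safety": ["hallucination", "harmful_output", "data_leakage"],
    "compliance": ["audit_coverage", "policy_adherence", "documentation"],
    "user_experience": ["satisfaction", "task_completion", "trust_indicators"],
}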
Benchmark Suite Structure
import time
from dataclasses import dataclass
from typing import List, Dict, Callable, Optional
from enum import Enum


class Paradigm(Enum):
    LSR = "latent_space_reasoner"
    GS = "grounded_synthesizer"
    EP = "emergent_planner"
    VE = "verifiable_executor"


@dataclass
class BenchmarkTask:
    id: str
    name: str
    paradigm: Paradigm
    input_data: Dict
    expected_output: Dict
    evaluation_criteria: List[str]
    timeout_seconds: int = 60
    max_iterations: int = 10


@dataclass
class BenchmarkResult:
    task_id: str
    success: bool
    scores: Dict[str, float]
    output: Dict
    tokens_used: int
    latency_ms: float
    iterations: int
    errors: List[str]


class AgenticBenchmarkSuite:
    def __init__(self, name: str):
        self.name = name
        self.tasks: List[BenchmarkTask] = []
        self.evaluators: Dict[str, Callable] = {}

    def add_task(self, task: BenchmarkTask):
        self.tasks.append(task)

    def register_evaluator(self, name: str, evaluator: Callable):
        self.evaluators[name] = evaluator

    def run(self, agent, paradigm_filter: Optional[Paradigm] = None) -> List[BenchmarkResult]:
        tasks = self.tasks
        if paradigm_filter:
            tasks = [t for t in tasks if t.paradigm == paradigm_filter]
        results = []
        for task in tasks:
            result = self._run_task(agent, task)
            results.append(result)
        return results

    def _run_task(self, agent, task: BenchmarkTask) -> BenchmarkResult:
        start_time = time.time()
        errors = []
        try:
            # The agent under test is expected to expose execute(),
            # last_token_count, and last_iteration_count.
            output = agent.execute(
                task.input_data,
                max_iterations=task.max_iterations,
                timeout=task.timeout_seconds
            )
            # Score the output with each registered evaluator for this task
            scores = {}
            for criterion in task.evaluation_criteria:
                evaluator = self.evaluators.get(criterion)
                if evaluator:
                    scores[criterion] = evaluator(output, task.expected_output)
            # Pass only if every evaluated criterion meets the 0.7 threshold
            success = all(s >= 0.7 for s in scores.values())
        except Exception as e:
            output = {}
            scores = {c: 0.0 for c in task.evaluation_criteria}
            success = False
            errors.append(str(e))
        latency_ms = (time.time() - start_time) * 1000
        return BenchmarkResult(
            task_id=task.id,
            success=success,
            scores=scores,
            output=output,
            tokens_used=agent.last_token_count,
            latency_ms=latency_ms,
            iterations=agent.last_iteration_count,
            errors=errors
        )
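A minimal smoke test of the suite might look like the following sketch. StubAgent is a hypothetical stand-in that only satisfies the interface _run_task expects; the standard evaluators defined in the next section can be registered the same way as the inline lambda used here.

# Illustrative smoke test; StubAgent is hypothetical, a real agent would
# call an LLM and tools inside execute().
class StubAgent:
    def __init__(self):
        self.last_token_count = 0
        self.last_iteration_count = 0

    def execute(self, input_data, max_iterations, timeout):
        self.last_token_count = 42
        self.last_iteration_count = 1
        return {"answer": input_data.get("question", "").upper()}

suite = AgenticBenchmarkSuite("smoke-test")
suite.register_evaluator("exact_match", lambda out, exp: 1.0 if out == exp else 0.0)
suite.add_task(BenchmarkTask(
    id="smoke-001",
    name="Echo",
    paradigm=Paradigm.LSR,
    input_data={"question": "hello"},
    expected_output={"answer": "HELLO"},
    evaluation_criteria=["exact_match"],
))

for result in suite.run(StubAgent()):
    print(result.task_id, result.success, result.scores)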
Standard Evaluators
Accuracy Evaluators
import json

import numpy as np


def exact_match_evaluator(output: Dict, expected: Dict) -> float:
    """Check for exact match of key fields"""
    if not output or not expected:
        return 0.0
    matches = 0
    total = len(expected)
    for key, expected_value in expected.items():
        if key in output and output[key] == expected_value:
            matches += 1
    return matches / total if total > 0 else 0.0


def semantic_similarity_evaluator(output: Dict, expected: Dict) -> float:
    """Evaluate semantic similarity of text outputs"""
    output_text = output.get('text', '')
    expected_text = expected.get('text', '')
    # embedding_model and cosine_similarity are supplied by the harness
    # (one possible sketch follows below)
    output_embedding = embedding_model.encode(output_text)
    expected_embedding = embedding_model.encode(expected_text)
    similarity = cosine_similarity(output_embedding, expected_embedding)
    return float(similarity)


def factual_accuracy_evaluator(output: Dict, expected: Dict) -> float:
    """Evaluate factual accuracy using LLM-as-judge"""
    prompt = f"""
    Evaluate the factual accuracy of the following output against the expected answer.
    Output: {json.dumps(output)}
    Expected: {json.dumps(expected)}
    Rate accuracy from 0.0 to 1.0 where:
    - 1.0 = Completely accurate
    - 0.8 = Minor inaccuracies
    - 0.5 = Partially accurate
    - 0.2 = Mostly inaccurate
    - 0.0 = Completely wrong
    Return only the numeric score.
    """
    # llm_judge is an LLM client supplied by the harness
    score = float(llm_judge.complete(prompt))
    return min(max(score, 0.0), 1.0)
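The evaluators above lean on an embedding_model, a cosine_similarity helper, and an llm_judge client that the harness must supply; none of them are defined in this document. Below is one possible sketch of the similarity side, assuming the sentence-transformers package (the model name is an arbitrary choice), plus a defensive score parser that could replace the bare float() call on the judge's reply.

import re

import numpy as np
from sentence_transformers import SentenceTransformer  # assumed dependency

# Any sentence-embedding model works here; this particular model is illustrative.
embedding_model = SentenceTransformer("all-MiniLM-L6-v2")

def cosine_similarity(a: np.ndarray, b: np.ndarray) -> float:
    """Cosine similarity between two embedding vectors."""
    denom = float(np.linalg.norm(a) * np.linalg.norm(b))
    return float(np.dot(a, b) / denom) if denom else 0.0

def parse_judge_score(raw: str) -> float:
    """Pull the first number out of an LLM judge reply, defaulting to 0.0."""
    match = re.search(r"\d+(?:\.\d+)?", raw)
    return float(match.group()) if match else 0.0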
Citation Evaluators (GS Paradigm)
def citation_coverage_evaluator(output: Dict, expected: Dict) -> float:
    """Evaluate citation coverage for factual claims"""
    claims = output.get('claims', [])
    citations = output.get('citations', [])
    if not claims:
        return 1.0  # No claims to cite
    cited_claims = 0
    for claim in claims:
        if any(c['claim_id'] == claim['id'] for c in citations):
            cited_claims += 1
    return cited_claims / len(claims)


def citation_accuracy_evaluator(output: Dict, expected: Dict) -> float:
    """Evaluate whether citations support their claims"""
    citations = output.get('citations', [])
    sources = output.get('sources', [])
    if not citations:
        return 1.0
    accurate = 0
    for citation in citations:
        source_id = citation.get('source_id')
        source = next((s for s in sources if s['id'] == source_id), None)
        if source:
            # Use an LLM to verify the citation supports the claim
            # (verify_citation_supports_claim is sketched below)
            verification = verify_citation_supports_claim(
                claim=citation['claim'],
                source_text=source['text']
            )
            if verification:
                accurate += 1
    return accurate / len(citations)
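citation_accuracy_evaluator depends on a verify_citation_supports_claim helper that is not shown. One possible sketch, reusing the same assumed llm_judge client as factual_accuracy_evaluator and counting only an explicit "yes" as support:

def verify_citation_supports_claim(claim: str, source_text: str) -> bool:
    """Ask the judge model whether the cited source actually supports the claim."""
    prompt = (
        "Does the following source text support the claim?\n"
        f"Claim: {claim}\n"
        f"Source: {source_text}\n"
        "Answer with exactly 'yes' or 'no'."
    )
    reply = llm_judge.complete(prompt)  # llm_judge assumed, as above
    return reply.strip().lower().startswith("yes")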
Protocol Compliance Evaluators (VE Paradigm)
def protocol_adherence_evaluator(output: Dict, expected: Dict) -> float:
    """Evaluate adherence to defined protocol"""
    expected_steps = expected.get('protocol_steps', [])
    executed_steps = output.get('executed_steps', [])
    if not expected_steps:
        return 1.0
    # Check that all required steps were executed
    executed_step_ids = {s['step_id'] for s in executed_steps}
    required_step_ids = {s['step_id'] for s in expected_steps if s.get('required', True)}
    if not required_step_ids.issubset(executed_step_ids):
        return 0.0
    # Check ordering: score the fraction of steps executed in the expected position
    expected_order = [s['step_id'] for s in expected_steps]
    actual_order = [s['step_id'] for s in executed_steps]
    order_correct = sum(1 for i, step in enumerate(expected_order)
                        if i < len(actual_order) and actual_order[i] == step)
    return order_correct / len(expected_order)


def audit_completeness_evaluator(output: Dict, expected: Dict) -> float:
    """Evaluate completeness of audit trail"""
    audit_records = output.get('audit_trail', [])
    executed_actions = output.get('actions', [])
    if not executed_actions:
        return 1.0
    audited_actions = {r['action_id'] for r in audit_records}
    all_actions = {a['id'] for a in executed_actions}
    return len(audited_actions & all_actions) / len(all_actions)
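Both VE evaluators read specific keys from the agent output and the expected payload. A hypothetical example of those shapes (all ids are made up) and the scores it would produce:

# Hypothetical payloads illustrating the keys the VE evaluators read.
ve_output = {
    "executed_steps": [{"step_id": "validate"}, {"step_id": "apply"}],
    "actions": [{"id": "a1"}, {"id": "a2"}],
    "audit_trail": [{"action_id": "a1"}, {"action_id": "a2"}],
}
ve_expected = {
    "protocol_steps": [
        {"step_id": "validate", "required": True},
        {"step_id": "apply", "required": True},
    ],
}

print(protocol_adherence_evaluator(ve_output, ve_expected))  # 1.0
print(audit_completeness_evaluator(ve_output, ve_expected))  # 1.0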
Standard Benchmark Tasks
LSR Benchmark Tasks
LSR_TASKS = [
    BenchmarkTask(
        id="lsr-creative-001",
        name="Marketing Copy Generation",
        paradigm=Paradigm.LSR,
        input_data={
            "product": "Cloud security platform",
            "target_audience": "CISOs",
            "tone": "Professional, authoritative",
            "format": "LinkedIn post"
        },
        expected_output={
            "word_count_range": (100, 200),
            "includes_cta": True,
            "tone_match": True
        },
        evaluation_criteria=["tone_match", "creativity", "cta_presence"]
    ),
    BenchmarkTask(
        id="lsr-synthesis-001",
        name="Strategy Synthesis",
        paradigm=Paradigm.LSR,
        input_data={
            "context": "Company expanding to new market",
            "constraints": ["Limited budget", "6-month timeline"],
            "goal": "Develop market entry strategy"
        },
        expected_output={
            "has_recommendations": True,
            "addresses_constraints": True,
            "actionable": True
        },
        evaluation_criteria=["completeness", "actionability", "creativity"]
    )
]
GS Benchmark Tasks
GS_TASKS = [
    BenchmarkTask(
        id="gs-research-001",
        name="Market Research Synthesis",
        paradigm=Paradigm.GS,
        input_data={
            "query": "What is the current market size for enterprise AI platforms?",
            "sources": ["industry_reports", "analyst_coverage"],
            "recency": "last_12_months"
        },
        expected_output={
            "has_market_size": True,
            "has_citations": True,
            "min_sources": 3
        },
        evaluation_criteria=["factual_accuracy", "citation_coverage", "citation_accuracy"]
    )
]
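These task lists can be loaded into a single suite and run per paradigm, as in the sketch below. Evaluators for the LSR criteria (tone_match, creativity, cta_presence, and so on) are not defined in this document and would need to be registered separately.

# Sketch: assemble the standard tasks and run only the GS subset.
suite = AgenticBenchmarkSuite("standard-agentic-benchmark")
for task in LSR_TASKS + GS_TASKS:
    suite.add_task(task)

suite.register_evaluator("factual_accuracy", factual_accuracy_evaluator)
suite.register_evaluator("citation_coverage", citation_coverage_evaluator)
suite.register_evaluator("citation_accuracy", citation_accuracy_evaluator)

# 'agent' is whatever implementation is under evaluation.
gs_results = suite.run(agent, paradigm_filter=Paradigm.GS)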
Reporting Template
from datetime import datetime

import numpy as np


def generate_benchmark_report(results: List[BenchmarkResult], suite_name: str) -> Dict:
    """Generate comprehensive benchmark report"""
    report = {
        "suite_name": suite_name,
        "run_date": datetime.utcnow().isoformat(),
        "summary": {},
        "by_paradigm": {},
        "by_task": [],
        "recommendations": []
    }
    # Overall summary
    report["summary"] = {
        "total_tasks": len(results),
        "passed": sum(1 for r in results if r.success),
        "failed": sum(1 for r in results if not r.success),
        "pass_rate": sum(1 for r in results if r.success) / len(results),
        "avg_latency_ms": np.mean([r.latency_ms for r in results]),
        "avg_tokens": np.mean([r.tokens_used for r in results]),
        "total_tokens": sum(r.tokens_used for r in results)
    }
    # Per-paradigm breakdown; get_task_paradigm and get_criteria_for_paradigm
    # are lookup helpers supplied by the harness (one sketch follows below)
    for paradigm in Paradigm:
        paradigm_results = [r for r in results if get_task_paradigm(r.task_id) == paradigm]
        if paradigm_results:
            report["by_paradigm"][paradigm.value] = {
                "pass_rate": sum(1 for r in paradigm_results if r.success) / len(paradigm_results),
                "avg_scores": {
                    criterion: np.mean([r.scores.get(criterion, 0) for r in paradigm_results])
                    for criterion in get_criteria_for_paradigm(paradigm)
                }
            }
    return report
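generate_benchmark_report calls get_task_paradigm and get_criteria_for_paradigm, which are not defined above. A minimal sketch, building the task lookup from an assembled suite and mirroring the primary criteria from the quick-reference table below:

# Sketch of the two lookup helpers used in the report; TASK_INDEX would be
# built from whichever suite produced the results.
TASK_INDEX = {task.id: task for task in suite.tasks}

PARADIGM_CRITERIA = {
    Paradigm.LSR: ["creativity", "coherence", "relevance"],
    Paradigm.GS: ["factual_accuracy", "citation_coverage", "citation_accuracy"],
    Paradigm.EP: ["goal_achievement", "adaptation_quality", "efficiency"],
    Paradigm.VE: ["protocol_adherence", "audit_completeness", "consistency"],
}

def get_task_paradigm(task_id: str) -> Paradigm:
    return TASK_INDEX[task_id].paradigm

def get_criteria_for_paradigm(paradigm: Paradigm) -> List[str]:
    return PARADIGM_CRITERIA[paradigm]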
Quick Reference: Evaluation Criteria by Paradigm
| Paradigm | Primary Criteria | Secondary Criteria |
|---|---|---|
| LSR | Creativity, coherence, relevance | Tone match, engagement |
| GS | Factual accuracy, citation coverage, citation accuracy | Source quality, completeness |
| EP | Goal achievement, adaptation quality, efficiency | Learning rate, recovery rate |
| VE | Protocol adherence, audit completeness, consistency | Error handling, compliance |
Document maintained by CODITECT Quality Assurance Team