#!/usr/bin/env python3 """ Test script for MoE Judge Agents.
Tests all 3 judges with sample analyst votes to verify:
- ConsistencyJudge - agreement validation
- QualityJudge - vote quality and confidence
- DomainJudge - CODITECT domain compliance """
import sys from pathlib import Path
Add module path
sys.path.insert(0, str(Path(file).parent))
from core.models import Document, AnalystVote from judges import get_all_judges, ConsistencyJudge, QualityJudge, DomainJudge
def create_test_document(doc_type: str) -> Document:
    """Create a sample document for testing.

    Args:
        doc_type: One of 'agent', 'adr', or 'command'. Unknown values
            fall back to the 'agent' sample.

    Returns:
        A Document with a representative path and markdown content.
    """
    # NOTE(review): markdown heading markup and the frontmatter terminator in
    # these fixture strings were lost in a formatting pass; reconstructed as
    # standard markdown headings — confirm against the original fixtures.
    samples = {
        'agent': Document(
            path=Path('/test/agents/test-agent.md'),
            content="""---
title: Test Agent
type: agent
tags: [agent, ai]
---

# Test Agent

## Role

AI specialist agent for testing.

## Capabilities

- Task execution
- Code analysis
"""
        ),
        'adr': Document(
            path=Path('/test/adrs/ADR-001-test.md'),
            content="""---
title: ADR-001 Test Decision
type: adr
---

# ADR-001: Test Decision

## Status

Accepted

## Context

We need to make a test decision.

## Decision

We will use this approach.

## Consequences

This will have certain effects.
"""
        ),
        'command': Document(
            path=Path('/test/commands/test-command.md'),
            content="""---
title: Test Command
type: command
---

# /test-command

## Invocation

/test-command [options]

## Arguments

--help: Show help
"""
        ),
    }
    return samples.get(doc_type, samples['agent'])
def create_unanimous_votes(classification: str, high_confidence: bool = True) -> list:
    """Create votes where all analysts agree.

    Args:
        classification: The classification every analyst votes for.
        high_confidence: If True, base confidence is 0.90; otherwise 0.60.
            The metadata analyst always votes at 0.99 and the semantic
            analyst at base - 0.05.

    Returns:
        Five AnalystVote objects, one per analyst, all voting the same way.
    """
    conf = 0.90 if high_confidence else 0.60
    return [
        AnalystVote(agent='structural', classification=classification, confidence=conf,
                    reasoning=f'Path matches {classification} pattern', duration_ms=5),
        AnalystVote(agent='content', classification=classification, confidence=conf,
                    reasoning=f'Content structure matches {classification}', duration_ms=8),
        AnalystVote(agent='metadata', classification=classification, confidence=0.99,
                    reasoning=f'Explicit type={classification} in frontmatter', duration_ms=3),
        AnalystVote(agent='semantic', classification=classification, confidence=conf - 0.05,
                    reasoning=f'Semantic analysis suggests {classification}', duration_ms=12),
        AnalystVote(agent='pattern', classification=classification, confidence=conf,
                    reasoning=f'Matches CODITECT {classification} template', duration_ms=6),
    ]
def create_split_votes(primary: str, secondary: str) -> list:
    """Create votes with disagreement (3 votes for primary, 2 for secondary).

    Args:
        primary: Classification receiving the majority (structural,
            metadata, pattern analysts).
        secondary: Classification receiving the minority (content,
            semantic analysts).

    Returns:
        Five AnalystVote objects with a 60/40 split.
    """
    return [
        AnalystVote(agent='structural', classification=primary, confidence=0.85,
                    reasoning=f'Path matches {primary} pattern', duration_ms=5),
        AnalystVote(agent='content', classification=secondary, confidence=0.80,
                    reasoning=f'Content suggests {secondary}', duration_ms=8),
        AnalystVote(agent='metadata', classification=primary, confidence=0.75,
                    reasoning='Tags match primary type', duration_ms=3),
        AnalystVote(agent='semantic', classification=secondary, confidence=0.85,
                    reasoning=f'Intent suggests {secondary}', duration_ms=12),
        AnalystVote(agent='pattern', classification=primary, confidence=0.70,
                    reasoning='Partial template match', duration_ms=6),
    ]
def create_low_quality_votes(classification: str) -> list:
    """Create low-quality votes: low confidence (0.50) and vague reasoning.

    Only three analysts vote, which also exercises incomplete-panel handling.

    Args:
        classification: The classification each analyst votes for.

    Returns:
        Three AnalystVote objects with weak confidence and reasoning.
    """
    return [
        AnalystVote(agent='structural', classification=classification, confidence=0.50,
                    reasoning='default', duration_ms=5),
        AnalystVote(agent='content', classification=classification, confidence=0.50,
                    reasoning='not sure', duration_ms=8),
        AnalystVote(agent='metadata', classification=classification, confidence=0.50,
                    reasoning='maybe', duration_ms=3),
    ]
def test_consistency_judge():
    """Test ConsistencyJudge with various vote patterns."""
    print("\n" + "="*60)
    print("Testing ConsistencyJudge")
    print("="*60)

    judge = ConsistencyJudge()
    doc = create_test_document('agent')

    # Test 1: Unanimous agreement
    print("\n1. Unanimous agreement (agent):")
    votes = create_unanimous_votes('agent')
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Reason: {decision.reason}")
    # Format the ratio only when present: applying ':.0%' to the 'N/A'
    # fallback string would raise ValueError.
    agreement = decision.metadata.get('agreement_ratio')
    print(f" Agreement: {agreement:.0%}" if agreement is not None else " Agreement: N/A")
    assert decision.approved, "Should approve unanimous votes"

    # Test 2: Split votes (60% agreement)
    print("\n2. Split votes (60% agreement):")
    votes = create_split_votes('agent', 'command')
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Reason: {decision.reason}")
    agreement = decision.metadata.get('agreement_ratio')
    print(f" Agreement: {agreement:.0%}" if agreement is not None else " Agreement: N/A")

    # Test 3: Strong disagreement — three distinct classifications.
    print("\n3. Strong disagreement (bimodal):")
    votes = [
        AnalystVote(agent='structural', classification='agent', confidence=0.90,
                    reasoning='Agent pattern', duration_ms=5),
        AnalystVote(agent='content', classification='agent', confidence=0.85,
                    reasoning='Agent content', duration_ms=8),
        AnalystVote(agent='metadata', classification='command', confidence=0.90,
                    reasoning='Command type', duration_ms=3),
        AnalystVote(agent='semantic', classification='command', confidence=0.85,
                    reasoning='Command intent', duration_ms=12),
        AnalystVote(agent='pattern', classification='skill', confidence=0.70,
                    reasoning='Skill pattern', duration_ms=6),
    ]
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Reason: {decision.reason}")
    print(f" Issues: {decision.metadata.get('issues', [])}")

    print("\n ConsistencyJudge: PASSED")
def test_quality_judge():
    """Test QualityJudge with various vote quality levels."""
    print("\n" + "="*60)
    print("Testing QualityJudge")
    print("="*60)

    judge = QualityJudge()
    doc = create_test_document('agent')

    # Test 1: High-quality votes
    print("\n1. High-quality votes:")
    votes = create_unanimous_votes('agent', high_confidence=True)
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Reason: {decision.reason}")
    # Format the quality score only when present: applying ':.0%' to the
    # 'N/A' fallback string would raise ValueError.
    quality = decision.metadata.get('overall_quality')
    print(f" Overall quality: {quality:.0%}" if quality is not None else " Overall quality: N/A")
    assert decision.approved, "Should approve high-quality votes"

    # Test 2: Low-quality votes
    print("\n2. Low-quality votes:")
    votes = create_low_quality_votes('agent')
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Reason: {decision.reason}")
    print(f" Issues: {decision.metadata.get('issues', [])}")

    # Test 3: Mixed quality — strong votes alongside weak/empty reasoning.
    print("\n3. Mixed quality votes:")
    votes = [
        AnalystVote(agent='structural', classification='agent', confidence=0.95,
                    reasoning='Strong path match for agent directory structure', duration_ms=5),
        AnalystVote(agent='content', classification='agent', confidence=0.90,
                    reasoning='Contains Role and Capabilities sections', duration_ms=8),
        AnalystVote(agent='metadata', classification='agent', confidence=0.99,
                    reasoning='Explicit type=agent in YAML frontmatter', duration_ms=3),
        AnalystVote(agent='semantic', classification='agent', confidence=0.50,
                    reasoning='default', duration_ms=12),
        AnalystVote(agent='pattern', classification='agent', confidence=0.40,
                    reasoning='', duration_ms=6),
    ]
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Warnings: {len(decision.metadata.get('warnings', []))}")
    quality = decision.metadata.get('overall_quality')
    print(f" Overall quality: {quality:.0%}" if quality is not None else " Overall quality: N/A")

    print("\n QualityJudge: PASSED")
def test_domain_judge():
    """Test DomainJudge with various document types."""
    print("\n" + "="*60)
    print("Testing DomainJudge")
    print("="*60)

    judge = DomainJudge()

    # Test 1: Agent document with agent votes
    print("\n1. Agent document with correct classification:")
    doc = create_test_document('agent')
    votes = create_unanimous_votes('agent')
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Reason: {decision.reason}")
    print(f" Validations: {decision.metadata.get('validations', [])}")
    assert decision.approved, "Should approve matching agent classification"

    # Test 2: ADR document with ADR votes
    print("\n2. ADR document with correct classification:")
    doc = create_test_document('adr')
    votes = create_unanimous_votes('adr')
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Reason: {decision.reason}")
    print(f" Validations: {decision.metadata.get('validations', [])}")
    assert decision.approved, "Should approve matching ADR classification"

    # Test 3: Mismatched classification
    print("\n3. Agent document classified as command (mismatch):")
    doc = create_test_document('agent')
    votes = create_unanimous_votes('command')
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Reason: {decision.reason}")
    print(f" Warnings: {decision.metadata.get('warnings', [])}")

    # Test 4: Invalid document type
    print("\n4. Invalid document type:")
    doc = create_test_document('agent')
    votes = [
        AnalystVote(agent='structural', classification='invalid_type', confidence=0.80,
                    reasoning='Unknown pattern', duration_ms=5),
    ]
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Reason: {decision.reason}")
    assert not decision.approved, "Should reject invalid document type"

    print("\n DomainJudge: PASSED")
def test_all_judges_integration():
    """Test all judges together on a sample document."""
    print("\n" + "="*60)
    print("Integration Test: All Judges")
    print("="*60)

    judges = get_all_judges()
    doc = create_test_document('adr')
    votes = create_unanimous_votes('adr')

    print(f"\nDocument: {doc.path}")
    print(f"Votes: {len(votes)} analysts all vote 'adr'")

    print("\nJudge Decisions:")
    all_approved = True
    for judge in judges:
        decision = judge.evaluate(doc, votes)
        status = "APPROVED" if decision.approved else "REJECTED"
        print(f" {judge.name}: {status} ({decision.confidence:.0%})")
        print(f" {decision.reason}")
        if not decision.approved:
            all_approved = False

    print(f"\n Overall: {'ALL APPROVED' if all_approved else 'SOME REJECTED'}")
    print("\n Integration Test: PASSED")
def main():
    """Run all judge tests.

    Returns:
        0 if every test passes, 1 on assertion failure or unexpected error.
    """
    print("="*60)
    print("MoE Judge Agent Tests")
    print("="*60)

    try:
        test_consistency_judge()
        test_quality_judge()
        test_domain_judge()
        test_all_judges_integration()

        print("\n" + "="*60)
        print("ALL TESTS PASSED")
        print("="*60)
        print("\nJudges implemented:")
        print(" 1. ConsistencyJudge - Cross-analyst agreement validation")
        print(" 2. QualityJudge - Vote quality and confidence thresholds")
        print(" 3. DomainJudge - CODITECT domain conventions compliance")
    except AssertionError as e:
        print(f"\nTEST FAILED: {e}")
        return 1
    except Exception as e:
        # Unexpected failure: show the traceback so the bug is diagnosable,
        # but still return a clean nonzero exit code.
        print(f"\nERROR: {e}")
        import traceback
        traceback.print_exc()
        return 1

    return 0
if __name__ == '__main__':
    sys.exit(main())