#!/usr/bin/env python3
"""Test script for MoE Judge Agents.

Tests all 3 judges with sample analyst votes to verify:

1. ConsistencyJudge - agreement validation
2. QualityJudge - vote quality and confidence
3. DomainJudge - CODITECT domain compliance
"""

import sys
from pathlib import Path

# Add this script's directory to the module path so the local
# `core` and `judges` packages are importable when run directly.
sys.path.insert(0, str(Path(__file__).parent))

from core.models import Document, AnalystVote
from judges import get_all_judges, ConsistencyJudge, QualityJudge, DomainJudge

def create_test_document(doc_type: str) -> Document:
    """Create a sample document for testing.

    Args:
        doc_type: One of 'agent', 'adr', or 'command'. Unknown values
            fall back to the 'agent' sample.

    Returns:
        A Document with a representative path and markdown content
        (YAML frontmatter plus typical sections for that doc type).
    """
    # NOTE(review): fixture markdown below was reconstructed from a
    # formatting-mangled source; heading levels and the closing `---`
    # of each frontmatter block are best-effort — confirm against the
    # judges' parsing expectations.
    samples = {
        'agent': Document(
            path=Path('/test/agents/test-agent.md'),
            content="""---
title: Test Agent
type: agent
tags: [agent, ai]
---

# Test Agent

## Role

AI specialist agent for testing.

## Capabilities

- Task execution
- Code analysis
"""
        ),
        'adr': Document(
            path=Path('/test/adrs/ADR-001-test.md'),
            content="""---
title: ADR-001 Test Decision
type: adr
---

# ADR-001: Test Decision

## Status

Accepted

## Context

We need to make a test decision.

## Decision

We will use this approach.

## Consequences

This will have certain effects.
"""
        ),
        'command': Document(
            path=Path('/test/commands/test-command.md'),
            content="""---
title: Test Command
type: command
---

# /test-command

## Invocation

/test-command [options]

## Arguments

- --help: Show help
"""
        ),
    }
    # Unknown types default to the agent sample rather than raising.
    return samples.get(doc_type, samples['agent'])

def create_unanimous_votes(classification: str, high_confidence: bool = True) -> list:
    """Create votes where all analysts agree.

    Args:
        classification: The document type every analyst votes for.
        high_confidence: If True, base confidence is 0.90; else 0.60.

    Returns:
        A list of five AnalystVote objects (structural, content,
        metadata, semantic, pattern) all voting for `classification`.
        The metadata analyst is pinned at 0.99 (explicit frontmatter
        match) and the semantic analyst at base confidence - 0.05.
    """
    conf = 0.90 if high_confidence else 0.60
    return [
        AnalystVote(agent='structural', classification=classification, confidence=conf,
                    reasoning=f'Path matches {classification} pattern', duration_ms=5),
        AnalystVote(agent='content', classification=classification, confidence=conf,
                    reasoning=f'Content structure matches {classification}', duration_ms=8),
        AnalystVote(agent='metadata', classification=classification, confidence=0.99,
                    reasoning=f'Explicit type={classification} in frontmatter', duration_ms=3),
        AnalystVote(agent='semantic', classification=classification, confidence=conf - 0.05,
                    reasoning=f'Semantic analysis suggests {classification}', duration_ms=12),
        AnalystVote(agent='pattern', classification=classification, confidence=conf,
                    reasoning=f'Matches CODITECT {classification} template', duration_ms=6),
    ]

def create_split_votes(primary: str, secondary: str) -> list:
    """Create votes with disagreement.

    Args:
        primary: Classification voted by structural, metadata, and
            pattern analysts (3 of 5 votes — a 60% majority).
        secondary: Classification voted by content and semantic
            analysts (the 40% minority).

    Returns:
        A list of five AnalystVote objects split 3/2 between the
        primary and secondary classifications.
    """
    return [
        AnalystVote(agent='structural', classification=primary, confidence=0.85,
                    reasoning=f'Path matches {primary} pattern', duration_ms=5),
        AnalystVote(agent='content', classification=secondary, confidence=0.80,
                    reasoning=f'Content suggests {secondary}', duration_ms=8),
        AnalystVote(agent='metadata', classification=primary, confidence=0.75,
                    reasoning='Tags match primary type', duration_ms=3),
        AnalystVote(agent='semantic', classification=secondary, confidence=0.85,
                    reasoning=f'Intent suggests {secondary}', duration_ms=12),
        AnalystVote(agent='pattern', classification=primary, confidence=0.70,
                    reasoning='Partial template match', duration_ms=6),
    ]

def create_low_quality_votes(classification: str) -> list:
    """Create low-quality votes.

    All three votes agree on `classification` but carry minimal
    confidence (0.50) and uninformative reasoning strings — the
    pattern the QualityJudge is expected to flag.

    Args:
        classification: The document type each low-quality vote uses.

    Returns:
        A list of three weak AnalystVote objects (structural,
        content, metadata).
    """
    return [
        AnalystVote(agent='structural', classification=classification, confidence=0.50,
                    reasoning='default', duration_ms=5),
        AnalystVote(agent='content', classification=classification, confidence=0.50,
                    reasoning='not sure', duration_ms=8),
        AnalystVote(agent='metadata', classification=classification, confidence=0.50,
                    reasoning='maybe', duration_ms=3),
    ]

def test_consistency_judge():
    """Test ConsistencyJudge with various vote patterns.

    Exercises three scenarios: unanimous agreement (must be approved),
    a 60% split (outcome printed, not asserted), and a bimodal
    three-way disagreement (outcome printed, not asserted).

    Raises:
        AssertionError: If the unanimous case is not approved.
    """
    print("\n" + "="*60)
    print("Testing ConsistencyJudge")
    print("="*60)

    judge = ConsistencyJudge()
    doc = create_test_document('agent')

    # Test 1: Unanimous agreement — the judge must approve.
    print("\n1. Unanimous agreement (agent):")
    votes = create_unanimous_votes('agent')
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Reason: {decision.reason}")
    print(f" Agreement: {decision.metadata.get('agreement_ratio', 'N/A'):.0%}")
    assert decision.approved, "Should approve unanimous votes"

    # Test 2: Split votes (60% agreement) — informational only.
    print("\n2. Split votes (60% agreement):")
    votes = create_split_votes('agent', 'command')
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Reason: {decision.reason}")
    print(f" Agreement: {decision.metadata.get('agreement_ratio', 'N/A'):.0%}")

    # Test 3: Strong disagreement — 2/2/1 three-way split.
    print("\n3. Strong disagreement (bimodal):")
    votes = [
        AnalystVote(agent='structural', classification='agent', confidence=0.90,
                    reasoning='Agent pattern', duration_ms=5),
        AnalystVote(agent='content', classification='agent', confidence=0.85,
                    reasoning='Agent content', duration_ms=8),
        AnalystVote(agent='metadata', classification='command', confidence=0.90,
                    reasoning='Command type', duration_ms=3),
        AnalystVote(agent='semantic', classification='command', confidence=0.85,
                    reasoning='Command intent', duration_ms=12),
        AnalystVote(agent='pattern', classification='skill', confidence=0.70,
                    reasoning='Skill pattern', duration_ms=6),
    ]
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Reason: {decision.reason}")
    print(f" Issues: {decision.metadata.get('issues', [])}")

    print("\n ConsistencyJudge: PASSED")

def test_quality_judge():
    """Test QualityJudge with various vote quality levels.

    Exercises high-quality votes (must be approved), uniformly
    low-quality votes, and a mixed set where two analysts give weak
    or empty reasoning.

    Raises:
        AssertionError: If the high-quality case is not approved.
    """
    print("\n" + "="*60)
    print("Testing QualityJudge")
    print("="*60)

    judge = QualityJudge()
    doc = create_test_document('agent')

    # Test 1: High-quality votes — the judge must approve.
    print("\n1. High-quality votes:")
    votes = create_unanimous_votes('agent', high_confidence=True)
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Reason: {decision.reason}")
    print(f" Overall quality: {decision.metadata.get('overall_quality', 'N/A'):.0%}")
    assert decision.approved, "Should approve high-quality votes"

    # Test 2: Low-quality votes — informational only.
    print("\n2. Low-quality votes:")
    votes = create_low_quality_votes('agent')
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Reason: {decision.reason}")
    print(f" Issues: {decision.metadata.get('issues', [])}")

    # Test 3: Mixed quality — three strong votes, one generic, one empty.
    print("\n3. Mixed quality votes:")
    votes = [
        AnalystVote(agent='structural', classification='agent', confidence=0.95,
                    reasoning='Strong path match for agent directory structure', duration_ms=5),
        AnalystVote(agent='content', classification='agent', confidence=0.90,
                    reasoning='Contains Role and Capabilities sections', duration_ms=8),
        AnalystVote(agent='metadata', classification='agent', confidence=0.99,
                    reasoning='Explicit type=agent in YAML frontmatter', duration_ms=3),
        AnalystVote(agent='semantic', classification='agent', confidence=0.50,
                    reasoning='default', duration_ms=12),
        AnalystVote(agent='pattern', classification='agent', confidence=0.40,
                    reasoning='', duration_ms=6),
    ]
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Warnings: {len(decision.metadata.get('warnings', []))}")
    print(f" Overall quality: {decision.metadata.get('overall_quality', 'N/A'):.0%}")

    print("\n QualityJudge: PASSED")

def test_domain_judge():
    """Test DomainJudge with various document types.

    Exercises matching agent and ADR classifications (must be
    approved), a mismatched classification (informational), and an
    unknown document type (must be rejected).

    Raises:
        AssertionError: If a matching classification is rejected or
            an invalid type is approved.
    """
    print("\n" + "="*60)
    print("Testing DomainJudge")
    print("="*60)

    judge = DomainJudge()

    # Test 1: Agent document with agent votes — must approve.
    print("\n1. Agent document with correct classification:")
    doc = create_test_document('agent')
    votes = create_unanimous_votes('agent')
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Reason: {decision.reason}")
    print(f" Validations: {decision.metadata.get('validations', [])}")
    assert decision.approved, "Should approve matching agent classification"

    # Test 2: ADR document with ADR votes — must approve.
    print("\n2. ADR document with correct classification:")
    doc = create_test_document('adr')
    votes = create_unanimous_votes('adr')
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Reason: {decision.reason}")
    print(f" Validations: {decision.metadata.get('validations', [])}")
    assert decision.approved, "Should approve matching ADR classification"

    # Test 3: Mismatched classification — informational only.
    print("\n3. Agent document classified as command (mismatch):")
    doc = create_test_document('agent')
    votes = create_unanimous_votes('command')
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Reason: {decision.reason}")
    print(f" Warnings: {decision.metadata.get('warnings', [])}")

    # Test 4: Invalid document type — must reject.
    print("\n4. Invalid document type:")
    doc = create_test_document('agent')
    votes = [
        AnalystVote(agent='structural', classification='invalid_type', confidence=0.80,
                    reasoning='Unknown pattern', duration_ms=5),
    ]
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Reason: {decision.reason}")
    assert not decision.approved, "Should reject invalid document type"

    print("\n DomainJudge: PASSED")

def test_all_judges_integration():
    """Test all judges together on a sample document.

    Runs every judge from get_all_judges() against an ADR document
    with unanimous 'adr' votes and prints each decision. The overall
    outcome is reported but not asserted.
    """
    print("\n" + "="*60)
    print("Integration Test: All Judges")
    print("="*60)

    judges = get_all_judges()
    doc = create_test_document('adr')
    votes = create_unanimous_votes('adr')

    print(f"\nDocument: {doc.path}")
    print(f"Votes: {len(votes)} analysts all vote 'adr'")
    print("\nJudge Decisions:")

    all_approved = True
    for judge in judges:
        decision = judge.evaluate(doc, votes)
        status = "APPROVED" if decision.approved else "REJECTED"
        print(f" {judge.name}: {status} ({decision.confidence:.0%})")
        print(f" {decision.reason}")
        if not decision.approved:
            all_approved = False

    print(f"\n Overall: {'ALL APPROVED' if all_approved else 'SOME REJECTED'}")
    print("\n Integration Test: PASSED")

def main():
    """Run all judge tests.

    Returns:
        0 if every test passes; 1 on an assertion failure or any
        unexpected exception (with traceback printed).
    """
    print("="*60)
    print("MoE Judge Agent Tests")
    print("="*60)

    try:
        test_consistency_judge()
        test_quality_judge()
        test_domain_judge()
        test_all_judges_integration()

        print("\n" + "="*60)
        print("ALL TESTS PASSED")
        print("="*60)
        print("\nJudges implemented:")
        print(" 1. ConsistencyJudge - Cross-analyst agreement validation")
        print(" 2. QualityJudge - Vote quality and confidence thresholds")
        print(" 3. DomainJudge - CODITECT domain conventions compliance")

    except AssertionError as e:
        print(f"\nTEST FAILED: {e}")
        return 1
    except Exception as e:
        # Catch-all is acceptable at this top-level test-runner
        # boundary: report, dump the traceback, and exit nonzero.
        print(f"\nERROR: {e}")
        import traceback
        traceback.print_exc()
        return 1

    return 0

# Entry-point guard: the extraction stripped the dunder underscores
# from __name__/__main__; restored here so the script actually runs.
if __name__ == '__main__':
    sys.exit(main())