#!/usr/bin/env python3
"""Test script for MoE Judge Agents.

Tests all 3 judges with sample analyst votes to verify:

1. ConsistencyJudge - agreement validation
2. QualityJudge - vote quality and confidence
3. DomainJudge - CODITECT domain compliance
"""

import sys
from pathlib import Path

# Add this script's directory to the module path so the local
# `core` and `judges` packages are importable when run directly.
sys.path.insert(0, str(Path(__file__).parent))

from core.models import Document, AnalystVote
from judges import get_all_judges, ConsistencyJudge, QualityJudge, DomainJudge

def create_test_document(doc_type: str) -> Document:
    """Create a sample document for testing.

    Args:
        doc_type: One of 'agent', 'adr', or 'command'. Unknown values
            fall back to the 'agent' sample.

    Returns:
        A Document with a representative path and markdown content
        (YAML frontmatter plus typical sections for that doc type).
    """
    # NOTE(review): fixture markdown below was reconstructed from a
    # formatting-mangled source; heading levels and the closing `---`
    # of each frontmatter block are best-effort — confirm against the
    # judges' parsing expectations.
    samples = {
        'agent': Document(
            path=Path('/test/agents/test-agent.md'),
            content="""---
title: Test Agent
type: agent
tags: [agent, ai]
---

# Test Agent

## Role

AI specialist agent for testing.

## Capabilities

- Task execution
- Code analysis
"""
        ),
        'adr': Document(
            path=Path('/test/adrs/ADR-001-test.md'),
            content="""---
title: ADR-001 Test Decision
type: adr
---

# ADR-001: Test Decision

## Status

Accepted

## Context

We need to make a test decision.

## Decision

We will use this approach.

## Consequences

This will have certain effects.
"""
        ),
        'command': Document(
            path=Path('/test/commands/test-command.md'),
            content="""---
title: Test Command
type: command
---

# /test-command

## Invocation

/test-command [options]

## Arguments

- --help: Show help
"""
        ),
    }
    # Unknown types default to the agent sample rather than raising.
    return samples.get(doc_type, samples['agent'])

def create_unanimous_votes(classification: str, high_confidence: bool = True) -> list:
    """Create votes where all analysts agree.

    Args:
        classification: The document type every analyst votes for.
        high_confidence: If True, base confidence is 0.90; else 0.60.

    Returns:
        A list of five AnalystVote objects (structural, content,
        metadata, semantic, pattern) all voting for `classification`.
        The metadata analyst is pinned at 0.99 (explicit frontmatter
        match) and the semantic analyst at base confidence - 0.05.
    """
    conf = 0.90 if high_confidence else 0.60
    return [
        AnalystVote(agent='structural', classification=classification, confidence=conf,
                    reasoning=f'Path matches {classification} pattern', duration_ms=5),
        AnalystVote(agent='content', classification=classification, confidence=conf,
                    reasoning=f'Content structure matches {classification}', duration_ms=8),
        AnalystVote(agent='metadata', classification=classification, confidence=0.99,
                    reasoning=f'Explicit type={classification} in frontmatter', duration_ms=3),
        AnalystVote(agent='semantic', classification=classification, confidence=conf - 0.05,
                    reasoning=f'Semantic analysis suggests {classification}', duration_ms=12),
        AnalystVote(agent='pattern', classification=classification, confidence=conf,
                    reasoning=f'Matches CODITECT {classification} template', duration_ms=6),
    ]

def create_split_votes(primary: str, secondary: str) -> list:
    """Create votes with disagreement.

    Args:
        primary: Classification voted by structural, metadata, and
            pattern analysts (3 of 5 votes — a 60% majority).
        secondary: Classification voted by content and semantic
            analysts (the 40% minority).

    Returns:
        A list of five AnalystVote objects split 3/2 between the
        primary and secondary classifications.
    """
    return [
        AnalystVote(agent='structural', classification=primary, confidence=0.85,
                    reasoning=f'Path matches {primary} pattern', duration_ms=5),
        AnalystVote(agent='content', classification=secondary, confidence=0.80,
                    reasoning=f'Content suggests {secondary}', duration_ms=8),
        AnalystVote(agent='metadata', classification=primary, confidence=0.75,
                    reasoning='Tags match primary type', duration_ms=3),
        AnalystVote(agent='semantic', classification=secondary, confidence=0.85,
                    reasoning=f'Intent suggests {secondary}', duration_ms=12),
        AnalystVote(agent='pattern', classification=primary, confidence=0.70,
                    reasoning='Partial template match', duration_ms=6),
    ]

def create_low_quality_votes(classification: str) -> list:
    """Create low-quality votes.

    All three votes agree on `classification` but carry minimal
    confidence (0.50) and uninformative reasoning strings — the
    pattern the QualityJudge is expected to flag.

    Args:
        classification: The document type each low-quality vote uses.

    Returns:
        A list of three weak AnalystVote objects (structural,
        content, metadata).
    """
    return [
        AnalystVote(agent='structural', classification=classification, confidence=0.50,
                    reasoning='default', duration_ms=5),
        AnalystVote(agent='content', classification=classification, confidence=0.50,
                    reasoning='not sure', duration_ms=8),
        AnalystVote(agent='metadata', classification=classification, confidence=0.50,
                    reasoning='maybe', duration_ms=3),
    ]

def test_consistency_judge():
    """Test ConsistencyJudge with various vote patterns.

    Exercises three scenarios: unanimous agreement (must be approved),
    a 60% split (outcome printed, not asserted), and a bimodal
    three-way disagreement (outcome printed, not asserted).

    Raises:
        AssertionError: If the unanimous case is not approved.
    """
    print("\n" + "="*60)
    print("Testing ConsistencyJudge")
    print("="*60)

    judge = ConsistencyJudge()
    doc = create_test_document('agent')

    # Test 1: Unanimous agreement — the judge must approve.
    print("\n1. Unanimous agreement (agent):")
    votes = create_unanimous_votes('agent')
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Reason: {decision.reason}")
    print(f" Agreement: {decision.metadata.get('agreement_ratio', 'N/A'):.0%}")
    assert decision.approved, "Should approve unanimous votes"

    # Test 2: Split votes (60% agreement) — informational only.
    print("\n2. Split votes (60% agreement):")
    votes = create_split_votes('agent', 'command')
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Reason: {decision.reason}")
    print(f" Agreement: {decision.metadata.get('agreement_ratio', 'N/A'):.0%}")

    # Test 3: Strong disagreement — 2/2/1 three-way split.
    print("\n3. Strong disagreement (bimodal):")
    votes = [
        AnalystVote(agent='structural', classification='agent', confidence=0.90,
                    reasoning='Agent pattern', duration_ms=5),
        AnalystVote(agent='content', classification='agent', confidence=0.85,
                    reasoning='Agent content', duration_ms=8),
        AnalystVote(agent='metadata', classification='command', confidence=0.90,
                    reasoning='Command type', duration_ms=3),
        AnalystVote(agent='semantic', classification='command', confidence=0.85,
                    reasoning='Command intent', duration_ms=12),
        AnalystVote(agent='pattern', classification='skill', confidence=0.70,
                    reasoning='Skill pattern', duration_ms=6),
    ]
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Reason: {decision.reason}")
    print(f" Issues: {decision.metadata.get('issues', [])}")

    print("\n ConsistencyJudge: PASSED")

def test_quality_judge():
    """Test QualityJudge with various vote quality levels.

    Exercises high-quality votes (must be approved), uniformly
    low-quality votes, and a mixed set where two analysts give weak
    or empty reasoning.

    Raises:
        AssertionError: If the high-quality case is not approved.
    """
    print("\n" + "="*60)
    print("Testing QualityJudge")
    print("="*60)

    judge = QualityJudge()
    doc = create_test_document('agent')

    # Test 1: High-quality votes — the judge must approve.
    print("\n1. High-quality votes:")
    votes = create_unanimous_votes('agent', high_confidence=True)
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Reason: {decision.reason}")
    print(f" Overall quality: {decision.metadata.get('overall_quality', 'N/A'):.0%}")
    assert decision.approved, "Should approve high-quality votes"

    # Test 2: Low-quality votes — informational only.
    print("\n2. Low-quality votes:")
    votes = create_low_quality_votes('agent')
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Reason: {decision.reason}")
    print(f" Issues: {decision.metadata.get('issues', [])}")

    # Test 3: Mixed quality — three strong votes, one generic, one empty.
    print("\n3. Mixed quality votes:")
    votes = [
        AnalystVote(agent='structural', classification='agent', confidence=0.95,
                    reasoning='Strong path match for agent directory structure', duration_ms=5),
        AnalystVote(agent='content', classification='agent', confidence=0.90,
                    reasoning='Contains Role and Capabilities sections', duration_ms=8),
        AnalystVote(agent='metadata', classification='agent', confidence=0.99,
                    reasoning='Explicit type=agent in YAML frontmatter', duration_ms=3),
        AnalystVote(agent='semantic', classification='agent', confidence=0.50,
                    reasoning='default', duration_ms=12),
        AnalystVote(agent='pattern', classification='agent', confidence=0.40,
                    reasoning='', duration_ms=6),
    ]
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Warnings: {len(decision.metadata.get('warnings', []))}")
    print(f" Overall quality: {decision.metadata.get('overall_quality', 'N/A'):.0%}")

    print("\n QualityJudge: PASSED")

def test_domain_judge():
    """Test DomainJudge with various document types.

    Exercises matching agent and ADR classifications (must be
    approved), a mismatched classification (informational), and an
    unknown document type (must be rejected).

    Raises:
        AssertionError: If a matching classification is rejected or
            an invalid type is approved.
    """
    print("\n" + "="*60)
    print("Testing DomainJudge")
    print("="*60)

    judge = DomainJudge()

    # Test 1: Agent document with agent votes — must approve.
    print("\n1. Agent document with correct classification:")
    doc = create_test_document('agent')
    votes = create_unanimous_votes('agent')
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Reason: {decision.reason}")
    print(f" Validations: {decision.metadata.get('validations', [])}")
    assert decision.approved, "Should approve matching agent classification"

    # Test 2: ADR document with ADR votes — must approve.
    print("\n2. ADR document with correct classification:")
    doc = create_test_document('adr')
    votes = create_unanimous_votes('adr')
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Reason: {decision.reason}")
    print(f" Validations: {decision.metadata.get('validations', [])}")
    assert decision.approved, "Should approve matching ADR classification"

    # Test 3: Mismatched classification — informational only.
    print("\n3. Agent document classified as command (mismatch):")
    doc = create_test_document('agent')
    votes = create_unanimous_votes('command')
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Reason: {decision.reason}")
    print(f" Warnings: {decision.metadata.get('warnings', [])}")

    # Test 4: Invalid document type — must reject.
    print("\n4. Invalid document type:")
    doc = create_test_document('agent')
    votes = [
        AnalystVote(agent='structural', classification='invalid_type', confidence=0.80,
                    reasoning='Unknown pattern', duration_ms=5),
    ]
    decision = judge.evaluate(doc, votes)
    print(f" Approved: {decision.approved}")
    print(f" Reason: {decision.reason}")
    assert not decision.approved, "Should reject invalid document type"

    print("\n DomainJudge: PASSED")

def test_all_judges_integration():
    """Test all judges together on a sample document.

    Runs every judge from get_all_judges() against an ADR document
    with unanimous 'adr' votes and prints each decision. The overall
    outcome is reported but not asserted.
    """
    print("\n" + "="*60)
    print("Integration Test: All Judges")
    print("="*60)

    judges = get_all_judges()
    doc = create_test_document('adr')
    votes = create_unanimous_votes('adr')

    print(f"\nDocument: {doc.path}")
    print(f"Votes: {len(votes)} analysts all vote 'adr'")
    print("\nJudge Decisions:")

    all_approved = True
    for judge in judges:
        decision = judge.evaluate(doc, votes)
        status = "APPROVED" if decision.approved else "REJECTED"
        print(f" {judge.name}: {status} ({decision.confidence:.0%})")
        print(f" {decision.reason}")
        if not decision.approved:
            all_approved = False

    print(f"\n Overall: {'ALL APPROVED' if all_approved else 'SOME REJECTED'}")
    print("\n Integration Test: PASSED")

def main():
    """Run all judge tests.

    Returns:
        0 if every test passes; 1 on an assertion failure or any
        unexpected exception (with traceback printed).
    """
    print("="*60)
    print("MoE Judge Agent Tests")
    print("="*60)

    try:
        test_consistency_judge()
        test_quality_judge()
        test_domain_judge()
        test_all_judges_integration()

        print("\n" + "="*60)
        print("ALL TESTS PASSED")
        print("="*60)
        print("\nJudges implemented:")
        print(" 1. ConsistencyJudge - Cross-analyst agreement validation")
        print(" 2. QualityJudge - Vote quality and confidence thresholds")
        print(" 3. DomainJudge - CODITECT domain conventions compliance")

    except AssertionError as e:
        print(f"\nTEST FAILED: {e}")
        return 1
    except Exception as e:
        # Catch-all is acceptable at this top-level test-runner
        # boundary: report, dump the traceback, and exit nonzero.
        print(f"\nERROR: {e}")
        import traceback
        traceback.print_exc()
        return 1

    return 0

# Entry-point guard: the extraction stripped the dunder underscores
# from __name__/__main__; restored here so the script actually runs.
if __name__ == '__main__':
    sys.exit(main())