
# scripts-test-llm-judge

""" Tests for LLM Judge (H.3.5.3).

Tests cover:

  • LLMJudge initialization and configuration
  • Prompt building
  • Response parsing
  • Provenance tracking
  • LLMJudgePanel multi-model coordination
  • Model diversity verification
"""
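
# All model calls in this suite are mocked (MagicMock / AsyncMock), so the
# tests need no API keys or network access. Run with `python -m unittest`
# from the project root (exact module path assumed).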

import asyncio
import json
import unittest
from datetime import datetime, timezone
from pathlib import Path
from unittest.mock import MagicMock, patch, AsyncMock

from core.models import Document, AnalystVote, JudgeDecision
from core.multi_model_client import (
    MultiModelClient,
    CompletionResponse,
    ModelProvider,
)
from core.persona_loader import (
    JudgePersona,
    EvaluationDimension,
    ModelRouting,
    Demographics,
    EvaluationStyle,
)
from judges.llm_judge import LLMJudge, LLMJudgePanel, create_llm_judge_panel

def create_mock_persona(
    persona_id: str = "test_judge",
    model: str = "claude-sonnet-4",
) -> JudgePersona:
    """Create a mock persona for testing."""
    # Map the model name to a family; deepseek gets its own family so the
    # diversity tests below see three distinct families.
    if "claude" in model:
        family = "anthropic"
    elif "deepseek" in model:
        family = "deepseek"
    else:
        family = "openai"
    return JudgePersona(
        persona_id=persona_id,
        version="1.0",
        enabled=True,
        demographics=Demographics(
            name="Test Judge",
            title="Test Evaluator",
            experience_years=10,
            credentials=["Testing Expert"],
            background="Expert in testing evaluations",
        ),
        expertise={"testing": ["unit", "integration"]},
        evaluation_style=EvaluationStyle(
            strictness="MEDIUM",
            focus="Accuracy",
        ),
        model_routing=ModelRouting(
            primary_model=model,
            backup_model="gpt-4o",
            model_family=family,
        ),
        weight=1.0,
        trigger_conditions=["always"],
        evaluation_dimensions=[
            EvaluationDimension(
                id="accuracy",
                name="accuracy",
                weight=1.0,
                scale=[1, 2, 3],
                score_descriptions={1: "Poor", 2: "Good", 3: "Excellent"},
            )
        ],
        output_schema={"verdict": "string", "reason": "string"},
    )
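
# Example: create_mock_persona("judge_x", "deepseek-v3") returns a persona
# routed to deepseek-v3 in the "deepseek" family; the diversity tests below
# lean on this model-to-family mapping ("judge_x" is an illustrative id).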

class TestLLMJudgeInit(unittest.TestCase):
    """Tests for LLMJudge initialization."""

    def test_init_with_persona(self):
        """Test initialization with persona."""
        persona = create_mock_persona()
        judge = LLMJudge(persona)

        self.assertEqual(judge.name, "test_judge")
        self.assertIn("Test Evaluator", judge.description)
        self.assertEqual(judge.model, "claude-sonnet-4")
        self.assertEqual(judge.weight, 1.0)

    def test_init_with_custom_client(self):
        """Test initialization with custom client."""
        persona = create_mock_persona()
        client = MultiModelClient()
        judge = LLMJudge(persona, model_client=client)

        self.assertIs(judge.client, client)

    def test_model_from_routing(self):
        """Test model comes from persona routing."""
        persona = create_mock_persona(model="gpt-4o")
        judge = LLMJudge(persona)

        self.assertEqual(judge.model, "gpt-4o")

class TestLLMJudgePromptBuilding(unittest.TestCase):
    """Tests for prompt building."""

    def setUp(self):
        persona = create_mock_persona()
        self.judge = LLMJudge(persona)

    def test_build_system_prompt(self):
        """Test system prompt contains persona info."""
        prompt = self.judge._build_system_prompt()

        self.assertIn("test_judge", prompt)
        self.assertIn("accuracy", prompt)
        self.assertIn("JSON", prompt)

    def test_build_prompt_with_document(self):
        """Test prompt includes document info."""
        doc = Document(
            path=Path("/test/doc.md"),
            content="# Test Document\n\nContent here.",
        )
        votes = [
            AnalystVote(
                agent="analyst_1",
                classification="skill",
                confidence=0.9,
                reasoning="Looks like a skill",
            )
        ]

        prompt = self.judge._build_prompt(doc, votes)

        self.assertIn("doc.md", prompt)
        self.assertIn("Test Document", prompt)
        self.assertIn("analyst_1", prompt)
        self.assertIn("skill", prompt)

    def test_format_votes(self):
        """Test vote formatting."""
        votes = [
            AnalystVote(
                agent="agent_1",
                classification="skill",
                confidence=0.95,
                reasoning="Clear skill structure",
            ),
            AnalystVote(
                agent="agent_2",
                classification="skill",
                confidence=0.85,
                reasoning="Has skill markers",
            ),
        ]

        formatted = self.judge._format_votes(votes)

        self.assertIn("agent_1", formatted)
        self.assertIn("agent_2", formatted)
        self.assertIn("skill", formatted)
        self.assertIn("95.00%", formatted)

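# The parser below is exercised on three response shapes: bare JSON, JSON
# inside a markdown code fence, and free text (which falls back to keyword
# matching on "approve"/"reject" and records a parse_error in metadata).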
class TestLLMJudgeResponseParsing(unittest.TestCase):
    """Tests for response parsing."""

    def setUp(self):
        persona = create_mock_persona()
        self.judge = LLMJudge(persona)

    def test_parse_json_response(self):
        """Test parsing valid JSON response."""
        response = CompletionResponse(
            content=json.dumps({
                "approved": True,
                "confidence": 0.95,
                "reason": "Classification is correct",
                "dimension_scores": {"accuracy": 3},
                "key_concerns": [],
            }),
            model_used="claude-sonnet-4",
            provider="anthropic",
            token_usage=100,
            input_tokens=50,
            output_tokens=50,
            latency_ms=500,
            timestamp=datetime.now(timezone.utc),
            success=True,
        )

        decision = self.judge._parse_response(response, [])

        self.assertTrue(decision.approved)
        self.assertEqual(decision.confidence, 0.95)
        self.assertEqual(decision.reason, "Classification is correct")
        self.assertEqual(decision.dimension_scores, {"accuracy": 3})

    def test_parse_json_in_markdown(self):
        """Test parsing JSON wrapped in markdown code blocks."""
        response = CompletionResponse(
            content="""Here's my evaluation:

```json
{
    "approved": false,
    "confidence": 0.7,
    "reason": "Unclear classification"
}
```
""",
            model_used="gpt-4o",
            provider="openai",
            token_usage=150,
            input_tokens=75,
            output_tokens=75,
            latency_ms=600,
            timestamp=datetime.now(timezone.utc),
            success=True,
        )

        decision = self.judge._parse_response(response, [])

        self.assertFalse(decision.approved)
        self.assertEqual(decision.confidence, 0.7)

    def test_parse_fallback_for_invalid_json(self):
        """Test fallback parsing for invalid JSON."""
        response = CompletionResponse(
            content="I APPROVE this classification because it's correct.",
            model_used="claude-sonnet-4",
            provider="anthropic",
            token_usage=50,
            input_tokens=25,
            output_tokens=25,
            latency_ms=300,
            timestamp=datetime.now(timezone.utc),
            success=True,
        )

        decision = self.judge._parse_response(response, [])

        self.assertTrue(decision.approved)  # Contains "approve", not "reject"
        self.assertIn("parse_error", decision.metadata)

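# Note: unittest.TestCase methods cannot themselves be coroutines, so the
# async tests below wrap their bodies in a local run_test() coroutine and
# drive it with asyncio.run() (IsolatedAsyncioTestCase is an alternative).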
class TestLLMJudgeEvaluation(unittest.TestCase):
    """Tests for LLM judge evaluation."""

    def test_successful_evaluation(self):
        """Test successful LLM evaluation with provenance."""
        async def run_test():
            persona = create_mock_persona()

            mock_response = CompletionResponse(
                content=json.dumps({
                    "approved": True,
                    "confidence": 0.9,
                    "reason": "Good classification",
                    "dimension_scores": {"accuracy": 3},
                }),
                model_used="claude-sonnet-4",
                provider="anthropic",
                token_usage=150,
                input_tokens=100,
                output_tokens=50,
                latency_ms=800,
                timestamp=datetime.now(timezone.utc),
                success=True,
            )

            mock_client = MagicMock(spec=MultiModelClient)
            mock_client.get_completion = AsyncMock(return_value=mock_response)

            judge = LLMJudge(persona, model_client=mock_client)

            doc = Document(
                path=Path("/test/doc.md"),
                content="Test content",
            )
            votes = [
                AnalystVote(
                    agent="analyst_1",
                    classification="skill",
                    confidence=0.9,
                    reasoning="It's a skill",
                )
            ]

            decision = await judge.evaluate_async(doc, votes)

            # Verify decision
            self.assertTrue(decision.approved)
            self.assertEqual(decision.confidence, 0.9)
            self.assertEqual(decision.judge, "test_judge")

            # Verify provenance (H.3.4)
            self.assertEqual(decision.model_used, "claude-sonnet-4")
            self.assertEqual(decision.token_usage, 150)
            self.assertIsNotNone(decision.timestamp)
            self.assertIsNotNone(decision.evaluation_start_time)
            self.assertIsNotNone(decision.evaluation_end_time)

        asyncio.run(run_test())

    def test_failed_evaluation(self):
        """Test failed LLM evaluation creates rejection."""
        async def run_test():
            persona = create_mock_persona()

            mock_response = CompletionResponse(
                content="",
                model_used="claude-sonnet-4",
                provider="anthropic",
                token_usage=0,
                input_tokens=0,
                output_tokens=0,
                latency_ms=100,
                timestamp=datetime.now(timezone.utc),
                success=False,
                error="Rate limit exceeded",
            )

            mock_client = MagicMock(spec=MultiModelClient)
            mock_client.get_completion = AsyncMock(return_value=mock_response)

            judge = LLMJudge(persona, model_client=mock_client)

            doc = Document(path=Path("/test/doc.md"), content="Test")
            votes = []

            decision = await judge.evaluate_async(doc, votes)

            self.assertFalse(decision.approved)
            self.assertIn("Rate limit exceeded", decision.reason)

        asyncio.run(run_test())

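# The diversity assertions below encode the panel policy under test: at
# least three distinct model families, and no single model holding more
# than 40% of the total judge weight (thresholds read off the assertions).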
class TestLLMJudgePanel(unittest.TestCase):
    """Tests for LLMJudgePanel."""

    def test_get_judge_info(self):
        """Test getting panel judge info."""
        # Create panel with mock judges
        panel = LLMJudgePanel()
        panel.judges = [
            LLMJudge(create_mock_persona("judge_1", "claude-sonnet-4")),
            LLMJudge(create_mock_persona("judge_2", "gpt-4o")),
            LLMJudge(create_mock_persona("judge_3", "deepseek-v3")),
        ]

        info = panel.get_judge_info()

        self.assertEqual(len(info), 3)
        self.assertEqual(info[0]["persona_id"], "judge_1")
        self.assertEqual(info[0]["model"], "claude-sonnet-4")

    def test_model_diversity_compliant(self):
        """Test diversity verification with compliant panel."""
        panel = LLMJudgePanel()
        panel.judges = [
            LLMJudge(create_mock_persona("judge_1", "claude-sonnet-4")),
            LLMJudge(create_mock_persona("judge_2", "gpt-4o")),
            LLMJudge(create_mock_persona("judge_3", "deepseek-v3")),
        ]

        diversity = panel.verify_model_diversity()

        self.assertEqual(diversity["unique_families"], 3)
        self.assertTrue(diversity["meets_family_requirement"])
        self.assertTrue(diversity["meets_weight_requirement"])
        self.assertTrue(diversity["is_compliant"])

    def test_model_diversity_non_compliant_families(self):
        """Test diversity verification with non-compliant panel (too few families)."""
        panel = LLMJudgePanel()
        panel.judges = [
            LLMJudge(create_mock_persona("judge_1", "claude-sonnet-4")),
            LLMJudge(create_mock_persona("judge_2", "claude-opus-4.5")),
            LLMJudge(create_mock_persona("judge_3", "claude-haiku-4.5")),
        ]

        diversity = panel.verify_model_diversity()

        self.assertEqual(diversity["unique_families"], 1)  # All Anthropic
        self.assertFalse(diversity["meets_family_requirement"])
        self.assertFalse(diversity["is_compliant"])

    def test_model_diversity_non_compliant_weight(self):
        """Test diversity verification with non-compliant panel (too much weight on one model)."""
        panel = LLMJudgePanel()

        # Create judges with unequal weights
        j1 = LLMJudge(create_mock_persona("judge_1", "claude-sonnet-4"))
        j1.weight = 3.0
        j2 = LLMJudge(create_mock_persona("judge_2", "gpt-4o"))
        j2.weight = 1.0
        j3 = LLMJudge(create_mock_persona("judge_3", "deepseek-v3"))
        j3.weight = 1.0

        panel.judges = [j1, j2, j3]

        diversity = panel.verify_model_diversity()

        # claude-sonnet-4 holds 3/5 = 60% of total weight, exceeding the 40% limit
        self.assertGreater(diversity["max_single_model_weight"], 0.40)
        self.assertFalse(diversity["meets_weight_requirement"])
        self.assertFalse(diversity["is_compliant"])

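# The shared mock below cycles through the canned responses, one per call.
# The test asserts only that both judges were called and both decisions
# approved, so the order in which the parallel calls land doesn't matter.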
class TestLLMJudgePanelEvaluation(unittest.TestCase):
    """Tests for panel evaluation."""

    def test_evaluate_all_parallel(self):
        """Test parallel evaluation of all judges."""
        async def run_test():
            # Create mock responses
            responses = [
                CompletionResponse(
                    content=json.dumps({"approved": True, "confidence": 0.9, "reason": "Good"}),
                    model_used="claude-sonnet-4",
                    provider="anthropic",
                    token_usage=100,
                    input_tokens=50,
                    output_tokens=50,
                    latency_ms=500,
                    timestamp=datetime.now(timezone.utc),
                    success=True,
                ),
                CompletionResponse(
                    content=json.dumps({"approved": True, "confidence": 0.85, "reason": "OK"}),
                    model_used="gpt-4o",
                    provider="openai",
                    token_usage=120,
                    input_tokens=60,
                    output_tokens=60,
                    latency_ms=600,
                    timestamp=datetime.now(timezone.utc),
                    success=True,
                ),
            ]

            call_count = [0]

            async def mock_get_completion(*args, **kwargs):
                idx = call_count[0]
                call_count[0] += 1
                return responses[idx % len(responses)]

            mock_client = MagicMock(spec=MultiModelClient)
            mock_client.get_completion = mock_get_completion

            panel = LLMJudgePanel(client=mock_client)

            # Add judges manually (bypass loader)
            panel.judges = [
                LLMJudge(create_mock_persona("j1", "claude-sonnet-4"), mock_client),
                LLMJudge(create_mock_persona("j2", "gpt-4o"), mock_client),
            ]

            doc = Document(path=Path("/test/doc.md"), content="Test")
            votes = []

            decisions = await panel.evaluate_all(doc, votes)

            self.assertEqual(len(decisions), 2)
            self.assertTrue(all(d.approved for d in decisions))
            self.assertEqual(call_count[0], 2)

        asyncio.run(run_test())

class TestCreateLLMJudgePanel(unittest.TestCase):
    """Tests for create_llm_judge_panel convenience function."""

    def test_create_panel(self):
        """Test creating panel via convenience function."""
        # This will fail gracefully if no personas are configured
        try:
            panel = create_llm_judge_panel()
            self.assertIsInstance(panel, LLMJudgePanel)
        except Exception:
            # Expected if personas directory doesn't exist
            pass

if __name__ == '__main__':
    unittest.main()