# scripts-test-debate-protocol
""" Tests for Debate Protocol (H.3.3).
Tests cover:
- DebateOrchestrator class
- Disagreement detection
- Debate context preparation
- Debate round execution
- Consensus calculator integration """
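# Note: these are synchronous unittest.TestCase tests; the async orchestrator
# APIs are exercised through manually created event loops inside the tests.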
import unittest
import asyncio
from datetime import datetime, timezone
from typing import Dict, List
from unittest.mock import MagicMock, AsyncMock, patch
import sys
import os

sys.path.insert(0, os.path.dirname(os.path.dirname(__file__)))
from moe_classifier.core.debate import (
    DebateOrchestrator,
    DebateConfig,
    DebateResult,
    DebateRound,
    Disagreement,
    JudgeEvaluation,
    Verdict,
    create_default_orchestrator,
    requires_debate,
)
from moe_classifier.core.consensus import ConsensusCalculator, ConsensusConfig
from moe_classifier.core.models import JudgeDecision, ApprovalType


class TestDebateConfig(unittest.TestCase):
    """Tests for DebateConfig dataclass."""
def test_default_config(self):
"""Test default configuration values."""
config = DebateConfig()
self.assertEqual(config.max_debate_rounds, 3)
self.assertEqual(config.convergence_threshold, 0.8)
self.assertEqual(config.dimension_disagreement_threshold, 1.5)
self.assertEqual(config.min_evaluations_for_debate, 2)
self.assertEqual(config.debate_timeout_seconds, 120)
def test_custom_config(self):
"""Test custom configuration."""
config = DebateConfig(
max_debate_rounds=5,
convergence_threshold=0.9,
dimension_disagreement_threshold=1.0
)
self.assertEqual(config.max_debate_rounds, 5)
self.assertEqual(config.convergence_threshold, 0.9)
self.assertEqual(config.dimension_disagreement_threshold, 1.0)
def test_persona_weights(self):
"""Test default persona weights sum to 1.0."""
config = DebateConfig()
total = sum(config.persona_weights.values())
self.assertAlmostEqual(total, 1.0, places=2)
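
    # Hedged illustration: persona_weights presumably accepts an explicit
    # mapping, e.g. DebateConfig(persona_weights={"technical_architect": 0.5,
    # "security_analyst": 0.5}). The keys here are illustrative persona ids
    # borrowed from other tests in this file, not documented defaults.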

class TestJudgeEvaluation(unittest.TestCase):
    """Tests for JudgeEvaluation dataclass."""
def test_create_evaluation(self):
"""Test creating a judge evaluation."""
eval = JudgeEvaluation(
persona_id="technical_architect",
model_used="claude-sonnet-4",
verdict=Verdict.PASS,
confidence=0.9,
dimension_scores={"architecture": 3, "security": 2}
)
self.assertEqual(eval.persona_id, "technical_architect")
self.assertEqual(eval.verdict, Verdict.PASS)
self.assertEqual(eval.confidence, 0.9)
self.assertEqual(eval.debate_round, 0)
def test_evaluation_timestamp(self):
"""Test evaluation has timestamp."""
eval = JudgeEvaluation(
persona_id="test",
model_used="test",
verdict=Verdict.PASS,
confidence=0.8,
dimension_scores={}
)
self.assertIsNotNone(eval.timestamp)

class TestDisagreement(unittest.TestCase):
    """Tests for Disagreement dataclass."""
def test_verdict_disagreement(self):
"""Test creating verdict-level disagreement."""
d = Disagreement(
type="verdict",
positions={"judge1": "PASS", "judge2": "FAIL"},
severity=0.5
)
self.assertEqual(d.type, "verdict")
self.assertIsNone(d.dimension)
self.assertEqual(d.severity, 0.5)
def test_dimension_disagreement(self):
"""Test creating dimension-level disagreement."""
d = Disagreement(
type="dimension",
dimension="security",
positions={"judge1": 3, "judge2": 1},
severity=0.8
)
self.assertEqual(d.type, "dimension")
self.assertEqual(d.dimension, "security")

class TestDebateOrchestrator(unittest.TestCase):
    """Tests for DebateOrchestrator class."""
def setUp(self):
"""Set up test fixtures."""
self.orchestrator = DebateOrchestrator()
self.default_config = DebateConfig()
def test_create_orchestrator(self):
"""Test creating debate orchestrator."""
orch = DebateOrchestrator()
self.assertIsNotNone(orch.config)
self.assertEqual(orch.config.max_debate_rounds, 3)
def test_create_with_custom_config(self):
"""Test creating orchestrator with custom config."""
config = DebateConfig(max_debate_rounds=5)
orch = DebateOrchestrator(config)
self.assertEqual(orch.config.max_debate_rounds, 5)
def test_create_default_orchestrator(self):
"""Test convenience function."""
orch = create_default_orchestrator()
self.assertIsInstance(orch, DebateOrchestrator)

class TestAgreementCalculation(unittest.TestCase):
    """Tests for agreement calculation."""
def setUp(self):
"""Set up test fixtures."""
self.orchestrator = DebateOrchestrator()
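
    # Note on expected values: the assertions in this class assume that
    # _calculate_agreement blends verdict agreement with dimension-score
    # closeness, roughly 0.6 * verdict_agreement + 0.4 * dimension_agreement.
    # This weighting is inferred from the inline arithmetic in
    # test_no_agreement below, not from documented behaviour.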
def test_perfect_agreement(self):
"""Test 100% agreement when all verdicts match."""
evals = [
JudgeEvaluation("j1", "m1", Verdict.PASS, 0.9, {"dim1": 3}),
JudgeEvaluation("j2", "m2", Verdict.PASS, 0.85, {"dim1": 3}),
JudgeEvaluation("j3", "m3", Verdict.PASS, 0.95, {"dim1": 3}),
]
agreement = self.orchestrator._calculate_agreement(evals)
self.assertAlmostEqual(agreement, 1.0, places=2)
def test_no_agreement(self):
"""Test low agreement when all verdicts differ."""
evals = [
JudgeEvaluation("j1", "m1", Verdict.PASS, 0.9, {"dim1": 3}),
JudgeEvaluation("j2", "m2", Verdict.FAIL, 0.85, {"dim1": 1}),
]
agreement = self.orchestrator._calculate_agreement(evals)
        # Roughly 0.6 * 0.5 (50% verdict agreement) + 0.4 * 0.0 (divergent dimension scores) = 0.3
self.assertLess(agreement, 0.5)
def test_partial_agreement(self):
"""Test partial agreement."""
evals = [
JudgeEvaluation("j1", "m1", Verdict.PASS, 0.9, {"dim1": 3}),
JudgeEvaluation("j2", "m2", Verdict.PASS, 0.85, {"dim1": 2}),
JudgeEvaluation("j3", "m3", Verdict.FAIL, 0.8, {"dim1": 2}),
]
agreement = self.orchestrator._calculate_agreement(evals)
# 2/3 verdicts agree (0.667), dimensions fairly close
self.assertGreater(agreement, 0.5)
self.assertLess(agreement, 1.0)
def test_single_evaluation(self):
"""Test agreement with single evaluation."""
evals = [
JudgeEvaluation("j1", "m1", Verdict.PASS, 0.9, {})
]
agreement = self.orchestrator._calculate_agreement(evals)
self.assertEqual(agreement, 1.0)
def test_empty_evaluations(self):
"""Test agreement with empty list."""
agreement = self.orchestrator._calculate_agreement([])
self.assertEqual(agreement, 1.0)

class TestDisagreementDetection(unittest.TestCase):
    """Tests for disagreement detection (H.3.3.2)."""
def setUp(self):
"""Set up test fixtures."""
self.orchestrator = DebateOrchestrator()
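
    # Assumed detection rules (inferred from the DebateConfig defaults and the
    # comments below): differing verdicts produce a "verdict" disagreement,
    # and a dimension-score gap above dimension_disagreement_threshold
    # (default 1.5) produces a "dimension" disagreement.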
def test_detect_verdict_disagreement(self):
"""Test detecting verdict-level disagreement."""
evals = [
JudgeEvaluation("j1", "m1", Verdict.PASS, 0.9, {}),
JudgeEvaluation("j2", "m2", Verdict.FAIL, 0.85, {}),
]
disagreements = self.orchestrator._identify_disagreements(evals)
self.assertEqual(len(disagreements), 1)
self.assertEqual(disagreements[0].type, "verdict")
self.assertEqual(disagreements[0].positions["j1"], "PASS")
self.assertEqual(disagreements[0].positions["j2"], "FAIL")
def test_detect_dimension_disagreement(self):
"""Test detecting dimension-level disagreement."""
evals = [
JudgeEvaluation("j1", "m1", Verdict.PASS, 0.9, {"security": 3}),
JudgeEvaluation("j2", "m2", Verdict.PASS, 0.85, {"security": 1}),
]
disagreements = self.orchestrator._identify_disagreements(evals)
# Should find dimension disagreement (gap of 2 > threshold of 1.5)
dim_disagreements = [d for d in disagreements if d.type == "dimension"]
self.assertEqual(len(dim_disagreements), 1)
self.assertEqual(dim_disagreements[0].dimension, "security")
def test_no_disagreement_when_close(self):
"""Test no disagreement when scores are close."""
evals = [
JudgeEvaluation("j1", "m1", Verdict.PASS, 0.9, {"security": 3}),
JudgeEvaluation("j2", "m2", Verdict.PASS, 0.85, {"security": 2}),
]
disagreements = self.orchestrator._identify_disagreements(evals)
# Gap of 1 < threshold of 1.5
dim_disagreements = [d for d in disagreements if d.type == "dimension"]
self.assertEqual(len(dim_disagreements), 0)
def test_multiple_disagreements(self):
"""Test detecting multiple disagreements."""
evals = [
JudgeEvaluation("j1", "m1", Verdict.PASS, 0.9, {"security": 3, "performance": 1}),
JudgeEvaluation("j2", "m2", Verdict.FAIL, 0.85, {"security": 1, "performance": 3}),
]
disagreements = self.orchestrator._identify_disagreements(evals)
# Should have verdict disagreement + 2 dimension disagreements
self.assertGreaterEqual(len(disagreements), 3)
def test_disagreements_sorted_by_severity(self):
"""Test disagreements are sorted by severity."""
evals = [
JudgeEvaluation("j1", "m1", Verdict.PASS, 0.9, {"dim1": 3, "dim2": 2}),
JudgeEvaluation("j2", "m2", Verdict.FAIL, 0.85, {"dim1": 1, "dim2": 1}),
]
disagreements = self.orchestrator._identify_disagreements(evals)
# Check sorted by severity (descending)
for i in range(len(disagreements) - 1):
self.assertGreaterEqual(disagreements[i].severity, disagreements[i+1].severity)

class TestDebateContextPreparation(unittest.TestCase):
    """Tests for debate context preparation (H.3.3.3)."""
def setUp(self):
"""Set up test fixtures."""
self.orchestrator = DebateOrchestrator()
def test_prepare_context_includes_round_number(self):
"""Test context includes debate round number."""
evals = [
JudgeEvaluation("j1", "m1", Verdict.PASS, 0.9, {}, rationale="Good architecture"),
JudgeEvaluation("j2", "m2", Verdict.FAIL, 0.85, {}, rationale="Security issues"),
]
disagreements = self.orchestrator._identify_disagreements(evals)
context = self.orchestrator._prepare_debate_context(evals, disagreements, round_num=0)
self.assertIn("DEBATE ROUND 1", context)
def test_prepare_context_includes_positions(self):
"""Test context includes judge positions."""
evals = [
JudgeEvaluation("technical_architect", "claude", Verdict.PASS, 0.9, {}, rationale="Clean design"),
JudgeEvaluation("security_analyst", "gpt4", Verdict.FAIL, 0.85, {}, rationale="Vulnerabilities found"),
]
disagreements = self.orchestrator._identify_disagreements(evals)
context = self.orchestrator._prepare_debate_context(evals, disagreements, round_num=0)
self.assertIn("technical_architect", context)
self.assertIn("security_analyst", context)
self.assertIn("PASS", context)
self.assertIn("FAIL", context)
def test_prepare_context_includes_rationale(self):
"""Test context includes rationales."""
evals = [
JudgeEvaluation("j1", "m1", Verdict.PASS, 0.9, {}, rationale="Architecture is solid"),
JudgeEvaluation("j2", "m2", Verdict.FAIL, 0.85, {}, rationale="Missing error handling"),
]
disagreements = self.orchestrator._identify_disagreements(evals)
context = self.orchestrator._prepare_debate_context(evals, disagreements, round_num=0)
self.assertIn("Architecture is solid", context)
self.assertIn("Missing error handling", context)
def test_prepare_context_includes_instructions(self):
"""Test context includes debate instructions."""
evals = [
JudgeEvaluation("j1", "m1", Verdict.PASS, 0.9, {}),
JudgeEvaluation("j2", "m2", Verdict.FAIL, 0.85, {}),
]
disagreements = self.orchestrator._identify_disagreements(evals)
context = self.orchestrator._prepare_debate_context(evals, disagreements, round_num=0)
self.assertIn("INSTRUCTIONS", context)
self.assertIn("evidence", context.lower())
def test_context_truncates_long_rationales(self):
"""Test long rationales are truncated."""
long_rationale = "A" * 500
evals = [
JudgeEvaluation("j1", "m1", Verdict.PASS, 0.9, {}, rationale=long_rationale),
JudgeEvaluation("j2", "m2", Verdict.FAIL, 0.85, {}),
]
disagreements = self.orchestrator._identify_disagreements(evals)
context = self.orchestrator._prepare_debate_context(evals, disagreements, round_num=0)
# Should truncate to ~300 chars + "..."
self.assertIn("...", context)

class TestDebateRoundExecution(unittest.TestCase):
    """Tests for debate round execution (H.3.3.4)."""
def setUp(self):
"""Set up test fixtures."""
self.orchestrator = DebateOrchestrator()
def test_conduct_round_without_callback(self):
"""Test debate round without evaluation callback."""
evals = [
JudgeEvaluation("j1", "m1", Verdict.PASS, 0.9, {}),
JudgeEvaluation("j2", "m2", Verdict.FAIL, 0.85, {}),
]
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
result = loop.run_until_complete(
self.orchestrator._conduct_debate_round(
evals, "debate context", "artifact", {}, round_number=1
)
)
# Should return evaluations with updated round number
self.assertEqual(len(result), 2)
for e in result:
self.assertEqual(e.debate_round, 1)
finally:
loop.close()
def test_conduct_round_with_callback(self):
"""Test debate round with evaluation callback."""
# Create mock callback
async def mock_callback(persona_id, artifact, context, extra):
return JudgeEvaluation(
persona_id=persona_id,
model_used="test",
verdict=Verdict.PASS, # Everyone converges to PASS
confidence=0.95,
dimension_scores={},
rationale=f"Updated evaluation for {persona_id}"
)
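        # The callback signature above, (persona_id, artifact, context, extra)
        # -> JudgeEvaluation, mirrors how set_evaluation_callback is exercised
        # in this test; it is inferred from the test, not from documentation.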
self.orchestrator.set_evaluation_callback(mock_callback)
evals = [
JudgeEvaluation("j1", "m1", Verdict.PASS, 0.9, {}),
JudgeEvaluation("j2", "m2", Verdict.FAIL, 0.85, {}),
]
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
result = loop.run_until_complete(
self.orchestrator._conduct_debate_round(
evals, "debate context", "artifact", {}, round_number=1
)
)
self.assertEqual(len(result), 2)
# All should now be PASS due to callback
for e in result:
self.assertEqual(e.verdict, Verdict.PASS)
self.assertEqual(e.debate_round, 1)
finally:
loop.close()

class TestOrchestrateDebate(unittest.TestCase):
    """Tests for full debate orchestration."""
def setUp(self):
"""Set up test fixtures."""
self.orchestrator = DebateOrchestrator()
def test_orchestrate_no_debate_needed(self):
"""Test orchestration when no debate is needed."""
evals = [
JudgeEvaluation("j1", "m1", Verdict.PASS, 0.9, {}),
JudgeEvaluation("j2", "m2", Verdict.PASS, 0.85, {}),
JudgeEvaluation("j3", "m3", Verdict.PASS, 0.95, {}),
]
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
result = loop.run_until_complete(
self.orchestrator.orchestrate_debate(evals, "artifact", {})
)
self.assertIsInstance(result, DebateResult)
self.assertEqual(result.total_debate_rounds, 0)
self.assertTrue(result.convergence_achieved)
self.assertGreaterEqual(result.final_agreement, 0.8)
finally:
loop.close()
def test_orchestrate_single_evaluation(self):
"""Test orchestration with single evaluation."""
evals = [
JudgeEvaluation("j1", "m1", Verdict.PASS, 0.9, {}),
]
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
result = loop.run_until_complete(
self.orchestrator.orchestrate_debate(evals, "artifact", {})
)
self.assertEqual(result.total_debate_rounds, 0)
self.assertTrue(result.convergence_achieved)
finally:
loop.close()
def test_orchestrate_debate_needed(self):
"""Test orchestration when debate is needed."""
evals = [
JudgeEvaluation("j1", "m1", Verdict.PASS, 0.9, {}),
JudgeEvaluation("j2", "m2", Verdict.FAIL, 0.85, {}),
]
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
try:
result = loop.run_until_complete(
self.orchestrator.orchestrate_debate(evals, "artifact", {})
)
# Without callback, no actual change happens but rounds are recorded
self.assertIsInstance(result, DebateResult)
self.assertLessEqual(result.total_debate_rounds, 3) # Max rounds
finally:
loop.close()
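

# Usage sketch (not a test): outside unittest, the orchestrate_debate API
# exercised above would typically be driven with asyncio.run() rather than a
# manually managed event loop. The artifact string and empty context dict are
# placeholders mirroring the fixtures used in these tests.
def _example_debate_run():
    orchestrator = DebateOrchestrator()
    evals = [
        JudgeEvaluation("j1", "m1", Verdict.PASS, 0.9, {}),
        JudgeEvaluation("j2", "m2", Verdict.FAIL, 0.85, {}),
    ]
    return asyncio.run(orchestrator.orchestrate_debate(evals, "artifact", {}))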

class TestDebateSummary(unittest.TestCase):
    """Tests for debate summary generation."""
def setUp(self):
"""Set up test fixtures."""
self.orchestrator = DebateOrchestrator()
def test_get_summary(self):
"""Test generating debate summary."""
evals = [
JudgeEvaluation("j1", "m1", Verdict.PASS, 0.9, {}),
JudgeEvaluation("j2", "m2", Verdict.PASS, 0.85, {}),
]
result = DebateResult(
final_evaluations=evals,
rounds=[],
initial_agreement=0.6,
final_agreement=0.9,
convergence_achieved=True,
total_debate_rounds=2
)
summary = self.orchestrator.get_debate_summary(result)
self.assertEqual(summary['total_rounds'], 2)
self.assertEqual(summary['initial_agreement'], 0.6)
self.assertEqual(summary['final_agreement'], 0.9)
self.assertTrue(summary['convergence_achieved'])
self.assertAlmostEqual(summary['agreement_improvement'], 0.3, places=2)
self.assertIn('j1', summary['final_verdicts'])
self.assertIn('j2', summary['final_verdicts'])

class TestRequiresDebateFunction(unittest.TestCase):
    """Tests for requires_debate convenience function."""
def test_requires_debate_true(self):
"""Test requires_debate returns True when needed."""
evals = [
JudgeEvaluation("j1", "m1", Verdict.PASS, 0.9, {}),
JudgeEvaluation("j2", "m2", Verdict.FAIL, 0.85, {}),
]
self.assertTrue(requires_debate(evals))
def test_requires_debate_false(self):
"""Test requires_debate returns False when not needed."""
evals = [
JudgeEvaluation("j1", "m1", Verdict.PASS, 0.9, {}),
JudgeEvaluation("j2", "m2", Verdict.PASS, 0.85, {}),
]
self.assertFalse(requires_debate(evals))
def test_requires_debate_single_evaluation(self):
"""Test requires_debate with single evaluation."""
evals = [
JudgeEvaluation("j1", "m1", Verdict.PASS, 0.9, {}),
]
self.assertFalse(requires_debate(evals))
def test_requires_debate_custom_threshold(self):
"""Test requires_debate with custom threshold."""
evals = [
JudgeEvaluation("j1", "m1", Verdict.PASS, 0.9, {}),
JudgeEvaluation("j2", "m2", Verdict.PASS, 0.85, {}),
JudgeEvaluation("j3", "m3", Verdict.FAIL, 0.8, {}),
]
        # With a high threshold (0.95), the mixed verdicts fall short, so debate is needed
self.assertTrue(requires_debate(evals, threshold=0.95))
        # With a lower threshold (0.5), agreement is sufficient, so debate is not needed
self.assertFalse(requires_debate(evals, threshold=0.5))

class TestConsensusCalculatorIntegration(unittest.TestCase):
    """Tests for ConsensusCalculator debate integration (H.3.3.5)."""
def setUp(self):
"""Set up test fixtures."""
self.config = ConsensusConfig(enable_debate=True)
self.calculator = ConsensusCalculator(self.config)
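
    # Field mapping assumed by the conversion tests below (inferred from the
    # assertions, not from documented behaviour):
    #   judge <-> persona_id, approved=True <-> Verdict.PASS,
    #   approved=False <-> Verdict.FAIL, reason <-> rationale,
    #   metadata["model_used"] <-> model_used,
    #   metadata["dimension_scores"] <-> dimension_scores.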
def test_enable_debate_protocol(self):
"""Test enabling debate protocol."""
self.calculator.enable_debate_protocol()
self.assertIsNotNone(self.calculator._debate_orchestrator)
def test_convert_decisions_to_evaluations(self):
"""Test converting JudgeDecision to JudgeEvaluation."""
decisions = [
JudgeDecision(
judge="technical_architect",
approved=True,
reason="Good architecture",
confidence=0.9,
metadata={"model_used": "claude", "dimension_scores": {"security": 3}}
),
JudgeDecision(
judge="security_analyst",
approved=False,
reason="Security vulnerabilities",
confidence=0.85,
metadata={"model_used": "gpt4"}
),
]
evals = self.calculator.convert_decisions_to_evaluations(decisions)
self.assertEqual(len(evals), 2)
self.assertEqual(evals[0].persona_id, "technical_architect")
self.assertEqual(evals[0].verdict, Verdict.PASS)
self.assertEqual(evals[1].persona_id, "security_analyst")
self.assertEqual(evals[1].verdict, Verdict.FAIL)
def test_convert_evaluations_to_decisions(self):
"""Test converting JudgeEvaluation to JudgeDecision."""
evals = [
JudgeEvaluation(
persona_id="technical_architect",
model_used="claude",
verdict=Verdict.PASS,
confidence=0.9,
dimension_scores={"security": 3},
rationale="Good architecture"
),
]
decisions = self.calculator.convert_evaluations_to_decisions(evals)
self.assertEqual(len(decisions), 1)
self.assertEqual(decisions[0].judge, "technical_architect")
self.assertTrue(decisions[0].approved)
self.assertEqual(decisions[0].confidence, 0.9)
def test_check_debate_needed(self):
"""Test check_debate_needed method."""
# Agreeing decisions
decisions_agree = [
JudgeDecision("j1", True, "OK", 0.9, metadata={}),
JudgeDecision("j2", True, "OK", 0.85, metadata={}),
]
self.assertFalse(self.calculator.check_debate_needed(decisions_agree))
# Disagreeing decisions
decisions_disagree = [
JudgeDecision("j1", True, "OK", 0.9, metadata={}),
JudgeDecision("j2", False, "Not OK", 0.85, metadata={}),
]
self.assertTrue(self.calculator.check_debate_needed(decisions_disagree))
def test_debate_disabled(self):
"""Test debate is skipped when disabled in config.
Note: check_debate_needed still returns True because it only checks
if debate would be helpful based on agreement, not whether it's enabled.
The enable_debate flag is checked in apply_judge_decisions_with_debate.
"""
config = ConsensusConfig(enable_debate=False)
calculator = ConsensusCalculator(config)
decisions = [
JudgeDecision("j1", True, "OK", 0.9, metadata={}),
JudgeDecision("j2", False, "Not OK", 0.85, metadata={}),
]
# check_debate_needed only checks agreement, not config
# Debate would help but won't happen since it's disabled
self.assertTrue(calculator.check_debate_needed(decisions))
# The enable_debate flag prevents actual debate in apply_judge_decisions_with_debate
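
    # Presumed gating flow (inferred from the docstring and comments above,
    # not from documented behaviour):
    #     if config.enable_debate and calculator.check_debate_needed(decisions):
    #         evals = calculator.convert_decisions_to_evaluations(decisions)
    #         ...run DebateOrchestrator, then convert_evaluations_to_decisions...
    # apply_judge_decisions_with_debate is expected to encapsulate this check.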

class TestVerdictEnum(unittest.TestCase):
    """Tests for Verdict enum."""
def test_verdict_values(self):
"""Test verdict enum values."""
self.assertEqual(Verdict.PASS.value, "PASS")
self.assertEqual(Verdict.FAIL.value, "FAIL")
self.assertEqual(Verdict.CONDITIONAL.value, "CONDITIONAL")
def test_verdict_string_enum(self):
"""Test verdict is string enum."""
self.assertIsInstance(Verdict.PASS, str)
# Verdict.PASS.value gives the string value
self.assertEqual(Verdict.PASS.value, "PASS")

class TestDebateRound(unittest.TestCase):
    """Tests for DebateRound dataclass."""
def test_create_debate_round(self):
"""Test creating debate round."""
evals = [JudgeEvaluation("j1", "m1", Verdict.PASS, 0.9, {})]
disagreements = [Disagreement("verdict", positions={"j1": "PASS"})]
round = DebateRound(
round_number=1,
initial_agreement=0.6,
final_agreement=0.8,
disagreements=disagreements,
evaluations=evals,
debate_context="Test context"
)
self.assertEqual(round.round_number, 1)
self.assertEqual(round.initial_agreement, 0.6)
self.assertEqual(round.final_agreement, 0.8)
self.assertIsNotNone(round.timestamp)

class TestDebateResult(unittest.TestCase):
    """Tests for DebateResult dataclass."""
def test_create_debate_result(self):
"""Test creating debate result."""
evals = [JudgeEvaluation("j1", "m1", Verdict.PASS, 0.9, {})]
result = DebateResult(
final_evaluations=evals,
rounds=[],
initial_agreement=0.6,
final_agreement=0.9,
convergence_achieved=True,
total_debate_rounds=2
)
self.assertEqual(len(result.final_evaluations), 1)
self.assertEqual(result.total_debate_rounds, 2)
self.assertTrue(result.convergence_achieved)

if __name__ == '__main__':
    unittest.main()