scripts-two-stage-review
#!/usr/bin/env python3
"""
CODITECT Two-Stage Review System (ADR-076)

Implements Superpowers' two-stage review pattern:
- Stage 1: Spec Compliance (must pass before Stage 2)
- Stage 2: Code Quality (only after Stage 1 passes)

Integrated with MoE Constitutional Court judge panels.
"""
from dataclasses import dataclass, field from enum import Enum from typing import List, Optional, Dict, Any, TYPE_CHECKING
if TYPE_CHECKING: # Avoid circular imports pass
class ReviewStage(Enum):
    """Identifies which of the two review stages a result belongs to."""

    # First gate: does the implementation match the spec?
    SPEC_COMPLIANCE = "spec_compliance"
    # Second gate: is the implementation well built?
    CODE_QUALITY = "code_quality"
class StageVerdict(Enum):
    """Verdicts a review stage can produce.

    PASS / FAIL are the Stage 1 outcomes; APPROVE / REQUEST_CHANGES are
    the Stage 2 outcomes.
    """

    # --- Stage 1 verdicts ---
    PASS = "pass"
    FAIL = "fail"

    # --- Stage 2 verdicts ---
    APPROVE = "approve"
    REQUEST_CHANGES = "request_changes"
@dataclass
class ReviewResult:
    """Result from a single review stage.

    Attributes:
        stage: Which stage this result is from
        verdict: Pass/Fail for Stage 1, Approve/RequestChanges for Stage 2
        score: Aggregate score (0-100)
        issues: List of issues found
        judge_verdicts: Per-judge verdict mapping
        blocking: Whether this result blocks progression
    """

    stage: ReviewStage
    verdict: StageVerdict
    score: float
    issues: List[str]
    judge_verdicts: Dict[str, str]
    blocking: bool = False

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dictionary.

        The enum fields are emitted as their ``.value`` strings. ``issues``
        and ``judge_verdicts`` are shallow-copied so that a caller editing
        the returned payload cannot mutate this result by accident.

        Returns:
            Dict with the keys ``stage``, ``verdict``, ``score``, ``issues``,
            ``judge_verdicts`` and ``blocking``.
        """
        return {
            "stage": self.stage.value,
            "verdict": self.verdict.value,
            "score": self.score,
            # Copies, not the live containers held by the dataclass.
            "issues": list(self.issues),
            "judge_verdicts": dict(self.judge_verdicts),
            "blocking": self.blocking,
        }
@dataclass
class TwoStageReviewResult:
    """Combined outcome of a two-stage review.

    Attributes:
        stage1: Result of the spec compliance review
        stage2: Result of the code quality review (None if Stage 1 failed)
        final_verdict: Overall verdict string
        total_review_cycles: Number of review cycles executed
    """

    stage1: Optional[ReviewResult]
    stage2: Optional[ReviewResult]
    final_verdict: str
    total_review_cycles: int

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to dictionary.

        A stage that is absent is emitted as ``None``; a present stage is
        serialized through its own ``to_dict``.
        """
        first_stage = self.stage1.to_dict() if self.stage1 else None
        second_stage = self.stage2.to_dict() if self.stage2 else None
        return dict(
            stage1=first_stage,
            stage2=second_stage,
            final_verdict=self.final_verdict,
            total_review_cycles=self.total_review_cycles,
        )
class TwoStageReviewer:
    """Two-stage review system integrating with MoE Constitutional Court.

    Implements Superpowers' pattern: Spec compliance MUST pass before
    code quality review begins. Prevents "well-written wrong code".

    Usage:
        orchestrator = MoEOrchestrator(...)
        reviewer = TwoStageReviewer(orchestrator)
        result = await reviewer.review(implementation, spec, task_id)

    Attributes:
        STAGE1_JUDGES: Judges for spec compliance (domain, compliance, security)
        STAGE2_JUDGES: Judges for code quality (architect, QA, ethics)
        MAJORITY_THRESHOLD: Fraction of the panel whose votes are required
            for a stage to succeed.
    """

    # Stage 1: Spec-focused judges
    STAGE1_JUDGES = ["domain_expert", "compliance_auditor", "security_analyst"]

    # Stage 2: Quality-focused judges
    STAGE2_JUDGES = ["technical_architect", "qa_evaluator", "ai_ethics_reviewer"]

    # Majority threshold (2/3 for 3 judges)
    # Use 0.66 to ensure 2/3 passes with 3 judges (2 >= 3*0.66 = 1.98)
    MAJORITY_THRESHOLD = 0.66

    def __init__(self, orchestrator: Any, max_cycles: int = 3):
        """Initialize two-stage reviewer.

        Args:
            orchestrator: MoE orchestrator for running judge evaluations.
                It is awaited as ``evaluate(content=..., judge_personas=...,
                evaluation_type=...)``.
            max_cycles: Maximum review cycles before giving up.
                NOTE(review): this value is only stored; ``review`` never
                compares ``review_cycles`` against it, so the limit has to
                be enforced by the caller.
        """
        self.orchestrator = orchestrator
        self.review_cycles = 0
        self.max_cycles = max_cycles

    async def review(
        self,
        implementation: str,
        spec: str,
        plan_task_id: str
    ) -> TwoStageReviewResult:
        """Execute two-stage review on implementation.

        Stage 1 (Spec Compliance) must pass before Stage 2 (Code Quality) runs.
        Every call increments ``review_cycles`` by one.

        Args:
            implementation: Code/changes to review
            spec: Original requirements/plan
            plan_task_id: Task ID from PILOT plan (e.g., "A.1.1")

        Returns:
            TwoStageReviewResult whose ``final_verdict`` is one of
            ``"SPEC_VIOLATION"`` (Stage 1 failed, Stage 2 skipped),
            ``"QUALITY_IMPROVEMENTS_NEEDED"`` or ``"APPROVED"``.
        """
        self.review_cycles += 1

        # Stage 1: Spec Compliance
        stage1_result = await self._run_stage1(implementation, spec, plan_task_id)
        if stage1_result.verdict == StageVerdict.FAIL:
            # Short-circuit: quality review is pointless for off-spec code.
            return TwoStageReviewResult(
                stage1=stage1_result,
                stage2=None,
                final_verdict="SPEC_VIOLATION",
                total_review_cycles=self.review_cycles,
            )

        # Stage 2: Code Quality (only if Stage 1 passed)
        stage2_result = await self._run_stage2(implementation, spec, plan_task_id)
        if stage2_result.verdict == StageVerdict.REQUEST_CHANGES:
            final_verdict = "QUALITY_IMPROVEMENTS_NEEDED"
        else:
            final_verdict = "APPROVED"

        return TwoStageReviewResult(
            stage1=stage1_result,
            stage2=stage2_result,
            final_verdict=final_verdict,
            total_review_cycles=self.review_cycles,
        )

    def _has_majority(
        self,
        judge_results: Any,
        winning_verdict: str,
        panel_size: int
    ) -> bool:
        """Tell whether enough judges returned ``winning_verdict``.

        Verdicts are compared after ``str(...).strip().upper()`` so that
        stray whitespace or letter case does not lose a vote, and a missing
        (``None``) verdict counts as a non-vote instead of raising.

        Args:
            judge_results: Iterable of objects exposing a ``verdict`` attribute
            winning_verdict: Upper-case verdict to count (e.g. ``"PASS"``)
            panel_size: Number of judges the threshold is computed against

        Returns:
            True when the vote count reaches
            ``panel_size * MAJORITY_THRESHOLD``.
        """
        votes = sum(
            1 for judge in judge_results
            if str(judge.verdict).strip().upper() == winning_verdict
        )
        return votes >= panel_size * self.MAJORITY_THRESHOLD

    async def _run_stage1(
        self,
        implementation: str,
        spec: str,
        task_id: str
    ) -> ReviewResult:
        """Run spec compliance review with Stage 1 judges.

        Focus: Does implementation match requirements?

        Args:
            implementation: Code to review
            spec: Original specification
            task_id: Task ID for tracking

        Returns:
            ReviewResult with spec compliance verdict; ``blocking`` is True
            exactly when the verdict is FAIL.
        """
        prompt = self._build_stage1_prompt(implementation, spec, task_id)
        result = await self.orchestrator.evaluate(
            content=prompt,
            judge_personas=self.STAGE1_JUDGES,
            evaluation_type="spec_compliance"
        )

        # The threshold is measured against the configured panel, not the
        # number of results that came back.
        passed = self._has_majority(
            result.judge_results, "PASS", len(self.STAGE1_JUDGES)
        )

        return ReviewResult(
            stage=ReviewStage.SPEC_COMPLIANCE,
            verdict=StageVerdict.PASS if passed else StageVerdict.FAIL,
            score=result.aggregate_score,
            issues=getattr(result, 'blocking_issues', []) or [],
            judge_verdicts={j.persona: j.verdict for j in result.judge_results},
            blocking=not passed
        )

    async def _run_stage2(
        self,
        implementation: str,
        spec: str,
        task_id: str
    ) -> ReviewResult:
        """Run code quality review with Stage 2 judges.

        Focus: Is implementation well-built?
        Prerequisite: Stage 1 must have passed.

        Args:
            implementation: Code to review
            spec: Original specification
            task_id: Task ID for tracking

        Returns:
            ReviewResult with code quality verdict
        """
        prompt = self._build_stage2_prompt(implementation, spec, task_id)
        result = await self.orchestrator.evaluate(
            content=prompt,
            judge_personas=self.STAGE2_JUDGES,
            evaluation_type="code_quality"
        )

        approved = self._has_majority(
            result.judge_results, "APPROVE", len(self.STAGE2_JUDGES)
        )

        return ReviewResult(
            stage=ReviewStage.CODE_QUALITY,
            verdict=(
                StageVerdict.APPROVE if approved else StageVerdict.REQUEST_CHANGES
            ),
            score=result.aggregate_score,
            issues=getattr(result, 'improvement_suggestions', []) or [],
            judge_verdicts={j.persona: j.verdict for j in result.judge_results},
            blocking=False  # Quality issues are never blocking
        )

    def _build_stage1_prompt(
        self,
        implementation: str,
        spec: str,
        task_id: str
    ) -> str:
        """Build prompt for Stage 1 spec compliance review."""
        return f"""## Spec Compliance Review
Task ID: {task_id}
Original Specification
{spec}
Implementation to Review
{implementation}
Review Focus
- Are ALL requirements from the spec addressed?
- Are there any MISSING features?
- Is there any EXTRA scope not in spec (scope creep)?
- Are edge cases from spec handled?
- Does business logic match requirements?
Verdict Guidelines
- PASS: All spec requirements are satisfied
- FAIL: One or more spec requirements are NOT met
Output Format
Provide your verdict and reasoning:
-
verdict: PASS or FAIL
-
spec_coverage_score: 0-100
-
missing_requirements: [list any missing]
-
extra_scope: [list any scope creep]
-
reasoning: Brief explanation """

    def _build_stage2_prompt(
        self,
        implementation: str,
        spec: str,
        task_id: str
    ) -> str:
        """Build prompt for Stage 2 code quality review."""
        return f"""## Code Quality Review
Task ID: {task_id} Note: This implementation has PASSED spec compliance review.
Original Specification
{spec}
Implementation to Review
{implementation}
Review Focus
- Architecture patterns and code organization
- Test coverage and test quality
- Performance considerations
- Maintainability and readability
- Error handling and edge cases
- AI safety and ethics (if applicable)
Verdict Guidelines
- APPROVE: Quality standards met, ready for merge
- REQUEST_CHANGES: Quality improvements needed before merge
Output Format
Provide your verdict and feedback:
-
verdict: APPROVE or REQUEST_CHANGES
-
quality_score: 0-100
-
architecture_issues: [list any]
-
test_coverage_gaps: [list any]
-
improvement_suggestions: [list any]
-
reasoning: Brief explanation """

    def reset_cycles(self) -> None:
        """Reset the review cycle counter."""
        self.review_cycles = 0
# Convenience functions
def create_reviewer(orchestrator: Any) -> TwoStageReviewer:
    """Build a TwoStageReviewer around ``orchestrator`` using default settings."""
    default_reviewer = TwoStageReviewer(orchestrator)
    return default_reviewer
async def quick_review(
    orchestrator: Any,
    implementation: str,
    spec: str,
    task_id: str,
) -> TwoStageReviewResult:
    """Run a one-off two-stage review with a freshly created reviewer.

    Args:
        orchestrator: MoE orchestrator
        implementation: Code to review
        spec: Specification
        task_id: Task ID

    Returns:
        TwoStageReviewResult
    """
    # A new reviewer per call means the cycle counter always starts at zero.
    return await TwoStageReviewer(orchestrator).review(implementation, spec, task_id)