Skip to main content

scripts-two-stage-review

#!/usr/bin/env python3
"""
CODITECT Two-Stage Review System (ADR-076)

Implements Superpowers' two-stage review pattern:

  • Stage 1: Spec Compliance (must pass before Stage 2)
  • Stage 2: Code Quality (only after Stage 1 passes)

Integrated with MoE Constitutional Court judge panels.
"""

from dataclasses import dataclass, field from enum import Enum from typing import List, Optional, Dict, Any, TYPE_CHECKING

if TYPE_CHECKING: # Avoid circular imports pass

class ReviewStage(Enum):
    """Identifiers for the two phases of a review."""

    # Phase that checks the implementation against its specification.
    SPEC_COMPLIANCE = "spec_compliance"
    # Phase that checks how well the implementation is built.
    CODE_QUALITY = "code_quality"

class StageVerdict(Enum):
    """Verdict values a review stage can produce."""

    # Verdicts used by Stage 1 (spec compliance).
    PASS = "pass"
    FAIL = "fail"

    # Verdicts used by Stage 2 (code quality).
    APPROVE = "approve"
    REQUEST_CHANGES = "request_changes"

@dataclass
class ReviewResult:
    """Outcome of one review stage.

    Attributes:
        stage: The stage that produced this result.
        verdict: Pass/Fail for Stage 1, Approve/RequestChanges for Stage 2.
        score: Aggregate score (0-100).
        issues: Issues reported for this stage.
        judge_verdicts: Mapping of judge to the verdict that judge gave.
        blocking: True when this result blocks progression; defaults to False.
    """

    stage: ReviewStage
    verdict: StageVerdict
    score: float
    issues: List[str]
    judge_verdicts: Dict[str, str]
    blocking: bool = False

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view; the two enum fields are emitted as their values."""
        stage_value = self.stage.value
        verdict_value = self.verdict.value
        return dict(
            stage=stage_value,
            verdict=verdict_value,
            score=self.score,
            issues=self.issues,
            judge_verdicts=self.judge_verdicts,
            blocking=self.blocking,
        )

@dataclass
class TwoStageReviewResult:
    """Combined outcome of a two-stage review.

    Attributes:
        stage1: Result of the spec compliance review.
        stage2: Result of the code quality review (None if Stage 1 failed).
        final_verdict: Overall verdict string.
        total_review_cycles: Number of review cycles executed.
    """

    stage1: Optional[ReviewResult]
    stage2: Optional[ReviewResult]
    final_verdict: str
    total_review_cycles: int

    def to_dict(self) -> Dict[str, Any]:
        """Return a plain-dict view; a falsy stage result is emitted as None."""
        first = None
        if self.stage1:
            first = self.stage1.to_dict()

        second = None
        if self.stage2:
            second = self.stage2.to_dict()

        return dict(
            stage1=first,
            stage2=second,
            final_verdict=self.final_verdict,
            total_review_cycles=self.total_review_cycles,
        )

class TwoStageReviewer:
    """Two-stage review system integrating with MoE Constitutional Court.

    Implements Superpowers' pattern: spec compliance MUST pass before
    code quality review begins. Prevents "well-written wrong code".

    Usage:
        orchestrator = MoEOrchestrator(...)
        reviewer = TwoStageReviewer(orchestrator)
        result = await reviewer.review(implementation, spec, task_id)

    Attributes:
        STAGE1_JUDGES: Judges for spec compliance (domain, compliance, security)
        STAGE2_JUDGES: Judges for code quality (architect, QA, ethics)
        MAJORITY_THRESHOLD: Fraction of the configured panel that must vote
            favourably for a stage to succeed.
    """

    # Stage 1: Spec-focused judges
    STAGE1_JUDGES = ["domain_expert", "compliance_auditor", "security_analyst"]

    # Stage 2: Quality-focused judges
    STAGE2_JUDGES = ["technical_architect", "qa_evaluator", "ai_ethics_reviewer"]

    # Majority threshold (2/3 for 3 judges)
    # Use 0.66 to ensure 2/3 passes with 3 judges (2 >= 3*0.66 = 1.98)
    MAJORITY_THRESHOLD = 0.66

    def __init__(self, orchestrator: Any, max_cycles: int = 3):
        """Initialize two-stage reviewer.

        Args:
            orchestrator: MoE orchestrator for running judge evaluations.
                Its ``evaluate`` coroutine is awaited once per stage.
            max_cycles: Maximum review cycles before giving up.
                NOTE(review): this value is only stored; no method of this
                class reads it, so enforcing the limit is left to the caller
                (compare ``review_cycles`` against ``max_cycles``).
        """
        self.orchestrator = orchestrator
        self.review_cycles = 0
        self.max_cycles = max_cycles

    async def review(
        self,
        implementation: str,
        spec: str,
        plan_task_id: str
    ) -> "TwoStageReviewResult":
        """Execute two-stage review on implementation.

        Stage 1 (Spec Compliance) must pass before Stage 2 (Code Quality) runs.
        Each call increments ``review_cycles`` by one.

        Args:
            implementation: Code/changes to review
            spec: Original requirements/plan
            plan_task_id: Task ID from PILOT plan (e.g., "A.1.1")

        Returns:
            TwoStageReviewResult whose ``final_verdict`` is "SPEC_VIOLATION"
            (Stage 1 failed, ``stage2`` is None),
            "QUALITY_IMPROVEMENTS_NEEDED" (Stage 2 requested changes) or
            "APPROVED".
        """
        self.review_cycles += 1

        # Stage 1: Spec Compliance
        stage1_result = await self._run_stage1(implementation, spec, plan_task_id)

        if stage1_result.verdict == StageVerdict.FAIL:
            # Short-circuit: quality is never judged for non-compliant code.
            return TwoStageReviewResult(
                stage1=stage1_result,
                stage2=None,
                final_verdict="SPEC_VIOLATION",
                total_review_cycles=self.review_cycles
            )

        # Stage 2: Code Quality (only if Stage 1 passed)
        stage2_result = await self._run_stage2(implementation, spec, plan_task_id)

        if stage2_result.verdict == StageVerdict.REQUEST_CHANGES:
            final_verdict = "QUALITY_IMPROVEMENTS_NEEDED"
        else:
            final_verdict = "APPROVED"

        return TwoStageReviewResult(
            stage1=stage1_result,
            stage2=stage2_result,
            final_verdict=final_verdict,
            total_review_cycles=self.review_cycles
        )

    @staticmethod
    def _normalize_verdict(raw: Any) -> str:
        """Return a judge verdict as a stripped, upper-cased string.

        A ``None`` verdict becomes the empty string, so it never matches a
        favourable verdict. Non-string values are passed through ``str``.
        """
        if raw is None:
            return ""
        text = raw if isinstance(raw, str) else str(raw)
        return text.strip().upper()

    @classmethod
    def _count_votes(cls, judge_results: Any, wanted: str) -> int:
        """Count judge results whose normalized verdict equals ``wanted``."""
        return sum(
            1 for judge in judge_results
            if cls._normalize_verdict(judge.verdict) == wanted
        )

    async def _run_stage1(
        self,
        implementation: str,
        spec: str,
        task_id: str
    ) -> "ReviewResult":
        """Run spec compliance review with Stage 1 judges.

        Focus: Does implementation match requirements?

        Args:
            implementation: Code to review
            spec: Original specification
            task_id: Task ID for tracking

        Returns:
            ReviewResult with spec compliance verdict; ``blocking`` is True
            exactly when the verdict is FAIL.
        """
        prompt = self._build_stage1_prompt(implementation, spec, task_id)

        result = await self.orchestrator.evaluate(
            content=prompt,
            judge_personas=self.STAGE1_JUDGES,
            evaluation_type="spec_compliance"
        )

        # Materialize once: the results are walked twice (tally + mapping),
        # and a missing/None collection is treated as "no votes".
        judge_results = list(result.judge_results or [])

        pass_count = self._count_votes(judge_results, "PASS")
        # The bar is relative to the configured panel, not to the number of
        # judges that actually answered.
        threshold = len(self.STAGE1_JUDGES) * self.MAJORITY_THRESHOLD

        verdict = StageVerdict.PASS if pass_count >= threshold else StageVerdict.FAIL
        blocking = verdict == StageVerdict.FAIL

        return ReviewResult(
            stage=ReviewStage.SPEC_COMPLIANCE,
            verdict=verdict,
            score=result.aggregate_score,
            issues=getattr(result, 'blocking_issues', []) or [],
            judge_verdicts={j.persona: j.verdict for j in judge_results},
            blocking=blocking
        )

    async def _run_stage2(
        self,
        implementation: str,
        spec: str,
        task_id: str
    ) -> "ReviewResult":
        """Run code quality review with Stage 2 judges.

        Focus: Is implementation well-built?
        Prerequisite: Stage 1 must have passed.

        Args:
            implementation: Code to review
            spec: Original specification
            task_id: Task ID for tracking

        Returns:
            ReviewResult with code quality verdict; ``blocking`` is always
            False for this stage.
        """
        prompt = self._build_stage2_prompt(implementation, spec, task_id)

        result = await self.orchestrator.evaluate(
            content=prompt,
            judge_personas=self.STAGE2_JUDGES,
            evaluation_type="code_quality"
        )

        # Materialize once: the results are walked twice (tally + mapping),
        # and a missing/None collection is treated as "no votes".
        judge_results = list(result.judge_results or [])

        approve_count = self._count_votes(judge_results, "APPROVE")
        threshold = len(self.STAGE2_JUDGES) * self.MAJORITY_THRESHOLD

        verdict = (
            StageVerdict.APPROVE
            if approve_count >= threshold
            else StageVerdict.REQUEST_CHANGES
        )

        return ReviewResult(
            stage=ReviewStage.CODE_QUALITY,
            verdict=verdict,
            score=result.aggregate_score,
            issues=getattr(result, 'improvement_suggestions', []) or [],
            judge_verdicts={j.persona: j.verdict for j in judge_results},
            blocking=False  # Quality issues are never blocking
        )

    def _build_stage1_prompt(
        self,
        implementation: str,
        spec: str,
        task_id: str
    ) -> str:
        """Build prompt for Stage 1 spec compliance review."""
        return f"""## Spec Compliance Review

Task ID: {task_id}

Original Specification

{spec}

Implementation to Review

{implementation}

Review Focus

  1. Are ALL requirements from the spec addressed?
  2. Are there any MISSING features?
  3. Is there any EXTRA scope not in spec (scope creep)?
  4. Are edge cases from spec handled?
  5. Does business logic match requirements?

Verdict Guidelines

  • PASS: All spec requirements are satisfied
  • FAIL: One or more spec requirements are NOT met

Output Format

Provide your verdict and reasoning:

  • verdict: PASS or FAIL

  • spec_coverage_score: 0-100

  • missing_requirements: [list any missing]

  • extra_scope: [list any scope creep]

  • reasoning: Brief explanation """

    def _build_stage2_prompt(
        self,
        implementation: str,
        spec: str,
        task_id: str
    ) -> str:
        """Build prompt for Stage 2 code quality review."""
        return f"""## Code Quality Review

Task ID: {task_id} Note: This implementation has PASSED spec compliance review.

Original Specification

{spec}

Implementation to Review

{implementation}

Review Focus

  1. Architecture patterns and code organization
  2. Test coverage and test quality
  3. Performance considerations
  4. Maintainability and readability
  5. Error handling and edge cases
  6. AI safety and ethics (if applicable)

Verdict Guidelines

  • APPROVE: Quality standards met, ready for merge
  • REQUEST_CHANGES: Quality improvements needed before merge

Output Format

Provide your verdict and feedback:

  • verdict: APPROVE or REQUEST_CHANGES

  • quality_score: 0-100

  • architecture_issues: [list any]

  • test_coverage_gaps: [list any]

  • improvement_suggestions: [list any]

  • reasoning: Brief explanation """

    def reset_cycles(self) -> None:
        """Reset the review cycle counter."""
        self.review_cycles = 0

# Convenience functions

def create_reviewer(orchestrator: Any) -> TwoStageReviewer:
    """Build a TwoStageReviewer for ``orchestrator`` using the constructor defaults."""
    reviewer = TwoStageReviewer(orchestrator)
    return reviewer

async def quick_review(
    orchestrator: Any,
    implementation: str,
    spec: str,
    task_id: str,
) -> TwoStageReviewResult:
    """Run one two-stage review using a freshly constructed reviewer.

    Args:
        orchestrator: MoE orchestrator
        implementation: Code to review
        spec: Specification
        task_id: Task ID

    Returns:
        TwoStageReviewResult
    """
    # A new reviewer is built per call, so its cycle counter starts at zero.
    outcome = await TwoStageReviewer(orchestrator).review(implementation, spec, task_id)
    return outcome