scripts-two-stage-review

#!/usr/bin/env python3
"""
CODITECT Two-Stage Review System (ADR-076)

Implements Superpowers' two-stage review pattern:

- Stage 1: Spec Compliance (must pass before Stage 2)
- Stage 2: Code Quality (only after Stage 1 passes)

Integrated with MoE Constitutional Court judge panels.
"""

from dataclasses import dataclass, field
from enum import Enum
from typing import List, Optional, Dict, Any, TYPE_CHECKING

if TYPE_CHECKING:
    # Avoid circular imports
    pass

class ReviewStage(Enum):
    """Identifies which of the two review stages a result belongs to."""

    # First gate: does the implementation match the specification?
    SPEC_COMPLIANCE = "spec_compliance"

    # Second gate: is the implementation well built?
    CODE_QUALITY = "code_quality"

class StageVerdict(Enum):
    """Verdicts a review stage can produce.

    Stage 1 results carry PASS or FAIL; Stage 2 results carry APPROVE or
    REQUEST_CHANGES.
    """

    # Stage 1 verdicts
    PASS = "pass"
    FAIL = "fail"

    # Stage 2 verdicts
    APPROVE = "approve"
    REQUEST_CHANGES = "request_changes"

@dataclass
class ReviewResult:
    """Result from a single review stage.

    Attributes:
        stage: Which stage this result is from
        verdict: Pass/Fail for Stage 1, Approve/RequestChanges for Stage 2
        score: Aggregate score (0-100)
        issues: List of issues found
        judge_verdicts: Per-judge verdict mapping
        blocking: Whether this result blocks progression
    """

    stage: ReviewStage
    verdict: StageVerdict
    score: float
    issues: List[str]
    judge_verdicts: Dict[str, str]
    blocking: bool = False

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to a plain dictionary.

        Enum members are emitted as their string values. ``issues`` and
        ``judge_verdicts`` are shallow-copied (``None`` becomes an empty
        container) so that mutating the returned dictionary can never alter
        this result object.

        Returns:
            Dictionary with keys ``stage``, ``verdict``, ``score``,
            ``issues``, ``judge_verdicts`` and ``blocking``.
        """
        return {
            "stage": self.stage.value,
            "verdict": self.verdict.value,
            "score": self.score,
            # Copies: the payload is a snapshot, not a view onto live state.
            "issues": list(self.issues or []),
            "judge_verdicts": dict(self.judge_verdicts or {}),
            "blocking": self.blocking,
        }

@dataclass
class TwoStageReviewResult:
    """Combined outcome of a two-stage review.

    Attributes:
        stage1: Result from spec compliance review
        stage2: Result from code quality review (None if Stage 1 failed)
        final_verdict: Overall verdict string
        total_review_cycles: Number of review cycles executed
    """

    stage1: Optional[ReviewResult]
    stage2: Optional[ReviewResult]
    final_verdict: str
    total_review_cycles: int

    def to_dict(self) -> Dict[str, Any]:
        """Serialize to dictionary; a missing stage is emitted as ``None``."""
        serialized: Dict[str, Any] = {"stage1": None, "stage2": None}

        # Only stages that are present get expanded into nested dictionaries.
        if self.stage1:
            serialized["stage1"] = self.stage1.to_dict()
        if self.stage2:
            serialized["stage2"] = self.stage2.to_dict()

        serialized["final_verdict"] = self.final_verdict
        serialized["total_review_cycles"] = self.total_review_cycles
        return serialized

class TwoStageReviewer:
    """Two-stage review system integrating with MoE Constitutional Court.

    Implements Superpowers' pattern: Spec compliance MUST pass before
    code quality review begins. Prevents "well-written wrong code".

    Usage:
        orchestrator = MoEOrchestrator(...)
        reviewer = TwoStageReviewer(orchestrator)
        result = await reviewer.review(implementation, spec, task_id)

    Attributes:
        STAGE1_JUDGES: Judges for spec compliance (domain, compliance, security)
        STAGE2_JUDGES: Judges for code quality (architect, QA, ethics)
        MAJORITY_THRESHOLD: Fraction of the panel that must agree for a
            stage to succeed
    """

    # Stage 1: Spec-focused judges
    STAGE1_JUDGES = ["domain_expert", "compliance_auditor", "security_analyst"]

    # Stage 2: Quality-focused judges
    STAGE2_JUDGES = ["technical_architect", "qa_evaluator", "ai_ethics_reviewer"]

    # Majority threshold (2/3 for 3 judges)
    # Use 0.66 to ensure 2/3 passes with 3 judges (2 >= 3*0.66 = 1.98)
    MAJORITY_THRESHOLD = 0.66

    def __init__(self, orchestrator: Any, max_cycles: int = 3):
        """
        Initialize two-stage reviewer.

        Args:
            orchestrator: MoE orchestrator for running judge evaluations
            max_cycles: Maximum review cycles before giving up. NOTE: this
                value is only stored; ``review()`` does not consult it, so a
                caller that wants a hard cap must compare ``review_cycles``
                against ``max_cycles`` itself.
        """
        self.orchestrator = orchestrator
        self.review_cycles = 0
        self.max_cycles = max_cycles

    async def review(
        self,
        implementation: str,
        spec: str,
        plan_task_id: str
    ) -> TwoStageReviewResult:
        """
        Execute two-stage review on implementation.

        Stage 1 (Spec Compliance) must pass before Stage 2 (Code Quality) runs.
        Every call increments ``review_cycles`` on this reviewer.

        Args:
            implementation: Code/changes to review
            spec: Original requirements/plan
            plan_task_id: Task ID from PILOT plan (e.g., "A.1.1")

        Returns:
            TwoStageReviewResult whose ``final_verdict`` is one of
            "SPEC_VIOLATION", "QUALITY_IMPROVEMENTS_NEEDED" or "APPROVED"
        """
        self.review_cycles += 1

        # Stage 1: Spec Compliance
        stage1_result = await self._run_stage1(implementation, spec, plan_task_id)

        if stage1_result.verdict == StageVerdict.FAIL:
            # Short-circuit: Stage 2 is never run on a spec violation.
            return TwoStageReviewResult(
                stage1=stage1_result,
                stage2=None,
                final_verdict="SPEC_VIOLATION",
                total_review_cycles=self.review_cycles
            )

        # Stage 2: Code Quality (only if Stage 1 passed)
        stage2_result = await self._run_stage2(implementation, spec, plan_task_id)

        if stage2_result.verdict == StageVerdict.REQUEST_CHANGES:
            final_verdict = "QUALITY_IMPROVEMENTS_NEEDED"
        else:
            final_verdict = "APPROVED"

        return TwoStageReviewResult(
            stage1=stage1_result,
            stage2=stage2_result,
            final_verdict=final_verdict,
            total_review_cycles=self.review_cycles
        )

    @staticmethod
    def _normalize_verdict(raw: Any) -> str:
        """Return a judge verdict as a trimmed, upper-cased string.

        ``None`` maps to the empty string so that a judge that produced no
        verdict simply counts as a non-matching vote instead of raising.
        """
        if raw is None:
            return ""
        return str(raw).strip().upper()

    def _has_majority(self, judge_results: Any, expected: str, panel_size: int) -> bool:
        """Decide whether enough judges returned ``expected``.

        Args:
            judge_results: Iterable of objects exposing a ``verdict`` attribute
            expected: Upper-case verdict that counts as a favourable vote
            panel_size: Number of judges configured for the stage; the
                threshold is ``panel_size * MAJORITY_THRESHOLD``

        Returns:
            True when the favourable votes reach the threshold. An empty panel
            never has a majority, so a stage with no judges cannot succeed
            vacuously.
        """
        if panel_size <= 0:
            return False
        votes = sum(
            1 for judge in judge_results
            if self._normalize_verdict(judge.verdict) == expected
        )
        return votes >= panel_size * self.MAJORITY_THRESHOLD

    async def _run_stage1(
        self,
        implementation: str,
        spec: str,
        task_id: str
    ) -> ReviewResult:
        """Run spec compliance review with Stage 1 judges.

        Focus: Does implementation match requirements?

        Args:
            implementation: Code to review
            spec: Original specification
            task_id: Task ID for tracking

        Returns:
            ReviewResult with spec compliance verdict; ``blocking`` is True
            exactly when the verdict is FAIL
        """
        prompt = self._build_stage1_prompt(implementation, spec, task_id)

        result = await self.orchestrator.evaluate(
            content=prompt,
            judge_personas=self.STAGE1_JUDGES,
            evaluation_type="spec_compliance"
        )

        # Materialize once: the results are walked twice (tally + mapping).
        judges = list(result.judge_results)

        if self._has_majority(judges, "PASS", len(self.STAGE1_JUDGES)):
            verdict = StageVerdict.PASS
        else:
            verdict = StageVerdict.FAIL

        return ReviewResult(
            stage=ReviewStage.SPEC_COMPLIANCE,
            verdict=verdict,
            score=result.aggregate_score,
            issues=getattr(result, 'blocking_issues', []) or [],
            judge_verdicts={j.persona: j.verdict for j in judges},
            blocking=verdict == StageVerdict.FAIL
        )

    async def _run_stage2(
        self,
        implementation: str,
        spec: str,
        task_id: str
    ) -> ReviewResult:
        """Run code quality review with Stage 2 judges.

        Focus: Is implementation well-built?
        Prerequisite: Stage 1 must have passed.

        Args:
            implementation: Code to review
            spec: Original specification
            task_id: Task ID for tracking

        Returns:
            ReviewResult with code quality verdict
        """
        prompt = self._build_stage2_prompt(implementation, spec, task_id)

        result = await self.orchestrator.evaluate(
            content=prompt,
            judge_personas=self.STAGE2_JUDGES,
            evaluation_type="code_quality"
        )

        # Materialize once: the results are walked twice (tally + mapping).
        judges = list(result.judge_results)

        if self._has_majority(judges, "APPROVE", len(self.STAGE2_JUDGES)):
            verdict = StageVerdict.APPROVE
        else:
            verdict = StageVerdict.REQUEST_CHANGES

        return ReviewResult(
            stage=ReviewStage.CODE_QUALITY,
            verdict=verdict,
            score=result.aggregate_score,
            issues=getattr(result, 'improvement_suggestions', []) or [],
            judge_verdicts={j.persona: j.verdict for j in judges},
            blocking=False  # Quality issues are never blocking
        )

    def _build_stage1_prompt(
        self,
        implementation: str,
        spec: str,
        task_id: str
    ) -> str:
        """Build prompt for Stage 1 spec compliance review."""
        return f"""## Spec Compliance Review

Task ID: {task_id}

Original Specification

{spec}

Implementation to Review

{implementation}

Review Focus

Are ALL requirements from the spec addressed?
Are there any MISSING features?
Is there any EXTRA scope not in spec (scope creep)?
Are edge cases from spec handled?
Does business logic match requirements?

Verdict Guidelines

PASS: All spec requirements are satisfied
FAIL: One or more spec requirements are NOT met

Output Format

Provide your verdict and reasoning:

verdict: PASS or FAIL
spec_coverage_score: 0-100
missing_requirements: [list any missing]
extra_scope: [list any scope creep]
reasoning: Brief explanation """

    def _build_stage2_prompt(
        self,
        implementation: str,
        spec: str,
        task_id: str
    ) -> str:
        """Build prompt for Stage 2 code quality review."""
        return f"""## Code Quality Review

Task ID: {task_id} Note: This implementation has PASSED spec compliance review.

Original Specification

{spec}

Implementation to Review

{implementation}

Review Focus

Architecture patterns and code organization
Test coverage and test quality
Performance considerations
Maintainability and readability
Error handling and edge cases
AI safety and ethics (if applicable)

Verdict Guidelines

APPROVE: Quality standards met, ready for merge
REQUEST_CHANGES: Quality improvements needed before merge

Output Format

Provide your verdict and feedback:

verdict: APPROVE or REQUEST_CHANGES
quality_score: 0-100
architecture_issues: [list any]
test_coverage_gaps: [list any]
improvement_suggestions: [list any]
reasoning: Brief explanation """

    def reset_cycles(self) -> None:
        """Reset the review cycle counter."""
        self.review_cycles = 0

# Convenience functions

def create_reviewer(orchestrator: Any) -> TwoStageReviewer:
    """Build a TwoStageReviewer around ``orchestrator`` with default settings."""
    reviewer = TwoStageReviewer(orchestrator)
    return reviewer

async def quick_review(
    orchestrator: Any,
    implementation: str,
    spec: str,
    task_id: str,
) -> TwoStageReviewResult:
    """Run one two-stage review using a freshly created reviewer.

    Args:
        orchestrator: MoE orchestrator
        implementation: Code to review
        spec: Specification
        task_id: Task ID

    Returns:
        TwoStageReviewResult from the reviewer's ``review`` call
    """
    # The reviewer is discarded afterwards, so its cycle counter is not reused.
    return await TwoStageReviewer(orchestrator).review(implementation, spec, task_id)

Original Specification​

Implementation to Review​

Review Focus​

Verdict Guidelines​

Output Format​

Original Specification​

Implementation to Review​

Review Focus​

Verdict Guidelines​

Output Format​

Convenience functions

Original Specification

Implementation to Review

Review Focus

Verdict Guidelines

Output Format

Original Specification

Implementation to Review

Review Focus

Verdict Guidelines

Output Format