
""" LLM-based Judge Agent (H.3.5.3).

Provides LLM-backed judge evaluations using the MultiModelClient for multi-provider support with fallback and retry logic.

Features:

  • Uses JudgePersona for evaluation rubrics and prompts
  • Supports multiple LLM providers via MultiModelClient
  • Records full provenance (model_used, token_usage, latency)
  • Automatic fallback to backup model on failure """

import asyncio
import json
import re
import time
from datetime import datetime, timezone
from pathlib import Path
from typing import List, Dict, Optional, Any

import sys
sys.path.insert(0, str(Path(__file__).parent.parent))

from core.models import Document, AnalystVote, JudgeDecision
from core.multi_model_client import MultiModelClient, CompletionResponse
from core.persona_loader import (
    JudgePersona,
    PersonaLoader,
    get_default_loader,
    get_prompt_template,
)
from .base import BaseJudge


class LLMJudge(BaseJudge):
    """LLM-based judge using multi-model client.

    Uses JudgePersona configurations to drive evaluations across
    multiple LLM providers with automatic fallback.
    """

    def __init__(
        self,
        persona: JudgePersona,
        model_client: Optional[MultiModelClient] = None,
        max_tokens: int = 2048,
        temperature: float = 0.0
    ):
        """Initialize LLM judge.

        Args:
            persona: JudgePersona with evaluation rubric and model routing
            model_client: Optional MultiModelClient (creates default if None)
            max_tokens: Max tokens for completion
            temperature: Sampling temperature (0 for deterministic)
        """
        self.persona = persona
        self.client = model_client or MultiModelClient()
        self.max_tokens = max_tokens
        self.temperature = temperature

        # Set base class attributes from persona
        self.name = persona.persona_id
        # Description from demographics if available
        if hasattr(persona, 'demographics') and persona.demographics:
            self.description = f"{persona.demographics.title} - {persona.demographics.background[:100]}"
        else:
            self.description = f"LLM Judge: {persona.persona_id}"
        self.has_veto_authority = True
        # Weight from persona or model_routing
        self.weight = getattr(persona, 'weight', 1.0)
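    # Construction sketch (illustrative; "lead_judge" is a placeholder
    # persona ID, real IDs come from the persona configs):
    #
    #     persona = get_default_loader().load_persona("lead_judge")
    #     judge = LLMJudge(persona)
    #     decision = judge.evaluate(document, votes)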

    @property
    def model(self) -> str:
        """Get the primary model for this judge."""
        if self.persona.model_routing:
            return self.persona.model_routing.primary_model
        return "claude-sonnet-4"  # Default

    def evaluate(
        self,
        document: Document,
        votes: List[AnalystVote]
    ) -> JudgeDecision:
        """Evaluate analyst votes using LLM.

        This is a sync wrapper around the async evaluation.
        """
        return asyncio.run(self.evaluate_async(document, votes))
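    # Note: asyncio.run() starts a fresh event loop, so evaluate() raises
    # RuntimeError if called from code that is already inside a running loop
    # (e.g., from another coroutine); use `await evaluate_async(...)` there.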

    async def evaluate_async(
        self,
        document: Document,
        votes: List[AnalystVote]
    ) -> JudgeDecision:
        """Evaluate analyst votes using LLM asynchronously.

        Args:
            document: Document being classified
            votes: Analyst votes to evaluate

        Returns:
            JudgeDecision with LLM evaluation and full provenance
        """
        start_time = datetime.now(timezone.utc)

        # Build the evaluation prompt
        prompt = self._build_prompt(document, votes)
        system_prompt = self._build_system_prompt()

        # Get completion from model
        response = await self.client.get_completion(
            model=self.model,
            prompt=prompt,
            persona_id=self.persona.persona_id,
            system_prompt=system_prompt,
            max_tokens=self.max_tokens,
            temperature=self.temperature
        )

        end_time = datetime.now(timezone.utc)

        # Parse the response
        if response.success:
            decision = self._parse_response(response, votes)
        else:
            # Create rejection decision for failed LLM call
            decision = JudgeDecision(
                judge=self.name,
                approved=False,
                reason=f"LLM evaluation failed: {response.error}",
                confidence=0.0,
                metadata={"error": response.error}
            )

        # Add provenance data (H.3.4)
        decision.model_used = response.model_used
        decision.timestamp = response.timestamp
        decision.token_usage = response.token_usage
        decision.raw_response = response.content
        decision.evaluation_start_time = start_time
        decision.evaluation_end_time = end_time
        decision.duration_ms = response.latency_ms

        return decision

    def _build_system_prompt(self) -> str:
        """Build the system prompt from persona."""
        # Build role description from demographics
        role_desc = self.description
        if hasattr(self.persona, 'demographics') and self.persona.demographics:
            role_desc = f"{self.persona.demographics.title} with {self.persona.demographics.experience_years} years experience"

        base = f"""You are {self.persona.persona_id}, an expert judge evaluating document classifications.

Your role: {role_desc}

You must evaluate the analyst votes for a document classification and decide whether to APPROVE or REJECT the consensus classification.

Your evaluation must consider these dimensions:"""

        # Add dimensions from rubric
        for dim in self.persona.evaluation_dimensions:
            # EvaluationDimension uses 'name' field
            base += f"\n- {dim.name}"
            if hasattr(dim, 'score_descriptions') and dim.score_descriptions:
                for score, desc in dim.score_descriptions.items():
                    base += f"\n  Score {score}: {desc}"

        base += """

Respond in JSON format:
{
    "approved": true/false,
    "confidence": 0.0-1.0,
    "reason": "Your detailed reasoning",
    "dimension_scores": {
        "dimension_name": 1-3,
        ...
    },
    "key_concerns": ["concern1", "concern2"]
}
"""
        return base

    def _build_prompt(self, document: Document, votes: List[AnalystVote]) -> str:
        """Build the evaluation prompt."""
        # Format votes for prompt
        votes_text = self._format_votes(votes)

        # Get consensus info
        consensus = self._get_consensus_classification(votes)
        agreement = self._get_agreement_ratio(votes)
        avg_confidence = self._get_average_confidence(votes)

        prompt = f"""## Document to Evaluate

Path: {document.path}
Filename: {document.filename}
Extension: {document.extension}

### Content Preview

{document.content[:2000]}{'...(truncated)' if len(document.content) > 2000 else ''}

### Frontmatter

{self._format_frontmatter(document.frontmatter)}

### Analyst Votes

{votes_text}

### Consensus Summary

- Classification: {consensus}
- Agreement Ratio: {agreement:.2%}
- Average Confidence: {avg_confidence:.2%}

### Your Task

Evaluate whether the consensus classification "{consensus}" is correct for this document.
Consider all evaluation dimensions and provide your decision with detailed reasoning.
"""
        return prompt

    def _format_votes(self, votes: List[AnalystVote]) -> str:
        """Format votes for the prompt."""
        if not votes:
            return "No votes received."

        formatted = []
        for v in votes:
            formatted.append(f"""**{v.agent}:**
- Classification: {v.classification}
- Confidence: {v.confidence:.2%}
- Reasoning: {v.reasoning[:500]}{'...' if len(v.reasoning) > 500 else ''}
""")
        return "\n".join(formatted)

    def _format_frontmatter(self, frontmatter: Dict) -> str:
        """Format frontmatter as YAML."""
        if not frontmatter:
            return "# No frontmatter"

        lines = []
        for k, v in frontmatter.items():
            if isinstance(v, list):
                lines.append(f"{k}:")
                for item in v:
                    lines.append(f"  - {item}")
            else:
                lines.append(f"{k}: {v}")
        return "\n".join(lines)

    def _parse_response(
        self,
        response: CompletionResponse,
        votes: List[AnalystVote]
    ) -> JudgeDecision:
        """Parse LLM response into JudgeDecision."""
        content = response.content.strip()

        # Try to extract JSON from response
        try:
            # Handle markdown code blocks
            if "```json" in content:
                match = re.search(r"```json\s*(.*?)\s*```", content, re.DOTALL)
                if match:
                    content = match.group(1)
            elif "```" in content:
                match = re.search(r"```\s*(.*?)\s*```", content, re.DOTALL)
                if match:
                    content = match.group(1)

            data = json.loads(content)

            return JudgeDecision(
                judge=self.name,
                approved=data.get("approved", False),
                reason=data.get("reason", "No reason provided"),
                confidence=float(data.get("confidence", 0.8)),
                dimension_scores=data.get("dimension_scores", {}),
                metadata={
                    "key_concerns": data.get("key_concerns", []),
                    "raw_json": data
                }
            )

        except (json.JSONDecodeError, KeyError, ValueError, TypeError) as e:
            # Fall back to simple parsing (ValueError/TypeError also cover a
            # non-numeric "confidence" value from the model)
            approved = "approve" in content.lower() and "reject" not in content.lower()
            return JudgeDecision(
                judge=self.name,
                approved=approved,
                reason=content[:500],
                confidence=0.7,
                metadata={"parse_error": str(e)}
            )
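    # Illustrative input for _parse_response: a model reply wrapped in a
    # fenced block is unwrapped before json.loads, so content like
    #
    #     ```json
    #     {"approved": true, "confidence": 0.9, "reason": "Votes agree"}
    #     ```
    #
    # parses cleanly; anything unparseable falls through to keyword matching.
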

class LLMJudgePanel:
    """Panel of LLM judges using multiple personas.

    Manages a panel of judges, each with their own persona and model routing,
    for diverse multi-model evaluation.
    """

    def __init__(
        self,
        persona_ids: Optional[List[str]] = None,
        loader: Optional[PersonaLoader] = None,
        client: Optional[MultiModelClient] = None
    ):
        """Initialize judge panel.

        Args:
            persona_ids: List of persona IDs to load (default: all from config)
            loader: PersonaLoader instance
            client: Shared MultiModelClient
        """
        self.loader = loader or get_default_loader()
        self.client = client or MultiModelClient()
        self.judges: List[LLMJudge] = []

        # Load personas
        if persona_ids:
            for pid in persona_ids:
                persona = self.loader.load_persona(pid)
                self.judges.append(LLMJudge(persona, self.client))
        else:
            # Load all available personas
            for persona in self.loader.registry.personas.values():
                self.judges.append(LLMJudge(persona, self.client))

    async def evaluate_all(
        self,
        document: Document,
        votes: List[AnalystVote]
    ) -> List[JudgeDecision]:
        """Run all judges in parallel.

        Args:
            document: Document to evaluate
            votes: Analyst votes

        Returns:
            List of JudgeDecisions from all judges
        """
        tasks = [
            judge.evaluate_async(document, votes)
            for judge in self.judges
        ]
        return await asyncio.gather(*tasks)
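    # Note: asyncio.gather() with the default return_exceptions=False
    # re-raises the first exception; however, LLM-call failures are already
    # converted to rejection decisions inside evaluate_async, so a normal run
    # yields one JudgeDecision per judge, in panel order.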

    def get_judge_info(self) -> List[Dict[str, Any]]:
        """Get information about all judges in the panel."""
        return [
            {
                "persona_id": j.persona.persona_id,
                "model": j.model,
                "weight": j.weight,
                "dimensions": [d.name for d in j.persona.evaluation_dimensions]
            }
            for j in self.judges
        ]

    def verify_model_diversity(self) -> Dict[str, Any]:
        """Verify model diversity requirements.

        Returns:
            Dict with diversity metrics and compliance status
        """
        models = [j.model for j in self.judges]
        unique_models = set(models)

        # Get model families
        def get_family(model: str) -> str:
            if "claude" in model.lower():
                return "anthropic"
            elif "gpt" in model.lower():
                return "openai"
            elif "deepseek" in model.lower():
                return "deepseek"
            elif "qwen" in model.lower():
                return "alibaba"
            elif "llama" in model.lower():
                return "meta"
            elif "gemini" in model.lower():
                return "google"
            return "unknown"

        families = [get_family(m) for m in models]
        unique_families = set(families)

        # Calculate max weight for a single model
        model_weights = {}
        total_weight = sum(j.weight for j in self.judges)
        for j in self.judges:
            model = j.model
            model_weights[model] = model_weights.get(model, 0) + j.weight

        max_single_weight = max(model_weights.values()) / total_weight if total_weight > 0 else 0

        # Diversity requirements (from config)
        min_families = 3
        max_single_weight_allowed = 0.40

        return {
            "total_judges": len(self.judges),
            "unique_models": len(unique_models),
            "model_list": list(unique_models),
            "unique_families": len(unique_families),
            "family_list": list(unique_families),
            "max_single_model_weight": round(max_single_weight, 3),
            "meets_family_requirement": len(unique_families) >= min_families,
            "meets_weight_requirement": max_single_weight <= max_single_weight_allowed,
            "is_compliant": (
                len(unique_families) >= min_families and
                max_single_weight <= max_single_weight_allowed
            )
        }
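    # Worked example (hypothetical panel): three judges with weight 1.0
    # routed to claude-sonnet-4, gpt-4o, and deepseek-chat map to the
    # anthropic, openai, and deepseek families; unique_families == 3 and
    # max_single_model_weight == 0.333, so both checks pass and
    # is_compliant is True.
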

def create_llm_judge_panel(
    persona_ids: Optional[List[str]] = None
) -> LLMJudgePanel:
    """Create an LLM judge panel with default configuration.

    Args:
        persona_ids: Optional list of persona IDs (loads all if None)

    Returns:
        Configured LLMJudgePanel
    """
    return LLMJudgePanel(persona_ids=persona_ids)
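

# Minimal smoke-test sketch (illustrative only). It assumes the default
# PersonaLoader can find persona configs and that MultiModelClient picks up
# provider credentials from its usual configuration; only APIs defined in
# this module are used.
if __name__ == "__main__":
    panel = create_llm_judge_panel()
    print(json.dumps(panel.verify_model_diversity(), indent=2))
    for info in panel.get_judge_info():
        print(f"{info['persona_id']}: {info['model']} (weight={info['weight']})")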