""" Multi-Model Client for MoE Judge Panel (H.3.5.2).

Provides unified interface to multiple LLM providers for judge evaluations. Supports: Anthropic (Claude), OpenAI (GPT-4/O3), DeepSeek, Alibaba (Qwen), Meta (Llama), Google (Gemini), MiniMax (ADR-200).

Features:

  • Unified completion API across providers
  • Automatic fallback to backup models
  • Retry logic with exponential backoff
  • Token usage tracking
  • Response timing for provenance
  • Provider-aware model selection (ADR-073)

Configuration Priority:

  1. Runtime parameters
  2. Environment variables
  3. Provider mode detection (ADR-073)
  4. config/judge-model-routing.json """
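
# Example of the configuration priority in practice. The persona ID and model
# names below are illustrative assumptions, not values shipped with this module:
#
#   export CODITECT_JUDGE_MODEL_TECHNICAL_ARCHITECT="o3"            # 2. env override
#
#   client = MultiModelClient()
#   client.get_model_for_persona("technical_architect")             # -> "o3"
#   client.get_model_for_persona("technical_architect", "gpt-4o")   # -> "gpt-4o" (1. runtime)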

import asyncio
import json
import logging
import os
import time
from abc import ABC, abstractmethod
from dataclasses import dataclass, field
from datetime import datetime, timezone
from enum import Enum
from pathlib import Path
from typing import Dict, List, Optional, Any

from .provider_detector import (
    ProviderDetector,
    ProviderMode,
    Provider,
    ProviderDetectionResult,
    get_default_detector,
    reset_default_detector,
)

# Import usage tracking (ADR-075)
try:
    from scripts.core.usage_tracking import UsageTracker, get_tracker
    _USAGE_TRACKING_AVAILABLE = True
except ImportError:
    _USAGE_TRACKING_AVAILABLE = False
    UsageTracker = None
    get_tracker = None

# Set up logging
logger = logging.getLogger(__name__)

class ModelProvider(str, Enum):
    """Supported model providers."""

    ANTHROPIC = "anthropic"
    OPENAI = "openai"
    DEEPSEEK = "deepseek"
    ALIBABA = "alibaba"
    META = "meta"
    GOOGLE = "google"
    MINIMAX = "minimax"

@dataclass
class ModelConfig:
    """Configuration for a specific model."""

    model_id: str
    provider: ModelProvider
    api_key_env: str
    max_tokens: int = 4096
    temperature: float = 0.0
    timeout_seconds: int = 60

@dataclass
class CompletionRequest:
    """Request for model completion."""

    prompt: str
    model: str
    persona_id: str
    max_tokens: int = 4096
    temperature: float = 0.0
    system_prompt: Optional[str] = None
    metadata: Dict[str, Any] = field(default_factory=dict)

@dataclass
class CompletionResponse:
    """Response from model completion."""

    content: str
    model_used: str
    provider: str
    token_usage: int
    input_tokens: int
    output_tokens: int
    latency_ms: int
    timestamp: datetime
    raw_response: Optional[Dict] = None
    success: bool = True
    error: Optional[str] = None

    def to_dict(self) -> Dict[str, Any]:
        """Convert to a dictionary for serialization (content truncated to 500 chars)."""
        return {
            "content": self.content[:500] + "..." if len(self.content) > 500 else self.content,
            "model_used": self.model_used,
            "provider": self.provider,
            "token_usage": self.token_usage,
            "input_tokens": self.input_tokens,
            "output_tokens": self.output_tokens,
            "latency_ms": self.latency_ms,
            "timestamp": self.timestamp.isoformat(),
            "success": self.success,
            "error": self.error,
        }

@dataclass
class FallbackConfig:
    """Fallback strategy configuration."""

    max_retries: int = 2
    retry_delay_seconds: float = 1.0
    use_backup_on_failure: bool = True
    use_backup_on_timeout: bool = True
    timeout_seconds: int = 60
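
# Illustrative shape of config/judge-model-routing.json, inferred from the
# loaders below (_load_fallback_config, _get_routing_config, get_backup_model).
# The exact file contents are an assumption; only the key names are grounded
# in this module:
#
#   {
#     "fallback_strategy": {
#       "max_retries": 2,
#       "retry_delay_seconds": 1.0,
#       "use_backup_on_failure": true,
#       "use_backup_on_timeout": true,
#       "timeout_seconds": 60
#     },
#     "routing": {
#       "technical_architect": {
#         "primary_model": "claude-sonnet-4",
#         "backup_model": "gpt-4o"
#       }
#     }
#   }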

class BaseProviderClient(ABC):
    """Abstract base class for provider clients."""

    def __init__(self, api_key: str, timeout: int = 60):
        self.api_key = api_key
        self.timeout = timeout

    @abstractmethod
    async def complete(self, request: CompletionRequest) -> CompletionResponse:
        """Execute completion request."""
        pass

    @property
    @abstractmethod
    def provider(self) -> ModelProvider:
        """Return provider enum."""
        pass
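
# Each concrete client below implements complete() against its provider's SDK
# and never raises: failures come back as CompletionResponse(success=False).
# A new provider would follow the same shape; sketch (the "Acme" names are
# hypothetical and would also need a ModelProvider enum entry):
#
#   class AcmeClient(BaseProviderClient):
#       @property
#       def provider(self) -> ModelProvider:
#           return ModelProvider.ACME
#
#       async def complete(self, request: CompletionRequest) -> CompletionResponse:
#           ...  # call the Acme SDK, return a CompletionResponse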

class AnthropicClient(BaseProviderClient):
    """Anthropic Claude API client."""

    @property
    def provider(self) -> ModelProvider:
        return ModelProvider.ANTHROPIC

    async def complete(self, request: CompletionRequest) -> CompletionResponse:
        """Execute completion with the Anthropic API."""
        start_time = time.time()

        try:
            # Import here to avoid requiring anthropic if not used
            import anthropic

            client = anthropic.Anthropic(api_key=self.api_key)

            messages = [{"role": "user", "content": request.prompt}]

            response = client.messages.create(
                model=request.model,
                max_tokens=request.max_tokens,
                temperature=request.temperature,
                system=request.system_prompt or "",
                messages=messages,
            )

            latency_ms = int((time.time() - start_time) * 1000)

            return CompletionResponse(
                content=response.content[0].text,
                model_used=request.model,
                provider=self.provider.value,
                token_usage=response.usage.input_tokens + response.usage.output_tokens,
                input_tokens=response.usage.input_tokens,
                output_tokens=response.usage.output_tokens,
                latency_ms=latency_ms,
                timestamp=datetime.now(timezone.utc),
                raw_response={"id": response.id, "model": response.model},
            )

        except Exception as e:
            latency_ms = int((time.time() - start_time) * 1000)
            return CompletionResponse(
                content="",
                model_used=request.model,
                provider=self.provider.value,
                token_usage=0,
                input_tokens=0,
                output_tokens=0,
                latency_ms=latency_ms,
                timestamp=datetime.now(timezone.utc),
                success=False,
                error=str(e),
            )

class OpenAIClient(BaseProviderClient):
    """OpenAI GPT API client."""

    @property
    def provider(self) -> ModelProvider:
        return ModelProvider.OPENAI

    async def complete(self, request: CompletionRequest) -> CompletionResponse:
        """Execute completion with the OpenAI API."""
        start_time = time.time()

        try:
            import openai

            client = openai.OpenAI(api_key=self.api_key)

            messages = []
            if request.system_prompt:
                messages.append({"role": "system", "content": request.system_prompt})
            messages.append({"role": "user", "content": request.prompt})

            response = client.chat.completions.create(
                model=request.model,
                messages=messages,
                max_tokens=request.max_tokens,
                temperature=request.temperature,
            )

            latency_ms = int((time.time() - start_time) * 1000)

            return CompletionResponse(
                content=response.choices[0].message.content,
                model_used=request.model,
                provider=self.provider.value,
                token_usage=response.usage.total_tokens,
                input_tokens=response.usage.prompt_tokens,
                output_tokens=response.usage.completion_tokens,
                latency_ms=latency_ms,
                timestamp=datetime.now(timezone.utc),
                raw_response={"id": response.id, "model": response.model},
            )

        except Exception as e:
            latency_ms = int((time.time() - start_time) * 1000)
            return CompletionResponse(
                content="",
                model_used=request.model,
                provider=self.provider.value,
                token_usage=0,
                input_tokens=0,
                output_tokens=0,
                latency_ms=latency_ms,
                timestamp=datetime.now(timezone.utc),
                success=False,
                error=str(e),
            )

class DeepSeekClient(BaseProviderClient):
    """DeepSeek API client (OpenAI-compatible)."""

    BASE_URL = "https://api.deepseek.com"

    @property
    def provider(self) -> ModelProvider:
        return ModelProvider.DEEPSEEK

    async def complete(self, request: CompletionRequest) -> CompletionResponse:
        """Execute completion with the DeepSeek API."""
        start_time = time.time()

        try:
            import openai

            client = openai.OpenAI(
                api_key=self.api_key,
                base_url=self.BASE_URL,
            )

            messages = []
            if request.system_prompt:
                messages.append({"role": "system", "content": request.system_prompt})
            messages.append({"role": "user", "content": request.prompt})

            response = client.chat.completions.create(
                model=request.model,
                messages=messages,
                max_tokens=request.max_tokens,
                temperature=request.temperature,
            )

            latency_ms = int((time.time() - start_time) * 1000)

            return CompletionResponse(
                content=response.choices[0].message.content,
                model_used=request.model,
                provider=self.provider.value,
                token_usage=response.usage.total_tokens if response.usage else 0,
                input_tokens=response.usage.prompt_tokens if response.usage else 0,
                output_tokens=response.usage.completion_tokens if response.usage else 0,
                latency_ms=latency_ms,
                timestamp=datetime.now(timezone.utc),
                raw_response={"id": response.id, "model": response.model},
            )

        except Exception as e:
            latency_ms = int((time.time() - start_time) * 1000)
            return CompletionResponse(
                content="",
                model_used=request.model,
                provider=self.provider.value,
                token_usage=0,
                input_tokens=0,
                output_tokens=0,
                latency_ms=latency_ms,
                timestamp=datetime.now(timezone.utc),
                success=False,
                error=str(e),
            )

class TogetherClient(BaseProviderClient):
    """Together AI client for Meta Llama models."""

    BASE_URL = "https://api.together.xyz/v1"

    @property
    def provider(self) -> ModelProvider:
        return ModelProvider.META

    async def complete(self, request: CompletionRequest) -> CompletionResponse:
        """Execute completion with the Together AI API."""
        start_time = time.time()

        try:
            import openai

            client = openai.OpenAI(
                api_key=self.api_key,
                base_url=self.BASE_URL,
            )

            # Map short model names to Together's fully qualified format
            model = request.model
            if model == "llama-3.3-70b":
                model = "meta-llama/Llama-3.3-70B-Instruct-Turbo"
            elif model == "llama-3.1-405b":
                model = "meta-llama/Meta-Llama-3.1-405B-Instruct-Turbo"

            messages = []
            if request.system_prompt:
                messages.append({"role": "system", "content": request.system_prompt})
            messages.append({"role": "user", "content": request.prompt})

            response = client.chat.completions.create(
                model=model,
                messages=messages,
                max_tokens=request.max_tokens,
                temperature=request.temperature,
            )

            latency_ms = int((time.time() - start_time) * 1000)

            return CompletionResponse(
                content=response.choices[0].message.content,
                model_used=request.model,
                provider=self.provider.value,
                token_usage=response.usage.total_tokens if response.usage else 0,
                input_tokens=response.usage.prompt_tokens if response.usage else 0,
                output_tokens=response.usage.completion_tokens if response.usage else 0,
                latency_ms=latency_ms,
                timestamp=datetime.now(timezone.utc),
                raw_response={"id": response.id, "model": model},
            )

        except Exception as e:
            latency_ms = int((time.time() - start_time) * 1000)
            return CompletionResponse(
                content="",
                model_used=request.model,
                provider=self.provider.value,
                token_usage=0,
                input_tokens=0,
                output_tokens=0,
                latency_ms=latency_ms,
                timestamp=datetime.now(timezone.utc),
                success=False,
                error=str(e),
            )

class GoogleClient(BaseProviderClient):
    """Google Gemini API client."""

    @property
    def provider(self) -> ModelProvider:
        return ModelProvider.GOOGLE

    async def complete(self, request: CompletionRequest) -> CompletionResponse:
        """Execute completion with the Google Gemini API."""
        start_time = time.time()

        try:
            import google.generativeai as genai

            genai.configure(api_key=self.api_key)

            model = genai.GenerativeModel(request.model)

            # Combine system prompt with user prompt
            full_prompt = request.prompt
            if request.system_prompt:
                full_prompt = f"{request.system_prompt}\n\n{request.prompt}"

            response = model.generate_content(
                full_prompt,
                generation_config=genai.types.GenerationConfig(
                    max_output_tokens=request.max_tokens,
                    temperature=request.temperature,
                ),
            )

            latency_ms = int((time.time() - start_time) * 1000)

            # Rough word-count estimate (Gemini doesn't always return usage)
            estimated_tokens = len(full_prompt.split()) + len(response.text.split())

            return CompletionResponse(
                content=response.text,
                model_used=request.model,
                provider=self.provider.value,
                token_usage=estimated_tokens,
                input_tokens=len(full_prompt.split()),
                output_tokens=len(response.text.split()),
                latency_ms=latency_ms,
                timestamp=datetime.now(timezone.utc),
                raw_response={"model": request.model},
            )

        except Exception as e:
            latency_ms = int((time.time() - start_time) * 1000)
            return CompletionResponse(
                content="",
                model_used=request.model,
                provider=self.provider.value,
                token_usage=0,
                input_tokens=0,
                output_tokens=0,
                latency_ms=latency_ms,
                timestamp=datetime.now(timezone.utc),
                success=False,
                error=str(e),
            )

class DashScopeClient(BaseProviderClient):
    """Alibaba DashScope client for Qwen models."""

    @property
    def provider(self) -> ModelProvider:
        return ModelProvider.ALIBABA

    async def complete(self, request: CompletionRequest) -> CompletionResponse:
        """Execute completion with the DashScope API."""
        start_time = time.time()

        try:
            # DashScope exposes an OpenAI-compatible API
            import openai

            client = openai.OpenAI(
                api_key=self.api_key,
                base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
            )

            messages = []
            if request.system_prompt:
                messages.append({"role": "system", "content": request.system_prompt})
            messages.append({"role": "user", "content": request.prompt})

            response = client.chat.completions.create(
                model=request.model,
                messages=messages,
                max_tokens=request.max_tokens,
                temperature=request.temperature,
            )

            latency_ms = int((time.time() - start_time) * 1000)

            return CompletionResponse(
                content=response.choices[0].message.content,
                model_used=request.model,
                provider=self.provider.value,
                token_usage=response.usage.total_tokens if response.usage else 0,
                input_tokens=response.usage.prompt_tokens if response.usage else 0,
                output_tokens=response.usage.completion_tokens if response.usage else 0,
                latency_ms=latency_ms,
                timestamp=datetime.now(timezone.utc),
                raw_response={"id": response.id, "model": response.model},
            )

        except Exception as e:
            latency_ms = int((time.time() - start_time) * 1000)
            return CompletionResponse(
                content="",
                model_used=request.model,
                provider=self.provider.value,
                token_usage=0,
                input_tokens=0,
                output_tokens=0,
                latency_ms=latency_ms,
                timestamp=datetime.now(timezone.utc),
                success=False,
                error=str(e),
            )

class MiniMaxClient(BaseProviderClient):
    """MiniMax API client (Anthropic Messages API-compatible, ADR-200)."""

    BASE_URL = "https://api.minimax.io/anthropic"

    @property
    def provider(self) -> ModelProvider:
        return ModelProvider.MINIMAX

    async def complete(self, request: CompletionRequest) -> CompletionResponse:
        """Execute completion with the MiniMax API (Anthropic-compatible)."""
        start_time = time.time()

        try:
            import anthropic

            client = anthropic.Anthropic(
                api_key=self.api_key,
                base_url=self.BASE_URL,
            )

            # MiniMax rejects temperature=0.0 (valid range is (0.0, 1.0]),
            # so clamp non-positive values up to 0.01
            temperature = request.temperature if request.temperature > 0.0 else 0.01

            messages = [{"role": "user", "content": request.prompt}]

            response = client.messages.create(
                model=request.model,
                max_tokens=request.max_tokens,
                temperature=temperature,
                system=request.system_prompt or "",
                messages=messages,
            )

            latency_ms = int((time.time() - start_time) * 1000)

            # MiniMax returns ThinkingBlock + TextBlock by default; extract the text block
            text = next((b.text for b in response.content if b.type == "text"), "")

            return CompletionResponse(
                content=text,
                model_used=request.model,
                provider=self.provider.value,
                token_usage=response.usage.input_tokens + response.usage.output_tokens,
                input_tokens=response.usage.input_tokens,
                output_tokens=response.usage.output_tokens,
                latency_ms=latency_ms,
                timestamp=datetime.now(timezone.utc),
                raw_response={"id": response.id, "model": response.model},
            )

        except Exception as e:
            latency_ms = int((time.time() - start_time) * 1000)
            return CompletionResponse(
                content="",
                model_used=request.model,
                provider=self.provider.value,
                token_usage=0,
                input_tokens=0,
                output_tokens=0,
                latency_ms=latency_ms,
                timestamp=datetime.now(timezone.utc),
                success=False,
                error=str(e),
            )

class MultiModelClient:
    """Unified client for multiple LLM providers with fallback support.

    Usage:
        client = MultiModelClient()
        response = await client.get_completion(
            model="claude-sonnet-4",
            prompt="Evaluate this code...",
            persona_id="technical_architect",
        )
    """

    # Model to provider mapping (updated January 2026 per ADR-073)
    MODEL_PROVIDERS: Dict[str, ModelProvider] = {
        # Anthropic (January 2026)
        "claude-opus-4-5": ModelProvider.ANTHROPIC,
        "claude-opus-4.5": ModelProvider.ANTHROPIC,
        "claude-sonnet-4-5": ModelProvider.ANTHROPIC,  # New flagship
        "claude-sonnet-4.5": ModelProvider.ANTHROPIC,
        "claude-sonnet-4": ModelProvider.ANTHROPIC,
        "claude-haiku-4-5": ModelProvider.ANTHROPIC,
        "claude-haiku-4.5": ModelProvider.ANTHROPIC,
        # OpenAI (January 2026)
        "o3": ModelProvider.OPENAI,  # New flagship reasoning model
        "o3-mini": ModelProvider.OPENAI,
        "gpt-4.1": ModelProvider.OPENAI,  # New GPT-4.1
        "gpt-4.1-mini": ModelProvider.OPENAI,
        "gpt-4o": ModelProvider.OPENAI,
        "gpt-4o-mini": ModelProvider.OPENAI,
        "gpt-4-turbo": ModelProvider.OPENAI,
        # DeepSeek (January 2026)
        "deepseek-v3.2": ModelProvider.DEEPSEEK,  # Latest
        "deepseek-v3": ModelProvider.DEEPSEEK,
        "deepseek-reasoner": ModelProvider.DEEPSEEK,  # R1 reasoning
        "deepseek-chat": ModelProvider.DEEPSEEK,
        "deepseek-coder": ModelProvider.DEEPSEEK,
        # Alibaba/Qwen (January 2026)
        "qwen3-72b": ModelProvider.ALIBABA,  # Qwen 3.0
        "qwen2.5-72b": ModelProvider.ALIBABA,
        "qwen2.5-32b": ModelProvider.ALIBABA,
        "qwen-max": ModelProvider.ALIBABA,
        # Meta (January 2026, via Together)
        "llama-4-maverick": ModelProvider.META,  # New Llama 4
        "llama-4-scout": ModelProvider.META,
        "llama-3.3-70b": ModelProvider.META,
        "llama-3.1-405b": ModelProvider.META,
        # Google (January 2026)
        "gemini-3-pro": ModelProvider.GOOGLE,  # New Gemini 3
        "gemini-2.5-flash": ModelProvider.GOOGLE,
        "gemini-2.0-flash": ModelProvider.GOOGLE,
        "gemini-1.5-pro": ModelProvider.GOOGLE,
        # MiniMax (February 2026, ADR-200)
        "MiniMax-M2.5": ModelProvider.MINIMAX,
        "MiniMax-M2.5-highspeed": ModelProvider.MINIMAX,
        "MiniMax-M2.1": ModelProvider.MINIMAX,
        "MiniMax-M2.1-highspeed": ModelProvider.MINIMAX,
        "MiniMax-M2": ModelProvider.MINIMAX,
    }

    # Provider to API key env var mapping
    PROVIDER_API_KEYS: Dict[ModelProvider, str] = {
        ModelProvider.ANTHROPIC: "ANTHROPIC_API_KEY",
        ModelProvider.OPENAI: "OPENAI_API_KEY",
        ModelProvider.DEEPSEEK: "DEEPSEEK_API_KEY",
        ModelProvider.ALIBABA: "DASHSCOPE_API_KEY",
        ModelProvider.META: "TOGETHER_API_KEY",
        ModelProvider.GOOGLE: "GOOGLE_API_KEY",
        ModelProvider.MINIMAX: "MINIMAX_API_KEY",
    }

    def __init__(
        self,
        config_path: Optional[Path] = None,
        fallback_config: Optional[FallbackConfig] = None,
        enable_provider_detection: bool = True,
        force_provider_mode: Optional[ProviderMode] = None,
    ):
        """
        Initialize the multi-model client.

        Args:
            config_path: Path to judge-model-routing.json
            fallback_config: Fallback strategy configuration
            enable_provider_detection: Auto-detect available providers (ADR-073)
            force_provider_mode: Override the detected mode (ADR-073)
        """
        self.config_path = config_path or self._default_config_path()
        self.fallback_config = fallback_config or self._load_fallback_config()
        self._clients: Dict[ModelProvider, BaseProviderClient] = {}
        self._routing_config: Optional[Dict] = None

        # Provider detection (ADR-073)
        self._enable_provider_detection = enable_provider_detection
        self._force_provider_mode = force_provider_mode
        self._provider_detection_result: Optional[ProviderDetectionResult] = None

        if enable_provider_detection:
            self._detect_providers()

    def _default_config_path(self) -> Path:
        """Get the default config path."""
        # Four parents up from this module is the repository root
        return Path(__file__).parent.parent.parent.parent / "config" / "judge-model-routing.json"

    def _detect_providers(self) -> None:
        """Detect available LLM providers (ADR-073)."""
        try:
            detector = get_default_detector()
            self._provider_detection_result = detector.detect_mode(
                force_mode=self._force_provider_mode
            )
            logger.info(
                f"Provider detection: mode={self._provider_detection_result.mode.value}, "
                f"providers={[p.value for p in self._provider_detection_result.available_providers]}"
            )
        except Exception as e:
            logger.warning(f"Provider detection failed: {e}")
            self._provider_detection_result = None

    @property
    def provider_mode(self) -> ProviderMode:
        """Get the detected provider mode (ADR-073)."""
        if self._provider_detection_result:
            return self._provider_detection_result.mode
        return ProviderMode.MULTI

    @property
    def provider_count(self) -> int:
        """Get the number of available providers."""
        if self._provider_detection_result:
            return self._provider_detection_result.provider_count
        return len(self.get_available_providers())

    @property
    def provider_info(self) -> Dict[str, Any]:
        """Get provider detection information."""
        if self._provider_detection_result:
            return self._provider_detection_result.to_dict()
        return {"mode": "unknown", "provider_count": 0}

    def get_model_for_persona(
        self,
        persona_id: str,
        override_model: Optional[str] = None,
    ) -> str:
        """
        Get the appropriate model for a persona based on provider mode (ADR-073).

        In single/dual provider mode, uses ProviderDetector for model selection.
        In multi-provider mode, uses the routing config.

        Args:
            persona_id: The judge persona ID
            override_model: Optional explicit model override

        Returns:
            Model identifier string
        """
        # Runtime override takes priority
        if override_model:
            return override_model

        # Environment variable override
        env_key = f"CODITECT_JUDGE_MODEL_{persona_id.upper()}"
        env_model = os.environ.get(env_key)
        if env_model:
            return env_model

        # Provider-aware selection (ADR-073)
        if self._enable_provider_detection and self._provider_detection_result:
            mode = self._provider_detection_result.mode
            if mode in (ProviderMode.SINGLE, ProviderMode.DUAL):
                try:
                    detector = get_default_detector()
                    model = detector.get_model_for_persona(persona_id)
                    if model:
                        return model
                except Exception:
                    pass  # Fall through to config-based selection

        # Config-based selection
        config = self._get_routing_config()
        routing = config.get("routing", {}).get(persona_id, {})
        if routing.get("primary_model"):
            return routing["primary_model"]

        # Default fallback
        return "claude-sonnet-4"

    def refresh_provider_detection(self) -> Optional[ProviderDetectionResult]:
        """Refresh provider detection (e.g., after environment changes)."""
        reset_default_detector()
        self._detect_providers()
        return self._provider_detection_result

    def _load_fallback_config(self) -> FallbackConfig:
        """Load fallback config from the routing file."""
        if self.config_path.exists():
            try:
                with open(self.config_path, "r", encoding="utf-8") as f:
                    config = json.load(f)
                fallback = config.get("fallback_strategy", {})
                return FallbackConfig(
                    max_retries=fallback.get("max_retries", 2),
                    retry_delay_seconds=fallback.get("retry_delay_seconds", 1.0),
                    use_backup_on_failure=fallback.get("use_backup_on_failure", True),
                    use_backup_on_timeout=fallback.get("use_backup_on_timeout", True),
                    timeout_seconds=fallback.get("timeout_seconds", 60),
                )
            except Exception:
                pass
        return FallbackConfig()

    def _get_routing_config(self) -> Dict:
        """Load and cache the routing configuration."""
        if self._routing_config is None:
            if self.config_path.exists():
                with open(self.config_path, "r", encoding="utf-8") as f:
                    self._routing_config = json.load(f)
            else:
                self._routing_config = {}
        return self._routing_config

    def _get_provider(self, model: str) -> ModelProvider:
        """Get the provider for a model."""
        # Normalize model name for pattern matching
        model_lower = model.lower().replace("_", "-")

        # Direct lookup
        if model in self.MODEL_PROVIDERS:
            return self.MODEL_PROVIDERS[model]

        # Pattern matching
        if "claude" in model_lower:
            return ModelProvider.ANTHROPIC
        elif "gpt" in model_lower:
            return ModelProvider.OPENAI
        elif "deepseek" in model_lower:
            return ModelProvider.DEEPSEEK
        elif "qwen" in model_lower:
            return ModelProvider.ALIBABA
        elif "llama" in model_lower:
            return ModelProvider.META
        elif "gemini" in model_lower:
            return ModelProvider.GOOGLE
        elif "minimax" in model_lower:
            return ModelProvider.MINIMAX

        raise ValueError(f"Unknown model provider for: {model}")

    def _get_client(self, provider: ModelProvider) -> BaseProviderClient:
        """Get or create a cached client for the provider."""
        if provider not in self._clients:
            api_key_env = self.PROVIDER_API_KEYS[provider]
            api_key = os.environ.get(api_key_env)

            if not api_key:
                raise ValueError(
                    f"Missing API key for {provider.value}. "
                    f"Set the {api_key_env} environment variable."
                )

            timeout = self.fallback_config.timeout_seconds

            if provider == ModelProvider.ANTHROPIC:
                self._clients[provider] = AnthropicClient(api_key, timeout)
            elif provider == ModelProvider.OPENAI:
                self._clients[provider] = OpenAIClient(api_key, timeout)
            elif provider == ModelProvider.DEEPSEEK:
                self._clients[provider] = DeepSeekClient(api_key, timeout)
            elif provider == ModelProvider.ALIBABA:
                self._clients[provider] = DashScopeClient(api_key, timeout)
            elif provider == ModelProvider.META:
                self._clients[provider] = TogetherClient(api_key, timeout)
            elif provider == ModelProvider.GOOGLE:
                self._clients[provider] = GoogleClient(api_key, timeout)
            elif provider == ModelProvider.MINIMAX:
                self._clients[provider] = MiniMaxClient(api_key, timeout)
            else:
                raise ValueError(f"Unsupported provider: {provider}")

        return self._clients[provider]

    def get_backup_model(self, persona_id: str) -> Optional[str]:
        """Get the backup model for a persona."""
        config = self._get_routing_config()
        routing = config.get("routing", {}).get(persona_id, {})
        return routing.get("backup_model")

    async def get_completion(
        self,
        model: str,
        prompt: str,
        persona_id: str,
        system_prompt: Optional[str] = None,
        max_tokens: int = 4096,
        temperature: float = 0.0,
        use_fallback: bool = True,
        use_provider_aware_selection: bool = False,
        metadata: Optional[Dict] = None,
    ) -> CompletionResponse:
        """
        Get a completion from the specified model, with automatic fallback.

        Args:
            model: Model identifier (e.g., "claude-sonnet-4")
            prompt: The prompt to complete
            persona_id: Judge persona making the request (for routing/logging)
            system_prompt: Optional system prompt
            max_tokens: Maximum tokens in the response
            temperature: Sampling temperature
            use_fallback: Whether to try the backup model on failure
            use_provider_aware_selection: Use provider-aware model selection (ADR-073)
            metadata: Additional metadata

        Returns:
            CompletionResponse with the result or error
        """
        # Use provider-aware model selection if requested (ADR-073)
        actual_model = model
        if use_provider_aware_selection:
            actual_model = self.get_model_for_persona(persona_id, override_model=model)
            if actual_model != model:
                logger.info(f"Provider-aware selection: {model} -> {actual_model} for {persona_id}")

        request = CompletionRequest(
            prompt=prompt,
            model=actual_model,
            persona_id=persona_id,
            max_tokens=max_tokens,
            temperature=temperature,
            system_prompt=system_prompt,
            metadata={
                **(metadata or {}),
                "original_model": model if actual_model != model else None,
                "provider_mode": self.provider_mode.value if self._provider_detection_result else None,
            },
        )

        # Try the primary model with retries
        response = await self._try_with_retries(request)

        # If it failed and fallback is enabled, try the backup model
        if not response.success and use_fallback and self.fallback_config.use_backup_on_failure:
            backup_model = self.get_backup_model(persona_id)
            if backup_model and backup_model != model:
                backup_request = CompletionRequest(
                    prompt=prompt,
                    model=backup_model,
                    persona_id=persona_id,
                    max_tokens=max_tokens,
                    temperature=temperature,
                    system_prompt=system_prompt,
                    metadata={**(metadata or {}), "fallback_from": model},
                )
                response = await self._try_with_retries(backup_request)

        return response

    async def _try_with_retries(self, request: CompletionRequest) -> CompletionResponse:
        """Try a request with retry logic."""
        last_response: Optional[CompletionResponse] = None

        for attempt in range(self.fallback_config.max_retries + 1):
            try:
                provider = self._get_provider(request.model)
                client = self._get_client(provider)
                response = await client.complete(request)

                if response.success:
                    # Track usage (ADR-075)
                    self._record_usage(response)
                    return response

                last_response = response

            except Exception as e:
                last_response = CompletionResponse(
                    content="",
                    model_used=request.model,
                    provider="unknown",
                    token_usage=0,
                    input_tokens=0,
                    output_tokens=0,
                    latency_ms=0,
                    timestamp=datetime.now(timezone.utc),
                    success=False,
                    error=str(e),
                )

            # Wait before retrying (except on the last attempt); delay grows linearly
            if attempt < self.fallback_config.max_retries:
                await asyncio.sleep(self.fallback_config.retry_delay_seconds * (attempt + 1))

        return last_response or CompletionResponse(
            content="",
            model_used=request.model,
            provider="unknown",
            token_usage=0,
            input_tokens=0,
            output_tokens=0,
            latency_ms=0,
            timestamp=datetime.now(timezone.utc),
            success=False,
            error="All retries exhausted",
        )
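
    # Worked example of the retry schedule above: with the defaults
    # (max_retries=2, retry_delay_seconds=1.0) a request gets up to three
    # attempts, sleeping 1.0s after the first failure and 2.0s after the
    # second (delay * (attempt + 1)), with no sleep after the final attempt.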

    def _record_usage(self, response: CompletionResponse) -> None:
        """Record token usage from a successful response (ADR-075).

        Args:
            response: CompletionResponse with token usage info
        """
        if not _USAGE_TRACKING_AVAILABLE:
            return

        try:
            tracker = get_tracker()
            if tracker.get_summary() is not None:  # Only if a session is active
                tracker.record_usage(
                    model=response.model_used,
                    provider=response.provider,
                    input_tokens=response.input_tokens,
                    output_tokens=response.output_tokens,
                    cached_tokens=0,  # TODO: Extract from raw_response if available
                )
        except Exception as e:
            logger.debug(f"Usage tracking failed: {e}")

    def get_available_providers(self) -> List[str]:
        """Get the list of providers with configured API keys."""
        available = []
        for provider, env_var in self.PROVIDER_API_KEYS.items():
            if os.environ.get(env_var):
                available.append(provider.value)
        return available

    def get_model_info(self, model: str) -> Dict[str, Any]:
        """Get information about a model."""
        try:
            provider = self._get_provider(model)
            api_key_env = self.PROVIDER_API_KEYS[provider]
            return {
                "model": model,
                "provider": provider.value,
                "api_key_env": api_key_env,
                "api_key_configured": bool(os.environ.get(api_key_env)),
            }
        except ValueError as e:
            return {
                "model": model,
                "error": str(e),
            }

# Convenience functions

def create_default_client(enable_provider_detection: bool = True) -> MultiModelClient:
    """Create a MultiModelClient with default configuration."""
    return MultiModelClient(enable_provider_detection=enable_provider_detection)

def get_provider_for_model(model: str) -> str:
    """Get the provider name for a model."""
    client = MultiModelClient(enable_provider_detection=False)
    return client._get_provider(model).value

def check_api_keys() -> Dict[str, bool]:
    """Check which API keys are configured."""
    return {
        provider.value: bool(os.environ.get(env_var))
        for provider, env_var in MultiModelClient.PROVIDER_API_KEYS.items()
    }

def get_client_provider_mode() -> ProviderMode:
    """Get the detected provider mode for the default client."""
    client = MultiModelClient()
    return client.provider_mode

def get_model_for_persona_from_client(persona_id: str) -> str:
    """Get the provider-aware model for a persona."""
    client = MultiModelClient()
    return client.get_model_for_persona(persona_id)
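
# Minimal smoke-test sketch using only the helpers above; assumes any API keys
# are supplied via environment variables (no keys are required just to inspect
# configuration state).
if __name__ == "__main__":
    print("Configured API keys:", check_api_keys())
    _client = create_default_client()
    print("Provider mode:", _client.provider_mode.value)
    print("Available providers:", _client.get_available_providers())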