LLM Provider Integration Guide
Multi-Provider Architecture for Agentic Systems
Document ID: C1-LLM-INTEGRATION | Version: 1.0 | Category: P3 - Technical Deep Dives
Executive Summary
Production agentic systems require robust LLM integration with failover, load balancing, and cost optimization. This guide provides patterns for multi-provider architectures supporting Claude, GPT-4, Llama, and Mistral.
Provider Landscape
Provider Comparison Matrix
| Provider | Best For | Latency | Cost Tier | Context Window |
|---|---|---|---|---|
| Claude 3.5 Sonnet | Complex reasoning, coding | Medium | Medium | 200K |
| Claude 3 Opus | Nuanced analysis | High | High | 200K |
| GPT-4 Turbo | General purpose | Medium | Medium | 128K |
| GPT-4o | Multimodal | Low | Medium | 128K |
| Llama 3.1 405B | Self-hosted, privacy | Varies | Low | 128K |
| Mistral Large | European compliance | Medium | Medium | 32K |
Paradigm-Provider Mapping
| Paradigm | Primary Provider | Fallback | Rationale |
|---|---|---|---|
| LSR | Claude Sonnet | GPT-4 | Creative reasoning |
| GS | Claude Sonnet | GPT-4 | Citation accuracy |
| EP | Claude Sonnet | GPT-4 | Tool use capability |
| VE | GPT-4 | Claude | Function calling |
Architecture Patterns
Pattern 1: Router Architecture
┌─────────────────┐
│ LLM Router │
│ - Provider selection
│ - Load balancing
│ - Failover
└────────┬────────┘
│
┌────────────────────┼────────────────────┐
│ │ │
▼ ▼ ▼
┌───────────────┐ ┌───────────────┐ ┌───────────────┐
│ Anthropic │ │ OpenAI │ │ Self-Hosted │
│ Adapter │ │ Adapter │ │ Adapter │
└───────────────┘ └───────────────┘ └───────────────┘
Implementation
import asyncio
import random
import time

from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Any, Dict, List, Optional
@dataclass
class LLMResponse:
    """Normalized result of a single LLM completion call.

    Every provider adapter returns this shape so downstream code
    (routing, metrics, cost tracking) stays provider-agnostic.
    """

    content: str          # Generated text.
    model: str            # Concrete model identifier that served the call.
    provider: str         # Provider name, e.g. "anthropic".
    tokens_used: int      # Input + output tokens consumed.
    latency_ms: float     # Wall-clock latency of the call in milliseconds.
    cost_estimate: float  # Estimated cost in USD for this call.
@dataclass
class ProviderConfig:
    """Routing configuration for one registered provider."""

    name: str                      # Unique provider key used by the router.
    models: List[str]              # Model aliases this provider serves.
    priority: int                  # Lower value = preferred provider.
    weight: float                  # Relative share for load balancing among same-priority providers.
    max_retries: int = 3           # Per-call retry budget.
    timeout_seconds: float = 60.0  # Hard timeout per request.
    rate_limit_rpm: int = 100      # Requests-per-minute cap.
class LLMProvider(ABC):
    """Abstract LLM provider interface.

    Concrete adapters (Anthropic, OpenAI, self-hosted, ...) implement this
    contract so the router can treat every backend uniformly.
    """

    @abstractmethod
    async def generate(
        self,
        messages: List[Dict],
        model: str,
        **kwargs
    ) -> LLMResponse:
        """Run one completion call and return a normalized LLMResponse."""

    @abstractmethod
    async def health_check(self) -> bool:
        """Return True when the backend is reachable and serving."""
class AnthropicProvider(LLMProvider):
    """Anthropic Claude provider adapter.

    Maps short model aliases to dated API model IDs and normalizes
    responses into LLMResponse.
    """

    # Short alias -> dated API model identifier.
    MODEL_MAPPING = {
        "claude-sonnet": "claude-sonnet-4-20250514",
        "claude-opus": "claude-opus-4-20250514",
        "claude-haiku": "claude-haiku-4-20250514"
    }

    # USD per 1M tokens, keyed by model family (date suffix stripped).
    # Example rates — keep in sync with published Anthropic pricing.
    PRICING = {
        "claude-sonnet-4": {"input": 3.0, "output": 15.0},
        "claude-opus-4": {"input": 15.0, "output": 75.0},
        "claude-haiku-4": {"input": 0.25, "output": 1.25}
    }

    def __init__(self, api_key: str):
        # Lazy import keeps the anthropic SDK an optional dependency.
        from anthropic import AsyncAnthropic
        self.client = AsyncAnthropic(api_key=api_key)

    async def generate(
        self,
        messages: List[Dict],
        model: str,
        **kwargs
    ) -> LLMResponse:
        """Run one completion call against the Anthropic Messages API.

        Args:
            messages: Chat messages in Anthropic message format.
            model: Short alias (see MODEL_MAPPING) or a concrete model ID.
            **kwargs: max_tokens (default 4096), temperature (default 0.7).

        Returns:
            Normalized LLMResponse including token usage and cost estimate.
        """
        # perf_counter is monotonic, so the latency measurement is immune
        # to wall-clock adjustments (time.time is not).
        start = time.perf_counter()
        model_id = self.MODEL_MAPPING.get(model, model)
        response = await self.client.messages.create(
            model=model_id,
            messages=messages,
            max_tokens=kwargs.get("max_tokens", 4096),
            temperature=kwargs.get("temperature", 0.7)
        )
        latency = (time.perf_counter() - start) * 1000
        return LLMResponse(
            # Assumes the first content block is text — TODO(review): confirm
            # for tool-use responses, which may lead with non-text blocks.
            content=response.content[0].text,
            model=model_id,
            provider="anthropic",
            tokens_used=response.usage.input_tokens + response.usage.output_tokens,
            latency_ms=latency,
            cost_estimate=self._calculate_cost(response.usage, model_id)
        )

    async def health_check(self) -> bool:
        """Cheap liveness probe: one-token call to the fastest model.

        Required by the LLMProvider ABC; without it this class is abstract
        and AnthropicProvider(api_key) raises TypeError.
        """
        try:
            await self.client.messages.create(
                model=self.MODEL_MAPPING["claude-haiku"],
                messages=[{"role": "user", "content": "ping"}],
                max_tokens=1
            )
            return True
        except Exception:
            return False

    def _calculate_cost(self, usage, model: str) -> float:
        """Estimate the USD cost of a call from its token usage.

        Unknown models silently fall back to sonnet rates — NOTE(review):
        consider logging when that fallback triggers.
        """
        # Strip the trailing date suffix:
        # "claude-sonnet-4-20250514" -> "claude-sonnet-4".
        model_key = model.rsplit("-", 1)[0]
        rate = self.PRICING.get(model_key, self.PRICING["claude-sonnet-4"])
        return (usage.input_tokens * rate["input"] +
                usage.output_tokens * rate["output"]) / 1_000_000
class LLMRouter:
    """Route LLM requests across registered providers.

    Selects a provider by health, priority, and weighted load balancing,
    and fails over to the next candidate when a call errors or times out.
    """

    # Maximum number of distinct provider attempts per generate() call.
    MAX_FAILOVER_ATTEMPTS = 3

    def __init__(self):
        # Registered provider adapters keyed by config name.
        self.providers: Dict[str, "LLMProvider"] = {}
        # Per-provider routing configuration.
        self.configs: Dict[str, "ProviderConfig"] = {}
        # Last-known health flag; flipped to False on any call failure.
        self.health_status: Dict[str, bool] = {}
        # Observed latencies (ms) per provider. NOTE(review): grows without
        # bound — consider capping for long-lived processes.
        self.metrics: Dict[str, List[float]] = {}

    def register_provider(self, config: "ProviderConfig", provider: "LLMProvider"):
        """Register a provider adapter; newly registered providers start healthy."""
        self.providers[config.name] = provider
        self.configs[config.name] = config
        self.health_status[config.name] = True
        self.metrics[config.name] = []

    async def generate(
        self,
        messages: List[Dict],
        model_preference: Optional[str] = None,
        **kwargs
    ) -> "LLMResponse":
        """Generate with automatic provider selection and failover.

        Args:
            messages: Chat messages forwarded to the selected provider.
            model_preference: Preferred model alias; when None, the selected
                provider's first configured model is used.
            **kwargs: Extra generation options forwarded to the provider.

        Returns:
            The first successful LLMResponse.

        Raises:
            RuntimeError: When no eligible provider remains or the failover
                attempt budget is exhausted.
        """
        provider_name = await self._select_provider(model_preference)
        for _attempt in range(self.MAX_FAILOVER_ATTEMPTS):
            try:
                provider = self.providers[provider_name]
                config = self.configs[provider_name]
                response = await asyncio.wait_for(
                    provider.generate(
                        messages, model_preference or config.models[0], **kwargs
                    ),
                    timeout=config.timeout_seconds
                )
                # Record success metrics for latency monitoring.
                self.metrics[provider_name].append(response.latency_ms)
                return response
            except Exception:
                # Mark the failed provider unhealthy and pick the next one.
                self.health_status[provider_name] = False
                provider_name = await self._select_provider(
                    model_preference, exclude=[provider_name]
                )
                if not provider_name:
                    raise RuntimeError("All providers failed")
        raise RuntimeError("Max failover attempts exceeded")

    async def _select_provider(
        self,
        model_preference: Optional[str] = None,
        exclude: Optional[List[str]] = None
    ) -> Optional[str]:
        """Select a provider name based on health, priority, and weight.

        Providers in *exclude* are never returned, even after health
        recovery. Returns None when no eligible provider is available.
        NOTE(review): model_preference is currently unused in selection.
        """
        exclude = exclude or []
        # Filter healthy, non-excluded providers.
        available = [
            name for name, healthy in self.health_status.items()
            if healthy and name not in exclude
        ]
        if not available:
            # Try to recover unhealthy providers — but still honour *exclude*,
            # otherwise a just-failed provider could be retried immediately
            # (bug fix: the recovery path previously ignored the exclusions).
            for name in self.providers:
                if name in exclude:
                    continue
                if await self.providers[name].health_check():
                    self.health_status[name] = True
                    available.append(name)
        if not available:
            return None
        # Lowest priority value wins; ties broken by weighted random pick.
        configs = [self.configs[name] for name in available]
        configs.sort(key=lambda c: c.priority)
        top_priority = configs[0].priority
        same_priority = [c for c in configs if c.priority == top_priority]
        total_weight = sum(c.weight for c in same_priority)
        r = random.uniform(0, total_weight)
        cumulative = 0.0
        for config in same_priority:
            cumulative += config.weight
            if r <= cumulative:
                return config.name
        # Floating-point slack: fall back to the last candidate.
        return same_priority[-1].name
Pattern 2: Cost Optimization
class CostOptimizer:
    """Optimize LLM costs through smart routing.

    Tracks spend against a daily budget and picks the cheapest model that
    satisfies the requested quality level.
    """

    def __init__(self, budget_daily: float):
        self.budget_daily = budget_daily  # Total allowed spend per day (USD).
        self.spent_today = 0.0            # Running spend since last reset.
        self.request_count = 0            # Number of recorded requests.

    def select_model(
        self,
        task_complexity: str,
        quality_requirement: str
    ) -> tuple[str, str]:
        """Return a (provider, model) pair for the given task and budget.

        Quality "high" ignores budget; "standard" downgrades to haiku when
        less than 20% of the daily budget remains; any other quality level
        always uses the cheapest model.
        """
        if quality_requirement == "high":
            chosen = "claude-opus" if task_complexity == "complex" else "claude-sonnet"
            return ("anthropic", chosen)

        if quality_requirement == "standard":
            budget_left = self.budget_daily - self.spent_today
            under_pressure = budget_left < self.budget_daily * 0.2
            return ("anthropic", "claude-haiku" if under_pressure else "claude-sonnet")

        # "low" quality is acceptable: always pick the cheapest model.
        return ("anthropic", "claude-haiku")

    def record_usage(self, cost: float):
        """Accumulate *cost* into today's spend and bump the request count."""
        self.request_count += 1
        self.spent_today += cost
Pattern 3: Caching Layer
import hashlib
import json
class ResponseCache:
    """Cache LLM responses in Redis for repeated queries.

    Only low-temperature (near-deterministic) generations are cached, since
    high-temperature sampling is intentionally non-repeatable.
    """

    # Temperature assumed when the caller does not pass one; must match the
    # default used by the providers so cache keys line up.
    # (Bug-risk fix: 0.7 and 0.3 were duplicated magic numbers across three
    # methods — now defined once.)
    DEFAULT_TEMPERATURE = 0.7
    # Responses generated above this temperature are never cached.
    MAX_CACHEABLE_TEMPERATURE = 0.3

    def __init__(self, redis_client, ttl_seconds: int = 3600):
        self.redis = redis_client  # Async Redis client (needs get/setex).
        self.ttl = ttl_seconds     # Cache entry lifetime in seconds.

    def _cache_key(self, messages: List[Dict], model: str, **kwargs) -> str:
        """Build a deterministic key from messages, model, and temperature."""
        # sort_keys canonicalizes the JSON so identical requests hash equally.
        content = json.dumps({
            "messages": messages,
            "model": model,
            "temperature": kwargs.get("temperature", self.DEFAULT_TEMPERATURE)
        }, sort_keys=True)
        return f"llm_cache:{hashlib.sha256(content.encode()).hexdigest()}"

    async def get(self, messages: List[Dict], model: str, **kwargs) -> Optional["LLMResponse"]:
        """Return the cached LLMResponse for this request, or None on a miss."""
        key = self._cache_key(messages, model, **kwargs)
        cached = await self.redis.get(key)
        if cached:
            return LLMResponse(**json.loads(cached))
        return None

    async def set(self, messages: List[Dict], model: str, response: "LLMResponse", **kwargs):
        """Cache *response* unless the generation temperature is too high."""
        # Only cache deterministic responses (low temperature).
        if kwargs.get("temperature", self.DEFAULT_TEMPERATURE) > self.MAX_CACHEABLE_TEMPERATURE:
            return
        key = self._cache_key(messages, model, **kwargs)
        await self.redis.setex(key, self.ttl, json.dumps({
            "content": response.content,
            "model": response.model,
            "provider": response.provider,
            "tokens_used": response.tokens_used,
            "latency_ms": 0,    # Cached responses report zero latency.
            "cost_estimate": 0  # No marginal cost for a cache hit.
        }))
Configuration Reference
Environment Variables
# Provider API Keys
ANTHROPIC_API_KEY=sk-ant-...
OPENAI_API_KEY=sk-...
AZURE_OPENAI_ENDPOINT=https://...
AZURE_OPENAI_KEY=...
# Router Configuration
LLM_PRIMARY_PROVIDER=anthropic
LLM_FALLBACK_PROVIDER=openai
LLM_TIMEOUT_SECONDS=60
LLM_MAX_RETRIES=3
# Cost Controls
LLM_DAILY_BUDGET=100.0
LLM_ALERT_THRESHOLD=0.8
# Caching
LLM_CACHE_ENABLED=true
LLM_CACHE_TTL=3600
Quick Reference
| Scenario | Provider | Model | Rationale |
|---|---|---|---|
| Complex reasoning | Anthropic | Claude Opus | Best performance |
| General tasks | Anthropic | Claude Sonnet | Cost-effective |
| High volume | Anthropic | Claude Haiku | Lowest cost |
| Function calling | OpenAI | GPT-4 | Mature tooling |
| Privacy-critical | Self-hosted | Llama 3.1 | Data control |
| EU compliance | Mistral | Large | European provider |
Document maintained by CODITECT Platform Team