Multi-Model Routing and Provider Management
The Problem: No Single Best Model
No single model is best for every task. GPT-4 is capable but slow and expensive. GPT-3.5-turbo is fast and cheap but weaker. Open-source models have no per-token fees but require you to run the infrastructure. The solution is routing: send each request to the best model for that specific task.
Why Multi-Model Systems Matter
Quality vs. Cost Tradeoff
- Simple tasks (spell check, classification): use GPT-3.5-turbo or a local model
- Complex tasks (reasoning, code generation): use GPT-4 or Claude
- Cost savings: often roughly 10x cheaper by sending each request to the right model for the job (see the rough estimate below)
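To put a number on that, here is a back-of-the-envelope sketch using the per-1k-token prices from the MODELS table in Pattern 2 below (illustrative figures; real prices vary by provider and change over time):

```python
# Back-of-the-envelope cost comparison: send everything to GPT-4
# vs. route 90% of (simple) traffic to GPT-3.5-turbo.
GPT4_COST_PER_1K = 0.015      # $ per 1k input tokens (illustrative)
GPT35_COST_PER_1K = 0.0005    # $ per 1k input tokens (illustrative)

requests_per_day = 10_000
tokens_per_request = 1_000
simple_share = 0.9            # assumed fraction of simple requests

all_gpt4 = requests_per_day * (tokens_per_request / 1000) * GPT4_COST_PER_1K
routed = requests_per_day * (tokens_per_request / 1000) * (
    simple_share * GPT35_COST_PER_1K + (1 - simple_share) * GPT4_COST_PER_1K
)

print(f"All GPT-4: ${all_gpt4:,.2f}/day")   # $150.00/day
print(f"Routed:    ${routed:,.2f}/day")     # $19.50/day, ~7.7x cheaper
```

The savings scale with how much of your traffic is genuinely simple; the closer that share gets to 100%, the closer you get to the raw 30x price gap between the two models.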
Resilience
If OpenAI is down, fall back to Anthropic. If Anthropic is slow, use local models. Multi-provider systems are more reliable.
Performance
Different models have different strengths:
- GPT-4: General intelligence, reasoning
- Claude 3: Long context, nuance
- Llama: Fast inference, cost-effective
- Specialized models: Domain-specific tasks
Pattern 1: Task-Based Routing
Classify the task, then route to the appropriate model.
Implementation
```python
from enum import Enum

from openai import OpenAI
from anthropic import Anthropic

# Clients are assumed to read their API keys from environment variables
# (OPENAI_API_KEY, ANTHROPIC_API_KEY).
openai_client = OpenAI()
anthropic_client = Anthropic()


class TaskComplexity(Enum):
    SIMPLE = "simple"
    MEDIUM = "medium"
    COMPLEX = "complex"


def classify_task(prompt: str) -> TaskComplexity:
    """Classify task complexity based on the prompt."""
    # Rule-based classification (in production, use an ML classifier)
    keywords_simple = ["spell", "grammar", "format", "classify", "extract"]
    keywords_complex = ["design", "architecture", "reason", "analyze", "plan"]

    prompt_lower = prompt.lower()

    # Count keyword matches
    simple_score = sum(1 for kw in keywords_simple if kw in prompt_lower)
    complex_score = sum(1 for kw in keywords_complex if kw in prompt_lower)

    # Length heuristic: longer prompts are often more complex
    if len(prompt) > 2000:
        complex_score += 1

    if complex_score > simple_score:
        return TaskComplexity.COMPLEX
    elif complex_score == 0 and simple_score > 0:
        return TaskComplexity.SIMPLE
    else:
        return TaskComplexity.MEDIUM


def select_model(task_complexity: TaskComplexity) -> tuple[str, str, int]:
    """
    Select model, provider, and token budget based on task complexity.
    Returns (model, provider, max_tokens).
    """
    if task_complexity == TaskComplexity.SIMPLE:
        return "gpt-3.5-turbo", "openai", 500
    elif task_complexity == TaskComplexity.MEDIUM:
        return "gpt-4-turbo", "openai", 2000
    else:
        return "gpt-4", "openai", 4000


def route_and_call(prompt: str):
    """End-to-end task routing and LLM call."""
    # Step 1: Classify the task
    complexity = classify_task(prompt)
    print(f"Task complexity: {complexity.value}")

    # Step 2: Select a model
    model, provider, max_tokens = select_model(complexity)
    print(f"Selected: {model} from {provider}")

    # Step 3: Call the appropriate client
    if provider == "openai":
        response = openai_client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
        )
    elif provider == "anthropic":
        response = anthropic_client.messages.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens,
        )
    # ... more providers

    return response


# Usage
prompt = "Check spelling in this sentence: The qwick brown fox"
response = route_and_call(prompt)
```
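The keyword rules in classify_task are only a stand-in. As the comment notes, production systems usually use a learned classifier; one low-effort option is to let a cheap model do the labeling itself. The sketch below is illustrative (the prompt wording and fallback behavior are assumptions, not part of the pattern above):

```python
def classify_task_llm(prompt: str) -> TaskComplexity:
    """Ask a cheap model to label the task; fall back to rules on bad output."""
    result = openai_client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{
            "role": "user",
            "content": (
                "Label the following request as exactly one word - "
                "simple, medium, or complex:\n\n" + prompt
            ),
        }],
        max_tokens=5,
        temperature=0,
    )
    label = result.choices[0].message.content.strip().lower()
    try:
        return TaskComplexity(label)
    except ValueError:
        return classify_task(prompt)  # Fall back to the keyword heuristic
```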
Pattern 2: Cost-Based Routing
Choose the cheapest model that meets accuracy requirements.
Implementation
```python
from dataclasses import dataclass


@dataclass
class ModelInfo:
    name: str
    provider: str
    cost_per_1k_tokens: float   # $ per 1k input tokens
    speed_tokens_per_sec: float
    accuracy_score: float       # 0-1, estimated quality


MODELS = [
    ModelInfo("gpt-3.5-turbo", "openai", 0.0005, 100, 0.85),
    ModelInfo("gpt-4-turbo", "openai", 0.003, 50, 0.95),
    ModelInfo("gpt-4", "openai", 0.015, 30, 0.99),
    ModelInfo("claude-3-opus", "anthropic", 0.015, 30, 0.98),
    ModelInfo("mistral-7b", "local", 0.0, 20, 0.75),
]


def select_cheapest_model(min_accuracy: float) -> ModelInfo:
    """Select the cheapest model that meets the accuracy requirement."""
    qualified_models = [m for m in MODELS if m.accuracy_score >= min_accuracy]
    if not qualified_models:
        # None meet the requirement: fall back to the most accurate model available
        return max(MODELS, key=lambda m: m.accuracy_score)
    return min(qualified_models, key=lambda m: m.cost_per_1k_tokens)


def route_by_accuracy(prompt: str, min_accuracy: float = 0.80):
    """Route to the cheapest model that meets the accuracy requirement."""
    model = select_cheapest_model(min_accuracy)
    print(f"Selected {model.name} (cost: ${model.cost_per_1k_tokens}/1k, accuracy: {model.accuracy_score})")
    # Call the selected model
    # ...


# Usage
# For simple classification: accuracy can be lower, so use a cheap model
route_by_accuracy("Classify: is this positive or negative?", min_accuracy=0.75)

# For legal documents: high accuracy is required, so use the best model
route_by_accuracy("Review contract for risks", min_accuracy=0.98)
```
Pattern 3: Provider Fallback Chains
If one provider fails, fall back to another.
Implementation
```python
class ProviderChain:
    def __init__(self, providers: list[tuple[str, str]]):
        """
        providers: list of (model, provider) tuples in fallback order
        """
        self.providers = providers

    def call(self, prompt: str) -> tuple[str, str, str]:
        """
        Try each provider in order until one succeeds.
        Returns (response, model_used, provider_used).
        """
        last_error = None
        for model, provider in self.providers:
            try:
                print(f"Trying {model} from {provider}...")
                if provider == "openai":
                    response = openai_client.chat.completions.create(
                        model=model,
                        messages=[{"role": "user", "content": prompt}],
                        timeout=30,
                    )
                    return response.choices[0].message.content, model, provider
                elif provider == "anthropic":
                    response = anthropic_client.messages.create(
                        model=model,
                        messages=[{"role": "user", "content": prompt}],
                        max_tokens=4000,
                    )
                    return response.content[0].text, model, provider
                elif provider == "local":
                    # local_client stands in for however you serve open-source
                    # models (e.g. an Ollama or vLLM wrapper)
                    response = local_client.generate(prompt)
                    return response, model, provider
            except Exception as e:
                print(f"{model} from {provider} failed: {e}")
                last_error = e
                continue  # Try the next provider
        # Every provider failed: surface the last error
        raise RuntimeError("All providers in the chain failed") from last_error


# Usage
chain = ProviderChain([
    ("gpt-4", "openai"),
    ("claude-3-opus", "anthropic"),
    ("mistral-7b", "local"),
])
result, model, provider = chain.call("Write a poem")
print(f"Response from {model} ({provider}): {result}")
```
Pattern 4: A/B Testing Models
Test two models to see which performs better.
Implementation
```python
import random
from collections import defaultdict


class ModelABTest:
    def __init__(self, model_a: str, model_b: str, metric_func):
        # model_a is assumed to be an OpenAI model, model_b an Anthropic model
        self.model_a = model_a
        self.model_b = model_b
        self.metric_func = metric_func  # Function that scores a response (0-1)
        self.results = defaultdict(list)

    def call(self, prompt: str) -> tuple[str, str]:
        """Randomly route to model A or B and record the metric."""
        # Randomly choose a model (50/50 split)
        chosen_model = random.choice([self.model_a, self.model_b])

        if chosen_model == self.model_a:
            response = openai_client.chat.completions.create(
                model=self.model_a,
                messages=[{"role": "user", "content": prompt}],
            )
            response_text = response.choices[0].message.content
        else:
            response = anthropic_client.messages.create(
                model=self.model_b,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=4000,
            )
            response_text = response.content[0].text

        # Score the response
        score = self.metric_func(response_text)
        self.results[chosen_model].append(score)

        return response_text, chosen_model

    def compare(self) -> dict:
        """Compare the performance of both models."""
        avg_a = sum(self.results[self.model_a]) / len(self.results[self.model_a]) if self.results[self.model_a] else 0
        avg_b = sum(self.results[self.model_b]) / len(self.results[self.model_b]) if self.results[self.model_b] else 0
        return {
            self.model_a: {"avg_score": avg_a, "samples": len(self.results[self.model_a])},
            self.model_b: {"avg_score": avg_b, "samples": len(self.results[self.model_b])},
            "winner": self.model_a if avg_a > avg_b else self.model_b,
        }


# Usage
def score_response(response: str) -> float:
    # Simple proxy metric: reward longer responses, capped at 1.0
    # (in practice, use human ratings or an evaluation model)
    return min(1.0, len(response) / 500)


test = ModelABTest("gpt-4-turbo", "claude-3-opus", score_response)

# Run the test
for _ in range(100):
    prompt = "Explain quantum computing in one paragraph"
    response, model = test.call(prompt)

# Compare results
results = test.compare()
print(f"Winner: {results['winner']} with score {results[results['winner']]['avg_score']:.2f}")
```
Pattern 5: Abstraction Layer with LiteLLM
Use an abstraction layer to simplify multi-model calls.
Implementation
```python
from litellm import completion


def call_llm_abstracted(prompt: str, model_preference: str = "gpt-4"):
    """
    Use LiteLLM to abstract away provider differences.
    The same function works with any supported model.
    """
    try:
        response = completion(
            model=model_preference,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7,
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Primary model failed: {e}. Falling back...")
        # Manual fallback to a cheaper model
        response = completion(
            model="gpt-3.5-turbo",  # Fallback
            messages=[{"role": "user", "content": prompt}],
        )
        return response.choices[0].message.content


# Works with any supported provider; LiteLLM handles API key lookup
# and normalizes every response to an OpenAI-style format.
result = call_llm_abstracted("Write code", model_preference="claude-3-opus")
```
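Because every LiteLLM call goes through the same completion() function and returns an OpenAI-style response object, a fallback chain reduces to a loop over model names. The sketch below assumes the listed identifiers are valid for your LiteLLM configuration:

```python
from litellm import completion

FALLBACK_ORDER = ["gpt-4", "claude-3-opus", "gpt-3.5-turbo"]  # example identifiers

def call_with_fallbacks(prompt: str) -> tuple[str, str]:
    """Try each model in order; return (response_text, model_used)."""
    last_error = None
    for model in FALLBACK_ORDER:
        try:
            response = completion(
                model=model,
                messages=[{"role": "user", "content": prompt}],
            )
            return response.choices[0].message.content, model
        except Exception as e:
            last_error = e
    raise RuntimeError("All models failed") from last_error

text, used = call_with_fallbacks("Summarize this ticket in one sentence.")
print(f"Answered by {used}")
```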
Pattern 6: Model Versioning
Track which model version produced which results.
Implementation
```python
from datetime import datetime
from typing import Optional


class ModelVersion:
    def __init__(self, name: str, version: str, provider: str, config: dict):
        self.name = name
        self.version = version
        self.provider = provider
        self.config = config
        self.created_at = datetime.now()
        self.deprecated = False

    def __str__(self):
        return f"{self.name}:{self.version}"


class VersionedLLM:
    def __init__(self):
        self.models = {}
        self.current_version = None

    def register_model(
        self,
        name: str,
        version: str,
        provider: str,
        config: dict,
    ):
        key = f"{name}:{version}"
        model = ModelVersion(name, version, provider, config)
        self.models[key] = model
        self.current_version = key
        print(f"Registered {key}")

    def call(self, prompt: str, version: Optional[str] = None) -> tuple[str, str]:
        """Call the LLM pinned to a specific registered version."""
        version = version or self.current_version
        model = self.models.get(version)
        if not model:
            raise ValueError(f"Model version {version} not found")
        if model.deprecated:
            print(f"WARNING: {version} is deprecated")

        print(f"Using {version}")
        # Call the actual LLM using the settings in model.config
        # ...
        return "response", version


# Usage
versioned_llm = VersionedLLM()

# Register multiple versions
versioned_llm.register_model(
    "code-gen",
    "v1",
    "openai",
    {"model": "gpt-3.5-turbo", "temperature": 0.5},
)
versioned_llm.register_model(
    "code-gen",
    "v2",
    "openai",
    {"model": "gpt-4-turbo", "temperature": 0.3},
)

# Call a specific version
response, version_used = versioned_llm.call("Write a function", version="code-gen:v2")
```
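Version pinning also enables gradual rollouts: send a small slice of traffic to the new version and widen it as quality holds. The call_with_rollout helper below is an illustrative addition on top of VersionedLLM, not part of the class itself:

```python
import random

def call_with_rollout(llm: VersionedLLM, prompt: str,
                      stable: str, candidate: str,
                      rollout_fraction: float = 0.1) -> tuple[str, str]:
    """Send rollout_fraction of traffic to the candidate version, the rest to stable."""
    version = candidate if random.random() < rollout_fraction else stable
    return llm.call(prompt, version=version)

# 10% of requests hit code-gen:v2; the rest stay on code-gen:v1
response, version_used = call_with_rollout(
    versioned_llm, "Write a function", stable="code-gen:v1", candidate="code-gen:v2"
)
```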
End-to-End Example: Production Router
```python
class ProductionRouter:
    def __init__(self):
        # TaskClassifier, CostOptimizer, and LLMLogger are assumed to wrap the
        # classification and cost-selection logic from Patterns 1 and 2 plus
        # your logging stack; they are not defined here.
        self.task_classifier = TaskClassifier()
        self.cost_optimizer = CostOptimizer()
        self.provider_chain = ProviderChain([
            ("gpt-4", "openai"),
            ("claude-3-opus", "anthropic"),
            ("mistral-7b", "local"),
        ])
        self.logger = LLMLogger()

    def process_request(self, user_id: str, prompt: str, min_accuracy: float = 0.85) -> str:
        """
        Full request routing: classify the task, optimize for cost/accuracy,
        select a model, and call the LLM with fallback.
        """
        # Step 1: Classify
        complexity = self.task_classifier.classify(prompt)

        # Step 2: Optimize (in a fuller implementation this choice would seed
        # the fallback chain; here the chain order is fixed)
        preferred_model = self.cost_optimizer.select(complexity, min_accuracy)

        # Step 3: Call with fallback
        try:
            response, model_used, provider_used = self.provider_chain.call(prompt)
        except Exception as e:
            self.logger.log_error(user_id, prompt, str(e))
            raise

        # Step 4: Log
        self.logger.log_success(
            user_id,
            prompt,
            response,
            model_used,
            complexity,
        )

        return response


router = ProductionRouter()
response = router.process_request(
    user_id="user_123",
    prompt="Optimize my database query",
    min_accuracy=0.95,
)
```
Summary
Multi-model systems are key to production AI:
- Task-based routing: right model for the right job
- Cost-based routing: optimize spend without sacrificing quality
- Provider fallback: resilience through diversity
- A/B testing: measure and improve over time
- Abstraction layers: simplify management
- Model versioning: track and control changes
Start simple (one model), add complexity as your needs grow. The best production systems use multiple models strategically.