
Multi-Model Routing and Provider Management

The Problem: No Single Best Model

No single model is perfect for every task. GPT-4 is capable but slow and expensive; GPT-3.5-turbo is fast and cheap but weaker; open-source models have no per-token API cost but require you to host and operate them yourself. The solution is routing: send each request to the best model for that specific task.

Why Multi-Model Systems Matter

Quality vs. Cost Tradeoff

  • Simple tasks (spell check, classification): use GPT-3.5-turbo or local model
  • Complex tasks (reasoning, code generation): use GPT-4 or Claude
  • Cost savings: often an order of magnitude cheaper when the right model handles each job (see the quick arithmetic below)
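
As a quick back-of-the-envelope check, here is that comparison using the illustrative per-1K-token prices from the model catalog in Pattern 2 below. Real prices change frequently, so treat the numbers as placeholders and check your provider's current pricing.

# Cost of 1M input tokens at the illustrative prices used later in this lesson
PRICE_PER_1K_USD = {"gpt-4": 0.015, "gpt-3.5-turbo": 0.0005}

tokens = 1_000_000
for model_name, price in PRICE_PER_1K_USD.items():
    print(f"{model_name}: ${price * tokens / 1000:.2f}")
# gpt-4: $15.00
# gpt-3.5-turbo: $0.50  (roughly 30x cheaper when it is good enough for the task)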

Resilience

If OpenAI is down, fall back to Anthropic. If Anthropic is slow, use local models. Multi-provider systems are more reliable.

Performance

Different models have different strengths; a simple capability map that routing code can consult is sketched after this list:

  • GPT-4: General intelligence, reasoning
  • Claude 3: Long context, nuance
  • Llama: Fast inference, cost-effective
  • Specialized models: Domain-specific tasks
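
A minimal sketch of such a capability map, using the model names that appear elsewhere in this lesson as illustrative entries (the categories and orderings should come from your own evaluations):

# Illustrative capability map: task category -> candidate models, in order of preference
CAPABILITY_MAP = {
    "reasoning":      ["gpt-4", "gpt-4-turbo"],
    "long_context":   ["claude-3-opus"],
    "cheap_and_fast": ["gpt-3.5-turbo", "mistral-7b"],
}

def candidates_for(task_category: str) -> list[str]:
    # Fall back to the general-purpose reasoning models for unknown categories
    return CAPABILITY_MAP.get(task_category, CAPABILITY_MAP["reasoning"])

Routing code can then pick the first available candidate for a category; Pattern 1 below develops the same idea into complexity-based routing.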

Pattern 1: Task-Based Routing

Classify the task, then route to the appropriate model.

Implementation

from enum import Enum
from openai import OpenAI
from anthropic import Anthropic

# Pre-configured clients; API keys are read from the OPENAI_API_KEY
# and ANTHROPIC_API_KEY environment variables
openai_client = OpenAI()
anthropic_client = Anthropic()

class TaskComplexity(Enum):
    SIMPLE = "simple"
    MEDIUM = "medium"
    COMPLEX = "complex"

def classify_task(prompt: str) -> TaskComplexity:
    """
    Classify task complexity based on prompt.
    """
    # Rule-based classification (in production, use ML model)
    keywords_simple = ["spell", "grammar", "format", "classify", "extract"]
    keywords_complex = ["design", "architecture", "reason", "analyze", "plan"]
    
    prompt_lower = prompt.lower()
    
    # Count keyword matches
    simple_score = sum(1 for kw in keywords_simple if kw in prompt_lower)
    complex_score = sum(1 for kw in keywords_complex if kw in prompt_lower)
    
    # Length heuristic: longer prompts often more complex
    if len(prompt) > 2000:
        complex_score += 1
    
    if complex_score > simple_score:
        return TaskComplexity.COMPLEX
    elif complex_score == 0 and simple_score > 0:
        return TaskComplexity.SIMPLE
    else:
        return TaskComplexity.MEDIUM

def select_model(task_complexity: TaskComplexity) -> tuple[str, str, int]:
    """
    Select model, provider, and token budget based on task complexity.
    Returns (model, provider, max_tokens)
    """
    if task_complexity == TaskComplexity.SIMPLE:
        return "gpt-3.5-turbo", "openai", 500
    elif task_complexity == TaskComplexity.MEDIUM:
        return "gpt-4-turbo", "openai", 2000
    else:
        return "gpt-4", "openai", 4000

def route_and_call(prompt: str):
    """
    End-to-end task routing and LLM call.
    """
    # Step 1: Classify task
    complexity = classify_task(prompt)
    print(f"Task complexity: {complexity.value}")
    
    # Step 2: Select model
    model, provider, max_tokens = select_model(complexity)
    print(f"Selected: {model} from {provider}")
    
    # Step 3: Call appropriate client
    if provider == "openai":
        response = openai_client.chat.completions.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens
        )
    elif provider == "anthropic":
        response = anthropic_client.messages.create(
            model=model,
            messages=[{"role": "user", "content": prompt}],
            max_tokens=max_tokens
        )
    # ... more providers
    else:
        raise ValueError(f"Unknown provider: {provider}")
    
    return response

# Usage
prompt = "Check spelling in this sentence: The qwick brown fox"
response = route_and_call(prompt)

Pattern 2: Cost-Based Routing

Choose the cheapest model that meets accuracy requirements.

Implementation

from dataclasses import dataclass

@dataclass
class ModelInfo:
    name: str
    provider: str
    cost_per_1k_tokens: float  # USD per 1K input tokens
    speed_tokens_per_sec: float
    accuracy_score: float  # 0-1, estimated quality

# Illustrative catalog: update prices and accuracy estimates from current
# provider pricing and your own evaluations
MODELS = [
    ModelInfo("gpt-3.5-turbo", "openai", 0.0005, 100, 0.85),
    ModelInfo("gpt-4-turbo", "openai", 0.003, 50, 0.95),
    ModelInfo("gpt-4", "openai", 0.015, 30, 0.99),
    ModelInfo("claude-3-opus", "anthropic", 0.015, 30, 0.98),
    ModelInfo("mistral-7b", "local", 0.0, 20, 0.75),
]

def select_cheapest_model(min_accuracy: float) -> ModelInfo:
    """
    Select the cheapest model that meets accuracy requirement.
    """
    qualified_models = [m for m in MODELS if m.accuracy_score >= min_accuracy]
    
    if not qualified_models:
        # No model meets the requirement: fall back to the most accurate available
        return max(MODELS, key=lambda m: m.accuracy_score)
    
    return min(qualified_models, key=lambda m: m.cost_per_1k_tokens)

def route_by_accuracy(prompt: str, min_accuracy: float = 0.80):
    """
    Route to cheapest model that meets accuracy requirement.
    """
    model = select_cheapest_model(min_accuracy)
    print(f"Selected {model.name} (cost: ${model.cost_per_1k_tokens}/1k, accuracy: {model.accuracy_score})")
    
    # Call the selected model
    # ...

# Usage
# For simple classification: accuracy can be lower, so use cheap model
route_by_accuracy("Classify: is this positive or negative?", min_accuracy=0.75)

# For legal docs: need high accuracy, so use best model
route_by_accuracy("Review contract for risks", min_accuracy=0.98)

Pattern 3: Provider Fallback Chains

If one provider fails, fall back to another.

Implementation

# Reuses openai_client and anthropic_client from Pattern 1; local_client is a
# placeholder for your self-hosted model's client

class ProviderChain:
    def __init__(self, providers: list[tuple[str, str]]):
        """
        providers: list of (model, provider) tuples in fallback order
        """
        self.providers = providers
    
    def call(self, prompt: str) -> tuple[str, str, str]:
        """
        Try each provider in order until one succeeds.
        Returns (response_text, model_used, provider_used)
        """
        last_error = None
        for model, provider in self.providers:
            try:
                print(f"Trying {model} from {provider}...")
                
                if provider == "openai":
                    response = openai_client.chat.completions.create(
                        model=model,
                        messages=[{"role": "user", "content": prompt}],
                        timeout=30
                    )
                    return response.choices[0].message.content, model, provider
                
                elif provider == "anthropic":
                    response = anthropic_client.messages.create(
                        model=model,
                        messages=[{"role": "user", "content": prompt}],
                        max_tokens=4000
                    )
                    return response.content[0].text, model, provider
                
                elif provider == "local":
                    response = local_client.generate(prompt)
                    return response, model, provider
                
                else:
                    raise ValueError(f"Unknown provider: {provider}")
            
            except Exception as e:
                print(f"{model} from {provider} failed: {e}")
                last_error = e
                continue  # Try the next provider in the chain
        
        # Every provider failed
        raise RuntimeError("All providers in the chain failed") from last_error

# Usage
chain = ProviderChain([
    ("gpt-4", "openai"),
    ("claude-3-opus", "anthropic"),
    ("mistral-7b", "local")
])

result, model, provider = chain.call("Write a poem")
print(f"Response from {model} ({provider}): {result}")

Pattern 4: A/B Testing Models

Test two models to see which performs better.

Implementation

import random
from collections import defaultdict

class ModelABTest:
    def __init__(self, model_a: str, model_b: str, metric_func):
        self.model_a = model_a
        self.model_b = model_b
        self.metric_func = metric_func  # Function that scores response
        self.results = defaultdict(list)
    
    def call(self, prompt: str) -> tuple[str, str]:
        """
        Randomly route to model A or B. Record metric.
        """
        # Randomly choose a model (50/50 split).
        # Assumes model_a is served by OpenAI and model_b by Anthropic.
        chosen_model = random.choice([self.model_a, self.model_b])
        
        if chosen_model == self.model_a:
            response = openai_client.chat.completions.create(
                model=self.model_a,
                messages=[{"role": "user", "content": prompt}]
            )
            response_text = response.choices[0].message.content
        else:
            response = anthropic_client.messages.create(
                model=self.model_b,
                messages=[{"role": "user", "content": prompt}],
                max_tokens=4000
            )
            response_text = response.content[0].text
        
        # Score the response
        score = self.metric_func(response_text)
        self.results[chosen_model].append(score)
        
        return response_text, chosen_model
    
    def compare(self) -> dict:
        """
        Compare performance of both models.
        """
        avg_a = sum(self.results[self.model_a]) / len(self.results[self.model_a]) if self.results[self.model_a] else 0
        avg_b = sum(self.results[self.model_b]) / len(self.results[self.model_b]) if self.results[self.model_b] else 0
        
        return {
            self.model_a: {"avg_score": avg_a, "samples": len(self.results[self.model_a])},
            self.model_b: {"avg_score": avg_b, "samples": len(self.results[self.model_b])},
            "winner": self.model_a if avg_a > avg_b else self.model_b
        }

# Usage
def score_response(response: str) -> float:
    # Crude proxy: score by length only; in practice use human ratings,
    # an LLM-as-judge, or task-specific checks
    return min(1.0, len(response) / 500)  # Scores 0-1

test = ModelABTest("gpt-4-turbo", "claude-3-opus", score_response)

# Run test
for _ in range(100):
    prompt = "Explain quantum computing in one paragraph"
    response, model = test.call(prompt)

# Compare results
results = test.compare()
print(f"Winner: {results['winner']} with score {results[results['winner']]['avg_score']:.2f}")

Pattern 5: Abstraction Layer with LiteLLM

Use an abstraction layer to simplify multi-model calls.

Implementation

from litellm import completion

def call_llm_abstracted(prompt: str, model_preference: str = "gpt-4"):
    """
    Use LiteLLM to abstract away provider differences.
    Same function works with any model.
    """
    try:
        response = completion(
            model=model_preference,
            messages=[{"role": "user", "content": prompt}],
            temperature=0.7
        )
        return response.choices[0].message.content
    except Exception as e:
        print(f"Primary model failed: {e}. Falling back...")
        # Manual fallback to a cheaper model
        response = completion(
            model="gpt-3.5-turbo",  # Fallback
            messages=[{"role": "user", "content": prompt}]
        )
        return response.choices[0].message.content

# Works with any supported provider: LiteLLM reads each provider's API key from
# environment variables and normalizes responses to the OpenAI format
result = call_llm_abstracted("Write code", model_preference="claude-3-opus")

Pattern 6: Model Versioning

Track which model version produced which results.

Implementation

from datetime import datetime

class ModelVersion:
    def __init__(self, name: str, version: str, provider: str, config: dict):
        self.name = name
        self.version = version
        self.provider = provider
        self.config = config
        self.created_at = datetime.now()
        self.deprecated = False
    
    def __str__(self):
        return f"{self.name}:{self.version}"

class VersionedLLM:
    def __init__(self):
        self.models = {}
        self.current_version = None
    
    def register_model(
        self,
        name: str,
        version: str,
        provider: str,
        config: dict
    ):
        key = f"{name}:{version}"
        model = ModelVersion(name, version, provider, config)
        self.models[key] = model
        self.current_version = key
        print(f"Registered {key}")
    
    def call(self, prompt: str, version: str = None) -> tuple[str, str]:
        """
        Call LLM with specific version.
        """
        version = version or self.current_version
        model = self.models.get(version)
        
        if not model:
            raise ValueError(f"Model version {version} not found")
        
        if model.deprecated:
            print(f"WARNING: {version} is deprecated")
        
        print(f"Using {version}")
        # Call actual LLM with config from model.config
        # ...
        return "response", version

# Usage
versioned_llm = VersionedLLM()

# Register multiple versions
versioned_llm.register_model(
    "code-gen",
    "v1",
    "openai",
    {"model": "gpt-3.5-turbo", "temperature": 0.5}
)

versioned_llm.register_model(
    "code-gen",
    "v2",
    "openai",
    {"model": "gpt-4-turbo", "temperature": 0.3}
)

# Call specific version
response, version_used = versioned_llm.call("Write a function", version="code-gen:v2")
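
To make versioning useful for audits and regression analysis, record which version produced each output. The sketch below builds on VersionedLLM above; GenerationRecord and call_and_record are illustrative names, not part of any library.

from dataclasses import dataclass, field
from datetime import datetime

@dataclass
class GenerationRecord:
    prompt: str
    response: str
    model_version: str  # e.g. "code-gen:v2"
    created_at: datetime = field(default_factory=datetime.now)

def call_and_record(llm: VersionedLLM, prompt: str, version: str = None) -> GenerationRecord:
    # Store the version actually used, which may be the current default
    response, version_used = llm.call(prompt, version=version)
    return GenerationRecord(prompt=prompt, response=response, model_version=version_used)

# Usage
record = call_and_record(versioned_llm, "Write a function", version="code-gen:v2")
print(record.model_version, record.created_at)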

End-to-End Example: Production Router

# TaskClassifier, CostOptimizer, and LLMLogger are thin wrappers around the
# classification, cost-selection, and logging logic from the patterns above
class ProductionRouter:
    def __init__(self):
        self.task_classifier = TaskClassifier()
        self.cost_optimizer = CostOptimizer()
        self.provider_chain = ProviderChain([
            ("gpt-4", "openai"),
            ("claude-3-opus", "anthropic"),
            ("mistral-7b", "local")
        ])
        self.logger = LLMLogger()
    
    def process_request(self, user_id: str, prompt: str, min_accuracy: float = 0.85) -> str:
        """
        Full request routing: classify task, optimize for cost/accuracy, select model, call LLM.
        """
        # Step 1: Classify
        complexity = self.task_classifier.classify(prompt)
        
        # Step 2: Pick a preferred model for this complexity/accuracy requirement
        # (a fuller implementation would place it at the front of the fallback chain)
        model = self.cost_optimizer.select(complexity, min_accuracy)
        
        # Step 3: Call with fallback (the chain returns text, model, provider)
        try:
            response, model_used, provider_used = self.provider_chain.call(prompt)
        except Exception as e:
            self.logger.log_error(user_id, prompt, str(e))
            raise
        
        # Step 4: Log which model actually served the request
        self.logger.log_success(
            user_id,
            prompt,
            response,
            model_used,
            complexity
        )
        
        return response

router = ProductionRouter()
response = router.process_request(
    user_id="user_123",
    prompt="Optimize my database query",
    min_accuracy=0.95
)

Summary

Multi-model systems are key to production AI:

  1. Task-based routing: right model for the right job
  2. Cost-based routing: optimize spend without sacrificing quality
  3. Provider fallback: resilience through diversity
  4. A/B testing: measure and improve over time
  5. Abstraction layers: simplify management
  6. Model versioning: track and control changes

Start simple (one model), add complexity as your needs grow. The best production systems use multiple models strategically.

In the next step, you will explore the best AI tools for deploying and serving AI models. Browse the options, pick one that fits your workflow, and try it before continuing.
