Reduce AI costs by 60% or more without sacrificing quality. Learn caching strategies, intelligent model routing, and budget management techniques.
Many AI requests are similar or identical. Caching responses eliminates redundant API calls.
// lib/ai/cache.ts
import { Redis } from '@upstash/redis'
import { generateText } from 'ai'
import { createHash } from 'crypto'
const redis = new Redis({
url: process.env.KV_REST_API_URL!,
token: process.env.KV_REST_API_TOKEN!,
})
interface CacheConfig {
ttlSeconds: number
similarity: 'exact' | 'semantic'
}
// Generate cache key from prompt
function generateCacheKey(
model: string,
systemPrompt: string,
userPrompt: string
): string {
const content = `${model}:${systemPrompt}:${userPrompt}`
return `ai:cache:${createHash('sha256').update(content).digest('hex')}`
}
export async function generateWithCache(
model: string,
systemPrompt: string,
userPrompt: string,
config: CacheConfig = { ttlSeconds: 3600, similarity: 'exact' }
): Promise<{ text: string; cached: boolean; cost: number }> {
const cacheKey = generateCacheKey(model, systemPrompt, userPrompt)
// Check cache
const cached = await redis.get<string>(cacheKey)
if (cached) {
return { text: cached, cached: true, cost: 0 }
}
// Generate a new response (passing a model id string assumes your AI SDK setup can resolve it)
const { text, usage } = await generateText({
model,
system: systemPrompt,
prompt: userPrompt,
})
// Cache the response
await redis.setex(cacheKey, config.ttlSeconds, text)
// Calculate cost (calculateCost is assumed to look up per-token pricing, like MODEL_COSTS below)
const cost = calculateCost(model, usage.promptTokens, usage.completionTokens)
return { text, cached: false, cost }
}
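// Example usage (illustrative): a repeat call within the TTL costs nothing
// const first = await generateWithCache('gpt-4o-mini', 'Be concise.', 'What is RAG?')
// const second = await generateWithCache('gpt-4o-mini', 'Be concise.', 'What is RAG?')
// // second.cached === true, second.cost === 0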
// Semantic caching with embeddings: catches prompts that are worded
// differently but mean the same thing (helper sketches follow after this function)
export async function generateWithSemanticCache(
model: string,
systemPrompt: string,
userPrompt: string,
similarityThreshold: number = 0.95
): Promise<{ text: string; cached: boolean }> {
// Generate embedding for the prompt
const embedding = await generateEmbedding(userPrompt)
// Search for similar cached prompts
const similar = await searchSimilarPrompts(embedding, similarityThreshold)
if (similar) {
return { text: similar.response, cached: true }
}
// Generate and cache
const { text } = await generateText({ model, system: systemPrompt, prompt: userPrompt })
await cacheWithEmbedding(userPrompt, embedding, text)
return { text, cached: false }
}
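The helpers generateEmbedding, searchSimilarPrompts, and cacheWithEmbedding are referenced but not defined above. Here is a minimal sketch, assuming Upstash Vector as the store and the AI SDK's embed helper; the file name, env var names, and embedding model are assumptions:

// lib/ai/semantic-cache.ts (illustrative sketch)
import { Index } from '@upstash/vector'
import { embed } from 'ai'
import { openai } from '@ai-sdk/openai'
import { randomUUID } from 'crypto'

const index = new Index({
  url: process.env.UPSTASH_VECTOR_REST_URL!,
  token: process.env.UPSTASH_VECTOR_REST_TOKEN!,
})

export async function generateEmbedding(text: string): Promise<number[]> {
  const { embedding } = await embed({
    model: openai.embedding('text-embedding-3-small'),
    value: text,
  })
  return embedding
}

export async function searchSimilarPrompts(
  embedding: number[],
  threshold: number
): Promise<{ response: string } | null> {
  // topK = 1: we only need the single closest cached prompt
  const [match] = await index.query({
    vector: embedding,
    topK: 1,
    includeMetadata: true,
  })
  if (match && match.score >= threshold) {
    return { response: String(match.metadata?.response) }
  }
  return null
}

export async function cacheWithEmbedding(
  prompt: string,
  embedding: number[],
  response: string
): Promise<void> {
  await index.upsert({
    id: randomUUID(),
    vector: embedding,
    metadata: { prompt, response },
  })
}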
Route simple tasks to cheaper models and reserve expensive models for complex reasoning.

// lib/ai/router.ts
import { generateText } from 'ai'
interface TaskClassification {
complexity: 'simple' | 'moderate' | 'complex'
requiresReasoning: boolean
requiresCreativity: boolean
estimatedTokens: number
}
const MODEL_COSTS = {
'gpt-4o': { input: 5.00, output: 15.00 }, // per 1M tokens
'gpt-4o-mini': { input: 0.15, output: 0.60 },
'claude-3-haiku': { input: 0.25, output: 1.25 },
'llama-3.1-70b': { input: 0.35, output: 0.40 },
}
// Classify task complexity
async function classifyTask(prompt: string): Promise<TaskClassification> {
// Use cheap model to classify
const { text } = await generateText({
model: 'gpt-4o-mini',
prompt: `Classify this task:
"${prompt}"
Respond with JSON: {
"complexity": "simple|moderate|complex",
"requiresReasoning": boolean,
"requiresCreativity": boolean,
"estimatedTokens": number
}`,
})
// Assumes the model returns bare JSON; validate or strip code fences in production
return JSON.parse(text)
}
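// Illustrative output for "Summarize this paragraph in one line":
// { "complexity": "simple", "requiresReasoning": false,
//   "requiresCreativity": false, "estimatedTokens": 150 }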
// Route to optimal model
export async function routeToOptimalModel(
prompt: string,
maxBudget?: number // cents per request
): Promise<string> {
const classification = await classifyTask(prompt)
// Simple tasks -> cheapest model
if (classification.complexity === 'simple' && !classification.requiresReasoning) {
return 'gpt-4o-mini'
}
// Creative tasks -> Claude
if (classification.requiresCreativity && !classification.requiresReasoning) {
return 'claude-3-haiku'
}
// Complex reasoning -> GPT-4o
if (classification.complexity === 'complex' || classification.requiresReasoning) {
// Check budget
if (maxBudget) {
const estimatedCost = estimateCost('gpt-4o', classification.estimatedTokens)
if (estimatedCost > maxBudget) {
return 'llama-3.1-70b' // Cheaper alternative
}
}
return 'gpt-4o'
}
// Default
return 'gpt-4o-mini'
}
function estimateCost(model: string, tokens: number): number {
  const costs = MODEL_COSTS[model as keyof typeof MODEL_COSTS]
  // Assume a 50/50 input/output split; convert dollars to cents to match maxBudget
  const dollars =
    (tokens / 2 / 1_000_000) * costs.input +
    (tokens / 2 / 1_000_000) * costs.output
  return dollars * 100
}
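Putting routing and caching together might look like this (a sketch; it assumes the cache and router modules are imported, and systemPrompt/userPrompt are placeholders):

// Route first, then generate through the cache layer
const model = await routeToOptimalModel(userPrompt, 5) // cap at 5 cents per request
const { text, cached, cost } = await generateWithCache(model, systemPrompt, userPrompt)
console.log(`model=${model} cached=${cached} cost=${cost}`)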
Token volume is the other lever: compress history, trim system prompts, batch requests, and constrain outputs.

// lib/ai/token-optimizer.ts
// 1. Compress conversation history
// Minimal message shape; countTokens and summarizeMessages are assumed helpers
interface Message {
  role: 'system' | 'user' | 'assistant'
  content: string
}

export async function compressHistory(
  messages: Message[],
  maxTokens: number = 2000
): Promise<Message[]> {
  // Keep the system message
  const system = messages.find(m => m.role === 'system')
  // Keep the last 4 messages verbatim; summarize the middle
  const recent = messages.slice(-4)
  const older = messages.slice(1, -4)
  if (older.length > 0 && countTokens(messages) > maxTokens) {
    const summary = await summarizeMessages(older)
    return [
      system,
      { role: 'system', content: `Previous context: ${summary}` } as Message,
      ...recent,
    ].filter((m): m is Message => Boolean(m))
  }
  return messages
}
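// Example: a 40-message thread collapses to [system, context summary, last 4 messages]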
// 2. Optimize system prompts
export function optimizeSystemPrompt(prompt: string): string {
// Remove unnecessary whitespace
let optimized = prompt.replace(/\s+/g, ' ').trim()
// Remove redundant phrases
const redundantPhrases = [
'Please note that',
'It is important to',
'Make sure to always',
'Remember that you should',
]
for (const phrase of redundantPhrases) {
  optimized = optimized.replace(new RegExp(phrase, 'gi'), '')
}
// Collapse any double spaces left behind by the removals
return optimized.replace(/\s+/g, ' ').trim()
}
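// Example:
// optimizeSystemPrompt('Please note that   you must reply in JSON.')
// // -> 'you must reply in JSON.'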
// 3. Batch similar requests
export async function batchRequests<T>(
requests: T[],
processFn: (batch: T[]) => Promise<string[]>,
batchSize: number = 5
): Promise<string[]> {
const results: string[] = []
for (let i = 0; i < requests.length; i += batchSize) {
const batch = requests.slice(i, i + batchSize)
const batchResults = await processFn(batch)
results.push(...batchResults)
}
return results
}
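// Example (hypothetical summarizeBatch sends one prompt covering 5 reviews):
// const summaries = await batchRequests(reviews, summarizeBatch, 5)
// // 20 reviews -> 4 API calls instead of 20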
// 4. Use structured outputs (shorter responses)
const structuredOutputPrompt = `
Respond with minimal JSON only:
{
"action": "approve|reject|escalate",
"reason": "brief reason",
"confidence": 0.0-1.0
}
No additional text.
`
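A sketch of that prompt in use, assuming the same imports as above (userMessage is a placeholder); for stricter guarantees, the AI SDK's generateObject with a Zod schema is the more robust option:

// Example: short, machine-parseable moderation decision
const { text } = await generateText({
  model: 'gpt-4o-mini',
  system: structuredOutputPrompt,
  prompt: userMessage,
})
const decision = JSON.parse(text) // { action, reason, confidence }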
Caching and routing lower the average cost per request; budget limits cap the worst case.

// lib/ai/budget-manager.ts
import { Redis } from '@upstash/redis'

interface BudgetConfig {
dailyLimit: number // cents
monthlyLimit: number // cents
alertThreshold: number // 0-1 (e.g., 0.8 = 80%)
hardLimit: boolean // Stop requests when exceeded
}
export class BudgetManager {
constructor(private redis: Redis) {}
async trackUsage(
organizationId: string,
cost: number,
metadata: { model: string; tokens: number; agentId: string }
): Promise<void> {
const today = new Date().toISOString().split('T')[0]
const month = today.slice(0, 7)
// Increment daily and monthly counters (incrby needs integers, so track whole cents)
await this.redis.incrby(`budget:${organizationId}:daily:${today}`, Math.round(cost))
await this.redis.incrby(`budget:${organizationId}:monthly:${month}`, Math.round(cost))
// Store detailed usage for analytics
await this.redis.lpush(`usage:${organizationId}:${today}`, JSON.stringify({
...metadata,
cost,
timestamp: Date.now(),
}))
}
// BudgetUsage (and UsageReport below) are assumed type definitions
async checkBudget(
organizationId: string,
config: BudgetConfig
): Promise<{ allowed: boolean; usage: BudgetUsage }> {
const today = new Date().toISOString().split('T')[0]
const month = today.slice(0, 7)
// `|| 0` on a Promise always yields the Promise, so apply the default after awaiting
const [dailyRaw, monthlyRaw] = await Promise.all([
  this.redis.get<number>(`budget:${organizationId}:daily:${today}`),
  this.redis.get<number>(`budget:${organizationId}:monthly:${month}`),
])
const dailyUsage = dailyRaw ?? 0
const monthlyUsage = monthlyRaw ?? 0
const usage = {
daily: dailyUsage,
monthly: monthlyUsage,
dailyPercentage: dailyUsage / config.dailyLimit,
monthlyPercentage: monthlyUsage / config.monthlyLimit,
}
// Check if over limit
if (config.hardLimit) {
if (dailyUsage >= config.dailyLimit || monthlyUsage >= config.monthlyLimit) {
return { allowed: false, usage }
}
}
// Send alerts at threshold
if (usage.dailyPercentage >= config.alertThreshold) {
await this.sendBudgetAlert(organizationId, 'daily', usage)
}
if (usage.monthlyPercentage >= config.alertThreshold) {
await this.sendBudgetAlert(organizationId, 'monthly', usage)
}
return { allowed: true, usage }
}
async getUsageReport(
organizationId: string,
period: 'day' | 'week' | 'month'
): Promise<UsageReport> {
// Aggregate usage by model, agent, and time
const usage = await this.aggregateUsage(organizationId, period)
return {
totalCost: usage.total,
byModel: usage.byModel,
byAgent: usage.byAgent,
byDay: usage.byDay,
topExpensive: usage.topRequests.slice(0, 10),
recommendations: this.generateRecommendations(usage),
}
}
}
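Wiring it into a request path might look like this (a sketch; the redis client, orgId, and limits are placeholders):

const budget = new BudgetManager(redis)
const { allowed, usage } = await budget.checkBudget(orgId, {
  dailyLimit: 500,      // $5.00/day, in cents
  monthlyLimit: 10_000, // $100.00/month
  alertThreshold: 0.8,  // alert at 80%
  hardLimit: true,      // refuse requests once exceeded
})
if (!allowed) {
  throw new Error(`AI budget exceeded (${Math.round(usage.dailyPercentage * 100)}% of daily limit)`)
}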
To recap:

- Implement response caching: cache similar queries with Redis/Upstash.
- Route by task complexity: use GPT-4o-mini for simple tasks.
- Compress conversation history: summarize older messages.
- Set budget limits and alerts: prevent unexpected cost spikes.
- Monitor and analyze usage: identify optimization opportunities.