Reduce AI costs by 60% or more without sacrificing quality. Learn caching strategies, intelligent model routing, and budget management techniques.
Many AI requests are similar or identical. Caching responses eliminates redundant API calls.
// lib/ai/cache.ts
import { Redis } from '@upstash/redis'
import { generateText } from 'ai'
import { createHash } from 'crypto'
const redis = new Redis({
url: process.env.KV_REST_API_URL!,
token: process.env.KV_REST_API_TOKEN!,
})
interface CacheConfig {
ttlSeconds: number
similarity: 'exact' | 'semantic'
}
// Generate cache key from prompt
function generateCacheKey(
model: string,
systemPrompt: string,
userPrompt: string
): string {
const content = `${model}:${systemPrompt}:${userPrompt}`
return `ai:cache:${createHash('sha256').update(content).digest('hex')}`
}
export async function generateWithCache(
model: string,
systemPrompt: string,
userPrompt: string,
config: CacheConfig = { ttlSeconds: 3600, similarity: 'exact' }
): Promise<{ text: string; cached: boolean; cost: number }> {
const cacheKey = generateCacheKey(model, systemPrompt, userPrompt)
// Check cache
const cached = await redis.get<string>(cacheKey)
if (cached) {
return { text: cached, cached: true, cost: 0 }
}
// Generate a new response (passing a model id string assumes your AI SDK setup can resolve it)
const { text, usage } = await generateText({
model,
system: systemPrompt,
prompt: userPrompt,
})
// Cache the response
await redis.setex(cacheKey, config.ttlSeconds, text)
// Calculate cost (calculateCost is assumed to look up per-token pricing, like MODEL_COSTS below)
const cost = calculateCost(model, usage.promptTokens, usage.completionTokens)
return { text, cached: false, cost }
}
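// Example usage (illustrative): a repeat call within the TTL costs nothing
// const first = await generateWithCache('gpt-4o-mini', 'Be concise.', 'What is RAG?')
// const second = await generateWithCache('gpt-4o-mini', 'Be concise.', 'What is RAG?')
// // second.cached === true, second.cost === 0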
// Semantic caching with embeddings: catches prompts that are worded
// differently but mean the same thing (helper sketches follow after this function)
export async function generateWithSemanticCache(
model: string,
systemPrompt: string,
userPrompt: string,
similarityThreshold: number = 0.95
): Promise<{ text: string; cached: boolean }> {
// Generate embedding for the prompt
const embedding = await generateEmbedding(userPrompt)
// Search for similar cached prompts
const similar = await searchSimilarPrompts(embedding, similarityThreshold)
if (similar) {
return { text: similar.response, cached: true }
}
// Generate and cache
const { text } = await generateText({ model, system: systemPrompt, prompt: userPrompt })
await cacheWithEmbedding(userPrompt, embedding, text)
return { text, cached: false }
}
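The helpers generateEmbedding, searchSimilarPrompts, and cacheWithEmbedding are referenced but not defined above. Here is a minimal sketch, assuming Upstash Vector as the store and the AI SDK's embed helper; the file name, env var names, and embedding model are assumptions:

// lib/ai/semantic-cache.ts (illustrative sketch)
import { Index } from '@upstash/vector'
import { embed } from 'ai'
import { openai } from '@ai-sdk/openai'
import { randomUUID } from 'crypto'

const index = new Index({
  url: process.env.UPSTASH_VECTOR_REST_URL!,
  token: process.env.UPSTASH_VECTOR_REST_TOKEN!,
})

export async function generateEmbedding(text: string): Promise<number[]> {
  const { embedding } = await embed({
    model: openai.embedding('text-embedding-3-small'),
    value: text,
  })
  return embedding
}

export async function searchSimilarPrompts(
  embedding: number[],
  threshold: number
): Promise<{ response: string } | null> {
  // topK = 1: we only need the single closest cached prompt
  const [match] = await index.query({
    vector: embedding,
    topK: 1,
    includeMetadata: true,
  })
  if (match && match.score >= threshold) {
    return { response: String(match.metadata?.response) }
  }
  return null
}

export async function cacheWithEmbedding(
  prompt: string,
  embedding: number[],
  response: string
): Promise<void> {
  await index.upsert({
    id: randomUUID(),
    vector: embedding,
    metadata: { prompt, response },
  })
}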
Route simple tasks to cheaper models and reserve expensive models for complex reasoning.

// lib/ai/router.ts
import { generateText } from 'ai'
interface TaskClassification {
complexity: 'simple' | 'moderate' | 'complex'
requiresReasoning: boolean
requiresCreativity: boolean
estimatedTokens: number
}
const MODEL_COSTS = {
'gpt-4o': { input: 5.00, output: 15.00 }, // per 1M tokens
'gpt-4o-mini': { input: 0.15, output: 0.60 },
'claude-3-haiku': { input: 0.25, output: 1.25 },
'llama-3.1-70b': { input: 0.35, output: 0.40 },
}
// Classify task complexity
async function classifyTask(prompt: string): Promise<TaskClassification> {
// Use cheap model to classify
const { text } = await generateText({
model: 'gpt-4o-mini',
prompt: `Classify this task:
"${prompt}"
Respond with JSON: {
"complexity": "simple|moderate|complex",
"requiresReasoning": boolean,
"requiresCreativity": boolean,
"estimatedTokens": number
}`,
})
// Assumes the model returns bare JSON; validate or strip code fences in production
return JSON.parse(text)
}
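// Illustrative output for "Summarize this paragraph in one line":
// { "complexity": "simple", "requiresReasoning": false,
//   "requiresCreativity": false, "estimatedTokens": 150 }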
// Route to optimal model
export async function routeToOptimalModel(
prompt: string,
maxBudget?: number // cents per request
): Promise<string> {
const classification = await classifyTask(prompt)
// Simple tasks -> cheapest model
if (classification.complexity === 'simple' && !classification.requiresReasoning) {
return 'gpt-4o-mini'
}
// Creative tasks -> Claude
if (classification.requiresCreativity && !classification.requiresReasoning) {
return 'claude-3-haiku'
}
// Complex reasoning -> GPT-4o
if (classification.complexity === 'complex' || classification.requiresReasoning) {
// Check budget
if (maxBudget) {
const estimatedCost = estimateCost('gpt-4o', classification.estimatedTokens)
if (estimatedCost > maxBudget) {
return 'llama-3.1-70b' // Cheaper alternative
}
}
return 'gpt-4o'
}
// Default
return 'gpt-4o-mini'
}
function estimateCost(model: string, tokens: number): number {
  const costs = MODEL_COSTS[model as keyof typeof MODEL_COSTS]
  // Assume a 50/50 input/output split; convert dollars to cents to match maxBudget
  const dollars =
    (tokens / 2 / 1_000_000) * costs.input +
    (tokens / 2 / 1_000_000) * costs.output
  return dollars * 100
}
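Putting routing and caching together might look like this (a sketch; it assumes the cache and router modules are imported, and systemPrompt/userPrompt are placeholders):

// Route first, then generate through the cache layer
const model = await routeToOptimalModel(userPrompt, 5) // cap at 5 cents per request
const { text, cached, cost } = await generateWithCache(model, systemPrompt, userPrompt)
console.log(`model=${model} cached=${cached} cost=${cost}`)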
Token volume is the other lever: compress history, trim system prompts, batch requests, and constrain outputs.

// lib/ai/token-optimizer.ts
// 1. Compress conversation history
// Minimal message shape; countTokens and summarizeMessages are assumed helpers
interface Message {
  role: 'system' | 'user' | 'assistant'
  content: string
}

export async function compressHistory(
  messages: Message[],
  maxTokens: number = 2000
): Promise<Message[]> {
  // Keep the system message
  const system = messages.find(m => m.role === 'system')
  // Keep the last 4 messages verbatim; summarize the middle
  const recent = messages.slice(-4)
  const older = messages.slice(1, -4)
  if (older.length > 0 && countTokens(messages) > maxTokens) {
    const summary = await summarizeMessages(older)
    return [
      system,
      { role: 'system', content: `Previous context: ${summary}` } as Message,
      ...recent,
    ].filter((m): m is Message => Boolean(m))
  }
  return messages
}
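// Example: a 40-message thread collapses to [system, context summary, last 4 messages]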
// 2. Optimize system prompts
export function optimizeSystemPrompt(prompt: string): string {
// Remove unnecessary whitespace
let optimized = prompt.replace(/\s+/g, ' ').trim()
// Remove redundant phrases
const redundantPhrases = [
'Please note that',
'It is important to',
'Make sure to always',
'Remember that you should',
]
for (const phrase of redundantPhrases) {
  optimized = optimized.replace(new RegExp(phrase, 'gi'), '')
}
// Collapse any double spaces left behind by the removals
return optimized.replace(/\s+/g, ' ').trim()
}
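// Example:
// optimizeSystemPrompt('Please note that   you must reply in JSON.')
// // -> 'you must reply in JSON.'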
// 3. Batch similar requests
export async function batchRequests<T>(
requests: T[],
processFn: (batch: T[]) => Promise<string[]>,
batchSize: number = 5
): Promise<string[]> {
const results: string[] = []
for (let i = 0; i < requests.length; i += batchSize) {
const batch = requests.slice(i, i + batchSize)
const batchResults = await processFn(batch)
results.push(...batchResults)
}
return results
}
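// Example (hypothetical summarizeBatch sends one prompt covering 5 reviews):
// const summaries = await batchRequests(reviews, summarizeBatch, 5)
// // 20 reviews -> 4 API calls instead of 20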
// 4. Use structured outputs (shorter responses)
const structuredOutputPrompt = `
Respond with minimal JSON only:
{
"action": "approve|reject|escalate",
"reason": "brief reason",
"confidence": 0.0-1.0
}
No additional text.
`
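A sketch of that prompt in use, assuming the same imports as above (userMessage is a placeholder); for stricter guarantees, the AI SDK's generateObject with a Zod schema is the more robust option:

// Example: short, machine-parseable moderation decision
const { text } = await generateText({
  model: 'gpt-4o-mini',
  system: structuredOutputPrompt,
  prompt: userMessage,
})
const decision = JSON.parse(text) // { action, reason, confidence }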
Caching and routing lower the average cost per request; budget limits cap the worst case.

// lib/ai/budget-manager.ts
import { Redis } from '@upstash/redis'

interface BudgetConfig {
dailyLimit: number // cents
monthlyLimit: number // cents
alertThreshold: number // 0-1 (e.g., 0.8 = 80%)
hardLimit: boolean // Stop requests when exceeded
}
export class BudgetManager {
constructor(private redis: Redis) {}
async trackUsage(
organizationId: string,
cost: number,
metadata: { model: string; tokens: number; agentId: string }
): Promise<void> {
const today = new Date().toISOString().split('T')[0]
const month = today.slice(0, 7)
// Increment daily and monthly counters (incrby needs integers, so track whole cents)
await this.redis.incrby(`budget:${organizationId}:daily:${today}`, Math.round(cost))
await this.redis.incrby(`budget:${organizationId}:monthly:${month}`, Math.round(cost))
// Store detailed usage for analytics
await this.redis.lpush(`usage:${organizationId}:${today}`, JSON.stringify({
...metadata,
cost,
timestamp: Date.now(),
}))
}
// BudgetUsage (and UsageReport below) are assumed type definitions
async checkBudget(
organizationId: string,
config: BudgetConfig
): Promise<{ allowed: boolean; usage: BudgetUsage }> {
const today = new Date().toISOString().split('T')[0]
const month = today.slice(0, 7)
// `|| 0` on a Promise always yields the Promise, so apply the default after awaiting
const [dailyRaw, monthlyRaw] = await Promise.all([
  this.redis.get<number>(`budget:${organizationId}:daily:${today}`),
  this.redis.get<number>(`budget:${organizationId}:monthly:${month}`),
])
const dailyUsage = dailyRaw ?? 0
const monthlyUsage = monthlyRaw ?? 0
const usage = {
daily: dailyUsage,
monthly: monthlyUsage,
dailyPercentage: dailyUsage / config.dailyLimit,
monthlyPercentage: monthlyUsage / config.monthlyLimit,
}
// Check if over limit
if (config.hardLimit) {
if (dailyUsage >= config.dailyLimit || monthlyUsage >= config.monthlyLimit) {
return { allowed: false, usage }
}
}
// Send alerts at threshold
if (usage.dailyPercentage >= config.alertThreshold) {
await this.sendBudgetAlert(organizationId, 'daily', usage)
}
if (usage.monthlyPercentage >= config.alertThreshold) {
await this.sendBudgetAlert(organizationId, 'monthly', usage)
}
return { allowed: true, usage }
}
async getUsageReport(
organizationId: string,
period: 'day' | 'week' | 'month'
): Promise<UsageReport> {
// Aggregate usage by model, agent, and time
const usage = await this.aggregateUsage(organizationId, period)
return {
totalCost: usage.total,
byModel: usage.byModel,
byAgent: usage.byAgent,
byDay: usage.byDay,
topExpensive: usage.topRequests.slice(0, 10),
recommendations: this.generateRecommendations(usage),
}
}
}
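Wiring it into a request path might look like this (a sketch; the redis client, orgId, and limits are placeholders):

const budget = new BudgetManager(redis)
const { allowed, usage } = await budget.checkBudget(orgId, {
  dailyLimit: 500,      // $5.00/day, in cents
  monthlyLimit: 10_000, // $100.00/month
  alertThreshold: 0.8,  // alert at 80%
  hardLimit: true,      // refuse requests once exceeded
})
if (!allowed) {
  throw new Error(`AI budget exceeded (${Math.round(usage.dailyPercentage * 100)}% of daily limit)`)
}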
To recap:

- Implement response caching: cache similar queries with Redis/Upstash.
- Route by task complexity: use GPT-4o-mini for simple tasks.
- Compress conversation history: summarize older messages.
- Set budget limits and alerts: prevent unexpected cost spikes.
- Monitor and analyze usage: identify optimization opportunities.