In previous posts, we covered deploying and operating AI Agents. Today I want to share how to optimize an AI Agent's performance. Honestly, I spent a long time digging into this topic, because performance directly affects both user experience and operating cost.
Starting from a Performance Bottleneck
I still remember what things looked like when our AI assistant first launched last year:
User: Why is this so slow? I have to wait several seconds for a reply.
Ops: The API bill keeps climbing day after day.
Me: Let me find out where the time is actually going...
Profiling pointed to three main problems (the timing sketch after this list shows how I narrowed them down):
- High LLM call latency
- Inefficient vector retrieval
- Excessive memory usage
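Before touching any code, I wanted numbers rather than impressions. Below is a minimal sketch of the kind of per-stage timing I used; the `timed` helper and the stage names are illustrative, not part of the agent code shown later.

import time
from contextlib import contextmanager

timings: dict[str, float] = {}

@contextmanager
def timed(stage: str):
    # Record wall-clock time spent in each named stage
    start = time.perf_counter()
    try:
        yield
    finally:
        timings[stage] = time.perf_counter() - start

# Example usage inside a request handler:
# with timed("vector_search"):
#     docs = await vector_store.search(query)
# with timed("llm_call"):
#     answer = await model.generate(prompt)
# print(timings)  # shows which stage dominates the latency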
Architecture-Level Optimization
The first round of changes was at the architecture level:
import asyncio
from typing import Any, Dict

class OptimizedAgent:
    def __init__(
        self,
        config: Dict[str, Any]
    ):
        # 1. Model pool
        self.model_pool = ModelPool([
            {
                "model": "gpt-4",
                "max_tokens": 4000,
                "timeout": 30,
                "cost_per_token": 0.03
            },
            {
                "model": "gpt-3.5-turbo",
                "max_tokens": 2000,
                "timeout": 10,
                "cost_per_token": 0.002
            },
            {
                "model": "codellama-7b",
                "max_tokens": 2000,
                "timeout": 5,
                "cost_per_token": 0
            }
        ])

        # 2. Multi-level cache
        self.cache = CacheManager([
            MemoryCache(max_size=1000),
            RedisCache(url=config["redis_url"]),
            DiskCache(path=config["cache_path"])
        ])

        # 3. Vector stores (primary plus a local replica)
        self.vector_store = VectorManager(
            primary=MilvusStore(
                host=config["milvus_host"],
                collection="embeddings",
                dimension=1536
            ),
            replica=FaissStore(
                index_path=config["faiss_path"],
                dimension=1536
            )
        )

    async def process(
        self,
        request: Dict[str, Any]
    ) -> Dict[str, Any]:
        # 1. Pick a suitable model
        model = await self._select_model(request)

        # 2. Check the cache
        cache_key = self._generate_cache_key(request)
        if cached := await self.cache.get(cache_key):
            return cached

        # 3. Run retrieval and context preparation in parallel
        async with asyncio.TaskGroup() as group:
            # 3.1 Vector retrieval
            search_task = group.create_task(
                self._search_relevant_docs(request)
            )
            # 3.2 Context preparation
            context_task = group.create_task(
                self._prepare_context(request)
            )
        # Results are only safe to read once the TaskGroup has exited
        docs = search_task.result()
        context = context_task.result()

        # 4. Generate the response
        response = await model.generate(
            prompt=self._build_prompt(
                request,
                docs,
                context
            )
        )

        # 5. Update the cache
        await self.cache.set(
            cache_key,
            response,
            ttl=self._calculate_ttl(response)
        )
        return response

    async def _select_model(
        self,
        request: Dict[str, Any]
    ) -> BaseModel:
        # Route by request characteristics: hard questions go to GPT-4,
        # code questions to the local CodeLlama, everything else to GPT-3.5
        features = await self._extract_features(request)
        if features.complexity > 0.8:
            return self.model_pool.get("gpt-4")
        elif features.is_code_related:
            return self.model_pool.get("codellama-7b")
        else:
            return self.model_pool.get("gpt-3.5-turbo")
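The `ModelPool` used above is not shown in this post. Here is a minimal sketch of how I would expect it to look, assuming each entry wraps an async client behind a common `generate` interface; the `BaseModel` constructor arguments are my own assumption.

from typing import Any, Dict, List

class ModelPool:
    """Keeps one configured client per model name (minimal sketch)."""

    def __init__(self, configs: List[Dict[str, Any]]):
        # Build a BaseModel wrapper for each configured model
        # (the constructor signature below is assumed, not from the post)
        self.models: Dict[str, BaseModel] = {
            cfg["model"]: BaseModel(
                name=cfg["model"],
                max_tokens=cfg["max_tokens"],
                timeout=cfg["timeout"],
                cost_per_token=cfg["cost_per_token"],
            )
            for cfg in configs
        }

    def get(self, name: str) -> BaseModel:
        # Fall back to the cheapest general-purpose model if the name is unknown
        return self.models.get(name, self.models["gpt-3.5-turbo"])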
Cache System Optimization
Caching is the key to performance:
import asyncio
import pickle
from typing import Any, List, Optional

import aioredis
from cachetools import LRUCache

class CacheManager:
    def __init__(
        self,
        backends: List[CacheBackend]
    ):
        # Backends are ordered fastest-first (memory, Redis, disk)
        self.backends = backends

    async def get(
        self,
        key: str
    ) -> Optional[Any]:
        # Probe backends from fastest to slowest
        for backend in self.backends:
            if value := await backend.get(key):
                # On a hit, backfill the faster tiers asynchronously
                asyncio.create_task(
                    self._update_other_caches(
                        key,
                        value,
                        backend
                    )
                )
                return value
        return None

    async def set(
        self,
        key: str,
        value: Any,
        ttl: Optional[int] = None
    ):
        # Write to all tiers in parallel
        await asyncio.gather(*[
            backend.set(key, value, ttl)
            for backend in self.backends
        ])

    async def _update_other_caches(
        self,
        key: str,
        value: Any,
        found_in: CacheBackend
    ):
        # Backfill the tiers that sit above the one we hit
        tasks = []
        for backend in self.backends:
            if backend.priority < found_in.priority:
                tasks.append(
                    backend.set(key, value)
                )
        if tasks:
            await asyncio.gather(*tasks)

class MemoryCache:
    def __init__(self, max_size: int):
        # cachetools.LRUCache handles the eviction policy
        self.cache = LRUCache(maxsize=max_size)
        self.priority = 1

    async def get(self, key: str) -> Optional[Any]:
        return self.cache.get(key)

    async def set(
        self,
        key: str,
        value: Any,
        ttl: Optional[int] = None
    ):
        self.cache[key] = value
        if ttl:
            asyncio.create_task(
                self._expire_key(key, ttl)
            )

    async def _expire_key(self, key: str, ttl: int):
        # Drop the key once its TTL elapses
        await asyncio.sleep(ttl)
        self.cache.pop(key, None)

class RedisCache:
    def __init__(self, url: str):
        self.redis = aioredis.from_url(url)
        self.priority = 2

    async def get(self, key: str) -> Optional[Any]:
        value = await self.redis.get(key)
        if value:
            return pickle.loads(value)
        return None

    async def set(
        self,
        key: str,
        value: Any,
        ttl: Optional[int] = None
    ):
        await self.redis.set(
            key,
            pickle.dumps(value),
            ex=ttl
        )
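The `DiskCache` backend referenced in the agent's constructor isn't shown above. A minimal version can be built on the `diskcache` package (my assumption; any file-based store with get/set semantics would do):

import asyncio
from typing import Any, Optional

import diskcache

class DiskCache:
    def __init__(self, path: str):
        self.cache = diskcache.Cache(path)
        self.priority = 3  # slowest tier, checked last

    async def get(self, key: str) -> Optional[Any]:
        # diskcache is synchronous; push the blocking call to a thread
        return await asyncio.to_thread(self.cache.get, key)

    async def set(self, key: str, value: Any, ttl: Optional[int] = None):
        await asyncio.to_thread(self.cache.set, key, value, expire=ttl)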
Vector Retrieval Optimization
Vector retrieval is just as important to optimize:
import asyncio
import logging
from typing import List

logger = logging.getLogger(__name__)

class VectorManager:
    def __init__(
        self,
        primary: VectorStore,
        replica: VectorStore = None
    ):
        self.primary = primary
        self.replica = replica
        self.sync_interval = 3600  # sync the replica once an hour

        # Start the background sync loop (requires a running event loop)
        if replica:
            asyncio.create_task(
                self._sync_loop()
            )

    async def search(
        self,
        query: str,
        limit: int = 5
    ) -> List[Document]:
        # 1. Compute the query embedding
        vector = await self._compute_embedding(query)
        try:
            # 2. Search the primary store first
            results = await self.primary.search(
                vector,
                limit=limit
            )
        except Exception as e:
            if not self.replica:
                raise
            # 3. Primary failed, fall back to the replica
            logger.warning(f"Primary vector store failed, using replica: {e}")
            results = await self.replica.search(
                vector,
                limit=limit
            )
        return results

    async def _sync_loop(self):
        while True:
            try:
                # Sync the replica with the primary
                await self._sync_data()
            except Exception as e:
                logger.error(f"Sync failed: {e}")
            finally:
                await asyncio.sleep(
                    self.sync_interval
                )

    async def _sync_data(self):
        # Read the last sync timestamp
        last_sync = await self._get_last_sync()

        # Pull incremental updates from the primary
        vectors = await self.primary.get_updates(
            since=last_sync
        )
        if vectors:
            await self.replica.batch_add(vectors)

        # Record the new sync timestamp
        await self._update_last_sync()
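The `_compute_embedding` helper is not shown in the post. Since both stores above are configured with dimension=1536, here is a sketch using OpenAI's text-embedding-ada-002; the client setup is an assumption, and in the class above this would be the `_compute_embedding` method rather than a free function.

from openai import AsyncOpenAI

client = AsyncOpenAI()  # reads OPENAI_API_KEY from the environment

async def compute_embedding(text: str) -> list[float]:
    # Returns a 1536-dimensional vector, matching the store configuration above
    result = await client.embeddings.create(
        model="text-embedding-ada-002",
        input=text
    )
    return result.data[0].embedding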
Memory Optimization
Memory management also needs special attention:
import asyncio
import gc
import logging

import psutil

logger = logging.getLogger(__name__)

class MemoryOptimizer:
    def __init__(
        self,
        cache_manager: CacheManager,
        vector_store: VectorManager
    ):
        self.cache_manager = cache_manager
        self.vector_store = vector_store
        self.gc_threshold = 0.8   # clean up above 80% memory usage
        self.check_interval = 60  # check once a minute

    async def start(self):
        while True:
            try:
                await self._check_memory()
            except Exception as e:
                logger.error(f"Memory check failed: {e}")
            finally:
                await asyncio.sleep(
                    self.check_interval
                )

    async def _check_memory(self):
        usage = self._get_memory_usage()
        if usage > self.gc_threshold:
            # Trigger cleanup
            await self._collect_garbage()

    def _get_memory_usage(self) -> float:
        # memory_percent() returns 0-100; normalize to 0-1 to match the threshold
        process = psutil.Process()
        return process.memory_percent() / 100

    async def _collect_garbage(self):
        # 1. Evict cache entries
        await self.cache_manager.cleanup()
        # 2. Compact the vector index
        await self.vector_store.optimize()
        # 3. Force a GC pass
        gc.collect()
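Hooking the optimizer into the service is just a matter of starting it as a background task next to the agent. A rough sketch, following the constructor shown above; `serve` is a stand-in for whatever request loop you actually run:

import asyncio

async def main(config: dict):
    agent = OptimizedAgent(config)
    optimizer = MemoryOptimizer(agent.cache, agent.vector_store)

    # Run the memory watchdog alongside request handling
    watchdog = asyncio.create_task(optimizer.start())
    try:
        await serve(agent)  # hypothetical request loop
    finally:
        watchdog.cancel()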
Performance Monitoring
Sustained optimization needs good monitoring:
import time
from typing import Any, Dict

import tiktoken
from prometheus_client import Counter, Gauge, Histogram

class PerformanceMonitor:
    def __init__(self):
        self.metrics = {
            "latency": Histogram(
                "request_latency",
                "End-to-end request latency in seconds",
                buckets=[0.1, 0.5, 1, 2, 5]
            ),
            "token_usage": Counter(
                "token_usage",
                "Tokens consumed",
                ["model"]
            ),
            "cache_hits": Counter(
                "cache_hits",
                "Cache hits per backend",
                ["backend"]
            ),
            "vector_search_time": Histogram(
                "vector_search_time",
                "Vector search latency in seconds",
                buckets=[0.01, 0.05, 0.1, 0.5]
            ),
            "memory_usage": Gauge(
                "memory_usage_bytes",
                "Resident memory of the agent process"
            )
        }

    async def record_request(
        self,
        start_time: float,
        request: Dict[str, Any],
        response: Dict[str, Any]
    ):
        # 1. Record latency
        latency = time.time() - start_time
        self.metrics["latency"].observe(latency)

        # 2. Record token usage
        tokens = self._count_tokens(
            request,
            response
        )
        self.metrics["token_usage"].labels(
            model=response["model"]
        ).inc(tokens)

        # 3. Update the remaining metrics
        self._update_metrics(
            request,
            response
        )

    def _count_tokens(
        self,
        request: Dict[str, Any],
        response: Dict[str, Any]
    ) -> int:
        try:
            encoding = tiktoken.encoding_for_model(
                response["model"]
            )
        except KeyError:
            # Non-OpenAI models (e.g. codellama-7b) are unknown to tiktoken
            encoding = tiktoken.get_encoding("cl100k_base")
        return len(encoding.encode(
            request["input"]
        )) + len(encoding.encode(
            response["output"]
        ))
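Wiring the monitor into the request path is straightforward. A sketch of how I would wrap `OptimizedAgent.process`; the module-level `monitor` instance and the `handle_request` wrapper are illustrative, not part of the agent above:

import time

monitor = PerformanceMonitor()

async def handle_request(agent: OptimizedAgent, request: dict) -> dict:
    start = time.time()
    response = await agent.process(request)
    # Record latency, token usage, and the remaining metrics
    await monitor.record_request(start, request, response)
    return response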
Cost Optimization
Performance optimization also has to be weighed against cost:
from typing import Any, Dict

class CostOptimizer:
    def __init__(self):
        self.budget = {
            "daily_limit": 100,  # USD
            "model_weights": {
                "gpt-4": 0.3,
                "gpt-3.5-turbo": 0.6,
                "codellama-7b": 0.1
            }
        }

    async def optimize_request(
        self,
        request: Dict[str, Any]
    ) -> Dict[str, Any]:
        # 1. Check the budget
        if await self._is_over_budget():
            # Downgrade to a cheaper model
            request["model"] = "gpt-3.5-turbo"

        # 2. Optimize the prompt
        request["prompt"] = await self._optimize_prompt(
            request["prompt"]
        )

        # 3. Adjust generation parameters
        request["parameters"] = self._adjust_parameters(
            request["parameters"]
        )
        return request

    async def _is_over_budget(self) -> bool:
        usage = await self._get_daily_usage()
        return usage > self.budget["daily_limit"]

    async def _optimize_prompt(
        self,
        prompt: str
    ) -> str:
        # 1. Strip redundant content
        prompt = self._remove_redundancy(prompt)
        # 2. Compress the context
        prompt = await self._compress_context(prompt)
        # 3. Tighten the instructions
        prompt = self._optimize_instructions(prompt)
        return prompt

    def _adjust_parameters(
        self,
        params: Dict[str, Any]
    ) -> Dict[str, Any]:
        # Rein in spend during peak hours
        if self._is_peak_time():
            params["temperature"] = 0.3
            params["max_tokens"] = min(
                params.get("max_tokens", 2000),
                1000
            )
        return params
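The `_get_daily_usage` helper is left out above. One simple way to track spend is a per-day counter in Redis, bumped after every model call with the `cost_per_token` values from the model pool. A rough sketch; the `CostTracker` class and its key naming are my own, not from the production code:

import datetime

class CostTracker:
    def __init__(self, redis):
        self.redis = redis  # an aioredis / redis.asyncio client

    def _key(self) -> str:
        return f"llm_cost:{datetime.date.today().isoformat()}"

    async def add(self, tokens: int, cost_per_token: float):
        # Accumulate today's spend in USD
        await self.redis.incrbyfloat(self._key(), tokens * cost_per_token)

    async def get_daily_usage(self) -> float:
        value = await self.redis.get(self._key())
        return float(value) if value else 0.0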
Lessons Learned
Optimizing the AI Agent's performance taught me a few things:
Optimize layer by layer
- Get the architecture right
- Get the caching strategy right
- Mind the code-level details
Invest in monitoring
- Real-time performance monitoring
- Cost and usage tracking
- Catch problems early
Balance is everything
- Performance vs. cost
- Latency vs. accuracy
- Generality vs. specialization
Closing Thoughts
Performance optimization is an ongoing process of monitoring, analysis, and improvement. It's like tuning a race car: you want more speed while keeping fuel consumption in check.
In the next post, I'll walk through building a complete AI Agent application. If you have your own ideas about performance optimization, feel free to share them in the comments.