GPTCache serves two main purposes: saving money and saving time. Calling a capable commercial LLM such as ChatGPT through its API is billed per token, and it is not cheap. On top of that, LLMs generate results slowly, which hurts the user experience of any third-party application built on such calls. If we cache questions together with the answers the model returned, then the next time the same or a similar question is asked we can look it up in the cache and return the result directly, saving both money and time. GPTCache provides exactly this capability.
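Concretely, the idea boils down to putting question/answer pairs into the cache and getting them back when the question repeats. A minimal sketch with GPTCache's put/get API (the same gptcache.adapter.api functions used later in this article), using the simplest exact-match setup:
from gptcache import cache
from gptcache.adapter.api import put, get
from gptcache.processor.pre import get_prompt
cache.init(pre_embedding_func=get_prompt)  # simplest setup: exact-match cache
put("what is GPTCache", "GPTCache is a semantic cache for LLM responses.")
print(get("what is GPTCache"))  # answered from the cache, no LLM call needed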
Neither the GPTCache GitHub project nor its documentation explains how to cache a custom LLM, which greatly limits GPTCache's flexibility. LangChain mentions how to use GPTCache with a custom LLM and with a custom embedding model, but the write-up is thin and takes some trial and error. This article gives the working method and examples directly.
from typing import Any, List, Mapping, Optional
import json
import time

import numpy as np
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain.llms.base import LLM
from transformers import AutoTokenizer, AutoModel
from sentence_transformers import SentenceTransformer

from gptcache.core import cache, Cache
from gptcache.adapter.api import put, get
from gptcache.adapter.langchain_models import LangChainLLMs
from gptcache.manager import get_data_manager, CacheBase, VectorBase
from gptcache.processor.post import temperature_softmax
from gptcache.processor.pre import last_content, get_prompt
from gptcache.session import Session
from gptcache.similarity_evaluation.distance import SearchDistanceEvaluation

# Load ChatGLM3 once; the custom LLM wrapper below calls it directly
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True).half().cuda()
model = model.eval()
def response_text(resp):
    # Helper for parsing the answer from an HTTP-served LLM; not used in this local ChatGLM example
    return resp.json()["data"]["answer"]
class CUSTOMLLM(LLM):
@property
def _llm_type(self) -> str:
return "custom"
def _call(
self,
prompt: str,
stop: Optional[List[str]] = None,
run_manager: Optional[CallbackManagerForLLMRun] = None,
**kwargs: Any,
) -> str:
response, history = model.chat(tokenizer, prompt, history=[])
return response
class BGE:
    # Custom embedding model backed by a local SentenceTransformer (BGE) model
    def __init__(self, model_name: str = "BAAI/bge-large-zh-v1.5", **kwargs):
        self.model_name = model_name
        if model_name in self.dim_dict():
            self.__dimension = self.dim_dict()[model_name]
        else:
            self.__dimension = None
        self.model = SentenceTransformer(self.model_name)
    def to_embeddings(self, data, **_):
        # Use the instance's own model, not the global ChatGLM `model`
        embeddings = self.model.encode(data, normalize_embeddings=True)
        return np.array(embeddings).astype('float32')
@property
def dimension(self):
if not self.__dimension:
foo_emb = self.to_embeddings("foo")
self.__dimension = len(foo_emb)
return self.__dimension
@staticmethod
def dim_dict():
        # Output dimension of each supported embedding model
return {
"BAAI/bge-large-zh-v1.5": 1024
}
def custom_data_process(data_path, cache):
test_queries = []
data = json.load(open(data_path))
for line in data:
result = line["result"]
query = line["query"]
test_queries.append(query)
positive_queries = line["positive_queries"]
for pos_query_score in positive_queries:
pos_query = pos_query_score[0]
            score = pos_query_score[1]  # similarity score from the dataset; not needed when seeding the cache
put(pos_query, result, cache_obj=cache)
return test_queries
def main():
llm = CUSTOMLLM()
question = "货币的本质是什么?"
question = "介绍一下货币的定义"
llm_cache = Cache()
session = Session()
bge = BGE()
    # Note: embedding models with different output dimensions must not share the same
    # vector store, otherwise searching it will raise an error.
data_manager = get_data_manager(CacheBase("sqlite"), VectorBase("faiss", dimension=bge.dimension), max_size=100000)
    # cached_llm = LangChainLLMs(llm=llm, session=session)
    cached_llm = LangChainLLMs(llm=llm)
llm_cache.init(
embedding_func=bge.to_embeddings,
data_manager=data_manager,
# pre_embedding_func=last_content,
pre_embedding_func=get_prompt,
post_process_messages_func=temperature_softmax,
        similarity_evaluation=SearchDistanceEvaluation(),  # recalled candidates can be filtered by the configured max distance
)
put("货币的定义", "货币是指被政府或国家认可,并用于购买商品、支付债务和服务的一种广义支付手段。它可以是纸币、硬币或数字形式", cache_obj=llm_cache)
    # Each record in the data_path file looks like
    # {"query": ..., "result": ..., "positive_queries": [[query, score], ...]}
    data_path = "xxxxxx"
    # Seed the cache with the prepared data
test_queries = custom_data_process(data_path, llm_cache)
"""
for _ in range(2):
start_time = time.time()
        # top_k=3 sets how many candidates are recalled, but because the default
        # post_process_messages_func keeps only one of them, a single answer is returned.
        # To return multiple answers, supply a custom post_process_messages_func.
response = cached_llm(question, cache_obj=llm_cache, top_k=3)
print(f'Question: {question}')
print("Time consuming: {:.2f}s".format(time.time() - start_time))
print(f'Answer: {response}\n')
"""
for i in range(3):
response = cached_llm(test_queries[i], cache_obj=llm_cache, top_k=3)
print(f'Question: {test_queries[i]}')
print(f'Answer: {response}\n')
if __name__ == '__main__':
main()
GPTCache project: https://github.com/zilliztech/GPTCache
import os
import time
import openai
def response_text(openai_resp):
return openai_resp['choices'][0]['message']['content']
question = "what's chatgpt"
# OpenAI API original usage
openai.api_key = os.getenv("OPENAI_API_KEY")
start_time = time.time()
response = openai.ChatCompletion.create(
model='gpt-3.5-turbo',
messages=[
{
'role': 'user',
'content': question
}
],
)
print(f'Question: {question}')
print("Time consuming: {:.2f}s".format(time.time() - start_time))
print(f'Answer: {response_text(response)}\n')
import time
def response_text(openai_resp):
return openai_resp['choices'][0]['message']['content']
print("Cache loading.....")
# To use GPTCache, that's all you need
# -------------------------------------------------
from gptcache import cache
from gptcache.adapter import openai
cache.init()
cache.set_openai_key()
# -------------------------------------------------
question = "what's github"
for _ in range(2):
start_time = time.time()
response = openai.ChatCompletion.create(
model='gpt-3.5-turbo',
messages=[
{
'role': 'user',
'content': question
}
],
)
print(f'Question: {question}')
print("Time consuming: {:.2f}s".format(time.time() - start_time))
print(f'Answer: {response_text(response)}\n')
import time
def response_text(openai_resp):
return openai_resp['choices'][0]['message']['content']
from gptcache import cache
from gptcache.adapter import openai
from gptcache.embedding import Onnx
from gptcache.manager import CacheBase, VectorBase, get_data_manager
from gptcache.similarity_evaluation.distance import SearchDistanceEvaluation
print("Cache loading.....")
onnx = Onnx()
data_manager = get_data_manager(CacheBase("sqlite"), VectorBase("faiss", dimension=onnx.dimension))
cache.init(
embedding_func=onnx.to_embeddings,
data_manager=data_manager,
similarity_evaluation=SearchDistanceEvaluation(),
)
cache.set_openai_key()
questions = [
"what's github",
"can you explain what GitHub is",
"can you tell me more about GitHub",
"what is the purpose of GitHub"
]
for question in questions:
start_time = time.time()
response = openai.ChatCompletion.create(
model='gpt-3.5-turbo',
messages=[
{
'role': 'user',
'content': question
}
],
)
print(f'Question: {question}')
print("Time consuming: {:.2f}s".format(time.time() - start_time))
print(f'Answer: {response_text(response)}\n')
import time
from gptcache import cache, Config
from gptcache.manager import manager_factory
from gptcache.embedding import Onnx
from gptcache.processor.post import temperature_softmax
from gptcache.similarity_evaluation.distance import SearchDistanceEvaluation
from gptcache.adapter import openai
cache.set_openai_key()
onnx = Onnx()
data_manager = manager_factory("sqlite,faiss", vector_params={"dimension": onnx.dimension})
cache.init(
embedding_func=onnx.to_embeddings,
data_manager=data_manager,
similarity_evaluation=SearchDistanceEvaluation(),
post_process_messages_func=temperature_softmax
)
# cache.config = Config(similarity_threshold=0.2)
question = "what's github"
for _ in range(3):
start = time.time()
response = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
temperature = 1.0, # Change temperature here
messages=[{
"role": "user",
"content": question
}],
)
print("Time elapsed:", round(time.time() - start, 3))
print("Answer:", response["choices"][0]["message"]["content"])
To use GPTCache with a custom LLM, first create a custom LLM class that inherits from LangChain's LLM and override the three methods shown below (_llm_type, _call, and _identifying_params):
from typing import Any, List, Mapping, Optional
from langchain.callbacks.manager import CallbackManagerForLLMRun
from langchain_core.language_models.llms import LLM
class CustomLLM(LLM):
n: int
@property
def _llm_type(self) -> str:
return "custom"
def _call(
self,
prompt: str,
stop: Optional[List[str]] = None,
run_manager: Optional[CallbackManagerForLLMRun] = None,
**kwargs: Any,
) -> str:
if stop is not None:
raise ValueError("stop kwargs are not permitted.")
return prompt[: self.n]
@property
def _identifying_params(self) -> Mapping[str, Any]:
"""Get the identifying parameters."""
return {"n": self.n}
The key point when overriding _call is that it only needs to return the LLM's output as a str, so inside _call you can call an LLM service directly or use any other framework to generate the result. For example, calling the open-source model ChatGLM:
from langchain.llms.base import LLM
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True)
model = AutoModel.from_pretrained("THUDM/chatglm3-6b", trust_remote_code=True).half().cuda()
model = model.eval()
class CUSTOMLLM(LLM):
@property
def _llm_type(self) -> str:
return "custom"
def _call(
self,
prompt: str,
stop: Optional[List[str]] = None,
run_manager: Optional[CallbackManagerForLLMRun] = None,
**kwargs: Any,
) -> str:
response, history = model.chat(tokenizer, prompt, history=[])
return response
The embedding function is used for similarity search. GPTCache currently supports five options: OpenAI, Cohere, Huggingface, ONNX, and SentenceTransformers, and it also ships a default string-embedding method. A custom embedding function can be defined as well; see "How to set embedding function" in the GPTCache examples.
The main work is to implement a to_embeddings() method in a custom embedding class and assign it to embedding_func in llm_cache.init(). Taking the BGE model as an example:
import numpy as np
from sentence_transformers import SentenceTransformer
class BGE:
    # Custom embedding model backed by a local SentenceTransformer (BGE) model
    def __init__(self, model_name: str = "BAAI/bge-large-zh-v1.5", **kwargs):
        self.model_name = model_name
        if model_name in self.dim_dict():
            self.__dimension = self.dim_dict()[model_name]
        else:
            self.__dimension = None
        self.model = SentenceTransformer(self.model_name)
    def to_embeddings(self, data, **_):
        # Use the instance's own model rather than a global `model` variable
        embeddings = self.model.encode(data, normalize_embeddings=True)
        return np.array(embeddings).astype('float32')
@property
def dimension(self):
if not self.__dimension:
foo_emb = self.to_embeddings("foo")
self.__dimension = len(foo_emb)
return self.__dimension
@staticmethod
def dim_dict():
        # Output dimension of each supported embedding model
return {
"BAAI/bge-large-zh-v1.5": 1024
}
Cache storage holds all scalar data, such as the original question, the prompt, the answer, and access times. GPTCache currently supports SQLite, MySQL, and PostgreSQL, and NoSQL backends are planned.
The vector store component stores and searches all embeddings to find the semantically most similar results. GPTCache can use a vector-search library such as FAISS or a vector database such as Milvus, and more vector databases and cloud services will be added.
Examples:
## create the default data manager (an in-memory map)
data_manager = get_data_manager()
## create data manager with sqlite and faiss
data_manager = get_data_manager(CacheBase("sqlite"), VectorBase("faiss", dimension=128))
## create data manager with mysql and milvus, max cache size is 100
data_manager = get_data_manager(CacheBase("mysql"), VectorBase("milvus", dimension=128), max_size=100)
## create data manager with mysql and milvus, max cache size is 100, eviction policy is LRU
data_manager = get_data_manager(CacheBase("mysql"), VectorBase("milvus", dimension=128), max_size=100, eviction='LRU')
GPTCache supports evicting cached data based on the number of cached entries, using either an LRU or a FIFO policy. Other eviction policies, such as eviction by last access time or last write time, are planned for the future.
LRU (Least Recently Used) evicts entries based on their access history; the core idea is that data accessed recently is likely to be accessed again soon.
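To make the LRU idea concrete, here is a minimal, generic LRU cache sketch (for illustration only; it is independent of GPTCache's own eviction implementation):
from collections import OrderedDict
class LRUCache:
    # Minimal LRU cache: evicts the least recently used entry once max_size is exceeded
    def __init__(self, max_size: int = 100):
        self.max_size = max_size
        self._data = OrderedDict()
    def get(self, key):
        if key not in self._data:
            return None
        self._data.move_to_end(key)  # accessing an entry makes it the most recently used
        return self._data[key]
    def put(self, key, value):
        self._data[key] = value
        self._data.move_to_end(key)
        if len(self._data) > self.max_size:
            self._data.popitem(last=False)  # drop the least recently used entry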
The evaluation function determines whether a cached answer actually matches the input query. It takes three inputs: the user's request data, the cached data, and user-defined parameters. GPTCache currently supports three kinds of evaluation: exact match, embedding distance, and ONNX model evaluation.
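For orientation, the built-in evaluations are passed through the similarity_evaluation parameter; a short sketch, with class names as in recent GPTCache releases (check your installed version):
from gptcache.similarity_evaluation.exact_match import ExactMatchEvaluation
from gptcache.similarity_evaluation.distance import SearchDistanceEvaluation
# Exact match: only an identical request counts as a cache hit
exact_eval = ExactMatchEvaluation()
# Embedding distance: scores recalled candidates by their vector-search distance
distance_eval = SearchDistanceEvaluation()
# Pass either instance as similarity_evaluation=... in cache.init()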
To enable ONNX evaluation, simply pass an EvaluationOnnx instance as similarity_evaluation. This lets you run any model that runs on ONNX; support for PyTorch, TensorRT, and other inference engines is planned.
# Import paths below follow recent GPTCache releases; adjust if your version differs
from gptcache import cache
from gptcache.embedding import Onnx as EmbeddingOnnx
from gptcache.manager import get_data_manager, CacheBase, VectorBase
from gptcache.similarity_evaluation.onnx import OnnxModelEvaluation as EvaluationOnnx

onnx = EmbeddingOnnx()
data_manager = get_data_manager(CacheBase("sqlite"), VectorBase("faiss", dimension=onnx.dimension))
evaluation_onnx = EvaluationOnnx()
cache.init(
embedding_func=onnx.to_embeddings,
data_manager=data_manager,
similarity_evaluation=evaluation_onnx,
)
Reference: https://github.com/zilliztech/GPTCache/tree/main/examples#How-to-set-the-similarity-evaluation-interface
Other configuration options can also be passed in, for example a similarity threshold set through Config (see the sketch below).
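A minimal illustration, mirroring the commented-out Config line in the earlier temperature example:
from gptcache import cache, Config
# Recalled candidates whose evaluation score does not clear the threshold are treated as cache misses
cache.config = Config(similarity_threshold=0.8)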
GPTCache currently supports two LLM adapters: OpenAI and LangChain.
With the OpenAI adapter, you can specify which model to use and issue requests under the user role.
from gptcache import cache
from gptcache.adapter import openai

cache.init()
cache.set_openai_key()
question = "what's github"
answer = openai.ChatCompletion.create(
model='gpt-3.5-turbo',
messages=[
{
'role': 'user',
'content': question
}
],
)
print(answer)
Below is an example that uses OpenAI's streaming response API:
import os

from gptcache import cache
from gptcache.manager import get_data_manager
from gptcache.adapter import openai

cache.init(data_manager=get_data_manager())
os.environ["OPENAI_API_KEY"] = "API KEY"
cache.set_openai_key()
response = openai.ChatCompletion.create(
model='gpt-3.5-turbo',
messages=[
{'role': 'user', 'content': "What's 1+1? Answer in one word."}
],
temperature=0,
stream=True # this time, we set stream=True
)
# create variables to collect the stream of chunks
collected_chunks = []
collected_messages = []
# iterate through the stream of events
for chunk in response:
collected_chunks.append(chunk) # save the event response
chunk_message = chunk['choices'][0]['delta'] # extract the message
collected_messages.append(chunk_message) # save the message
full_reply_content = ''.join([m.get('content', '') for m in collected_messages])
If you want to use a different LLM, the LangChain adapter provides a standard interface to any LLM supported by LangChain:
template = """Question: {question}
Answer: Let's think step by step."""
prompt = PromptTemplate(template=template, input_variables=["question"])
llm = OpenAI()
question = "What NFL team won the Super Bowl in the year Justin Bieber was born?"
llm_cache = Cache()
llm_cache.init(
pre_embedding_func=get_prompt,
post_process_messages_func=postnop,
)
cached_llm = LangChainLLMs(llm)
answer = cached_llm(question, cache_obj=llm_cache)
cache_obj: customize the cache used for the request, i.e. pass your own Cache instance instead of relying on the global singleton cache:
from gptcache import Cache, Config
from gptcache.adapter import openai
from gptcache.embedding import Onnx
from gptcache.manager import get_data_manager, CacheBase, VectorBase

onnx = Onnx()
data_manager = get_data_manager(CacheBase("sqlite"), VectorBase("faiss", dimension=onnx.dimension))
one_cache = Cache()
one_cache.init(embedding_func=onnx.to_embeddings,
               data_manager=data_manager,
               evaluation_func=pair_evaluation,  # pair_evaluation: similarity function defined by the caller (see the GPTCache examples)
               config=Config(
                   similarity_threshold=1,
               ),
               )
question = "what do you think about chatgpt"
openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": question}
],
cache_obj=one_cache
)
cache_context: custom parameters can be passed to each internal step of the request individually:
question = "what do you think about chatgpt"
openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": question}
],
cache_context={
"pre_embedding_func": {},
"embedding_func": {},
"search_func": {},
"get_scalar_data": {},
"evaluation_func": {},
}
)
cache_skip: this option skips the cache search but still stores the result returned by the LLM:
question = "what do you think about chatgpt"
openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": question}
],
cache_skip=True
)
session: specify the session for the current request; you can also define rules to decide whether the session hits the cache. See the session example in the GPTCache repository for details:
from gptcache.session import Session
session = Session(name="my-session")
question = "what do you think about chatgpt"
openai.ChatCompletion.create(
model="gpt-3.5-turbo",
messages=[
{"role": "user", "content": question}
],
session=session
)
temperature: you can control the randomness of the output with a temperature between 0 and 2. A higher value such as 0.8 makes the output more random; for the same input, a lower value such as 0.2 makes the output more focused and consistent.
The temperature ranges over [0, 2] and defaults to 0.0.
In GPTCache, a higher temperature also means a higher chance of skipping the cache search and calling the LLM directly. At temperature 2 the request always goes straight to the LLM; at temperature 0 the cache is always searched first and the LLM is only called afterwards.
The default post_process_messages_func is temperature_softmax; see the API reference for how temperature affects its output. The skip-or-search behavior is sketched below.
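As a rough illustration of the rule described above (this is a sketch of the behavior, not GPTCache's actual implementation):
import random
def should_skip_cache(temperature: float) -> bool:
    # temperature == 0 -> always search the cache first
    # temperature == 2 -> always call the LLM directly
    # in between       -> skip the cache with a probability that grows with temperature
    # (illustrative only; GPTCache's real decision lives in its adapter code)
    if temperature <= 0:
        return False
    if temperature >= 2:
        return True
    return random.random() < temperature / 2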
GPTCache can now be run as a server with caching and conversation capabilities, and a customized GPTCache service can be started in just a few lines. Here is a simple example of building a GPTCache server and interacting with it; see the GPTCache documentation for the full details and parameters.
After installing GPTCache, start the server with:
$ gptcache_server -s 127.0.0.1 -p 8000
Or start the service with Docker:
$ docker pull zilliz/gptcache:latest
$ docker run -p 8000:8000 -it zilliz/gptcache:latest
Interacting with the server:
GPTCache supports two ways of interacting with the server: the HTTP API (for example via curl) and the Python client:
# put the data to cache
curl -X 'POST' \
'http://localhost:8000/put' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"prompt": "Hi",
"answer": "Hi back"
}'
# get the data from the cache
curl -X 'POST' \
'http://localhost:8000/get' \
-H 'accept: application/json' \
-H 'Content-Type: application/json' \
-d '{
"prompt": "Hi"
}'
>>> from gptcache.client import Client
>>> client = Client(uri="http://localhost:8000")
>>> client.put("Hi", "Hi back")
200
>>> client.get("Hi")
'Hi back'
When SSDataManager is chosen as the data_manager, most of the time is spent in get_scalar_data() of SSDataManager in data_manager.py, more precisely in cache_data = self.s.get_data_by_id(res_data[1]), i.e. when fetching the cached content by the retrieved id.
In adapter.py, get_scalar_data() is called once for every recalled item when assembling cache_data; if search_data_list contains many items this takes a long time, and the number of items is determined by the top_k you set:
for search_data in search_data_list:
cache_data = time_cal(
chat_cache.data_manager.get_scalar_data,
func_name="get_data",
report_func=chat_cache.report.data,
)(
search_data,
extra_param=context.get("get_scalar_data", None),
session=session,
)