Virtual Desktop Pet
ChatGLM
The bug found by zkz098 and the fix they provided
cd Project
git clone https://github.com/THUDM/ChatGLM2-6B.git
GIT_LFS_SKIP_SMUDGE=1 git clone https://huggingface.co/THUDM/chatglm2-6b
# With GIT_LFS_SKIP_SMUDGE=1 this clone only fetches LFS placeholder files, not the actual model weights
[Model files]: https://cloud.tsinghua.edu.cn/d/674208019e314311ab5c/?p=%2Fchatglm2-6b&mode=list
Download these files, then use them to replace the placeholder files in the chatglm2-6b directory cloned in the previous step.
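For example, assuming the repositories were cloned into ~/Project as above and the weight files from the Tsinghua cloud were saved to ~/Downloads/chatglm2-6b-weights (both paths are only placeholders for illustration), the replacement step might look like this:
# Hypothetical paths -- adjust to where you cloned chatglm2-6b and where the downloaded files ended up
cp ~/Downloads/chatglm2-6b-weights/* ~/Project/chatglm2-6b/
# Sanity check: real weight shards are on the order of GB each, the LFS placeholders are only a few hundred bytes
ls -lh ~/Project/chatglm2-6b/pytorch_model-*.bin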
2023/8/20
Because openai_api.py does not return the usage field, calls to the API fail with an error.
Modify the file as follows (the full modified openai_api.py is given below):
import time
import tiktoken
import torch
import uvicorn
from pydantic import BaseModel, Field
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from contextlib import asynccontextmanager
from typing import Any, Dict, List, Literal, Optional, Union
from transformers import AutoTokenizer, AutoModel
from sse_starlette.sse import ServerSentEvent, EventSourceResponse
@asynccontextmanager
async def lifespan(app: FastAPI):  # collects GPU memory
    yield
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
        torch.cuda.ipc_collect()

app = FastAPI(lifespan=lifespan)

app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
class ModelCard(BaseModel):
    id: str
    object: str = "model"
    created: int = Field(default_factory=lambda: int(time.time()))
    owned_by: str = "owner"
    root: Optional[str] = None
    parent: Optional[str] = None
    permission: Optional[list] = None

class ModelList(BaseModel):
    object: str = "list"
    data: List[ModelCard] = []
class ChatMessage(BaseModel):
    role: Literal["user", "assistant", "system"]
    content: str

class DeltaMessage(BaseModel):
    role: Optional[Literal["user", "assistant", "system"]] = None
    content: Optional[str] = None

class ChatCompletionRequest(BaseModel):
    model: str
    messages: List[ChatMessage]
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    max_length: Optional[int] = None
    stream: Optional[bool] = False

class ChatCompletionResponseChoice(BaseModel):
    index: int
    message: ChatMessage
    finish_reason: Literal["stop", "length"]

class ChatCompletionResponseStreamChoice(BaseModel):
    index: int
    delta: DeltaMessage
    finish_reason: Optional[Literal["stop", "length"]]

class ChatCompletionResponse(BaseModel):
    model: str
    object: Literal["chat.completion", "chat.completion.chunk"]
    choices: List[Union[ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice]]
    created: Optional[int] = Field(default_factory=lambda: int(time.time()))
    # usage is the field the client expects; it is optional so that streaming chunks,
    # which carry no token counts, still validate
    usage: Optional[dict] = None
@app.get("/v1/models", response_model=ModelList)
async def list_models():
    global model_args
    model_card = ModelCard(id="gpt-3.5-turbo")
    return ModelList(data=[model_card])

@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def create_chat_completion(request: ChatCompletionRequest):
    global model, tokenizer

    if request.messages[-1].role != "user":
        raise HTTPException(status_code=400, detail="Invalid request")
    query = request.messages[-1].content

    prev_messages = request.messages[:-1]
    if len(prev_messages) > 0 and prev_messages[0].role == "system":
        query = prev_messages.pop(0).content + query

    history = []
    if len(prev_messages) % 2 == 0:
        for i in range(0, len(prev_messages), 2):
            if prev_messages[i].role == "user" and prev_messages[i + 1].role == "assistant":
                history.append([prev_messages[i].content, prev_messages[i + 1].content])

    if request.stream:
        generate = predict(query, history, request.model)
        return EventSourceResponse(generate, media_type="text/event-stream")

    response, _ = model.chat(tokenizer, query, history=history)
    choice_data = ChatCompletionResponseChoice(
        index=0,
        message=ChatMessage(role="assistant", content=response),
        finish_reason="stop"
    )

    # Approximate token counts with tiktoken's gpt-3.5-turbo encoding so the response
    # carries the "usage" object the client expects; the numbers are estimates, since
    # ChatGLM2 uses its own tokenizer.
    encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")
    pt = len(encoding.encode(query))
    rt = len(encoding.encode(response))
    usage_data = {
        "prompt_tokens": pt,
        "completion_tokens": rt,
        "total_tokens": pt + rt
    }

    return ChatCompletionResponse(model=request.model, choices=[choice_data], object="chat.completion",
                                  usage=usage_data)
async def predict(query: str, history: List[List[str]], model_id: str):
    global model, tokenizer

    choice_data = ChatCompletionResponseStreamChoice(
        index=0,
        delta=DeltaMessage(role="assistant"),
        finish_reason=None
    )
    chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
    yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False))

    current_length = 0
    for new_response, _ in model.stream_chat(tokenizer, query, history):
        if len(new_response) == current_length:
            continue

        new_text = new_response[current_length:]
        current_length = len(new_response)

        choice_data = ChatCompletionResponseStreamChoice(
            index=0,
            delta=DeltaMessage(content=new_text),
            finish_reason=None
        )
        chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
        yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False))

    choice_data = ChatCompletionResponseStreamChoice(
        index=0,
        delta=DeltaMessage(),
        finish_reason="stop"
    )
    chunk = ChatCompletionResponse(model=model_id, choices=[choice_data], object="chat.completion.chunk")
    yield "{}".format(chunk.json(exclude_unset=True, ensure_ascii=False))
    yield '[DONE]'
if __name__ == "__main__":
    # "models/chatglm2-6b" -> change to the path where you placed the model files
    tokenizer = AutoTokenizer.from_pretrained("models/chatglm2-6b", trust_remote_code=True)
    model = AutoModel.from_pretrained("models/chatglm2-6b", trust_remote_code=True).quantize(8).half().cuda()
    # model = AutoModel.from_pretrained("models/chatglm2-6b", trust_remote_code=True).quantize(4).half().cuda()
    # model = AutoModel.from_pretrained("models/chatglm2-6b", trust_remote_code=True).half().cuda()
    # Multi-GPU support: use the two lines below instead of the line above,
    # and set num_gpus to the number of GPUs you actually have
    # from utils import load_model_on_gpus
    # model = load_model_on_gpus("THUDM/chatglm2-6b", num_gpus=2)
    model.eval()

    # The port must match the proxy_pass port in the nginx config below (18203 in this example)
    uvicorn.run(app, host='0.0.0.0', port=18203, workers=1, root_path="/ChatGLM/OpenAPI")
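Before putting nginx in front of the service, it can be sanity-checked locally. A minimal curl request (assuming the API is listening on port 18203 as configured above) would be:
curl http://127.0.0.1:18203/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "你好"}]}'
# The JSON reply should now include a "usage" object, e.g.
# "usage": {"prompt_tokens": 2, "completion_tokens": 20, "total_tokens": 22} (actual numbers will vary)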
Install nginx (skip this if it is already installed)
apt install nginx -y
Add the configuration (adjust to your actual setup)
# /etc/nginx/nginx.conf -> add the following inside the http{} block
include /etc/nginx/myHost/*.conf;
# Create the directory /etc/nginx/myHost (adjust to your setup)
cd /etc/nginx
mkdir myHost
cd myHost
# Create the file /etc/nginx/myHost/ChatGLM.conf
touch ChatGLM.conf
# Add the following to ChatGLM.conf
server {
    listen 8080;
    server_name i-2.gpushare.com;  # Change this to your own domain name
    location /ChatGLM/OpenAPI/ {  # Change this if you'd like to serve the API under a different path
        proxy_pass http://127.0.0.1:18203/;  # Change this if the API runs on a different port
        proxy_redirect off;
        proxy_http_version 1.1;
        proxy_set_header Upgrade $http_upgrade;
        proxy_set_header Connection "upgrade";
        proxy_set_header Host $host;
    }
}
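After saving the file, validate and reload nginx:
nginx -t                 # check the configuration for syntax errors
systemctl reload nginx   # apply the new config (or: nginx -s reload)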
conda create -n glm python=3.8 -y
conda activate glm
pip install -r requirements.txt
pip install tiktoken
nohup python openai_api.py &
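nohup writes the server output to nohup.out by default, so you can follow the startup there and confirm that the model has loaded and uvicorn is listening:
tail -f nohup.out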
Modify the desktop pet's ChatAPI settings: set api-url to your own URL (the address exposed through nginx); the key can be left empty.
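To verify the full chain through nginx, the same request can be sent to the public address (replace the host and port with your own); the base of that URL is what goes into the pet's api-url setting, in whatever exact form your ChatAPI plugin version expects:
curl http://i-2.gpushare.com:8080/ChatGLM/OpenAPI/v1/chat/completions \
  -H "Content-Type: application/json" \
  -d '{"model": "gpt-3.5-turbo", "messages": [{"role": "user", "content": "hello"}]}'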