LLMs are all the rage right now and get wrapped and served in countless ways, but every inference run has to reload the model, which takes a long time. The fix is to keep the model resident in a server process, so individual inference calls no longer pay the loading cost.
I prefer the Flask framework; it feels more down-to-earth and is easy to use.
The client and server are separated, so you can send an inference request whenever you need one.
# start_server.py
from fastapi import FastAPI, Request
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig
import uvicorn
import torch

DEVICE = "cuda"
DEVICE_ID = "0"
CUDA_DEVICE = f"{DEVICE}:{DEVICE_ID}" if DEVICE_ID else DEVICE
print(CUDA_DEVICE)

def torch_gc():
    # Release cached GPU memory after each request
    if torch.cuda.is_available():
        with torch.cuda.device(CUDA_DEVICE):
            torch.cuda.empty_cache()
            torch.cuda.ipc_collect()
app = FastAPI()

@app.post("/")
async def create_item(request: Request):
    global model, tokenizer
    json_post = await request.json()
    prompt = json_post.get('prompt')  # input text
    ##### Model inference starts here; replace with your own code #####
    inputs = tokenizer(
        prompt,
        add_special_tokens=False,
        return_tensors="pt"
    )
    generation_output = model.generate(
        input_ids=inputs["input_ids"].to(CUDA_DEVICE),
        **generation_config
    )
    # Decode each generated sequence and keep only the text after the last "Assistant:\n"
    for i, line in enumerate(generation_output):
        generate_text = tokenizer.decode(line, skip_special_tokens=True)
        infer_result = generate_text.split('Assistant:\n')[-1].strip() + '\n'
    ##### Model inference ends here; replace with your own code #####
    answer = {
        "response": infer_result,  # the result returned to the client
    }
    torch_gc()
    return answer
if __name__ == '__main__':
    # Load your model below
    tokenizer = AutoTokenizer.from_pretrained('/workspace/BELLE/train/dataset/llama2_7b_chat_hf')
    tokenizer.pad_token_id = 0
    tokenizer.bos_token_id = 1
    tokenizer.eos_token_id = 2
    tokenizer.padding_side = "left"
    load_type = torch.float16  # sometimes torch.float32 may be needed
    model_config = AutoConfig.from_pretrained('/workspace/BELLE/train/dataset/llama2_7b_chat_hf')
    model = AutoModelForCausalLM.from_pretrained('/workspace/BELLE/train/output/saved_models/llama2_7b_chat_hf/checkpoint-67008', torch_dtype=load_type, config=model_config, device_map='auto')
    model.eval()
    # Generation arguments passed to model.generate above; the values below are only examples, tune them for your model
    generation_config = dict(max_new_tokens=512, do_sample=False)
    # Load your model above
    uvicorn.run(app, host='0.0.0.0', port=7999, workers=1)
'''
curl -X POST "http://172.23.148.46:7999" -H 'Content-Type: application/json' -d '{"prompt": "你好", "history": []}'
'''
The client code is as follows:
# client.py
import requests

# Request URL: point this at the machine running start_server.py
url = "http://127.0.0.1:7999"
# Request headers
headers = {
    "Content-Type": "application/json"
}
# Request body
data = {
    "prompt": "Human: \nYou are now a proficient people, \n\nAssistant:\n",
    # "history": []
}
# Send the POST request
response = requests.post(url, headers=headers, json=data)
# Print the response
print(response.text)
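To check the benefit described at the top, that the model is loaded once when the server starts and every later call only pays inference time, you can time a few consecutive requests. The following is a minimal sketch under the assumption that the FastAPI server above is already running locally on port 7999; the file name and the prompts are illustrative.

# time_requests.py (illustrative sketch, not part of the original scripts)
import time
import requests

url = "http://127.0.0.1:7999"  # assumes the FastAPI server above is listening here
prompts = [
    "Human: \nHello, \n\nAssistant:\n",
    "Human: \nTell me a joke, \n\nAssistant:\n",
]

for prompt in prompts:
    start = time.perf_counter()
    resp = requests.post(url, json={"prompt": prompt})
    elapsed = time.perf_counter() - start
    # Each call costs only inference time; the model was loaded once at server startup
    print(f"{elapsed:.2f}s -> {resp.json()['response']!r}")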
If you use Flask instead, remember to install the required libraries: pip install Flask requests.
Again, service startup and inference are split into two scripts. Here is a simple example in which start_server.py starts the service and client.py sends the inference requests:
# start_server.py
from flask import Flask, request, jsonify
from transformers import GPT2LMHeadModel, GPT2Tokenizer

app = Flask(__name__)

# Model initialization starts here; load your own model
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Model initialization ends here

@app.route('/infer', methods=['POST'])
def infer():
    data = request.json
    input_text = data.get('input_text', '')
    # Handle the inference request
    result = generate_response(input_text)
    return jsonify({'result': result})

def generate_response(input_text):
    # Inference logic starts here; replace with your own model
    # For example, generate text with a pretrained GPT-2 model
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    output = model.generate(input_ids, max_length=100, num_return_sequences=1)
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text
    # Inference logic ends here

if __name__ == '__main__':
    app.run(host='0.0.0.0', port=5000)
The client code is as follows:
# client.py
import requests

def run_client(input_text):
    url = 'http://localhost:5000/infer'
    data = {'input_text': input_text}
    response = requests.post(url, json=data)
    if response.status_code == 200:  # 200 means the request succeeded
        result = response.json()
        print(result['result'])
    else:
        print('Error:', response.status_code)

if __name__ == '__main__':
    input_text = "Human: \nYou are now a proficient people, \n\nAssistant:\n"
    run_client(input_text)
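Because the model stays resident in the Flask process, the same client can keep sending requests without any reload cost. Below is a small sketch of that pattern that reuses one requests.Session for connection reuse; the file name, the prompt list, and the 30-second timeout are illustrative choices, not part of the original code.

# batch_client.py (illustrative sketch)
import requests

url = 'http://localhost:5000/infer'
prompts = [
    "Human: \nYou are now a proficient people, \n\nAssistant:\n",
    "Human: \nSummarize the plot of Hamlet. \n\nAssistant:\n",
]

with requests.Session() as session:  # reuse one connection for all requests
    for prompt in prompts:
        response = session.post(url, json={'input_text': prompt}, timeout=30)
        if response.status_code == 200:
            print(response.json()['result'])
        else:
            print('Error:', response.status_code)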
I ran into this exact problem recently, so this post is a short summary. I will keep refining the code and updating it.