According to the official ChatGLM-6B tutorial on Bilibili (【官方教程】ChatGLM-6B 微调:P-Tuning,LoRA,Full parameter), the gist is that after fine-tuning on new data the model tends to forget what it learned earlier. The following script launches P-Tuning v2 fine-tuning of the INT4-quantized ChatGLM2-6B on the AdvertiseGen dataset on a single GPU:
```bash
export PRE_SEQ_LEN=128   # length of the trainable prefix (P-Tuning v2)
export LR=2e-2
export NUM_GPUS=1
# export WANDB_MODE=dryrun
export WANDB_DISABLED=true
torchrun --standalone --nnodes=1 --nproc-per-node=$NUM_GPUS ptuning/main.py \
--do_train \
--train_file data/AdvertiseGen/train.json \
--validation_file data/AdvertiseGen/dev.json \
--preprocessing_num_workers 4 \
--prompt_column content \
--response_column summary \
--overwrite_cache \
--model_name_or_path /home/geekplusa/ai/models/bigmodels/prepare_models/chatglm/chatglm2-6b-32k-int4 \
--output_dir /home/geekplusa/ai/models/bigmodels/train_models/chatglm/chatglm2/adgen-chatglm2-6b-pt-$PRE_SEQ_LEN-$LR \
--overwrite_output_dir \
--max_source_length 64 \
--max_target_length 256 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 16 \
--predict_with_generate \
--max_steps 2000 \
--logging_steps 10 \
--save_steps 500 \
--learning_rate $LR \
--pre_seq_len $PRE_SEQ_LEN \
--quantization_bit 4   # run the frozen base model in 4-bit quantization
```
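The flags `--prompt_column content` and `--response_column summary` assume that every line of the train/dev files is a JSON object with a `content` field (the model input) and a `summary` field (the target text), as in the AdvertiseGen data. Below is a minimal sketch of producing such a file; the records are made up and stand in for the real dataset:

```python
import json
import os

# Hypothetical records; the real AdvertiseGen data pairs product attributes with ad copy.
examples = [
    {"content": "类型#裙*颜色#红色*风格#复古", "summary": "这款红色复古半身裙优雅大方。"},
    {"content": "类型#上衣*材质#牛仔布", "summary": "经典牛仔上衣,百搭又耐穿。"},
]

# One JSON object per line, matching --train_file data/AdvertiseGen/train.json.
os.makedirs("data/AdvertiseGen", exist_ok=True)
with open("data/AdvertiseGen/train.json", "w", encoding="utf-8") as f:
    for ex in examples:
        f.write(json.dumps(ex, ensure_ascii=False) + "\n")
```

The multi-GPU variant below launches the same kind of job in the background with nohup, against the smaller train_min/dev_min files and only 100 steps: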
```bash
export PRE_SEQ_LEN=128
export LR=2e-2
export NUM_GPUS=4
# export WANDB_MODE=dryrun
export WANDB_DISABLED=true
nohup torchrun --standalone --nnodes=1 --nproc-per-node=$NUM_GPUS ptuning/main.py \
    --do_train \
    --train_file data/AdvertiseGen/train_min.json \
    --validation_file data/AdvertiseGen/dev_min.json \
    --preprocessing_num_workers 4 \
    --prompt_column content \
    --response_column summary \
    --overwrite_cache \
    --model_name_or_path models/chatglm2-6b-int4 \
    --output_dir models/adgen-chatglm2-6b-int4-pt-128-2e-e \
    --overwrite_output_dir \
    --max_source_length 64 \
    --max_target_length 256 \
    --per_device_train_batch_size 1 \
    --per_device_eval_batch_size 1 \
    --gradient_accumulation_steps 16 \
    --predict_with_generate \
    --max_steps 100 \
    --logging_steps 10 \
    --save_steps 50 \
    --learning_rate $LR \
    --pre_seq_len $PRE_SEQ_LEN \
    --quantization_bit 4 > log 2>&1 &
```
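For a quick end-to-end check, the single-GPU variant below pins the job to one card with CUDA_VISIBLE_DEVICES=0, allows longer outputs (--max_target_length 1000), and runs only a single step (--max_steps 1, --save_steps 1) so a checkpoint is produced almost immediately: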
```bash
export WANDB_DISABLED=true
export PRE_SEQ_LEN=128
export LR=2e-2
CUDA_VISIBLE_DEVICES=0 python ptuning/main.py \
--do_train \
--train_file data/AdvertiseGen/train_min.json \
--validation_file data/AdvertiseGen/val_min.json \
--prompt_column content \
--response_column summary \
--overwrite_cache \
--model_name_or_path /home/geekplusa/ai/models/bigmodels/prepare_models/chatglm/chatglm2-6b-int4 \
--output_dir /home/geekplusa/ai/models/bigmodels/train_models/chatglm/chatglm2/adgen-chatglm2-6b-pt-医院1-$PRE_SEQ_LEN-$LR \
--overwrite_output_dir \
--max_source_length 64 \
--max_target_length 1000 \
--per_device_train_batch_size 1 \
--per_device_eval_batch_size 1 \
--gradient_accumulation_steps 16 \
--predict_with_generate \
--max_steps 1 \
--logging_steps 10 \
--save_steps 1 \
--learning_rate 2e-2 \
--pre_seq_len 128 \
--quantization_bit 4
```
The training runs above write checkpoints (e.g. checkpoint-500) under --output_dir every --save_steps steps. The deployment part below covers four modes: base model on a single GPU, base model on multiple GPUs, P-Tuning model on a single GPU, and P-Tuning model on multiple GPUs.
See the script web_demo2.py below:

```python
from transformers import AutoModel, AutoTokenizer
from transformers import AutoConfig
import streamlit as st
import os

st.set_page_config(
    page_title="ChatGLM2-6B Demo",
    page_icon=":robot:",
    layout='wide'
)


# Base model, single GPU
@st.cache_resource
def get_model_onegpu():
    tokenizer = AutoTokenizer.from_pretrained("models/chatglm2-6b-int4", trust_remote_code=True)
    model = AutoModel.from_pretrained("models/chatglm2-6b-int4", trust_remote_code=True).cuda()
    return tokenizer, model


# Base model, multiple GPUs
@st.cache_resource
def get_model_mitugpu():
    tokenizer = AutoTokenizer.from_pretrained("models/chatglm2-6b-int4", trust_remote_code=True)
    from utils import load_model_on_gpus
    model = load_model_on_gpus("models/chatglm2-6b-int4", num_gpus=4)
    return tokenizer, model


# P-Tuning checkpoint, single GPU
@st.cache_resource
def get_model_ptuning_onegpu():
    tokenizer = AutoTokenizer.from_pretrained("models/chatglm2-6b-int4", trust_remote_code=True)
    import torch
    CHECKPOINT_PATH = "models/gukai/checkpoint-500/"
    config = AutoConfig.from_pretrained("models/chatglm2-6b-int4", trust_remote_code=True, pre_seq_len=128)
    model = AutoModel.from_pretrained("models/chatglm2-6b-int4", config=config, trust_remote_code=True)
    # Keep only the prefix-encoder weights from the P-Tuning checkpoint
    prefix_state_dict = torch.load(os.path.join(CHECKPOINT_PATH, "pytorch_model.bin"))
    new_prefix_state_dict = {}
    for k, v in prefix_state_dict.items():
        if k.startswith("transformer.prefix_encoder."):
            new_prefix_state_dict[k[len("transformer.prefix_encoder."):]] = v
    model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)
    model = model.quantize(4)
    model = model.cuda()
    model.transformer.prefix_encoder.float()  # keep the prefix encoder in fp32 after quantization
    return tokenizer, model


# P-Tuning checkpoint, multiple GPUs
@st.cache_resource
def get_model_ptuning_mutigpu():
    tokenizer = AutoTokenizer.from_pretrained("models/chatglm2-6b-int4", trust_remote_code=True)
    import torch
    from utils import load_model_on_gpus
    CHECKPOINT_PATH = "models/gukai/checkpoint-500/"
    prefix_state_dict = torch.load(os.path.join(CHECKPOINT_PATH, "pytorch_model.bin"))
    # prefix_state_dict = torch.load(os.path.join(CHECKPOINT_PATH, "pytorch_model.bin"), map_location=lambda storage, loc: storage.cuda(1))
    new_prefix_state_dict = {}
    for k, v in prefix_state_dict.items():
        if k.startswith("transformer.prefix_encoder."):
            new_prefix_state_dict[k[len("transformer.prefix_encoder."):]] = v
    model = load_model_on_gpus("models/chatglm2-6b-int4", num_gpus=4, pre_seq_len=128)
    model.transformer.prefix_encoder.load_state_dict(new_prefix_state_dict)
    model.transformer.prefix_encoder.float()
    model = model.quantize(4)
    # model = model.cuda()
    return tokenizer, model


@st.cache_resource
def get_model():
    tokenizer = AutoTokenizer.from_pretrained("models/gukai/checkpoint-500", trust_remote_code=True)
    model = AutoModel.from_pretrained("models/gukai/checkpoint-500", trust_remote_code=True).cuda()
    # For multi-GPU support, replace the line above with the two lines below and set num_gpus to your actual number of GPUs
    # from utils import load_model_on_gpus
    # model = load_model_on_gpus("THUDM/chatglm2-6b", num_gpus=2)
    model = model.eval()
    return tokenizer, model


# tokenizer, model = get_model()
tokenizer, model = get_model_ptuning_mutigpu()

st.title("ChatGLM2-6B")

max_length = st.sidebar.slider(
    'max_length', 0, 32768, 8192, step=1
)
top_p = st.sidebar.slider(
    'top_p', 0.0, 1.0, 0.8, step=0.01
)
temperature = st.sidebar.slider(
    'temperature', 0.0, 1.0, 0.8, step=0.01
)

if 'history' not in st.session_state:
    st.session_state.history = []
if 'past_key_values' not in st.session_state:
    st.session_state.past_key_values = None

# Replay the conversation so far
for i, (query, response) in enumerate(st.session_state.history):
    with st.chat_message(name="user", avatar="user"):
        st.markdown(query)
    with st.chat_message(name="assistant", avatar="assistant"):
        st.markdown(response)

with st.chat_message(name="user", avatar="user"):
    input_placeholder = st.empty()
with st.chat_message(name="assistant", avatar="assistant"):
    message_placeholder = st.empty()

prompt_text = st.text_area(label="User input",
                           height=100,
                           placeholder="Enter your prompt here")
button = st.button("Send", key="predict")

if button:
    input_placeholder.markdown(prompt_text)
    history, past_key_values = st.session_state.history, st.session_state.past_key_values
    # Stream the response token by token, reusing the cached past_key_values
    for response, history, past_key_values in model.stream_chat(tokenizer, prompt_text, history,
                                                                 past_key_values=past_key_values,
                                                                 max_length=max_length, top_p=top_p,
                                                                 temperature=temperature,
                                                                 return_past_key_values=True):
        message_placeholder.markdown(response)
    st.session_state.history = history
    st.session_state.past_key_values = past_key_values
```
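As written, the script uses the multi-GPU P-Tuning loader (`get_model_ptuning_mutigpu`); swap in `get_model_onegpu`, `get_model_mitugpu`, or `get_model_ptuning_onegpu` near the bottom to try the other three deployment modes. The demo is launched the usual Streamlit way, e.g. `streamlit run web_demo2.py` (optionally with `--server.port` to choose the port).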