Reference source: https://zhuanlan.zhihu.com/p/620885226#Chatglm-6B%E6%A8%A1%E5%9E%8B%E5%BE%AE%E8%B0%83
Toolkits for fine-tuning large models:
deepspeed: when GPU memory is not enough, the CPU fills in (ZeRO offloading). A sketch of the config dict consumed by the training code is shown after this list.
peft: see the official introduction on GitHub. Supported methods:
    LoRA (paper: LoRA: Low-Rank Adaptation of Large Language Models)
    Prefix Tuning (paper: Prefix-Tuning: Optimizing Continuous Prompts for Generation)
    P-Tuning v2 (paper: P-Tuning v2: Prompt Tuning Can Be Comparable to Fine-tuning Universally Across Scales and Tasks)
    P-Tuning (paper: GPT Understands, Too)
    Prompt Tuning (paper: The Power of Scale for Parameter-Efficient Prompt Tuning)
    AdaLoRA (paper: Adaptive Budget Allocation for Parameter-Efficient Fine-Tuning)
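The training code below reads its batch settings from a DeepSpeed-style config dict named conf. The original article does not reproduce that file, so the following is only a minimal sketch, assuming ZeRO stage 2 with optimizer states offloaded to CPU; the concrete values are illustrative, not the article's actual settings.

    # Minimal DeepSpeed-style config sketch (values are illustrative assumptions)
    conf = {
        "train_micro_batch_size_per_gpu": 2,   # per-GPU micro batch size, used by the DataLoader below
        "gradient_accumulation_steps": 4,      # used by the training loop below
        "zero_optimization": {
            "stage": 2,
            "offload_optimizer": {"device": "cpu"}   # "GPU not enough, CPU fills in": optimizer states live in CPU RAM
        },
        "fp16": {"enabled": True}
    }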
import os

import torch
from torch import optim
from torch.utils.data import DataLoader, RandomSampler
from peft import LoraConfig, get_peft_model

def print_trainable_parameters(model):
    # Report how many parameters will actually be updated versus the total parameter count.
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        num_params = param.numel()
        # Under DeepSpeed ZeRO-3 a partitioned parameter reports numel() == 0; fall back to ds_numel.
        if num_params == 0 and hasattr(param, "ds_numel"):
            num_params = param.ds_numel
        all_param += num_params
        if param.requires_grad:
            trainable_params += num_params
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}")
# ChatGLMForConditionalGeneration / ChatGLMTokenizer come from the ChatGLM-6B modeling and
# tokenization files shipped with the checkpoint (the import path depends on the project layout).
model = ChatGLMForConditionalGeneration.from_pretrained(args.model_dir)
tokenizer = ChatGLMTokenizer.from_pretrained(args.model_dir)
model = model.half().cuda()

# Freeze fine-tuning: only the last few transformer layers (23-27) remain trainable.
for name, param in model.named_parameters():
    if not any(nd in name for nd in ["layers.27", "layers.26", "layers.25", "layers.24", "layers.23"]):
        param.requires_grad = False
print_trainable_parameters(model)
# List the parameters that will be updated.
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)
# LoRA fine-tuning: wrap the base model with low-rank adapters on the attention projections.
config = LoraConfig(r=args.lora_r,
                    lora_alpha=32,
                    target_modules=["query_key_value"],
                    lora_dropout=0.1,
                    bias="none",
                    task_type="CAUSAL_LM",
                    inference_mode=False,
                    )
model = get_peft_model(model, config)

# P-Tuning-style variant (a separate setup from the LoRA wrapping above): keep only the
# prefix_encoder parameters trainable; ChatGLM creates this module when pre_seq_len is set,
# see the sketch below.
for name, param in model.named_parameters():
    if not any(nd in name for nd in ["prefix_encoder"]):
        param.requires_grad = False
print_trainable_parameters(model)
for name, param in model.named_parameters():
    if param.requires_grad:
        print(name)
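For the prefix_encoder branch to have any parameters at all, prefix tuning has to be enabled in the ChatGLM config before loading. A minimal sketch, assuming the standard ChatGLM-6B config fields pre_seq_len and prefix_projection; the value 128 is illustrative:

    # Sketch: enable ChatGLM's built-in prefix encoder (would replace the plain from_pretrained call above)
    from transformers import AutoConfig

    chatglm_config = AutoConfig.from_pretrained(args.model_dir, trust_remote_code=True)
    chatglm_config.pre_seq_len = 128          # number of learnable prefix tokens (illustrative value)
    chatglm_config.prefix_projection = False  # True adds an MLP reparameterization (P-Tuning v2 style)
    model = ChatGLMForConditionalGeneration.from_pretrained(args.model_dir, config=chatglm_config).half().cuda()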
Dataset sample (one JSON object per line; "text" holds the source passage and "answer" holds the target question; the sample is kept in the original Chinese):
{"text": "清热解毒口服液由生石膏、知母、紫花地丁、金银花、麦门冬、黄芩、玄参、连翘、龙胆草、生地黄、栀子、板蓝根组成。具有疏风解表、清热解毒利咽、生津止渴的功效,适用于治疗外感时邪、内有蕴热所致的身热汗出、头痛身痛、心烦口渴、微恶寒或反恶热、舌红、苔黄、脉数等症。现代临床主要用于治疗流行性感冒、流行性脑脊髓膜炎、肺炎等各种发热性疾病。口服液:每支10毫升,每次10~20毫升,每日3次。〔注意事项〕阳虚便澹者不宜使用。", "answer": "当代临床医学中,清热解毒口服液都能治疗哪些疾病?"}
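Seq2SeqDataSet and coll_fn come from the fine-tuning project and are not reproduced in the article. The following is only a simplified, hypothetical sketch of what such a dataset typically does with the sample above, assuming the common pattern of masking the prompt portion of the labels with -100 (special-token handling is glossed over; this is not the project's exact implementation):

    import json

    from torch.utils.data import Dataset

    class Seq2SeqDataSetSketch(Dataset):
        # Hypothetical simplified stand-in for the project's Seq2SeqDataSet.
        def __init__(self, data_path, tokenizer, max_len, max_src_len, prompt_text):
            self.examples = []
            with open(data_path, "r", encoding="utf-8") as f:
                for line in f:
                    sample = json.loads(line)
                    src_ids = tokenizer.encode(prompt_text + sample["text"], max_length=max_src_len, truncation=True)
                    tgt_ids = tokenizer.encode(sample["answer"], max_length=max_len - max_src_len, truncation=True)
                    input_ids = src_ids + tgt_ids
                    # Loss is computed only on the answer tokens; the prompt/source part is masked with -100.
                    labels = [-100] * len(src_ids) + tgt_ids
                    self.examples.append({"input_ids": input_ids, "labels": labels})

        def __len__(self):
            return len(self.examples)

        def __getitem__(self, idx):
            return self.examples[idx]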
# Seq2SeqDataSet and coll_fn (the padding collate function) are project-local helpers.
train_dataset = Seq2SeqDataSet(args.train_path, tokenizer, args.max_len, args.max_src_len, args.prompt_text)
train_dataloader = DataLoader(train_dataset,
                              batch_size=conf["train_micro_batch_size_per_gpu"],
                              sampler=RandomSampler(train_dataset),
                              collate_fn=coll_fn,
                              drop_last=True,
                              num_workers=0)
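coll_fn is likewise not shown in the article; a minimal sketch of a padding collate function under the same assumptions (pad input_ids with the tokenizer's pad id, pad labels with -100), labeled as a hypothetical stand-in:

    def coll_fn_sketch(batch, pad_token_id=0):
        # Hypothetical stand-in for the project's coll_fn: pad every sequence in the batch to the same length.
        # In practice, pass tokenizer.pad_token_id instead of the placeholder default.
        max_len = max(len(item["input_ids"]) for item in batch)
        input_ids, labels = [], []
        for item in batch:
            pad_len = max_len - len(item["input_ids"])
            input_ids.append(item["input_ids"] + [pad_token_id] * pad_len)
            labels.append(item["labels"] + [-100] * pad_len)   # -100 is ignored by the loss
        return {"input_ids": torch.tensor(input_ids, dtype=torch.long),
                "labels": torch.tensor(labels, dtype=torch.long)}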
# TODO changed: DeepSpeed initialization is commented out and replaced with a plain PyTorch optimizer.
'''model_engine, optimizer, _, _ = deepspeed.initialize(config=conf,
                                                         model=model,
                                                         model_parameters=model.parameters())'''
model_engine = model
# optimizer = model.parameters()
optimizer = optim.Adam(model.parameters(), lr=1e-5)
model_engine.train()
global_step = 0
for i_epoch in range(args.num_train_epochs):
    train_iter = iter(train_dataloader)
    for step, batch in enumerate(train_iter):
        input_ids = batch["input_ids"].cuda()
        labels = batch["labels"].cuda()
        outputs = model_engine.forward(input_ids=input_ids, labels=labels)
        loss = outputs[0]
        # Scale the loss when accumulating gradients over several micro-batches.
        if conf["gradient_accumulation_steps"] > 1:
            loss = loss / conf["gradient_accumulation_steps"]
        # TODO commented out: DeepSpeed backward replaced with plain backward
        # model_engine.backward(loss)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        if (step + 1) % conf["gradient_accumulation_steps"] == 0:
            # TODO commented out: DeepSpeed step replaced with plain optimizer step
            # model_engine.step()
            optimizer.step()
            optimizer.zero_grad()  # clear accumulated gradients after each optimizer update
            global_step += 1
            if global_step % args.log_steps == 0:
                print("loss:{}, global_step:{}".format(float(loss.item()), global_step))
    # Save a checkpoint at the end of each epoch, named by the current global step.
    save_dir = os.path.join(args.output_dir, f"global_step-{global_step}")
    model_engine.save_pretrained(save_dir)
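To restore the DeepSpeed path that the TODO comments disabled, the commented-out calls go back in place of the plain optimizer. A minimal sketch, assuming the conf dict from above and that deepspeed is installed:

    import deepspeed

    # Re-enable DeepSpeed: the engine owns the optimizer, loss scaling and gradient accumulation.
    model_engine, optimizer, _, _ = deepspeed.initialize(config=conf,
                                                         model=model,
                                                         model_parameters=model.parameters())
    model_engine.train()
    # ...inside the training loop, replace the plain backward/step with:
    #     model_engine.backward(loss)
    #     model_engine.step()
    # DeepSpeed applies gradient_accumulation_steps itself, so the manual loss scaling and
    # optimizer.zero_grad() are not needed on this path.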