torch.distributed
和ColossalAI再搞)、llama.cpp量化模型。在LoRA方法提出之前,已有很多方法尝试解决大模型微调的困境,其中有两个主要的方向:
hidden_state
。financial_sentiment_analysis:给定一个句子,要求识别出该句子是negative、positive还是neutral三者中的哪一个。
next(iter(train_dataloader)).keys()
Out[2]: dict_keys(['input_ids', 'attention_mask', 'labels'])
# train_dataset.data如下所示
input_ids: [[[486,7834,304,259,35610,...,0,0,0,0,0],[259,229832,259,277,263,...,0,0,0,0,0],...,[259,96890,259,5330,259,...,0,0,0,0,0],[486,5835,259,39509,259,...,0,0,0,0,0]],[[1494,1546,259,69541,259,...,0,0,0,0,0],[486,7495,13159,339,2847,...,0,0,0,0,0],...,[20871,72726,702,92223,332,...,0,0,0,0,0],[486,584,193394,347,11470,...,0,0,0,0,0]],[[274,298,259,62434,263,...,0,0,0,0,0],[1477,514,1904,259,263,...,0,0,0,0,0],...,[143129,268,259,277,263,...,0,0,0,0,0],[35446,339,31499,285,288,...,0,0,0,0,0]]]
attention_mask: [[[1,1,1,1,1,...,0,0,0,0,0],[1,1,1,1,1,...,0,0,0,0,0],...,[1,1,1,1,1,...,0,0,0,0,0],[1,1,1,1,1,...,0,0,0,0,0]],[[1,1,1,1,1,...,0,0,0,0,0],[1,1,1,1,1,...,0,0,0,0,0],...,[1,1,1,1,1,...,0,0,0,0,0],[1,1,1,1,1,...,0,0,0,0,0]],[[1,1,1,1,1,...,0,0,0,0,0],[1,1,1,1,1,...,0,0,0,0,0],...,[1,1,1,1,1,...,0,0,0,0,0],[1,1,1,1,1,...,0,0,0,0,0]]]
labels: [[[59006,1,-100],[59006,1,-100],...,[59006,1,-100],[59006,1,-100]],[[18205,1,-100],[59006,1,-100],...,[259,32588,1],[18205,1,-100]],[[59006,1,-100],[59006,1,-100],...,[59006,1,-100],[59006,1,-100]]]
peft
库(Parameter-Efficient Fine-Tuning)进行微调,支持如下tuning:
# !/usr/bin/python
# -*- coding: utf-8 -*-
"""
@Author : guomiansheng
@Software : Pycharm
@Contact : [email protected]
@File : main.py
"""
from transformers import AutoModelForSeq2SeqLM
from peft import get_peft_config, get_peft_model, get_peft_model_state_dict, LoraConfig, TaskType
import torch
from datasets import load_dataset
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from transformers import default_data_collator, get_linear_schedule_with_warmup
from tqdm import tqdm
from datasets import load_dataset
def train_model():
    """Fine-tune ``bigscience/mt0-large`` with LoRA on financial_phrasebank.

    Steps performed:

    1. Wrap the seq2seq base model with a LoRA adapter (r=8, alpha=32,
       dropout=0.1) and print the trainable-parameter summary.
    2. Load ``financial_phrasebank`` / ``sentences_allagree``, hold out 10% as
       the validation split and add a ``text_label`` column holding the class
       name of each integer label.
    3. Tokenize sentences (padded/truncated to 128 tokens) and text labels
       (padded/truncated to 3 tokens, padding replaced by -100).
    4. Train for ``num_epochs`` epochs with AdamW and a linear learning-rate
       schedule; after every epoch print train/validation loss and perplexity
       plus the validation accuracy.
    5. Save the adapter with ``save_pretrained`` into a directory named
       ``f"{model_name_or_path}_{peft_type}_{task_type}"``.

    Takes no arguments and returns ``None``.
    """
    # Use the best backend that is actually available; a hard-coded "mps"
    # fails on any machine without Apple's Metal backend.
    if torch.cuda.is_available():
        device = "cuda"
    elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"
    # Pinned host memory is only requested when training on CUDA.
    pin_memory = device == "cuda"

    model_name_or_path = "bigscience/mt0-large"
    tokenizer_name_or_path = "bigscience/mt0-large"
    text_column = "sentence"
    label_column = "text_label"
    max_length = 128
    lr = 1e-3
    num_epochs = 3
    batch_size = 8

    # Build the model
    peft_config = LoraConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
    )
    model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)
    model = get_peft_model(model, peft_config)
    model.print_trainable_parameters()

    # Load the data
    dataset = load_dataset("financial_phrasebank", "sentences_allagree")
    dataset = dataset["train"].train_test_split(test_size=0.1)
    dataset["validation"] = dataset["test"]
    del dataset["test"]
    classes = dataset["train"].features["label"].names
    dataset = dataset.map(
        lambda x: {"text_label": [classes[label] for label in x["label"]]},
        batched=True,
        num_proc=1,
    )

    # Pre-process the training data
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)

    def preprocess_function(examples):
        """Tokenize a batch of sentences and their text labels."""
        inputs = examples[text_column]
        targets = examples[label_column]
        model_inputs = tokenizer(
            inputs,
            max_length=max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        labels = tokenizer(
            targets,
            max_length=3,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
        )
        labels = labels["input_ids"]
        # Replace padding positions in the targets with -100.
        labels[labels == tokenizer.pad_token_id] = -100
        model_inputs["labels"] = labels
        return model_inputs

    processed_datasets = dataset.map(
        preprocess_function,
        batched=True,
        num_proc=1,
        remove_columns=dataset["train"].column_names,
        load_from_cache_file=False,
        desc="Running tokenizer on dataset",
    )
    train_dataset = processed_datasets["train"]
    eval_dataset = processed_datasets["validation"]
    train_dataloader = DataLoader(
        train_dataset,
        shuffle=True,
        collate_fn=default_data_collator,
        batch_size=batch_size,
        pin_memory=pin_memory,
    )
    eval_dataloader = DataLoader(
        eval_dataset,
        collate_fn=default_data_collator,
        batch_size=batch_size,
        pin_memory=pin_memory,
    )

    # Set up the optimizer and the learning-rate scheduler
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=(len(train_dataloader) * num_epochs),
    )

    # Reference labels for the accuracy report; the evaluation loader is not
    # shuffled, so predictions line up with this column by position.
    references = dataset["validation"][label_column]

    # Train and evaluate
    model = model.to(device)
    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        for batch in tqdm(train_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            outputs = model(**batch)
            loss = outputs.loss
            total_loss += loss.detach().float()
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        model.eval()
        eval_loss = 0
        eval_preds = []
        for batch in tqdm(eval_dataloader):
            batch = {k: v.to(device) for k, v in batch.items()}
            with torch.no_grad():
                outputs = model(**batch)
            loss = outputs.loss
            eval_loss += loss.detach().float()
            eval_preds.extend(
                tokenizer.batch_decode(
                    torch.argmax(outputs.logits, -1).detach().cpu().numpy(),
                    skip_special_tokens=True,
                )
            )

        eval_epoch_loss = eval_loss / len(eval_dataloader)
        eval_ppl = torch.exp(eval_epoch_loss)
        train_epoch_loss = total_loss / len(train_dataloader)
        train_ppl = torch.exp(train_epoch_loss)
        print(f"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}")

        # Put the decoded predictions to use: exact-match accuracy against the
        # text labels of the validation split.
        total = len(eval_preds)
        matches = sum(pred.strip() == ref.strip() for pred, ref in zip(eval_preds, references))
        accuracy = matches / total * 100 if total else 0.0
        print(f"{accuracy=} % on the evaluation dataset")

    # Save the model (only the adapter directory is written here)
    peft_model_id = f"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}"
    model.save_pretrained(peft_model_id)
def inference_model():
    """Load the saved LoRA adapter and generate a prediction for one sentence.

    The adapter is read from the directory
    ``f"{model_name_or_path}_{peft_type}_{task_type}"``, i.e. the same name
    ``train_model`` passes to ``save_pretrained``, so training must have been
    run beforehand. The first sentence of a freshly drawn validation split is
    tokenized, passed to ``generate`` (at most 10 new tokens) and the raw and
    decoded outputs are printed.

    Takes no arguments and returns ``None``.
    """
    from peft import PeftModel, PeftConfig

    # Use the best backend that is actually available; a hard-coded "mps"
    # fails on any machine without Apple's Metal backend.
    if torch.cuda.is_available():
        device = "cuda"
    elif getattr(torch.backends, "mps", None) is not None and torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"

    model_name_or_path = "bigscience/mt0-large"
    tokenizer_name_or_path = "bigscience/mt0-large"
    text_column = "sentence"

    # The config object is only needed to rebuild the adapter directory name
    # exactly the way the training code formats it.
    peft_config = LoraConfig(
        task_type=TaskType.SEQ_2_SEQ_LM,
        inference_mode=False,
        r=8,
        lora_alpha=32,
        lora_dropout=0.1,
    )
    peft_model_id = f"{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}"

    # Load the data. No seed is passed to train_test_split, so this
    # validation split is presumably not the one held out during training.
    dataset = load_dataset("financial_phrasebank", "sentences_allagree")
    dataset = dataset["train"].train_test_split(test_size=0.1)
    dataset["validation"] = dataset["test"]
    del dataset["test"]

    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)

    # Model inference: rebuild the base model named in the saved adapter
    # config, attach the adapter, and move the result to the device.
    config = PeftConfig.from_pretrained(peft_model_id)
    model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)
    model = PeftModel.from_pretrained(model, peft_model_id)
    model = model.to(device)
    model.eval()

    i = 0
    sentence = dataset["validation"][text_column][i]
    inputs = tokenizer(sentence, return_tensors="pt")
    print(sentence)
    print(inputs)
    with torch.no_grad():
        # The input ids must live on the same device as the model.
        outputs = model.generate(input_ids=inputs["input_ids"].to(device), max_new_tokens=10)
        print(outputs)
        print(tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True))
    print("=============test=============")
if __name__ == "__main__":
    # Script entry point. Training is switched off below; inference loads the
    # adapter directory from disk, so enable the training call for a first run.
    # train_model()
    inference_model()
可以看到上面的LoraConfig参数如下:
peft_config = LoraConfig(task_type=TaskType.SEQ_2_SEQ_LM,
inference_mode=False,
r=8,
lora_alpha=32,
lora_dropout=0.1)
task_type:任务类型,这里是TaskType.SEQ_2_SEQ_LM
inference_mode:是否为推理模式,训练时设为False
r:lora的秩;lora_A用高斯分布初始化,lora_B用0初始化
lora_alpha:lora微调的缩放系数
lora_dropout:lora微调的dropout系数
learning_rate:adamw优化器的初始学习速率

也可以看LoraConfig源码:
class LoraConfig(PeftConfig):
    """Configuration fields for LoRA, declared on top of ``PeftConfig``.

    Args:
        r (int): Lora attention dimension. Defaults to 8.
        target_modules (Optional[Union[List[str], str]]): List of module names
            or regex expression of the module names to replace with Lora,
            for example ``['q', 'v']``. Defaults to None.
        lora_alpha (int): Lora alpha. Defaults to None.
        lora_dropout (float): Lora dropout. Defaults to None.
        fan_in_fan_out (bool): Set this to True if the layer to replace stores
            weight like (fan_in, fan_out). Defaults to False.
        bias (str): Bias type for Lora. Can be 'none', 'all' or 'lora_only'.
            Defaults to "none".
        modules_to_save (Optional[List[str]]): List of modules apart from LoRA
            layers to be set as trainable and saved in the final checkpoint.
            Defaults to None.
        init_lora_weights (bool): Whether to initialize the weights of the
            Lora layers. Defaults to True.
    """

    # NOTE(review): `field(...)` defaults only take effect on a dataclass; no
    # decorator is visible in this excerpt — confirm it is applied upstream.
    r: int = field(default=8, metadata={"help": "Lora attention dimension"})
    # NOTE(review): the two adjacent help literals below are concatenated
    # without a separating space ("...with Lora.For example...").
    target_modules: Optional[Union[List[str], str]] = field(
        default=None,
        metadata={
            "help": "List of module names or regex expression of the module names to replace with Lora."
            "For example, ['q', 'v'] or '.*decoder.*(SelfAttention|EncDecAttention).*(q|v)$' "
        },
    )
    # Annotated as int/float, but both of the next two fields default to None.
    lora_alpha: int = field(default=None, metadata={"help": "Lora alpha"})
    lora_dropout: float = field(default=None, metadata={"help": "Lora dropout"})
    fan_in_fan_out: bool = field(
        default=False,
        metadata={"help": "Set this to True if the layer to replace stores weight like (fan_in, fan_out)"},
    )
    bias: str = field(default="none", metadata={"help": "Bias type for Lora. Can be 'none', 'all' or 'lora_only'"})
    modules_to_save: Optional[List[str]] = field(
        default=None,
        metadata={
            "help": "List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint. "
            "For example, in Sequence Classification or Token Classification tasks, "
            "the final layer `classifier/score` are randomly initialized and as such need to be trainable and saved."
        },
    )
    init_lora_weights: bool = field(
        default=True,
        metadata={"help": "Whether to initialize the weights of the Lora layers."},
    )

    def __post_init__(self):
        # Tag every instance of this config with the LoRA adapter type.
        self.peft_type = PeftType.LORA
r (int): Lora attention dimension.
target_modules (Union[List[str], str]): The names of the modules to apply Lora to.
lora_alpha (float): The alpha parameter for Lora scaling.
lora_dropout (float): The dropout probability for Lora layers.
fan_in_fan_out (bool): Set this to True if the layer to replace stores weight like (fan_in, fan_out). For example, Conv1D stores weights like (fan_in, fan_out) and hence this should be set to True.
bias (str): Bias type for Lora. Can be 'none', 'all' or 'lora_only'.
modules_to_save (List[str]): List of modules apart from LoRA layers to be set as trainable and saved in the final checkpoint.

[1] A Survey of Large Language Models. Wayne Xin Zhao
[2] 大模型论文综述介绍
[3] LLaMA类模型没那么难,LoRA将模型微调缩减到几小时
[4] RLHF中的PPO算法原理及其实现
[5] 基于DeepSpeed训练ChatGPT
[6] Prompt-Tuning——深度解读一种新的微调范式
[7] 大模型参数高效微调技术原理综述(七)-最佳实践、总结
[8] chatGLM2-6B模型的全参数微调(改进多轮对话交互质量等):https://github.com/SpongebBob/Finetune-ChatGLM2-6B
[9] 大模型微调样本构造的trick
[10] 大模型参数高效微调技术原理综述(一)-背景、参数高效微调简介(附全量参数微调与参数高效微调对比-表格)
[11] 大模型训练之微调篇.无数据不智能
[12] 理解金融报告:使用大模型.无数据不智能
[13] Scaling Down to Scale Up: A Guide to Parameter-Efficient Fine-Tuning
[14] 低资源微调大模型:LoRA模型思想与BLOOM-LORA代码实现分析
[15] 模型和指令微调方法.山顶夕景
[16] 详谈大模型训练和推理优化技术
[17] LLM+LoRa微调加速技术原理及基于PEFT的动手实践:一些思考和mt0-large+lora完整案例
[18] 再看大模型Lora微调加速是否有效:Full-Parameter全参数微调与LoRA低秩微调的性能对比开源实验介绍
[19] 微调范式对比Freeze、P-Tuning、Lora、full-Finetune开源实现
[20] 基于GLM-6B对话模型的实体属性抽取项目实现解析:对Zero-shot与In-Context Learning的若干思考
[21] 微调实战:DeepSpeed+Transformers实现简单快捷上手百亿参数模型微调
[22] LLaMA:小参数+大数据的开放、高效基础语言模型阅读笔记
[23] 代码角度看LLaMA语言模型
[24] ChatGPT应用端的Prompt解析:从概念、基本构成、常见任务、构造策略到开源工具与数据集
[25] LLM实战:大语言模型BLOOM推理工具测试实践与效果分析实录
[26] 谈langchain大模型外挂知识库问答系统核心部件:如何更好地解析、分割复杂非结构化文本
[27] 看支持32K上下文的ChatGLM2-6B模型:优化点简读及现有开源模型主流训练优化点概述
[28] 极低资源条件下如何微调大模型:LoRA模型思想与BLOOM-LORA代码实现分析
[29] The Power of Scale for Parameter-Efficient Prompt Tuning
[30] https://github.com/mymusise/ChatGLM-Tuning
一种平价的 Chatgpt 实现方案,基于清华的ChatGLM-6B+ LoRA 进行finetune
[31] https://github.com/jxhe/unify-parameter-efficient-tuning
[31] 简单分析LoRA方法
[32] financial_phrasebank dataset.huggingface
[33] GPT大语言模型Alpaca-lora本地化部署实践.某东技术