model = transformers.AutoModelForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=training_args.cache_dir,
    torch_dtype='auto',
    # needed for models that ship custom code (e.g. falcon, internlm)
    trust_remote_code=True,
)
# Alternatively, add device_map='auto' to shard the model across the available GPUs:
model = transformers.AutoModelForCausalLM.from_pretrained(
    model_args.model_name_or_path,
    cache_dir=training_args.cache_dir,
    device_map='auto',
    torch_dtype='auto',
    trust_remote_code=True,
)
from peft import LoraConfig, get_peft_model

LORA_R = 32
# LORA_ALPHA = 16
LORA_DROPOUT = 0.05
TARGET_MODULES = [
    "o_proj", "gate_proj", "down_proj", "up_proj"
]
config = LoraConfig(
    r=LORA_R,
    # lora_alpha=LORA_ALPHA,
    target_modules=TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)
# wrap the base model with the LoRA configuration
model = get_peft_model(model, config)
# print the fraction of parameters that will actually be trained
model.print_trainable_parameters()
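For reference, the ratio that print_trainable_parameters reports can also be computed by hand. A minimal sketch (not part of the original script), counting the parameters that still require gradients, i.e. the LoRA matrices:

trainable = sum(p.numel() for p in model.parameters() if p.requires_grad)
total = sum(p.numel() for p in model.parameters())
print(f"trainable params: {trainable} || all params: {total} || "
      f"trainable%: {100 * trainable / total:.4f}")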
tokenizer = transformers.AutoTokenizer.from_pretrained(
    model_args.model_name_or_path, trust_remote_code=True)
Data is loaded with Hugging Face's datasets library. With datasets, loading data is straightforward; for example:
from datasets import load_dataset
dataset = load_dataset('csv', data_files='my_file.csv')
dataset = load_dataset('csv', data_files=['my_file_1.csv', 'my_file_2.csv', 'my_file_3.csv'])
dataset = load_dataset('csv', data_files={'train':['my_train_file_1.csv','my_train_file_2.csv'],'test': 'my_test_file.csv'})
For this project we load our own data as follows:
from typing import Optional
from datasets import Dataset

def load_dataset_from_own(data_path: Optional[str] = None,
                          cache_dir: Optional[str] = "cache_data") -> Dataset:
    all_file_list = ['a.json', 'b.json', 'c.json']
    data_files = {'train': all_file_list}
    # infer the loader type ("json") from the file extension
    extension = all_file_list[0].split(".")[-1]
    datasets = load_dataset(
        extension,
        data_files=data_files,
        cache_dir=cache_dir,
    )['train']
    return datasets
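A quick usage sketch (assuming the JSON files listed above actually exist on disk):

raw_train_dataset = load_dataset_from_own(cache_dir="cache_data")
print(len(raw_train_dataset), raw_train_dataset[0])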
PROMPT_DICT = {
    "prompt_input": (
        "Below is an instruction that describes a task, paired with an input that provides further context. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
    ),
    "prompt_no_input": (
        "Below is an instruction that describes a task. "
        "Write a response that appropriately completes the request.\n\n"
        "### Instruction:\n{instruction}\n\n### Response:"
    ),
}
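The lists ins_data, input_data and output used in the next snippet are not defined in the original excerpt; presumably they are the instruction, input and output columns of the raw dataset loaded above. A minimal sketch under that assumption (raw_train_dataset is the hypothetical variable from the usage sketch earlier):

# Assumption: each JSON record has "instruction", "input" and "output" fields
# (standard Alpaca-style data); pull them out as plain Python lists.
ins_data = raw_train_dataset["instruction"]
input_data = raw_train_dataset["input"]
output = raw_train_dataset["output"]
len_ = len(ins_data)
prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]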
sources = [
    prompt_input.format_map({'instruction': ins_data[i], 'input': input_data[i]})
    if input_data[i] != ""
    else prompt_no_input.format_map({'instruction': ins_data[i]})
    for i in range(len_)
]
# truncate sources and targets to the configured lengths; each target ends with the EOS token
sources = [i[:data_args.source_length] for i in sources]
targets = [f"{example[:data_args.target_length-1]}{tokenizer.eos_token}" for example in output]
The function below takes the texts to encode and returns the corresponding token ids:
from typing import Dict, Sequence
import copy

IGNORE_INDEX = -100  # label value ignored by the loss

def _tokenize_fn(strings: Sequence[str], tokenizer: transformers.PreTrainedTokenizer) -> Dict:
    """Tokenize a list of strings."""
    tokenized_list = [
        tokenizer(
            text,
            return_tensors="pt",
            padding="longest",
            max_length=tokenizer.model_max_length,
            truncation=True,
        )
        for text in strings
    ]
    # keep the token ids; labels start out identical to the inputs
    input_ids = labels = [tokenized.input_ids[0]
                          for tokenized in tokenized_list]
    # pick the id used to detect padding (fall back to IGNORE_INDEX if the tokenizer has no pad token)
    ne_pad_token_id = IGNORE_INDEX if tokenizer.pad_token_id is None else tokenizer.pad_token_id
    # count the non-padding tokens of each example
    input_ids_lens = labels_lens = [
        tokenized.input_ids.ne(ne_pad_token_id).sum().item() for tokenized in tokenized_list
    ]
    return dict(
        input_ids=input_ids,
        labels=labels,
        input_ids_lens=input_ids_lens,
        labels_lens=labels_lens,
    )
Build input_ids and labels:
examples = [s + t for s, t in zip(sources, targets)]
# tokenize both prompt+answer and prompt-only, so we know how long each prompt is
examples_tokenized, sources_tokenized = [_tokenize_fn(
    strings, tokenizer) for strings in (examples, sources)]
input_ids = examples_tokenized["input_ids"]
labels = copy.deepcopy(input_ids)
# mask the prompt part of the labels so only the response contributes to the loss
for label, source_len in zip(labels, sources_tokenized["input_ids_lens"]):
    label[:source_len] = IGNORE_INDEX
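A quick sanity check (a sketch, assuming the variables above) that only the response tokens carry loss: decode the first example in full, then decode only the tokens whose label was not masked; the second string should contain just the response.

kept = [tok for tok, lab in zip(input_ids[0].tolist(), labels[0].tolist()) if lab != IGNORE_INDEX]
print(tokenizer.decode(input_ids[0]))
print(tokenizer.decode(kept))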
With dynamic batching we need a data collator to perform padding. We do not use DataCollatorWithPadding here, because it only pads the input keys (input_ids, attention_mask, token_type_ids) and does not pad labels. In addition, labels are padded with -100 rather than the tokenizer's pad token, so that the padded positions are ignored when computing the loss.
from transformers import DataCollatorForSeq2Seq, Trainer

data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, model=model,
                                       label_pad_token_id=IGNORE_INDEX)
trainer = Trainer(model=model,
                  tokenizer=tokenizer,
                  args=training_args,
                  train_dataset=train_dataset,
                  eval_dataset=None,
                  data_collator=data_collator)
trainer.train()
trainer.save_state()
trainer.save_model(output_dir=training_args.output_dir)
from typing import List
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import PeftModel

base_model_name_or_path = "internlm-7b"
lora_model_name_or_path = "checkpoint-9695"
model = AutoModelForCausalLM.from_pretrained(
    base_model_name_or_path,
    torch_dtype="auto",
    # device_map="auto",
    trust_remote_code=True,
).cuda(0)
# attach the trained LoRA adapter on top of the base model
model = PeftModel.from_pretrained(model, model_id=lora_model_name_or_path)
model.eval()
print("ok")
tokenizer = AutoTokenizer.from_pretrained(
    base_model_name_or_path, trust_remote_code=True, padding_side="left"
)
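The helper generate_input used below is not shown in the original excerpt; presumably it wraps a raw question into the same Alpaca-style prompt used during training. A minimal sketch under that assumption:

def generate_input(instruction: str, input_text: str = "") -> str:
    # Assumed helper: reuse the training-time prompt template so that
    # inference inputs match what the LoRA adapter was trained on.
    if input_text == "":
        return PROMPT_DICT["prompt_no_input"].format_map({"instruction": instruction})
    return PROMPT_DICT["prompt_input"].format_map(
        {"instruction": instruction, "input": input_text}
    )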
def batch_generate_data(
    text_input: List[str], use_train_model: bool = True, temp: float = 0.7
):
    text_input_format = [generate_input(i) for i in text_input]
    batch_inputs = tokenizer.batch_encode_plus(
        text_input_format, padding="longest", return_tensors="pt"
    )
    batch_inputs["input_ids"] = batch_inputs["input_ids"].cuda()
    batch_inputs["attention_mask"] = batch_inputs["attention_mask"].cuda()
    if use_train_model:
        # generate with the LoRA adapter enabled
        outputs = model.generate(
            **batch_inputs,
            max_new_tokens=256,
            do_sample=True,
            temperature=temp,
            top_p=0.8,
        )
    else:
        # temporarily disable the adapter to get the base model's output
        with model.disable_adapter():
            outputs = model.generate(
                **batch_inputs,
                max_new_tokens=256,
                do_sample=True,
                temperature=temp,
                top_p=0.8,
            )
    # strip the prompt tokens and decode only the newly generated part
    outputs = tokenizer.batch_decode(
        outputs.cpu()[:, batch_inputs["input_ids"].shape[-1]:],
        skip_special_tokens=True,
    )
    return outputs
text_input = ["工作压力太大怎么办\n"] * 32
# responses from the LoRA-tuned model
batch_generate_data(text_input, use_train_model=True, temp=0.8)
# responses from the original base model
batch_generate_data(text_input, use_train_model=False, temp=0.8)
# merge the LoRA weights into the base model and save a standalone checkpoint
model = model.merge_and_unload()
model.save_pretrained("internlm-7b-lml")
tokenizer.save_pretrained("internlm-7b-lml")
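After merge_and_unload, the saved directory is an ordinary checkpoint and no longer needs peft at load time. A minimal reload sketch, assuming the save path above:

merged_model = AutoModelForCausalLM.from_pretrained(
    "internlm-7b-lml", torch_dtype="auto", trust_remote_code=True
)
merged_tokenizer = AutoTokenizer.from_pretrained("internlm-7b-lml", trust_remote_code=True)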