Background:
Some competitions release only desensitized data. Chinese competitions typically map each Chinese character to an integer, with characters separated by spaces (a made-up example is shown just below). If you want to pretrain your own model on such a dataset, this article may serve as a reference.
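To make the format concrete, here is a hypothetical example of one line of such desensitized text (the integers are invented; each one stands for a single, unknown Chinese character):

sample_line = "1039 27 845 3 5166 27 930"   # one sentence, characters replaced by integer IDs
print(sample_line.split())                  # ['1039', '27', '845', '3', '5166', '27', '930']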
First, the references this article draws on:
1. Someone else's summary of this task
2. The official tokenizer documentation
Note: I use a WordLevel tokenizer here, unlike the WordPiece tokenizer in the reference, because the digit prefixes produced by desensitization carry no meaning (see the illustration below).
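The intuition behind that choice, with made-up splits purely for illustration:

# With WordPiece, a long desensitized ID could be broken into subword pieces,
# e.g. "51668" -> "516", "##68"; those digit fragments are artifacts of the
# anonymization and carry no meaning.
# With WordLevel, every whitespace-separated ID stays one indivisible token:
# "51668" -> a single vocabulary entry, looked up as-is (or [UNK] if unseen).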
import os
from tokenizers import models, normalizers, pre_tokenizers, trainers, Tokenizer
from transformers import BertTokenizerFast

# WordLevel model: every whitespace-separated ID becomes exactly one token.
# unk_token is needed so that IDs unseen during training map to [UNK].
tokenizer = Tokenizer(models.WordLevel(unk_token="[UNK]"))
tokenizer.normalizer = normalizers.BertNormalizer(lowercase=True)
tokenizer.pre_tokenizer = pre_tokenizers.BertPreTokenizer()

special_tokens = ["[UNK]", "[PAD]", "[CLS]", "[SEP]", "[MASK]"]
trainer = trainers.WordLevelTrainer(vocab_size=50000, special_tokens=special_tokens)

dataset = open("/data/train_token.json")
batch_size = 10

def batch_iterator():
    # Feed the training file to the trainer in small batches of lines.
    result = []
    for line in dataset:
        result.append(line)
        if len(result) == batch_size:
            yield result
            result = []
    if result:  # don't drop the final, partially filled batch
        yield result

tokenizer.train_from_iterator(batch_iterator(), trainer=trainer)

# Wrap the trained tokenizer as a transformers fast tokenizer and dump vocab.txt.
new_tokenizer = BertTokenizerFast(tokenizer_object=tokenizer)
os.makedirs("./tokenizer/", exist_ok=True)
new_tokenizer.save_vocabulary("./tokenizer/")
At this point, the tokenizer folder contains the vocabulary file, vocab.txt (one token per line).
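As a quick sanity check (a sketch: the relative path and the sample IDs below are placeholders), the saved vocabulary can be loaded back and used to encode a line:

from transformers import BertTokenizer

# Load the freshly written vocabulary from the step above.
check_tokenizer = BertTokenizer(vocab_file="./tokenizer/vocab.txt", do_lower_case=True)

# Made-up desensitized IDs: each one should remain a single token,
# or fall back to [UNK] if it never appeared in the training corpus.
print(check_tokenizer.tokenize("1039 27 845 3 5166"))
print(check_tokenizer.encode("1039 27 845 3 5166"))  # with [CLS]/[SEP] ids added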
import os
from transformers import (
    BertConfig,
    BertForMaskedLM,
    BertTokenizer,
    DataCollatorForLanguageModeling,
    LineByLineTextDataset,
    Trainer,
    TrainingArguments,
)

# Restrict training to the desired GPUs.
os.environ["CUDA_VISIBLE_DEVICES"] = '4,5,6,7'
## Load the tokenizer and model
model_path = '/data/share_v5/jupyter/outputs-23/'  ## To pretrain from scratch, download a base model from Hugging Face instead; here I resume from my previous run.
token_path = '/data/share_v5/jupyter/tokenizer/vocab.txt'
tokenizer = BertTokenizer.from_pretrained(token_path, do_lower_case=True)
config = BertConfig.from_pretrained(model_path)
model = BertForMaskedLM.from_pretrained(model_path, config=config)
# Resize the embedding matrix in case the new vocabulary size differs from the checkpoint's.
model.resize_token_embeddings(len(tokenizer))

# Training corpus: one sample per line, tokens separated by spaces.
train_dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path='/data/share_v5/jupyter/5_7.txt', block_size=128)
# Masked-language-modeling collator: randomly masks 15% of the tokens in each batch.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=0.15)
# Training arguments
pretrain_batch_size = 64
num_train_epochs = 40
training_args = TrainingArguments(
    output_dir='/data/share_v5/jupyter/outputs-24',
    overwrite_output_dir=True,
    num_train_epochs=num_train_epochs,
    learning_rate=6e-5,
    per_device_train_batch_size=pretrain_batch_size,
    save_total_limit=10,  # keep at most 10 checkpoints
    logging_dir='/data/share_v5/jupyter/logs-24',
    logging_steps=10000,
    no_cuda=False,  # save_steps=10000
)

# Train the model via the Trainer API
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Start training
trainer.train()
trainer.save_model('/data/share_v5/jupyter/outputs-24')
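Once training finishes, a quick check that the model has learned something from the MLM objective (a sketch; the line of IDs below is a placeholder):

import torch
from transformers import BertForMaskedLM, BertTokenizer

mlm_model = BertForMaskedLM.from_pretrained('/data/share_v5/jupyter/outputs-24')
mlm_tokenizer = BertTokenizer.from_pretrained('/data/share_v5/jupyter/tokenizer/vocab.txt', do_lower_case=True)

# Mask one position in a made-up desensitized sentence and inspect the top predictions.
inputs = mlm_tokenizer("1039 27 [MASK] 3 5166", return_tensors="pt")
with torch.no_grad():
    logits = mlm_model(**inputs).logits
mask_pos = (inputs["input_ids"][0] == mlm_tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
top5 = logits[0, mask_pos].topk(5).indices[0].tolist()
print(mlm_tokenizer.convert_ids_to_tokens(top5))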
Notes:
1. token_path points to the vocabulary produced by the tokenizer-training step above.
2. Training can be done in stages, and an intermediate checkpoint can already be used for downstream tasks (see the sketch below).
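For example, a checkpoint written along the way can be loaded as the encoder of a downstream classifier (a minimal sketch; `num_labels=2` and the sample IDs are assumptions):

from transformers import BertForSequenceClassification, BertTokenizer

# Any intermediate checkpoint directory written by the Trainer can be used here.
ckpt = '/data/share_v5/jupyter/outputs-24'
clf_tokenizer = BertTokenizer.from_pretrained('/data/share_v5/jupyter/tokenizer/vocab.txt', do_lower_case=True)

# The MLM head is dropped; a freshly initialized classification head is added on top of the pretrained encoder.
clf = BertForSequenceClassification.from_pretrained(ckpt, num_labels=2)

inputs = clf_tokenizer("1039 27 845 3 5166", return_tensors="pt")
print(clf(**inputs).logits.shape)  # torch.Size([1, 2])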