1. build_model_and_tokenizer(args)
def build_model_and_tokenizer(args):
    tokenizer = BertTokenizer.from_pretrained(args.vocab_path)
    model_config = NeZhaConfig.from_pretrained(args.pretrain_model_path)
    model = NeZhaForMaskedLM.from_pretrained(pretrained_model_name_or_path=args.pretrain_model_path,
                                             config=model_config)
    model.to(args.device)
    return tokenizer, model
The tokenizer (vocabulary) is created from vocab_path, the model config is built from pretrain_model_path, and the NeZhaForMaskedLM pretrained weights are loaded from the same path. The function returns the tokenizer and the model, with the model moved to args.device.
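For context, a minimal sketch of how this might be wired up. The field names mirror the attributes used in the code (vocab_path, pretrain_model_path, device); the concrete paths are placeholders, not taken from the original project.

# Hypothetical wiring; paths are placeholders only.
import torch
from argparse import Namespace

args = Namespace(
    vocab_path='./pretrain_model/vocab.txt',       # placeholder path
    pretrain_model_path='./pretrain_model/nezha',  # placeholder path
    device=torch.device('cuda' if torch.cuda.is_available() else 'cpu'),
)
tokenizer, model = build_model_and_tokenizer(args)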
2. read_data(args, tokenizer)
Returns the encoded dict, containing inputs['input_ids'], inputs['token_type_ids'] and inputs['attention_mask'].
def read_data(args, tokenizer: BertTokenizer) -> dict:
    train_path = os.path.join(args.pretrain_data_path, 'shandong', 'train.csv')
    test_path = os.path.join(args.pretrain_data_path, 'shandong', 'testa_nolabel.csv')
    train_df = pd.read_csv(train_path, sep=',')
    test_df = pd.read_csv(test_path, sep=',')
    if args.debug:
        train_df = train_df.head(3000)
        test_df = test_df.head(300)
    inputs = defaultdict(list)
    for i, row in tqdm(train_df.iterrows(), desc='', total=len(train_df)):
        id, name, content, label = row[0], row[1], row[2], row[3]
        # Replace missing fields with the placeholder '无' ("none")
        if str(name) == 'nan':
            name = '无'
        if str(content) == 'nan':
            content = '无'
        inputs_dict = tokenizer.encode_plus(name, content, add_special_tokens=True,
                                            return_token_type_ids=True, return_attention_mask=True)
        inputs['input_ids'].append(inputs_dict['input_ids'])
        inputs['token_type_ids'].append(inputs_dict['token_type_ids'])
        inputs['attention_mask'].append(inputs_dict['attention_mask'])
    for i, row in tqdm(test_df.iterrows(), desc='', total=len(test_df)):
        id, name, content = row[0], row[1], row[2]
        if str(name) == 'nan':
            name = '无'
        if str(content) == 'nan':
            content = '无'
        inputs_dict = tokenizer.encode_plus(name, content, add_special_tokens=True,
                                            return_token_type_ids=True, return_attention_mask=True)
        inputs['input_ids'].append(inputs_dict['input_ids'])
        inputs['token_type_ids'].append(inputs_dict['token_type_ids'])
        inputs['attention_mask'].append(inputs_dict['attention_mask'])
    os.makedirs(os.path.dirname(args.data_cache_path), exist_ok=True)
    save_pickle(inputs, args.data_cache_path)
    return inputs
First the training and test data are read. Each row's columns are then extracted and encoded with tokenizer.encode_plus, and the encoded results are appended to inputs = defaultdict(list). Finally the encoded data is pickled to a local cache path.
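save_pickle is called above but not shown in this section; a minimal sketch of what such a helper typically looks like (an assumption, not the project's actual implementation):

import pickle

def save_pickle(obj, path):
    # Serialize the encoded inputs dict so later runs can skip re-encoding
    with open(path, 'wb') as f:
        pickle.dump(obj, f)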
3. train_dataloader = load_data(args, tokenizer)
Loads the cached encodings, builds a DGDataCollator from args, and then defines the Dataset and DataLoader.
def load_data(args, tokenizer):
    with open(args.data_cache_path, 'rb') as f:
        train_data = pickle.load(f)
    collate_fn = DGDataCollator(args.max_seq_len, tokenizer)
    train_dataset = DGDataset(train_data)
    train_dataloader = DataLoader(dataset=train_dataset, batch_size=args.batch_size, shuffle=True,
                                  num_workers=args.num_workers, collate_fn=collate_fn)
    return train_dataloader
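Put together, the data pipeline can be exercised end to end. A hedged sketch, assuming args carries the data_cache_path, max_seq_len, batch_size and num_workers fields used in the code:

# Sketch: encode once, build the dataloader from the cache, and peek at one batch.
inputs = read_data(args, tokenizer)            # writes args.data_cache_path
train_dataloader = load_data(args, tokenizer)
batch = next(iter(train_dataloader))
print(batch['input_ids'].shape, batch['labels'].shape)  # both (batch_size, padded_seq_len)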
DGDataCollator pads and truncates each sample of the input dict, randomly selects tokens for masking with 15% probability (of the selected tokens: 80% become [MASK], 10% become a random token, 10% stay unchanged), and returns the processed batch dict.
class DGDataCollator:
    def __init__(self, max_seq_len: int, tokenizer: BertTokenizer, mlm_probability=0.15):
        self.max_seq_len = max_seq_len
        self.tokenizer = tokenizer
        self.mlm_probability = mlm_probability
        self.special_token_ids = {tokenizer.cls_token_id, tokenizer.sep_token_id}

    def pad_and_truncate(self, input_ids_list, token_type_ids_list,
                         attention_mask_list, max_seq_len):
        input_ids = torch.zeros((len(input_ids_list), max_seq_len), dtype=torch.long)
        token_type_ids = torch.zeros_like(input_ids)
        attention_mask = torch.zeros_like(input_ids)
        for i in range(len(input_ids_list)):
            seq_len = len(input_ids_list[i])
            if seq_len <= max_seq_len:
                input_ids[i, :seq_len] = torch.tensor(input_ids_list[i], dtype=torch.long)
                token_type_ids[i, :seq_len] = torch.tensor(token_type_ids_list[i], dtype=torch.long)
                attention_mask[i, :seq_len] = torch.tensor(attention_mask_list[i], dtype=torch.long)
            else:
                # Truncate, keeping [SEP] as the final token
                input_ids[i] = torch.tensor(input_ids_list[i][:max_seq_len - 1] + [self.tokenizer.sep_token_id],
                                            dtype=torch.long)
                token_type_ids[i] = torch.tensor(token_type_ids_list[i][:max_seq_len], dtype=torch.long)
                attention_mask[i] = torch.tensor(attention_mask_list[i][:max_seq_len], dtype=torch.long)
        return input_ids, token_type_ids, attention_mask

    def mask_tokens(
            self, inputs: torch.Tensor, special_tokens_mask: Optional[torch.Tensor] = None
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Prepare masked tokens inputs/labels for masked language modeling: 80% MASK, 10% random, 10% original.
        """
        labels = inputs.clone()
        # We sample a few tokens in each sequence for MLM training (with probability `self.mlm_probability`)
        probability_matrix = torch.full(labels.shape, self.mlm_probability)
        if special_tokens_mask is None:
            special_tokens_mask = [
                self.tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True) for val in labels.tolist()
            ]
            special_tokens_mask = torch.tensor(special_tokens_mask, dtype=torch.bool)
        else:
            special_tokens_mask = special_tokens_mask.bool()
        probability_matrix.masked_fill_(special_tokens_mask, value=0.0)
        masked_indices = torch.bernoulli(probability_matrix).bool()
        labels[~masked_indices] = -100  # We only compute loss on masked tokens

        # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
        indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
        inputs[indices_replaced] = self.tokenizer.convert_tokens_to_ids(self.tokenizer.mask_token)

        # 10% of the time, we replace masked input tokens with random word
        indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
        random_words = torch.randint(len(self.tokenizer), labels.shape, dtype=torch.long)
        inputs[indices_random] = random_words[indices_random]

        # The rest of the time (10% of the time) we keep the masked input tokens unchanged
        return inputs, labels

    def __call__(self, examples: list) -> dict:
        input_ids_list, token_type_ids_list, attention_mask_list = list(zip(*examples))
        cur_max_seq_len = max(len(input_id) for input_id in input_ids_list)
        max_seq_len = min(cur_max_seq_len, self.max_seq_len)
        input_ids, token_type_ids, attention_mask = self.pad_and_truncate(input_ids_list,
                                                                          token_type_ids_list,
                                                                          attention_mask_list,
                                                                          max_seq_len)
        input_ids, mlm_labels = self.mask_tokens(input_ids)
        data_dict = {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'token_type_ids': token_type_ids,
            'labels': mlm_labels
        }
        return data_dict
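A quick illustration of what the collator produces on a hand-built toy batch (the input strings are made up; exact values depend on the tokenizer and on random masking):

# Toy usage sketch: encode one (title, content) pair and collate a batch of size 1.
# Assumes `tokenizer` is the BertTokenizer returned by build_model_and_tokenizer.
enc = tokenizer.encode_plus('标题', '正文内容', add_special_tokens=True,
                            return_token_type_ids=True, return_attention_mask=True)
examples = [(enc['input_ids'], enc['token_type_ids'], enc['attention_mask'])]
collator = DGDataCollator(max_seq_len=128, tokenizer=tokenizer)
batch = collator(examples)
print(batch['input_ids'].shape)  # (1, seq_len); ~15% of non-special tokens selected for masking
print(batch['labels'])           # original ids at the selected positions, -100 everywhere else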
DGDataset wraps the encoded data dict; __getitem__ returns one sample as a tuple of (input_ids, token_type_ids, attention_mask).
class DGDataset(Dataset):
    def __init__(self, data_dict: dict):
        super().__init__()
        self.data_dict = data_dict

    def __getitem__(self, index: int) -> tuple:
        data = (self.data_dict['input_ids'][index],
                self.data_dict['token_type_ids'][index],
                self.data_dict['attention_mask'][index])
        return data

    def __len__(self) -> int:
        return len(self.data_dict['input_ids'])
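Indexing the dataset yields exactly the per-sample tuple that DGDataCollator.__call__ later unpacks with zip(*examples); a quick sketch, assuming `inputs` is the dict returned by read_data:

# Each item is (input_ids, token_type_ids, attention_mask) as plain Python lists;
# all padding, truncation and masking is left to the collator.
train_dataset = DGDataset(inputs)
input_ids, token_type_ids, attention_mask = train_dataset[0]
print(len(train_dataset), len(input_ids))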
4. optimizer, scheduler = build_optimizer(args, model, total_steps)
Creates the optimizer and the learning-rate scheduler.
def build_optimizer(args, model, train_steps):
    no_decay = ['bias', 'LayerNorm.weight']
    param_optimizer = list(model.named_parameters())
    optimizer_grouped_parameters = [
        # Apply weight decay to all parameters except bias and LayerNorm weights
        {'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
         'weight_decay': args.weight_decay},
        {'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
         'weight_decay': 0.0}
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.eps)
    scheduler = WarmupLinearSchedule(optimizer, warmup_steps=train_steps * args.warmup_ratio, t_total=train_steps)
    return optimizer, scheduler
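total_steps in the call above is not derived in this section; a common way to compute it (an assumption about the surrounding training loop; args.num_epochs is a hypothetical field name not shown above):

# Hypothetical: total optimizer steps = batches per epoch * number of epochs
total_steps = len(train_dataloader) * args.num_epochs
optimizer, scheduler = build_optimizer(args, model, total_steps)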
WarmupLinearSchedule
class WarmupLinearSchedule(LambdaLR):
    def __init__(self, optimizer, warmup_steps, t_total, last_epoch=-1):
        self.warmup_steps = warmup_steps
        self.t_total = t_total
        super(WarmupLinearSchedule, self).__init__(optimizer, self.lr_lambda, last_epoch=last_epoch)

    def lr_lambda(self, step):
        # Linearly increase the LR multiplier from 0 to 1 over warmup_steps,
        # then linearly decay it back to 0 by t_total.
        if step < self.warmup_steps:
            return float(step) / float(max(1, self.warmup_steps))
        return max(0.0, float(self.t_total - step) / float(max(1.0, self.t_total - self.warmup_steps)))
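To see the shape of the schedule, the lr_lambda formula can be evaluated directly (an illustration with made-up numbers, using a standalone copy of the formula):

# Illustration only: 1000 warmup steps out of 10000 total steps.
warmup_steps, t_total = 1000, 10000

def lr_multiplier(step):
    if step < warmup_steps:
        return step / max(1, warmup_steps)
    return max(0.0, (t_total - step) / max(1.0, t_total - warmup_steps))

for step in (0, 500, 1000, 5500, 10000):
    print(step, lr_multiplier(step))
# prints 0.0, 0.5, 1.0, 0.5, 0.0: linear warmup to the peak LR, then linear decay to zero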