For an introduction to the dataset, see the earlier article PyTorch搭建LSTM对IMDB数据集进行情感分析(详细的数据分析与处理过程), which builds an LSTM in PyTorch for sentiment analysis on the IMDB dataset and walks through the data analysis and preprocessing in detail.
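For reference, the code below only reads the pos/ and neg/ subfolders of train/ and test/ under the dataset root (the path argument), which matches the standard aclImdb layout:

path/
├── train/
│   ├── pos/    # one review per file, e.g. 0_9.txt
│   └── neg/
└── test/
    ├── pos/
    └── neg/

The data-loading code is as follows: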
import os
import random
import re

import torch
from torch.utils.data import DataLoader
from tqdm import tqdm


def load_data(args, path, tokenizer):
    classes = ['pos', 'neg']

    def process(flag):
        seqs = []
        for label in classes:
            files = os.listdir(os.path.join(path, flag, label))
            # remove punctuation
            r = '[’!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\n。!,]+'
            cnt = 0
            for file in tqdm(files):
                cnt += 1
                # only keep part of the data to speed up training
                if flag == 'train' and cnt == 2000:
                    break
                if flag == 'test' and cnt == 1000:
                    break
                with open(os.path.join(path, flag, label, file), 'r', encoding='utf8') as rf:
                    temp = rf.read().replace('\n', '')
                    # strip the HTML line-break tags found in the raw IMDB reviews
                    temp = temp.replace('<br />', ' ')
                    temp = re.sub(r, '', temp)
                    # tokenize
                    token = tokenizer(temp, padding='max_length', max_length=args.seq_len,
                                      truncation=True, return_tensors='pt')
                    cur_label = 1 if label == 'pos' else 0
                    seqs.append((token, cur_label))
        return seqs

    seq = process('train')
    _seq = process('test')
    seq.extend(_seq)
    # shuffle
    random.seed(42)
    random.shuffle(seq)
    # 6:2:2 split into train / val / test
    Dtr = seq[:int(len(seq) * 0.6)]
    Dtr = MyDataset(Dtr)
    Dtr = DataLoader(dataset=Dtr, batch_size=args.batch_size, shuffle=True, num_workers=0, drop_last=False)
    Val = seq[int(len(seq) * 0.6):int(len(seq) * 0.8)]
    Val = MyDataset(Val)
    Val = DataLoader(dataset=Val, batch_size=args.batch_size, shuffle=True, num_workers=0, drop_last=False)
    Dte = seq[int(len(seq) * 0.8):]
    Dte = MyDataset(Dte)
    Dte = DataLoader(dataset=Dte, batch_size=args.batch_size, shuffle=True, num_workers=0, drop_last=False)
    return Dtr, Val, Dte
Because BERT has a huge number of parameters and training is very slow, only a subset of the data is used here (about 6,000 samples in total). After shuffling, it is split into training, validation, and test sets in a 6:2:2 ratio.
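The DataLoaders above wrap each split in MyDataset, which is not defined in this section. A minimal sketch that matches how it is used here (each element is a (token, label) pair produced by process(); the original implementation may differ) could look like this:

from torch.utils.data import Dataset


class MyDataset(Dataset):
    """Wraps a list of (token, label) pairs."""

    def __init__(self, data):
        self.data = data

    def __getitem__(self, index):
        # token is the BatchEncoding returned by the tokenizer,
        # label is 1 for pos and 0 for neg
        return self.data[index]

    def __len__(self):
        return len(self.data)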
Looking at the code above, the following is done for every text to be classified:
temp = rf.read().replace('\n', '')
temp = temp.replace('<br />', ' ')
temp = re.sub(r, '', temp)
# tokenize
token = tokenizer(temp, padding='max_length', max_length=args.seq_len,
                  truncation=True, return_tensors='pt')
That is, BertTokenizer is used to tokenize the text, producing input_ids and attention_mask. Tokenization is well covered elsewhere, so it is not explained in detail here. The input_ids and attention_mask obtained from tokenization are then used as the input to the BERT model.
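A quick standalone check of what the tokenizer returns (here max_length is set to 512 just for illustration; in the code above it is args.seq_len):

from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
token = tokenizer('This movie was great!', padding='max_length',
                  max_length=512, truncation=True, return_tensors='pt')

print(token.keys())                   # dict_keys(['input_ids', 'token_type_ids', 'attention_mask'])
print(token['input_ids'].shape)       # torch.Size([1, 512])
print(token['attention_mask'].shape)  # torch.Size([1, 512])

Because return_tensors='pt' adds a batch dimension of size 1, the tensors come out of the DataLoader with shape (batch_size, 1, seq_len), which is why the training loop later calls squeeze(1).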
The model is built as follows:
import torch.nn as nn
from transformers import BertConfig, BertModel


class Bert_Classifier(nn.Module):
    def __init__(self, args):
        super(Bert_Classifier, self).__init__()
        # start from the pre-trained bert-base-uncased configuration
        self.config = BertConfig.from_pretrained("bert-base-uncased")
        # shrink the model
        self.config.hidden_size = args.hidden_size
        self.config.num_hidden_layers = 1
        self.config.num_attention_heads = 4
        # randomly initialized BERT (no pre-trained weights)
        self.bert = BertModel(config=self.config)
        self.dropout = nn.Dropout(p=0.1)
        self.fc = nn.Linear(args.hidden_size, 2)

    def forward(self, input_id, mask):
        # h: per-token hidden states, output: pooled [CLS] representation
        h, output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        output = self.dropout(output)
        output = self.fc(output)
        return output
Pre-trained BERT has hundreds of millions of parameters; it works well, but training is slow. Therefore, instead of fine-tuning the pre-trained model, an untrained BERT is initialized directly:
self.bert = BertModel(config=self.config)
Here config is the model's configuration object, and the configuration of the pre-trained model is used as the starting point:
self.config = BertConfig.from_pretrained("bert-base-uncased")
print(self.config)
The output is as follows:
BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.21.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}
The main changes are the following:
self.config.hidden_size = args.hidden_size
self.config.num_hidden_layers = 1
self.config.num_attention_heads = 4
hidden_size is the number of hidden units; the default is 768, changed to 128 here, so each position ends up with a 128-dimensional output. num_hidden_layers is the number of layers in the Transformer encoder; the default is 12, reduced to 1 here to cut the parameter count. num_attention_heads is the number of heads in multi-head attention; the default is 12, changed to 4 here.
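As a rough, standalone sanity check (not from the original article) of how much these changes shrink the model, you can compare parameter counts before and after:

from transformers import BertConfig, BertModel


def count_params(model):
    return sum(p.numel() for p in model.parameters())


config = BertConfig.from_pretrained('bert-base-uncased')
print(count_params(BertModel(config)))   # roughly 110M parameters

config.hidden_size = 128
config.num_hidden_layers = 1
config.num_attention_heads = 4
print(count_params(BertModel(config)))   # only a few million parameters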
Now look at the forward pass:
def forward(self, input_id, mask):
    h, output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
    output = self.dropout(output)
    output = self.fc(output)
    return output
BERT takes input_ids and attention_mask as input and produces an embedding for every token. For text classification we usually only care about the embedding of [CLS], i.e. output (the pooled output), while h holds the per-token hidden states.
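To make the shapes concrete, here is a small standalone check; args is mocked with a simple namespace and hidden_size=128 as above (the batch size and sequence length are arbitrary):

import torch
from types import SimpleNamespace

args = SimpleNamespace(hidden_size=128)
model = Bert_Classifier(args)

batch_size, seq_len = 4, 512
input_ids = torch.randint(0, model.config.vocab_size, (batch_size, seq_len))
mask = torch.ones(batch_size, seq_len, dtype=torch.long)

h, output = model.bert(input_ids=input_ids, attention_mask=mask, return_dict=False)
print(h.shape)                       # torch.Size([4, 512, 128])  per-token hidden states
print(output.shape)                  # torch.Size([4, 128])       pooled [CLS] representation
print(model(input_ids, mask).shape)  # torch.Size([4, 2])         class logits

The training function is shown below: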
import copy

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim.lr_scheduler import StepLR
from tqdm import tqdm


def train(args, Dtr, Val, path):
    device = args.device
    model = Bert_Classifier(args).to(device)
    # freeze parameters
    # unfreeze_layers = ['layer.10', 'layer.11', 'bert.pooler', 'out.']
    if args.freeze:
        unfreeze_layers = ['fc.weight', 'fc.bias']
        for name, param in model.named_parameters():
            param.requires_grad = False
            for e in unfreeze_layers:
                if e in name:
                    param.requires_grad = True
                    break
    loss_function = nn.BCEWithLogitsLoss().to(device)
    if args.optimizer == 'adam':
        optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr,
                                     weight_decay=args.weight_decay)
    else:
        optimizer = torch.optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr,
                                    momentum=0.9, weight_decay=args.weight_decay)
    scheduler = StepLR(optimizer, step_size=args.step_size, gamma=args.gamma)
    # training
    min_epochs = 2
    best_model = None
    min_val_loss = 5
    for epoch in tqdm(range(args.epochs)):
        train_loss = []
        for (bert_input, label) in Dtr:
            label = label.long().to(device)
            # tensors come out of the DataLoader with an extra dimension of size 1
            input_ids = bert_input['input_ids'].squeeze(1).to(device)
            attention_mask = bert_input['attention_mask'].to(device)
            y_pred = model(input_ids, attention_mask)
            # one hot
            label = F.one_hot(label, 2).float()
            loss = loss_function(y_pred, label)
            train_loss.append(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        # validation
        # get_val_loss (defined elsewhere in the article) returns the average validation loss
        val_loss = get_val_loss(args, model, Val)
        if epoch > min_epochs and val_loss < min_val_loss:
            min_val_loss = val_loss
            best_model = copy.deepcopy(model)
        print('epoch {:03d} train_loss {:.8f} val_loss {:.8f}'.format(
            epoch, np.mean(train_loss), val_loss))
        scheduler.step()
        model.train()
    state = {'models': best_model.state_dict()}
    torch.save(state, path)
Note that because the loss function is BCEWithLogitsLoss, no sigmoid activation is added at the end of the model; the raw logits are passed to the loss directly.
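In other words, BCEWithLogitsLoss applies the sigmoid internally, so the two formulations below give the same loss (a small illustration, not from the original article); the fused version is also more numerically stable:

import torch
import torch.nn as nn
import torch.nn.functional as F

logits = torch.randn(8, 2)                           # raw model outputs
targets = F.one_hot(torch.randint(0, 2, (8,)), 2).float()

loss_a = nn.BCEWithLogitsLoss()(logits, targets)
loss_b = nn.BCELoss()(torch.sigmoid(logits), targets)
print(torch.allclose(loss_a, loss_b))                # True (up to floating-point error)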
The model was trained on a server with four RTX 3090 Ti GPUs, so a relatively large batch_size (512) was used. If you run this in a different environment, reduce batch_size accordingly.
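For completeness, the hyperparameter object args needs at least the fields used above. The values below are a hypothetical configuration for a single consumer GPU (only hidden_size=128 and epochs=15 come from the article; the dataset and checkpoint paths are placeholders):

import torch
from types import SimpleNamespace
from transformers import BertTokenizer

args = SimpleNamespace(
    seq_len=512,            # max token length passed to the tokenizer
    batch_size=64,          # the article used 512 on a multi-GPU server
    hidden_size=128,
    lr=1e-3,
    weight_decay=0.0,
    step_size=5,            # StepLR step size
    gamma=0.5,              # StepLR decay factor
    epochs=15,
    optimizer='adam',
    freeze=False,
    device='cuda' if torch.cuda.is_available() else 'cpu',
)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
Dtr, Val, Dte = load_data(args, 'data/aclImdb', tokenizer)   # placeholder dataset path
train(args, Dtr, Val, 'model/bert.pkl')                      # placeholder checkpoint path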
After training for 15 epochs, the final accuracy is:
acc: 0.85