NLP transformer网络下Bert模型的完整训练

NLP transformer网络下Bert模型的完整训练

数据准备

import torch
import pandas as pd
import torch.nn as nn
from transformers import BertModel
from transformers import BertTokenizer
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam

# Load the five review files; each file holds one rating level and is tagged
# with its class label (0-4) in a `label` column.
data_worse = pd.read_csv('data/1.csv').assign(label=0)
data_bad = pd.read_csv('data/2.csv').assign(label=1)
data_normal = pd.read_csv('data/3.csv').assign(label=2)
data_good = pd.read_csv('data/4.csv').assign(label=3)
data_better = pd.read_csv('data/happy.csv').assign(label=4)

# Keep at most the first 10000 rows of every class and stack them into a
# single frame with a fresh 0..N-1 index.
data = pd.concat(
    [
        frame.iloc[:10000]
        for frame in (data_worse, data_bad, data_normal, data_good, data_better)
    ],
    axis=0,
).reset_index(drop=True)

# Features are the raw comment strings, targets are the integer labels.
X = data['comment'].values
y = data['label'].values

# Hold out 10% of the samples for testing (the split is random and unseeded).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

此处使用的是豆瓣五个级别的影评:两类负面(标签 0、1)、一类中性(标签 2)、两类正面(标签 3、4)。分别读取每个文件的前 10000 条评论,并按照训练集与测试集 9:1 的比例进行划分。

数据处理

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# WordPiece tokenizer matching the 'bert-base-uncased' checkpoint; input text
# is lower-cased before tokenization.
# NOTE(review): the accompanying text describes the data as Douban reviews,
# while 'bert-base-uncased' is an English checkpoint -- confirm whether a
# Chinese checkpoint was intended (the tokenizer and the model must be loaded
# from the same checkpoint).
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
    do_lower_case=True,
)


def preprocessing_for_bert(data, max_len=None):
    """Tokenize a collection of texts into fixed-length BERT input tensors.

    Every text gets the special [CLS]/[SEP] tokens, is truncated to the target
    length if it is longer, and is padded up to that length if it is shorter,
    so the result can always be stacked into rectangular tensors.

    Args:
        data: Iterable of raw text strings.
        max_len: Target sequence length, special tokens included. Defaults to
            the module-level ``MAX_LEN``. The value is capped at
            ``tokenizer.model_max_length`` because the encoder cannot embed
            positions beyond that limit.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of integer tensors, each of
        shape ``(len(data), target_length)``.
    """
    if max_len is None:
        max_len = MAX_LEN
    # Without this cap (and truncation below) a single over-long text would
    # produce sequences the model's position embeddings cannot index.
    max_len = min(max_len, tokenizer.model_max_length)

    texts = list(data)
    if not texts:
        # Nothing to tokenize: return correctly-shaped empty tensors.
        empty = torch.empty((0, max_len), dtype=torch.long)
        return empty, empty.clone()

    encoded = tokenizer(
        texts,
        add_special_tokens=True,     # add [CLS] and [SEP]
        max_length=max_len,
        padding='max_length',        # pad every sequence to max_len
        truncation=True,             # cut sequences longer than max_len
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoded['input_ids']
    attention_masks = encoded['attention_mask']

    return input_ids, attention_masks


# Tokenize every comment once (special tokens included) purely to measure
# how long the tokenized sequences are.
encoded_comment = [
    tokenizer.encode(text, add_special_tokens=True)
    for text in data.comment.values
]

# Length of the longest tokenized comment; preprocessing_for_bert pads to it.
MAX_LEN = max(map(len, encoded_comment))

# Token ids and attention masks for both splits.
train_inputs, train_masks = preprocessing_for_bert(X_train)
test_inputs, test_masks = preprocessing_for_bert(X_test)

# Integer class labels as tensors.
train_labels = torch.tensor(y_train)
test_labels = torch.tensor(y_test)

batch_size = 128

# Training batches are drawn in random order.
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Test batches are visited sequentially.
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(
    test_data,
    batch_size=batch_size,
    sampler=test_sampler,
)

首先要加载 BERT 的分词器(tokenizer)。

然后利用 preprocessing_for_bert 函数对文本进行分词(tokenize)等预处理。

add_special_tokens=True:用于给句子加上 [CLS] 和 [SEP] 标记,具体可以参考 BERT 模型的训练过程,此处不再赘述。
max_length=MAX_LEN:截断或者填充的最大长度。
padding='max_length':将序列填充到 max_length 指定的最大长度。

随后分别给训练集与测试集创建 DataLoader

创建神经网络BertClassifier

class BertClassifier(nn.Module):
    """BERT encoder followed by a small feed-forward classification head.

    The head takes the final hidden state of the first token ([CLS]) and maps
    it through ``Linear -> ReLU -> Linear`` to one logit per class.

    Args:
        pretrained_name: Name or path of the pretrained BERT checkpoint to
            load. Must match the checkpoint the tokenizer was loaded from.
        hidden_dim: Width of the hidden layer in the classification head.
        num_classes: Number of output classes (size of the logit vector).
    """

    def __init__(self, pretrained_name='bert-base-uncased', hidden_dim=100,
                 num_classes=5):
        super().__init__()

        self.bert = BertModel.from_pretrained(pretrained_name)

        # Take the encoder width from the loaded checkpoint instead of
        # hard-coding 768, so other BERT sizes work without code changes.
        d_in = self.bert.config.hidden_size

        self.classifier = nn.Sequential(
            nn.Linear(d_in, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, num_classes),
        )

    def forward(self, input_ids, attention_mask):
        """Return unnormalized class logits for a batch of token sequences.

        Args:
            input_ids: Token id tensor passed to BERT as ``input_ids``.
            attention_mask: Mask tensor passed to BERT as ``attention_mask``.

        Returns:
            Tensor of logits with one row per input sequence and one column
            per class.
        """
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)

        # outputs[0] is the last hidden state; index 0 along the sequence
        # axis selects the first token of every sequence (the [CLS] token
        # when special tokens were added during tokenization).
        last_hidden_state_cls = outputs[0][:, 0, :]

        return self.classifier(last_hidden_state_cls)

D_in, H, D_out

输入维度(hidden size of Bert)默认768,分类器隐藏维度,输出维度(label)

classifier

实例化一个带单个隐藏层的前馈分类器,说白了就是在最后输出之前接上全连接层。

forward

首先将输入送入 BERT,随后为分类任务提取 [CLS] 标记的最后一层隐藏状态(因为要把它传给全连接层),最后经过全连接层计算,输出 label。

开始训练与测试

learning_rate = 1e-5  # learning rate
num_epoch = 10   # number of training epochs

model = BertClassifier()   # build the network
model = model.to(device)
optimizer = Adam(model.parameters(), learning_rate)  # optimizer
CE_loss = nn.CrossEntropyLoss()   # loss function
# Bookkeeping counters for the training run.
train_step = 0     # number of training batches processed so far
test_step = 0     # number of evaluation passes completed so far


for epoch in range(num_epoch):

    print('------第{}轮训练开始------'.format(epoch+1))

    # ---- training phase ----
    model.train()

    for batch in train_dataloader:

        # Each batch is (input_ids, attention_mask, labels); move all of it
        # to the same device as the model.
        input_ids, attn_mask, labels = tuple(t.to(device) for t in batch)

        outputs = model(input_ids, attn_mask)

        loss = CE_loss(outputs, labels)

        # Standard optimization step: clear stale gradients, backpropagate,
        # then update the parameters.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        train_step = train_step + 1
        if train_step % 100 == 0:
            print('训练次数:{},Loss:{}'.format(train_step,loss.item()))

    # ---- evaluation phase ----
    model.eval()

    total_test_loss = 0   # sum of the per-batch mean losses
    test_acc = 0          # number of correctly classified test samples

    with torch.no_grad():

        for batch in test_dataloader:

            input_ids, attn_mask, labels = tuple(t.to(device) for t in batch)

            outputs = model(input_ids, attn_mask)

            loss = CE_loss(outputs,labels)

            total_test_loss = total_test_loss + loss.item()

            # .item() converts the count to a Python int so the accuracy
            # below is a plain float rather than a (possibly CUDA) tensor,
            # and so the division is true division on every PyTorch version.
            acc = (outputs.argmax(dim=1) == labels).sum().item()

            test_acc = test_acc + acc

    print('整体测试集上的Loss:{}'.format(total_test_loss))
    print('Test_Acc: {}'.format(test_acc / len(test_data)))
    test_step = test_step + 1

    # Optional checkpoint after every epoch. This pickles the whole module
    # object, so loading it later requires the BertClassifier class to be
    # importable.
    torch.save(model,'bert_{}.pth'.format(epoch))

训练结果

NLP transformer网络下Bert模型的完整训练_第1张图片
五分类问题92%的正确率还是不错的

你可能感兴趣的:(自然语言处理,transformer,bert)