import torch
import torch.nn as nn
import pandas as pd
from transformers import BertModel, BertTokenizer
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam
# Five rating levels of Douban movie reviews, one CSV per level,
# labelled 0 (worst) through 4 (best)
data_worse = pd.read_csv('data/1.csv')
data_worse['label'] = 0
data_bad = pd.read_csv('data/2.csv')
data_bad['label'] = 1
data_normal = pd.read_csv('data/3.csv')
data_normal['label'] = 2
data_good = pd.read_csv('data/4.csv')
data_good['label'] = 3
data_better = pd.read_csv('data/happy.csv')
data_better['label'] = 4
# Keep the first 10,000 reviews of each level and stack them into one frame
data = pd.concat([data_worse[:10000], data_bad[:10000], data_normal[:10000],
                  data_good[:10000], data_better[:10000]],
                 axis=0).reset_index(drop=True)
X = data.comment.values
y = data.label.values
# 90/10 train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)
These are Douban movie reviews at five rating levels: two negative classes (0 and 1), two positive classes (3 and 4), and one neutral class (2). The first 10,000 reviews are read from each file, and the data is split into training and test sets at a 9:1 ratio.
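Since train_test_split shuffles randomly, the class balance can drift a little between the two sets. A small optional tweak (my addition, not in the original): stratify on the labels and pin the random seed so the split stays balanced and reproducible.

# Optional: a stratified, reproducible 9:1 split (stratify/random_state are my additions)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, stratify=y, random_state=42)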
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# The reviews are Chinese, so load the Chinese BERT checkpoint; the original's
# 'bert-base-uncased' is English-only and would map most characters to [UNK]
tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
def preprocessing_for_bert(data):
    input_ids = []
    attention_masks = []
    for sent in data:
        encoded_sent = tokenizer.encode_plus(
            text=sent,
            add_special_tokens=True,   # add [CLS] and [SEP]
            max_length=MAX_LEN,        # pad or truncate to this length
            padding='max_length',
            truncation=True,
            return_attention_mask=True
        )
        input_ids.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))
    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)
    return input_ids, attention_masks
# Longest encoded comment in the corpus, capped at BERT's 512-token limit
encoded_comment = [tokenizer.encode(sent, add_special_tokens=True) for sent in data.comment.values]
MAX_LEN = min(max(len(sent) for sent in encoded_comment), 512)
train_inputs, train_masks = preprocessing_for_bert(X_train)
test_inputs, test_masks = preprocessing_for_bert(X_test)
train_labels = torch.tensor(y_train)
test_labels = torch.tensor(y_test)
batch_size = 128
# RandomSampler shuffles the training batches; SequentialSampler keeps the test order fixed
train_data = TensorDataset(train_inputs, train_masks, train_labels)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
test_data = TensorDataset(test_inputs, test_masks, test_labels)
test_sampler = SequentialSampler(test_data)
test_dataloader = DataLoader(test_data, sampler=test_sampler, batch_size=batch_size)
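As a quick sanity check (my own addition), one training batch should contain three tensors: token ids and masks of shape (batch_size, MAX_LEN), plus a label vector of shape (batch_size,).

# Sanity check (not in the original): peek at one training batch
b_input_ids, b_attn_mask, b_labels = next(iter(train_dataloader))
print(b_input_ids.shape)   # torch.Size([128, MAX_LEN])
print(b_attn_mask.shape)   # torch.Size([128, MAX_LEN])
print(b_labels.shape)      # torch.Size([128])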
add_special_tokens=True wraps each sentence in the [CLS] and [SEP] tokens (see BERT's pretraining procedure for the details; not repeated here).
max_length=MAX_LEN is the maximum length to truncate or pad to.
padding='max_length' pads every sequence to that maximum length.
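To make those parameters concrete, here is a quick illustration (my own snippet, using the tokenizer loaded above) with a toy max_length of 8:

# Illustration only: encode one short review with a hypothetical max_length of 8
sample = tokenizer.encode_plus(
    text='不错的电影',
    add_special_tokens=True,    # -> [CLS] 不 错 的 电 影 [SEP]
    max_length=8,
    padding='max_length',
    truncation=True,
    return_attention_mask=True
)
print(sample['input_ids'])        # 7 real token ids plus one 0 ([PAD])
print(sample['attention_mask'])   # [1, 1, 1, 1, 1, 1, 1, 0]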
class BertClassifier(nn.Module):
    def __init__(self):
        super(BertClassifier, self).__init__()
        # input dim (BERT hidden size), classifier hidden dim, output dim (5 labels)
        D_in, H, D_out = 768, 100, 5
        self.bert = BertModel.from_pretrained('bert-base-chinese')
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            nn.Linear(H, D_out)
        )

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids,
                            attention_mask=attention_mask)
        # last hidden state of the [CLS] token, used as the sentence representation
        last_hidden_state_cls = outputs[0][:, 0, :]
        logits = self.classifier(last_hidden_state_cls)
        return logits
The input dimension is BERT's hidden size, 768 by default; H is the classifier's hidden dimension; the output dimension is the number of labels (5).
The classifier is a single-hidden-layer feed-forward network: in plain terms, a fully connected head stacked on top of BERT for the final output.
In forward, the batch is first passed through BERT; the last hidden state of the [CLS] token is then extracted as the sentence representation for the classification task, and fed through the fully connected layers to compute the label logits.
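Before training, a quick shape check (my own sketch, not part of the original script) confirms the head produces one logit per label: a dummy batch of two length-16 sequences should come out as a (2, 5) tensor.

# Sanity check: dummy forward pass through an untrained classifier
dummy_ids = torch.zeros((2, 16), dtype=torch.long)   # two fake sequences of [PAD] ids
dummy_mask = torch.ones((2, 16), dtype=torch.long)
with torch.no_grad():
    print(BertClassifier()(dummy_ids, dummy_mask).shape)  # torch.Size([2, 5])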
learning_rate = 1e-5            # learning rate
num_epoch = 10                  # number of training epochs
model = BertClassifier()        # build the model
model = model.to(device)
optimizer = Adam(model.parameters(), lr=learning_rate)  # optimizer
CE_loss = nn.CrossEntropyLoss()                         # loss function
# bookkeeping for the training loop
train_step = 0                  # number of training steps so far
test_step = 0                   # number of evaluation passes so far
for epoch in range(num_epoch):
    print('------ Epoch {} ------'.format(epoch + 1))
    model.train()
    for batch in train_dataloader:
        input_ids, attn_mask, labels = tuple(t.to(device) for t in batch)
        outputs = model(input_ids, attn_mask)
        loss = CE_loss(outputs, labels)
        # the usual optimization boilerplate
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_step = train_step + 1
        if train_step % 100 == 0:
            print('Training step: {}, Loss: {}'.format(train_step, loss.item()))
    # evaluation on the test set
    model.eval()
    total_test_loss = 0
    test_acc = 0
    with torch.no_grad():
        for batch in test_dataloader:
            input_ids, attn_mask, labels = tuple(t.to(device) for t in batch)
            outputs = model(input_ids, attn_mask)
            loss = CE_loss(outputs, labels)
            total_test_loss = total_test_loss + loss.item()
            acc = (outputs.argmax(dim=1) == labels).sum().item()
            test_acc = test_acc + acc
    print('Total test loss: {}'.format(total_test_loss))
    print('Test_Acc: {}'.format(test_acc / len(test_data)))
    test_step = test_step + 1
    torch.save(model, 'bert_{}.pth'.format(epoch))  # optionally save a checkpoint each epoch
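Since torch.save(model, ...) stores the whole module, inference later is just a torch.load away. A sketch under a few assumptions: the checkpoint name bert_9.pth is whichever epoch you pick, the BertClassifier class must be importable where you load it, and recent PyTorch versions may require weights_only=False.

# Inference sketch (my addition): load a saved checkpoint and score one comment
model = torch.load('bert_9.pth', map_location=device)  # add weights_only=False on PyTorch >= 2.6
model.eval()
enc = tokenizer.encode_plus('这部电影太好看了', add_special_tokens=True,
                            max_length=MAX_LEN, padding='max_length',
                            truncation=True, return_attention_mask=True,
                            return_tensors='pt')
with torch.no_grad():
    logits = model(enc['input_ids'].to(device), enc['attention_mask'].to(device))
print(logits.argmax(dim=1).item())   # predicted label, 0 (worst) .. 4 (best)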