PyTorch: Text Classification with a BERT Model

First, load the dataset. Here we create a separate train.py file that loads the data and attaches labels (you could also do everything in a single file).

import pandas as pd

def load_data(path):
    data = pd.read_csv(path, encoding='utf-8', sep='|', nrows=14)
    data = data.content
    label_list = [2, 0, 1, 1, 1, 0, 2, 0, 2, 0, 2, 0, 1, 1]  # labels added by hand
    data = pd.DataFrame(data)
    data["label"] = label_list
    return data
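
For reference, a minimal sanity check of what load_data returns (the CSV path matches the one used in the training script below):

if __name__ == '__main__':
    df = load_data("data/train_dataset/train_dataset.csv")
    print(df.shape)   # (14, 2): columns 'content' and 'label'
    print(df.head())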




Create another .py file in the same directory that loads the dataset and runs training and testing.

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.utils.data as Data
import torch.optim as optim
from transformers import AutoModel, AutoTokenizer
import matplotlib.pyplot as plt

train_curve = []
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Hyperparameters; the model is the basic Chinese BERT
batch_size = 2
epochs = 100
model = "bert-base-chinese"
hidden_size = 768
n_class = 3
maxlen = 8
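# Note: maxlen=8 truncates aggressively and only suits this toy example;
# real-world text usually needs a much larger value (e.g. 128 or 256).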

from train import load_data


data = load_data("data/train_dataset/train_dataset.csv")
sentences = list(data.content)
labels = list(data.label)

#print(sentences)
#print(labels)

# Build the inputs that BERT expects:
# input_ids: token ids from the vocabulary
# attention_mask: same length as input_ids; 1 at real-token positions, 0 at padding
# token_type_ids: 0 for tokens of the first sentence, 1 for the second sentence
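# For example, tokenizer("你好", padding='max_length', truncation=True,
# max_length=8) yields, schematically:
#   input_ids      -> [CLS] 你 好 [SEP] [PAD] [PAD] [PAD] [PAD]
#   attention_mask ->   1   1  1    1     0     0     0     0
#   token_type_ids ->   0   0  0    0     0     0     0     0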
class MyDataset(Data.Dataset):  # a Dataset subclass must override __len__ and __getitem__
  def __init__(self, sentences, labels=None, with_labels=True):
    self.tokenizer = AutoTokenizer.from_pretrained(model)
    self.with_labels = with_labels
    self.sentences = sentences
    self.labels = labels

  def __len__(self):
    return len(self.sentences)

  def __getitem__(self, index):
    # Select the sentence at the given index
    sent = self.sentences[index]

    # Tokenize the sentence to get token ids, attention mask and token type ids
    encoded_pair = self.tokenizer(sent,
                    padding='max_length',  # Pad to max_length
                    truncation=True,       # Truncate to max_length
                    max_length=maxlen,
                    return_tensors='pt')  # Return torch.Tensor objects

    token_ids = encoded_pair['input_ids'].squeeze(0)  # tensor of token ids
    attn_masks = encoded_pair['attention_mask'].squeeze(0)  # binary tensor: 1 for real tokens, 0 for padding
    token_type_ids = encoded_pair['token_type_ids'].squeeze(0)  # binary tensor: 0 for 1st-sentence tokens, 1 for 2nd-sentence tokens

    if self.with_labels:  # True if the dataset has labels
      label = self.labels[index]
      return token_ids, attn_masks, token_type_ids, label
    else:
      return token_ids, attn_masks, token_type_ids

train = Data.DataLoader(dataset=MyDataset(sentences, labels), batch_size=batch_size, shuffle=True, num_workers=0)
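
# Optional sanity check, commented out like the prints above: each batch is
# (token_ids, attn_masks, token_type_ids, labels); the first three tensors
# have shape [batch_size, maxlen].
#for batch in train:
#  print([t.shape for t in batch[:3]], batch[3])
#  break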

# model
class BertClassify(nn.Module):
  def __init__(self):
    super(BertClassify, self).__init__()
    self.bert = AutoModel.from_pretrained(model, output_hidden_states=True, return_dict=True)
    self.linear = nn.Linear(hidden_size, n_class)  # classify straight off the [CLS] vector with one fully connected layer
    self.dropout = nn.Dropout(0.5)

  def forward(self, X):
    input_ids, attention_mask, token_type_ids = X[0], X[1], X[2]
    outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)  # returns a model-output object
    # classify with the pooled [CLS] vector from the last layer
    # outputs.pooler_output: [bs, hidden_size]
    logits = self.linear(self.dropout(outputs.pooler_output))

    return logits
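
# Note: pooler_output is the [CLS] hidden state passed through an extra
# dense + tanh layer pretrained on next-sentence prediction. A common
# alternative is the raw [CLS] vector, outputs.last_hidden_state[:, 0];
# either can feed the linear classifier above.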

bc = BertClassify().to(device)

optimizer = optim.Adam(bc.parameters(), lr=1e-3, weight_decay=1e-2)
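# Note: lr=1e-3 is unusually high for fine-tuning BERT; 2e-5 to 5e-5 is more typical.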
loss_fn = nn.CrossEntropyLoss()

# train
sum_loss = 0
total_step = len(train)
for epoch in range(epochs):
  for i, batch in enumerate(train):
    optimizer.zero_grad()
    batch = tuple(p.to(device) for p in batch)
    pred = bc([batch[0], batch[1], batch[2]])
    loss = loss_fn(pred, batch[3])
    sum_loss += loss.item()  # accumulate the epoch's loss

    loss.backward()
    optimizer.step()
    if epoch % 10 == 0:
      print('[{}|{}] step:{}/{} loss:{:.4f}'.format(epoch+1, epochs, i+1, total_step, loss.item()))
  train_curve.append(sum_loss)
  sum_loss = 0  # reset before the next epoch
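# (Optional) appending sum_loss / total_step instead would plot the average
# per-batch loss, which is easier to compare across runs.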

# test
bc.eval()  # eval mode turns off dropout; gradients are disabled by torch.no_grad() below
with torch.no_grad():
  test_text = ['服务,喂,你好,是这样的,我那个我的电话因为上次突然把给我停机了,我不知道啥意思,然后我就好长时间没有使用,我这次拿着10,怎么我欠费160多了?噢,我想问一下,我这机器现在是在停机状态吧,']
  test = MyDataset(test_text, labels=None, with_labels=False)
  x = test[0]  # (token_ids, attn_masks, token_type_ids); no label since with_labels=False
  x = tuple(p.unsqueeze(0).to(device) for p in x)  # add a batch dimension of 1
  logits = bc([x[0], x[1], x[2]])
  pred = logits.argmax(dim=1).item()
  if pred == 0:
    print('neutral')
  elif pred == 1:
    print('dissatisfied')
  else:
    print('strongly dissatisfied')

pd.DataFrame(train_curve).plot()  # plot the loss curve
plt.show()
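
To classify several sentences at once rather than one hand-built example, here is a minimal batched-inference sketch reusing MyDataset, the trained bc model, and the label meanings from the test above; predict and label_map are helper names introduced here for illustration:

label_map = {0: 'neutral', 1: 'dissatisfied', 2: 'strongly dissatisfied'}

def predict(texts):
  ds = MyDataset(texts, labels=None, with_labels=False)
  loader = Data.DataLoader(dataset=ds, batch_size=batch_size)
  results = []
  with torch.no_grad():
    for batch in loader:
      batch = tuple(p.to(device) for p in batch)
      logits = bc([batch[0], batch[1], batch[2]])
      results += [label_map[i] for i in logits.argmax(dim=1).tolist()]
  return results

print(predict(['谢谢你们的服务,问题已经解决了。', '怎么又给我扣费?我要投诉!']))  # made-up example inputs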
