Prompt-based Sentiment Classification: A PyTorch Implementation

    • Data
    • Steps
    • Code
    • Results

Data

The data is a Twitter sentiment dataset.
Baidu Cloud link: https://pan.baidu.com/s/1Nx7htUBWKBZfo3QPPty3mw  Password: 1234

Steps

1. Choose a template. The template can go before or after the original sentence; this article uses "It was ()." as the template and places it in front of the original sentence.
2. Build the input. The model input becomes "It was (). your sentence."; we convert it into a token-id sequence and turn the blank inside the parentheses into the [MASK] token.
3. Build the labels. During training we use the MLM loss as the objective: the task is to predict the word at the [MASK] position, where "good" means positive sentiment and "bad" means negative sentiment. The label is therefore the whole templated sentence with the blank filled in with the correct word ("good" or "bad"), e.g. "It was (bad). your sentence.", also converted into a token-id sequence. Note that only the word at the [MASK] position should be predicted, so when building the label sequence we set every other position to -1 (or some other negative value). This works because PyTorch's cross-entropy loss has an ignore_index parameter: with ignore_index = a, target positions whose value is a are excluded from the loss.
4. Model output. For an input batch the model outputs a tensor of shape "batch size × number of words × vocab size", while the target has shape "batch size × number of words". We reshape the output to "(batch size × number of words) × vocab size" and the target to "(batch size × number of words)", then feed both to the cross-entropy loss.
5. Prediction. Feeding a sentence through the model gives a "batch size × number of words × vocab size" output; we take the vocab-size vector at the [MASK] position, and whichever word has the highest probability is the prediction (a minimal sketch of steps 3-5 follows this list).
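
To make steps 3-5 concrete, here is a minimal, self-contained sketch. It uses a made-up vocabulary size, made-up word ids and random logits instead of a real BERT output, purely to show how the ignore_index labels, the reshape before the cross-entropy loss, and the mask-position prediction fit together; the full implementation is in the Code section below.

import torch
import torch.nn as nn

vocab_size, seq_len, batch_size = 10, 6, 2   # toy numbers, for illustration only
mask_pos = 3                                 # position of the [MASK] token in the prompted sentence
good_id, bad_id = 7, 8                       # pretend vocabulary ids of "good" / "bad"

# fake model output: batch size x number of words x vocab size
logits = torch.randn(batch_size, seq_len, vocab_size)

# labels: -1 everywhere except the mask position, which holds the id of "good" or "bad"
labels = torch.full((batch_size, seq_len), -1, dtype=torch.long)
labels[0, mask_pos] = good_id   # first sample is positive
labels[1, mask_pos] = bad_id    # second sample is negative

# step 4: flatten to (batch*seq, vocab) and (batch*seq,); positions labelled -1 are skipped by the loss
loss_fn = nn.CrossEntropyLoss(ignore_index=-1)
loss = loss_fn(logits.view(-1, vocab_size), labels.view(-1))

# step 5: prediction only looks at the vocab-size vector at the mask position
pred_ids = logits[:, mask_pos, :].argmax(dim=-1)
print(loss.item(), pred_ids)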

Code


import warnings
warnings.filterwarnings('ignore')
from datetime import datetime
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from gensim.models import word2vec
import re
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data
import torch.nn.functional as F
import csv
from transformers import BertModel, BertConfig, BertTokenizer, AdamW, get_cosine_schedule_with_warmup,BertForMaskedLM

# Put your texts and labels into lists
# (fine_all_data / fine_all_label stand for whatever you loaded from the dataset above)
StrongData = fine_all_data
StrongLabel = fine_all_label

# Split into train and test sets (a validation set is carved out of the training set later)
x_train, x_test, y_train, y_test = train_test_split(StrongData, StrongLabel, test_size=0.3, random_state=42)
print("Number of test samples:", len(y_test), len(x_test))

# Load the BERT tokenizer
vocab_path = r"/mnt/JuChiYun-WangFei/bert-base-chinese/vocab.txt"
tokenizer = BertTokenizer.from_pretrained(vocab_path)

# Template: "Mask" is only a placeholder word here; its position is overwritten with the real [MASK] token id below
prefix = 'It was Mask. '

# Words that fill the blank (the verbalizer)
pos_id = tokenizer.convert_tokens_to_ids('good')  #2204
neg_id = tokenizer.convert_tokens_to_ids('bad')      #2919


# Build the training set
Inputid = []
Labelid = []
sid = []
atid = []

for i in range(len(x_train)):
    text_ =    prefix + x_train[i]
    encode_dict = tokenizer.encode_plus(text_ , max_length=60 , padding='max_length', truncation=True)

    id = encode_dict["input_ids"]
    segmentid = encode_dict["token_type_ids"]
    attid = encode_dict["attention_mask"]
    labelid , inputid = id[:] , id[:]
    maskpos = 3  # index of the "Mask" placeholder in "[CLS] It was Mask . ..."
    if y_train[i] == 0:
        labelid[maskpos] = neg_id
        labelid[: maskpos ] = [-1]*len(labelid[: maskpos ])
        labelid[maskpos + 1 : ] = [-1]*len(labelid[maskpos + 1 : ])
        inputid[maskpos] = tokenizer.mask_token_id
    else:
        labelid[maskpos] = pos_id
        labelid[: maskpos] = [-1] * len(labelid[: maskpos])
        labelid[maskpos + 1:] = [-1] * len(labelid[maskpos + 1:])
        inputid[maskpos] = tokenizer.mask_token_id

    Labelid.append(labelid)
    Inputid.append(inputid)
    sid.append(segmentid)
    atid.append(attid)

Inputid = np.array(Inputid)
Labelid = np.array(Labelid)
sid = np.array(sid)
atid = np.array(atid)

print(Inputid.shape)
print(Labelid.shape)

print("正在划分数据集")
idxes = np.arange(Inputid.shape[0])  #idxes的第一维度,也就是数据大小
np.random.seed(2019)   # 固定种子
np.random.shuffle(idxes)
a = 4509
# 划分训练集、验证集
input_ids_train,  input_ids_valid  = Inputid[idxes[:a]], Inputid[idxes[a:5632]]
input_masks_train,  input_masks_valid = atid[idxes[:a]], atid[idxes[a:5632]]
input_types_train, input_types_valid = sid[idxes[:a]], sid[idxes[a:5632]]
label_train, y_valid = Labelid[idxes[:a]], Labelid[idxes[a:5632]]
print(input_ids_train.shape, label_train.shape, input_ids_valid.shape, y_valid.shape)
print(label_train[:, 3])


# Build the test set
tInputid = []
tLabelid = []
tsid = []
tatid = []
for i in range(len(x_test)):
    text_ =    prefix + x_test[i]
    encode_dict = tokenizer.encode_plus(text_ , max_length=60 , padding='max_length', truncation=True)
    id = encode_dict["input_ids"]
    segmentid = encode_dict["token_type_ids"]
    attid = encode_dict["attention_mask"]
    labelid , inputid = id[:] , id[:]
    maskpos = 3
    if y_test[i] == 0:
        labelid[maskpos] = neg_id
        labelid[: maskpos ] = [-1]*len(labelid[: maskpos ])
        labelid[maskpos + 1 : ] = [-1]*len(labelid[maskpos + 1 : ])
        inputid[maskpos] = tokenizer.mask_token_id
    else:
        labelid[maskpos] = pos_id
        labelid[: maskpos] = [-1] * len(labelid[: maskpos])
        labelid[maskpos + 1:] = [-1] * len(labelid[maskpos + 1:])
        inputid[maskpos] = tokenizer.mask_token_id
        
    tLabelid.append(labelid)
    tInputid.append(inputid)
    tsid.append(segmentid)
    tatid.append(attid)

tInputid = np.array(tInputid)
tLabelid = np.array(tLabelid)
tsid = np.array(tsid)
tatid = np.array(tatid)
print("测试集大小",tInputid.shape , tLabelid.shape)

# Wrap the tensors in a Dataset
class MyDataSet(Data.Dataset):
    def __init__(self, sen , mask , typ ,label ):
        super(MyDataSet, self).__init__()
        self.sen = sen
        self.mask = mask
        self.typ = typ
        self.label = label

    def __len__(self):
        return self.sen.shape[0]

    def __getitem__(self, idx):
        return self.sen[idx], self.mask[idx],self.typ[idx],self.label[idx]


input_ids_train = torch.from_numpy(input_ids_train).long()
input_ids_valid = torch.from_numpy(input_ids_valid).long()
input_ids_test = torch.from_numpy(tInputid).long()

input_masks_train = torch.from_numpy(input_masks_train).long()
input_masks_valid = torch.from_numpy(input_masks_valid).long()
input_masks_test = torch.from_numpy(tatid).long()

input_types_train = torch.from_numpy(input_types_train).long()
input_types_valid = torch.from_numpy(input_types_valid).long()
input_types_test = torch.from_numpy(tsid).long()

label_train = torch.from_numpy(label_train).long()
y_valid = torch.from_numpy(y_valid).long()
label_test = torch.from_numpy(tLabelid).long()


train_dataset = Data.DataLoader(MyDataSet(input_ids_train,  input_masks_train , input_types_train , label_train), 32, True)
valid_dataset = Data.DataLoader(MyDataSet(input_ids_valid,  input_masks_valid , input_types_valid , y_valid), 32, True)
test_dataset = Data.DataLoader(MyDataSet(input_ids_test,  input_masks_test , input_types_test , label_test), 128, True)


# Build the model
class Bert_Model(nn.Module):
    def __init__(self,  bert_path ,config_file ):
        super(Bert_Model, self).__init__()
        self.bert = BertForMaskedLM.from_pretrained(bert_path, config=config_file)  # load the pre-trained weights with the MLM head


    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert(input_ids, attention_mask, token_type_ids)
        logit = outputs[0]  # MLM prediction scores, shape [batch size, seq len, vocab size]
        return logit


config_path = r"/mnt/JuChiYun-WangFei/bert-base-chinese/config12.json"
config = BertConfig.from_pretrained(config_path)  # load the model hyperparameters
print(config)
DEVICE = torch.device("cuda:0" if  torch.cuda.is_available() else "cpu")
print(DEVICE)

# bert_path=r"/mnt/JuChiYun-WangFei/bert-base-chinese/pytorch_model.bin"
print("正在加载模型")
model = Bert_Model( bert_path=r"/mnt/JuChiYun-WangFei/bert-base-chinese/pytorch_model.bin", config_file=config).to(DEVICE)
print("模型加载完毕")


optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=1e-4)  # AdamW optimizer
loss_func = nn.CrossEntropyLoss(ignore_index=-1)  # target positions equal to -1 are ignored
EPOCH = 200
schedule = get_cosine_schedule_with_warmup(optimizer, num_warmup_steps=len(train_dataset), num_training_steps=EPOCH*len(train_dataset))
print("Training...")
import time

for epoch in range(EPOCH):

    starttime_train = datetime.now()
    start = time.time()
    correct = 0
    train_loss_sum = 0.0
    model.train()
    print("***** Running training epoch {} *****".format(epoch + 1))

    for idx, (ids, att, tpe, y) in enumerate(train_dataset):
        ids, att, tpe, y = ids.to(DEVICE), att.to(DEVICE), tpe.to(DEVICE), y.to(DEVICE)
        out_train  = model(ids, att, tpe)
        #print(out_train.view(-1, 30522).shape, y.view(-1).shape)
        loss = loss_func(out_train.view(-1, out_train.size(-1)), y.view(-1))  # flatten to (batch*seq len, vocab size)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        schedule.step()
        train_loss_sum += loss.item()

        if (idx + 1) % 40 == 0:
            print("Epoch {:04d} | Step {:06d}/{:06d} | Loss {:.4f} | Time {:.0f}".format(
                epoch + 1, idx + 1, len(train_dataset), train_loss_sum / (idx + 1), time.time() - start))

        truelabel = y[:, 3]                  # label at the mask position
        out_train_mask = out_train[:, 3, :]  # logits at the mask position

        predicted = torch.max(out_train_mask.data, 1)[1]
        correct += (predicted == truelabel).sum().item()
    acc = float(correct / len(label_train))

    eval_loss_sum = 0.0
    model.eval()
    correct_test = 0
    with torch.no_grad():
        for ids, att, tpe, y in test_dataset:
            ids, att, tpe, y = ids.to(DEVICE), att.to(DEVICE), tpe.to(DEVICE), y.to(DEVICE)
            out_test = model(ids , att , tpe)
            loss_eval = loss_func(out_test.view(-1, out_test.size(-1)), y.view(-1))
            eval_loss_sum += loss_eval.item()
            ttruelabel = y[:, 3]                 # label at the mask position
            tout_train_mask = out_test[:, 3, :]  # logits at the mask position
            predicted_test = torch.max(tout_train_mask.data, 1)[1]
            correct_test += (predicted_test == ttruelabel).sum().item()
    acc_test = float(correct_test / len(label_test))

    if epoch % 1 == 0:
        out = ("epoch {}, train_loss {},  train_acc {} , eval_loss {} ,acc_test {}"
               .format(epoch + 1, train_loss_sum / (len(train_dataset)), acc, eval_loss_sum / (len(test_dataset)),
                acc_test))

        print(out)
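
Once training is done, step 5 can also be applied to a single new sentence. The following is a minimal inference sketch that reuses the tokenizer, prefix, pos_id / neg_id, model and DEVICE defined above; the usage line at the bottom is hypothetical, and instead of taking an argmax over the whole vocabulary (as the evaluation loop does) it simply compares the scores of the two verbalizer words at the mask position.

# Single-sentence inference sketch, reusing objects defined above
def predict_sentiment(sentence):
    text_ = prefix + sentence
    enc = tokenizer.encode_plus(text_, max_length=60, padding='max_length', truncation=True)
    input_ids = enc["input_ids"]
    input_ids[3] = tokenizer.mask_token_id            # put [MASK] at the template slot
    ids = torch.tensor([input_ids]).long().to(DEVICE)
    att = torch.tensor([enc["attention_mask"]]).long().to(DEVICE)
    tpe = torch.tensor([enc["token_type_ids"]]).long().to(DEVICE)
    model.eval()
    with torch.no_grad():
        logits = model(ids, att, tpe)                 # [1, seq len, vocab size]
    mask_logits = logits[0, 3, :]
    # decide by comparing the scores of the two verbalizer words
    return "positive" if mask_logits[pos_id] > mask_logits[neg_id] else "negative"

# print(predict_sentiment("your sentence here"))   # hypothetical usage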

Results

With the prompt approach the test-set accuracy is 0.9126, versus 0.8948 with standard fine-tuning, so the prompt approach does perform noticeably better here.
