PyTorch框架下应用Bi-LSTM实现汽车评论文本关键词抽取

需要调用的模块及整体Bi-LSTM流程

import torch
import pandas as pd
import numpy as np
from tensorflow import keras
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
import gensim
from sklearn.model_selection import train_test_split
class word_extract(nn.Module):
    """Two stacked bidirectional LSTMs that score every token over 4 classes.

    The embedding layer is initialised from a pre-trained matrix and frozen,
    so only the LSTM and linear layers are trained.

    Args:
        d_model: Stored on the instance as ``self.d_model``. The embedding
            width itself is read from ``embedding_matrix`` so the two can
            never disagree.
        embedding_matrix: 2-D tensor of shape (vocab_size, embedding_dim)
            holding the pre-trained word vectors.
    """

    def __init__(self,d_model,embedding_matrix):
        super(word_extract, self).__init__()
        self.d_model=d_model
        vocab_size,embedding_dim=embedding_matrix.shape
        self.embedding=nn.Embedding(num_embeddings=vocab_size,embedding_dim=embedding_dim)
        # Load the pre-trained vectors and keep them fixed during training.
        with torch.no_grad():
            self.embedding.weight.copy_(embedding_matrix)
        self.embedding.weight.requires_grad=False
        # batch_first=True: inputs arrive as (batch, seq_len, features). With
        # the default (False) the LSTM would treat the batch axis as time and
        # run its recurrence across different samples instead of across tokens.
        self.lstm1=nn.LSTM(input_size=embedding_dim,hidden_size=50,
                           bidirectional=True,batch_first=True)
        self.lstm2=nn.LSTM(input_size=2*self.lstm1.hidden_size,hidden_size=50,
                           bidirectional=True,batch_first=True)
        # A bidirectional LSTM concatenates both directions -> 2 * hidden_size.
        self.linear=nn.Linear(2*self.lstm2.hidden_size,4)

    def forward(self,x):
        """Return per-token class scores.

        Args:
            x: Integer tensor of token indices with shape (batch, seq_len).

        Returns:
            Float tensor of unnormalised scores, shape (batch, seq_len, 4).
        """
        embedded=self.embedding(x)
        first_out,_=self.lstm1(embedded)
        second_out,_=self.lstm2(first_out)
        return self.linear(second_out)

将文本转换为数值形式

def trans_num(word2idx,text):
    """Convert space-separated sentences into lists of vocabulary indices.

    Args:
        word2idx: Mapping from word to integer index. Its ``'_PAD'`` entry is
            used for any token that is not in the mapping.
        text: Iterable of strings whose tokens are separated by single spaces.

    Returns:
        A list with one list of indices per input sentence.
    """
    encoded=[]
    for sentence in text:
        # Drop trailing whitespace and any carriage returns / newlines before
        # splitting on single spaces.
        cleaned=sentence.rstrip().replace('\r','').replace('\n','')
        indices=[]
        for token in cleaned.split(' '):
            if token in word2idx:
                indices.append(word2idx[token])
            else:
                # Unknown tokens (including empty strings produced by
                # repeated spaces) share the padding index.
                indices.append(word2idx['_PAD'])
        encoded.append(indices)
    return encoded

将Gensim里的词向量模型转为矩阵形式,后续导入到LSTM模型中

def establish_word2vec_matrix(model):
    """Build an embedding matrix and index lookups from a word2vec model.

    Row 0 of the matrix is an all-zero vector reserved for the ``"_PAD"``
    token; every vocabulary word gets the next free row.

    Args:
        model: A word2vec-style model exposing ``vector_size`` and ``wv``.
            ``wv`` must support ``wv[word]`` and expose its vocabulary as
            either ``key_to_index`` (Gensim >= 4) or ``vocab`` (Gensim < 4).

    Returns:
        A tuple ``(embeddings_matrix, word2idx, num2idx)``:
        a float32 tensor of shape (vocab_size + 1, vector_size), a
        word -> index dict, and the inverse index -> word dict.
    """
    keyed_vectors = model.wv
    # Gensim 4 removed ``wv.vocab`` (accessing it raises AttributeError) in
    # favour of ``wv.key_to_index``; support both so the code runs on either.
    vocab = getattr(keyed_vectors, "key_to_index", None)
    if vocab is None:
        vocab = keyed_vectors.vocab
    words = list(vocab)

    word2idx = {"_PAD": 0}  # word -> index, used later to tokenise the corpus
    num2idx = {0: "_PAD"}   # index -> word

    # One extra leading row of zeros is kept for the padding token.
    embeddings_matrix = np.zeros((len(words) + 1, model.vector_size), dtype=np.float32)
    for idx, word in enumerate(words, start=1):
        word2idx[word] = idx
        num2idx[idx] = word
        embeddings_matrix[idx] = keyed_vectors[word]
    return torch.from_numpy(embeddings_matrix), word2idx, num2idx

训练过程

def _evaluate(model, x, y):
    """Return ``(loss, accuracy)`` of ``model`` on ``x``/``y`` as Python floats.

    Runs in eval mode with autograd disabled so no graph is built for the
    evaluation pass; the model's previous train/eval mode is restored after.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            logits = model(x)
            loss = F.cross_entropy(logits.reshape(-1, logits.size(2)), y.reshape(-1))
            correct = (logits.argmax(dim=2) == y).sum().item()
    finally:
        model.train(was_training)
    return loss.item(), correct / (y.size(0) * y.size(1))


def train(model,epoch,learning_rate,batch_size,x, y, val_x, val_y):
    """Train ``model`` with Adam and cross-entropy, logging metrics per batch.

    After every optimisation step the training-batch loss/accuracy and the
    loss/accuracy on the full validation set are printed. When all epochs are
    done the whole model object is pickled to ``./extract_model.pkl``.

    Args:
        model: Module mapping an index tensor (batch, seq_len) to scores of
            shape (batch, seq_len, num_classes).
        epoch: Number of passes over the training data.
        learning_rate: Learning rate passed to Adam.
        batch_size: Mini-batch size for the training DataLoader.
        x: Training inputs, integer tensor (num_samples, seq_len).
        y: Training labels, integer tensor (num_samples, seq_len).
        val_x: Validation inputs, same layout as ``x``.
        val_y: Validation labels, same layout as ``y``.

    Returns:
        None.
    """
    optimizor = optim.Adam(model.parameters(), lr=learning_rate)
    loader = DataLoader(TensorDataset(x, y), batch_size=batch_size)
    for i in range(epoch):
        for j, (per_x, per_y) in enumerate(loader):
            output_y = model(per_x)
            # Flatten (batch, seq_len, classes) -> (batch*seq_len, classes) so
            # every token is one classification example.
            loss = F.cross_entropy(output_y.reshape(-1, output_y.size(2)), per_y.reshape(-1))
            optimizor.zero_grad()
            loss.backward()
            optimizor.step()

            # Token-level accuracy of the predictions made before this step.
            fit_correct = (output_y.detach().argmax(dim=2) == per_y).sum()
            fit_acc = fit_correct.item() / (per_y.size(0) * per_y.size(1))
            print('##################################')
            # .item() logs plain numbers instead of tensor reprs with grad_fn.
            print('第{}次迭代第{}批次的训练误差为{}'.format(i + 1, j + 1, loss.item()), end=' ')
            print('第{}次迭代第{}批次的训练准确度为{}'.format(i + 1, j + 1, fit_acc))

            val_loss, val_acc = _evaluate(model, val_x, val_y)
            print('第{}次迭代第{}批次的预测误差为{}'.format(i + 1, j + 1, val_loss), end=' ')
            print('第{}次迭代第{}批次的预测准确度为{}'.format(i + 1, j + 1, val_acc))
    # Pickles the entire module (not just a state_dict); loading it later
    # requires the model's class to be importable.
    torch.save(model,'./extract_model.pkl')

主函数部分

if __name__ =='__main__':
    # Build the embedding matrix and vocabulary lookups from the saved
    # word2vec model.
    word2vec = gensim.models.Word2Vec.load('./word2vec_model')
    embedding_matrix,word2idx,num2idx=establish_word2vec_matrix(word2vec)

    train_data=pd.read_csv('./数据.csv')
    # Every sentence and label sequence is padded/truncated to this length.
    max_len=60

    # Convert the text column to vocabulary indices. trans_num is written for
    # this dataset's format; adapt it for other datasets.
    x=trans_num(word2idx,list(train_data['文本']))
    # Pad with index 0 (the padding token) at the end of short sentences, so
    # each input row looks like [50, 123, 1850, 21, 199, 0, 0, ...].
    # NOTE(review): truncation of over-long sequences is left at the
    # pad_sequences default - confirm that is the intended side.
    x=keras.preprocessing.sequence.pad_sequences(
        x,maxlen=max_len,value=0,padding='post',
    )

    # Labels are space-separated integers per sentence, e.g. "2 0 1 0 0 1".
    # Padded positions get their own class, 3.
    y=[[int(j) for j in i.rstrip().split(' ')] for i in train_data['BIO数值']]
    y=keras.preprocessing.sequence.pad_sequences(
        y,maxlen=max_len,value=3,padding='post',
    )

    # Split into training and validation sets (80% / 20%).
    fit_x,val_x,fit_y,val_y=train_test_split(x,y,train_size=0.8,test_size=0.2)
    fit_x=torch.LongTensor(fit_x)
    fit_y=torch.LongTensor(fit_y)
    val_x=torch.LongTensor(val_x)
    val_y=torch.LongTensor(val_y)

    # Train; learning rate, batch size and epoch count can be tuned here.
    w_extract=word_extract(d_model=200,embedding_matrix=embedding_matrix)
    train(model=w_extract,epoch=5,learning_rate=0.001,batch_size=50,
          x=fit_x,y=fit_y,val_x=val_x,val_y=val_y)

    # Reload the saved model. The file is a full pickle of the module, which
    # recent PyTorch releases refuse to load unless weights_only=False is
    # given; older releases do not accept that keyword at all.
    # SECURITY: unpickling executes arbitrary code - only load trusted files.
    model_path='./extract_model.pkl'
    try:
        w_extract=torch.load(model_path,weights_only=False)
    except TypeError:
        w_extract=torch.load(model_path)

    # Inference only: switch to eval mode and skip autograd bookkeeping.
    w_extract.eval()
    with torch.no_grad():
        pred_val_y=w_extract(val_x).argmax(dim=2)

你可能感兴趣的:(PyTorch框架下应用Bi-LSTM实现汽车评论文本关键词抽取)