TextRNN in PyTorch: an object-oriented approach

Contents

    • Preface
    • Embedding
    • Implementing TextRNN in an object-oriented style
      • Preparing the data
      • Converting the dataset
      • Configuration
      • Implementing TextRNN
      • Training and testing with TextRNN
    • Dataset download

Preface

A quick gripe first: when I started studying machine learning and deep learning, I found that most tutorials are written in a purely procedural style, which makes even simple things hard to follow. With object-oriented code a lot of it becomes obvious at a glance, so I hope this post can serve as a starting point for others. I also prefer camelCase naming, which you will see throughout the code.

Embedding

Before you can understand TextRNN you first need to understand Embedding; once that is clear, everything else is just an RNN.

Processing text documents usually follows these steps:

  1. Turn the corpus into a list of documents, [ [document1], [document2], ... ], one list entry per document.

  2. Split every document into words, e.g. [ ["我", "爱", "小喵咪"], ["我", "吃", "西瓜"], ... ]; jieba is the usual choice for Chinese word segmentation.

  3. Remove stop words: punctuation and filler words such as "!", ",", "啊" carry no real meaning.

  4. Count word frequencies, e.g. { "我": 1800, "爱": 1500 }, then sort by frequency and assign every word an index: { "我": 1, "爱": 2, "吃": 3, "西瓜": 4, "小喵咪": 5 }.

  5. Replace every word in every document by its index: ["我", "爱", "小喵咪"] ==> [1, 2, 5], ["我", "吃", "西瓜"] ==> [1, 3, 4].

  6. Prepare the word-vector matrix. Suppose this matrix A has size = [100, 300]. There are two ways to get it:
    1. download vectors pretrained by someone else, e.g. the Sogou or Tencent embeddings;
    2. train your own with gensim word2vec. This post takes the second route, because the dataset is small and training locally is the simplest option.

  7. Embedding simply means using the index lists B from step 5 to look up rows of the real word-vector matrix A from step 6. For example, feeding the index list [1, 2, 5] through an nn.Embedding layer pulls out rows 1, 2 and 5 of A.

  8. The word vectors produced by the Embedding layer are then fed into the RNN (or any other network); from this point on the computation looks just like image processing. If RNNs are still unclear, have a look at an RNN hands-on tutorial first.

So the hard part of deep learning on text is mostly the data wrangling: getting the data into this particular format. Once the steps above are clear, writing the code is straightforward.

Implementing TextRNN in an object-oriented style

Preparing the data

I found a dataset online, ch_auto.csv.


Then split it into three files (a possible split script is sketched after this list):

  1. train.tsv, the training set
  2. test.tsv, the test set
  3. dev.tsv, the validation set

Data format: every row of the tsv files has three tab-separated fields (a running index, the label, and the raw text), which is the layout ReadTSV below expects.

Imports:

import pandas as pd
import jieba
import gensim
import numpy as np
import os
import torch

Converting the dataset

'''
The format used for training and evaluation later: one Batch holds a tensor of
word indices and a tensor of labels.
'''
class Batch(object):
    def __init__(self, label, text):
        self.text = torch.LongTensor(text)
        self.label = torch.LongTensor([int(x) for x in label])



class DataSet(object):

    def __init__(self):
        self.UNK, self.PAD = '<UNK>', '<PAD>'  # token for unknown words and the padding token

    '''
    Read the csv file and return ([label1, label2, ...], [document1, document2, ...]).
    '''
    def ReadCSV(self, path):
        csvFile = pd.read_csv(path, usecols=[1, 2], encoding="utf-8")

        labelList = csvFile.values[:, 0].tolist()
        valueList = csvFile.values[:, 1].tolist()

        print(labelList[0], valueList[0])
        print(len(valueList))

        return labelList, valueList


    '''
    Read a tsv file and return the labels plus the tokenised texts:
    [["我", "爱", "小喵咪"], ["我", "吃", "西瓜"], ...]
    '''
    def ReadTSV(self, path):

        label = []
        text = []
        with open(path, "r", encoding="utf-8") as fhandle:
            line = fhandle.readline()   # the first line is the header and is skipped

            while line:
                line = fhandle.readline()
                lines = line.split("\t")

                # every valid row has three fields: index, label, text
                if len(lines) != 3:
                    continue

                label.append(lines[1])

                # tokenise the text column with jieba
                tmpLine = [word for word in jieba.cut(lines[2])]

                text.append(tmpLine)

        return label, text


    '''
    Cut the data [[...], [...]] into Batch objects: [batch1, batch2, ...]
    '''
    def SpliteData(self, text, label, batchSize):
        allLen = len(text)
        step = int(allLen / batchSize)

        spliteText = []

        for i in range(0, step):
            # slice out one batch of batchSize consecutive samples
            start = i * batchSize
            end = start + batchSize
            spliteText.append(Batch(label[start:end], text[start:end]))

        return spliteText




    '''
    Parse the tsv files, turn every document into a fixed-length index list
    [[1, 2, 5, ...], [1, 3, 4, ...], ...] and cut the result into batches
    [batch1, batch2, ...].
    '''
    def BuidBatch(self, batchSize, vocabDict, sentenSize=32,
                  testPath="./data/test.tsv", valPath="./data/dev.tsv", trainPath="./data/train.tsv"):

        testLabel, testText = self.ReadTSV(testPath)
        valLabel, valText = self.ReadTSV(valPath)
        trainLabel, trainText = self.ReadTSV(trainPath)

        testText = self.Doc2Embedding(testText, vocabDict, sentenSize)
        valText = self.Doc2Embedding(valText, vocabDict, sentenSize)
        trainText = self.Doc2Embedding(trainText, vocabDict, sentenSize)

        testIter = self.SpliteData(label=testLabel, text=testText, batchSize=batchSize)
        valIter = self.SpliteData(label=valLabel, text=valText, batchSize=batchSize)
        trainIter = self.SpliteData(label=trainLabel, text=trainText, batchSize=batchSize)

        return testIter, valIter, trainIter






    '''
    splitTexts = [["xx", "xxx"], ["xx", "xxx"], ...]

    Returns a vocabulary dict {word: index}, e.g. {'空间': index1, '很': index2, ...}
    '''
    def BuildVocabDict(self, splitTexts, minFreq=1):
        vocabDict = {}
        for setences in splitTexts:
            for word in setences:
                vocabDict[word] = vocabDict.get(word, 0) + 1

        # keep words that occur at least minFreq times, sorted by frequency (descending)
        vocabList = sorted([_ for _ in vocabDict.items() if _[1] >= minFreq], key=lambda x: x[1], reverse=True)

        print("vocabList==", vocabList)
        vocabDict = {word_count[0]: idx for idx, word_count in enumerate(vocabList)}

        # append the UNK and PAD tokens at the end of the vocabulary
        vocabDict.update({self.UNK: len(vocabDict), self.PAD: len(vocabDict) + 1})

        print("vocabDict size", len(vocabDict))

        return vocabDict






    # Train word vectors with gensim word2vec and save them to disk.
    # Note: the size= keyword follows gensim < 4.0; in gensim >= 4.0 it is called vector_size.
    def Word2Vect(self, docments, vocabDict, embSize=300, prePath="./data/"):
        model = gensim.models.Word2Vec(docments, sg=1, size=embSize, window=5, min_count=1,
                                       negative=3, sample=0.001, hs=1, workers=4)

        wordEmbding = []
        for k, v in vocabDict.items():
            if k in model.wv:
                # use the trained vector for words the model knows
                tmpVect = model.wv.get_vector(k)
                wordEmbding.append(tmpVect)
            else:
                # words the model has never seen (e.g. UNK and PAD) get a random vector
                embedding = np.random.uniform(0, 1, embSize)
                wordEmbding.append(embedding)
                print("not contain key==", k)

        # save the vocabulary and the embedding matrix together in one .npz file
        fileName = "{}word{}.npz".format(prePath, embSize)
        np.savez(fileName, vocabDict=vocabDict, wordEmbding=wordEmbding)

        return model

    # Load the vocabulary and the pretrained embedding matrix back from disk.
    def LoadWordEmbding(self, path="./data/word300.npz"):

        savNpz = np.load(path, allow_pickle=True)
        embedding_pretrained = savNpz["wordEmbding"].astype('float32')
        vocabDict = savNpz["vocabDict"].item()

        print("embding===", len(embedding_pretrained))
        print("embding dict=", vocabDict)

        return vocabDict, embedding_pretrained

    # Build the word vectors once: if the .npz file does not exist yet, read the csv,
    # tokenise it, build the vocabulary and train word2vec.
    def BuildWordVect(self, embSize):

        if not os.path.exists("./data/word300.npz"):
            label, text = self.ReadCSV("./data/ch_auto.csv")

            # tokenise every document with jieba
            splitTexts = [[word for word in jieba.cut(setences)] for setences in text]

            print(splitTexts[0])

            vocabDict = self.BuildVocabDict(splitTexts)

            self.Word2Vect(splitTexts, vocabDict, embSize=embSize)

Configuration

class RNNConfig():
    def __init__(self, vocabSize, outputSize=2, batchSize=50, embedDimention=300,
                 hiddenSize=64, hiddenLayer=3, dropKeep=0.1, bidirectional=True,
                 lr=0.001, cuda=False, saveDir="./data/snap/",
                 logInteval=5, epochs=3, evalInteval=-1, preTrain=True, embdingVect=None
                 ):

        self.vocabSize = vocabSize                    # total number of words in the vocabulary
        self.batchSize = batchSize                    # number of samples fed in at once
        self.embedDimention = embedDimention          # word-vector dimensionality
        self.hiddenSize = hiddenSize                  # LSTM hidden size
        self.hiddenLayer = hiddenLayer                # number of stacked LSTM layers
        self.dropKeep = dropKeep                      # dropout probability
        self.bidirectional = bidirectional            # whether the LSTM is bidirectional
        self.outputSize = outputSize                  # number of output classes
        self.lr = lr                                  # learning rate
        self.cuda = cuda                              # whether to run on the GPU
        self.saveDir = saveDir                        # where to save snapshots
        self.logInteval = logInteval                  # log every N steps
        self.epochs = epochs                          # number of training epochs
        self.evalInteval = evalInteval                # evaluate (and snapshot) every N steps
        self.preTrain = preTrain                      # whether to use the pretrained embedding matrix
        self.embdingVect = embdingVect                # the pretrained embedding matrix itself

Implementing TextRNN

class TextRNN(nn.Module):
    def __init__(self, config):
        super(TextRNN, self).__init__()
        self.config = config

        if config.preTrain:
            # initialise the Embedding layer from the pretrained word2vec matrix;
            # freeze=False lets the vectors keep training along with the rest of the model
            self.embeddings = nn.Embedding.from_pretrained(config.embdingVect, freeze=False)

            print("using pretrained embeddings")
        else:
            # Embedding layer with random initialisation
            self.embeddings = nn.Embedding(self.config.vocabSize, self.config.embedDimention)

        # LSTM layer
        '''
        input_size:  dimensionality of the input features (the word vectors)
        hidden_size: dimensionality of the hidden state
        num_layers:  number of stacked LSTM layers (the default is a single layer)
        batch_first: if True the input has size [batch_size, time_step, input_size],
                     otherwise [time_step, batch_size, input_size]
        '''
        self.lstm = nn.LSTM(input_size=self.config.embedDimention,
                            hidden_size=self.config.hiddenSize,
                            num_layers=self.config.hiddenLayer,
                            dropout=self.config.dropKeep,
                            bidirectional=self.config.bidirectional,
                            batch_first=True
                            )

        # dropout
        self.dropout = nn.Dropout(self.config.dropKeep)

        # a bidirectional LSTM concatenates the forward and backward hidden states,
        # so the feature fed to the fully connected layer is twice hiddenSize
        outSize = self.config.hiddenSize * (2 if self.config.bidirectional else 1)

        print("outSize=", outSize)
        # fully connected layer: maps the final hidden features to the class scores
        self.fc = nn.Linear(
            outSize,
            self.config.outputSize
        )

        # softmax layer (not used in the forward pass, since CrossEntropyLoss expects raw logits)
        self.softmax = nn.Softmax(dim=1)

        self.optimizer = torch.optim.Adam(self.parameters(), config.lr)
        self.lossFunc = nn.CrossEntropyLoss()








    def RunModel(self, x):
        # x.shape = (batch_size, max_sen_len), because the LSTM was built with batch_first=True
        print("x:", x.size(), x[0])

        embedded_sent = self.embeddings(x)  # (batch_size, max_sen_len, embed_size)

        embedded_sent = self.dropout(embedded_sent)

        print("embedded_sent==", embedded_sent.size())

        # LSTM; passing None lets PyTorch initialise the hidden state with zeros
        lstm_out, (h_n, c_n) = self.lstm(embedded_sent, None)

        # dropout on the final hidden states
        final_feature_map = self.dropout(h_n)  # (num_layers * num_directions, batch_size, hidden_size)

        print("final_feature_map:", final_feature_map.size())

        # concatenate the top layer's forward ([-2]) and backward ([-1]) hidden states
        final_feature_map = torch.cat((final_feature_map[-1, :, :], final_feature_map[-2, :, :]), dim=1)

        print("final_feature_map22:", final_feature_map.size())
        # fully connected layer
        final_out = self.fc(final_feature_map)
        # no softmax here: CrossEntropyLoss applies log-softmax internally

        return final_out  # raw logits, one score per class



    def forward(self, x):
        return self.RunModel(x)

    # one optimisation step: compute the loss, backpropagate and update the weights
    def Refrush(self, predictY, targetY):
        self.optimizer.zero_grad()
        loss = self.lossFunc(predictY, targetY)
        loss.backward()
        self.optimizer.step()
        print("loss:", loss.data.item())

        return loss

    # print the accuracy of the current batch
    def ShowRate(self, prdictY, targetY):
        result = torch.argmax(prdictY, dim=1)
        print("rate==", prdictY[0:5], result[0:5], targetY[0:5])
        corrects = (result == targetY).sum().item()

        accuracy = corrects / self.config.batchSize
        print("correct:", corrects, "acc:", accuracy)

    # save a snapshot of the model weights
    def SaveMode(self, saveDir, step):
        if not os.path.exists(saveDir):
            os.mkdir(saveDir)
        savePath = "{}Steps_{}.pt".format(saveDir, step)
        torch.save(self.state_dict(), savePath)

    def RunTrain(self, trainIter, evalIter):

        step = 0
        bestAcc = 0
        self.train()
        for epoch in range(1, self.config.epochs + 1):
            for batch in trainIter:
                feature, target = batch.text, batch.label

                if self.config.cuda:
                    feature, target = feature.cuda(), target.cuda()
                predictY = self.RunModel(feature)
                print("predict Y:", predictY.size(), target.size())
                loss = self.Refrush(predictY, target)

                # stop early once the training loss is essentially zero
                if loss.data.item() < 0.0001:
                    break

                step += 1
                if step % self.config.logInteval == 0:
                    self.ShowRate(predictY, target)

                # periodically evaluate on the validation set and remember the best accuracy
                if self.config.evalInteval > 0 and step % self.config.evalInteval == 0:
                    devAcc = self.Eval(evalIter)
                    if devAcc > bestAcc:
                        bestAcc = devAcc
                        #self.SaveMode(self.config.saveDir, step)
                    self.train()




    # evaluate on a dataset: average cross-entropy loss and accuracy over all batches
    def Eval(self, dataIter):
        self.eval()
        avgLoss = 0.0
        accuracy = 0.0
        for batch in dataIter:
            feature, target = batch.text, batch.label

            if self.config.cuda:
                feature, target = feature.cuda(), target.cuda()

            predictY = self.RunModel(feature)

            loss = F.cross_entropy(predictY, target)
            avgLoss += loss.item()

            result = torch.argmax(predictY, dim=1)
            print("rate==", predictY[0:5], result[0:5], target[0:5])
            correct = (result == target).sum().item()
            acc = correct / self.config.batchSize
            accuracy += acc
            print("correct:", correct, "acc:", acc)

        size = len(dataIter)
        avgLoss /= size
        accuracy = accuracy / size

        print("eval loss:{} acc:{}".format(avgLoss, accuracy))
        return accuracy

Training and testing with TextRNN

import torch
import torch.nn.functional as F
from torch import nn

import os

# how many documents are fed in at once
BATHSIZE = 50
# every sentence is padded or truncated to this length
SentenceLength = 32
# word-vector dimensionality
EmbdingDemition = 300


if __name__ == '__main__':

    dataSet = DataSet()

    ## train the word vectors (done once; the result is cached in ./data/word300.npz)
    dataSet.BuildWordVect(EmbdingDemition)

    ## load the word vectors
    vocabDict, wordVect = dataSet.LoadWordEmbding()

    ## prepare the batched data; note that BuidBatch returns (test, val, train) in that order
    testIter, valIter, trainIter = dataSet.BuidBatch(BATHSIZE, vocabDict, sentenSize=SentenceLength)

    ## configuration
    config = RNNConfig(len(vocabDict), embedDimention=EmbdingDemition, batchSize=BATHSIZE,
                       preTrain=True, embdingVect=torch.tensor(wordVect)
                       )

    ## initialise the RNN
    myRNN = TextRNN(config)

    ## start training
    myRNN.RunTrain(trainIter, valIter)

    ## evaluate on the test set
    myRNN.Eval(testIter)

Dataset download

Download the data and put it in a folder named data.

Link: https://pan.baidu.com/s/1u6xhcLqI6NWZU1Qh8h7imA
Extraction code: 6w30
