Text Classification with BiLSTM-Attention

BiLSTM-Attention

  • Imports + parameters

      import os
      import csv
      import time
      import random
      import json
      import numpy as np
      import pandas as pd
      from tqdm import tqdm 
      
      import torch
      import torch.nn as nn
      import torchtext
      from torchtext import data
      import spacy
      import torch.nn.functional as F
    
    
      class TrainingConfig:
          epoches = 10
          evaluateEvery = 100
          checkpointEvery = 100
          learningRate = 0.001
          
      class ModelConfig:
          embeddingSize = 100
          hiddenSize = 256
          dropoutKeepProb = 0.5
          l2RegLambda = 0.0
          
      class Config:
          sequenceLength = 200
          batchSize = 80
          dataSource = "../data/preProcess/labeledTrain.csv"
          stopWordSource = "../data/english"
          numClasses = 2
          
          rate = 0.8
          training = TrainingConfig()
          model = ModelConfig()
    
      config = Config()
    
  • Dataset

      class MyDataSet(data.Dataset):
          name = 'grand dataset'
          
          @staticmethod
          def sort_key(ex):
              return len(ex.review)
          
          def __init__(self,path,text_field,label_field,test=False,aug=False,**kwargs):
              fields=[
                  ("id",None),
                  ("review",text_field),
                  ("sentiment",label_field)
                  
              ]
              examples = []
              csv_data = pd.read_csv(path)
              
              if test:
                  ## the test set has no labels: iterate over the reviews only
                  for text in tqdm(csv_data['review']):
                      examples.append(data.Example.fromlist([None,text,None],fields))
              else:
                  for text,label in tqdm(zip(csv_data['review'],csv_data['sentiment'])):
                      if aug:
                          rate = random.random()
                          if rate > 0.5:
                              text = self.dropout(text)
                          else:
                              text = self.shuffle(text)
                      examples.append(data.Example.fromlist([None,text,label],fields))
              super(MyDataSet,self).__init__(examples,fields)
              
          def shuffle(self,text):
              ## word-level shuffle augmentation
              text = np.random.permutation(text.strip().split())
              return ' '.join(text)
          
          def dropout(self,text,p=0.5):
              ## randomly blank out a fraction p of the words (simple augmentation)
              text = text.strip().split()
              len_ = len(text)
              indexs = np.random.choice(len_,int(len_*p))
              for i in indexs:
                  text[i] = ''
              return ' '.join(text)
    
      SEED = 1
      np.random.seed(SEED)
      torch.manual_seed(SEED)
      torch.cuda.manual_seed_all(SEED)
      ## build the dataset and the vocabulary
      TEXT = data.Field(sequential=True,tokenize="spacy",lower=True,include_lengths=True)
      LABEL=data.LabelField(dtype=torch.long)
      myset = MyDataSet(config.dataSource,text_field=TEXT,label_field=LABEL,test=False,aug=False)
      
      train_data,valid_data = myset.split(split_ratio=config.rate,random_state=random.seed(SEED))
      
      TEXT.build_vocab(train_data,max_size=25000,vectors="glove.6B.100d",unk_init = torch.Tensor.normal_)
      LABEL.build_vocab(train_data)
    
    
      device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
      train_iter,valid_iter = data.BucketIterator.splits((train_data,valid_data),batch_size=config.batchSize,device=device)
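
    Before building the model it can help to sanity-check the vocabulary and one batch. Because the field was created with include_lengths=True, batch.review is a (token tensor, lengths) pair, and the token tensor is (sen_len, batch_size) since batch_first is left at its default; this is why the model later permutes the dimensions. A quick inspection sketch (names as above):

      print(len(TEXT.vocab))             ## vocab size: max_size plus special tokens such as <unk>/<pad>
      print(TEXT.vocab.itos[:5])         ## first few tokens of the index-to-string list
      print(LABEL.vocab.stoi)            ## label-to-index mapping built from the training split

      batch = next(iter(train_iter))
      text,text_len = batch.review       ## include_lengths=True -> (tensor, lengths)
      print(text.shape,text_len.shape)   ## (sen_len,batch_size) and (batch_size,)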
    
  • Model

      ## build the model
      class LSTM(nn.Module):
          
          def __init__(self,input_size,hidden_size,batch_first=True,num_layers=1,bidirectional=False,dropout=0.2):
              super(LSTM,self).__init__()
              self.rnn = nn.LSTM(input_size = input_size,hidden_size=hidden_size,num_layers=num_layers,bidirectional=bidirectional,batch_first=batch_first)
              self.dropout = nn.Dropout(dropout)
              
              self.input_size = input_size
              self.hidden_size = hidden_size
              self.batch_first = batch_first
              self.num_layers = num_layers
              self.bidirectional = bidirectional
              
          def forward(self,x):
              x,x_len = x
              x = self.dropout(x)
              ## sort by length (descending) as required by pack_padded_sequence
              x_len_sorted,x_idx = torch.sort(x_len,descending=True)
              x_sorted = x.index_select(dim=0,index=x_idx)
              _,x_ori_idx = torch.sort(x_idx)
              
              
              x_packed = nn.utils.rnn.pack_padded_sequence(x_sorted,x_len_sorted.cpu(),batch_first=self.batch_first)
              x_packed,(h,c) = self.rnn(x_packed)
          
              x = nn.utils.rnn.pad_packed_sequence(x_packed,batch_first=self.batch_first)[0]
              x = x.index_select(dim=0,index = x_ori_idx)
              
              ## concatenate the last forward/backward hidden states: (batch_size, 2*hidden_size)
              h = h.permute(1,0,2).contiguous().view(-1,h.size(0)*h.size(2)).squeeze()
              h = h.index_select(dim=0,index=x_ori_idx)
              
              return x,h
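
    The sorting and packing in forward above can be hard to follow, so here is a small standalone sketch (toy tensors, not from the dataset, reusing the torch / nn imports at the top) of the same pack_padded_sequence / pad_packed_sequence round trip and the resulting shapes:

      ## three "sentences" of lengths 4, 2, 3 with feature size 5, batch_first layout
      x = torch.randn(3,4,5)
      lengths = torch.tensor([4,2,3])

      ## pack_padded_sequence expects sequences sorted by length in descending order
      lens_sorted,idx = torch.sort(lengths,descending=True)
      x_sorted = x.index_select(0,idx)
      packed = nn.utils.rnn.pack_padded_sequence(x_sorted,lens_sorted.cpu(),batch_first=True)

      rnn = nn.LSTM(input_size=5,hidden_size=7,batch_first=True,bidirectional=True)
      out_packed,(h,c) = rnn(packed)
      out,out_lens = nn.utils.rnn.pad_packed_sequence(out_packed,batch_first=True)
      print(out.shape)   ## torch.Size([3, 4, 14]) -> (batch, max_len, 2*hidden_size)
      print(h.shape)     ## torch.Size([2, 3, 7])  -> (num_directions, batch, hidden_size)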
    
    
      class Linear(nn.Module):
          def __init__(self,in_features,out_features,dropout=0.0):
              super(Linear,self).__init__()
              self.linear = nn.Linear(in_features=in_features,out_features=out_features)
              if dropout >0 :
                  self.dropout = nn.Dropout(dropout)
                  
          def forward(self,x):
              if hasattr(self,'dropout'):
                  x = self.dropout(x)
              x = self.linear(x)
              return x
    
    
      class BiLSTMAttention(nn.Module):
          
          def __init__(self,config,vocab_size):
              ## embedding layer
              super(BiLSTMAttention,self).__init__()
              self.embedded = nn.Embedding(vocab_size,config.model.embeddingSize)
              ## bidirectional LSTM layer
              self.lstm = LSTM(input_size = config.model.embeddingSize,hidden_size=config.model.hiddenSize,num_layers=1,bidirectional=True,batch_first=True)
              ## fully-connected layer
              self.linear = Linear(in_features=config.model.hiddenSize*4,out_features=config.numClasses,dropout=0.5)
              self.config = config
              
          def forward(self,x):
              ## x: (token tensor of shape sen_len,batch_size, lengths tensor)
              x,x_len = x
              x = x.permute(1,0)## move batch_size to the first dimension
              embedded = self.embedded(x)## batch_size,sen_len,embedding
              x,h = self.lstm((embedded,x_len))## x: batch_size,sen_len,2*hidden; h: batch_size,2*hidden
              ## attention over the LSTM outputs, queried by the final hidden state
              att = self.attention(x,h,x_len) ## batch_size,hidden_size*2
              att = torch.cat((att,h),dim=1)
              linear = self.linear(att)
              return linear
          
          def attention(self,x,h,x_len):
              ## dot-product attention: score each time step against the final hidden state
              ## x: batch_size,sen_len,2*hidden_size
              ## h: batch_size,2*hidden_size
              h = h.unsqueeze(2)## batch_size,2*hidden_size,1
              weights = torch.bmm(x,h).squeeze(-1)## batch_size,sen_len -- raw attention scores
              ## mask: batch_size,sen_len, True for real tokens, False for padding
              mask = torch.arange(x_len.max().item(),device=x.device)[None,:] < x_len[:,None]
              ## mask out the padded positions with a large negative score
              weights = weights.masked_fill(~mask,-1e6)
              ## normalize the scores
              weights = F.softmax(weights,dim=1)## batch_size,sen_len
              ## weighted sum of the LSTM outputs
              weights = weights.unsqueeze(1)## batch_size,1,sen_len
              attention = torch.bmm(weights,x).squeeze(1)## batch_size,2*hidden_size
              return attention
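
    The masking step is the subtle part of the attention layer: padded positions must receive a large negative score so that softmax gives them (near) zero weight. A toy example with made-up numbers, mirroring the logic above:

      scores = torch.tensor([[1.0,2.0,3.0],
                             [0.5,0.1,9.9]])            ## batch_size=2, sen_len=3
      lengths = torch.tensor([3,1])                     ## second sequence has 2 padded positions
      mask = torch.arange(3)[None,:] < lengths[:,None]  ## True for real tokens
      scores = scores.masked_fill(~mask,-1e6)           ## masked_fill is out-of-place: keep the result
      print(F.softmax(scores,dim=1))                    ## second row is ~[1, 0, 0]: padding gets no weight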
    
  • Training

      vocab_size = len(TEXT.vocab)
      PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
      model = BiLSTMAttention(config=config,vocab_size=vocab_size)
      pretrained_embedding = TEXT.vocab.vectors
      model.embedded.weight.data.copy_(pretrained_embedding)
      UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
      model.embedded.weight.data[UNK_IDX] = torch.zeros(config.model.embeddingSize)
      model.embedded.weight.data[PAD_IDX] = torch.zeros(config.model.embeddingSize)
      model = model.to(device)
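
    Optionally (not part of the original setup), the GloVe vectors can be kept frozen during training by turning off gradients on the embedding layer:

      ## optional: freeze the pretrained embeddings so only the LSTM/attention/linear layers are trained
      model.embedded.weight.requires_grad = False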
    
    
      ## start training
      loss_fn = nn.CrossEntropyLoss()
      learning_rate = config.training.learningRate
      optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)
      def binary_accuracy(preds,y):
          correct = (preds.argmax(1) == y).float()
          acc = correct.sum()/len(correct)
          return acc
      def repackage_hidden(h):
          if isinstance(h,torch.Tensor):
              return h.detach()
          else:
              return tuple(repackage_hidden(v) for v in h)
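
    A quick sanity check of binary_accuracy on made-up logits (argmax over the class dimension, then compared with the labels):

      preds = torch.tensor([[0.2,0.8],[0.9,0.1]])  ## fake 2-class logits for a batch of 2
      labels = torch.tensor([1,0])
      print(binary_accuracy(preds,labels))         ## tensor(1.)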
    
    
      GRAD_CLIP = 1
      NUM_EPOCHS = 50
      val_losses = []
      for epoch in range(NUM_EPOCHS):
          epoch_loss,total_len,epoch_acc = 0,0,0
          model.train()
          for i,batch in enumerate(train_iter):
              review,sentiment = batch.review,batch.sentiment
              
              if torch.cuda.is_available():
                  review,sentiment = (review[0].cuda(),review[1].cuda()),sentiment.cuda()
              output = model(review)
              loss=loss_fn(output,sentiment)
              acc = binary_accuracy(output,sentiment)
              
              optimizer.zero_grad()
              loss.backward()
              torch.nn.utils.clip_grad_norm_(model.parameters(),GRAD_CLIP)
              optimizer.step()
              
              epoch_loss += loss.item()*config.batchSize
              epoch_acc += acc.item()*config.batchSize
              total_len += config.batchSize
          print('epoch',epoch,':',epoch_loss/total_len,';',epoch_acc/total_len)
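
    valid_iter is built above but never used in the loop, and val_losses stays empty. A validation pass consistent with the training code might look like the sketch below; it could also be called inside the epoch loop (for example, in the spirit of TrainingConfig.evaluateEvery) instead of once at the end:

      def evaluate(model,data_iter):
          model.eval()
          epoch_loss,epoch_acc,total_len = 0,0,0
          with torch.no_grad():
              for batch in data_iter:
                  review,sentiment = batch.review,batch.sentiment
                  if torch.cuda.is_available():
                      review,sentiment = (review[0].cuda(),review[1].cuda()),sentiment.cuda()
                  output = model(review)
                  loss = loss_fn(output,sentiment)
                  acc = binary_accuracy(output,sentiment)
                  epoch_loss += loss.item()*config.batchSize
                  epoch_acc += acc.item()*config.batchSize
                  total_len += config.batchSize
          return epoch_loss/total_len,epoch_acc/total_len

      val_loss,val_acc = evaluate(model,valid_iter)
      val_losses.append(val_loss)
      print('valid:',val_loss,';',val_acc)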
    
