Text Classification with Bi-LSTM

Bi-LSTM

  • Imports and configuration

      import os
      import csv
      import time
      import random
      import json
      import numpy as np
      import pandas as pd
      from tqdm import tqdm 
      
      import torch
      import torch.nn as nn
      import torchtext
      from torchtext import data  ## legacy Field/Dataset API (torchtext.legacy.data in newer releases)
      import spacy
      
      class TrainingConfig:
          epoches = 10
          evaluateEvery = 100
          checkpointEvery = 100
          learningRate = 0.001
          
      class ModelConfig:
          embeddingSize = 100
          hiddenSize = 256
          dropoutKeepProb = 0.5
          l2RegLambda = 0.0
    
      class Config:
          sequenceLength = 200
          batchSize = 80
          dataSource = "../data/preProcess/labeledTrain.csv"
          stopWordSource = "../data/english"
          numClasses = 2
          
          rate = 0.8  ## fraction of the data used for training, the rest for validation
          training = TrainingConfig()
          model = ModelConfig()
          
      config = Config()
    
  • Loading the dataset

     	class MyDataSet(data.Dataset):
         name = 'grand dataset'
         
         @staticmethod
         def sort_key(ex):
             return len(ex.review)
         
         def __init__(self,path,text_field,label_field,test=False,aug=False,**kwargs):
             fields=[
                 ("id",None),
                 ("review",text_field),
                 ("sentiment",label_field)
                 
             ]
             examples = []
             csv_data = pd.read_csv(path)
             
             if test:
                  ## in test mode there may be no sentiment column, so keep the labels as None
                  for text in tqdm(csv_data['review']):
                     examples.append(data.Example.fromlist([None,text,None],fields))
             else:
                 for text,label in tqdm(zip(csv_data['review'],csv_data['sentiment'])):
                     if aug:
                         rate = random.random()
                         if rate > 0.5:
                             text = self.dropout(text)
                         else:
                             text = self.shuffle(text)
                     examples.append(data.Example.fromlist([None,text,label],fields))
             super(MyDataSet,self).__init__(examples,fields)
             
          def shuffle(self,text):
              ## augmentation: randomly permute the word order
              text = np.random.permutation(text.strip().split())
              return ' '.join(text)
         
          def dropout(self,text,p=0.5):
              ## augmentation: blank out roughly a fraction p of the words
             text = text.strip().split()
             len_ = len(text)
             indexs = np.random.choice(len_,int(len_*p))
             for i in indexs:
                 text[i] = ''
             return ' '.join(text)
    
      SEED = 1
      random.seed(SEED)
      np.random.seed(SEED)
      torch.manual_seed(SEED)
      torch.cuda.manual_seed_all(SEED)
      ## build the fields and the dataset
     TEXT = data.Field(sequential=True,tokenize="spacy",lower=True,include_lengths=True)
     LABEL=data.LabelField(dtype=torch.long)
     myset = MyDataSet(config.dataSource,text_field=TEXT,label_field=LABEL,test=False,aug=False)
     
      train_data,valid_data = myset.split(split_ratio=config.rate,random_state=random.seed(SEED))
     
     TEXT.build_vocab(train_data,max_size=25000,vectors="glove.6B.100d",unk_init = torch.Tensor.normal_)
     LABEL.build_vocab(train_data)
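
The training section below iterates over train_iter, which the original never constructs. A minimal sketch using torchtext's data.BucketIterator (the iterator names, the explicit device handling, and the sort_within_batch choice are assumptions, not part of the original code) could look like this:

      ## bucket reviews of similar length into the same batch; keeping each batch sorted
      ## plays well with the sequence packing done inside the model
      device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
      train_iter,valid_iter = data.BucketIterator.splits(
          (train_data,valid_data),
          batch_size=config.batchSize,
          sort_key=lambda ex: len(ex.review),
          sort_within_batch=True,
          device=device)

Because device is passed to the iterators, the manual .cuda() calls in the training loop below are effectively no-ops, but they are kept to match the original code.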
    
  • Building the model

We now build the model: an LSTM wrapper that packs variable-length sequences, a Linear wrapper with optional dropout, and the BiLSTM classifier that ties them together.

	class LSTM(nn.Module):
	    
	    def __init__(self,input_size,hidden_size,batch_first=True,num_layers=1,bidirectional=False,dropout=0.2):
	        super(LSTM,self).__init__()
	        self.rnn = nn.LSTM(input_size = input_size,hidden_size=hidden_size,num_layers=num_layers,bidirectional=bidirectional,batch_first=batch_first)
	        self.dropout = nn.Dropout(dropout)
	        
	        self.input_size = input_size
	        self.hidden_size = hidden_size
	        self.batch_first = batch_first
	        self.num_layers = num_layers
	        self.bidirectional = bidirectional
	        
	    def forward(self,x):
	        ## x is a tuple of (embedded inputs, true sequence lengths)
	        x,x_len = x
	        x = self.dropout(x)
	        ## sort by length (descending) so the padded batch can be packed
	        x_len_sorted,x_idx = torch.sort(x_len,descending=True)
	        x_sorted = x.index_select(dim=0,index=x_idx)
	        _,x_ori_idx = torch.sort(x_idx)
	        
	        ## pack_padded_sequence expects CPU lengths in recent PyTorch versions
	        x_packed = nn.utils.rnn.pack_padded_sequence(x_sorted,x_len_sorted.cpu(),batch_first=self.batch_first)
	        x_packed,(h,c) = self.rnn(x_packed)
	    
	        ## unpack, then restore the original example order
	        x = nn.utils.rnn.pad_packed_sequence(x_packed,batch_first=self.batch_first)[0]
	        x = x.index_select(dim=0,index = x_ori_idx)
	        
	        ## h: (num_directions,batch,hidden) -> (batch,num_directions*hidden), i.e. the
	        ## final forward and backward hidden states concatenated for each example
	        h = h.permute(1,0,2).contiguous().view(-1,h.size(0)*h.size(2)).squeeze()
	        h = h.index_select(dim=0,index=x_ori_idx)
	        
	        return x,h


	class Linear(nn.Module):
	    def __init__(self,in_features,out_features,dropout=0.0):
	        super(Linear,self).__init__()
	        self.linear = nn.Linear(in_features=in_features,out_features=out_features)
	        if dropout >0 :
	            self.dropout = nn.Dropout(dropout)
	            
	    def forward(self,x):
	        if hasattr(self,'dropout'):
	            x = self.dropout(x)
	        x = self.linear(x)
	        return x

	class BiLSTM(nn.Module):
	    
	    def __init__(self,config,vocab_size):
	        super(BiLSTM,self).__init__()
	        ## embedding layer
	        self.embedded = nn.Embedding(vocab_size,config.model.embeddingSize)
	        ## bidirectional LSTM layer
	        self.lstm = LSTM(input_size = config.model.embeddingSize,hidden_size=config.model.hiddenSize,num_layers=1,bidirectional=True,batch_first=True)
	        ## fully connected classification layer
	        self.linear = Linear(in_features=config.model.hiddenSize*2,out_features=config.numClasses,dropout=0.5)
	        self.config = config
	        
	    def forward(self,x):
	        ## x arrives as (sen_len,batch_size) token ids plus a tensor of true lengths
	        x,x_len = x
	        x = x.permute(1,0)## move batch_size to the first dimension
	        embedded = self.embedded(x)## (batch_size,sen_len,embedding_size)
	        x,h = self.lstm((embedded,x_len))## h: (batch_size,hidden_size*2)
	        
	        linear = self.linear(h)
	        return linear
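
As a quick sanity check of the wiring above (the toy vocabulary size, the batch of four random sequences, and their lengths are made-up values, not part of the original pipeline), the forward pass can be exercised before any real data is involved:

	## toy forward pass: 4 random sequences over a 100-word vocabulary; the classifier
	## should return one row of logits per example, i.e. a (4, numClasses) tensor
	toy_model = BiLSTM(config=config,vocab_size=100)
	toy_tokens = torch.randint(0,100,(config.sequenceLength,4))## (sen_len,batch_size)
	toy_lengths = torch.tensor([200,150,100,50])
	print(toy_model((toy_tokens,toy_lengths)).shape)## torch.Size([4, 2])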
  • Training

      vocab_size = len(TEXT.vocab)
      PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
      UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
      model = BiLSTM(config=config,vocab_size=vocab_size)
      ## initialise the embedding layer with the pretrained GloVe vectors
      pretrained_embedding = TEXT.vocab.vectors
      model.embedded.weight.data.copy_(pretrained_embedding)
      ## zero the <unk> and <pad> embeddings so they carry no signal initially
      model.embedded.weight.data[UNK_IDX] = torch.zeros(config.model.embeddingSize)
      model.embedded.weight.data[PAD_IDX] = torch.zeros(config.model.embeddingSize)
      model = model.to(device)
    

With the pretrained embeddings loaded, we set up the loss and optimizer and train the model.

	loss_fn = nn.CrossEntropyLoss()
	learning_rate = 0.001
	optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)
	
	def binary_accuracy(preds,y):
	    correct = (preds.argmax(1) == y).float()
	    acc = correct.sum()/len(correct)
	    return acc
	
	def repackage_hidden(h):
	    if isinstance(h,torch.Tensor):
	        return h.detach()
	    else:
	        return tuple(repackage_hidden(v) for v in h)
	
	GRAD_CLIP = 1
	NUM_EPOCHS = 50
	val_losses = []
	for each in range(NUM_EPOCHS):
	    epoch_loss,total_len,epoch_acc = 0,0,0
	    model.train()
	    it = iter(train_iter)
	    for i,batch in enumerate(it):
	        review,sentiment = batch.review,batch.sentiment
	        
	        if torch.cuda.is_available():
	            review,sentiment = (review[0].cuda(),review[1].cuda()),sentiment.cuda()
	        output = model(review)
	        loss=loss_fn(output,sentiment)
	        acc = binary_accuracy(output,sentiment)
	        
	        optimizer.zero_grad()
	        loss.backward()
	        ## clip gradients to keep LSTM training stable
	        torch.nn.utils.clip_grad_norm_(model.parameters(),GRAD_CLIP)
	        optimizer.step()
	        
	        ## the last batch can be smaller than config.batchSize, so use the true batch size
	        batch_len = sentiment.size(0)
	        epoch_loss += loss.item()*batch_len
	        epoch_acc += acc.item()*batch_len
	        total_len += batch_len
	    print('epoch',each,'train loss:',epoch_loss/total_len,'train acc:',epoch_acc/total_len)
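	    ## the following per-epoch validation pass is an added sketch, not in the original;
	    ## it assumes the valid_iter built with data.BucketIterator after the vocab step
	    model.eval()
	    val_loss,val_acc,val_len = 0.0,0.0,0
	    with torch.no_grad():
	        for batch in valid_iter:
	            review,sentiment = batch.review,batch.sentiment
	            if torch.cuda.is_available():
	                review,sentiment = (review[0].cuda(),review[1].cuda()),sentiment.cuda()
	            output = model(review)
	            val_loss += loss_fn(output,sentiment).item()*sentiment.size(0)
	            val_acc += binary_accuracy(output,sentiment).item()*sentiment.size(0)
	            val_len += sentiment.size(0)
	    val_losses.append(val_loss/val_len)
	    print('epoch',each,'valid loss:',val_loss/val_len,'valid acc:',val_acc/val_len)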
