Library imports and parameter configuration
import os
import csv
import time
import random
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn
import torchtext
from torchtext import data
import spacy
class TrainingConfig:
epoches = 10
evaluateEvery = 100
checkpointEvery = 100
learningRate = 0.001
class ModelConfig:
embeddingSize = 100
hiddenSize = 256
dropoutKeepProb = 0.5
l2RegLambda = 0.0
class Config:
sequenceLength = 200
batchSize = 80
dataSource = "../data/preProcess/labeledTrain.csv"
stopWordSource = "../data/english"
numClasses = 2
rate = 0.8
training = TrainingConfig()
model = ModelConfig()
config = Config()
Loading the dataset
class MyDataSet(data.Dataset):
name = 'grand dataset'
@staticmethod
def sort_key(ex):
return len(ex.review)
def __init__(self,path,text_field,label_field,test=False,aug=False,**kwargs):
fields=[
("id",None),
("review",text_field),
("sentiment",label_field)
]
examples = []
csv_data = pd.read_csv(path)
        if test:
            for text in tqdm(csv_data['review']):
                examples.append(data.Example.fromlist([None,text,None],fields))
else:
for text,label in tqdm(zip(csv_data['review'],csv_data['sentiment'])):
if aug:
rate = random.random()
if rate > 0.5:
text = self.dropout(text)
else:
text = self.shuffle(text)
examples.append(data.Example.fromlist([None,text,label],fields))
super(MyDataSet,self).__init__(examples,fields)
def shuffle(self,text):
        text = np.random.permutation(text.strip().split())
        return ' '.join(text)
def dropout(self,text,p=0.5):
text = text.strip().split()
len_ = len(text)
indexs = np.random.choice(len_,int(len_*p))
for i in indexs:
text[i] = ''
return ' '.join(text)
SEED = 1
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
## Build the dataset and vocabulary
TEXT = data.Field(sequential=True,tokenize="spacy",lower=True,include_lengths=True)
LABEL=data.LabelField(dtype=torch.long)
myset = MyDataSet(config.dataSource,text_field=TEXT,label_field=LABEL,test=False,aug=False)
train_data,valid_data = myset.split(split_ratio=config.rate,random_state=random.seed(SEED))
TEXT.build_vocab(train_data,max_size=25000,vectors="glove.6B.100d",unk_init = torch.Tensor.normal_)
LABEL.build_vocab(train_data)
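The training loop below draws batches from train_iter, which this snippet never constructs; here is a minimal sketch using the legacy torchtext BucketIterator (the names train_iter/valid_iter and the sort_within_batch setting are assumptions, not from the original):
## Assumed iterator setup: BucketIterator batches reviews of similar length
## (via MyDataSet.sort_key) so that padding per batch stays small.
train_iter, valid_iter = data.BucketIterator.splits(
    (train_data, valid_data),
    batch_size=config.batchSize,
    sort_within_batch=True,
)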
Model construction
class LSTM(nn.Module):
def __init__(self,input_size,hidden_size,batch_first=True,num_layers=1,bidirectional=False,dropout=0.2):
super(LSTM,self).__init__()
self.rnn = nn.LSTM(input_size = input_size,hidden_size=hidden_size,num_layers=num_layers,bidirectional=bidirectional,batch_first=batch_first)
self.dropout = nn.Dropout(dropout)
self.input_size = input_size
self.hidden_size = hidden_size
self.batch_first = batch_first
self.num_layers = num_layers
self.bidirectional = bidirectional
def forward(self,x):
x,x_len = x
x = self.dropout(x)
x_len_sorted,x_idx = torch.sort(x_len,descending=True)
x_sorted = x.index_select(dim=0,index=x_idx)
_,x_ori_idx = torch.sort(x_idx)
        x_packed = nn.utils.rnn.pack_padded_sequence(x_sorted,x_len_sorted.cpu(),batch_first=self.batch_first)
x_packed,(h,c) = self.rnn(x_packed)
x = nn.utils.rnn.pad_packed_sequence(x_packed,batch_first=self.batch_first)[0]
x = x.index_select(dim=0,index = x_ori_idx)
h = h.permute(1,0,2).contiguous().view(-1,h.size(0)*h.size(2)).squeeze()
h = h.index_select(dim=0,index=x_ori_idx)
return x,h
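The sort/pack/unpack steps above let the RNN skip padded positions; a tiny standalone sketch of the same pack_padded_sequence pattern on toy tensors (not part of the model):
## Toy demo: 3 padded sequences of lengths 4, 2, 1 with 5 features each.
toy = torch.randn(3, 4, 5)                                ## (batch, max_len, feature)
lens = torch.tensor([4, 2, 1])                            ## already sorted descending
packed = nn.utils.rnn.pack_padded_sequence(toy, lens, batch_first=True)
print(packed.data.shape)                                  ## torch.Size([7, 5]) -- only real timesteps
unpacked, out_lens = nn.utils.rnn.pad_packed_sequence(packed, batch_first=True)
print(unpacked.shape, out_lens)                           ## back to (3, 4, 5) plus the lengths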
class Linear(nn.Module):
def __init__(self,in_features,out_features,dropout=0.0):
super(Linear,self).__init__()
self.linear = nn.Linear(in_features=in_features,out_features=out_features)
if dropout >0 :
self.dropout = nn.Dropout(dropout)
def forward(self,x):
if hasattr(self,'dropout'):
x = self.dropout(x)
x = self.linear(x)
return x
class BiLSTM(nn.Module):
def __init__(self,config,vocab_size):
        super(BiLSTM,self).__init__()
        ## Embedding layer
        self.embedded = nn.Embedding(vocab_size,config.model.embeddingSize)
        ## Bidirectional LSTM layer
self.lstm = LSTM(input_size = config.model.embeddingSize,hidden_size=config.model.hiddenSize,num_layers=1,bidirectional=True,batch_first=True)
        ## Fully connected layer
self.linear = Linear(in_features=config.model.hiddenSize*2,out_features=config.numClasses,dropout=0.5)
self.config = config
def forward(self,x):
        ## x arrives as (sen_len, batch_size) together with a tensor of sequence lengths
        x,x_len = x
        x = x.permute(1,0)  ## put batch_size first: (batch_size, sen_len)
        embedded = self.embedded(x)  ## (batch_size, sen_len, embedding_size)
        x,h = self.lstm((embedded,x_len))  ## h: (batch_size, hidden_size*2)
        linear = self.linear(h)
        return linear
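A quick shape sanity check of the forward pass on a made-up batch (the vocabulary size of 100 and the 7-by-10 toy batch are assumptions):
## Hypothetical toy batch: 7 sequences of length 10 over a 100-token vocabulary.
toy_model = BiLSTM(config=config, vocab_size=100)
toy_x = torch.randint(0, 100, (10, 7))                    ## (sen_len, batch_size), as torchtext delivers it
toy_len = torch.full((7,), 10, dtype=torch.long)
print(toy_model((toy_x, toy_len)).shape)                  ## expected: torch.Size([7, 2])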
Training
vocab_size = len(TEXT.vocab)
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
model = BiLSTM(config=config,vocab_size=vocab_size)
pretrained_embedding = TEXT.vocab.vectors
model.embedded.weight.data.copy_(pretrained_embedding)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embedded.weight.data[UNK_IDX] = torch.zeros(config.model.embeddingSize)
model.embedded.weight.data[PAD_IDX] = torch.zeros(config.model.embeddingSize)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
loss_fn = nn.CrossEntropyLoss()
learning_rate = 0.001
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)
def binary_accuracy(preds,y):
correct = (preds.argmax(1) == y).float()
acc = correct.sum()/len(correct)
return acc
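A quick hypothetical check of binary_accuracy: with two examples and one correct argmax prediction, it should return 0.5:
print(binary_accuracy(torch.tensor([[0.2,0.8],[0.9,0.1]]),torch.tensor([1,1])))  ## tensor(0.5000)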
def repackage_hidden(h):
if isinstance(h,torch.Tensor):
return h.detach()
else:
return tuple(repackage_hidden(v) for v in h)
GRAD_CLIP = 1
NUM_EPOCHS = 50
val_losses = []
for epoch in range(NUM_EPOCHS):
epoch_loss,total_len,epoch_acc = 0,0,0
model.train()
    for i,batch in enumerate(train_iter):
review,sentiment = batch.review,batch.sentiment
if torch.cuda.is_available():
review,sentiment = (review[0].cuda(),review[1].cuda()),sentiment.cuda()
output = model(review)
loss=loss_fn(output,sentiment)
acc = binary_accuracy(output,sentiment)
optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(),GRAD_CLIP)
optimizer.step()
        epoch_loss += loss.item()*len(sentiment)
        epoch_acc += acc.item()*len(sentiment)
        total_len += len(sentiment)
    print(epoch,'train loss:',epoch_loss/total_len,'train acc:',epoch_acc/total_len)
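The config reserves evaluateEvery for validation, but the loop above only trains; a minimal evaluation sketch over valid_iter (assuming the valid_iter from the BucketIterator sketch above):
## Sketch: evaluate on the validation split with gradients disabled.
model.eval()
val_loss, val_acc, val_len = 0, 0, 0
with torch.no_grad():
    for batch in valid_iter:
        review, sentiment = batch.review, batch.sentiment
        if torch.cuda.is_available():
            review, sentiment = (review[0].cuda(), review[1].cuda()), sentiment.cuda()
        output = model(review)
        val_loss += loss_fn(output, sentiment).item() * len(sentiment)
        val_acc += binary_accuracy(output, sentiment).item() * len(sentiment)
        val_len += len(sentiment)
print('valid loss:', val_loss / val_len, 'valid acc:', val_acc / val_len)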