Imports and parameters
import os
import csv
import time
import random
import json
import numpy as np
import pandas as pd
from tqdm import tqdm
import torch
import torch.nn as nn
import torchtext
from torchtext import data
import spacy
import torch.nn.functional as F
class TrainingConfig:
epoches = 10
evaluateEvery = 100
checkpointEvery = 100
learningRate = 0.001
class ModelConfig:
embeddingSize = 100
hiddenSize = 256
dropoutKeepProb = 0.5
l2RegLambda = 0.0
class Config:
sequenceLength = 200
batchSize = 80
dataSource = "../data/preProcess/labeledTrain.csv"
stopWordSource = "../data/english"
numClasses = 2
rate = 0.8
training = TrainingConfig()
model = ModelConfig()
config = Config()
Dataset
class MyDataSet(data.Dataset):
name = 'grand dataset'
@staticmethod
def sort_key(ex):
return len(ex.review)
def __init__(self,path,text_field,label_field,test=False,aug=False,**kwargs):
fields=[
("id",None),
("review",text_field),
("sentiment",label_field)
]
examples = []
csv_data = pd.read_csv(path)
        if test:
            ## test set: no sentiment column, iterate over the reviews only
            for text in tqdm(csv_data['review']):
                examples.append(data.Example.fromlist([None, text, None], fields))
else:
for text,label in tqdm(zip(csv_data['review'],csv_data['sentiment'])):
if aug:
rate = random.random()
if rate > 0.5:
text = self.dropout(text)
else:
text = self.shuffle(text)
examples.append(data.Example.fromlist([None,text,label],fields))
super(MyDataSet,self).__init__(examples,fields)
    def shuffle(self, text):
        ## randomly permute the word order of the sentence
        text = np.random.permutation(text.strip().split())
        return ' '.join(text)
    def dropout(self, text, p=0.5):
        ## randomly blank out a proportion p of the words
        text = text.strip().split()
        len_ = len(text)
        indexs = np.random.choice(len_, int(len_ * p), replace=False)
        for i in indexs:
            text[i] = ''
        return ' '.join(text)
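The two augmentations are easier to see on a toy sentence. A throwaway illustration (self is unused by these helpers, so they can be called unbound; the sample text is made up):
sample = "this movie was surprisingly good"
print(MyDataSet.shuffle(None, sample))         ## word order randomly permuted
print(MyDataSet.dropout(None, sample, p=0.5))  ## about half the words blanked out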
SEED = 1
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed_all(SEED)
## Build the dataset
TEXT = data.Field(sequential=True,tokenize="spacy",lower=True,include_lengths=True)
LABEL=data.LabelField(dtype=torch.long)
myset = MyDataSet(config.dataSource,text_field=TEXT,label_field=LABEL,test=False,aug=False)
train_data, valid_data = myset.split(split_ratio=config.rate, random_state=random.seed(SEED))
TEXT.build_vocab(train_data,max_size=25000,vectors="glove.6B.100d",unk_init = torch.Tensor.normal_)
LABEL.build_vocab(train_data)
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
train_iter,valid_iter = data.BucketIterator.splits((train_data,valid_data),batch_size=config.batchSize,device=device)
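A quick sanity check of one bucketed batch (throwaway code, not part of the pipeline; exact sizes depend on the random split):
sample_batch = next(iter(train_iter))
text, lengths = sample_batch.review            ## include_lengths=True -> (token ids, lengths)
print(text.shape)                              ## (max_sen_len_in_batch, batch_size)
print(lengths[:5], sample_batch.sentiment[:5])
print(len(TEXT.vocab), len(LABEL.vocab))       ## ~25002 tokens (incl. <unk>/<pad>), 2 labels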
Model
## Build the model
class LSTM(nn.Module):
def __init__(self,input_size,hidden_size,batch_first=True,num_layers=1,bidirectional=False,dropout=0.2):
super(LSTM,self).__init__()
self.rnn = nn.LSTM(input_size = input_size,hidden_size=hidden_size,num_layers=num_layers,bidirectional=bidirectional,batch_first=batch_first)
self.dropout = nn.Dropout(dropout)
self.input_size = input_size
self.hidden_size = hidden_size
self.batch_first = batch_first
self.num_layers = num_layers
self.bidirectional = bidirectional
    def forward(self, x):
        x, x_len = x
        x = self.dropout(x)
        ## sort by length so the batch can be packed
        x_len_sorted, x_idx = torch.sort(x_len, descending=True)
        x_sorted = x.index_select(dim=0, index=x_idx)
        _, x_ori_idx = torch.sort(x_idx)
        ## lengths must live on the CPU for pack_padded_sequence
        x_packed = nn.utils.rnn.pack_padded_sequence(x_sorted, x_len_sorted.cpu(), batch_first=self.batch_first)
        x_packed, (h, c) = self.rnn(x_packed)
        x = nn.utils.rnn.pad_packed_sequence(x_packed, batch_first=self.batch_first)[0]
        ## restore the original batch order
        x = x.index_select(dim=0, index=x_ori_idx)
        ## h: (num_layers * num_directions, batch, hidden) -> (batch, num_layers * num_directions * hidden)
        h = h.permute(1, 0, 2).contiguous().view(-1, h.size(0) * h.size(2)).squeeze()
        h = h.index_select(dim=0, index=x_ori_idx)
        return x, h
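The sort/pack/unpack dance above only exists so the LSTM skips padded timesteps. A minimal sketch of that round trip on toy tensors (sizes here are made up, not the model's):
toy = torch.randn(2, 4, 3)                     ## (batch, max_len, input_size), true lengths 4 and 2
toy_len = torch.tensor([4, 2])
packed = nn.utils.rnn.pack_padded_sequence(toy, toy_len, batch_first=True)
toy_rnn = nn.LSTM(input_size=3, hidden_size=5, batch_first=True, bidirectional=True)
out_packed, (h, c) = toy_rnn(packed)
out, out_len = nn.utils.rnn.pad_packed_sequence(out_packed, batch_first=True)
print(out.shape)                               ## torch.Size([2, 4, 10])  (batch, max_len, 2*hidden)
print(h.shape)                                 ## torch.Size([2, 2, 5])   (num_directions, batch, hidden)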
class Linear(nn.Module):
def __init__(self,in_features,out_features,dropout=0.0):
super(Linear,self).__init__()
self.linear = nn.Linear(in_features=in_features,out_features=out_features)
if dropout >0 :
self.dropout = nn.Dropout(dropout)
def forward(self,x):
if hasattr(self,'dropout'):
x = self.dropout(x)
x = self.linear(x)
return x
class BiLSTMAttention(nn.Module):
def __init__(self,config,vocab_size):
        ## Embedding layer
super(BiLSTMAttention,self).__init__()
self.embedded = nn.Embedding(vocab_size,config.model.embeddingSize)
        ## Bidirectional LSTM layer
self.lstm = LSTM(input_size = config.model.embeddingSize,hidden_size=config.model.hiddenSize,num_layers=1,bidirectional=True,batch_first=True)
        ## Fully connected layer
self.linear = Linear(in_features=config.model.hiddenSize*4,out_features=config.numClasses,dropout=0.5)
self.config = config
    def forward(self, x):
        ## x: (sen_len, batch_size) token ids, plus the true lengths
        x, x_len = x
        x = x.permute(1, 0)  ## put the batch dimension first: (batch_size, sen_len)
        embedded = self.embedded(x)  ## (batch_size, sen_len, embedding)
        x, h = self.lstm((embedded, x_len))  ## x: (batch_size, sen_len, hidden*2), h: (batch_size, hidden*2)
        ## attention over the LSTM outputs
        att = self.attention(x, h, x_len)  ## (batch_size, hidden*2)
        att = torch.cat((att, h), dim=1)  ## (batch_size, hidden*4)
        linear = self.linear(att)
        return linear
    def attention(self, x, h, x_len):
        ## weight each timestep of the LSTM outputs by its similarity to the final hidden state
        ## x: (batch_size, sen_len, hidden*2)
        ## h: (batch_size, hidden*2)
        h = h.unsqueeze(2)  ## (batch_size, hidden*2, 1)
        weights = torch.bmm(x, h).squeeze(-1)  ## (batch_size, sen_len)
        ## mask: (batch_size, sen_len), True on real tokens, False on padding
        mask = torch.arange(x_len.max().item(), device=x.device)[None, :] < x_len.to(x.device)[:, None]
        ## mask out the padded positions before the softmax
        weights = weights.masked_fill(~mask, -1e6)
        ## normalize the scores into attention weights
        weights = F.softmax(weights, dim=1)  ## (batch_size, sen_len)
        ## weighted sum of the LSTM outputs
        weights = weights.unsqueeze(1)  ## (batch_size, 1, sen_len)
        attention = torch.bmm(weights, x).squeeze(1)  ## (batch_size, hidden*2)
        return attention
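A quick shape check of the whole model on toy inputs (hypothetical vocab size and lengths; the real model is built in the next section):
toy_model = BiLSTMAttention(config=config, vocab_size=100)
toy_tokens = torch.randint(0, 100, (7, 3))     ## (sen_len, batch), as the Field delivers it
toy_lengths = torch.tensor([7, 5, 2])
print(toy_model((toy_tokens, toy_lengths)).shape)  ## torch.Size([3, 2]) -> (batch, numClasses)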
Training
vocab_size = len(TEXT.vocab)
PAD_IDX = TEXT.vocab.stoi[TEXT.pad_token]
model = BiLSTMAttention(config=config,vocab_size=vocab_size)
pretrained_embedding = TEXT.vocab.vectors
model.embedded.weight.data.copy_(pretrained_embedding)
UNK_IDX = TEXT.vocab.stoi[TEXT.unk_token]
model.embedded.weight.data[UNK_IDX] = torch.zeros(config.model.embeddingSize)
model.embedded.weight.data[PAD_IDX] = torch.zeros(config.model.embeddingSize)
model = model.to(device)
## Start training
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=config.training.learningRate)
def binary_accuracy(preds,y):
correct = (preds.argmax(1) == y).float()
acc = correct.sum()/len(correct)
return acc
def repackage_hidden(h):
if isinstance(h,torch.Tensor):
return h.detach()
else:
return tuple(repackage_hidden(v) for v in h)
GRAD_CLIP = 1
NUM_EPOCHS = 50
val_losses = []
for epoch in range(NUM_EPOCHS):
epoch_loss,total_len,epoch_acc = 0,0,0
model.train()
it = iter(train_iter)
for i,batch in enumerate(it):
review,sentiment = batch.review,batch.sentiment
if torch.cuda.is_available():
review,sentiment = (review[0].cuda(),review[1].cuda()),sentiment.cuda()
output = model(review)
loss=loss_fn(output,sentiment)
acc = binary_accuracy(output,sentiment)
optimizer.zero_grad()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(),GRAD_CLIP)
optimizer.step()
        epoch_loss += loss.item() * sentiment.size(0)
        epoch_acc += acc.item() * sentiment.size(0)
        total_len += sentiment.size(0)
    ## per-epoch training loss and accuracy
    print(epoch, ':', epoch_loss / total_len, ';', epoch_acc / total_len)
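The loop above never looks at valid_iter (and val_losses stays empty). A minimal evaluation pass, sketched with the same batch handling and reusing loss_fn and binary_accuracy, could be run after each epoch:
def evaluate(model, data_iter):
    ## average loss / accuracy over a held-out iterator; dropout off, no gradients
    model.eval()
    total_loss, total_acc, total_len = 0.0, 0.0, 0
    with torch.no_grad():
        for batch in data_iter:
            review, sentiment = batch.review, batch.sentiment
            if torch.cuda.is_available():
                review, sentiment = (review[0].cuda(), review[1].cuda()), sentiment.cuda()
            output = model(review)
            total_loss += loss_fn(output, sentiment).item() * sentiment.size(0)
            total_acc += binary_accuracy(output, sentiment).item() * sentiment.size(0)
            total_len += sentiment.size(0)
    model.train()
    return total_loss / total_len, total_acc / total_len

val_loss, val_acc = evaluate(model, valid_iter)
val_losses.append(val_loss)
print('valid:', val_loss, ';', val_acc)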