1. Common recurrent neural networks
RNN, LSTM, GRU
RNN
torch.nn.RNN()
As the number of recurrent steps grows, a plain RNN's gradients explode or vanish exponentially, so it struggles to capture long-range dependencies and its training converges poorly.
LSTM
Introduces a gating mechanism that gives the network a stronger memory.
The LSTM processes information in three stages:
Forget stage: selectively discard part of the state passed in from the previous step.
Selective-memory stage: decide which parts of the current input to store.
Output stage: decide what to emit as the current hidden state.
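A minimal hand-written sketch of one LSTM cell step (illustrative only; the tensor names and sizes here are made up, not from the original text) makes the three stages concrete:
import torch

input_dim, hidden_dim = 3, 4
x_t = torch.randn(1, input_dim)        # current input
h_prev = torch.zeros(1, hidden_dim)    # previous hidden state
c_prev = torch.zeros(1, hidden_dim)    # previous cell (memory) state
W = torch.randn(4 * hidden_dim, input_dim + hidden_dim)
b = torch.zeros(4 * hidden_dim)

z = torch.cat([x_t, h_prev], dim=1) @ W.t() + b
f, i, g, o = z.chunk(4, dim=1)
f = torch.sigmoid(f)                      # forget stage: what to discard from the old memory
i, g = torch.sigmoid(i), torch.tanh(g)    # selective-memory stage: what new content to store
o = torch.sigmoid(o)                      # output stage: what to expose
c_t = f * c_prev + i * g                  # updated cell memory
h_t = o * torch.tanh(c_t)                 # hidden state passed to the next step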
GRU (Gated Recurrent Unit)
The LSTM has many parameters to train, which makes training harder.
The GRU combines the forget gate and the input gate into a single gate, reducing the number of gates (and parameters); see the sketch below.
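A quick way to see this (a small sketch, not from the original text): with the same input and hidden sizes, the parameter counts of the three modules scale with the number of gates.
import torch.nn as nn

def n_params(m):
    return sum(p.numel() for p in m.parameters())

rnn = nn.RNN(28, 128, batch_first=True)    # a single transform, no gates
lstm = nn.LSTM(28, 128, batch_first=True)  # 4 weight sets: input/forget/cell/output gates
gru = nn.GRU(28, 128, batch_first=True)    # 3 weight sets: reset/update/candidate
print(n_params(rnn), n_params(lstm), n_params(gru))  # GRU lands between RNN and LSTM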
2. Handwritten-digit classification with an RNN
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import time
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torch.utils.data as Data
from torchvision import transforms
import hiddenlayer as hl
train_data=torchvision.datasets.MNIST(
root="./data/MNIST",train=True,transform=transforms.ToTensor(),download=False
)
train_loader=Data.DataLoader(
dataset=train_data,batch_size=64,shuffle=True,num_workers=2
)
test_data=torchvision.datasets.MNIST(
root="./data/MNIST",train=False,transforms=transforms.ToTensor(),download=False
)
test_loader=Data.DataLoader(
dataset=test_data,batch_size=64,shuffle=True,num_workers=2
)
class RNNimc(nn.Module):
    def __init__(self,input_dim,hidden_dim,layer_dim,output_dim):
        super(RNNimc,self).__init__()
        self.hidden_dim=hidden_dim
        self.layer_dim=layer_dim
        self.rnn=nn.RNN(input_dim,hidden_dim,layer_dim,batch_first=True,nonlinearity='relu')
        self.fc1=nn.Linear(hidden_dim,output_dim)
    def forward(self,x):
        out,h_n=self.rnn(x,None)
        out=self.fc1(out[:,-1,:])
        return out
input_dim=28
hidden_dim=128
layer_dim=1
output_dim=10
MyRNNimc=RNNimc(input_dim,hidden_dim,layer_dim,output_dim)
print(MyRNNimc)
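Each 28×28 MNIST image is fed to the RNN as a sequence of 28 rows, each a 28-dimensional vector, which is why input_dim=28. A quick shape check (illustrative, not in the original):
x = torch.zeros(1, 28, 28)      # (batch, seq_len=28 rows, 28 pixels per row)
print(MyRNNimc(x).shape)        # expected: torch.Size([1, 10]), one score per digit class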
hl_graph=hl.build_graph(MyRNNimc,torch.zeros([1,28,28]))
hl_graph.theme=hl.graph.THEMES["blue"].copy()
hl_graph
optimizer=torch.optim.RMSprop(MyRNNimc.parameters(),lr=0.0003)
criterion=nn.CrossEntropyLoss()
train_loss_all=[]
train_acc_all=[]
test_loss_all=[]
test_acc_all=[]
num_epochs=30
for epoch in range(num_epochs):
    print('Epoch {}/{}'.format(epoch,num_epochs-1))
    MyRNNimc.train()
    train_loss=0.0
    corrects=0
    train_num=0
    for step,(b_x,b_y) in enumerate(train_loader):
        xdata=b_x.view(-1,28,28)
        output=MyRNNimc(xdata)
        pre_lab=torch.argmax(output,1)
        loss=criterion(output,b_y)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        train_loss+=loss.item()*b_x.size(0)
        corrects+=torch.sum(pre_lab==b_y.data)
        train_num+=b_x.size(0)
    train_loss_all.append(train_loss/train_num)
    train_acc_all.append(corrects.double().item()/train_num)
    print('{} Train Loss: {:.4f} Train Acc: {:.4f}'.format(epoch,train_loss_all[-1],train_acc_all[-1]))
    MyRNNimc.eval()
    test_loss=0.0
    corrects=0
    test_num=0
    for step,(b_x,b_y) in enumerate(test_loader):
        xdata=b_x.view(-1,28,28)
        output=MyRNNimc(xdata)
        pre_lab=torch.argmax(output,1)
        loss=criterion(output,b_y)
        test_loss+=loss.item()*b_x.size(0)
        corrects+=torch.sum(pre_lab==b_y.data)
        test_num+=b_x.size(0)
    test_loss_all.append(test_loss/test_num)
    test_acc_all.append(corrects.double().item()/test_num)
    print('{} Test Loss: {:.4f} Test Acc: {:.4f}'.format(epoch,test_loss_all[-1],test_acc_all[-1]))
plt.figure(figsize=(14,5))
plt.subplot(1,2,1)
plt.plot(train_loss_all,"ro-",label="Train loss")
plt.plot(test_loss_all,"bs-",label="Val loss")
plt.legend()
plt.xlabel("epoch")
plt.ylabel("Loss")
plt.subplot(1,2,2)
plt.plot(train_acc_all,"ro-",label="Train acc")
plt.plot(test_acc_all,"bs-",label="Val acc")
plt.xlabel("epoch")
plt.ylabel("acc")
plt.legend()
plt.show()
3. Chinese news classification with an LSTM
Build a classifier for Chinese news text.
The dataset is a subset of THUCNews with 10 text categories and 6,500 texts per category (a quick class-count check appears after the data is loaded below).
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.font_manager import FontProperties
fonts=FontProperties(fname="/Library/Fonts/华文细黑.ttf")
import re
import string
import time
import copy
from sklearn.metrics import accuracy_score,confusion_matrix
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import jieba
import torch.utils.data as Data
from torchtext import data
from torchtext.vocab import Vectors
train_df=pd.read_csv("data/chap7/cnews/cnews.train.txt",sep="\t",
header=None,names=["label","text"])
val_df=pd.read_csv("data/chap7/cnews/cnews.val.txt",sep="\t",
header=None,names=["label","text"])
test_df=pd.read_csv("data/chap7/cnews/cnews.test.txt",sep="\t",
header=None,names=["label","text"])
stop_words=pd.read_csv("data/chap7/cnews/中文停用词库.txt",sep="\t",
header=None,names=["text"])
def chinese_pre(text_data):
    text_data=text_data.lower()
    text_data=re.sub(r"\d+","",text_data)
    text_data=list(jieba.cut(text_data,cut_all=False))
    text_data=[word.strip() for word in text_data if word not in stop_words.text.values]
    text_data=" ".join(text_data)
    return text_data
train_df["cutword"]=train_df.text.apply(chinese_pre)
val_df["cutword"]=val_df.text.apply(chinese_pre)
test_df["cutword"]=test_df.text.apply(chinese_pre)
train_df.cutword.head()
labelMap={"体育":0,"娱乐":1,"家居":2,"房产":3,"教育":4,"时尚":5,"时政":6,"游戏":7,"科技":8,"财经":9}
train_df["labelcode"]=train_df["label"].map(labelMap)
val_df["labelcode"]=val_df["label"].map(labelMap)
test_df["labelcode"]=test_df["label"].map(labelMap)
train_df[["labelcode","cutword"]].to_csv("data/chap7,cnews_train2.csv",index=False)
val_df[["labelcode","cutword"]].to_csv("data/chap7,cnews_val2.csv",index=False)
test_df[["labelcode","cutword"]].to_csv("data/chap7,cnews_test2.csv",index=False)
mytokenize=lambda x:x.split()
TEXT=data.Field(sequential=True,tokenize=mytokenize,include_lengths=True,use_vocab=True,batch_first=True,fix_length=400)
LABEL=data.Field(sequential=False,use_vocab=False,pad_token=None,unk_token=None)
text_data_fields=[
("labelcode",LABEL),
("cutword",TEXT)
]
traindata,valdata,testdata=data.TabularDataset.splits(
path="data/chap7",format="csv",
train="cnews_train2.csv",fields=text_data_fielsd,
validation="cnews_val2.csv",
test="cnews_test2.csv",skip_header=True
)
len(traindata),len(valdata),len(testdata)
TEXT.build_vocab(traindata,max_size=20000,vectors=None)
LABEL.build_vocab(traindata)
word_fre=TEXT.vocab.freqs.most_common(n=50)
word_fre=pd.DataFrame(data=word_fre,columns=["word","fre"])
word_fre.plot(x="word",y="fre",kind="bar",legend=False,figsize=(12,7))
plt.xticks(rotation=90,fontproperties=fonts,size=10)
plt.show()
BATCH_SIZE=64
train_iter=data.BucketIterator(traindata,batch_size=BATCH_SIZE)
val_iter=data.BucketIterator(valdata,batch_size=BATCH_SIZE)
test_iter=data.BucketIterator(testdata,batch_size=BATCH_SIZE)
class LSTMNet(nn.Module):
    def __init__(self,vocab_size,embedding_dim,hidden_dim,layer_dim,output_dim):
        super(LSTMNet,self).__init__()
        self.hidden_dim=hidden_dim
        self.layer_dim=layer_dim
        self.embedding=nn.Embedding(vocab_size,embedding_dim)
        self.lstm=nn.LSTM(embedding_dim,hidden_dim,layer_dim,batch_first=True)
        self.fc1=nn.Linear(hidden_dim,output_dim)
    def forward(self,x):
        embeds=self.embedding(x)
        r_out,(h_n,h_c)=self.lstm(embeds,None)
        out=self.fc1(r_out[:,-1,:])
        return out
vocab_size=len(TEXT.vocab)
embedding_dim=100
hidden_dim=128
layer_dim=1
output_dim=10
lstmmodel=LSTMNet(vocab_size,embedding_dim,hidden_dim,layer_dim,output_dim)
lstmmodel
def train_model2(model,traindataloader,valdataloader,criterion,optimizer,num_epochs=25):
    train_loss_all=[]
    train_acc_all=[]
    val_loss_all=[]
    val_acc_all=[]
    since=time.time()
    for epoch in range(num_epochs):
        print('-'*10)
        print('Epoch {}/{}'.format(epoch,num_epochs-1))
        train_loss=0.0
        train_corrects=0
        train_num=0
        val_loss=0.0
        val_corrects=0
        val_num=0
        model.train()
        for step,batch in enumerate(traindataloader):
            textdata,target=batch.cutword[0],batch.labelcode.view(-1)
            out=model(textdata)
            pre_lab=torch.argmax(out,1)
            loss=criterion(out,target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()*len(target)
            train_corrects += torch.sum(pre_lab==target.data)
            train_num += len(target)
        train_loss_all.append(train_loss/train_num)
        train_acc_all.append(train_corrects.double().item()/train_num)
        print('{} Train Loss: {:.4f} Train Acc: {:.4f}'.format(epoch,train_loss_all[-1],train_acc_all[-1]))
        model.eval()
        for step,batch in enumerate(valdataloader):
            textdata,target=batch.cutword[0],batch.labelcode.view(-1)
            out=model(textdata)
            pre_lab=torch.argmax(out,1)
            loss=criterion(out,target)
            val_loss+=loss.item()*len(target)
            val_corrects+=torch.sum(pre_lab==target.data)
            val_num+=len(target)
        val_loss_all.append(val_loss/val_num)
        val_acc_all.append(val_corrects.double().item()/val_num)
        print('{} Val Loss: {:.4f} Val Acc: {:.4f}'.format(epoch,val_loss_all[-1],val_acc_all[-1]))
    train_process=pd.DataFrame(
        data={"epoch":range(num_epochs),
              "train_loss_all":train_loss_all,
              "train_acc_all":train_acc_all,
              "val_loss_all":val_loss_all,
              "val_acc_all":val_acc_all}
    )
    return model,train_process
optimizer=torch.optim.Adam(lstmmodel.parameters(),lr=0.0003)
loss_func=nn.CrossEntropyLoss()
lstmmodel,train_process=train_model2(lstmmodel,train_iter,val_iter,loss_func,optimizer,num_epochs=20)
plt.figure(figsize=(18,6))
plt.subplot(1,2,1)
plt.plot(train_process.epoch,train_process.train_loss_all,"r.-",label="Train loss")
plt.plot(train_process.epoch,train_process.val_loss_all,"bs-",label="Val loss")
plt.legend()
plt.xlabel("epoch number",size=13)
plt.ylabel("Loss value",size=13)
plt.subplot(1,2,2)
plt.plot(train_process.epoch,train_process.train_acc_all,"r.-",label="Train acc")
plt.plot(train_process.epoch,train_process.val_acc_all,"bs-",label="Val acc")
plt.xlabel("epoch number",size=13)
plt.ylabel("acc",size=13)
plt.legend()
plt.show()
lstmmodel.eval()
test_y_all=torch.LongTensor()
pre_lab_all=torch.LongTensor()
for step,batch in enumerate(test_iter):
    textdata,target=batch.cutword[0],batch.labelcode.view(-1)
    out=lstmmodel(textdata)
    pre_lab=torch.argmax(out,1)
    test_y_all=torch.cat((test_y_all,target))
    pre_lab_all=torch.cat((pre_lab_all,pre_lab))
acc=accuracy_score(test_y_all,pre_lab_all)
print("在测试数据集上的预测精度为:",acc)
class_label=["体育","娱乐","家居","房产","教育","时尚","时政","游戏","科技","财经"]
conf_mat=confusion_matrix(test_y_all,pre_lab_all)
df_cm=pd.DataFrame(conf_mat,index=class_label,columns=class_label)
heatmap=sns.heatmap(df_cm,annot=True,fmt="d",cmap="YlGnBu")
heatmap.yaxis.set_ticklabels(heatmap.yaxis.get_ticklabels(),rotation=0,ha='right',fontproperties=fonts)
heatmap.xaxis.set_ticklabels(heatmap.xaxis.get_ticklabels(),rotation=45,ha='right',fontproperties=fonts)
plt.ylabel('True label')
plt.xlabel('Predicted label')
plt.show()
Visualizing the distribution of the word vectors
from sklearn.manifold import TSNE
lstmmodel=torch.load("data/chap7/lstmmodel.pkl")
word2vec=lstmmodel.embedding.weight
words=TEXT.vocab.itos
tsne=TSNE(n_components=2,random_state=123)
word2vec_tsne=tsne.fit_transform(word2vec.data.numpy())
plt.figure(figsize=(10,8))
plt.scatter(word2vec_tsne[:,0],word2vec_tsne[:,1],s=4)
plt.title("所有词向量的分布情况",fontproperties=fpnts,size=15)
plt.show()
vis_word=["中国","市场","公司","美国","记者","学生","游戏","北京","投资","电影","银行","工作","留学","大学","经济","产品","设计","方面","玩家","学校","房价","专家","楼市"]
vis_word_index=[words.index(ii) for ii in vis_word]
plt.figure(figsize=(10,8))
for ii,index in enumerate(vis_word_index):
    plt.scatter(word2vec_tsne[index,0],word2vec_tsne[index,1])
    plt.text(word2vec_tsne[index,0],word2vec_tsne[index,1],vis_word[ii],fontproperties=fonts)
plt.title("词向量的分布情况",fontproperties=fonts,size=15)
plt.show()
4. Sentiment classification with a GRU network
Build a GRU network to classify the IMDB movie-review data; the imdb_train.csv and imdb_test.csv files were introduced and preprocessed earlier.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score
import time
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import transforms
from torchtext import data
from torchtext.vocab import Vectors
mytokenize=lambda x:x.split()
TEXT=data.Field(sequential=True,tokenize=mytokenize,include_lengths=True,use_vocab=True,batch_first=True,fix_length=200)
LABEL=data.Field(sequential=False,use_vocab=False,pad_token=None,unk_token=None)
train_test_fields=[
("label",LABEL),
("text",TEXT)
]
traindata,testdata=data.TabularDataset.splits(
path="./data/chap6",format="csv",
train="imdb_train.csv",fields=train_test_fields,
test="imdb_test.csv",skip_header=True
)
vec=Vectors("gloves.6b.100d.txt","./data")
TEXT.build_vocab(traindata,max_size=20000,vectors=vec)
LABEL.build_vocab(traindata)
BATCH_SIZE=32
train_iter=data.BucketIterator(traindata,batch_size=BATCH_SIZE)
test_iter=data.BucketIterator(testdata,batch_size=BATCH_SIZE)
class GRUNet(nn.Module):
    def __init__(self,vocab_size,embedding_dim,hidden_dim,layer_dim,output_dim):
        super(GRUNet,self).__init__()
        self.hidden_dim=hidden_dim
        self.layer_dim=layer_dim
        self.embedding=nn.Embedding(vocab_size,embedding_dim)
        self.gru=nn.GRU(embedding_dim,hidden_dim,layer_dim,batch_first=True)
        self.fc1=nn.Sequential(
            nn.Linear(hidden_dim,hidden_dim),
            torch.nn.Dropout(0.5),
            torch.nn.ReLU(),
            nn.Linear(hidden_dim,output_dim)
        )
    def forward(self,x):
        embeds=self.embedding(x)
        r_out,h_n=self.gru(embeds,None)
        out=self.fc1(r_out[:,-1,:])
        return out
vocab_size=len(TEXT.vocab)
embedding_dim=vec.dim
hidden_dim=128
layer_dim=1
output_dim=2
grumodel=GRUNet(vocab_size,embedding_dim,hidden_dim,layer_dim,output_dim)
grumodel
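# Initialize the embedding layer with the pretrained GloVe vectors collected in TEXT.vocab,
# then zero the <unk> and <pad> rows so these placeholder tokens start without pretrained meaning.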
grumodel.embedding.weight.data.copy_(TEXT.vocab.vectors)
UNK_IDX=TEXT.vocab.stoi[TEXT.unk_token]
PAD_IDX=TEXT.vocab.stoi[TEXT.pad_token]
grumodel.embedding.weight.data[UNK_IDX]=torch.zeros(vec.dim)
grumodel.embedding.weight.data[PAD_IDX]=torch.zeros(vec.dim)
def train_model(model,traindataloader,testdataloader,criterion,optimizer,num_epochs=25):
    train_loss_all=[]
    train_acc_all=[]
    test_loss_all=[]
    test_acc_all=[]
    learn_rate=[]
    since=time.time()
    scheduler=optim.lr_scheduler.StepLR(optimizer,step_size=5,gamma=0.1)
    for epoch in range(num_epochs):
        learn_rate.append(scheduler.get_lr()[0])
        print('-'*10)
        print('Epoch {}/{}, Lr:{}'.format(epoch,num_epochs-1,learn_rate[-1]))
        train_loss=0.0
        train_corrects=0
        train_num=0
        test_loss=0.0
        test_corrects=0
        test_num=0
        model.train()
        for step,batch in enumerate(traindataloader):
            textdata,target=batch.text[0],batch.label
            out=model(textdata)
            pre_lab=torch.argmax(out,1)
            loss=criterion(out,target)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            train_loss += loss.item()*len(target)
            train_corrects += torch.sum(pre_lab==target.data)
            train_num += len(target)
        train_loss_all.append(train_loss/train_num)
        train_acc_all.append(train_corrects.double().item()/train_num)
        print('{} Train Loss: {:.4f} Train Acc: {:.4f}'.format(epoch,train_loss_all[-1],train_acc_all[-1]))
        scheduler.step()
        model.eval()
        for step,batch in enumerate(testdataloader):
            textdata,target=batch.text[0],batch.label
            out=model(textdata)
            pre_lab=torch.argmax(out,1)
            loss=criterion(out,target)
            test_loss+=loss.item()*len(target)
            test_corrects+=torch.sum(pre_lab==target.data)
            test_num+=len(target)
        test_loss_all.append(test_loss/test_num)
        test_acc_all.append(test_corrects.double().item()/test_num)
        print('{} Test Loss: {:.4f} Test Acc: {:.4f}'.format(epoch,test_loss_all[-1],test_acc_all[-1]))
    train_process=pd.DataFrame(
        data={"epoch":range(num_epochs),
              "train_loss_all":train_loss_all,
              "train_acc_all":train_acc_all,
              "test_loss_all":test_loss_all,
              "test_acc_all":test_acc_all,
              "learn_rate":learn_rate}
    )
    return model,train_process
optimizer=optim.RMSprop(grumodel.parameters(),lr=0.003)
loss_func=nn.CrossEntropyLoss()
grumodel,train_process=train_model(grumodel,train_iter,test_iter,loss_func,optimizer,num_epochs=10)
plt.figure(figsize=(18,6))
plt.subplot(1,2,1)
plt.plot(train_process.epoch,train_process.train_loss_all,"r.-",label="Train loss")
plt.plot(train_process.epoch,train_process.test_loss_all,"bs-",label="Test loss")
plt.legend()
plt.xlabel("Epoch number",size=13)
plt.ylabel("Loss value",size=13)
plt.subplot(1,2,2)
plt.plot(train_process.epoch,train_process.train_acc_all,"r.-",label="Train acc")
plt.plot(train_process.epoch,train_process.test_acc_all,"bs-",label="Test acc")
plt.xlabel("Epoch number",size=13)
plt.ylabel("Acc",size=13)
plt.legend()
plt.show()
grumodel.eval()
test_y_all=torch.LongTensor()
pre_lab_all=torch.LongTensor()
for step,batch in enumerate(test_iter):
    textdata,target=batch.text[0],batch.label.view(-1)
    out=grumodel(textdata)
    pre_lab=torch.argmax(out,1)
    test_y_all=torch.cat((test_y_all,target))
    pre_lab_all=torch.cat((pre_lab_all,pre_lab))
acc=accuracy_score(test_y_all,pre_lab_all)
print("测试数据集上的预测精度为: ",acc)