# !pip install torchtext -i https://pypi.tuna.tsinghua.edu.cn/simple
First install torchtext to go with PyTorch, then import the required libraries.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import copy
import time
from sklearn.metrics import accuracy_score,confusion_matrix
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader,Dataset
import jieba
from torchtext.legacy import data  # on torchtext < 0.9, use `from torchtext import data` instead
from torchtext.vocab import Vectors
## Make matplotlib display Chinese characters
from matplotlib.font_manager import FontProperties
fonts = FontProperties(fname = "/Library/Fonts/华文细黑.ttf")
1. Reading and preprocessing the Chinese data. The dataset used here is split into a training set, a validation set, and a test set, plus a Chinese stop-word list.
The stop-word list is used to filter out uninformative words, which helps the network perform better. After reading the raw data, we preprocess and clean it into the format the model expects.
train_df = pd.read_csv("./chap7/cnews/cnews.train.txt",sep = "\t",header = None,names = ["label","text"])
val_df = pd.read_csv("./chap7/cnews/cnews.val.txt",sep = "\t",header = None,names=["label","text"])
test_df = pd.read_csv("./chap7/cnews/cnews.test.txt",sep = "\t",header = None,names=["label","text"])
stop_words = pd.read_csv("./chap7/cnews/中文停用词库.txt",header=None,names = ["text"])
train_df.head()
# Preprocess the Chinese text: remove unwanted characters, segment into words, and drop stop words.
# Build the stop-word set once; membership tests on a set are much faster than on an array
stop_word_set = set(stop_words.text.values)
def chinese_pre(text_data):
    # Lower-case letters and strip digits
    text_data = text_data.lower()
    text_data = re.sub(r"\d+","",text_data)
    # Segment with jieba in accurate mode
    text_data = list(jieba.cut(text_data,cut_all=False))
    # Drop stop words and surrounding whitespace
    text_data = [word.strip() for word in text_data if word not in stop_word_set]
    # Join the remaining tokens with spaces into a single string
    text_data = " ".join(text_data)
    return text_data
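A quick sanity check of chinese_pre (the sample sentence is made up, and the exact segmentation depends on jieba's dictionary and the stop-word list):
print(chinese_pre("2022年中国经济稳步增长"))
# roughly: "年 中国 经济 稳步 增长" -- digits removed, text segmented, stop words dropped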
train_df["cut_word"] = train_df.text.apply(chinese_pre)
val_df["cut_word"] = val_df.text.apply(chinese_pre)
test_df["cut_word"] = test_df.text.apply(chinese_pre)
train_df.head()
labelmap = {'体育':0, '娱乐':1, '家居':2, '房产':3, '教育':4,
'时尚':5, '时政':6, '游戏':7, '科技':8, '财经':9}
train_df["labelcode"] = train_df["label"].map(labelmap)
val_df["labelcode"] = val_df["label"].map(labelmap)
test_df["labelcode"] = test_df["label"].map(labelmap)
train_df[['cut_word', 'labelcode']].to_csv("./precess/train.csv",index=False)
val_df[['cut_word', 'labelcode']].to_csv("./precess/val.csv",index=False)
test_df[['cut_word', 'labelcode']].to_csv("./precess/test.csv",index=False)
With preprocessing done, the processed splits are saved to the precess folder for the classification step. In a fresh session, re-import the libraries and reload the processed data.
import numpy as np
import gc
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import copy
import time
from sklearn.metrics import accuracy_score,confusion_matrix
import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader,Dataset
import jieba
import torchtext
from torchtext.legacy import data
from torchtext.vocab import Vectors
## Make matplotlib display Chinese characters
from matplotlib.font_manager import FontProperties
fonts = FontProperties(fname = "/Library/Fonts/华文细黑.ttf")
train_df = pd.read_csv("./precess/train.csv")
val_df = pd.read_csv("./precess/val.csv")
test_df = pd.read_csv("./precess/test.csv")
test_df.head()
# Prepare the data with torchtext
mytokenize = lambda x:x.split()
TEXT = data.Field(sequential = True,tokenize = mytokenize,
include_lengths = True,use_vocab = True,
batch_first = True,fix_length =400)
LABEL= data.Field(sequential = False,use_vocab = False,pad_token = None,unk_token = None)
# Specify how each CSV column is processed
text_data_fields = [("cut_word",TEXT),   # the text column
                    ("labelcode",LABEL)  # the label column
                   ]
# Read the three splits
traindata,valdata,testdata = data.TabularDataset.splits(
path = "./precess",format= "csv",
train = "train.csv",validation = "val.csv",
test = "test.csv",skip_header=True,
fields = text_data_fields
)
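As a quick sanity check, each parsed example exposes one attribute per field (a minimal sketch):
ex = traindata.examples[0]
print(ex.cut_word[:10])   # first ten tokens of the first document
print(ex.labelcode)       # its label code (still a string at this stage)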
Next, build the vocabulary from the training set and visualize the most frequent words.
# Build the vocabulary from the training set; no pretrained word vectors are used
TEXT.build_vocab(traindata,max_size = 20000,vectors = None)
LABEL.build_vocab(traindata)
# Plot the 50 most frequent words
word_fre = TEXT.vocab.freqs.most_common(50)
word_fre = pd.DataFrame(data=word_fre,columns = ["word","fre"])
word_fre.plot(x="word",y = "fre",kind = "bar",legend =False,figsize = (12,7))
# plt.xticks(rotation = 90,fontproperties = fonts,size = 10)  # use this line to render Chinese tick labels
plt.xticks(rotation = 90,size = 10)
plt.show()
Use data.BucketIterator to wrap each split in a data loader that feeds the model 32 samples at a time.
batch_size = 32
train_iter = data.BucketIterator(traindata,batch_size=batch_size)
# no shuffling or sorting needed for the evaluation splits
val_iter = data.BucketIterator(valdata,batch_size=batch_size,shuffle=False,sort=False)
test_iter = data.BucketIterator(testdata,batch_size=batch_size,shuffle=False,sort=False)
Inspect the contents of one batch:
for step, batch in enumerate(train_iter):
    print(step,batch)
    print("="*20)
    print(batch.cut_word[0])          # padded token-id tensor
    print(batch.labelcode)            # label tensor
    print(batch.labelcode.view(-1))   # labels flattened to shape (batch_size,)
    break
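Because TEXT was defined with include_lengths = True, batch.cut_word is a tuple of (padded token ids, true lengths); the code above indexes [0] for the ids. A quick look at both halves:
tokens, lengths = batch.cut_word
print(tokens.shape)   # (batch_size, fix_length), i.e. (32, 400)
print(lengths[:5])    # unpadded length of the first five texts, capped at fix_length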
With the data ready, we can start building the model. We use an LSTM here; first, check whether a GPU is available.
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
device = torch.device('cuda:1' if torch.cuda.is_available() else 'cpu')
device
class lstm_net(nn.Module):
    def __init__(self,vocab_size,embedding_dim,hidden_dim,layer_dim,output_dim):
        """
        params:
            vocab_size: vocabulary size
            embedding_dim: dimension of the word embeddings
            hidden_dim: number of LSTM hidden units
            layer_dim: number of LSTM layers
            output_dim: output dimension (the number of classes)
        """
        super(lstm_net,self).__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        # Map token ids to word vectors
        self.embedding = nn.Embedding(vocab_size,embedding_dim)
        self.lstm = nn.LSTM(embedding_dim,hidden_dim,layer_dim,batch_first = True)
        self.fc1 = nn.Linear(hidden_dim,output_dim)
    def forward(self,x):
        # embedding: (batch_size,seq_len) -> (batch_size,seq_len,embedding_dim)
        embeds = self.embedding(x)
        # r_out: (batch_size,seq_len,hidden_dim)
        # h_n, h_c: (layer_dim,batch_size,hidden_dim)
        r_out,(h_n,h_c) = self.lstm(embeds,None)
        # Classify from the output at the last time step
        out = self.fc1(r_out[:,-1,:])
        return out
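Since the iterator also yields the true sequence lengths, an alternative forward pass could pack the sequences so the LSTM skips pad positions and the final hidden state comes from each text's last real token rather than from column 400. A sketch only; it is not used in the rest of this walkthrough:
from torch.nn.utils.rnn import pack_padded_sequence

class lstm_net_packed(lstm_net):
    def forward(self, x, lengths):
        embeds = self.embedding(x)
        # lengths must live on the CPU; enforce_sorted=False avoids pre-sorting the batch
        packed = pack_padded_sequence(embeds, lengths.cpu(), batch_first=True, enforce_sorted=False)
        _, (h_n, h_c) = self.lstm(packed)
        # h_n[-1]: final hidden state of the last layer, shape (batch_size, hidden_dim)
        return self.fc1(h_n[-1])
Calling it would pass both halves of the batch tuple, e.g. model(batch.cut_word[0], batch.cut_word[1]).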
vocab_size = len(TEXT.vocab)
embedding_dim = 100
hidden_dim = 128
layer_dim =1
output_dim =10
lstm_model = lstm_net(vocab_size,embedding_dim,hidden_dim,layer_dim,output_dim).to(device)
lstm_model
Define the training function:
def train_model(model,traindataloader,valdataloader,criterion,optimizer,num_epochs = 25):
    """
    params:
        model: the network
        traindataloader: training data loader
        valdataloader: validation data loader
        criterion: loss function
        optimizer: optimization method
        num_epochs: number of training epochs
    """
    train_loss_all = []
    train_acc_all = []
    val_loss_all = []
    val_acc_all = []
    since = time.time()
    for epoch in range(num_epochs):
        print("="*20)
        print(f"Epoch {epoch+1}/{num_epochs}")
        # Each epoch has a training phase and a validation phase
        train_loss = 0.0
        train_corrects = 0
        train_num = 0
        val_loss = 0.0
        val_corrects = 0
        val_num = 0
        model.train()  # switch to training mode
        for step,batch in enumerate(traindataloader):
            textdata,target = batch.cut_word[0].to(device),batch.labelcode.view(-1).to(device)
            optimizer.zero_grad(set_to_none=True)
            out = model(textdata)
            loss = criterion(out,target)
            loss.backward()
            optimizer.step()
            pre_lab = torch.argmax(out,1)
            train_loss += loss.item()*len(target)
            train_corrects += torch.sum(pre_lab == target.data)
            train_num += len(target)
        # Loss and accuracy on the training set for this epoch
        train_loss_all.append(train_loss/train_num)
        train_acc_all.append(train_corrects.double().item()/train_num)
        print(f"{epoch+1} Train Loss:{train_loss_all[-1]:.4f}, Train Acc:{train_acc_all[-1]:.4f}")
        del train_corrects,loss,textdata,target,train_loss
        gc.collect()
        torch.cuda.empty_cache()
        # Loss and accuracy on the validation set for this epoch
        model.eval()  # switch to evaluation mode
        with torch.no_grad():  # no gradients needed for validation
            for step,batch in enumerate(valdataloader):
                textdata,target = batch.cut_word[0].to(device),batch.labelcode.view(-1).to(device)
                out = model(textdata)
                pre_lab = torch.argmax(out,1)
                loss = criterion(out,target)
                val_loss += loss.item()*len(target)
                val_corrects += torch.sum(pre_lab == target.data)
                val_num += len(target)
        val_loss_all.append(val_loss/val_num)
        val_acc_all.append(val_corrects.double().item()/val_num)
        print(f"{epoch+1} Val Loss:{val_loss_all[-1]:.4f}, Val Acc:{val_acc_all[-1]:.4f}")
        del val_corrects,loss,textdata,target,val_loss
        gc.collect()
        torch.cuda.empty_cache()
    train_process = pd.DataFrame(data = {"epoch":range(num_epochs),
                                         "train_loss_all":train_loss_all,
                                         "train_acc_all":train_acc_all,
                                         "val_loss_all":val_loss_all,
                                         "val_acc_all":val_acc_all})
    return model, train_process
# Define the optimizer and the loss function
optimizer = optim.Adam(lstm_model.parameters(),lr = 0.0003)
criterion = nn.CrossEntropyLoss()
model,train_process = train_model(lstm_model,train_iter,val_iter,criterion,optimizer,num_epochs=20)
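train_model returns whatever weights the model holds after the final epoch. A common refinement (a sketch only, using the copy module imported above) is to keep a deep copy of the weights from the best validation epoch and restore them at the end:
# inside train_model, before the epoch loop:
best_acc = 0.0
best_wts = copy.deepcopy(model.state_dict())
# ...right after val_acc_all.append(...) in each epoch:
if val_acc_all[-1] > best_acc:
    best_acc = val_acc_all[-1]
    best_wts = copy.deepcopy(model.state_dict())
# after the loop, before returning:
model.load_state_dict(best_wts)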
After training, visualize the training and validation curves.
# Visualize the training process
plt.figure(figsize = (18,6))
plt.subplot(1,2,1)
plt.plot(train_process.epoch,train_process.train_loss_all,"r.-",label = "Train Loss")
plt.plot(train_process.epoch,train_process.val_loss_all,"bs-",label = "Val Loss")
plt.legend()
plt.xlabel("Epoch number",size=13)
plt.ylabel("Loss value",size = 13)
plt.subplot(1,2,2)
plt.plot(train_process.epoch,train_process.train_acc_all,"r.-",label = "Train acc")
plt.plot(train_process.epoch,train_process.val_acc_all,"bs-",label = "Val acc")
plt.legend()
plt.xlabel("Epoch number",size=13)
plt.ylabel("ACC value",size = 13)
plt.show()
Save the trained model.
torch.save(lstm_model,"./precess/lstm_model.pkl")
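Note that torch.save on a whole model pickles the class by reference, so loading it later requires the same class definition on the import path. Saving just the parameters is the more portable pattern (the state-dict file name below is only an example):
torch.save(lstm_model.state_dict(),"./precess/lstm_model_state.pkl")
# To restore: rebuild the architecture first, then load the weights
model2 = lstm_net(vocab_size,embedding_dim,hidden_dim,layer_dim,output_dim)
model2.load_state_dict(torch.load("./precess/lstm_model_state.pkl"))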
Making predictions with the trained model.
Predict on the test set and compute the accuracy.
lstm_model.eval()
test_y_all = torch.LongTensor().to(device)
pre_lab_all = torch.LongTensor().to(device)
with torch.no_grad():
    for step,batch in enumerate(test_iter):
        textdata,target = batch.cut_word[0].to(device),batch.labelcode.view(-1).to(device)
        out = lstm_model(textdata)
        pre_lab = torch.argmax(out,1)
        test_y_all = torch.cat((test_y_all,target))      # true labels
        pre_lab_all = torch.cat((pre_lab_all,pre_lab))   # predictions
acc = accuracy_score(test_y_all.data.cpu().numpy(),pre_lab_all.data.cpu().numpy())
print(f"测试集上的预测精度为:{acc}")
Compute the confusion matrix and visualize it.
class_label = ['体育', '娱乐', '家居', '房产', '教育', '时尚', '时政', '游戏', '科技', '财经']
conf_mat = confusion_matrix(test_y_all.data.cpu().numpy(),pre_lab_all.data.cpu().numpy())
df_cm = pd.DataFrame(conf_mat,index=class_label,columns=class_label)
heat_map =sns.heatmap(df_cm,annot=True,fmt ="d",cmap = "YlGnBu")
heat_map.yaxis.set_ticklabels(heat_map.yaxis.get_ticklabels(),rotation = 0,ha = 'right',fontproperties = fonts)
heat_map.xaxis.set_ticklabels(heat_map.xaxis.get_ticklabels(),rotation = 45,ha = 'right',fontproperties = fonts)
plt.ylabel("True label")
plt.xlabel("Predicted label")
plt.show()
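To classify a single new document, it must go through the same pipeline as the training data: segment with chinese_pre (assumed to still be in scope from the preprocessing step), map tokens through TEXT.vocab, pad or truncate to the fixed length, and run the model. A minimal sketch; predict_text and the sample call are illustrative, not part of the original code:
def predict_text(text, model, text_field, max_len=400):
    tokens = chinese_pre(text).split()        # same preprocessing as training
    stoi = text_field.vocab.stoi              # defaultdict: unknown words map to <unk>
    ids = [stoi[t] for t in tokens]
    pad_id = stoi[text_field.pad_token]
    ids = ids[:max_len] + [pad_id] * max(0, max_len - len(ids))
    x = torch.LongTensor(ids).unsqueeze(0).to(device)   # shape (1, max_len)
    model.eval()
    with torch.no_grad():
        return torch.argmax(model(x), 1).item()

# e.g. predict_text("某条新的新闻文本", lstm_model, TEXT) returns a label code 0-9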
# Visualize the distribution of the learned word vectors
from sklearn.manifold import TSNE
# Load the saved model
lstmmodel = torch.load("./precess/lstm_model.pkl")
# Get the embedding weights
word2vec = lstmmodel.embedding.weight
# Reduce the embeddings to 2-D with t-SNE and plot them
tsne = TSNE(n_components=2,random_state=123)
word2vec_tsne = tsne.fit_transform(word2vec.data.cpu().numpy())
plt.figure(figsize = (10,8))
plt.scatter(word2vec_tsne[:,0],word2vec_tsne[:,1],s = 4)
plt.title("Distribution of all word vectors",size = 15)
plt.show()
# The words corresponding to the embedding rows
words = TEXT.vocab.itos
# Pick out some high-frequency words of interest to visualize
vis_word = ["中国","市场","公司","美国","记者","学生","游戏",
"北京","投资","电影","银行","工作","留学","大学",
"经济","产品","设计","方面","玩家","学校","学习",
"房价","专家","楼市"]
# Look up each word's index in the vocabulary
vis_word_index = [words.index(i) for i in vis_word]
print(vis_word_index)
plt.figure(figsize = (10,8))
for i, index in enumerate(vis_word_index):
    plt.scatter(word2vec_tsne[index,0],word2vec_tsne[index,1])
    plt.text(word2vec_tsne[index,0],word2vec_tsne[index,1],vis_word[i],fontproperties = fonts)
plt.title("Distribution of the selected word vectors")
plt.show()
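As a final check, cosine similarity in the learned embedding space gives rough nearest neighbors; words the model treats similarly should rank high for each other. A small sketch (the query word is just an example):
def nearest_words(query, k=10):
    vecs = word2vec.data.cpu()                    # (vocab_size, embedding_dim)
    q = vecs[words.index(query)].unsqueeze(0)
    sims = F.cosine_similarity(q, vecs, dim=1)    # similarity to every vocabulary word
    topk = sims.topk(k + 1).indices.tolist()      # k+1 because the query itself ranks first
    return [words[i] for i in topk if words[i] != query][:k]

print(nearest_words("中国"))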