为了对word embedding这种常用的文本向量化的方法进行巩固,这里我们会完成一个文本情感分类的案例
现在我们有一个经典的数据集——IMDB数据集(地址:http://ai.stanford.edu/~amaas/data/sentiment/),这是一份包含了5万条流行电影评论的数据,其中训练集25000条,测试集25000条。数据格式如下:
下图左边为文件名称,名称包含两部分,分别是序号和情感评分(评分1-4为neg,7-10为pos,评分居中的评论未收录在该数据集中),右边为评论内容
但本次实验从简设计,只实现二分类,即预测评论是积极(pos)还是消极(neg)
import pickle
import torch
from torch.utils.data import DataLoader, Dataset
import os
import re
'''
data: 电影评论数据
数据集准备
使用W2S模型将文本序列化
'''
# Root directory of the extracted IMDB dataset.
data_base_path = r"./aclImdb"
# Load the vocabulary (the Word2Sequence object that was fitted and pickled
# beforehand); it is used to turn token lists into index sequences.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only
# load a ws.pkl that you generated yourself.
with open("./models/ws.pkl", "rb") as ws_file:  # close the handle after loading
    ws = pickle.load(ws_file)
# Length every review is padded / truncated to when it is serialized.
Max_Len = 40
train_batch_size = 512
test_batch_size = 1024
# Tokenizer
# Characters that are replaced by a space before splitting.  They are kept as
# one plain string and escaped as a whole, so regex metacharacters such as
# '$', '^', '\\' and ']' are matched literally instead of acting as anchors or
# merging with their neighbours.
_FILTER_CHARS = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n\x97\x96”“'
_FILTER_RE = re.compile("[" + re.escape(_FILTER_CHARS) + "]")
_HTML_TAG_RE = re.compile(r"<.*?>")


def tokenize(text):
    """Split raw review text into lower-cased word tokens.

    HTML tags (e.g. ``<br />``) and every character in ``_FILTER_CHARS`` are
    replaced by a space, then the text is split on whitespace.

    :param text: raw review text
    :return: list of lower-cased tokens (empty list for empty input)
    """
    text = _HTML_TAG_RE.sub(" ", text)
    text = _FILTER_RE.sub(" ", text)
    return [token.lower() for token in text.split()]
# Dataset definition
class ImdbDataset(Dataset):
    """IMDB review dataset that yields ``(tokens, label)`` pairs.

    Review files are collected from ``<data_base_path>/<train|test>/<pos|neg>``;
    the label is derived from the name of the directory holding the file
    (``neg`` -> 0, anything else -> 1).

    :param train: use the training split when True, otherwise the test split
    """

    def __init__(self, train=True):
        super(ImdbDataset, self).__init__()
        # os.path.join keeps the paths valid on every platform (the previous
        # hard-coded backslashes only worked on Windows).
        self.train_data_path = os.path.join(data_base_path, 'train')
        self.test_data_path = os.path.join(data_base_path, 'test')
        self.data_path = self.train_data_path if train else self.test_data_path
        # Directories holding the positive and the negative reviews
        self.temp_data_path = [os.path.join(self.data_path, 'pos'), os.path.join(self.data_path, 'neg')]
        self.total_file_path_list = []  # paths of all review files
        for path in self.temp_data_path:
            self.total_file_path_list.extend(
                os.path.join(path, name) for name in os.listdir(path) if name.endswith('.txt')
            )

    def __getitem__(self, index):
        """Return ``(tokens, label)`` for the review at ``index``."""
        path = self.total_file_path_list[index]
        # The parent directory name ('pos' / 'neg') is the label; taking it via
        # os.path works regardless of the path separator in use.
        label_str = os.path.basename(os.path.dirname(path))
        label = 0 if label_str == 'neg' else 1
        with open(path, encoding='utf-8') as review_file:
            content = tokenize(review_file.read())
        return content, label

    def __len__(self):
        """Return the number of review files in the selected split."""
        return len(self.total_file_path_list)
def collate_fn(batch):
    """Collate a list of dataset samples into two LongTensors.

    :param batch: list of ``(tokens, label)`` tuples, one per
        ``ImdbDataset.__getitem__`` result
    :return: ``(content, labels)`` where ``content`` holds the index sequences
        produced by ``ws.transform`` (each padded / truncated to ``Max_Len``)
        and ``labels`` holds the integer labels
    """
    content, labels = zip(*batch)
    content = torch.LongTensor([ws.transform(tokens, max_len=Max_Len) for tokens in content])
    labels = torch.LongTensor(labels)
    # No per-batch printing here: dumping whole tensors for every batch floods
    # the console and the training progress bar.
    return content, labels
def get_dataloader(train=True,batch_size=train_batch_size):
    """Build a shuffling DataLoader over the IMDB reviews.

    :param train: load the training split when True, otherwise the test split
    :param batch_size: number of reviews per batch
    :return: DataLoader that collates samples with ``collate_fn``
    """
    return DataLoader(
        dataset=ImdbDataset(train),
        batch_size=batch_size,
        shuffle=True,
        collate_fn=collate_fn,
    )
if __name__ == '__main__':
    # Inspect the first batch only, to sanity-check what the pipeline emits.
    for idx, batch in enumerate(get_dataloader()):
        sentence, label = batch
        print("idx:", idx)
        print("sentence:", sentence)
        print("label:", label)
        break
import numpy as np
'''
文本序列化,序列化文本类
'''
class Word2Sequence:
    """Vocabulary that maps words to integer indices and back.

    Typical use: call :meth:`fit` once per tokenized sentence to accumulate
    word counts, call :meth:`build_vocab` to assign indices, then use
    :meth:`transform` / :meth:`inverse_transform`.
    """

    UNK_TAG = "UNK"  # placeholder for out-of-vocabulary words
    PAD_TAG = "PAD"  # placeholder used to pad short sentences
    UNK = 0
    PAD = 1

    def __init__(self):
        # word -> index
        self.dict = {
            self.UNK_TAG: self.UNK,
            self.PAD_TAG: self.PAD,
        }
        # index -> word; initialised here so to_word() works before build_vocab()
        self.inversed_dict = {self.UNK: self.UNK_TAG, self.PAD: self.PAD_TAG}
        # whether fit() has been called
        self.fited = False
        # whether build_vocab() has been called
        self.build_vocabd = False
        # word -> number of occurrences
        self.count = {}

    def to_index(self, word):
        """Return the index of ``word`` (``UNK`` if it is not in the vocabulary)."""
        assert self.fited, "必须先进行fit操作"
        return self.dict.get(word, self.UNK)

    def to_word(self, index):
        """Return the word stored at ``index`` (``UNK_TAG`` if there is none)."""
        assert self.fited, "必须先进行fit操作"
        return self.inversed_dict.get(index, self.UNK_TAG)

    def __len__(self):
        """Return the vocabulary size, including the UNK and PAD entries."""
        return len(self.dict)

    def fit(self, sentence):
        """Add the words of one tokenized sentence to the frequency counts.

        :param sentence: [word1, word2, word3, ...]
        """
        for word in sentence:
            self.count[word] = self.count.get(word, 0) + 1
        self.fited = True

    def build_vocab(self, min_count=5, max_count=None, max_features=None):
        """Build the word <-> index dictionaries from the accumulated counts.

        The stored counts are filtered in place, and the dictionaries are
        rebuilt from scratch on every call so indices are always unique and
        contiguous.

        :param min_count: drop words occurring fewer times than this (None: no limit)
        :param max_count: drop words occurring more times than this (None: no limit)
        :param max_features: keep only this many most frequent words (None: keep all)
        :return: None
        """
        # Drop words that are too rare
        if min_count is not None:
            self.count = {k: v for k, v in self.count.items() if v >= min_count}
        # Drop words that are too frequent
        if max_count is not None:
            self.count = {k: v for k, v in self.count.items() if v <= max_count}
        if max_features is not None:
            # Keep the max_features most frequent words
            temp = sorted(self.count.items(), key=lambda x: x[-1], reverse=True)[:max_features]
            self.count = dict(temp)
        # Start from the reserved entries only: re-numbering on top of an
        # already built dictionary would hand the same out-of-range index to
        # every word that is already present.
        self.dict = {self.UNK_TAG: self.UNK, self.PAD_TAG: self.PAD}
        for word in self.count:
            # A corpus token equal to a reserved tag must not steal its index.
            if word not in self.dict:
                self.dict[word] = len(self.dict)
        # Reverse mapping (index -> word)
        self.inversed_dict = {index: word for word, index in self.dict.items()}
        self.build_vocabd = True

    def transform(self, sentence, max_len=None):
        """Convert a tokenized sentence into a list of indices.

        :param sentence: [word1, word2, word3, ...]
        :param max_len: when given, the result is truncated or padded with the
            PAD index to exactly this length
        :return: list of integer indices (unknown words map to ``UNK``)
        """
        assert self.fited, "必须先进行fit操作"
        assert self.build_vocabd, "必须先进行build_vocab操作"
        tokens = list(sentence)
        if max_len is not None:
            tokens = tokens[:max_len]  # truncate
            tokens += [self.PAD_TAG] * (max_len - len(tokens))  # pad
        return [self.dict.get(word, self.UNK) for word in tokens]

    def inverse_transform(self, indices):
        """Convert a list of indices back into words.

        :param indices: [1, 2, 3, ...]
        :return: [word1, word2, ...]; indices that are not in the vocabulary
            map to ``UNK_TAG``, consistent with :meth:`to_word`
        """
        assert self.fited, "必须先进行fit操作"
        assert self.build_vocabd, "必须先进行build_vocab操作"
        return [self.inversed_dict.get(idx, self.UNK_TAG) for idx in indices]
if __name__ == '__main__':
    # Small demo: fit two sentences, build the vocabulary, then convert in
    # both directions.
    w2s = Word2Sequence()
    for sample in (["你", "好", "么"], ["你", "好", "哦"]):
        w2s.fit(sample)
    w2s.build_vocab(min_count=1)
    print(w2s.dict)
    print(w2s.fited)
    for tokens, limit in ((["你","好","嘛"], None), (["你好嘛"], 10)):
        print(w2s.transform(tokens, max_len=limit))
    print(w2s.inverse_transform([5,2,4]))
    print(len(w2s))
略
import os
import torch
import numpy as np
from torch import nn, optim
from DataSet import get_dataloader, ws, Max_Len, test_batch_size
import torch.nn.functional as F
from tqdm import tqdm
'''
IMDB电影评论情感分析(pos,neg)积极和消极-改进版
使用LSTM双向循环神经网络,抽取最后一个时间步的特征用作全连接层特征输入
即:文本 -> num -> vector -> LSTM[last TimeStep] -> 2层全连接 -> softmax
'''
class IMDBLstmmodel(nn.Module):
    """LSTM sentiment classifier: embedding -> LSTM -> 2 linear layers -> log-softmax.

    Every argument is optional; the defaults reproduce the original
    hard-coded configuration.

    :param hidden_size: number of LSTM units per layer and direction
    :param embedding_dim: length of each word vector
    :param num_layer: number of stacked LSTM layers
    :param bidirectional: whether to use a bidirectional LSTM
    :param dropout: dropout applied between stacked LSTM layers
    :param vocab_size: number of embedding rows; defaults to ``len(ws)``
    :param padding_idx: embedding index reserved for padding; defaults to ``ws.PAD``
    """

    def __init__(self, hidden_size=64, embedding_dim=200, num_layer=2,
                 bidirectional=True, dropout=0.5, vocab_size=None, padding_idx=None):
        super(IMDBLstmmodel, self).__init__()
        self.hidden_size = hidden_size
        self.embedding_dim = embedding_dim
        self.num_layer = num_layer
        # The (misspelled) attribute name is kept for backward compatibility.
        self.bidriectional = bidirectional
        self.bi_num = 2 if self.bidriectional else 1  # number of directions
        self.dropout = dropout

        # Fall back to the module-level vocabulary only when not overridden.
        if vocab_size is None:
            vocab_size = len(ws)
        if padding_idx is None:
            padding_idx = ws.PAD

        self.embedding = nn.Embedding(vocab_size, self.embedding_dim, padding_idx=padding_idx)
        self.lstm = nn.LSTM(input_size=self.embedding_dim, hidden_size=self.hidden_size,
                            num_layers=self.num_layer, bidirectional=self.bidriectional,
                            dropout=self.dropout)
        # Two fully connected layers with a ReLU in between
        self.fc = nn.Linear(self.hidden_size * self.bi_num, 20)
        self.fc2 = nn.Linear(20, 2)

    def forward(self, x):
        """Return log-probabilities of shape [batch_size, 2].

        :param x: LongTensor of word indices, shape [batch_size, sequence_len]
        """
        # [batch_size, sequence_len, embedding_dim]
        x = self.embedding(x)
        # nn.LSTM defaults to sequence-first input:
        # [sequence_len, batch_size, embedding_dim]
        x = x.permute(1, 0, 2)
        # output: [sequence_len, batch_size, hidden_size * bi_num]
        # h_n / c_n: [num_layer * bi_num, batch_size, hidden_size]
        _, (h_n, _) = self.lstm(x)
        if self.bidriectional:
            # Final states of the last layer: forward (h_n[-2]) and backward
            # (h_n[-1]) -> [batch_size, hidden_size * 2]
            out = torch.cat([h_n[-2, :, :], h_n[-1, :, :]], dim=-1)
        else:
            # Unidirectional: the last layer has a single final state, which
            # matches the hidden_size * bi_num input width of self.fc.
            out = h_n[-1, :, :]
        out = F.relu(self.fc(out))
        out = self.fc2(out)
        return F.log_softmax(out, dim=-1)
# Instantiate the model
model = IMDBLstmmodel()
# Instantiate the optimizer
optimizer = optim.Adam(model.parameters(), lr=0.001)
# Resume from a previously saved checkpoint, if there is one
if os.path.exists('./models/lstm_model.pkl'):
    model.load_state_dict(torch.load('./models/lstm_model.pkl'))
    # The optimizer state lives in its own file; check for it separately so a
    # missing file does not abort start-up when only the model was saved.
    if os.path.exists('./models/lstm_optimizer.pkl'):
        optimizer.load_state_dict(torch.load('./models/lstm_optimizer.pkl'))
# Training routine
def train(epoch):
    """Train the module-level ``model`` for one pass over the training set.

    After the last batch the loss is printed and the model and optimizer
    state dicts are saved under ``./models``.

    :param epoch: epoch number, used only in the progress bar and log message
    """
    # test() switches the model to eval mode; switch back so dropout is active
    # again whenever training and evaluation are interleaved.
    model.train()
    data_loader = get_dataloader()
    last_idx = len(data_loader) - 1
    for idx, (inputs, label) in tqdm(enumerate(data_loader), total=len(data_loader), ascii=True,
                                     desc='第%d轮训练' % epoch):
        # Reset the gradients
        optimizer.zero_grad()
        # Forward pass
        output = model(inputs)
        # Loss (the model emits log-probabilities, hence nll_loss)
        loss = F.nll_loss(output, label)
        # Backward pass
        loss.backward()
        # Parameter update
        optimizer.step()
        if idx == last_idx:
            print('result: 第%d轮次训练,损失%f' % (epoch, loss.item()))
            torch.save(model.state_dict(), "./models/lstm_model.pkl")  # save the model
            torch.save(optimizer.state_dict(), './models/lstm_optimizer.pkl')  # save the optimizer
训练效果(这里我已经提前训练了,所以损失已经很低了)
# Model evaluation
def test():
    """Evaluate the module-level ``model`` on the test split.

    Prints the mean of the per-batch losses and the mean of the per-batch
    accuracies.
    """
    model.eval()
    loss_ = []
    acc_ = []
    with torch.no_grad():
        data_loader = get_dataloader(train=False, batch_size=test_batch_size)
        for inputs, label in tqdm(data_loader, total=len(data_loader), ascii=True, desc='模型评估'):
            output = model(inputs)
            loss = F.nll_loss(output, label, reduction="mean")
            loss_.append(loss.item())
            pred = output.max(dim=1)[1]  # predicted class per sample, shape [batch_size]
            # Compare against the ground-truth labels; comparing pred with
            # itself would always report 100% accuracy.
            acc_.append(pred.eq(label).float().mean().item())  # accuracy of this batch
    print('模型损失%f,平均准确率%f' % (np.mean(loss_), np.mean(acc_)))
准确率在99% 以上(注意:这一数值是在准确率按 pred.eq(pred) 计算时得到的——预测结果与自身比较恒为真,因此它并不代表真实的测试集准确率;真实准确率应按 pred.eq(label) 计算)