config.vocab_size      ## vocabulary size
config.embedding_size  ## dimension of each word vector
config.num_clas        ## number of classes
config.out_channels = 16  ## number of output channels (convolution kernels) per kernel size
config.kernel_size     ## list of kernel sizes, e.g. [3, 4, 5]
config.max_len         ## (padded) sentence length
config.dropout_rate    ## dropout probability

import torch
import torch.nn as nn
import torch.nn.functional as F
class TextCNN(nn.Module):
def __init__(self, config):
super(TextCNN, self).__init__()
self.dropout_rate = config.dropout_rate
self.num_class = config.num_clas
self.embedding = nn.Embedding(num_embeddings=config.vocab_size,
embedding_dim=config.embedding_size)
self.convs = nn.ModuleList([
nn.Sequential(
nn.Conv1d(in_channels=config.embedding_size,
out_channels=config.out_channels,
kernel_size= ks),
nn.ReLU(),
nn.MaxPool1d(kernel_size=config.max_len-ks+1))
                for ks in config.kernel_size])  # one nn.Sequential per kernel size: each contains the convolution, activation and max-pooling layers from the figure, differing only in kernel size (and hence pooling window)
        self.fc = nn.Linear(in_features=config.out_channels * len(config.kernel_size),
                            out_features=self.num_class)  # kernels per size times the number of kernel sizes gives the input length of the fully connected layer
def forward(self, x):
        embed_x = self.embedding(x)  # x: b x src_len -> embed_x: b x src_len x embed_size
        embed_x = embed_x.permute(0, 2, 1)
        # b x src_len x embed_size --> b x embed_size x src_len
        out = [conv(embed_x) for conv in self.convs]  # apply each convolution branch; these outputs have already been max-pooled
        out = torch.cat(out, dim=1)  # concatenate the pooled vectors
        out = out.view(-1, out.size(1))  # flatten to one vector per sample as input to the fully connected layer
        out = F.dropout(input=out, p=self.dropout_rate, training=self.training)  # not shown in the figure: randomly zero some activations to reduce overfitting; passing training=self.training makes it active only in train mode (see nn.Module.train())
out = self.fc(out)
return out
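A quick shape check helps confirm the wiring. Below is a minimal sketch with made-up config values (the Config class and all of its numbers are hypothetical, chosen only for illustration):

import torch

class Config:
    vocab_size = 5000        # hypothetical vocabulary size
    embedding_size = 50
    num_clas = 5
    out_channels = 16
    kernel_size = [3, 4, 5]
    max_len = 20
    dropout_rate = 0.5

model = TextCNN(Config())
x = torch.randint(0, Config.vocab_size, (8, Config.max_len))  # a batch of 8 padded sentences
print(model(x).shape)  # expected: torch.Size([8, 5])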
The code is adapted from the blog post 《NLP学习之使用pytorch搭建textCNN模型进行中文文本分类》.
This post mainly records my notes from self-studying NLP. The earlier parts follow that blog's code; I only kept 2000 samples per class because my machine has no GPU, and I tried writing the model part myself. The accuracy is about 70% after only 10 epochs; you can increase the sample size and the number of epochs to improve it. There is plenty of room for improvement in the code, and there may be mistakes, so feel free to point them out. The Chinese data is the Baike QA corpus,
specifically the third dataset in that collection, the 百科问答 JSON version. Downloading it gives two files, baike_qa_train.json and baike_qa_valid.json, whose records look roughly like this:
{"qid": "qid_1815059893214501395", "category": "烦恼-恋爱", "title": "请问深入骨髓地喜欢一个人怎么办我不能确定对方是不是喜欢我,我却想 ", "desc": "我不能确定对方是不是喜欢我,我却想分分秒秒跟他在一起,有谁能告诉我如何能想他少一点", "answer": "一定要告诉他你很喜欢他 很爱他!! 虽然不知道你和他现在的关系是什么!但如果真的觉得很喜欢就向他表白啊!!起码你努力过了! 女生主动多少占一点优势的!!呵呵 只愿曾经拥有! 到以后就算感情没现在这么强烈了也不会觉得遗憾啊~! 与其每天那么痛苦的想他 恋他 还不如直接告诉他 ! 不要怕回破坏你们现有的感情!因为如果不告诉他 你可能回后悔一辈子!! "}
The downloaded data contains a great many categories. To keep things simple, I selected a small subset: only records whose category's first two characters are one of 教育 (education), 健康 (health), 生活 (lifestyle), 娱乐 (entertainment) or 游戏 (games), with 2000 records per class. The code is as follows:
'''
Select a subset (2000 per class) of samples from the raw data:
keep records whose "category" field's first two characters are a key of the dict wanted_classes,
and cap each class at wanted_num samples.
'''
import jieba
import json
TrainJsonFile = r"J:\NLP语料\baike_qa2019\baike_qa_train.json"
ValidJsonFile = r"J:\NLP语料\baike_qa2019\baike_qa_valid.json"
# MyTrainFile stores the filtered training data
# MyValidFile stores the filtered validation data
MyTrainFile = r"J:\NLP语料\my_baike_qa_train.json"
#MyValidFile = r"J:\NLP语料\my_baike_qa_valid.json"  # the validation data is filtered the same way
StopWordFile = r"J:\NLP语料\baike_qa2019\stopword.txt"
wanted_classes = {"教育":0, "健康":0, "生活":0, "娱乐":0, "游戏":0}
wanted_num = 2000
wanted_all = 2000 * 5
def main():
    f_read = open(ValidJsonFile, "r", encoding="utf-8")
    f_write = open(MyTrainFile, "w", encoding="utf-8")
    #f_write = open(MyValidFile, "w", encoding="utf-8")
    numind = 0
    for line in f_read.readlines():  # list; each element is a JSON string (one record)
        data = json.loads(line)  # parse the JSON string into a Python dict
cla = data['category'][:2]
if cla in wanted_classes and wanted_classes[cla] < wanted_num:
            json_data = json.dumps(data)  # serialize the dict back to a JSON string
f_write.write(json_data)
f_write.write("\n")
wanted_classes[cla] += 1
numind += 1
if numind >= wanted_all:
break
if __name__ == "__main__":
main()
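The same filtering can be reused for the validation split without flipping the commented-out lines by wrapping the logic in a small helper. This is only a sketch of one possible refactor (filter_file is a name I made up), not part of the original code:

def filter_file(src_fp, dst_fp, per_class=2000):
    counts = {"教育": 0, "健康": 0, "生活": 0, "娱乐": 0, "游戏": 0}
    with open(src_fp, "r", encoding="utf-8") as f_read, \
         open(dst_fp, "w", encoding="utf-8") as f_write:
        for line in f_read:
            data = json.loads(line)
            cla = data["category"][:2]
            if cla in counts and counts[cla] < per_class:
                f_write.write(json.dumps(data) + "\n")
                counts[cla] += 1
            if sum(counts.values()) >= per_class * len(counts):
                break

# filter_file(TrainJsonFile, MyTrainFile)
# filter_file(ValidJsonFile, MyValidFile)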
Next we need the vocabulary of all the "title" fields in the training data. That is, each title is segmented with the jieba tokenizer and the stopwords are removed; the remaining words make up the vocabulary.
Each line of the vocabulary file contains: word, word index, word frequency.
from tqdm import tqdm
import json
import jieba
MytrainFile = r"J:\NLP语料\my_baike_qa_train.json"
stopwordFile = r"J:\NLP语料\baike_qa2019\stopword.txt"
wordLabelFile = r"J:\NLP语料\wordLabel.txt"
LengthFile = r"J:\NLP语料\Length.txt"
def read_stopword(fp):
f_stopword = open(fp,"r",encoding="utf-8")
data = f_stopword.read().split("\n")
return data
def main():
worddict = {}
stopword_list = read_stopword(stopwordFile)
datas = open(MytrainFile, "r", encoding="utf-8").read().split("\n")
datas = list(filter(None,datas))
print("len(datas):", len(datas))
data_num = len(datas)
lenth_dict = {}
for data in datas:
data = json.loads(data)
title = data["title"]
        title_seg = jieba.cut(title)  # jieba.cut returns an iterator
lenth = 0
for word in title_seg:
if word in stopword_list:
continue
            lenth += 1  # length of the title after removing stopwords
if word in worddict:
worddict[word] += 1
else:
worddict[word] = 1
if lenth in lenth_dict:
lenth_dict[lenth] += 1
else:
lenth_dict[lenth] = 1
    wordlist = sorted(worddict.items(), key=lambda item: item[1], reverse=True)  # sort by frequency, descending
#print("wordlist:", wordlist)
    f = open(wordLabelFile, "w", encoding="utf-8")
    ind = 0
    for tup in wordlist:
        d = tup[0] + " " + str(ind) + " " + str(tup[1]) + "\n"
        ind += 1
        f.write(d)
    f.close()
for k, v in lenth_dict.items():
lenth_dict[k] = round(v * 1.0 / data_num, 3)
#print(lenth_dict.items())
lenlist = sorted(lenth_dict.items(), key=lambda item:item[0], reverse=True)
print("lenlist:",lenlist)
    f = open(LengthFile, "a", encoding="utf-8")
    for tup in lenlist:
        d = str(tup[0]) + " " + str(tup[1]) + "\n"
        f.write(d)
    f.close()
if __name__ == "__main__":
main()
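The converter in the next step also reads a label file (label.txt) that maps the five class names to the indices 0-4, but the post does not show how that file is produced. A minimal sketch consistent with what read_labelFile() below expects (one "class index" pair per line, space separated) could look like this:

labelFile = r"J:\NLP语料\label.txt"
classes = ["教育", "健康", "生活", "娱乐", "游戏"]
with open(labelFile, "w", encoding="utf-8") as f:
    for ind, cla in enumerate(classes):
        f.write(cla + " " + str(ind) + "\n")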
With the vocabulary we can turn text into numbers. Take the sentence below as an example:
我爱NLP啊 (original sentence)
我 / 爱 / NLP / 啊 (jieba segmentation)
我 / 爱 / NLP (after removing the stopword 啊)
1 5 102 0 0 (numericized: "我" maps to 1, "爱" to 5 and "NLP" to 102, indices chosen for illustration; assuming a fixed sentence length of 5, two zeros are appended as padding)
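The padding / truncation step illustrated above, as a tiny stand-alone sketch (the indices are the illustrative ones from the example, not from a real vocabulary):

ids = [1, 5, 102]                      # "我 / 爱 / NLP" after stopword removal
max_len = 5
ids = (ids + [0] * max_len)[:max_len]  # pad with zeros, then cut to max_len
print(ids)                             # [1, 5, 102, 0, 0]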
import json
import jieba
import random
trainFile = r"J:\NLP语料\my_baike_qa_train.json"
#validFile = r"J:\NLP语料\my_baike_qa_valid.json"  # the validation set also needs to be converted to numbers
stopwordFile = r"J:\NLP语料\baike_qa2019\stopword.txt"
wordlabelFile = r"J:\NLP语料\wordLabel.txt"
trainDataVecFile = r"J:\NLP语料\traindata_vec.txt"
validDataVecFile = r"J:\NLP语料\validdata_vec.txt"
labelFile = r"J:\NLP语料\label.txt"
maxLen = 20
def read_labelFile(fp):
    data = open(fp, "r", encoding="utf-8").read().split("\n")
    data = list(filter(None, data))  # drop empty lines so the split below does not fail on a trailing newline
    #print(data)
    label_w2n = {}
    label_n2w = {}
    for labelItem in data:
        labelItem = labelItem.split(" ")  # ["<class name>", "<index>"]
label_w = labelItem[0]
label_ind = int(labelItem[1])
label_w2n[label_w] = label_ind
label_n2w[label_ind] = label_w
return label_w2n, label_n2w
# label_w2n, label_n2w = read_labelFile(labelFile)
# print(label_n2w)
# print(label_w2n)
def read_stopword(fp):
f_stopword = open(fp,"r",encoding="utf-8")
    data = f_stopword.read().split("\n")  # list; each element is one stopword
return data
def get_wordtoken(fp):
datas = open(fp, "r", encoding="utf-8").read().split("\n")
print(len(datas))
    datas = list(filter(None, datas))  # filter out empty lines
#print(len(datas))
word2ind = {}
for i in datas:
i = i.split(" ")
word2ind[i[0]] = int(i[1])
ind2word = {word2ind[w] : w for w in word2ind}
return word2ind, ind2word
#get_wordtoken(wordlabelFile)
def title2Vec():
label_w2n, label_n2w = read_labelFile(labelFile)
word2ind, ind2word = get_wordtoken(wordlabelFile)
    trainDataFile = open(trainDataVecFile, "w")
    #validDataFile = open(validDataVecFile, "w")
    stoplist = read_stopword(stopwordFile)
    datas = open(trainFile, "r", encoding="utf-8").read().split("\n")  # switch to validFile when converting the validation set
    datas = list(filter(None, datas))
    random.shuffle(datas)
for line in datas:
line_dict = json.loads(line)
title = line_dict["title"]
cls = line_dict["category"][:2]
cls_ind = label_w2n[cls]
title_seg = jieba.cut(title)
title_ind = [cls_ind]
        for word in title_seg:
            if word in stoplist:
                continue
            if word not in word2ind:
                continue  # skip words outside the training vocabulary (only relevant for the validation set)
            title_ind.append(word2ind[word])
#print("title_ind:",title_ind)
length = len(title_ind)
        if length > maxLen:  # truncate long sentences
            title_ind = title_ind[:maxLen]
        else:  # pad with zeros; each line has a total length of 20, where the first number is the class index (0-4)
            title_ind.extend([0] * (maxLen - length))
#print(len(title_ind))
for num in title_ind: #list
trainDataFile.write(str(num) + ",")
trainDataFile.write("\n")
#validDataFile.write(str(num) + ",")
#validDataFile.write("\n")
return word2ind, label_w2n
def main():
word2ind, label_w2n = title2Vec()
return word2ind, label_w2n
if __name__ == "__main__":
main()
'''
The model consists of an embedding layer, convolution layers, a dropout layer and a fully connected layer.
'''
from word2token import title2Vec
word2ind, label_w2n = title2Vec()
#print(word2ind)
maxLen = 19  # the first number of each line is the label; after removing it, 19 numbers remain per sentence
textcnn_param = {
"vocab_size": len(word2ind),
"embed_size": 50,
"class_num":len(label_w2n),
"kernel_num": 16,
"kernel_size":[3,4,5],
"dropout_p": 0.5
}
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
class textCNN(nn.Module):
def __init__(self):
super().__init__()
        # nn.Embedding creates a weight matrix; when padding_idx=ind is given, row ind of that matrix stays zero,
        # so padding positions contribute nothing.
        # If sentences are padded with the number num, padding_idx should be set to num.
self.vocab_size = textcnn_param["vocab_size"]
self.embed_size = textcnn_param["embed_size"]
self.class_num = textcnn_param["class_num"]
self.kernel_num = textcnn_param["kernel_num"]
self.kernel_size = textcnn_param["kernel_size"]
self.dropout_p = textcnn_param["dropout_p"]
self.embedding = nn.Embedding(self.vocab_size, self.embed_size, padding_idx=0)
self.textcnn1 = nn.Conv1d(in_channels = self.embed_size, out_channels=self.kernel_num,kernel_size= self.kernel_size[0])
self.textcnn2 = nn.Conv1d(in_channels = self.embed_size, out_channels=self.kernel_num,kernel_size= self.kernel_size[1])
self.textcnn3 = nn.Conv1d(in_channels = self.embed_size, out_channels=self.kernel_num,kernel_size= self.kernel_size[2])
self.pool1 = nn.MaxPool1d(kernel_size= (maxLen - self.kernel_size[0] + 1))
self.pool2 = nn.MaxPool1d(kernel_size=(maxLen - self.kernel_size[1] + 1))
self.pool3 = nn.MaxPool1d(kernel_size=(maxLen - self.kernel_size[2] + 1))
self.relu = nn.ReLU()
self.linear = nn.Linear(in_features= len(self.kernel_size) * self.kernel_num, out_features= self.class_num)
def forward(self, x):
#x : b x src_len
x = self.embedding(x) # b x src_len x embed_size
#print("input x:", x.size())
x = x.permute(0,2,1)
        x1 = self.textcnn1(x)  # Conv1d input/output shapes:
# input : b x in_channel x src_len (in_channel == embed_size)
# output: b x out_channel x out_len1
#print("x1 conv", x1.size()) # 16 x 16 x 17
x1 = self.relu(x1)
x1 = self.pool1(x1)
#print("x1 pool",x1.size())
        # output: b x out_channel x 1 (the pooling window spans the whole remaining length, so pool_len = 1)
x2 = self.textcnn2(x) # input : b x in_channel x src_len (in_channel == embed_size)
# output: b x out_channel x out_len2
#print("x2 conv", x2.size())
x2 = self.relu(x2)
x2= self.pool2(x2)
        # output: b x out_channel x 1 (the pooling window spans the whole remaining length, so pool_len = 1)
#print("x2 pool", x2.size())
x3 = self.textcnn3(x) # input : b x in_channel x src_len (in_channel == embed_size)
# output: b x out_channel x out_len3
x3 = self.relu(x3)
x3 = self.pool3(x3)
        # output: b x out_channel x 1 (the pooling window spans the whole remaining length, so pool_len = 1)
x = torch.cat([x1,x2,x3], dim = 1) # b x 3*out_channel x 1
#print("x.size:", x.size())
        x = F.dropout(x, p=self.dropout_p, training=self.training)  # randomly zero some activations during training to reduce overfitting
# self.linear 的input: b x in_channel
        x = x.squeeze(2)  # x: b x 3*out_channel (i.e. 3*kernel_num)
out = self.linear(x) # b x class_num
return out
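As before, a quick shape check is useful. This sketch assumes the vocabulary and label files from the previous steps have already been built (so that textcnn_param is populated, with 5 classes here):

if __name__ == "__main__":
    net = textCNN()
    dummy = torch.randint(0, textcnn_param["vocab_size"], (4, maxLen))  # 4 fake padded sentences
    print(net(dummy).shape)  # expected: torch.Size([4, class_num]), i.e. (4, 5)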
from torch.utils.data import Dataset, DataLoader
import torch
import random
trainDataVecFile = r"J:\NLP语料\traindata_vec.txt"
class TextCnnData(Dataset):
    def __init__(self, fp=trainDataVecFile):  # no need to call the parent's __init__; just store what we need
        DataVec = open(fp, "r", encoding="utf-8").read().split("\n")  # list; each element is one comma-separated string
        DataVec = list(filter(None, DataVec))  # e.g. ["2,6720,54,186,9,0,...", "1,234,678,9,0,...", ...] (illustrative values)
        # the first number of each string is the class index of that sentence
        # shuffle in place
        random.shuffle(DataVec)
self.trainDataVec = DataVec
#print("trainDataVec:", self.trainDataVec)
def __getitem__(self, item):
sent = self.trainDataVec[item] # str
sent = sent.split(",") # list 里面每个元素为数字字符
sent = list(filter(None, sent))
sents = [int(digit_str) for digit_str in sent]
#print("sent: ", sents)
idx = sents[0]
#print("idx",idx)
        sentence = torch.LongTensor(sents[1:])  # convert to a tensor; sents[0] is the class index
return idx, sentence
def __len__(self):
return len(self.trainDataVec)
if __name__ == "__main__":
# from tqdm import tqdm
# idx, sent = TextCnnData().__getitem__(1)
# print("idx",idx) # tensor(4)
# print("sent _ size: ",sent, sent.size()) # tensor([3334, 2376,9,883,1287,681,....]
BATCH_SIZE = 5
TextCnnDataLoader = DataLoader(TextCnnData(), batch_size=BATCH_SIZE, shuffle=True)
for i, data in enumerate(TextCnnDataLoader):
# forward
idx_, sents = data
print("idx___",idx_)
print(len(sents))
break
'''The training procedure has 5 steps:
1. data
2. model
3. loss function
4. optimizer
5. training loop'''
import torch
import os
import torch.nn as nn
import numpy as np
import time
from textCNN import textCNN
from DataLoader import TextCnnData
import word2token
from torch.utils.data import DataLoader
import torch.optim as optim
from matplotlib import pyplot as plt
from tqdm import tqdm
MAX_EPOCH = 10
BATCH_SIZE = 16
LR = 0.01
log_interval = 10
val_interval = 1
# =============================== step 1: data ===============================
trainDataVecFile = r"J:\NLP语料\traindata_vec.txt"
validDataVecFile = r"J:\NLP语料\validdata_vec.txt"
# build the Dataset instances
train_data = TextCnnData(fp= trainDataVecFile)
valid_data = TextCnnData(fp = validDataVecFile)
# build the DataLoaders
train_loader = DataLoader(dataset=train_data, batch_size=BATCH_SIZE, shuffle=True)
valid_loader = DataLoader(dataset=valid_data, batch_size=BATCH_SIZE)
# =============================== step 2: model ==============================
net = textCNN()
# =============================== step 3: loss function ======================
criterion = nn.CrossEntropyLoss()
# =============================== step 4: optimizer ==========================
optimizer = optim.SGD(net.parameters(), lr=LR, momentum=0.9)  # choose the optimizer
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)  # learning-rate decay schedule
# =============================== step 5: training ===========================
train_curve = list()
valid_curve = list()
for epoch in tqdm(range(MAX_EPOCH)):
loss_mean = 0.
correct = 0.
total = 0.
net.train()
for i, data in tqdm(enumerate(train_loader)):
# forward
label, sents = data
outputs = net(sents)
# backward
optimizer.zero_grad()
loss = criterion(outputs, label)
loss.backward()
# update weights
optimizer.step()
        # accumulate classification statistics
        _, predicted = torch.max(outputs.data, 1)
        total += label.size(0)
        correct += (predicted == label).squeeze().sum().numpy()
        # log training info
loss_mean += loss.item()
train_curve.append(loss.item())
if (i + 1) % log_interval == 0:
loss_mean = loss_mean / log_interval
print("Training:Epoch[{:0>3}/{:0>3}] Iteration[{:0>3}/{:0>3}] Loss: {:.4f} Acc:{:.2%}".format(
epoch, MAX_EPOCH, i+1, len(train_loader), loss_mean, correct / total))
loss_mean = 0
    scheduler.step()  # update the learning rate
# validate the model
print("save model...........")
model_path = r"J:\NLP语料\model\{}_model_iter_{}_{}_loss_{:.2f}.pkl".format(time.strftime("%y-%m-%d:%H"), epoch, i, loss.item())
f = open(model_path, "w")
torch.save(net.state_dict(),model_path)
# =================================== validation =============================
if (epoch+1) % val_interval == 0:
correct_val = 0.
total_val = 0.
loss_val = 0.
net.eval()
with torch.no_grad():
for j, data in tqdm(enumerate(valid_loader)):
label, inputs = data
outputs = net(inputs)
loss = criterion(outputs, label)
_, predicted = torch.max(outputs.data, 1)
total_val += label.size(0)
correct_val += (predicted == label).squeeze().sum().numpy()
loss_val += loss.item()
loss_val_epoch = loss_val / len(valid_loader)
valid_curve.append(loss_val_epoch)
            # valid_curve.append(loss.item())  # we want the loss over the whole epoch's samples, so take the average (loss_val_epoch) instead
print("Valid:\t Epoch[{:0>3}/{:0>3}] Iteration[{:0>3}/{:0>3}] Loss: {:.4f} Acc:{:.2%}".format(
epoch, MAX_EPOCH, j+1, len(valid_loader), loss_val_epoch, correct_val / total_val))
train_x = range(len(train_curve))
train_y = train_curve
train_iters = len(train_loader)
valid_x = np.arange(1, len(valid_curve)+1) * train_iters * val_interval  # the valid curve stores one loss per epoch, so convert its x-coordinates to iteration counts
valid_y = valid_curve
plt.plot(train_x, train_y, label='Train')
plt.plot(valid_x, valid_y, label='Valid')
plt.legend(loc='upper right')
plt.ylabel('loss value')
plt.xlabel('Iteration')
plt.show()
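Finally, a sketch of how a saved checkpoint could be used to predict the class of a new title. Everything here assumes the earlier scripts have been run and that model_path points at one of the checkpoints saved above; the helper predict_title and the example query are made up for illustration:

import jieba  # needed for segmentation at inference time

def predict_title(title, net, word2ind, label_n2w, stoplist, max_len=19):
    ids = [word2ind[w] for w in jieba.cut(title)
           if w not in stoplist and w in word2ind]    # segment, drop stopwords and OOV words
    ids = ids[:max_len] + [0] * (max_len - len(ids))  # truncate / pad to max_len
    x = torch.LongTensor(ids).unsqueeze(0)            # shape: 1 x max_len
    net.eval()
    with torch.no_grad():
        pred = net(x).argmax(dim=1).item()
    return label_n2w[pred]

# word2ind, _ = word2token.get_wordtoken(word2token.wordlabelFile)
# _, label_n2w = word2token.read_labelFile(word2token.labelFile)
# stoplist = word2token.read_stopword(word2token.stopwordFile)
# net = textCNN()
# net.load_state_dict(torch.load(model_path))
# print(predict_title("怀孕期间饮食要注意什么", net, word2ind, label_n2w, stoplist))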