Word segmentation with jieba
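Before reading data_processing.py, here is a minimal sketch of what jieba.cut returns in precise mode (the sample sentence and its segmentation are only illustrative):

import jieba

# cut_all=False is precise mode, the same setting used in the scripts below
seg = jieba.cut("我爱北京天安门", cut_all=False)
print(list(seg))  # typically prints something like ['我', '爱', '北京', '天安门']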
data_processing.py
import jieba

data_path = "sources/weibo_senti_100k.csv"
data_stop_path = "sources/hit_stopword"
data_list = open(data_path, encoding='utf-8').readlines()[1:]

# load the stop words
stops_word = open(data_stop_path, encoding='utf-8').readlines()
stops_word = [line.strip() for line in stops_word]  # strip trailing newlines
stops_word.append(" ")   # add back the tokens that strip() removes
stops_word.append("\n")

# build a word-frequency dictionary from the segmentation results
voc_dict = {}
min_seq = 1    # keep only words that appear more than min_seq times
top_n = 1000   # vocabulary size before adding the special tokens
UNK = "<UNK>"
PAD = "<PAD>"
for item in data_list[:]:
    label = item[0]
    content = item[2:].strip()  # drop the trailing newline
    seg_list = jieba.cut(content, cut_all=False)
    seg_res = []
    for seg_item in seg_list:
        print(seg_item)
        if seg_item in stops_word:  # skip stop words
            continue
        seg_res.append(seg_item)
        if seg_item in voc_dict.keys():  # count word frequencies with a dict
            voc_dict[seg_item] = voc_dict[seg_item] + 1
        else:
            voc_dict[seg_item] = 1
    print(content)
    print(seg_res)

# sort by frequency and keep the top-N words as the vocabulary
voc_list = sorted([_ for _ in voc_dict.items() if _[1] > min_seq],
                  key=lambda x: x[1], reverse=True)[:top_n]
voc_dict = {word_count[0]: idx for idx, word_count in enumerate(voc_list)}
# map out-of-vocabulary words to the special token UNK; PAD is used for padding
voc_dict.update({UNK: len(voc_dict), PAD: len(voc_dict) + 1})
print(voc_dict)

# save the vocabulary as "word,index" lines
ff = open("sources/dict", "w", encoding='utf-8')
for item in voc_dict.keys():
    ff.writelines("{},{}\n".format(item, voc_dict[item]))
ff.close()
At this point, if the segmentation output still contains words you do not want, copy them into the stop-word list.
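For example, one way to do that from Python instead of by hand (the two tokens below are hypothetical examples of unwanted words, not taken from the real output):

# append hypothetical unwanted tokens to the stop-word file used above
with open("sources/hit_stopword", "a", encoding="utf-8") as f:
    for w in ["回复", "转发微博"]:
        f.write(w + "\n")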
datasets.py
from torch.utils.data import Dataset, DataLoader
import jieba
import numpy as np

def read_dict(voc_dict_path):
    voc_dict = {}
    dict_list = open(voc_dict_path, encoding='utf-8').readlines()
    for item in dict_list:
        item = item.split(",")
        voc_dict[item[0]] = int(item[1].strip())
    return voc_dict

def load_data(data_path, data_stop_path):
    data_list = open(data_path, encoding='utf-8').readlines()[1:]
    stops_word = open(data_stop_path, encoding='utf-8').readlines()
    stops_word = [line.strip() for line in stops_word]
    stops_word.append(" ")
    stops_word.append("\n")
    voc_dict = {}
    data = []
    max_len_seq = 0  # track the longest sentence length
    np.random.shuffle(data_list)
    for item in data_list[:]:
        label = item[0]
        content = item[2:].strip()
        seg_list = jieba.cut(content, cut_all=False)
        seg_res = []
        for seg_item in seg_list:
            if seg_item in stops_word:
                continue
            seg_res.append(seg_item)
            if seg_item in voc_dict.keys():
                voc_dict[seg_item] = voc_dict[seg_item] + 1
            else:
                voc_dict[seg_item] = 1
        if len(seg_res) > max_len_seq:
            max_len_seq = len(seg_res)
        data.append([label, seg_res])
    return data, max_len_seq

class text_ClS(Dataset):
    def __init__(self, voc_dict_path, data_path, data_stop_path):
        self.data_path = data_path
        self.data_stop_path = data_stop_path
        self.voc_dict = read_dict(voc_dict_path)
        self.data, self.max_len_seq = \
            load_data(self.data_path, self.data_stop_path)
        np.random.shuffle(self.data)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        data = self.data[item]
        label = int(data[0])
        word_list = data[1]
        input_idx = []
        for word in word_list:
            if word in self.voc_dict.keys():
                input_idx.append(self.voc_dict[word])
            else:
                input_idx.append(self.voc_dict["<UNK>"])
        if len(input_idx) < self.max_len_seq:  # pad up to the longest sentence length
            input_idx += [self.voc_dict["<PAD>"]
                          for _ in range(self.max_len_seq - len(input_idx))]
        data = np.array(input_idx)
        return label, data

def data_loader(dataset, config):
    return DataLoader(dataset, batch_size=config.batch_size, shuffle=config.is_shuffle)
if __name__ == '__main__':
    from configs import Config
    cfg = Config()
    data_path = "sources/weibo_senti_100k.csv"
    data_stop_path = "sources/hit_stopword"
    dict_path = "sources/dict"
    dataset = text_ClS(dict_path, data_path, data_stop_path)
    train_dataloader = data_loader(dataset, cfg)
    for i, batch in enumerate(train_dataloader):
        print(batch[1].size())
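With the default Config (batch_size = 128), each batch unpacks into a label tensor of shape [128] and a padded index tensor of shape [128, max_len_seq], so the loop above prints torch.Size([128, max_len_seq]) (the last batch may be smaller).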
models.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class Model(nn.Module):
    def __init__(self, config):
        super(Model, self).__init__()
        self.embeding = nn.Embedding(config.n_vocab, config.embed_size,
                                     padding_idx=config.n_vocab - 1)
        self.lstm = nn.LSTM(config.embed_size, config.hidden_size,
                            config.num_layers, bidirectional=True,
                            batch_first=True, dropout=config.dropout)
        self.maxpooling = nn.MaxPool1d(config.pad_size)
        self.fc = nn.Linear(config.hidden_size * 2 + config.embed_size,
                            config.num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        embed = self.embeding(x)  # [batch_size, seq_len, embed_size], the standard RNN input
        out, _ = self.lstm(embed)
        out = torch.cat((embed, out), 2)
        out = F.relu(out)
        out = out.permute(0, 2, 1)  # swap dimensions for MaxPool1d
        out = self.maxpooling(out).reshape(out.size()[0], -1)  # flatten to a 2-D tensor
        print(out.size())
        out = self.fc(out)
        out = self.softmax(out)
        return out

if __name__ == '__main__':
    from configs import Config
    cfg = Config()
    cfg.pad_size = 640
    model_textcls = Model(config=cfg)
    input_tensor = torch.tensor([i for i in range(640)]).reshape([1, 640])
    out_tensor = model_textcls.forward(input_tensor)
    print(out_tensor.size())
    print(out_tensor)
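As a reading aid for forward, here is how the shape of out evolves, assuming the default Config values (embed_size = 128, hidden_size = 128, num_classes = 2) and an input of shape [batch, pad_size]:

# x:                [batch, pad_size]               token indices
# embeding(x):      [batch, pad_size, 128]          embed_size
# lstm(embed):      [batch, pad_size, 256]          2 * hidden_size (bidirectional)
# cat((embed, out)):[batch, pad_size, 384]          embed_size + 2 * hidden_size
# permute(0, 2, 1): [batch, 384, pad_size]
# maxpooling:       [batch, 384, 1]  -> reshape ->  [batch, 384]
# fc + softmax:     [batch, 2]                      num_classes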
configs.py
import torch

class Config():
    def __init__(self):
        '''
        self.embeding = nn.Embedding(config.n_vocab,
                                     config.embed_size,
                                     padding_idx=config.n_vocab - 1)
        self.lstm = nn.LSTM(config.embed_size,
                            config.hidden_size,
                            config.num_layers,
                            bidirectional=True, batch_first=True,
                            dropout=config.dropout)
        self.maxpool = nn.MaxPool1d(config.pad_size)
        self.fc = nn.Linear(config.hidden_size * 2 + config.embed_size,
                            config.num_classes)
        self.softmax = nn.Softmax(dim=1)
        '''
        self.n_vocab = 1002  # vocabulary size: top_n (1000) words plus <UNK> and <PAD>
        self.embed_size = 128
        self.hidden_size = 128
        self.num_layers = 3
        self.dropout = 0.8
        self.num_classes = 2  # binary classification
        self.pad_size = 32
        self.batch_size = 128
        self.is_shuffle = True
        self.learn_rate = 0.001
        self.num_epochs = 100
        self.devices = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
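An optional sanity check (my addition, not part of the original scripts): n_vocab should equal the number of entries written to sources/dict, i.e. the top_n = 1000 words plus <UNK> and <PAD>:

from configs import Config

with open("sources/dict", encoding="utf-8") as f:
    assert len(f.readlines()) == Config().n_vocab  # 1000 top words + <UNK> + <PAD> = 1002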
run_train.py
import os
import torch
import torch.nn as nn
from torch import optim
from models import Model
from datasets import data_loader, text_ClS
from configs import Config

cfg = Config()

# load the data
data_path = "sources/weibo_senti_100k.csv"
data_stop_path = "sources/hit_stopword"
dict_path = "sources/dict"
dataset = text_ClS(dict_path, data_path, data_stop_path)
train_dataloader = data_loader(dataset, cfg)

cfg.pad_size = dataset.max_len_seq
model_text_cls = Model(cfg)
model_text_cls.to(cfg.devices)
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_text_cls.parameters(), lr=cfg.learn_rate)

os.makedirs("models", exist_ok=True)  # make sure the checkpoint directory exists
for epoch in range(cfg.num_epochs):
    for i, batch in enumerate(train_dataloader):
        label, data = batch
        data = data.to(cfg.devices)
        label = label.to(cfg.devices, dtype=torch.int64)
        optimizer.zero_grad()
        pred = model_text_cls.forward(data)
        loss_val = loss_func(pred, label)
        # print(pred)
        # print(label)
        print("epoch is {}, iter is {}, loss is {}".format(epoch, i, loss_val))
        loss_val.backward()
        optimizer.step()
    if epoch % 10 == 0:  # save the model every 10 epochs
        torch.save(model_text_cls.state_dict(), "models/{}.pth".format(epoch))
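One detail worth noting when reading this script: nn.CrossEntropyLoss already applies log-softmax internally, while the model's forward also ends in nn.Softmax. The combination still trains, but a common variant drops the softmax layer from the model and passes the raw fc output to the loss.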
test.py
import torch
from models import Model
from datasets import data_loader, text_ClS
from configs import Config

cfg = Config()

# load the data
data_path = "sources/weibo_senti_100k.csv"
data_stop_path = "sources/hit_stopword"
dict_path = "sources/dict"
dataset = text_ClS(dict_path, data_path, data_stop_path)
train_dataloader = data_loader(dataset, cfg)

cfg.pad_size = dataset.max_len_seq
model_text_cls = Model(cfg)
model_text_cls.to(cfg.devices)

# load the trained weights
model_text_cls.load_state_dict(torch.load("models/10.pth", map_location=cfg.devices))
model_text_cls.eval()  # switch off dropout for evaluation

for i, batch in enumerate(train_dataloader):
    label, data = batch
    data = data.to(cfg.devices)
    label = label.to(cfg.devices, dtype=torch.int64)
    pred_softmax = model_text_cls.forward(data)
    # print(pred_softmax)
    print(label)
    pred = torch.argmax(pred_softmax, dim=1)
    print(pred)
    # accuracy for the current batch
    out = torch.eq(pred, label)
    print(out.sum() * 1.0 / pred.size()[0])
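The loop above prints the accuracy of each batch separately. A minimal sketch (my addition, reusing the objects defined above) that accumulates accuracy over the whole dataset instead:

correct, total = 0, 0
with torch.no_grad():  # no gradients are needed at test time
    for label, data in train_dataloader:
        data = data.to(cfg.devices)
        label = label.to(cfg.devices)
        pred = torch.argmax(model_text_cls(data), dim=1)
        correct += torch.eq(pred, label).sum().item()
        total += label.size(0)
print("overall accuracy: {:.4f}".format(correct / total))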