Chinese Text Sentiment Classification in Practice (with the weibo_senti_100k dataset)

Chinese Text Sentiment Classification

  • Data Preparation
  • Loading the Dataset
  • Building the Model
  • Building the Training Script
  • Writing the Test Script

Data Preparation

Use jieba for word segmentation.
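
As a quick standalone check (a minimal sketch, not part of the scripts below), jieba.cut in precise mode returns a generator of tokens; the exact segmentation can vary with the jieba version and dictionary:

import jieba

seg = jieba.cut("今天天气真好", cut_all=False)  # precise mode, the same setting used in data_processing.py
print(list(seg))
# possible output (tokens may differ by jieba version): ['今天天气', '真好']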

data_processing.py

import jieba

data_path = "sources/weibo_senti_100k.csv"
data_stop_path = "sources/hit_stopword"
data_list = open(data_path,encoding='utf-8').readlines()[1:]
# load the stopword list
stops_word = open(data_stop_path,encoding='utf-8').readlines()
stops_word = [line.strip() for line in stops_word]  # strip newline characters
stops_word.append(" ")   # add the space and newline back as stopwords, since strip() removed them
stops_word.append("\n")
# build a word-frequency dictionary from the segmentation results
voc_dict = {}
min_seq = 1
top_n = 1000
UNK = "<UNK>"
PAD = "<PAD>"
for item in data_list[:]:
    label = item[0]
    content = item[2:].strip()  # text after the "label," prefix, with the trailing newline stripped
    seg_list = jieba.cut(content, cut_all=False)
    seg_res = []

    for seg_item in seg_list:
        print(seg_item)
        if seg_item in stops_word:  # skip stopwords
            continue
        seg_res.append(seg_item)
        if seg_item in voc_dict.keys():  # count word frequencies in a dict
            voc_dict[seg_item] = voc_dict[seg_item] + 1
        else:
            voc_dict[seg_item] = 1

    print(content)
    print(seg_res)

# sort by frequency, drop words at or below min_seq, and keep the top_n most frequent words as the vocabulary
voc_list = sorted([_ for _ in voc_dict.items() if _[1] > min_seq],
                  key=lambda x:x[1], reverse=True)[:top_n]
voc_dict = {word_count[0]: idx for idx,word_count in enumerate(voc_list)}
# map out-of-vocabulary words to the special token <UNK>, and add <PAD> for padding
voc_dict.update({UNK:len(voc_dict),PAD:len(voc_dict) + 1})

print(voc_dict)

# save the vocabulary
ff = open("sources/dict", "w", encoding='utf-8')
for item in voc_dict.keys():
    ff.writelines("{},{}\n".format(item, voc_dict[item]))
ff.close()

At this point, if the segmentation results still contain words you do not want, copy them into the stopword file and re-run the preprocessing.
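
For reference, the saved sources/dict file is plain text with one word,index pair per line; with top_n = 1000, the <UNK> and <PAD> tokens end up at indices 1000 and 1001. The concrete words below are only illustrative:

哈哈,0
转发,1
微博,2
...
<UNK>,1000
<PAD>,1001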

Loading the Dataset

datasets.py

from torch.utils.data import Dataset,DataLoader
import jieba
import numpy as np

def read_dict(voc_dict_path):
    voc_dict = {}
    dict_list = open(voc_dict_path,encoding='utf-8').readlines()
    for item in dict_list:
        item = item.split(",")
        voc_dict[item[0]] = int(item[1].strip())
    return voc_dict

def load_data(data_path,data_stop_path):
    data_list = open(data_path,encoding='utf-8').readlines()[1:]
    stops_word = open(data_stop_path,encoding='utf-8').readlines()
    stops_word = [line.strip() for line in stops_word]
    stops_word.append(" ")
    stops_word.append("\n")
    voc_dict = {}
    data = []
    max_len_seq = 0  # track the length of the longest sentence
    np.random.shuffle(data_list)
    for item in data_list[:]:
        label = item[0]
        content = item[2:].strip()
        seg_list = jieba.cut(content, cut_all=False)
        seg_res = []
        for seg_item in seg_list:
            if seg_item in stops_word:
                continue
            seg_res.append(seg_item)
            if seg_item in voc_dict.keys():
                voc_dict[seg_item] = voc_dict[seg_item] + 1
            else:
                voc_dict[seg_item] = 1
        if len(seg_res) > max_len_seq:
            max_len_seq = len(seg_res)
        data.append([label, seg_res])
    return data, max_len_seq

class text_ClS(Dataset):
    def __init__(self, voc_dict_path,data_path,data_stop_path):
        self.data_path = data_path
        self.data_stop_path = data_stop_path
        self.voc_dict = read_dict(voc_dict_path)
        self.data, self.max_len_seq = \
            load_data(self.data_path,self.data_stop_path)

        np.random.shuffle(self.data)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, item):
        data = self.data[item]
        label = int(data[0])
        word_list = data[1]
        input_idx = []
        for word in word_list:
            if word in self.voc_dict.keys():
                input_idx.append(self.voc_dict[word])
            else:
                input_idx.append(self.voc_dict["<UNK>"])
        if len(input_idx) < self.max_len_seq:  # pad every sample to the length of the longest sentence
            input_idx += [self.voc_dict["<PAD>"]
                          for _ in range(self.max_len_seq - len(input_idx))]
        data = np.array(input_idx)
        return label, data

def data_loader(dataset, config):
    return DataLoader(dataset, batch_size=config.batch_size, shuffle=config.is_shuffle)

if __name__ == '__main__':
    from configs import Config
    cfg = Config()
    data_path = "sources/weibo_senti_100k.csv"
    data_stop_path = "sources/hit_stopword"
    dict_path = "sources/dict"
    dataset = text_ClS(dict_path, data_path, data_stop_path)
    train_dataloader = data_loader(dataset, cfg)
    for i, batch in enumerate(train_dataloader):
        print(batch[1].size())
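
With the default batch_size of 128, every full batch should print torch.Size([128, max_len_seq]), where max_len_seq is the length of the longest segmented sentence found while loading the data (the final batch may be smaller).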

Building the Model

models.py

import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class Model(nn.Module):
    def __init__(self,config):
        super(Model,self).__init__()
        self.embeding = nn.Embedding(config.n_vocab,config.embed_size,
                                     padding_idx=config.n_vocab -1)
        self.lstm = nn.LSTM(config.embed_size, config.hidden_size,
                            config.num_layers, bidirectional=True,
                            batch_first=True, dropout=config.dropout)
        self.maxpooling = nn.MaxPool1d(config.pad_size)
        self.fc = nn.Linear(config.hidden_size * 2 + config.embed_size
                            , config.num_classes)
        self.softmax = nn.Softmax(dim=1)

    def forward(self,x):
        embed = self.embeding(x)  # [batch_size, seq_len, embed_size], the standard RNN input
        out, _ = self.lstm(embed)
        out = torch.cat((embed,out),2)
        out = F.relu(out)
        out = out.permute(0,2,1)  # swap dims to [batch, channels, seq_len] for MaxPool1d
        out = self.maxpooling(out).reshape(out.size()[0],-1)  # pool over seq_len and flatten to a 2-D tensor
        print(out.size())
        out = self.fc(out)
        out =  self.softmax(out)
        return out

if __name__ == '__main__':
    from configs import Config
    cfg = Config()
    cfg.pad_size = 640
    model_textcls = Model(config=cfg)
    input_tensor = torch.tensor([i for i in range(640)]).reshape([1, 640])
    out_tensor = model_textcls.forward(input_tensor)
    print(out_tensor.size())
    print(out_tensor)
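
For reference, with the Config defaults (embed_size=128, hidden_size=128, num_classes=2) and pad_size set to 640 as in this test, the tensor shapes through forward work out as follows (derived from the code above):

# x                    : [1, 640]
# embeding(x)          : [1, 640, 128]
# lstm output          : [1, 640, 256]   # bidirectional: 2 * hidden_size
# cat(embed, out)      : [1, 640, 384]
# permute(0, 2, 1)     : [1, 384, 640]
# maxpooling + reshape : [1, 384]        # hidden_size * 2 + embed_size
# fc, softmax          : [1, 2]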

configs.py

import torch
class Config():
    def __init__(self):
        '''
        self.embeding = nn.Embedding(config.n_vocab,
                                config.embed_size,
                                padding_idx=config.n_vocab - 1)
        self.lstm = nn.LSTM(config.embed_size,
                            config.hidden_size,
                              config.num_layers,
                               bidirectional=True, batch_first=True,
                               dropout=config.dropout)
        self.maxpool = nn.MaxPool1d(config.pad_size)
        self.fc = nn.Linear(config.hidden_size * 2 + config.embed_size,
                                  config.num_classes)
        self.softmax = nn.Softmax(dim=1)
        '''
        self.n_vocab = 1002  # vocabulary size: top_n (1000) words plus <UNK> and <PAD>
        self.embed_size = 128
        self.hidden_size = 128
        self.num_layers = 3
        self.dropout = 0.8
        self.num_classes = 2  # binary classification
        self.pad_size = 32
        self.batch_size = 128
        self.is_shuffle = True
        self.learn_rate = 0.001
        self.num_epochs = 100
        self.devices = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


Building the Training Script

run_train.py

import torch
import torch.nn as nn
from torch import optim
from models import Model
from datasets import data_loader, text_ClS
from configs import Config

cfg = Config()

# load the data
data_path = "sources/weibo_senti_100k.csv"
data_stop_path = "sources/hit_stopword"
dict_path = "sources/dict"

dataset = text_ClS(dict_path, data_path, data_stop_path)
train_dataloader = data_loader(dataset, cfg)

cfg.pad_size = dataset.max_len_seq

model_text_cls = Model(cfg)
model_text_cls.to(cfg.devices)

loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(model_text_cls.parameters(),lr=cfg.learn_rate)

for epoch in range(cfg.num_epochs):
    for i, batch in enumerate(train_dataloader):
        label, data = batch
        data = data.to(cfg.devices)
        label = label.to(cfg.devices)

        optimizer.zero_grad()
        pred = model_text_cls.forward(data)
        loss_val = loss_func(pred, label)

        # print(pred)
        # print(label)

        print("epoch is {}, ite is {}, val is {}".format(epoch,i,loss_val))
        loss_val.backward()
        optimizer.step()

    if epoch % 10 == 0:  # save a checkpoint every 10 epochs
        torch.save(model_text_cls.state_dict(),"models/{}.pth".format(epoch))
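
One practical note: torch.save does not create missing directories, so the models/ folder has to exist before the first checkpoint is written. A small addition (not in the original script) at the top of run_train.py takes care of it:

import os

os.makedirs("models", exist_ok=True)  # create the checkpoint directory if it does not exist yet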


Writing the Test Script

test.py

import torch
import torch.nn as nn
from torch import optim
from models import Model
from datasets import data_loader, text_ClS
from configs import Config

cfg = Config()

# load the data
data_path = "sources/weibo_senti_100k.csv"
data_stop_path = "sources/hit_stopword"
dict_path = "sources/dict"

dataset = text_ClS(dict_path, data_path, data_stop_path)
train_dataloader = data_loader(dataset, cfg)

cfg.pad_size = dataset.max_len_seq

model_text_cls = Model(cfg)
model_text_cls.to(cfg.devices)
# load the trained weights (a saved checkpoint, here the one from epoch 10)
model_text_cls.load_state_dict(torch.load("models/10.pth", map_location=cfg.devices))
model_text_cls.eval()  # switch off dropout for evaluation


for i, batch in enumerate(train_dataloader):
    label, data = batch
    data = data.to(cfg.devices)
    label = label.to(cfg.devices)
    pred_softmax = model_text_cls.forward(data)

    #print(pred_softmax)
    print(label)
    pred = torch.argmax(pred_softmax, dim=1)
    print(pred)

    # compute batch accuracy
    out = torch.eq(pred,label)
    print(out.sum() * 1.0 / pred.size()[0])
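
Beyond batch accuracy, the trained model can also be tried on a single raw sentence. The sketch below is not part of the original scripts: the checkpoint path, the fixed pad length, and the assumption that label 1 means positive are inferred from the code above, and stopword filtering is omitted for brevity.

import jieba
import numpy as np
import torch
from models import Model
from datasets import read_dict
from configs import Config

cfg = Config()
voc_dict = read_dict("sources/dict")
cfg.pad_size = 128  # assumption: any fixed length works as long as the input is padded/truncated to it

model = Model(cfg)
model.load_state_dict(torch.load("models/10.pth", map_location=cfg.devices))
model.to(cfg.devices)
model.eval()  # disable dropout at inference time

text = "今天心情特别好"
idx = [voc_dict.get(w, voc_dict["<UNK>"]) for w in jieba.cut(text, cut_all=False)]
idx = idx[:cfg.pad_size]
idx += [voc_dict["<PAD>"]] * (cfg.pad_size - len(idx))
data = torch.tensor(np.array([idx]), dtype=torch.int64).to(cfg.devices)

with torch.no_grad():
    pred = torch.argmax(model(data), dim=1).item()
print("positive" if pred == 1 else "negative")  # assumption: label 1 = positive in weibo_senti_100k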

