NLP - Text Classification - Binary Classification

Obtaining the Corpus

The corpus used in this example consists of user reviews from a food-delivery platform, 11,987 samples in total: 4,000 positive reviews and 7,987 negative reviews (download link).

Text Preprocessing: Tokenization and Stop-Word Removal

Tokenization uses jieba: pip install jieba
The stop-word list is a merged and deduplicated combination of four lists: the Chinese stop-word list, the Harbin Institute of Technology (HIT) stop-word list, the Baidu stop-word list, and the Sichuan University Machine Intelligence Laboratory stop-word list (download link).
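If you want to reproduce the merged list yourself, here is a minimal sketch; the four filenames are placeholders for wherever you saved the downloaded lists:

stop_words = set()
# Hypothetical filenames; adjust to your download locations
for path in ["cn_stopwords.txt", "hit_stopwords.txt", "baidu_stopwords.txt", "scu_stopwords.txt"]:
    with open(path, encoding="utf-8") as fr:
        stop_words.update(word.strip() for word in fr if word.strip())

with open("../data_set/stop_words", "w", encoding="utf-8") as fw:
    fw.write("\n".join(sorted(stop_words)))

With the merged list in place, preprocessing goes as follows: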

import pandas as pd
import jieba


def load_stop_words():
    """Load the stop-word list."""
    with open("../data_set/stop_words", encoding="utf-8") as fr:
        stop_words = set(word.strip() for word in fr)
    return stop_words


if __name__ == '__main__':
    # Load stop words
    stop_words = load_stop_words()
    # Read the raw corpus
    df = pd.read_csv("../data_set/waimai_10k.csv")
    # Tokenize and filter out stop words
    df["review"] = df["review"].map(lambda x: " ".join([i for i in jieba.cut(x) if i not in stop_words]))
    # Save the processed text; keep the label column, since the later steps read both columns
    df.to_csv("./waimai.csv", index=False, header=False, columns=["label", "review"])

The processed reviews look like this:

很快 好吃 味道 足 量
送水 送水 送水
快 态度
快捷 味道 可口 快 递给 力
菜 味道 很棒 送餐
师傅 手抖 微辣 辣
送餐 快 态度 特别 辛苦 谢谢
超级 快 送到 冷 天气 骑士 辛苦 谢谢你们 麻辣 香锅 依然 好吃
上次 晚 小时 超级 快 20 分钟 送到 … …
五分钟 订 卖家 特别 接单 谢谢

Training Word Vectors

Word-vector training uses the gensim module:
pip install gensim

import pandas as pd
import gensim


if __name__ == '__main__':
    df = pd.read_csv("./waimai.csv", header=None)
    # Column 0 is the label; column 1 holds the space-separated tokens
    sentences = df.iloc[:, 1].astype("str").map(lambda x: x.split(" "))
    # Note: in gensim >= 4.0 the `size` argument is named `vector_size`
    model = gensim.models.Word2Vec(sentences, size=128, workers=4, min_count=0)
    model.wv.save_word2vec_format('./word_vec.txt', binary=False)

Training finishes in about a second…

10246 128
味道 -0.34176767 0.1422865 0.5661937 0.6358043 0.99514866 0.4597006 0.83270687 0.67648256 -0.16697478 -0.39428547 1.0036861 -0.11081296 0.46511438 -0.13658749 0.1482188 -0.73044026 -0.436237 0.49267843 -0.4390354 -0.18539704 0.65958226 -0.16079785 -0.274373 0.5363726 0.30086786 0.8065682 -0.1525818 -0.22661807 -0.22201005 0.4886867 0.17151305 0.6941031 -0.108169384 0.5792473 -0.32802448 -0.55200815 0.25212976 -0.48502052 -0.54447377 -0.6231095 0.074580014 -0.39396325 -0.3055023 0.26453292 -0.22691658 -0.0020165953 -0.61628485 -0.6484135 0.09854976 1.1460947 0.5731212 0.34008676 0.22624384 0.25767615 0.018547993 -0.98884386 0.3754116 0.6716524 0.6616562 0.04034343 -0.76667696 0.4891913 0.7732257 -0.35860583 0.3487415 -0.35146204 -0.22601724 0.58054966 0.4355862 0.76702976 0.301465 -1.2691803 0.5356394 0.371402 0.54972774 0.6574773 0.9245965 0.115193315 -0.4758462 0.15970539 0.0103178015 -0.116095796 0.7121025 0.21703413 -0.38246158 0.20462309 -0.4325303 -0.106235646 -0.4519505 -0.09121104 -0.59677076 -0.07223956 0.5214413 0.02620219 0.27124628 0.4202872 -1.366464 0.12932953 0.24490091 -0.6196757 0.5437061 0.61052674 1.3073382 -0.7672843 -0.72003216 0.06251624 0.26925302 0.58113956 -0.45114592 -0.8383538 0.02999392 -0.1300691 -0.9095514 -0.79298264 0.70403606 0.32387486 0.35832337 0.9410294 0.19609398 -0.21290061 -0.22644106 0.92168874 0.22732252 0.19034281 0.75808144 0.9700917 -0.15570371 0.28114718
送餐 -0.34831476 0.16106193 0.5994892 0.64805424 1.0271257 0.46728194 0.8493655 0.6830762 -0.18260212 -0.3587068 1.01343 -0.11325885 0.47106928 -0.1680743 0.1278685 -0.7440699 -0.4252064 0.47813246 -0.41524065 -0.17440696 0.6049742 -0.15751494 -0.2982898 0.52218205 0.28473657 0.8560871 -0.14586104 -0.16491853 -0.15567955 0.4590574 0.17111456 0.6245679 -0.049771745 0.63952273 -0.36627942 -0.504943 0.24357647 -0.4964198 -0.5470333 -0.63577425 0.097736515 -0.35689566 -0.28207627 0.2810563 -0.2688453 -0.06852475 -0.5780539 -0.70400196 0.12267686 1.0992405 0.62311095 0.34455365 0.2872575 0.20026866 0.032420043 -1.0238267 0.41567177 0.57241106 0.7557455 0.06543916 -0.79624844 0.42394665 0.80884355 -0.2969704 0.3194532 -0.2706822 -0.19089109 0.56951344 0.38337457 0.79456556 0.27990746 -1.2773975 0.48038802 0.307269 0.46341935 0.5919445 0.89985734 0.07449208 -0.5155747 0.099851035 0.015074631 -0.11942169 0.6676198 0.21894054 -0.32572982 0.1891366 -0.5257631 -0.12457273 -0.37080353 -0.12841094 -0.6200164 -0.042225726 0.5027024 0.17725371 0.23022962 0.29081154 -1.42769 0.13530985 0.2771318 -0.5822854 0.6126383 0.62363005 1.2924049 -0.662307 -0.7142578 0.10473512 0.18142381 0.59187007 -0.4600846 -0.85942894 0.005593317 -0.147893 -0.9587728 -0.75739866 0.65089595 0.31477287 0.31118864 0.89044017 0.22873242 -0.2516947 -0.23799604 0.95505095 0.18526594 0.22436847 0.83456284 0.9753054 -0.11093519 0.27834952
好吃 -0.30725345 0.1283905 0.52813786 0.5933264 0.9303256 0.4126685 0.7771101 0.6270748 -0.15313117 -0.37562656 0.92349505 -0.11283011 0.41913372 -0.12276961 0.14269884 -0.6803193 -0.41086897 0.4431171 -0.40913734 -0.17537531 0.6322392 -0.14475387 -0.2530998 0.4914092 0.28248012 0.74353975 -0.13958585 -0.20634086 -0.1969838 0.44981313 0.17648831 0.6567505 -0.09571447 0.52662045 -0.31215164 -0.50666744 0.22952724 -0.4708712 -0.5190871 -0.57296234 0.06719357 -0.37752706 -0.26550692 0.251655 -0.21778527 0.0020325563 -0.5790934 -0.6029802 0.086034015 1.0809773 0.52037036 0.31693274 0.21899918 0.25061807 0.011746211 -0.93361104 0.36211205 0.6272974 0.62583774 0.03684908 -0.6984929 0.45416585 0.7099049 -0.33532396 0.30931497 -0.33459026 -0.20909707 0.5540794 0.41215074 0.721903 0.2852583 -1.1962199 0.50577945 0.3373197 0.50444156 0.6198687 0.86888 0.09832464 -0.43946823 0.15845151 0.004639616 -0.10199037 0.6669751 0.20029305 -0.34718072 0.18566431 -0.4217568 -0.09939404 -0.42297262 -0.07127523 -0.5585624 -0.07119543 0.48151365 0.012957705 0.24921264 0.40019986 -1.2703121 0.12616552 0.22909844 -0.58044565 0.5054929 0.5549034 1.2014142 -0.7182815 -0.68673414 0.06984787 0.26249847 0.5499739 -0.4183708 -0.79104775 0.018213468 -0.11408542 -0.8455202 -0.74249464 0.6450488 0.299841 0.3214686 0.87345284 0.19684848 -0.19231175 -0.21840167 0.8621186 0.2152602 0.1791741 0.7084485 0.9104159 -0.142168 0.25246274
吃 -0.317348 0.1756655 0.6776852 0.7761266 1.1800449 0.48290405 1.0057433 0.7316651 -0.15752426 -0.5104708 1.1479326 -0.13435195 0.5668409 -0.13615549 0.1672698 -0.8952324 -0.5223199 0.5390238 -0.5268473 -0.21923916 0.81747895 -0.18072836 -0.3433976 0.5930387 0.36616674 0.9447755 -0.19561037 -0.23575857 -0.22093017 0.55783033 0.2159247 0.8529153 -0.14994794 0.66850096 -0.40219787 -0.6253349 0.32207093 -0.55757445 -0.6221306 -0.68742085 0.06836425 -0.47764063 -0.34166995 0.27392557 -0.21961759 0.037943263 -0.75723386 -0.79410183 0.11590063 1.3745393 0.6318284 0.3727167 0.23632705 0.32733127 -0.016460735 -1.1486912 0.46428093 0.82637024 0.76288664 0.022743462 -0.8505736 0.57333946 0.8542382 -0.41087016 0.36125845 -0.4183659 -0.2612495 0.6776614 0.5016058 0.87514096 0.42352787 -1.5055473 0.63744575 0.45063123 0.64591473 0.7956222 1.0891579 0.12250588 -0.51378715 0.20177063 0.029593572 -0.1845762 0.8167282 0.25598404 -0.4376466 0.27361465 -0.48486224 -0.094121136 -0.5315548 -0.06680334 -0.66599524 -0.1096651 0.5843629 -0.011161809 0.32369217 0.5596714 -1.6154377 0.136895 0.30053017 -0.7115952 0.66305244 0.6674846 1.5191295 -0.9946287 -0.7906497 0.04872249 0.33399847 0.68079853 -0.5693448 -0.9646293 0.018165639 -0.14496508 -1.0399712 -0.9866209 0.7748378 0.39542305 0.45934528 1.1698129 0.24988888 -0.21097943 -0.31229407 1.0970247 0.29161188 0.24489474 0.87869185 1.1473607 -0.2317162 0.28112486
送 -0.3122476 0.1951807 0.63570726 0.7103084 1.1110979 0.46854934 0.9254026 0.6948979 -0.17032832 -0.4031734 1.0291259 -0.12722725 0.5409529 -0.18306918 0.11178081 -0.80404085 -0.4670057 0.4683021 -0.431196 -0.19963683 0.6596009 -0.18137656 -0.33063754 0.535615 0.2974652 0.921764 -0.1692971 -0.14714755 -0.114276186 0.50176394 0.18717885 0.68588567 -0.06989612 0.67240196 -0.3996118 -0.5202358 0.2958069 -0.53781116 -0.5413914 -0.6655814 0.1056774 -0.40086055 -0.30898282 0.28031486 -0.25067914 -0.026333455 -0.6328047 -0.77418697 0.12626144 1.175131 0.6298331 0.3529492 0.299159 0.19088757 0.03356706 -1.0693496 0.42886 0.6319119 0.78095907 0.050119836 -0.7968591 0.417419 0.80905426 -0.2909621 0.31810242 -0.2809425 -0.19401962 0.6181103 0.38274473 0.81851476 0.3080845 -1.3709657 0.50125974 0.32389018 0.47050905 0.64839256 0.9596635 0.066670604 -0.53640294 0.14316899 0.02482079 -0.16103546 0.712954 0.22898223 -0.34023425 0.22759342 -0.5249994 -0.1033441 -0.39438552 -0.13098606 -0.6477487 -0.039757647 0.5320035 0.17372783 0.24086252 0.36096773 -1.5161527 0.12145833 0.31156737 -0.5838143 0.6931728 0.62331647 1.3688667 -0.7866663 -0.739208 0.10475284 0.1719332 0.62416357 -0.5066214 -0.8924192 0.0045560496 -0.15892388 -1.0038497 -0.8304806 0.65664387 0.34376568 0.3370172 0.96435684 0.26185748 -0.2452078 -0.26580992 1.0299401 0.20957707 0.2636418 0.8989269 1.0241199 -0.14050174 0.23097765
...

The first line gives the vocabulary size and the vector dimensionality.
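Before wiring these vectors into a model, it's worth a quick sanity check that they capture something useful; for example (nearest neighbors will vary from run to run):

vectors = gensim.models.KeyedVectors.load_word2vec_format("./word_vec.txt", binary=False)
print(vectors.most_similar("好吃", topn=5))  # words closest to "好吃" in vector space
print(vectors.similarity("好吃", "味道"))     # cosine similarity between two words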

Model Construction

Building the Word Index

First, import all the libraries we'll need later:

import jieba
import gensim
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split

Before building the index, load the word vectors we just trained:

word_vec_model = gensim.models.KeyedVectors.load_word2vec_format("word_vec.txt", binary=False)

Then build a dictionary mapping each word to an index, and a matrix mapping each index to its word vector:

def build_embeddings_matrix(word_vec_model):
    # Initialize the embedding matrix; row 0 keeps its random values
    embeddings_matrix = np.random.random((len(word_vec_model.vocab) + 1, 128))
    # Word-to-index dictionary
    word_index = {}

    # gensim 3.x API; in gensim >= 4.0 use index_to_key / key_to_index instead
    for index, word in enumerate(word_vec_model.index2word):
        word_index[word] = index + 1
        # Index 0 is reserved for words that can't be found
        embeddings_matrix[index + 1] = word_vec_model.get_vector(word)
    return word_index, embeddings_matrix

word_index looks like this:

{
  "味道": 1,
  "送餐": 2,
  "好吃": 3,
  "吃": 4,
  "送": 5,
  "不错": 6,
  "小时": 7,
  "没有": 8,
  "没": 9,
  "点": 10,
  "送到": 11,
  "说": 12,
  ...
  "满头大汗": 10234,
  "心疼": 10235,
  "单说": 10236,
  "螺号": 10237,
  "多装": 10238,
  "忍心": 10239,
  "羊血": 10240,
  "大不一样": 10241,
  "连双": 10242,
  "羊肉汤": 10243,
  "几元": 10244,
  "羊血要": 10245,
  "羊汤": 10246
}

That is, every word gets a numeric ID.

embeddings_matrix looks like this:

[[ 5.69546860e-01  7.94050459e-01  9.21687010e-01 ...  5.39988930e-01  8.41095603e-01  9.84990521e-01]
 [-3.41767669e-01  1.42286494e-01  5.66193700e-01 ...  9.70091701e-01  -1.55703709e-01  2.81147182e-01]
 [-3.48314762e-01  1.61061928e-01  5.99489212e-01 ...  9.75305378e-01  -1.10935189e-01  2.78349519e-01]
 ...
 [-5.99376392e-03  8.19191220e-04  9.59137175e-03 ...  1.21581573e-02  1.40984030e-03 -9.48444358e-04]
 [-1.57012604e-04 -2.43240129e-03  8.13777931e-03 ...  1.11063607e-02  -4.23950190e-03  7.17222691e-04]
 [-2.05256371e-03 -1.63255120e-03  4.44210222e-04 ...  6.44489145e-03  -1.32397411e-03 -2.05684011e-03]]

Each row holds one word's vector. For example, 送餐 has index 2, so its vector is row 2 of the matrix (the third row printed above, since row 0 comes first):

 [-3.48314762e-01  1.61061928e-01  5.99489212e-01 ...  9.75305378e-01  -1.10935189e-01  2.78349519e-01]
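A quick check that the index and the matrix stay aligned (row word_index[w] should equal the vector gensim learned for w):

assert np.allclose(embeddings_matrix[word_index["送餐"]], word_vec_model.get_vector("送餐"))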

This bookkeeping can also be done with the keras.preprocessing.text module, as sketched below.
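A rough equivalent using keras.preprocessing.text (this only builds the word index; you would still fill the embedding matrix from the gensim vectors as above):

from tensorflow import keras

tokenizer = keras.preprocessing.text.Tokenizer()
tokenizer.fit_on_texts(df["review"].astype("str"))   # reviews are already space-separated tokens
word_index = tokenizer.word_index                    # word -> index, starting from 1 (0 is reserved)
sequences = tokenizer.texts_to_sequences(df["review"].astype("str"))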

Generating the Three Datasets (Train, Validation, Test)

def train_data(word_index):
    df = pd.read_csv("./waimai.csv", names=["label", "review"])
    df["word_index"] = df["review"].astype("str").map(lambda x: np.array([word_index.get(i, 0) for i in x.split(" ")]))
    # Pad short sentences and truncate long ones to a fixed length of 20
    train = keras.preprocessing.sequence.pad_sequences(df["word_index"].values, maxlen=20, padding='post', truncating='post', dtype="float32")
    x_train, x_test, y_train, y_test = train_test_split(train, df["label"].values, test_size=0.2, random_state=1)
    # Split a validation set off the training set
    x_train, x_val, y_train, y_val = train_test_split(x_train, y_train, test_size=0.15)
    return x_train, x_val, x_test, y_train, y_val, y_test

Each of these looks like the following.

x_train x_val x_test

[[5.700e+01 3.790e+02 1.600e+02 ... 1.123e+03 3.320e+02 1.766e+03]
 [4.000e+00 1.800e+01 2.570e+02 ... 0.000e+00 0.000e+00 0.000e+00]
 [1.400e+02 4.010e+02 6.560e+02 ... 0.000e+00 0.000e+00 0.000e+00]
 ...
 [1.120e+02 2.500e+02 2.400e+01 ... 0.000e+00 0.000e+00 0.000e+00]
 [4.026e+03 5.930e+02 3.100e+02 ... 0.000e+00 0.000e+00 0.000e+00]
 [5.700e+02 5.646e+03 4.000e+00 ... 0.000e+00 0.000e+00 0.000e+00]]

Each row is the sequence of word indices for one sentence. The values print as floats simply because we passed dtype="float32" to pad_sequences, which doesn't affect anything; a single row taken out looks fine:

[114.  14. 405. 100.   7.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.   0.]

Looking these indices up in word_index recovers the original sentence:

[行 速度 好慢 一个多 小时]

The trailing zeros are padding: we need every sequence to have length 20, so longer sentences are truncated and shorter ones are filled with 0.
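For the curious, a small sketch of that reverse lookup (decode is a hypothetical helper, not part of the training script):

index_word = {idx: word for word, idx in word_index.items()}

def decode(row):
    """Map a padded index sequence back to words, dropping the 0 padding."""
    return [index_word[int(i)] for i in row if int(i) != 0]

print(decode([114, 14, 405, 100, 7, 0, 0, 0]))  # ['行', '速度', '好慢', '一个多', '小时']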

y_train y_val y_test

[0 0 0 ... 0 0 1]

These are the labels: 1 for a positive review, 0 for a negative one.
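Keep in mind the labels are imbalanced (4,000 positive vs. 7,987 negative), which matters when reading the accuracy numbers later; a quick check:

df = pd.read_csv("./waimai.csv", names=["label", "review"])
print(df["label"].value_counts())  # expect roughly 7987 zeros and 4000 ones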

Defining the Model

def build_model(word_index, embeddings_matrix):
    model = keras.Sequential()
    model.add(keras.layers.Embedding(input_dim=len(word_index)+1,
                                     output_dim=128,
                                     weights=[embeddings_matrix],
                                     input_length=20,
                                     trainable=False))
    model.add(keras.layers.GlobalAveragePooling1D())
    model.add(keras.layers.Dense(32, activation=tf.nn.relu))
    model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))

    # TF 1.x API; on TF 2.x use keras.optimizers.Adam() instead
    model.compile(optimizer=tf.train.AdamOptimizer(),
                  loss='binary_crossentropy',
                  metrics=['accuracy'])
    model.summary()
    return model

keras.layers.Embedding: the embedding layer

  • input_dim: vocabulary size (+1 because index 0 is reserved for words not in the vocabulary)
  • output_dim: word-vector dimensionality
  • weights: the pre-trained vector matrix
  • input_length: the fixed sequence length after padding/truncation
  • trainable: our word vectors are pre-trained, so the embedding matrix does not need further adjustment during training

keras.layers.GlobalAveragePooling1D: returns a fixed-length output vector for each example by averaging over the sequence dimension, which lets the model handle input of varying length in the simplest possible way.
keras.layers.Dense: a fully connected (Dense) layer with 32 hidden units.
keras.layers.Dense: same as above, but as the output layer; since this is a binary classification problem, a single unit with a sigmoid activation is all we need.

model.compile configures how the model is optimized; see my other blog post, or the official TensorFlow Keras documentation.

model.summary prints information about each layer:

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
embedding (Embedding)        (None, 20, 128)           1311616   
_________________________________________________________________
global_average_pooling1d (Gl (None, 128)               0         
_________________________________________________________________
dense (Dense)                (None, 32)                4128      
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
=================================================================
Total params: 1,315,777
Trainable params: 4,161
Non-trainable params: 1,311,616
_________________________________________________________________
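The parameter counts are easy to verify by hand: the embedding layer holds 10,247 × 128 = 1,311,616 weights (frozen, hence non-trainable), the 32-unit dense layer has 128 × 32 + 32 = 4,128, and the output layer has 32 + 1 = 33, giving 4,161 trainable parameters and 1,315,777 in total.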

Training

Train for 50 epochs:

model.fit(x_train, y_train, epochs=50, validation_data=(x_val, y_val))
Train on 8150 samples, validate on 1439 samples
Epoch 1/50
8150/8150 [==============================] - 1s 111us/sample - loss: 0.6213 - acc: 0.6618 - val_loss: 0.6101 - val_acc: 0.6782
Epoch 2/50
8150/8150 [==============================] - 0s 61us/sample - loss: 0.6153 - acc: 0.6649 - val_loss: 0.6113 - val_acc: 0.6782
Epoch 3/50
8150/8150 [==============================] - 1s 63us/sample - loss: 0.6138 - acc: 0.6648 - val_loss: 0.6102 - val_acc: 0.6782
Epoch 4/50
8150/8150 [==============================] - 1s 62us/sample - loss: 0.6127 - acc: 0.6655 - val_loss: 0.6127 - val_acc: 0.6782
...
Epoch 46/50
8150/8150 [==============================] - 0s 60us/sample - loss: 0.5445 - acc: 0.7245 - val_loss: 0.5325 - val_acc: 0.7262
Epoch 47/50
8150/8150 [==============================] - 1s 62us/sample - loss: 0.5394 - acc: 0.7303 - val_loss: 0.5336 - val_acc: 0.7457
Epoch 48/50
8150/8150 [==============================] - 1s 63us/sample - loss: 0.5411 - acc: 0.7252 - val_loss: 0.5339 - val_acc: 0.7519
Epoch 49/50
8150/8150 [==============================] - 1s 64us/sample - loss: 0.5400 - acc: 0.7302 - val_loss: 0.5296 - val_acc: 0.7366
Epoch 50/50
8150/8150 [==============================] - 1s 65us/sample - loss: 0.5383 - acc: 0.7298 - val_loss: 0.5343 - val_acc: 0.7165

Accuracy lands at around 70%.

Evaluation

Evaluate the model on the test set:

results = model.evaluate(x_test, y_test)
print(f"Loss: {results[0]}, Accuracy: {results[1]}")
2398/2398 [==============================] - 0s 36us/sample - loss: 0.5138 - acc: 0.7627
Loss: 0.5137957387213115, Accuracy: 0.762718915939331
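Given the class imbalance noted earlier, per-class precision and recall are more informative than accuracy alone; a sketch using scikit-learn (already a dependency here):

from sklearn.metrics import classification_report

y_pred = (model.predict(x_test).ravel() >= 0.5).astype("int32")
print(classification_report(y_test, y_pred, target_names=["negative", "positive"]))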

Saving the Model

model.save_weights('./model/waimai_model')

The resulting directory structure:

model
├── checkpoint
├── waimai_model.data-00000-of-00002
├── waimai_model.data-00001-of-00002
└── waimai_model.index

0 directories, 4 files

Loading the Model

import gensim

from train import build_model, build_embeddings_matrix


if __name__ == '__main__':
    word_vec_model = gensim.models.KeyedVectors.load_word2vec_format("word_vec.txt", binary=False)
    word_index, embeddings_matrix = build_embeddings_matrix(word_vec_model)
    model = build_model(word_index, embeddings_matrix)
    model.load_weights("./model/waimai_model")

When loading weights, the architecture must match the one used during training, which is why we call build_model to recreate the identical structure.
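As an aside, tf.keras can also serialize the whole model (architecture plus weights) in a single HDF5 file, which removes the need to rebuild the network by hand; a sketch (requires h5py, and since the TF 1.x optimizer used above can't be serialized, you would recompile after loading):

model.save("./model/waimai_model.h5")
restored = keras.models.load_model("./model/waimai_model.h5")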

Prediction

import jieba
import gensim
import numpy as np
from tensorflow import keras

from train import build_model, build_embeddings_matrix
from text_preprocessing import load_stop_words


if __name__ == '__main__':
    word_vec_model = gensim.models.KeyedVectors.load_word2vec_format("word_vec.txt", binary=False)
    word_index, embeddings_matrix = build_embeddings_matrix(word_vec_model)
    model = build_model(word_index, embeddings_matrix)
    model.load_weights("./model/waimai_model")

    stop_words = load_stop_words()

    while True:
        text = input("Enter a sentence: ")
        # Apply the same preprocessing as training: tokenize and drop stop words
        text = [word_index.get(word, 0) for word in jieba.cut(text) if word not in stop_words]
        text = keras.preprocessing.sequence.pad_sequences([text], maxlen=20, padding='post', truncating='post', dtype="float32")

        res = model.predict(text)[0][0]
        if res >= 0.5:
            print(f"Positive, score: {res*100}")
        else:
            print(f"Negative, score: {res*100}")

        print()

Give it a try:

Enter a sentence: 特别好吃,量特大,而且送餐特别快,特别特别棒
Positive, score: 71.35688066482544

Enter a sentence: 外送员很赞,商家能不能仔细看订单啊!点的干拌面送来的是汤面,说了粉汤羊血要多加辣椒送来的一点儿辣没有!!
Negative, score: 12.211278080940247

Enter a sentence: 肉夹馍肉太少
Negative, score: 45.60675919055939

Enter a sentence: 下了单说一个小时之后送,过了一个小时送餐员打电话又说晚15分钟,而且态度不好!味道也一般,跟小螺号比差远了,再也不点了
Negative, score: 4.753769561648369

Enter a sentence: 味道不错,份量很足,建议都点小份。红薯超好吃就是太烂了容易碎
Negative, score: 36.52855157852173

That's a simple binary text classifier built with tf.keras. Because the corpus is small, the pre-trained word vectors don't represent the words especially well; you can pre-train on a larger corpus, or use openly available word vectors instead. The complete code for this project is on my github.
