This post walks through the classic textCNN classification model.
The core idea of textCNN is to convolve over the key words/phrases in a text.
1. Turn every word of each text into a vector (in this example the word vectors are learned inside the network).
2. Normalize every tokenized text to a fixed length (usually about 80%-90% of the longest text's length).
3. Run a Tokenizer over the words, i.e. map each word to an integer index, e.g. '用户': 1.
4. Feed the tokenized data_train into the network. (The network's parameters and how it works will be explained in detail in the next post.)
5. One-hot encode the y values (labels). A minimal toy sketch of steps 3-5 follows right after this list.
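The sketch below is purely illustrative: the two segmented sentences, maxlen=4 and num_classes=2 are made-up values, and the real settings appear later in the code.
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
toy_texts = [['用户', '投诉', '网络', '信号', '差'], ['用户', '咨询', '套餐']]  # already-segmented toy sentences
toy_labels = [0, 1]
toy_tokenizer = Tokenizer(num_words=50)
toy_tokenizer.fit_on_texts(toy_texts)                      # builds the word -> index map, e.g. {'用户': 1, ...}
toy_seqs = toy_tokenizer.texts_to_sequences(toy_texts)     # replace every word with its integer index
toy_x = pad_sequences(toy_seqs, maxlen=4, padding='post')  # pad/truncate every row to length 4
toy_y = to_categorical(toy_labels, num_classes=2)          # one-hot encode the labels
print(toy_tokenizer.word_index)
print(toy_x)
print(toy_y)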
import pandas as pd
import numpy as np
import jieba
import re
# optional imports for multi-process word segmentation
import multiprocessing
from multiprocessing import Pool
from keras.utils import to_categorical,multi_gpu_model
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Dense, Embedding, Activation, Input, Lambda, Reshape, BatchNormalization
from keras.layers import Conv1D, Flatten, Dropout, MaxPool1D, GlobalAveragePooling1D,SeparableConvolution1D
from keras import regularizers
from keras.layers.merge import concatenate
# accuracy metric
from sklearn import metrics
# read the data
df_train = pd.read_excel(r'C:\Users\admin\Desktop\text_cf\zzyw.xlsx', sheet_name='训练集')
df_test = pd.read_excel(r'C:\Users\admin\Desktop\text_cf\zzyw.xlsx', sheet_name='测试集')
# word segmentation with jieba
def seg_sentences(sentence):
    # keep only Chinese characters, digits and ASCII letters
    sentence = re.sub(u"([^\u4e00-\u9fa5\u0030-\u0039\u0041-\u005a\u0061-\u007a])", "", sentence)
    sentence_seged = list(jieba.cut(sentence.strip()))
    return sentence_seged
df_train['文本分词']=df_train['文本'].apply(seg_sentences)
df_test['文本分词']=df_test['文本'].apply(seg_sentences)
# Y (label) handling
lables_list = df_train['标签'].unique().tolist()
dig_lables = dict(enumerate(lables_list))
lable_dig = dict((lable, dig) for dig, lable in dig_lables.items())
df_train['标签_数字'] = df_train['标签'].apply(lambda lable: lable_dig[lable])
# number of classes, i.e. how many categories Y is split into (must be defined before to_categorical)
num_classes = len(dig_lables)
train_lables = to_categorical(df_train['标签_数字'], num_classes=num_classes)
# X (feature) handling
num_words = 10000  # size of the vocabulary kept by the tokenizer
max_len = 200      # max words per row; shorter rows are padded with 0, longer rows are truncated
tokenizer = Tokenizer(num_words=num_words)
df_all = pd.concat([df_train['文本分词'], df_test['文本分词']])
tokenizer.fit_on_texts(df_all)
train_sequences = tokenizer.texts_to_sequences(df_train['文本分词'])
train_data = pad_sequences(train_sequences, maxlen=max_len, padding='post')
# same processing for the test set
test_sequences = tokenizer.texts_to_sequences(df_test['文本分词'])
test_data = pad_sequences(test_sequences, maxlen=max_len, padding='post')
def cnn(words_num, embedding_dims, max_len, num_class):
    tensor_input = Input(shape=(max_len,), dtype='float64')
    embed = Embedding(words_num+1, embedding_dims)(tensor_input)
    # three separable-convolution branches with kernel sizes 3, 4 and 5
    cnn1 = SeparableConvolution1D(200, 3, padding='same', strides=1, activation='relu', kernel_regularizer=regularizers.l1(0.00001))(embed)
    cnn1 = BatchNormalization()(cnn1)
    cnn1 = MaxPool1D(pool_size=100)(cnn1)
    cnn2 = SeparableConvolution1D(200, 4, padding='same', strides=1, activation='relu', kernel_regularizer=regularizers.l1(0.00001))(embed)
    cnn2 = BatchNormalization()(cnn2)
    cnn2 = MaxPool1D(pool_size=100)(cnn2)
    cnn3 = SeparableConvolution1D(200, 5, padding='same', strides=1, activation='relu', kernel_regularizer=regularizers.l1(0.00001))(embed)
    cnn3 = BatchNormalization()(cnn3)
    cnn3 = MaxPool1D(pool_size=100)(cnn3)
    # concatenate the three branches along the feature axis
    cnn = concatenate([cnn1, cnn2, cnn3], axis=-1)
    dropout = Dropout(0.5)(cnn)
    flatten = Flatten()(dropout)
    dense = Dense(128, activation='relu')(flatten)
    dense = BatchNormalization()(dense)
    dropout = Dropout(0.5)(dense)
    tensor_output = Dense(num_class, activation='softmax')(dropout)
    model = Model(inputs=tensor_input, outputs=tensor_output)
    print(model.summary())
    # model = multi_gpu_model(model, gpus=i)  # if GPUs are available, i is the number of GPUs
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model
model_sepcnn = cnn(words_num=len(tokenizer.word_index),embedding_dims=300,max_len=max_len,num_class=num_classes)
model_sepcnn.fit(train_data,train_lables,epochs=8, batch_size=512)
print('training finished')
pred_ = [model_sepcnn.predict(vec.reshape(1,max_len)).argmax() for vec in test_data]
df_test['分类结果_预测'] = [dig_lables[dig] for dig in pred_]
metrics.accuracy_score(df_test['标签'],df_test['分类结果_预测'])
This example uses Keras's Embedding layer to learn the word vectors; you could also build word vectors with word2vec, or use tf-idf features instead. I will share more on these in future posts, and I hope you will keep following along.
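As a rough preview of the word2vec route (a minimal sketch, not part of the pipeline above: it assumes gensim is installed, reuses the tokenizer fitted earlier, and the model file name w2v.model is hypothetical), pretrained vectors could be loaded into the Embedding layer through its weights argument:
from gensim.models import Word2Vec
w2v = Word2Vec.load('w2v.model')  # hypothetical path to a pretrained word2vec model
w2v_dims = w2v.vector_size
embedding_matrix = np.zeros((len(tokenizer.word_index) + 1, w2v_dims))
for word, idx in tokenizer.word_index.items():
    if word in w2v.wv:                        # words missing from word2vec keep an all-zero vector
        embedding_matrix[idx] = w2v.wv[word]
# inside cnn(), the Embedding line would then become:
# embed = Embedding(words_num + 1, w2v_dims, weights=[embedding_matrix], trainable=False)(tensor_input)
Setting trainable=False keeps the pretrained vectors fixed; setting it to True would fine-tune them during training.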