新闻文本分类之旅 Word2Vec_CNN_GRU

天池-零基础入门NLP

  • 新闻文本分类
    • 导入相关库
    • 读取数据
    • 数据预处理
    • 自定义模型
    • 输出上传文件

新闻文本分类

导入相关库

import numpy as np
import pandas as pd
from gensim.models import word2vec
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')
%pylab inline

读取数据

# Tab-separated competition files; the `text` column holds space-joined
# anonymized token ids (see the split(' ') preprocessing below).
train_df = pd.read_csv('../data/train_set.csv', sep='\t')
test_df = pd.read_csv('../data/test_a.csv', sep='\t')

数据预处理

将文本数据预处理,转换成词向量
2967 6758 339 2021 1854 3731 4109 3792 4149
转换成
[-0.04762661, -0.11038123,...,-0.00834203],
[-0.01352869, -0.13543403,...,-0.02658689],
...,
[7.74508417e-02, 6.12210967e-02,...,7.56272748e-02]

  • 将文本数据转换成序列
    注意:MAX_SEQUENCE_LENGTH 越大越吃内存
# Load the pre-trained Word2Vec model.
w2v_model = word2vec.Word2Vec.load('../emb/word2vec.h5')

# Convert each document into a fixed-length token sequence.
# NOTE: memory usage grows linearly with MAX_SEQUENCE_LENGTH
# (the author originally tried 1024 before settling on 50).
MAX_SEQUENCE_LENGTH = 50  
input_categories = 'text'
list_train = list(train_df['text'].map(lambda x:x.split(' ')))
# Pad/truncate at the end ('post') to exactly MAX_SEQUENCE_LENGTH tokens;
# padded positions are filled with 0, which the embedding step maps to zeros.
list_train_ = tf.keras.preprocessing.sequence.pad_sequences(list_train, 
                    padding='post', truncating='post', maxlen=MAX_SEQUENCE_LENGTH)
del list_train  # free the raw token lists as soon as they are padded
list_test = list(test_df['text'].map(lambda x:x.split(' ')))
list_test_ = tf.keras.preprocessing.sequence.pad_sequences(list_test, 
                    padding='post', truncating='post', maxlen=MAX_SEQUENCE_LENGTH)
del list_test
  • 将序列转换成词向量
# 把词转换成word2vec的词向量
# Map each token of each (padded) sequence to its Word2Vec vector.
def embedding_sentences(sentences, w2vModel):
    """Convert token sequences into lists of Word2Vec embedding vectors.

    Args:
        sentences: iterable of token sequences; tokens may be ints (as
            produced by pad_sequences) or strings — each is converted to
            ``str`` before lookup, matching the string keys the Word2Vec
            model was trained on.
        w2vModel: trained gensim Word2Vec model exposing ``vector_size``
            and ``wv``.

    Returns:
        A list (one entry per sentence) of lists of vectors. Tokens missing
        from the vocabulary — including the ``0`` padding value — map to an
        all-zero vector of length ``vector_size``.
    """
    all_vectors = []
    embedding_dim = w2vModel.vector_size
    unknown_vec = [0.0] * embedding_dim
    for sentence in sentences:
        this_vector = []
        for word in sentence:
            word = str(word)
            # `word in w2vModel.wv` / `w2vModel.wv[word]` work in both
            # gensim 3.x and 4.x; the original `wv.vocab` attribute and
            # `w2vModel[word]` item access were removed in gensim 4.x.
            if word in w2vModel.wv:
                this_vector.append(w2vModel.wv[word])
            else:
                this_vector.append(unknown_vec)
        all_vectors.append(this_vector)
    return all_vectors
 
# Embed every padded sequence; result shape is
# (num_docs, MAX_SEQUENCE_LENGTH, w2v_model.vector_size). Memory-heavy.
inputs = np.array(embedding_sentences(list_train_, w2v_model))
test_inputs = np.array(embedding_sentences(list_test_, w2v_model))
  • 类别标签转换
# Name of the target column in the training dataframe.
output_categories = 'label'

def compute_output_arrays(df, columns):
    """Return the given column(s) of *df* as an integer numpy array."""
    return df[columns].astype(int).to_numpy()
outputs = compute_output_arrays(train_df, output_categories)

自定义模型

  • 用 Focal Loss 自定义损失函数
def Focal_Loss(y_true, y_pred, alpha=0.5, gamma=2):
    """Focal loss (Lin et al., 2017) for one-hot multi-class targets.

    The (1 - p)^gamma modulating factor down-weights well-classified
    examples so training concentrates on hard ones. Because y_true is
    one-hot, the max over the class axis picks out the single true-class
    term (all other entries are zero).
    """
    probs = y_pred + tf.keras.backend.epsilon()  # guard against log(0)
    cross_entropy = -y_true * tf.math.log(probs)
    modulator = tf.pow(1 - probs, gamma) * y_true
    focal = alpha * cross_entropy * modulator
    return tf.keras.backend.max(focal, axis=-1)
  • 自定义模型
def create_model(embedding_dims, max_len, num_class):
    """Build and compile the TextCNN + BiLSTM/BiGRU classifier.

    Args:
        embedding_dims: dimensionality of each word vector
            (the Word2Vec ``vector_size``).
        max_len: input sequence length (MAX_SEQUENCE_LENGTH).
        num_class: number of output classes (softmax width).

    Returns:
        A compiled tf.keras Model mapping (batch, max_len, embedding_dims)
        inputs to (batch, num_class) softmax probabilities, trained with
        focal loss and the Nadam optimizer.
    """
    tensor_input = tf.keras.Input(shape=(max_len, embedding_dims))
    # Three parallel TextCNN branches (kernel sizes 3/4/5), each max-pooled
    # over the full sequence (pool_size=max_len -> one value per filter).
    cnn1 = tf.keras.layers.SeparableConv1D(256, 3, padding='same', strides = 1, activation='relu')(tensor_input)
    cnn1 = tf.keras.layers.BatchNormalization()(cnn1)    
    cnn1 = tf.keras.layers.MaxPool1D(pool_size=max_len)(cnn1)
    cnn2 = tf.keras.layers.SeparableConv1D(256, 4, padding='same', strides = 1, activation='relu')(tensor_input)
    cnn2 = tf.keras.layers.BatchNormalization()(cnn2)    
    cnn2 = tf.keras.layers.MaxPool1D(pool_size=max_len)(cnn2)
    cnn3 = tf.keras.layers.SeparableConv1D(256, 5, padding='same', strides = 1, activation='relu')(tensor_input)
    cnn3 = tf.keras.layers.BatchNormalization()(cnn3)    
    cnn3 = tf.keras.layers.MaxPool1D(pool_size=max_len)(cnn3)
    cnn = tf.keras.layers.concatenate([cnn1,cnn2,cnn3], axis=-1)
    x = tf.keras.layers.Dropout(0.2)(cnn)
    x = tf.keras.layers.Flatten()(x)
    
    # Re-insert a length-1 time axis so the recurrent branches can run.
    # NOTE(review): a raw tf.reshape inside a functional model works here,
    # but tf.keras.layers.Reshape would be the conventional choice — confirm
    # before refactoring.
    x = tf.reshape(x,(-1,1,x.shape[-1]))
    x = tf.keras.layers.Dense(768, activation='relu')(x)
    # Branch 1: stacked BiLSTM head.
    x1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True))(x)
    x1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))(x1)
    x1 = tf.keras.layers.Dense(32, activation='relu')(x1)
    # Branch 2: stacked BiGRU head.
    x2 = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128, return_sequences=True))(x)
    x2 = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64, return_sequences=True))(x2)
    x2 = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32))(x2)
    x2 = tf.keras.layers.Dense(16, activation='relu')(x2)
    x = tf.keras.layers.Concatenate()([x1, x2])
    x = tf.keras.layers.Dropout(0.2)(x)
    tensor_output = tf.keras.layers.Dense(num_class, activation='softmax')(x)

    model = tf.keras.models.Model(tensor_input, outputs=tensor_output)
    # model.summary()  # uncomment to inspect the architecture
    optimizer = tf.keras.optimizers.Nadam(learning_rate=1e-5)
    # Focal loss with the commonly used alpha=0.25, gamma=2 settings.
    FL = lambda y_true,y_pred: Focal_Loss(y_true, y_pred, alpha=0.25, gamma=2)
    model.compile(loss=FL, optimizer=optimizer, metrics=['acc'])
    return model
  • 5折训练 + 预测取平均
# Stratified 5-fold split so every fold keeps the class distribution.
gkf = StratifiedKFold(n_splits=5).split(X=train_df[input_categories].fillna('13'), y=train_df[output_categories].fillna('13'))

test_preds = []
for fold, (train_idx, valid_idx) in enumerate(gkf):
    train_inputs = inputs[train_idx]
    train_outputs = to_categorical(outputs[train_idx])

    valid_inputs = inputs[valid_idx]
    valid_outputs = to_categorical(outputs[valid_idx])

    # Destroy the current TF graph between folds so stale models/layers
    # don't accumulate. Bug fix: the original called `K.clear_session()`
    # but `K` was never imported (only `tensorflow as tf`), which raises
    # NameError on the first fold.
    tf.keras.backend.clear_session()
    # Build a fresh model for this fold.
    model = create_model(embedding_dims=120, max_len=MAX_SEQUENCE_LENGTH, num_class=14)
    model.fit(train_inputs, train_outputs, validation_data=(valid_inputs, valid_outputs), epochs=7, batch_size=16)
    # Collect this fold's test-set class probabilities.
    test_preds.append(model.predict(test_inputs))

# Average the fold probabilities, then take argmax as the predicted class.
preds = np.average(test_preds, axis=0)
preds = np.argmax(preds,axis=1)

输出上传文件

# Fill the sample-submission template with the averaged-fold predictions
# and write the upload file.
submission = pd.read_csv('../data/test_a_sample_submit.csv')
submission['label'] = preds
submission.to_csv('../output/Word2Vec_submission.csv', index=False)

你可能感兴趣的:(自然语言处理)