import numpy as np
import pandas as pd
from gensim.models import word2vec
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.utils import to_categorical
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
train_df = pd.read_csv('../data/train_set.csv', sep='\t')
test_df = pd.read_csv('../data/test_a.csv', sep='\t')
Preprocess the text data and convert it into word vectors. Each document is a space-separated string of anonymized token IDs, for example

2967 6758 339 2021 1854 3731 4109 3792 4149

which is converted into a sequence of dense vectors, one per token:

[-0.04762661, -0.11038123,...,-0.00834203],
[-0.01352869, -0.13543403,...,-0.02658689],
...,
[7.74508417e-02, 6.12210967e-02,...,7.56272748e-02]
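For concreteness, a quick peek at the raw frames (an illustrative check, not part of the pipeline):

# Each train row carries an integer label (0-13) and a long space-separated
# string of anonymized token IDs; the test frame has only the text column.
print(train_df.shape, test_df.shape)
print(train_df['text'].iloc[0][:50])  # e.g. '2967 6758 339 2021 1854 ...'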
# Load the pretrained Word2Vec model
w2v_model = word2vec.Word2Vec.load('../emb/word2vec.h5')
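Only the load call appears above. If the embedding file ever needs to be rebuilt, a minimal training sketch might look like the following; vector_size=120 is chosen to match the embedding_dims=120 used later, while window, min_count, workers, and epochs are illustrative assumptions (gensim >= 4 keyword names):

# Hypothetical retraining sketch, not the author's exact settings.
sentences = [doc.split(' ') for doc in train_df['text']]
w2v = word2vec.Word2Vec(sentences, vector_size=120, window=5,
                        min_count=1, workers=4, epochs=5)
w2v.save('../emb/word2vec.h5')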
# Convert each document into a fixed-length sequence of token IDs
# MAX_SEQUENCE_LENGTH = 1024
MAX_SEQUENCE_LENGTH = 50
input_categories = 'text'
# pad_sequences casts the string token IDs to int32 and pads/truncates
# each document to MAX_SEQUENCE_LENGTH at the end ('post').
list_train = list(train_df['text'].map(lambda x: x.split(' ')))
list_train_ = tf.keras.preprocessing.sequence.pad_sequences(
    list_train, padding='post', truncating='post', maxlen=MAX_SEQUENCE_LENGTH)
del list_train
list_test = list(test_df['text'].map(lambda x: x.split(' ')))
list_test_ = tf.keras.preprocessing.sequence.pad_sequences(
    list_test, padding='post', truncating='post', maxlen=MAX_SEQUENCE_LENGTH)
del list_test
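A quick shape check (the sizes shown are illustrative) confirms the padding step:

# Both arrays should be (num_documents, MAX_SEQUENCE_LENGTH), integer-typed,
# zero-padded and truncated at the end.
print(list_train_.shape, list_train_.dtype)  # e.g. (200000, 50) int32
print(list_test_.shape)                      # e.g. (50000, 50)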
# Look up the Word2Vec vector for every token; unknown tokens map to a zero vector.
def embedding_sentences(sentences, w2vModel):
    all_vectors = []
    embeddingDim = w2vModel.vector_size
    embeddingUnknown = [0 for i in range(embeddingDim)]
    for sentence in sentences:
        this_vector = []
        for word in sentence:
            word = str(word)  # pad_sequences yields ints; vocabulary keys are strings
            if word in w2vModel.wv:  # `w2vModel.wv.vocab` is gensim 3.x-only; this works in 3.x and 4.x
                this_vector.append(w2vModel.wv[word])  # `w2vModel[word]` is deprecated
            else:
                this_vector.append(embeddingUnknown)
        all_vectors.append(this_vector)
    return all_vectors
# Map every padded sequence to a (MAX_SEQUENCE_LENGTH, vector_size) matrix;
# float32 halves the memory footprint compared with the default float64.
inputs = np.array(embedding_sentences(list_train_, w2v_model), dtype=np.float32)
test_inputs = np.array(embedding_sentences(list_test_, w2v_model), dtype=np.float32)
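After the lookup, the tensors the model will consume (illustrative sizes again):

print(inputs.shape)       # e.g. (200000, 50, 120)
print(test_inputs.shape)  # e.g. (50000, 50, 120)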
output_categories = 'label'
def compute_output_arrays(df, columns):
return np.asarray(df[columns].astype(int))
outputs = compute_output_arrays(train_df, output_categories)
def Focal_Loss(y_true, y_pred, alpha=0.5, gamma=2):
    # Add epsilon so log(0) never occurs.
    y_pred += tf.keras.backend.epsilon()
    ce = -y_true * tf.math.log(y_pred)
    # (1 - p)^gamma down-weights well-classified (high-confidence) examples.
    weight = tf.pow(1 - y_pred, gamma) * y_true
    fl = ce * weight * alpha
    # y_true is one-hot, so the max over the class axis selects the target-class term.
    reduce_fl = tf.keras.backend.max(fl, axis=-1)
    return reduce_fl
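A tiny numeric check (the probabilities below are assumed values, not model output) shows the focal weighting at work: a confident, correct prediction is down-weighted far more than an uncertain one:

y_true = tf.constant([[0., 1., 0.]])
easy = tf.constant([[0.05, 0.90, 0.05]])  # confident and correct
hard = tf.constant([[0.30, 0.40, 0.30]])  # uncertain
print(Focal_Loss(y_true, easy).numpy())   # ~0.0005
print(Focal_Loss(y_true, hard).numpy())   # ~0.16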
def create_model(embedding_dims, max_len, num_class):
    tensor_input = tf.keras.Input(shape=(max_len, embedding_dims))
    # Three parallel TextCNN branches with kernel sizes 3/4/5.
    cnn1 = tf.keras.layers.SeparableConv1D(256, 3, padding='same', strides=1, activation='relu')(tensor_input)
    cnn1 = tf.keras.layers.BatchNormalization()(cnn1)
    cnn1 = tf.keras.layers.MaxPool1D(pool_size=max_len)(cnn1)
    cnn2 = tf.keras.layers.SeparableConv1D(256, 4, padding='same', strides=1, activation='relu')(tensor_input)
    cnn2 = tf.keras.layers.BatchNormalization()(cnn2)
    cnn2 = tf.keras.layers.MaxPool1D(pool_size=max_len)(cnn2)
    cnn3 = tf.keras.layers.SeparableConv1D(256, 5, padding='same', strides=1, activation='relu')(tensor_input)
    cnn3 = tf.keras.layers.BatchNormalization()(cnn3)
    cnn3 = tf.keras.layers.MaxPool1D(pool_size=max_len)(cnn3)
    cnn = tf.keras.layers.concatenate([cnn1, cnn2, cnn3], axis=-1)
    x = tf.keras.layers.Dropout(0.2)(cnn)
    x = tf.keras.layers.Flatten()(x)
    # (batch, features) -> (batch, 1, features) so the RNNs below get a sequence axis.
    x = tf.keras.layers.Reshape((1, -1))(x)
    x = tf.keras.layers.Dense(768, activation='relu')(x)
    # Parallel BiLSTM and BiGRU branches over the pooled features.
    x1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=True))(x)
    x1 = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64))(x1)
    x1 = tf.keras.layers.Dense(32, activation='relu')(x1)
    x2 = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(128, return_sequences=True))(x)
    x2 = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(64, return_sequences=True))(x2)
    x2 = tf.keras.layers.Bidirectional(tf.keras.layers.GRU(32))(x2)
    x2 = tf.keras.layers.Dense(16, activation='relu')(x2)
    x = tf.keras.layers.Concatenate()([x1, x2])
    x = tf.keras.layers.Dropout(0.2)(x)
    tensor_output = tf.keras.layers.Dense(num_class, activation='softmax')(x)
    model = tf.keras.models.Model(inputs=tensor_input, outputs=tensor_output)
    # model.summary()  # inspect the architecture
    optimizer = tf.keras.optimizers.Nadam(learning_rate=1e-5)
    FL = lambda y_true, y_pred: Focal_Loss(y_true, y_pred, alpha=0.25, gamma=2)
    model.compile(loss=FL, optimizer=optimizer, metrics=['acc'])
    return model
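As a quick sanity check, the model can be built once outside the CV loop to inspect the layer stack (illustrative, assuming embedding_dims matches the Word2Vec vector size):

m = create_model(embedding_dims=120, max_len=MAX_SEQUENCE_LENGTH, num_class=14)
m.summary()            # prints the full layer stack
print(m.output_shape)  # (None, 14) -- one softmax probability per class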
# Stratified 5-fold split keeps the 14-class label distribution in each fold.
skf = StratifiedKFold(n_splits=5).split(X=train_df[input_categories].fillna('13'), y=train_df[output_categories].fillna('13'))
test_preds = []
for fold, (train_idx, valid_idx) in enumerate(skf):
    train_inputs = inputs[train_idx]
    train_outputs = to_categorical(outputs[train_idx], num_classes=14)
    valid_inputs = inputs[valid_idx]
    valid_outputs = to_categorical(outputs[valid_idx], num_classes=14)
    # Destroy the current TF graph and create a new one, avoiding clutter
    # from the previous fold's model and layers.
    tf.keras.backend.clear_session()  # `K` was never imported, so use the full path
    # Build the model
    model = create_model(embedding_dims=120, max_len=MAX_SEQUENCE_LENGTH, num_class=14)
    model.fit(train_inputs, train_outputs,
              validation_data=(valid_inputs, valid_outputs),
              epochs=7, batch_size=16)
    # Predict on the test set
    test_preds.append(model.predict(test_inputs))
# Average the predicted probabilities across the K folds, then take the argmax class.
preds = np.average(test_preds, axis=0)
preds = np.argmax(preds, axis=1)
submission = pd.read_csv('../data/test_a_sample_submit.csv')
submission['label'] = preds
submission.to_csv('../output/Word2Vec_submission.csv', index=False)