Using GloVe pretrained word vectors to build an embedding matrix in Keras, with a Jigsaw Unintended Bias in Toxicity Classification baseline as the example

Data loading

import numpy as np
import pandas as pd
import os
import gc
import logging
import datetime
import warnings
import pickle
from keras.callbacks import EarlyStopping, ModelCheckpoint, LearningRateScheduler
from keras.layers import Input, Dense, Embedding, SpatialDropout1D, Dropout, add, concatenate
from keras.layers import CuDNNLSTM, Bidirectional, GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.preprocessing import text, sequence
from keras.losses import binary_crossentropy
from keras import backend as K
import keras.layers as L
from keras.engine.topology import Layer
from keras import initializers, regularizers, constraints, optimizers, layers

from keras.models import Model
from keras.optimizers import Adam
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

COMMENT_TEXT_COL = 'comment_text'
EMB_MAX_FEAT = 300
MAX_LEN = 220
MAX_FEATURES = 100000
BATCH_SIZE = 512
NUM_EPOCHS = 4
LSTM_UNITS = 128
DENSE_HIDDEN_UNITS = 512
NUM_MODELS = 2
EMB_PATHS = [
    #'data/crawl-300d-2M.vec',
    'data/glove.840B.300d.txt'
]
JIGSAW_PATH = 'data/'


def get_logger():
    FORMAT = '[%(levelname)s]%(asctime)s:%(name)s:%(message)s'
    logging.basicConfig(format=FORMAT)
    logger = logging.getLogger('main')
    logger.setLevel(logging.DEBUG)
    return logger
logger = get_logger()
############################################################################################

def custom_loss(y_true, y_pred):  # weighted binary cross-entropy: column 0 of y_true is the label, column 1 the per-sample weight
    return binary_crossentropy(K.reshape(y_true[:, 0], (-1, 1)), y_pred) * y_true[:, 1]
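
A minimal sketch (not part of the pipeline; the values are made up) of how this loss reads its two-column y_true: column 0 is the binarised label, column 1 the per-sample weight that is computed later in run_proc_and_tokenizer. It assumes the TensorFlow backend so K.eval can evaluate the constants directly.

# demo_y_true / demo_y_pred are hypothetical tensors, only for illustration
demo_y_true = K.constant([[1.0, 0.25],   # positive example, weight 0.25
                          [0.0, 1.00]])  # negative example, weight 1.0
demo_y_pred = K.constant([[0.9],
                          [0.2]])
# per-sample binary cross-entropy scaled by the weight column, roughly [0.026, 0.223]
print(K.eval(custom_loss(demo_y_true, demo_y_pred)))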


def load_data():
    logger.info('Load train and test data')
    train = pd.read_csv(os.path.join(JIGSAW_PATH, 'train.csv'), index_col='id')
    test = pd.read_csv(os.path.join(JIGSAW_PATH, 'test.csv'), index_col='id')
    return train, test

Data preprocessing

def perform_preprocessing(train, test):
    logger.info('data preprocessing')
    punct_mapping = {"_": " ", "`": " "}
    punct = "/-'?!.,#$%\'()*+-/:;<=>@[\\]^_`{|}~" + '""“”’' + '∞θ÷α•à−β∅³π‘₹´°£€\×™√²—–&'
    punct += '©^®` <→°€™› ♥←ק″′Â█½à…“★”–●â►−¢²¬░¶↑±¿▾═¦║―¥▓—‹─▒:¼⊕▼▪†■’▀¨▄♫☆é¯♦¤▲踾Ã⋅‘∞∙)↓、│(»,♪╩╚³・╦╣╔╗▬❤ïع≤‡√'

    def clean_special_chars(text, punct, mapping):  # map special characters and pad punctuation with spaces
        for p in mapping:
            text = text.replace(p, mapping[p])
        for p in punct:
            text = text.replace(p, f' {p} ')
        return text

    for df in [train, test]:  # clean the special characters in each comment
        df[COMMENT_TEXT_COL] = df[COMMENT_TEXT_COL].astype(str)
        df[COMMENT_TEXT_COL] = df[COMMENT_TEXT_COL].apply(lambda x: clean_special_chars(x, punct, punct_mapping))

    return train, test
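
A quick illustration of the cleaning step on a toy string. clean_special_chars is local to perform_preprocessing, so the same idea is repeated inline here on a hypothetical input:

sample = "Don't do that!"
for p in "'!":
    sample = sample.replace(p, f' {p} ')
print(sample)   # "Don ' t do that ! " -- punctuation becomes separate tokens for the tokenizer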


def run_proc_and_tokenizer(train, test):

    logger.info('Running processing and tokenizer')

    identity_columns = ['asian', 'atheist',
                        'bisexual', 'black', 'buddhist', 'christian', 'female',
                        'heterosexual', 'hindu', 'homosexual_gay_or_lesbian',
                        'intellectual_or_learning_disability', 'jewish', 'latino', 'male',
                        'muslim', 'other_disability', 'other_gender',
                        'other_race_or_ethnicity', 'other_religion',
                        'other_sexual_orientation', 'physical_disability',
                        'psychiatric_or_mental_illness', 'transgender', 'white']

    # Overall
    weights = np.ones((len(train),)) / 4
    # Subgroup
    weights += (train[identity_columns].fillna(0).values >= 0.5).sum(axis=1).astype(bool).astype(int) / 4
    # Background Positive, Subgroup Negative
    weights += (((train['target'].values >= 0.5).astype(bool).astype(int) +
                 (train[identity_columns].fillna(0).values < 0.5).sum(axis=1).astype(bool).astype(int)) > 1
                ).astype(bool).astype(int) / 4
    # Background Negative, Subgroup Positive
    weights += (((train['target'].values < 0.5).astype(bool).astype(int) +
                 (train[identity_columns].fillna(0).values >= 0.5).sum(axis=1).astype(bool).astype(int)) > 1
                ).astype(bool).astype(int) / 4
    loss_weight = 1.0 / weights.mean()

    y_train = np.vstack([(train['target'].values >= 0.5).astype(int), weights]).T
    y_aux_train = train[['target', 'severe_toxicity', 'obscene', 'identity_attack', 'insult', 'threat']].values
    # Everything above is extra preprocessing (per-sample weights) required by the competition's bias-aware metric.
    logger.info('Fitting tokenizer')
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(list(train[COMMENT_TEXT_COL]) + list(test[COMMENT_TEXT_COL]))  # build the token dictionary from all documents; the argument is a list whose elements are individual documents
    word_index = tokenizer.word_index  # dict mapping every word to an integer id, starting from 1
    X_train = tokenizer.texts_to_sequences(list(train[COMMENT_TEXT_COL]))  # convert each document into a list of word ids; one variable-length sequence per document
    X_test = tokenizer.texts_to_sequences(list(test[COMMENT_TEXT_COL]))
    X_train = pad_sequences(X_train, maxlen=MAX_LEN)  # force every sequence to length MAX_LEN: longer ones are truncated, shorter ones zero-padded
    X_test = pad_sequences(X_test, maxlen=MAX_LEN)

    with open('temporary.pickle', mode='wb') as f:
        pickle.dump(X_test, f)  # use temporary file to reduce memory

    del identity_columns, weights, tokenizer, train, test
    gc.collect()

    return X_train, y_train, y_aux_train, word_index, loss_weight


train, test = load_data()
train, test = perform_preprocessing(train, test)
X_train, y_train, y_aux_train, word_index, loss_weight = run_proc_and_tokenizer(train, test)  # preprocessing, weighting and tokenisation
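
Before moving on, a toy illustration (hypothetical sentences, not competition data) of what Tokenizer and pad_sequences return:

toy = Tokenizer()
toy.fit_on_texts(['the cat sat', 'the dog barked loudly'])
print(toy.word_index)                    # {'the': 1, 'cat': 2, 'sat': 3, 'dog': 4, 'barked': 5, 'loudly': 6}, indices ordered by frequency
seqs = toy.texts_to_sequences(['the cat barked quietly'])
print(seqs)                              # [[1, 2, 5]] -- 'quietly' was never seen during fitting, so it is dropped
print(pad_sequences(seqs, maxlen=5))     # [[0 0 1 2 5]] -- zero-padded at the front to length 5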

Building the embedding matrix

def get_coefs(word, *arr):  # parse one line of the GloVe file; each line has the form: word emb_1 ... emb_300
    return word, np.asarray(arr, dtype='float32')

def load_embeddings(path):
    with open(path,encoding='utf8') as f:
        return dict(get_coefs(*line.strip().split(' ')) for line in f)

def build_embedding_matrix(word_index, path):
    logger.info('Build embedding matrix')
    # embedding_index is a dict: key = word, value = its pretrained multi-dimensional float vector
    embedding_index = load_embeddings(path)
    # word_index is the dict of words actually used in this experiment
    embedding_matrix = np.zeros((len(word_index) + 1, EMB_MAX_FEAT))  # initialise the embedding matrix; row 0 is reserved for padding
    for word, i in word_index.items():  # word_index maps word -> integer index
        try:
            embedding_matrix[i] = embedding_index[word]  # fill in the pretrained vector for every word used in this experiment
        except KeyError:
            pass  # out-of-vocabulary words keep the all-zero row
        except:
            embedding_matrix[i] = embedding_index["unknown"]  # e.g. a malformed vector: fall back to the "unknown" embedding
    del embedding_index
    gc.collect()
    return embedding_matrix

def build_embeddings(word_index):  # build one embedding matrix per pretrained file and concatenate them
    logger.info('Load and build embeddings')
    matrices = [build_embedding_matrix(word_index, f) for f in EMB_PATHS]
    for m in matrices:
        print('embedding_matrix shape:', m.shape)
    embedding_matrix = np.concatenate(matrices, axis=-1)  # concatenate along the feature axis; with two 300-d files each word would get a 600-d vector
    return embedding_matrix

embedding_matrix = build_embeddings(word_index)
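
A quick sanity check on the result (this assumes the token 'the' occurs in the corpus and is covered by GloVe; with the single GloVe file above the matrix has 300 columns):

print(embedding_matrix.shape)            # (len(word_index) + 1, 300)
idx = word_index.get('the')
if idx is not None:
    print(embedding_matrix[idx][:5])     # first 5 dimensions of its pretrained vector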

Building the model and predicting


def build_model(embedding_matrix, num_aux_targets, loss_weight):

    logger.info('Build model')
    words = Input(shape=(MAX_LEN,))
    x = Embedding(*embedding_matrix.shape, weights=[embedding_matrix], trainable=False)(words)
    x = SpatialDropout1D(0.3)(x)
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)
    x = Bidirectional(CuDNNLSTM(LSTM_UNITS, return_sequences=True))(x)

    hidden = concatenate([GlobalMaxPooling1D()(x), GlobalAveragePooling1D()(x), ])
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    hidden = add([hidden, Dense(DENSE_HIDDEN_UNITS, activation='relu')(hidden)])
    result = Dense(1, activation='sigmoid')(hidden)
    aux_result = Dense(num_aux_targets, activation='sigmoid')(hidden)

    model = Model(inputs=words, outputs=[result, aux_result])
    model.compile(loss=[custom_loss, 'binary_crossentropy'], loss_weights=[loss_weight, 1.0], optimizer='adam')

    return model
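
For reference, the star-unpacking in the Embedding call above just passes input_dim and output_dim; with a single 300-d GloVe file it is equivalent to the explicit form sketched below (emb_layer is a hypothetical name):

emb_layer = Embedding(input_dim=embedding_matrix.shape[0],   # len(word_index) + 1
                      output_dim=embedding_matrix.shape[1],  # 300 for one GloVe file
                      weights=[embedding_matrix],
                      trainable=False)                       # keep the pretrained vectors frozen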


def run_model(X_train, y_train, y_aux_train, embedding_matrix, word_index, loss_weight):
    logger.info('Run model')

    checkpoint_predictions = []
    weights = []
    for model_idx in range(NUM_MODELS):
        model = build_model(embedding_matrix, y_aux_train.shape[-1], loss_weight)
        for global_epoch in range(NUM_EPOCHS):
            model.fit(
                X_train, [y_train, y_aux_train],
                batch_size=BATCH_SIZE, epochs=1, verbose=1,
                callbacks=[LearningRateScheduler(lambda epoch: 1.1e-3 * (0.55 ** global_epoch))]
            )
            with open('temporary.pickle', mode='rb') as f:
                X_test = pickle.load(f)  # use temporary file to reduce memory
            checkpoint_predictions.append(model.predict(X_test, batch_size=1024)[0].flatten())
            del X_test
            gc.collect()
            weights.append(2 ** global_epoch)
        del model
        gc.collect()

    preds = np.average(checkpoint_predictions, weights=weights, axis=0)
    return preds
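
The checkpoint predictions are averaged with weights of 2 ** global_epoch, so later (better-trained) checkpoints count more; a quick calculation shows how much:

# with NUM_EPOCHS = 4 the per-checkpoint weights within one model are 1, 2, 4, 8,
# i.e. the final epoch's predictions contribute 8/15 of that model's share
w = [2 ** e for e in range(NUM_EPOCHS)]
print(w, [round(x / sum(w), 3) for x in w])   # [1, 2, 4, 8] [0.067, 0.133, 0.267, 0.533]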


def submit(sub_preds):
    logger.info('Prepare submission')
    submission = pd.read_csv(os.path.join(JIGSAW_PATH, 'sample_submission.csv'), index_col='id')
    submission['prediction'] = sub_preds
    submission.reset_index(drop=False, inplace=True)
    submission.to_csv('submission.csv', index=False)

sub_preds = run_model(X_train, y_train, y_aux_train, embedding_matrix, word_index, loss_weight)
submit(sub_preds)
