NLP - Loading a pre-trained embedding (example)

Sentiment analysis on text (a binary classification model based on Keras)

# -*- coding: utf-8 -*-
import os
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding
import matplotlib.pyplot as plt


labels = []
texts = []

maxlen = 100                  # cut each review off after 100 words
training_samples = 20000      # number of training samples
validation_samples = 4000     # number of validation samples
max_words = 10000             # only consider the top 10,000 words in the corpus
embedding_index = {}          # word -> pre-trained GloVe vector
embedding_dim = 300           # dimensionality of the GloVe vectors (glove.6B.300d)

predict_texts = ['I love you and you are so beautiful, you are good, you are good, you are good',
                 'I hate you and you are bad, you are bad, you are bad',
                 'you are good']


def get_datasource():
    # Read the raw IMDB training reviews: 'neg' reviews get label 0, 'pos' reviews get label 1
    imdb_dir = 'D:/DL/keras/aclImdb'
    train_dir = os.path.join(imdb_dir, 'train')
    for label_type in ['neg', 'pos']:
        dir_name = os.path.join(train_dir, label_type)
        for fname in os.listdir(dir_name):
            if fname.endswith('.txt'):
                with open(os.path.join(dir_name, fname), 'r', encoding='UTF-8') as f:
                    texts.append(f.read())
                labels.append(0 if label_type == 'neg' else 1)

    print('texts', texts[0:3])
    print('len(texts)', len(texts))
    return texts, labels


def text_split(training_samples, validation_samples, texts, labels):
    tokenizer = Tokenizer(num_words=max_words)
    tokenizer.fit_on_texts(texts)
    # Convert each string into a list of integer word indices
    sequences = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = pad_sequences(sequences, maxlen=maxlen)

    labels = np.asarray(labels)
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)
    print('data[0]: ', len(data[0]), '\n', data[0])

    # Shuffle data and labels with the same permutation before splitting
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]

    x_train = data[:training_samples]
    y_train = labels[:training_samples]
    x_val = data[training_samples: training_samples + validation_samples]
    y_val = labels[training_samples: training_samples + validation_samples]

    return x_train, y_train, x_val, y_val, word_index, tokenizer


def prepare_Glove(embedding_index, word_index, max_words, embedding_dim):
    # Parse the GloVe file: each line is a word followed by its vector components
    glove_dir = 'D:/DL/keras/glove.6B'
    with open(os.path.join(glove_dir, 'glove.6B.300d.txt'), 'r', encoding='UTF-8') as f:
        for line in f:
            values = line.split()
            word = values[0]
            coefs = np.asarray(values[1:], dtype='float32')
            embedding_index[word] = coefs
    print('Found %s word vectors.' % len(embedding_index))

    # word_index holds the words of the corpus, embedding_index the pre-trained vectors.
    # Words not found in GloVe (or with index >= max_words) keep an all-zeros row.
    embedding_matrix = np.zeros((max_words, embedding_dim))
    for word, i in word_index.items():
        if i < max_words:
            embedding_vector = embedding_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix


def predict_samples_vec(samples, tokenizer):
    # Convert raw strings into padded integer sequences using the already-fitted tokenizer
    sequence = tokenizer.texts_to_sequences(samples)
    pad_seq = pad_sequences(sequence, maxlen=maxlen)
    print('pad_seq', pad_seq)
    return pad_seq


def my_model(max_words, embedding_dim, maxlen):
    model = Sequential()
    model.add(Embedding(max_words, embedding_dim, input_length=maxlen))
    # Flatten the 3D embedding tensor into a 2D tensor of shape (samples, maxlen * embedding_dim)
    model.add(Flatten())
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.summary()
    return model


def train_model(embedding_dim, maxlen, embedding_matrix, x_train, y_train, x_val, y_val):
    model = my_model(max_words, embedding_dim, maxlen)
    model.layers[0].set_weights([embedding_matrix])  # load the pre-trained embeddings
    model.layers[0].trainable = False                # freeze the Embedding layer

    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
    save_model_file = 'pre_trained_glove_model.h5'

    # Only train if no saved weights exist yet
    if not os.path.exists(save_model_file):

        history = model.fit(x_train, y_train, epochs=20, batch_size=128, validation_data=(x_val, y_val))
        model.save_weights(save_model_file)

        acc = history.history['acc']
        val_acc = history.history['val_acc']
        loss = history.history['loss']
        val_loss = history.history['val_loss']
        epochs = range(1, len(acc) + 1)
        plt.plot(epochs, acc, 'bo', label='Training acc')
        plt.plot(epochs, val_acc, 'b', label='Validation acc')
        plt.title('Training and validation accuracy')
        plt.legend()

        plt.figure()
        plt.plot(epochs, loss, 'bo', label='Training loss')
        plt.plot(epochs, val_loss, 'b', label='Validation loss')
        plt.title('Training and validation loss')
        plt.legend()
        plt.show()


def predict(samples):
    model = my_model(max_words, embedding_dim, maxlen)
    model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

    save_model_file = 'pre_trained_glove_model.h5'
    model.load_weights(save_model_file)
    print('model has been restored...')
    pred_test = model.predict(samples)
    for i in pred_test:
        if i[0] < 0.3:
            print('this is a negative comment')
        else:
            print('this is a positive comment')


if __name__ == "__main__":
    text, label = get_datasource()
    x_train, y_train, x_val, y_val, word_index, tokenizer = text_split(training_samples, validation_samples, text, label)
    # Uncomment the two lines below to build the embedding matrix and (re)train the model
    # embedding_matrix = prepare_Glove(embedding_index, word_index, max_words, embedding_dim)
    # train_model(embedding_dim, maxlen, embedding_matrix, x_train, y_train, x_val, y_val)

    predict_seq = predict_samples_vec(predict_texts, tokenizer)
    predict(predict_seq)





Two functions deserve special attention: prepare_Glove, which parses the GloVe vector file and builds an embedding matrix whose rows are aligned with the tokenizer's word_index, and train_model, which copies that matrix into the model's Embedding layer and freezes it before training.
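
To make that alignment concrete, here is a minimal sketch (the word 'movie' is a hypothetical example; it assumes prepare_Glove has already been run): row i of embedding_matrix holds the GloVe vector of the word whose tokenizer index is i.

# Minimal sketch, assuming 'movie' appears both in the corpus vocabulary and in GloVe
word = 'movie'
i = word_index[word]
if i < max_words and word in embedding_index:
    # row i of embedding_matrix was filled from the GloVe vector of that word
    assert np.allclose(embedding_matrix[i], embedding_index[word])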

The Embedding layer needs to be frozen (i.e., its trainable attribute set to False). The reasoning is the same as for pre-trained convolutional-network features, which you are already familiar with: if one part of a model is pre-trained (the Embedding layer) while another part is randomly initialized (the classifier), the pre-trained part should not be updated during training, otherwise the information it stores would be lost.
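
As a quick sanity check, here is a minimal sketch (hypothetical, fitting on a small slice of the training data) that verifies the freeze: with trainable set to False before compile(), the embedding weights are identical before and after fitting, and model.summary() lists the 3,000,000 embedding parameters (10,000 words x 300 dimensions) as non-trainable.

# Minimal sketch, assuming embedding_matrix, x_train and y_train already exist
model = my_model(max_words, embedding_dim, maxlen)
model.layers[0].set_weights([embedding_matrix])
model.layers[0].trainable = False        # must be set before compile() to take effect
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

frozen = np.copy(model.layers[0].get_weights()[0])
model.fit(x_train[:256], y_train[:256], epochs=1, batch_size=64, verbose=0)
# The embedding weights are unchanged because the layer is frozen
assert np.array_equal(frozen, model.layers[0].get_weights()[0])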
