Text Embedding

Using word2vec for the text embedding:
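The snippets below assume a word_index mapping built beforehand with a Keras Tokenizer. A minimal sketch of that preprocessing step (the texts list is a placeholder for your own corpus, and maxlen=500 matches the input_length used further down):

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

texts = ['...']  # placeholder: the raw training documents

tokenizer = Tokenizer()
tokenizer.fit_on_texts(texts)      # build the vocabulary from the corpus
word_index = tokenizer.word_index  # dict mapping each word to a 1-based index

sequences = tokenizer.texts_to_sequences(texts)  # words -> integer ids
data = pad_sequences(sequences, maxlen=500)      # pad/truncate to length 500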

VECTOR_DIR = 'GoogleNews-vectors-negative300.bin'  # pretrained word-vector model file

import numpy as np
import gensim
from keras.layers import Embedding
EMBEDDING_DIM = 300  # dimensionality of the word-vector space
w2v_model = gensim.models.KeyedVectors.load_word2vec_format(VECTOR_DIR, binary=True)
# Row i holds the pretrained vector for the word with index i;
# words missing from the word2vec vocabulary stay all-zeros.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    if word in w2v_model:
        embedding_matrix[i] = np.asarray(w2v_model[word], dtype='float32')
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=500,
                            trainable=False)  # freeze the pretrained vectors
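A minimal sketch of how the frozen layer could feed a classifier (the pooling-plus-sigmoid architecture and the binary task are illustrative assumptions, not from the original post):

from keras.models import Sequential
from keras.layers import GlobalAveragePooling1D, Dense

model = Sequential()
model.add(embedding_layer)                 # (batch, 500) -> (batch, 500, 300)
model.add(GlobalAveragePooling1D())        # average word vectors -> (batch, 300)
model.add(Dense(1, activation='sigmoid'))  # assumed binary classification head
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])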

For comparison, the GloVe embedding:

# Load the GloVe vectors into a dict mapping word -> 300-d vector.
embeddings_index = {}
with open('glove.42B.300d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs


print('Found %s word vectors.' % len(embeddings_index))
EMBEDDING_DIM = 300
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    # Words not found in the GloVe index are left as all-zeros.
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
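A quick sanity check worth adding here (not in the original): report how many vocabulary words actually received a pretrained vector, since out-of-vocabulary rows stay all-zeros and can hurt accuracy:

covered = int(np.count_nonzero(np.any(embedding_matrix != 0, axis=1)))
print('Coverage: %d / %d words have pretrained vectors.' % (covered, len(word_index)))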

from keras.layers import Embedding


embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=500)  # trainable by default, unlike the frozen word2vec layer above

Comparing word2vec against GloVe, the final experiment showed that GloVe's accuracy was about 8% higher.
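One way to produce such a comparison is to train the same classifier twice, once per embedding layer, and evaluate both on the same held-out split; a sketch of the final step (the trained models w2v_clf / glove_clf and the test arrays are hypothetical names):

for name, clf in [('word2vec', w2v_clf), ('glove', glove_clf)]:
    loss, acc = clf.evaluate(X_test, y_test, verbose=0)  # returns [loss, accuracy]
    print('%s test accuracy: %.3f' % (name, acc))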

References:

https://eliyar.biz/using-pre-trained-gensim-word2vector-in-a-keras-model-and-visualizing/

http://wuwt.me/2017/08/21/pre-trained-embedding-keras/

