This code first splits the corpus file alice_in_wonderland.txt into sentences and then converts each sentence into an integer sequence (corpus download link).
From every sentence it extracts each run of 3 consecutive words as a tuple (left, center, right); the skip-gram model (here with a window size of 3) tries to predict left from center and right from center.
Each tuple (left, center, right) therefore yields two training pairs, i.e. [x, y] = [[x1, y1], [x2, y2]] = [[center, left], [center, right]].
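As a minimal sketch with made-up word ids (not taken from the corpus), the pair construction can be illustrated with the same nltk.trigrams helper used in the listing below:

import nltk

demo_seq = [12, 7, 45, 3]  # a toy sentence already mapped to word ids
for left, center, right in nltk.trigrams(demo_seq):
    print((center, left), (center, right))
# (7, 12) (7, 45)
# (45, 7) (45, 3)

The complete script: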
from __future__ import print_function
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer, one_hot
from sklearn.metrics.pairwise import cosine_distances
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
import nltk
import numpy as np
import operator
np.random.seed(2018)
BATCH_SIZE = 128
NUM_EPOCHS = 15
lines = []
fin = open("./data/alice_in_wonderland.txt", "r")
for line in fin:
    line = line.strip()
    if len(line) == 0:
        continue
    lines.append(line)
fin.close()
sents = nltk.sent_tokenize(" ".join(lines))  # split the corpus into sentences
tokenizer = Tokenizer(5000) # use top 5000 words only
tokenizer.fit_on_texts(sents)
vocab_size = len(tokenizer.word_counts) + 1
sequences = tokenizer.texts_to_sequences(sents)
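# e.g. tokenizer.texts_to_sequences(["down the rabbit hole"]) returns a list of
# id lists such as [[1, 43, 96, 296]]; the exact ids depend on word frequency
# in this corpus, so the numbers shown here are only illustrative.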
'''
From every sentence, extract each run of 3 consecutive words as a tuple (left, center, right).
The skip-gram model (here with a window size of 3) tries to predict left from center and right from center.
Each tuple (left, center, right) therefore yields two training pairs:
[x, y] = [[x1, y1], [x2, y2]] = [[center, left], [center, right]]
'''
xs = []
ys = []
for sequence in sequences:
    triples = list(nltk.trigrams(sequence))  # every 3 consecutive ids in this sentence form one tuple
    w_lefts = [x[0] for x in triples]
    w_centers = [x[1] for x in triples]
    w_rights = [x[2] for x in triples]
    xs.extend(w_centers)
    ys.extend(w_lefts)
    xs.extend(w_centers)
    ys.extend(w_rights)
# convert the xs and ys collected above into one-hot matrices
'''
For example, with a vocabulary of size 5, one-hot encoding the two values [[2], [4]] returns the matrix
[
 [0,0,1,0,0],
 [0,0,0,0,1]
]
'''
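# A minimal sketch of that example (it assumes the older scikit-learn API in
# which OneHotEncoder still accepts n_values, the same API the script relies
# on; newer releases use the categories argument instead):
#   demo_ohe = OneHotEncoder(n_values=5)
#   demo_ohe.fit_transform(np.array([[2], [4]])).todense()
#   # -> matrix([[0., 0., 1., 0., 0.],
#   #            [0., 0., 0., 0., 1.]])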
ohe = OneHotEncoder(n_values=vocab_size)
X = ohe.fit_transform(np.array(xs).reshape(-1, 1)).todense()
Y = ohe.fit_transform(np.array(ys).reshape(-1, 1)).todense()
# hold out 30% of the pairs as the test set and keep 70% for training
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.3,
                                                random_state=42)
print(Xtrain.shape, Xtest.shape, Ytrain.shape, Ytest.shape)
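# The network below is a plain MLP formulation of skip-gram: a one-hot input of
# size vocab_size, a 300-unit hidden layer whose weight matrix will serve as the
# word embeddings, and a softmax output over the vocabulary that predicts the
# context word.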
model = Sequential()
model.add(Dense(300, input_shape=(Xtrain.shape[1],)))
model.add(Activation("relu"))
model.add(Dropout(0.5))
model.add(Dense(Ytrain.shape[1]))
model.add(Activation("softmax"))
model.compile(optimizer="Nadam", loss="categorical_crossentropy",
              metrics=["accuracy"])
history = model.fit(Xtrain, Ytrain, batch_size=BATCH_SIZE,
                    epochs=NUM_EPOCHS, verbose=1,
                    validation_data=(Xtest, Ytest))
'''
(34487, 2642) (14781, 2642) (34487, 2642) (14781, 2642)
Train on 34487 samples, validate on 14781 samples
Epoch 1/15
34487/34487 [==============================] - 21s 606us/step - loss: 6.1716 - acc: 0.0588 - val_loss: 5.8799 - val_acc: 0.0665
Epoch 2/15
34487/34487 [==============================] - 21s 601us/step - loss: 5.6982 - acc: 0.0897 - val_loss: 5.7263 - val_acc: 0.1024
Epoch 3/15
34487/34487 [==============================] - 20s 584us/step - loss: 5.4394 - acc: 0.1207 - val_loss: 5.6298 - val_acc: 0.1187
Epoch 4/15
34487/34487 [==============================] - 20s 588us/step - loss: 5.2074 - acc: 0.1415 - val_loss: 5.5756 - val_acc: 0.1223
Epoch 5/15
34487/34487 [==============================] - 20s 590us/step - loss: 4.9883 - acc: 0.1526 - val_loss: 5.5623 - val_acc: 0.1207
Epoch 6/15
34487/34487 [==============================] - 20s 591us/step - loss: 4.7904 - acc: 0.1606 - val_loss: 5.5681 - val_acc: 0.1238
Epoch 7/15
34487/34487 [==============================] - 20s 588us/step - loss: 4.6211 - acc: 0.1673 - val_loss: 5.5959 - val_acc: 0.1231
Epoch 8/15
34487/34487 [==============================] - 20s 590us/step - loss: 4.4720 - acc: 0.1675 - val_loss: 5.6278 - val_acc: 0.1217
Epoch 9/15
34487/34487 [==============================] - 20s 591us/step - loss: 4.3344 - acc: 0.1686 - val_loss: 5.6912 - val_acc: 0.1202
Epoch 10/15
34487/34487 [==============================] - 20s 587us/step - loss: 4.2347 - acc: 0.1664 - val_loss: 5.7420 - val_acc: 0.1187
Epoch 11/15
34487/34487 [==============================] - 20s 590us/step - loss: 4.1467 - acc: 0.1676 - val_loss: 5.8011 - val_acc: 0.1223
Epoch 12/15
34487/34487 [==============================] - 21s 601us/step - loss: 4.0760 - acc: 0.1665 - val_loss: 5.8599 - val_acc: 0.1216
Epoch 13/15
34487/34487 [==============================] - 21s 597us/step - loss: 4.0243 - acc: 0.1656 - val_loss: 5.9128 - val_acc: 0.1178
Epoch 14/15
34487/34487 [==============================] - 21s 607us/step - loss: 3.9790 - acc: 0.1655 - val_loss: 5.9498 - val_acc: 0.1214
Epoch 15/15
34487/34487 [==============================] - 21s 617us/step - loss: 3.9430 - acc: 0.1673 - val_loss: 6.0056 - val_acc: 0.1194
'''
# plot training and validation accuracy and loss
plt.subplot(211)
plt.title("accuracy")
plt.plot(history.history["acc"], color="r", label="train")
plt.plot(history.history["val_acc"], color="b", label="validation")
plt.legend(loc="best")
plt.subplot(212)
plt.title("loss")
plt.plot(history.history["loss"], color="r", label="train")
plt.plot(history.history["val_loss"], color="b", label="validation")
plt.legend(loc="best")
plt.tight_layout()
plt.show()
# evaluate model
score = model.evaluate(Xtest, Ytest, verbose=1)
print("Test score: {:.3f}, accuracy: {:.3f}".format(score[0], score[1]))
'''
14781/14781 [==============================] - 3s 177us/step
Test score: 6.006, accuracy: 0.119
'''
# using the word2vec model
word2idx = tokenizer.word_index
idx2word = {v:k for k, v in word2idx.items()}
# retrieve the weights of the first dense layer. These map the one-hot input
# vector of a single center word to its dense 300-dimensional representation
W, b = model.layers[0].get_weights()
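# W has shape (vocab_size, 300), so multiplying a one-hot row vector by W simply
# selects one row: row wid of W is the embedding of word id wid. The explicit
# matrix product in the loop below keeps the correspondence with the one-hot
# input obvious; indexing W[wid] would give the same 300-dimensional vector.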
# compute the embedding of every word in the vocabulary
idx2emb = {}
for word in word2idx.keys():
    wid = word2idx[word]
    vec_in = ohe.fit_transform(np.array([[wid]])).todense()  # 1 x vocab_size one-hot row
    vec_emb = np.dot(vec_in, W)                               # 1 x 300 embedding
    idx2emb[wid] = vec_emb
# for each query word, find the 10 words whose embeddings have the smallest cosine distance (i.e. highest similarity) and print them
for word in ["stupid", "wonderful", "succeeded"]:
    wid = word2idx[word]
    source_emb = idx2emb[wid]
    distances = []
    for i in range(1, vocab_size):
        if i == wid:
            continue
        target_emb = idx2emb[i]
        distances.append(
            (
                (wid, i),
                cosine_distances(source_emb, target_emb)
            )
        )
    sorted_distances = sorted(distances, key=operator.itemgetter(1))[0:10]
    predictions = [idx2word[x[0][1]] for x in sorted_distances]
    print("{:s} => {:s}".format(word, ", ".join(predictions)))
'''
stupid => holding, derision, northumbria, music, spot, waist, fighting, swallowing, pardoned, axes
wonderful => red, mile, lark, rat, tunnel, grammar, piteous, commotion, saves, poured
succeeded => grazed, sends, stool, upstairs, search, growled, harm, rustling, heels, spite
'''