Data: a biography of Churchill, used as the training corpus
Framework: Keras
import os
import numpy as np
import nltk
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import LSTM
from keras.callbacks import ModelCheckpoint
from keras.utils import np_utils
from gensim.models.word2vec import Word2Vec
# Read in the text
raw_text = ""
for file in os.listdir("../input/"):
    if file.endswith(".txt"):
        raw_text += open("../input/" + file, errors="ignore").read() + "\n\n"
#row_test=open("../input/Winston_Churchil.txt").read()
raw_text=raw_text.lower()
sentensor = nltk.data.load("tokenizers/punkt/english.pickle")  # Punkt sentence tokenizer
sents=sentensor.tokenize(raw_text)
corpus=[]
for sen in sents:
    corpus.append(nltk.word_tokenize(sen))
print(len(corpus))
print(corpus[:3])
# Output:
91007
[['\ufeffthe', 'project', 'gutenberg', 'ebook', 'of', 'great', 'expectations', ',', 'by', 'charles', 'dickens', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', '.'], ['you', 'may', 'copy', 'it', ',', 'give', 'it', 'away', 'or', 're-use', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included', 'with', 'this', 'ebook', 'or', 'online', 'at', 'www.gutenberg.org', 'title', ':', 'great', 'expectations', 'author', ':', 'charles', 'dickens', 'posting', 'date', ':', 'august', '20', ',', '2008', '[', 'ebook', '#', '1400', ']', 'release', 'date', ':', 'july', ',', '1998', 'last', 'updated', ':', 'september', '25', ',', '2016', 'language', ':', 'english', 'character', 'set', 'encoding', ':', 'utf-8', '***', 'start', 'of', 'this', 'project', 'gutenberg', 'ebook', 'great', 'expectations', '***', 'produced', 'by', 'an', 'anonymous', 'volunteer', 'great', 'expectations', '[', '1867', 'edition', ']', 'by', 'charles', 'dickens', '[', 'project', 'gutenberg', 'editor’s', 'note', ':', 'there', 'is', 'also', 'another', 'version', 'of', 'this', 'work', 'etext98/grexp10.txt', 'scanned', 'from', 'a', 'different', 'edition', ']', 'chapter', 'i', 'my', 'father’s', 'family', 'name', 'being', 'pirrip', ',', 'and', 'my', 'christian', 'name', 'philip', ',', 'my', 'infant', 'tongue', 'could', 'make', 'of', 'both', 'names', 'nothing', 'longer', 'or', 'more', 'explicit', 'than', 'pip', '.'], ['so', ',', 'i', 'called', 'myself', 'pip', ',', 'and', 'came', 'to', 'be', 'called', 'pip', '.']]
Next, train a Word2Vec model on the tokenized corpus:
w2v_model=Word2Vec(corpus,size=128,window=5,min_count=5,workers=4)
w2v_model["office"]
# Output:
array([-0.01398709, 0.15975526, 0.03589381, -0.4449192 , 0.365403 ,
0.13376504, 0.78731823, 0.01640314, -0.29723561, -0.21117583,
0.13451998, -0.65348488, 0.06038611, -0.02000343, 0.05698346,
0.68013376, 0.19010596, 0.56921762, 0.66904438, -0.08069923,
-0.30662233, 0.26082459, -0.74816126, -0.41383636, -0.56303871,
-0.10834043, -0.10635001, -0.7193433 , 0.29722607, -0.83104628,
1.11914253, -0.34119046, -0.39490014, -0.34709939, -0.00583572,
0.17824887, 0.43295503, 0.11827419, -0.28707108, -0.02838829,
0.02565269, 0.10328653, -0.19100265, -0.24102989, 0.23023468,
0.51493132, 0.34759828, 0.05510307, 0.20583512, -0.17160387,
-0.10351282, 0.19884749, -0.03935663, -0.04055062, 0.38888735,
-0.02003323, -0.16577065, -0.15858875, 0.45083243, -0.09268586,
-0.91098118, 0.16775337, 0.3432925 , 0.2103184 , -0.42439541,
0.26097715, -0.10714807, 0.2415273 , 0.2352251 , -0.21662289,
-0.13343927, 0.11787982, -0.31010333, 0.21146733, -0.11726214,
-0.65574747, 0.04007725, -0.12032496, -0.03468512, 0.11063002,
0.33530036, -0.64098376, 0.34013858, -0.08341357, -0.54826909,
0.0723564 , -0.05169795, -0.19633259, 0.08620321, 0.05993884,
-0.14693044, -0.40531522, -0.07695422, 0.2279872 , -0.12342903,
-0.1919964 , -0.09589464, 0.4433476 , 0.38304719, 1.0319351 ,
0.82628119, 0.3677327 , 0.07600326, 0.08538571, -0.44261214,
-0.10997667, -0.03823839, 0.40593523, 0.32665277, -0.67680383,
0.32504487, 0.4009226 , 0.23463745, -0.21442334, 0.42727917,
0.19593567, -0.10731711, -0.01080817, -0.14738144, 0.15710345,
-0.01099576, 0.35833639, 0.16394758, -0.10431164, -0.28202233,
0.24488974, 0.69327635, -0.29230621], dtype=float32)
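Note that size=..., the dictionary-style w2v_model["office"] lookup, and the w2v_model.vocab attribute used below belong to the old (pre-4.0) gensim API. A minimal sketch of the same steps, assuming gensim 4.x:

from gensim.models import Word2Vec

# gensim 4.x renamed size -> vector_size; vectors and vocab live on model.wv
w2v_model = Word2Vec(corpus, vector_size=128, window=5, min_count=5, workers=4)
print(w2v_model.wv["office"])       # per-word vector lookup
vocab = w2v_model.wv.key_to_index   # replaces w2v_model.vocab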
raw_input = [item for sublist in corpus for item in sublist]  # flatten the corpus into one token stream
len(raw_input)
raw_input[12]
# Output:
2115170
"ebook"
text_stream = []
vocab = w2v_model.vocab
for word in raw_input:
    if word in vocab:  # keep only words the Word2Vec model knows
        text_stream.append(word)
len(text_stream)
# Output:
2058753
The goal of text prediction here: given the preceding words, predict the word that follows.
For example, given "hello from the other", the model should produce "side". A toy sketch of this sliding window follows, before the real vector-based version.
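An illustration of the windowing on plain tokens (hypothetical example data; the real pipeline below uses 128-d word vectors instead of strings):

tokens = ["hello", "from", "the", "other", "side", "i", "must", "have"]
window = 4
for i in range(len(tokens) - window):
    given = tokens[i:i + window]   # input: `window` consecutive tokens
    target = tokens[i + window]    # label: the token that follows
    print(given, "->", target)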
seq_length = 10
x = []
y = []
for i in range(0, len(text_stream) - seq_length):
    given = text_stream[i:i + seq_length]
    predict = text_stream[i + seq_length]
    x.append(np.array([w2v_model[word] for word in given]))
    y.append(w2v_model[predict])
print(x[10])
print(y[10])
# Output:
[[-0.02218935 0.04861801 -0.03001036 ..., 0.07096259 0.16345282
-0.18007144]
[ 0.1663752 0.67981642 0.36581406 ..., 1.03355932 0.94110376
-1.02763569]
[-0.12611888 0.75773817 0.00454156 ..., 0.80544478 2.77890372
-1.00110698]
...,
[ 0.34167829 -0.28152692 -0.12020591 ..., 0.19967555 1.65415502
-1.97690392]
[-0.66742641 0.82389861 -1.22558379 ..., 0.12269551 0.30856156
0.29964617]
[-0.17075984 0.0066567 -0.3894183 ..., 0.23729582 0.41993639
-0.12582727]]
[ 0.18125793 -1.72401989 -0.13503326 -0.42429626 1.40763748 -2.16775346
2.26685596 -2.03301549 0.42729807 -0.84830129 0.56945151 0.87243706
3.01571465 -0.38155749 -0.99618471 1.1960727 1.93537641 0.81187075
-0.83017075 -3.18952608 0.48388934 -0.03766865 -1.68608069 -1.84907544
-0.95259917 0.49039507 -0.40943271 0.12804921 1.35876858 0.72395176
1.43591952 -0.41952157 0.38778016 -0.75301784 -2.5016799 -0.85931653
-1.39363682 0.42932403 1.77297652 0.41443667 -1.30974782 -0.08950856
-0.15183811 -1.59824061 -1.58920395 1.03765178 2.07559252 2.79692245
1.11855054 -0.25542653 -1.04980111 -0.86929852 -1.26279402 -1.14124119
-1.04608357 1.97869778 -2.23650813 -2.18115139 -0.26534671 0.39432198
-0.06398458 -1.02308178 1.43372631 -0.02581184 -0.96472031 -3.08931994
-0.67289352 1.06766248 -1.95796657 1.40857184 0.61604798 -0.50270212
-2.33530831 0.45953822 0.37867084 -0.56957626 -1.90680516 -0.57678169
0.50550407 -0.30320352 0.19682285 1.88185465 -1.40448165 -0.43952951
1.95433044 2.07346153 0.22390689 -0.95107335 -0.24579825 -0.21493609
0.66570002 -0.59126669 -1.4761591 0.86431485 0.36701021 0.12569368
1.65063572 2.048352 1.81440067 -1.36734581 2.41072559 1.30975604
-0.36556485 -0.89859813 1.28804696 -2.75488496 1.5667206 -1.75327337
0.60426879 1.77851915 -0.32698369 0.55594021 2.01069188 -0.52870172
-0.39022744 -1.1704396 1.28902853 -0.89315164 1.41299319 0.43392688
-2.52578211 -1.13480854 -1.05396986 -0.85470092 0.6618616 1.23047733
-0.28597715 -2.35096407]
print(len(x))
print(len(y))
print(len(x[12]))
print(len(x[12][0]))
print(len(y[12]))
# Output:
2058743
2058743
10
128
128
x = np.reshape(x, (-1, seq_length, 128))  # -1 lets numpy infer the number of samples
y = np.reshape(y, (-1, 128))
What remains to be done:
We already have a numeric (Word2Vec) representation of the input; it has to be arranged into the array format an LSTM expects: [samples, time steps, features].
Second, for the output we use the 128-dimensional word vector directly; a quick shape check follows.
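As a sanity check on the reshape (the expected sizes follow from the lengths printed above):

print(x.shape)  # (2058743, 10, 128): [samples, time steps, features]
print(y.shape)  # (2058743, 128): one 128-d target vector per sample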
model = Sequential()
model.add(LSTM(256, dropout_W=0.2, dropout_U=0.2, input_shape=(seq_length, 128)))
model.add(Dropout(0.2))
model.add(Dense(128, activation="sigmoid"))
model.compile(loss="mse", optimizer="adam")
# Train the model
model.fit(x, y, nb_epoch=50, batch_size=4096)
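The layer arguments above follow the Keras 1.x API (dropout_W/dropout_U, nb_epoch). A sketch of the assumed Keras 2.x equivalent, reusing x, y, and seq_length from above:

from keras.models import Sequential
from keras.layers import LSTM, Dropout, Dense

model = Sequential()
# Keras 2 renamed dropout_W -> dropout and dropout_U -> recurrent_dropout
model.add(LSTM(256, dropout=0.2, recurrent_dropout=0.2, input_shape=(seq_length, 128)))
model.add(Dropout(0.2))
model.add(Dense(128, activation="sigmoid"))
model.compile(loss="mse", optimizer="adam")
model.fit(x, y, epochs=50, batch_size=4096)  # nb_epoch was renamed to epochs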
# Prediction
def predict_next(input_array):
    # Reshape the input vectors into [samples, time steps, features] and predict
    x = np.reshape(input_array, (-1, seq_length, 128))
    y = model.predict(x)
    return y

def string_to_index(raw_input):
    # Map the last seq_length tokens of the input string to their word vectors
    raw_input = raw_input.lower()
    input_stream = nltk.word_tokenize(raw_input)
    res = []
    for word in input_stream[(len(input_stream) - seq_length):]:
        res.append(w2v_model[word])
    return res

def y_to_word(y):
    # Find the vocabulary word whose vector is closest to the predicted vector
    word = w2v_model.most_similar(positive=y, topn=1)
    return word

def generate_article(init, rounds=30):
    # Repeatedly predict the next word and append it to the running string
    in_string = init.lower()
    for i in range(rounds):
        n = y_to_word(predict_next(string_to_index(in_string)))
        in_string += " " + n[0][0]
    return in_string
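Passing raw vectors to most_similar works in the old gensim API used here; under gensim 4.x (matching the sketch after the Word2Vec output above), the equivalent vector lookup would be:

# gensim 4.x: nearest vocabulary word to a 128-d vector (here the 13th training target)
print(w2v_model.wv.similar_by_vector(y[12], topn=1))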
init='Language Models allow us to measure how likely a sentence is, which is an important for Machine'
article = generate_article(init)
print(article)
# Output:
language models allow us to measure how likely a sentence is, which is an important for machine engagement . to-day good-for-nothing fit job job job job job . i feel thing job job job ; thing really done certainly job job ; but i need not say