from keras.preprocessing import text
#facts, accu_label, article_label, imprison_label=load_data()
somestr = ['ha ha gua angry','howa ha gua excited naive']
tok = text.Tokenizer()  # initialize the tokenizer
tok.fit_on_texts(somestr)  # learn the vocabulary from the texts
word_index = tok.word_index  # dict mapping each word to its integer index
print(word_index)
sequences = tok.texts_to_sequences(somestr)  # use that dict to turn every word of each string into its integer index
print(sequences)
{'naive': 6, 'ha': 1, 'excited': 5, 'angry': 3, 'gua': 2, 'howa': 4}
[[1, 1, 2, 3], [4, 1, 2, 5, 6]]
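As a minimal sketch (reusing the same somestr corpus), Tokenizer(num_words=...) keeps only the most frequent words, and texts_to_matrix turns each string into a fixed-width bag-of-words row instead of a variable-length index sequence:
tok2 = text.Tokenizer(num_words=5)  # keep only the (num_words - 1) most frequent words
tok2.fit_on_texts(somestr)
print(tok2.texts_to_matrix(somestr, mode='binary'))  # one row per string, one column per word index
print(tok2.texts_to_matrix(somestr, mode='count'))   # same layout, but with word counts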
maxlen = 10
from keras.preprocessing import sequence
x = sequence.pad_sequences(sequences, maxlen, dtype='int16')  # pad/truncate every sequence to the same fixed length
print(x)
[[0 0 0 0 0 0 1 1 2 3]
[0 0 0 0 0 4 1 2 5 6]]
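pad_sequences pads (and truncates) at the front by default; a quick sketch of the padding/truncating options, using the same sequences:
x_post = sequence.pad_sequences(sequences, maxlen=maxlen, padding='post', truncating='post', value=0)
print(x_post)  # zeros now appear after the word indices instead of before them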
import numpy as np
lenofdata = len(x)
x_train = x[np.arange(len(x))][:int(lenofdata * 0.8)]  # take the first 80% of rows as the training set
print(x_train)
[[0 0 0 0 0 0 1 1 2 3]]
np.vstack((x, x_train))  # stack the two arrays row-wise
array([[0, 0, 0, 0, 0, 0, 1, 1, 2, 3],
[0, 0, 0, 0, 0, 4, 1, 2, 5, 6],
[0, 0, 0, 0, 0, 0, 1, 1, 2, 3]], dtype=int16)
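The slice above simply keeps the first 80% of the rows in their original order; a sketch of a shuffled split (the label array y is a hypothetical placeholder):
np.random.seed(42)
perm = np.random.permutation(len(x))  # random row order
split = int(len(x) * 0.8)
x_train, x_test = x[perm[:split]], x[perm[split:]]
# y_train, y_test = y[perm[:split]], y[perm[split:]]  # apply the same permutation to the labels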
from gensim.test.utils import common_texts, get_tmpfile
from gensim.models import Word2Vec
path = get_tmpfile("word2vec.model")
model = Word2Vec(common_texts, size=100, window=5, min_count=1, workers=4)
model.save("word2vec.model")
sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
model = Word2Vec(sentences,size=2,min_count=1)
# size: dimensionality of the word vectors
# min_count: ignore all words whose total frequency is below this value
What a word vector means: essentially it is a statistic of word co-occurrence information.
Training amounts to learning the similarity between the current word and the word it predicts (the dot product of two word vectors is an unnormalized cosine similarity).
f(x) -> y: in NLP, take x to be a word in a sentence and y to be that word's context words. Then f is the "language model" that comes up so often in NLP; its job is to judge whether the sample (x, y) follows the rules of natural language, or, put more plainly, whether word x and word y placed together sound like something a person would actually say.
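To make "the dot product is an unnormalized cosine similarity" concrete, a small sketch using the toy model trained above:
import numpy as np
v_cat, v_dog = model.wv['cat'], model.wv['dog']
dot = np.dot(v_cat, v_dog)  # unnormalized similarity
cos = dot / (np.linalg.norm(v_cat) * np.linalg.norm(v_dog))  # normalize by the vector lengths
print(dot, cos)  # cos matches model.wv.similarity('cat', 'dog')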
model.wv.similarity("cat", "dog")
0.78672653
# [(k, model.wv[k]) for k in model.wv.vocab]  # as a list of (word, vector) pairs
{k: model.wv[k] for k in model.wv.vocab}  # dict mapping each word in the vocabulary to its vector
{'cat': array([-0.11496235, -0.07849815], dtype=float32),
 'dog': array([-0.21529776,  0.0142754 ], dtype=float32),
 'meow': array([ 0.15002252,  0.07607987], dtype=float32),
 'say': array([-0.24198197,  0.16584021], dtype=float32),
 'woof': array([-0.06102178,  0.1575166 ], dtype=float32)}
model.save('word2vec')
# Second way to train: start from an empty model, then build the vocabulary and train explicitly
new_model = Word2Vec(min_count=1)  # start with an empty model
new_model.build_vocab(sentences)  # can be a non-repeatable, 1-pass generator
new_model.train(sentences, total_examples=new_model.corpus_count, epochs=new_model.epochs)
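If the corpus does not fit in memory, gensim only needs an iterable that can be iterated more than once; a sketch of a streaming corpus (the file corpus.txt, one whitespace-tokenized sentence per line, is a hypothetical example):
class MyCorpus(object):
    """Yield one tokenized sentence per line; re-iterable, so build_vocab and train can each pass over it."""
    def __init__(self, path):
        self.path = path
    def __iter__(self):
        with open(self.path) as f:
            for line in f:
                yield line.split()
# stream_model = Word2Vec(MyCorpus('corpus.txt'), size=100, min_count=5)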
import numpy as np
embedding_weights = {k: model.wv[k] for k in model.wv.vocab}
# Each sentence must be a list of word tokens present in the Word2Vec vocabulary (e.g. the
# `sentences` list above); the padded integer sequences from the Keras tokenizer would raise a KeyError here.
x_train = np.stack([np.stack([embedding_weights[word] for word in sentence]) for sentence in sentences])
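The word2vec vectors can also be packed into a single matrix indexed by the Keras tokenizer, which is the shape an Embedding layer expects; a sketch assuming tok and model were trained on the same corpus (words missing from the word2vec vocabulary keep all-zero rows):
embedding_dim = model.wv.vector_size
embedding_matrix = np.zeros((len(tok.word_index) + 1, embedding_dim))  # row 0 is reserved for padding
for word, idx in tok.word_index.items():
    if word in model.wv:  # leave unseen words as all-zero rows
        embedding_matrix[idx] = model.wv[word]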
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.embeddings import Embedding
# define documents
docs = ['Well done!',
'Good work',
'Great effort',
'nice work',
'Excellent!',
'Weak',
'Poor effort!',
'not good',
'poor work',
'Could have done better.']
# define class labels
labels = [1,1,1,1,1,0,0,0,0,0]
# integer encode the documents
vocab_size = 50
encoded_docs = [one_hot(d, vocab_size) for d in docs]
print(encoded_docs)
# pad documents to a max length of 4 words
max_length = 4
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)
# define the model
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])
# summarize the model
print(model.summary())
# fit the model
model.fit(padded_docs, labels, epochs=50, verbose=0)
# evaluate the model
loss, accuracy = model.evaluate(padded_docs, labels, verbose=0)
print('Accuracy: %f' % (accuracy*100))
[[29, 21], [16, 22], [27, 47], [46, 22], [9], [24], [25, 47], [11, 16], [25, 22], [48, 22, 21, 35]]
[[29 21 0 0]
[16 22 0 0]
[27 47 0 0]
[46 22 0 0]
[ 9 0 0 0]
[24 0 0 0]
[25 47 0 0]
[11 16 0 0]
[25 22 0 0]
[48 22 21 35]]
_________________________________________________________________
Layer (type)                 Output Shape              Param #
=================================================================
embedding_1 (Embedding)      (None, 4, 8)              400
_________________________________________________________________
flatten_1 (Flatten)          (None, 32)                0
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33
=================================================================
Total params: 433
Trainable params: 433
Non-trainable params: 0
_________________________________________________________________
None
Accuracy: 100.000000
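The same small classifier can start from pretrained vectors instead of learning the embedding from scratch; a sketch assuming an embedding_matrix built as in the word2vec section above, with one row per word index:
model = Sequential()
model.add(Embedding(embedding_matrix.shape[0], embedding_matrix.shape[1],
                    weights=[embedding_matrix], input_length=max_length,
                    trainable=False))  # freeze the pretrained vectors
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['acc'])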