import os
from gensim.models import word2vec, Word2Vec
# Toy training corpus: each inner list holds the tokens of one sentence that
# has already been segmented into words.
sentences = [
    ['word1', 'word2', 'word3'] * 3,           # tokens of sentence 1
    ['word4', 'word5', 'word6', 'word7'] * 3,  # tokens of sentence 2
    ['word8', 'word9'] * 5,                    # tokens of sentence 3
]
# Second corpus used to continue training. It uses the same nine words as
# `sentences`, but 'word5' and 'word1' are swapped into a few positions of the
# first two sentences.
sentences2 = [
    ['word1', 'word5', 'word3'] + ['word1', 'word2', 'word3'] * 2,
    ['word4', 'word1', 'word6', 'word7'] * 2 + ['word4', 'word5', 'word6', 'word7'],
    ['word8', 'word9'] * 5,
]
# Directory that holds the saved model. `path` was previously referenced
# without ever being defined (NameError at import); default to the directory
# containing this script so the location does not depend on the current
# working directory.
path = os.path.dirname(os.path.abspath(__file__))
# Full path of the file the trained Word2Vec model is saved to / loaded from.
model_path = os.path.join(path, 'modeldata.model')
def model2vec_train():
    """Train a Word2Vec model on ``sentences``, refine it on ``sentences2``, save it.

    The model is first fitted on the module-level ``sentences`` corpus, then
    training is continued on ``sentences2`` to update the weights, and the
    result is written to ``model_path``.

    Returns:
        None. The trained model is persisted to ``model_path``.
    """
    # size: dimensionality of each word vector.
    # min_count: words whose total count over the corpus is below this value
    #            are ignored.
    # iter: number of training iterations over the corpus.
    # NOTE(review): `size` / `iter` are the gensim 3.x keyword names; gensim 4
    # renamed them to `vector_size` / `epochs` -- confirm the pinned version.
    model = Word2Vec(sentences, size=10, window=3, min_count=1, workers=4, iter=1000)

    # Continue training to update the weights. `total_examples` must describe
    # the corpus actually passed to train(); `model.corpus_count` is the size
    # of the corpus the vocabulary was built from and only matched here by
    # coincidence (both corpora contain 3 sentences).
    # NOTE(review): train() is called without build_vocab(update=True), so this
    # presumably relies on sentences2 containing no new words -- confirm.
    model.train(sentences2, total_examples=len(sentences2), epochs=100)

    # Persist the model so model2vec_predict() can load it later.
    model.save(model_path)
def model2vec_predict():
    """Load the saved model and print the 3 most similar words for word1..word9.

    For every query word, one line per neighbour is printed in the form
    ``<word>:(<neighbour>, <similarity>)``, followed by a separator line of
    100 ``=`` characters.

    Returns:
        None. Results are written to standard output.
    """
    # Load the model previously written by model2vec_train().
    model = Word2Vec.load(model_path)
    words = ['word{}'.format(item) for item in range(1, 10)]
    for word in words:
        # Find the 3 words most similar to the given word. The query goes
        # through `model.wv` (the KeyedVectors): calling most_similar() on the
        # model object itself is deprecated in gensim 3.x and removed in 4.0.
        for word_similar in model.wv.most_similar(word, topn=3):
            print('{}:{}'.format(word, word_similar))
        print('=' * 100)
if __name__ == '__main__':
    # Run the pipeline in order: train and save the model first, then reload
    # it and print the nearest neighbours.
    for stage in (model2vec_train, model2vec_predict):
        stage()
Printed output (for each word, its 3 most similar words with their similarity scores):
word1:('word3', 0.995266854763031)
word1:('word2', 0.9892698526382446)
word1:('word5', 0.8728257417678833)
====================================================================================================
word2:('word1', 0.9892698526382446)
word2:('word3', 0.9870314002037048)
word2:('word5', 0.8871303200721741)
====================================================================================================
word3:('word1', 0.995266854763031)
word3:('word2', 0.9870314002037048)
word3:('word5', 0.8389171957969666)
====================================================================================================
word4:('word6', 0.9964052438735962)
word4:('word7', 0.9962074756622314)
word4:('word5', 0.9893450140953064)
====================================================================================================
word5:('word7', 0.9946494698524475)
word5:('word6', 0.9897040128707886)
word5:('word4', 0.9893450140953064)
====================================================================================================
word6:('word7', 0.9971071481704712)
word6:('word4', 0.9964052438735962)
word6:('word5', 0.9897040128707886)
====================================================================================================
word7:('word6', 0.9971071481704712)
word7:('word4', 0.996207594871521)
word7:('word5', 0.9946494698524475)
====================================================================================================
word8:('word9', 0.9920145273208618)
word8:('word4', 0.44114065170288086)
word8:('word7', 0.421232134103775)
====================================================================================================
word9:('word8', 0.9920144081115723)
word9:('word4', 0.4353828728199005)
word9:('word7', 0.4197976887226105)
====================================================================================================
# The `model` objects created inside the functions above are locals, so load
# the saved model here before inspecting it.
model = Word2Vec.load(model_path)

# To look at the word2vec vector learned for a word, index the trained model's
# KeyedVectors (`model.wv`) with that word. Indexing the model object directly
# (model['word2']) is deprecated in gensim 3.x and no longer works in gensim 4.
model.wv['word2']
# [-0.20273142 -0.08315199 0.63419747 0.08442459 -0.03156763 -0.0239058 0.74490339 0.13211431 -0.45531601 -0.6855768]

# Similarity between two words.
model.wv.similarity('word1', 'word2')
# 0.989269875838
model.wv.similarity('word1', 'word8')
# 0.329440097187

# The 3 nearest words that are positively related to word1 and word2 and
# negatively related to word8.
model.wv.most_similar(positive=['word1', 'word2'], negative=['word8'], topn=3)
# [('word3', 0.8694241046905518), ('word5', 0.7095154523849487), ('word7', 0.6584762334823608)]

# Recover words from a word vector; the vector is a numpy.ndarray.
import numpy as np
vec = np.array([-0.20273142, -0.08315199, 0.63419747, 0.08442459, -0.03156763, -0.0239058, 0.74490339, 0.13211431, -0.45531601, -0.6855768])
model.wv.similar_by_vector(vector=vec, topn=3)
# [('word2', 1.0), ('word1', 0.9892698526382446), ('word3', 0.9870314002037048)]