Gensim调用Word2Vec

使用 Python 自然语言处理包 Gensim 调用 Word2Vec，将词语转换为词向量

import gensim
import logging
import os

# Data loading
# For a small sample, an in-memory list of tokenized sentences is enough.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s:%(levelname)s:%(message)s",
)
sentences = [
    ["first", "sentence"],
    ["second", "sentence"],
]
# min_count=1 keeps every word, even those that occur only once.
model = gensim.models.Word2Vec(sentences, min_count=1)

# In practice, stream the corpus with an iterator instead of holding it in memory
class MySenctence(object):
    """Stream tokenized sentences from every regular file in a directory.

    Each line of each file is yielded as a list of whitespace-separated
    tokens. The object can be iterated more than once; every pass re-reads
    the files from disk.

    Args:
        dirname: Directory whose files make up the corpus.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            uses the platform default encoding.
    """

    def __init__(self, dirname, encoding=None):
        self.dirname = dirname
        self.encoding = encoding

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            path = os.path.join(self.dirname, fname)
            # os.listdir also returns sub-directories, which cannot be opened
            # as text files; skip anything that is not a regular file.
            if not os.path.isfile(path):
                continue
            # The context manager closes each file as soon as it is exhausted
            # (or the generator is discarded) instead of leaking the handle.
            with open(path, "r", encoding=self.encoding) as corpus_file:
                for line in corpus_file:
                    # Tokenize: split on whitespace (suits space-delimited text)
                    yield line.split()


# Training
# Hyperparameters used below:
#   vector size (100) - dimensionality of the word vectors
#   window            - context window: how many words before/after the target
#   min_count         - frequency threshold: minimum occurrences for a word to be kept
#   workers           - number of worker threads used for parallel training
#
# gensim 4.0 renamed the `size` keyword to `vector_size`; choose whichever the
# installed release accepts so this call works on both old and new versions.
_dim_kwarg = "vector_size" if int(gensim.__version__.split(".")[0]) >= 4 else "size"
model = gensim.models.Word2Vec(
    sentences, window=5, min_count=1, workers=4, **{_dim_kwarg: 100}
)

# Persist the model
model.save("file_path/to/model")


# Using the model
# All queries go through `model.wv` (the KeyedVectors): the same methods on
# the Word2Vec object itself were deprecated and then removed in gensim 4.
# NOTE: every queried word must be in the trained vocabulary, otherwise
# gensim raises KeyError; the toy `sentences` above do not contain them.

# Most similar word for the analogy king - man + woman
model.wv.most_similar(positive=["woman", "king"], negative=["man"], topn=1)
# The word that does not belong with the others
model.wv.doesnt_match("breakfast cereal dinner lunch".split())
# Similarity between two words
model.wv.similarity("woman", "man")


# Look up the vector of a word
print(model.wv["man"])

# Similarity between two sets of words
list1 = ['我','走','我','学校']
list2 = ['我','去','家']
list_sim = model.wv.n_similarity(list1, list2)
print(list_sim)

你可能感兴趣的:(Gensim调用Word2Vec)