gensim和word2vec两种方式导入词向量

# 法一:
import word2vec # 不是gensim
# Text-format embedding file; its expected layout is described in the note below.
dataPatn = "vector.txt"
# Layout note (kept verbatim): the first line holds the vocabulary size and
# the vector dimension, e.g. "20000 128"; every following line is a word
# followed by its vector components.
'''
第一行为词表大小和维度
如:
20000 128
我 0.001 -0.002 0.004
你 0.125 -0.215 0.112
...
共20000个词,维度为128
'''
# Load the vectors with the standalone `word2vec` package (not gensim).
model = word2vec.load(dataPatn)
word = "他"
# Element 0 of the value returned by `similar` holds the vocabulary indices
# of the 10 nearest neighbours of `word`.
indices = model.similar(word, n=10)[0]
# Translate each index back into its word via the model vocabulary.
words = list(map(model.vocab.__getitem__, indices))
print(words)  # ordered by similarity, most similar first

参考文档(gensim KeyedVectors,对应下面的法二):https://radimrehurek.com/gensim/models/keyedvectors.html

# 法二:
import gensim
from gensim.models import KeyedVectors
# Text-format embedding file that starts with a header line such as "20000 128"
# (vocabulary size, then vector dimension).
dataPath = "vector.txt"
# binary=False is essential here because the file is plain text rather than
# the binary word2vec format. unicode_errors="ignore" skips bytes that cannot
# be decoded instead of aborting the load.
# Fixes: the path variable was referenced as `datapath` (NameError) and the
# error handler was misspelled as "jgnore".
model = KeyedVectors.load_word2vec_format(
    dataPath, binary=False, unicode_errors="ignore"
)
word = "他"
# Look up the 10 entries most similar to `word` and print the result.
res = model.most_similar(word, topn=10)
print(res)

小例子:利用腾讯词向量,返回相似词语

from tqdm import tqdm
import word2vec

if __name__ == '__main__':
    # Interactive similar-word lookup: load the vectors once, then answer
    # queries typed on stdin until the user enters 'q!' (or stdin is closed).
    # The file name suggests a preprocessed Tencent embedding file.
    # Alternative input file: 'tencent_unigram.txt'
    dataPath = 'tencent_pre_processed_with_200.txt'
    model = word2vec.load(dataPath)
    while True:
        print('请输入查询词:')
        try:
            word = input()
        except EOFError:
            # stdin was closed (e.g. piped input ran out): stop cleanly
            # instead of crashing with a traceback.
            break
        if word == 'q!':
            break
        try:
            # `similar` returns a pair; only the vocabulary indices are needed.
            indices, _ = model.similar(word, n=20)
        except KeyError:
            # The query word is not in the model vocabulary; report it and
            # keep the session alive rather than aborting the whole loop.
            print('词表中没有该词:', word)
            continue
        # Index the vocabulary with the whole index array to get the words.
        words = model.vocab[indices]
        print(words)
    

你可能感兴趣的:(python编程)