Reference: https://blog.csdn.net/qq_35273499/article/details/79098689
1. Build the model
a. First preprocess the text; the main step is word segmentation
import jieba
import jieba.analyse
# Make sure jieba keeps the novel's character and place names below as single tokens
jieba.suggest_freq('沙瑞金', True)
jieba.suggest_freq('田国富', True)
jieba.suggest_freq('高育良', True)
jieba.suggest_freq('侯亮平', True)
jieba.suggest_freq('钟小艾', True)
jieba.suggest_freq('陈岩石', True)
jieba.suggest_freq('欧阳菁', True)
jieba.suggest_freq('易学习', True)
jieba.suggest_freq('王大路', True)
jieba.suggest_freq('蔡成功', True)
jieba.suggest_freq('孙连城', True)
jieba.suggest_freq('季昌明', True)
jieba.suggest_freq('丁义珍', True)
jieba.suggest_freq('郑西坡', True)
jieba.suggest_freq('赵东来', True)
jieba.suggest_freq('高小琴', True)
jieba.suggest_freq('赵瑞龙', True)
jieba.suggest_freq('林华华', True)
jieba.suggest_freq('陆亦可', True)
jieba.suggest_freq('刘新建', True)
jieba.suggest_freq('刘庆祝', True)
jieba.suggest_freq('京州市', True)
jieba.suggest_freq('副市长', True)
with open(r'C:\Users\Lavector\Desktop\nlp词云\in_the_name_of_people.txt', 'rb') as f:
    document = f.read()
document_cut = jieba.cut(document, cut_all=False)
# print('/'.join(document_cut))
result = ' '.join(document_cut)
result = result.encode('utf-8')  # the file was read in binary mode, so encode before writing it back in binary
with open(r'C:\Users\Lavector\Desktop\nlp词云\in_the_name_of_people_segment.txt', 'wb+') as f1:
    f1.write(result)
# The with statements close both files automatically, so no explicit close() calls are needed.
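As a side note (not part of the original post), the long list of suggest_freq calls can be replaced by a single user dictionary; the sketch below assumes a hypothetical file user_dict.txt with one name per line.
import jieba
# user_dict.txt is a hypothetical plain-text file, one entry per line, e.g. 沙瑞金
jieba.load_userdict(r'C:\Users\Lavector\Desktop\nlp词云\user_dict.txt')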
b. Build the word2vec model
from gensim.models import word2vec
## Train the word2vec model
# Load the segmented text with the Text8Corpus class
sentences = word2vec.Text8Corpus(r'C:\Users\Lavector\Desktop\nlp词云\in_the_name_of_people_segment.txt')
# Train the model; a few of the parameters are set explicitly below
model = word2vec.Word2Vec(sentences, size=100, hs=1, min_count=1, window=3)
# Save the model so it can be reused later
model.save(u'人民的名义.model')
# Print the first five 3-character words (mostly person names) among the 100 words most similar to 李达康
req_count = 5
for key in model.similar_by_word(u'李达康', topn=100):
    if len(key[0]) == 3:
        req_count -= 1
        print(key[0], key[1])
        if req_count == 0:
            break
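Beyond similar_by_word, the trained model supports a few other common queries. The sketch below is illustrative (not from the original post) and assumes the chosen words occur in the corpus vocabulary.
# Cosine similarity between two words
print(model.wv.similarity(u'沙瑞金', u'高育良'))
# Pick the word that does not belong with the others
print(model.wv.doesnt_match([u'沙瑞金', u'高育良', u'京州市']))
# The five words most similar to a given word
for word, score in model.wv.most_similar(u'侯亮平', topn=5):
    print(word, score)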
2. Load and use the model
import gensim
# Load a model stored in the original word2vec binary format
model = gensim.models.KeyedVectors.load_word2vec_format(r'C:\Users\Lavector\Desktop\nlp词云\vectors.bin', binary=True, unicode_errors="ignore")
# Return the vector for a single word
print(model['股市'])
# Return the words most related to a given word, together with their similarity scores
items = model.most_similar('滋润', topn=30)
for item in items:
    # word, similarity score
    print(item[0], item[1])
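Note that step 1 saved the model as 人民的名义.model with model.save(), whereas this step loads a file in the word2vec binary format. One way to produce such a vectors.bin from the model trained above is to export its vectors, as in the sketch below (an assumption, not shown in the original post).
from gensim.models import word2vec
model = word2vec.Word2Vec.load(u'人民的名义.model')  # reload the gensim model saved in step 1
model.wv.save_word2vec_format(r'C:\Users\Lavector\Desktop\nlp词云\vectors.bin', binary=True)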
import gensim
# Reload the model for the 2-D visualization script below
model = gensim.models.KeyedVectors.load_word2vec_format(r'C:\Users\Lavector\Desktop\nlp词云\vectors.bin', binary=True, unicode_errors="ignore")
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings(action='ignore',category=UserWarning,module='gensim')
warnings.filterwarnings(action='ignore',category=FutureWarning,module='gensim')
# Collect all word vectors into one matrix and remember each word's row index
rawwordvec = []
word2ind = {}
for i, w in enumerate(model.vocab):
    rawwordvec.append(model[w])
    word2ind[w] = i
rawwordvec = np.array(rawwordvec)
# Project the vectors to 2 dimensions with PCA and draw the whole vocabulary as a faint background cloud
x_reduced = PCA(n_components=2).fit_transform(rawwordvec)
fig = plt.figure(figsize=(30, 15))
ax = fig.gca()
ax.set_facecolor('black')
ax.plot(x_reduced[:, 0], x_reduced[:, 1], '.', markersize=1, alpha=0.1, color='white')
ax.set_xlim([-12, 12])
ax.set_ylim([-10, 20])
# For each query word, collect its nearest neighbours (plus the word itself) to highlight
words = {"张艺兴", "物理"}
all_words = []
for l in words:
    lst = model.most_similar(l)
    wds = [ll[0] for ll in lst]
    metrics = [mm[1] for mm in lst]
    wds = np.append(wds, l)
    all_words.append(wds)
# SimSun font so the Chinese labels render correctly
font1 = matplotlib.font_manager.FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=15)
colors = ['red', 'green']
# Plot and label each highlighted word in its group's color
for num, wds in enumerate(all_words):
    for s in wds:
        if s in word2ind:
            ind = word2ind[s]
            xy = x_reduced[ind]
            plt.plot(xy[0], xy[1], '.', alpha=1, color=colors[num])  # a marker rather than '-', which draws nothing for a single point
            plt.text(xy[0], xy[1], s, fontproperties=font1, alpha=1, color=colors[num])
plt.show()
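One caveat (not from the original post): the code above targets the older gensim 3.x API. With gensim 4.x a few names change, roughly as sketched below.
# gensim 4.x equivalents (sketch)
# model = word2vec.Word2Vec(sentences, vector_size=100, hs=1, min_count=1, window=3)  # size -> vector_size
# for w in model.key_to_index:  # KeyedVectors.vocab was replaced by key_to_index
#     vec = model[w]            # indexing a KeyedVectors object still works
# model.wv.most_similar(...)    # full Word2Vec models expose queries through model.wv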