Chinese NLP: training a word2vec model, loading and using it, and plotting with dimensionality reduction

Reference: https://blog.csdn.net/qq_35273499/article/details/79098689
1. Generating the model
a. First preprocess the text; the main task is word segmentation

import jieba

# Register the novel's character names so jieba keeps each one as a single token
jieba.suggest_freq('沙瑞金', True)
jieba.suggest_freq('田国富', True)
jieba.suggest_freq('高育良', True)
jieba.suggest_freq('侯亮平', True)
jieba.suggest_freq('钟小艾', True)
jieba.suggest_freq('陈岩石', True)
jieba.suggest_freq('欧阳菁', True)
jieba.suggest_freq('易学习', True)
jieba.suggest_freq('王大路', True)
jieba.suggest_freq('蔡成功', True)
jieba.suggest_freq('孙连城', True)
jieba.suggest_freq('季昌明', True)
jieba.suggest_freq('丁义珍', True)
jieba.suggest_freq('郑西坡', True)
jieba.suggest_freq('赵东来', True)
jieba.suggest_freq('高小琴', True)
jieba.suggest_freq('赵瑞龙', True)
jieba.suggest_freq('林华华', True)
jieba.suggest_freq('陆亦可', True)
jieba.suggest_freq('刘新建', True)
jieba.suggest_freq('刘庆祝', True)
jieba.suggest_freq('京州市', True)
jieba.suggest_freq('副市长', True)

with open(r'C:\Users\Lavector\Desktop\nlp词云\in_the_name_of_people.txt',
          'rb') as f:
    document = f.read()
    document_cut = jieba.cut(document, cut_all=False)
    # print('/'.join(document_cut))
    result = ' '.join(document_cut)
    result = result.encode('utf-8')

    with open(r'C:\Users\Lavector\Desktop\nlp词云\in_the_name_of_people_segment.txt', 'wb+') as f1:
        f1.write(result)  # the write mode must match the read mode (both binary here)
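The same step can also be done in text mode with an explicit encoding, which avoids the manual encode()/decode round-trip; a minimal sketch, assuming the novel's file is UTF-8 encoded:

import jieba

# Text-mode variant: read and write as str with an explicit encoding
# (assumes in_the_name_of_people.txt is UTF-8)
with open(r'C:\Users\Lavector\Desktop\nlp词云\in_the_name_of_people.txt',
          encoding='utf-8') as f:
    document = f.read()

result = ' '.join(jieba.cut(document, cut_all=False))

with open(r'C:\Users\Lavector\Desktop\nlp词云\in_the_name_of_people_segment.txt',
          'w', encoding='utf-8') as f1:
    f1.write(result)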

b. Generate the model

from gensim.models import word2vec


# Train the word2vec model

# Load the segmented text with the Text8Corpus class
sentences = word2vec.Text8Corpus(r'C:\Users\Lavector\Desktop\nlp词云\in_the_name_of_people_segment.txt')

# Train the model; a few of the key parameters are shown
# (note: in gensim >= 4.0, `size` was renamed to `vector_size`)
model = word2vec.Word2Vec(sentences, size=100, hs=1, min_count=1, window=3)


# Save the model so it can be reused later
model.save(u'人民的名义.model')


# Print up to 5 of the words most similar to 李达康 whose names are 3 characters long
req_count = 5
for key in model.similar_by_word(u'李达康', topn=100):
    if len(key[0]) == 3:
        req_count -= 1
        print(key[0], key[1])
        if req_count == 0:
            break
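Note that a model saved with model.save() is reloaded with Word2Vec.load(); load_word2vec_format() (used in section 2 below) instead expects vectors in the original C word2vec format. A minimal sketch of both, assuming the gensim 3.x API; the export step is one way a file like vectors.bin could be produced:

from gensim.models import word2vec

# Reload the model that was saved with model.save()
model = word2vec.Word2Vec.load(u'人民的名义.model')

# Export only the word vectors in C binary word2vec format,
# the format expected by load_word2vec_format(..., binary=True)
model.wv.save_word2vec_format('vectors.bin', binary=True)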

2. Loading and using a model

import gensim
# Load the model (C binary word2vec format)
model = gensim.models.KeyedVectors.load_word2vec_format(r'C:\Users\Lavector\Desktop\nlp词云\vectors.bin', binary=True, unicode_errors="ignore")

# Return the vector for a single word:
print(model['股市'])

# Return the words most similar to a given word, with their similarity scores
items = model.most_similar('滋润', topn=30)
for item in items:
    # word, similarity score
    print(item[0], item[1])
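KeyedVectors supports a few other common queries as well; a short sketch (the example words are illustrative and must exist in the loaded vocabulary):

# Cosine similarity between two words
print(model.similarity('股市', '滋润'))

# Analogy-style query: positive words attract, negative words repel
print(model.most_similar(positive=['股市'], negative=['滋润'], topn=5))

# Pick the word that least belongs with the others
print(model.doesnt_match(['股市', '滋润', '物理']))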

3. Dimensionality reduction and plotting

import gensim
# Load the model (same C binary vectors as in section 2)
model = gensim.models.KeyedVectors.load_word2vec_format(r'C:\Users\Lavector\Desktop\nlp词云\vectors.bin', binary=True, unicode_errors="ignore")

import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
warnings.filterwarnings(action='ignore', category=FutureWarning, module='gensim')


# Collect every word vector into a matrix and record each word's row index
# (note: in gensim >= 4.0, model.vocab was replaced by model.key_to_index)
rawwordvec = []
word2ind = {}
for i, w in enumerate(model.vocab):
    rawwordvec.append(model[w])
    word2ind[w] = i
rawwordvec = np.array(rawwordvec)

# Project all vectors down to 2 dimensions with PCA
x_reduced = PCA(n_components=2).fit_transform(rawwordvec)

# Plot every word as a faint white point on a black background
fig = plt.figure(figsize=(30, 15))
ax = fig.gca()
ax.set_facecolor('black')
ax.plot(x_reduced[:, 0], x_reduced[:, 1], '.', markersize=1, alpha=0.1, color='white')
ax.set_xlim([-12, 12])
ax.set_ylim([-10, 20])

# For each query word, gather its most similar words plus the word itself
words = {"张艺兴", "物理"}
all_words = []
for l in words:
    lst = model.most_similar(l)
    wds = [ll[0] for ll in lst]
    wds = np.append(wds, l)
    all_words.append(wds)


# Use the SimSun font so matplotlib can render the Chinese labels
font1 = matplotlib.font_manager.FontProperties(fname=r"c:\windows\fonts\simsun.ttc", size=15)
colors = ['red', 'green']
for num, wds in enumerate(all_words):
    for s in wds:
        if s in word2ind:
            ind = word2ind[s]
            xy = x_reduced[ind]
            # '.' marker instead of '-': a bare line style draws nothing for a single point
            plt.plot(xy[0], xy[1], '.', alpha=1, color=colors[num])
            plt.text(xy[0], xy[1], s, fontproperties=font1, alpha=1, color=colors[num])
plt.show()
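One caveat for this step: most_similar() raises a KeyError when a query word is missing from the vocabulary, so with arbitrary query words it is safer to filter first; a minimal sketch against the same model object:

words = {"张艺兴", "物理"}
# Keep only the query words that actually appear in the vocabulary
# (in gensim >= 4.0, use `w in model.key_to_index` instead of model.vocab)
in_vocab = [w for w in words if w in model.vocab]
for w in in_vocab:
    print(w, model.most_similar(w, topn=5))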
