k-means clustering based on word2vec

1. Use word2vec to convert each word in the txt file into a word vector
2. Reduce the 300-dimensional word vectors to 2 dimensions with PCA
3. Feed the 2-dimensional data into k-means clustering

text.txt: the text used for training (English works best; for Chinese text, first segment it with the jieba library, as sketched below)
word_model.txt: create this as an empty text file
data.csv: create this as an empty csv file
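A minimal sketch of that optional Chinese pre-processing step, assuming jieba is installed and the raw text is in text.txt; it simply puts spaces between the segmented words so that the later steps can split on whitespace:

# Optional (sketch): segment Chinese text with jieba so that words are space-separated
import jieba

with open('text.txt', 'r', encoding='utf-8') as f:
    raw = f.read()
segmented = ' '.join(jieba.cut(raw))     # join the segmented words with spaces
with open('text.txt', 'w', encoding='utf-8') as w:
    w.write(segmented)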

##1. Replace punctuation in the text with spaces
punctuation = [',', '?', '.', '？', '!', '*', '(', ')', '“', '”', ':', '"', '`', '\'']   ##list of punctuation marks to replace
with open('text.txt', 'r') as f:                                  ##text.txt is the training text (an English novel)
    result = f.read()
for p in punctuation:
    result = result.replace(p, ' ')
with open('text.txt', 'w') as w:
    w.write(result)



##2. Use word2vec to obtain the word vectors
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

def wordsCluster(text, vectorSize):                              ##text: local path of the input text  vectorSize: dimensionality of the word vectors
    # word2vec vectorization (gensim < 4.0; in gensim >= 4.0 the parameter is named vector_size)
    model = Word2Vec(LineSentence(text), size=vectorSize, window=5, min_count=1, workers=4)
    model.wv.save_word2vec_format('word_model.txt', binary=False)      ##save the word vectors to word_model.txt

    # all keywords in the model's vocabulary (gensim < 4.0; use model.wv.index_to_key in gensim >= 4.0)
    keys = model.wv.vocab.keys()

wordsCluster('text.txt', 300)
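For reference, a sketch of the same training call under gensim >= 4.0, where the size parameter was renamed to vector_size and the vocabulary is exposed as index_to_key:

# gensim >= 4.0 equivalent of step 2 (sketch)
model = Word2Vec(LineSentence('text.txt'), vector_size=300, window=5, min_count=1, workers=4)
model.wv.save_word2vec_format('word_model.txt', binary=False)
keys = model.wv.index_to_key     # list of every word in the vocabulary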



##3. Convert the txt file containing the word vectors into a csv file
# The first line of word_model.txt is a "vocab_size vector_size" header, not a word vector,
# so it is blanked out before the conversion.
f = open("word_model.txt", "r")
new = []
for line in f:
    new.append(line)
new[0] = '\n'
f.close()

f = open("word_model.txt", "w")
for n in new:
    f.write(n)
f.close()

import csv
with open('data.csv', 'w', newline='') as csvfile:      ##data.csv stores the word vectors
    writer = csv.writer(csvfile)
    data = open('word_model.txt')
    for each_line in data:
        a = each_line.split()          # "word v1 v2 ... v300" -> one csv row
        writer.writerow(a)
    data.close()
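The txt-to-csv conversion above is mainly to make the data easy to inspect. A shorter alternative, sketched here, is to read the vectors back with gensim's KeyedVectors; note it assumes word_model.txt still has the header line written in step 2, so it would replace this step rather than run after it:

# Sketch: load the vectors straight from the file saved in step 2 (header line intact)
from gensim.models import KeyedVectors
kv = KeyedVectors.load_word2vec_format('word_model.txt', binary=False)
words_alt = list(kv.index_to_key)     # all words (gensim >= 4.0; use kv.index2word in gensim 3.x)
vectors_alt = kv.vectors              # numpy array of shape (vocab_size, 300)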

##4. Use PCA to reduce the 300-dimensional data to 2 dimensions
import numpy as np
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt

l = []
words = []
with open('data.csv', 'r') as fd:
    line = fd.readline()        # skip the blank first row left by the removed header
    line = fd.readline()
    while line:
        line = line.strip()
        word = line.split(",")
        words.append(word[0])   # the word itself
        l.append(word[1:])      # its 300 vector components (still strings here)
        line = fd.readline()

X = np.array(l, dtype=float)    # data matrix, one 300-dimensional row per word
pca = PCA(n_components=2)       # reduce to 2 dimensions
newX = pca.fit_transform(X)     # the reduced 2-D coordinates, one row per word
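A quick check, as a sketch, of how much of the original variance the two principal components keep; the exact numbers depend on your corpus:

print(pca.explained_variance_ratio_)        # variance fraction of each of the 2 components
print(pca.explained_variance_ratio_.sum())  # total variance retained after the 300 -> 2 reduction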


##5. Build a word-to-vector dictionary, train k-means, and report the clusters
word_vectors = {}                        # maps each word to its 2-D coordinates
for i in range(len(words)):
    word_vectors[words[i]] = newX[i]
for j in range(len(words)):
    print(words[j] + ':', end='')
    print(word_vectors[words[j]])

from sklearn.cluster import KMeans
import numpy as np

X = np.array(newX)
kmeans = KMeans(n_clusters=5, random_state=0).fit(X)

print("五个中心词的坐标:")
print(kmeans.cluster_centers_)

list1=[]
list2=[]
list3=[]
list4=[]
list5=[]
for j in range(len(words)):
    if kmeans.labels_[j]==0:
        list1.append(words[j])
    elif kmeans.labels_[j]==1:
        list2.append(words[j])
    elif kmeans.labels_[j]==2:
        list3.append(words[j])
    elif kmeans.labels_[j]==3:
        list4.append(words[j])
    elif kmeans.labels_[j]==4:
        list5.append(words[j])
print("与关键词"+list1[0]+"相关的词有:",end='')
print(list1)
print("与关键词"+list2[0]+"相关的词有:",end='')
print(list2)
print("与关键词"+list3[0]+"相关的词有:",end='')
print(list3)
print("与关键词"+list4[0]+"相关的词有:",end='')
print(list4)
print("与关键词"+list5[0]+"相关的词有:",end='')
print(list5)
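The choice of n_clusters=5 above is arbitrary. One common way to pick k, sketched below with the elbow method, is to fit KMeans for several values of k and look for the bend in the inertia curve:

# Elbow-method sketch: plot KMeans inertia for a range of k values
inertias = []
for k in range(2, 11):
    km = KMeans(n_clusters=k, random_state=0).fit(X)
    inertias.append(km.inertia_)
plt.plot(range(2, 11), inertias, marker='o')
plt.xlabel('k')
plt.ylabel('inertia')
plt.show()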

##Visualize the data with a scatter plot
f1=[]
f2=[]
for i in range(len(newX)):
    f1.append(newX[i][0])
    f2.append(newX[i][1])
plt.scatter(f1, f2, c='blue', s=6)
plt.show()
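To make the clusters visible in the plot, a small variation (a sketch) colours each point by its k-means label instead of drawing everything in blue:

# Sketch: same scatter plot, coloured by cluster label
plt.scatter(f1, f2, c=kmeans.labels_, s=6, cmap='viridis')
plt.show()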



Test result: scatter plot of the 2-D word vectors (figure omitted here)
