import logging
import multiprocessing
import os
import sys
from collections import Counter
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
from pandas import np
from sklearn.metrics import silhouette_score
def wordsCluster():
    """Train a Word2Vec model on the segmented corpus and save it to disk.

    The corpus file is streamed through ``LineSentence``. After training,
    both the gensim model file and a text-format (``binary=False``) word2vec
    vector file are written.

    Returns:
        None. All results are written to the hard-coded output paths.
    """
    program = os.path.basename(sys.argv[0])
    logger = logging.getLogger(program)
    logging.basicConfig(format='%(asctime)s: %(levelname)s: %(message)s', level=logging.INFO)
    # Lazy %-style arguments: the message is only formatted if it is emitted.
    logger.info("running %s", ' '.join(sys.argv))

    inp = r"C:\Users\代码\去除停用词并分词\去除停用词并分词结果\zong_fengci_tingyongci2.txt"
    out_model = r'C:\Users\代码\word2vec\数据\corpusSegDone_zong2.model'
    # Fixed: the root used to be spelled "C:\User", which does not match the
    # "C:\Users" path that changeTxtToCsv() reads the vectors back from.
    out_vector = r'C:\Users\代码\word2vec\数据\corpusSegDone_zong2.txt'

    # NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4
    # renamed them to `vector_size` and `epochs` - confirm the installed version.
    model = Word2Vec(
        LineSentence(inp),
        size=100,
        window=3,
        min_count=5,
        workers=multiprocessing.cpu_count(),
        iter=10,
        sg=1,
    )
    model.save(out_model)
    model.wv.save_word2vec_format(out_vector, binary=False)
    print("word2vec成功!!")
def changeTxtToCsv(vector_path=r'C:\Users\代码\word2vec\数据\corpusSegDone_zong2.txt',
                   csv_path=r'C:\Users\代码\word2vec\数据\corpusSegDone_zong2.csv'):
    """Convert a text-format vector file into a CSV file.

    The first line of ``vector_path`` is treated as a header: it is replaced
    by an empty line in the vector file itself (the file is rewritten in
    place) and is left out of the CSV. Every following line is split on
    whitespace and written as one CSV row.

    Args:
        vector_path: UTF-8 text file to read and rewrite. Defaults to the
            path this function originally had hard-coded.
        csv_path: CSV file to create or overwrite. It is opened without an
            explicit encoding, so the platform default encoding is used.

    Returns:
        None. Both files are written as a side effect.
    """
    import csv

    with open(vector_path, "r", encoding='utf-8') as src:
        lines = src.readlines()

    # Blank the header line; an empty input file is left empty instead of
    # failing with an IndexError.
    if lines:
        lines[0] = '\n'

    with open(vector_path, "w", encoding='utf-8') as dst:
        dst.writelines(lines)

    # The rows are taken from the lines already in memory, so the file that
    # was just written does not have to be opened a third time.
    with open(csv_path, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        for each_line in lines[1:]:
            writer.writerow(each_line.split())
    print("转换为csv文件成功!")
def jiangwei():
    """Reduce the word vectors to two dimensions with PCA and select the nouns.

    Reads the vector CSV (first column is the word, the remaining columns are
    the vector components), projects all vectors onto two principal
    components, then looks up the projected point of every word listed in the
    noun file.

    Returns:
        list: ``[mingci_jiangwei_list, mingci_list]`` where ``mingci_list``
        holds the lines of the noun file (newline removed) and
        ``mingci_jiangwei_list`` holds the matching 2-D points in the same
        order.

    Raises:
        KeyError: if a line of the noun file has no row in the vector CSV.
    """
    import csv

    import numpy as np
    from sklearn.decomposition import PCA

    words = []
    vectors = []
    # csv.reader (with newline='') undoes the quoting csv.writer applies, so a
    # word that itself contains a comma or a quote is parsed back correctly.
    with open(r'C:\Users\代码\word2vec\数据\corpusSegDone_zong2.csv',
              'r', newline='') as fd:
        for row in csv.reader(fd):
            # Skip blank rows; they would otherwise produce a ragged matrix.
            if not row:
                continue
            words.append(row[0])
            vectors.append(row[1:])

    # Convert explicitly: the CSV fields are strings and PCA needs numbers.
    X = np.array(vectors, dtype=float)
    # fit_transform already fits the model, so no separate fit() is needed.
    newX = PCA(n_components=2).fit_transform(X)
    word_to_point = dict(zip(words, newX))

    with open(r'C:\Users\代码\词性标注\名词提取\名词提取数据\高频名词(过滤后).txt',
              'r', encoding='ANSI') as mf:
        mingci_list = [line.strip('\n') for line in mf]

    mingci_jiangwei_list = [word_to_point[noun] for noun in mingci_list]
    print("降维成功!!")
    return [mingci_jiangwei_list, mingci_list]
def k_means(mingci_jiangwei_list, mingci_list, num=4):
    """Cluster the 2-D noun points with KMeans, print each cluster and plot.

    Args:
        mingci_jiangwei_list: sequence of points, one per word; the first two
            coordinates of each point are plotted.
        mingci_list: the words, in the same order as the points.
        num: number of clusters. Defaults to 4, the value that used to be
            hard-coded.

    Returns:
        None. The cluster centres and the members of each cluster are
        printed, and a scatter plot of all points is shown.
    """
    from sklearn.cluster import KMeans
    import numpy as np
    from matplotlib import pyplot as plt

    X = np.array(mingci_jiangwei_list)
    kmeans = KMeans(n_clusters=num, random_state=0).fit(X)
    print(str(num) + "个中心词的坐标:")
    print(kmeans.cluster_centers_)

    # One bucket per cluster label, replacing the fixed list1..list7 chain.
    clusters = [[] for _ in range(num)]
    for word, label in zip(mingci_list, kmeans.labels_):
        clusters[label].append(word)

    for members in clusters:
        # An empty cluster has no first word to use as its headline.
        if not members:
            continue
        print("与关键词" + members[0] + "相关的词有:", end='')
        print(members)

    plt.scatter(X[:, 0], X[:, 1], c='blue', s=6)
    plt.show()
def sse(mingci_jiangwei_list):
    """Show the data set, then an elbow plot of KMeans distortion against k.

    For each k the distortion is the mean Euclidean distance from every point
    to its nearest cluster centre.

    Args:
        mingci_jiangwei_list: sequence of points (one row per sample, at
            least two coordinates per point).

    Returns:
        None. Two matplotlib figures are shown.
    """
    from sklearn.cluster import KMeans
    from scipy.spatial.distance import cdist
    import numpy as np
    import matplotlib.pyplot as plt

    X = np.array(mingci_jiangwei_list)
    # Take the coordinate columns. Indexing the list with [0] and [1] would
    # select the first two points instead of the x and y values.
    x1 = X[:, 0]
    x2 = X[:, 1]

    # No fixed axis limits: clamping both axes to [0, 10] hides every point
    # that has a negative coordinate.
    plt.title('Dataset')
    plt.scatter(x1, x2)
    plt.show()

    # KMeans cannot form more clusters than there are samples.
    K = list(range(1, min(9, X.shape[0]) + 1))
    distortions = []
    for k in K:
        # A single fit per k; fitting the same model twice only costs time.
        kmeanModel = KMeans(n_clusters=k).fit(X)
        nearest = np.min(cdist(X, kmeanModel.cluster_centers_, 'euclidean'), axis=1)
        distortions.append(nearest.sum() / X.shape[0])

    plt.plot(K, distortions, 'bx-')
    plt.xlabel('k')
    plt.ylabel('Distortion')
    plt.title('The Elbow Method showing the optimal k')
    plt.show()
def cs(mingci_jiangwei_list):
    """Plot KMeans clusterings for several k and their silhouette scores.

    The first figure is a 3x2 grid: cell 1 shows the raw points and the
    following cells show the clustering for each tested k, titled with its
    silhouette coefficient. A second figure plots the silhouette coefficient
    against k.

    Args:
        mingci_jiangwei_list: sequence of points (one row per sample, at
            least two coordinates per point).

    Returns:
        None. The matplotlib figures are shown.
    """
    import matplotlib.pyplot as plt
    from sklearn.cluster import KMeans
    from sklearn import metrics
    import numpy as np

    X = np.array(mingci_jiangwei_list)
    # Take the coordinate columns. Indexing the list with [0] and [1] would
    # select the first two points, and x1[i] would then fail for i >= 2.
    x1 = X[:, 0]
    x2 = X[:, 1]

    # The per-k plots start at cell 2, so cell 1 of the grid is the slot for
    # the raw data.
    plt.subplot(3, 2, 1)
    plt.title('Instances')
    plt.scatter(x1, x2)

    colors = ['b', 'g', 'r', 'c', 'm', 'y', 'k', 'b']
    markers = ['o', 's', 'D', 'v', '^', 'p', '*', '+']
    # The silhouette score needs fewer clusters than samples.
    clusters = [t for t in [2, 3, 4, 5, 8] if t < X.shape[0]]
    sc_scores = []
    for subplot_counter, t in enumerate(clusters, start=2):
        plt.subplot(3, 2, subplot_counter)
        kmeans_model = KMeans(n_clusters=t).fit(X)
        labels = kmeans_model.labels_
        # One plot call per cluster instead of one per point.
        for label in range(t):
            mask = labels == label
            plt.plot(x1[mask], x2[mask], color=colors[label],
                     marker=markers[label], ls='None')
        # The keyword is `metric`; it was previously misspelled `markers`.
        sc_score = metrics.silhouette_score(X, labels, metric='euclidean')
        sc_scores.append(sc_score)
        plt.title('K=%s,silhouette coefficient=%0.03f' % (t, sc_score))

    plt.figure()
    plt.plot(clusters, sc_scores, '*-')
    plt.xlabel('Numbers of clusters')
    plt.ylabel('Silhouette Coefficient score')
    plt.show()
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn import datasets, metrics
def km_sse_cs(mingci_jiangwei_list):
    """Evaluate KMeans clustering quality over a range of cluster counts.

    1. Within-cluster sum of squared errors (SSE), also known as the elbow
       method. The SSE is plotted against the number of clusters; the k at
       which the rate of decrease suddenly flattens (the elbow) is taken as
       the best value. After fitting, the SSE is read from the model's
       ``inertia_`` attribute.
    2. Silhouette coefficient, which combines cluster cohesion and
       separation. The mean silhouette coefficient lies in [-1, 1]; larger is
       better, and a negative value hints that points may be misassigned.

    Args:
        mingci_jiangwei_list: sequence of points, one row per sample.

    Returns:
        None. A figure with the two curves is shown.
    """
    # Local import, as in the sibling functions, so this function does not
    # depend on a module-level `np` name.
    import numpy as np

    data = np.array(mingci_jiangwei_list)
    # Tested k values are 2..14, capped so that k stays below the number of
    # samples (required for the silhouette score).
    cluster_counts = range(2, min(15, data.shape[0]))

    sse_list = []
    silhouettes = []
    for i in cluster_counts:
        model = KMeans(n_clusters=i)
        model.fit(data)
        sse_list.append(model.inertia_)
        silhouette = metrics.silhouette_score(data, model.labels_, metric='euclidean')
        silhouettes.append(silhouette)

    plt.subplot(211)
    plt.title('KMeans 簇内误方差')
    plt.plot(cluster_counts, sse_list, marker='*')
    plt.xlabel('簇数量')
    plt.ylabel('簇内误方差(SSE)')

    plt.subplot(212)
    plt.title('KMeans 轮廓系数')
    plt.plot(cluster_counts, silhouettes, marker='o')
    plt.xlabel('簇数量')
    plt.ylabel('轮廓系数')

    plt.tight_layout()
    plt.show()
# Script entry point: runs the pipeline stages in order when the file is
# executed directly.
if __name__ == '__main__':
    # NOTE(review): mpl and pd are not referenced in the visible part of this
    # block - confirm whether later code relies on them before removing.
    import matplotlib as mpl
    import numpy as np
    import pandas as pd
    # Stage 1: train Word2Vec and write the model and vector files.
    wordsCluster()
    # Stage 2: turn the text vector file into a CSV file.
    changeTxtToCsv()
    # Stage 3: PCA to two dimensions; returns [points, words].
    result = jiangwei()
    mingci_jiangwei_list = result[0]
    mingci_list = result[1]