from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
# Create the dataset: 500 samples with 2 features each, generated around
# 4 centers; random_state is fixed so the data is reproducible.
x, y = make_blobs(n_samples=500, n_features=2, centers=4, random_state=1)
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_samples ,silhouette_score
import numpy as np
import pandas as pd
import matplotlib.cm as cm
# Number of clusters to fit; also drives the vertical extent of the left panel.
n_clusters = 4

# Fit KMeans on the data and keep the per-sample cluster assignment.
clusterer = KMeans(n_clusters=n_clusters, random_state=0)
clusterer.fit(x)
cluster_labers = clusterer.labels_

# Mean silhouette score over all samples, reported on stdout.
silhouette_avg = silhouette_score(x, cluster_labers)
print('For n_clusters = ', n_clusters,
      'The average silhouette_score is :', silhouette_avg)

# Per-sample silhouette values, consumed by the silhouette plot.
sample_silhouette_values = silhouette_samples(x, cluster_labers)

# One row, two panels: ax1 for the silhouette plot, ax2 for the scatter plot.
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(18, 7)

# x range is fixed to [-0.1, 1]; the y range reserves one row per sample plus
# (n_clusters + 1) * 10 extra rows for the gaps drawn between clusters.
ax1.set(
    xlim=[-0.1, 1],
    ylim=[0, x.shape[0] + (n_clusters + 1) * 10],
)
# Draw one filled band per cluster on ax1, stacked bottom to top.
# y_lower / y_upper delimit the rows occupied by the current cluster; a gap of
# 10 rows is left below the first band and between consecutive bands.
y_lower = 10
for i in range(n_clusters):
    # Silhouette values of the samples assigned to cluster i, sorted ascending
    # so the band has a smooth outline.
    ith_cluster_silhouette_values = sample_silhouette_values[cluster_labers == i]
    ith_cluster_silhouette_values.sort()

    # The band is as tall as the number of samples in the cluster.
    size_cluster_i = ith_cluster_silhouette_values.shape[0]
    y_upper = y_lower + size_cluster_i

    # Pick a distinct color per cluster from the nipy_spectral colormap.
    color = cm.nipy_spectral(float(i) / n_clusters)

    # Only one x curve is passed, so the fill uses fill_betweenx's default
    # second boundary.
    ax1.fill_betweenx(
        np.arange(y_lower, y_upper),
        ith_cluster_silhouette_values,
        facecolor=color,
        alpha=0.7,
    )

    # Write the cluster index at mid-height of the band, left of x = 0.
    ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

    # The next band starts 10 rows above the top of this one.
    y_lower = y_upper + 10
# Dashed red vertical line at the mean silhouette score of all samples.
ax1.axvline(x=silhouette_avg, color='red', linestyle='--')

# Title, axis labels and ticks of the silhouette panel. The y axis gets no
# ticks; the x ticks are fixed explicitly.
ax1.set(
    title='The silhouette plot for the various clusters.',
    xlabel='The silhouette coefficient values',
    ylabel='Cluster label',
    yticks=[],
    xticks=[-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1],
)
# Right-hand panel: the samples plotted on their two features, each colored
# by its assigned cluster label through the nipy_spectral colormap.
colors = cm.nipy_spectral(cluster_labers.astype(float) / n_clusters)
ax2.scatter(x[:, 0], x[:, 1], marker='o', s=8, c=colors)

# Overlay the fitted cluster centers as large red crosses.
centers = clusterer.cluster_centers_
ax2.scatter(centers[:, 0], centers[:, 1], marker='x', c='red', alpha=1, s=200)

# Title and axis labels of the scatter panel.
ax2.set(
    title='The visualization of the clustered data',
    xlabel='Feature space for the 1st feature',
    ylabel='Feature space for the 2nd feature',
)
# Figure-level title. Adjacent string literals are joined with nothing in
# between, so the first literal must end with a space; without it the title
# reads "...sample datawith n_clusters = 4".
plt.suptitle(
    ('silhouette analysis for KMeans clustering on sample data '
     'with n_clusters = %d' % n_clusters),
    fontsize=14,
    fontweight='bold',
)

# Render the figure.
plt.show()
For n_clusters = 4 The average silhouette_score is : 0.6505186632729437
A、重要参数
KMeans(n_clusters = 10 , init = 'random',random_state = 420 , max_iter = 10)
1、n_clusters
告诉模型我们要把数据分成几类(即簇的个数)。
2、init
可输入'k-means++'、'random'或者一个n维数组,默认'k-means++',这是一种为K均值聚类选择初始聚类中心的聪明的方法。(建议保留)
3、random_state
对应一个质心随机初始化的随机数种子
4、n_init
默认10,表示使用不同的质心随机初始化种子来运行k-means算法的次数;最终结果是这n_init次连续运行中基于Inertia计算出的最佳输出
5、max_iter
默认300,单次运行k-means算法最大迭代次数
6、tol
默认1e-4,两次迭代间Inertia下降的量小于tol所设定的值,迭代就会自动停下
B、重要属性
1、labels_ 每个样本点对应的标签
2、n_iter_ 实际迭代次数
3、inertia_簇内平方和
4、cluster_centers_ 收敛到的质心
C、函数cluster.k_means
from sklearn.cluster import k_means
k_means(x,4,return_n_iter = True)
返回质心,每个样本对应的簇,inertia 以及最佳迭代次数
D、模型评估指标
1、轮廓系数
from sklearn.metrics import silhouette_score
from sklearn.metrics import silhouette_samples
silhouette_score(x,y_pred) #为所有样本轮廓系数的均值
silhouette_samples(x,y_pred) #为每个样本各自的轮廓系数
2、卡林斯基-哈拉巴斯指数(Calinski-Harabasz Index)
from sklearn.metrics import calinski_harabasz_score #旧版本中名为calinski_harabaz_score,scikit-learn 0.20起更名,旧名已在0.23中移除
calinski_harabasz_score(x,y_pred)
#Calinski-Harabasz指数比轮廓系数计算速度快
参考:CDA课堂,直播课后的个人笔记总结,仅供参考,有不一样的想法的大佬们,请辩证地观看,如果有问题可以在评论区指出我再订正。