本文转载自:http://www.javaxxz.com/thread-359535-1-1.html
Kmeans聚类算法的大概流程是:①从样本中随机找出K个样本作为中心点;
②求所有样本到这些中心点的距离,按照距离最短的中心点进行归类;
③求每个聚类中的样本的元素的平均值,作为新的中心点;
④重复②、③,直到所有样本的聚类归属不再发生变化,即算完成。
### 一、接下来使用Numpy实现python代码,测试有效并且带注释:Kmeans.py:
```python
# encoding: utf-8
"""
Created on 2017年12月11日
"""
import time
from numpy import *
from scipy.cluster.hierarchy import centroid
import matplotlib.pyplot as plt
# Compute the Euclidean distance between two vectors.
def euclDistance(vector1, vector2):
    """Return the Euclidean distance between ``vector1`` and ``vector2``.

    Both arguments may be numpy arrays, numpy matrices or plain sequences of
    numbers; they are converted with ``asarray`` so that list inputs work and
    integer inputs are squared in floating point.

    Note: ``sqrt``, ``sum``, ``power`` and ``asarray`` are the numpy functions
    brought in by the module's ``from numpy import *``; ``sum`` therefore adds
    up every element of the squared difference.
    """
    diff = asarray(vector2, dtype=float) - asarray(vector1, dtype=float)
    return sqrt(sum(power(diff, 2)))
# Pick the K initial centre points.
def initCentroids(dataSet, k):
    """Choose ``k`` rows of ``dataSet`` at random as the initial centroids.

    Args:
        dataSet: 2-D collection of samples, one sample per row
            (e.g. ``[[2, 3, 4], [3, 4, 5]]`` has shape ``(2, 3)``).
        k: number of centroids to pick.

    Returns:
        A ``(k, dim)`` float ndarray whose rows are copies of sampled rows.

    Raises:
        ValueError: if ``dataSet`` has no rows while ``k`` is positive.
    """
    data = asarray(dataSet)
    numSamples, dim = data.shape
    if numSamples == 0 and k > 0:
        raise ValueError("cannot pick centroids from an empty data set")

    # Draw distinct rows whenever possible: picking the same sample twice
    # would create identical centroids and therefore an empty cluster.
    # Only fall back to drawing with replacement when k exceeds the number
    # of available samples.
    allowRepeats = k > numSamples
    indices = random.choice(numSamples, size=k, replace=allowRepeats)

    # Zero-filled (k, dim) float array, then filled with the sampled rows.
    centroids = zeros((k, dim))
    centroids[:, :] = data[indices, :]
    return centroids
# k-means cluster
def kmeans(dataSet, k):
    """Cluster the rows of ``dataSet`` into ``k`` groups with K-means.

    NOTE(review): in the published listing the text between the
    nearest-centroid comparison and the marker-count check of
    ``showCluster`` was lost. The assignment, centroid-update and return
    steps below are rebuilt from the algorithm steps described in the
    article and from how the driver script unpacks the result; confirm
    against the original source if it is available.

    Args:
        dataSet: 2-D numpy array/matrix, one sample per row.
        k: number of clusters.

    Returns:
        ``(centroids, clusterAssment)`` where ``centroids`` is what
        ``initCentroids`` returned, updated in place, and ``clusterAssment``
        is a ``(numSamples, 2)`` matrix: column 0 holds the index of the
        cluster a sample belongs to, column 1 the distance between the
        sample and that cluster's centre.
    """
    numSamples = dataSet.shape[0]
    # ``asmatrix`` gives a numpy matrix (``mat`` was an alias for it and no
    # longer exists in NumPy 2.0).
    clusterAssment = asmatrix(zeros((numSamples, 2)))
    # Start from an impossible cluster index so the first pass always counts
    # as a change, even when every sample happens to land in cluster 0.
    clusterAssment[:, 0] = -1
    clusterChanged = True

    # Step 1: initialise the centre points.
    centroids = initCentroids(dataSet, k)

    while clusterChanged:
        clusterChanged = False
        # Step 2: assign every sample to its nearest centre point.
        for i in range(numSamples):
            minDist = inf   # smallest distance found so far for sample i
            minIndex = 0    # cluster that distance belongs to
            for j in range(k):
                distance = euclDistance(centroids[j, :], dataSet[i, :])
                if distance < minDist:
                    minDist = distance
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist

        # Step 3: move each centre to the mean of the samples assigned to it.
        for j in range(k):
            members = nonzero(clusterAssment[:, 0].A == j)[0]
            if len(members) > 0:
                # An empty cluster keeps its previous centre instead of
                # turning into NaN.
                centroids[j, :] = mean(dataSet[members], axis=0)

    return centroids, clusterAssment


# Plot the clustering result (2-D data only).
def showCluster(dataSet, k, centroids, clusterAssment):
    """Draw every sample and every centroid with matplotlib.

    Samples are drawn with the marker of the cluster stored in column 0 of
    ``clusterAssment``; centroids are drawn with a second marker list.

    Returns:
        ``1`` when nothing can be drawn (data is not two-dimensional or
        ``k`` exceeds the number of available markers), otherwise ``None``
        after the figure has been shown.
    """
    numSamples, dim = dataSet.shape
    if dim != 2:
        print("Sorry! I can not draw because the dimension of your data is not 2!")
        return 1

    # One marker style per cluster for the samples.
    mark = ["or", "ob", "og", "ok", "^r", "+r", "sr", "dr", "<r", "pr"]
    if k > len(mark):
        print("Sorry! Your k is too large!please contact Zouxy")
        return 1

    for i in range(numSamples):
        markIndex = int(clusterAssment[i, 0])  # cluster the sample belongs to
        plt.plot(dataSet[i, 0], dataSet[i, 1], mark[markIndex])

    # A different marker list for the centroids so they stand out.
    mark = ["Dr", "Db", "Dg", "Dk", "^b", "+b", "sb", "db", "<b", "pb"]
    for i in range(k):
        plt.plot(centroids[i, 0], centroids[i, 1], mark[i], markersize=6)
    plt.show()
```
```python
# encoding: utf-8
"""
Created on 2017年12月11日
"""
from numpy import *
import time
import matplotlib.pyplot as plt
import kmeans.Kmeans as kean
# Step 1: load columns 4 and 5 of every data row of the CSV file.
print("step 1:load data...")
dataSet = []
# Raw string: in a normal literal the "\U" of "D:\Users" starts a unicode
# escape, which is a syntax error on Python 3.
with open(r"D:\Users\zhangjie116\Downloads\Wholesale customers data.csv") as fileIn:
    next(fileIn, None)  # the first line is the header row
    for line in fileIn:
        if not line.strip():
            continue  # ignore blank lines (e.g. a trailing newline)
        lineArr = line.strip().split(",")
        dataSet.append([float(lineArr[4]), float(lineArr[5])])

# Step 2: run the clustering.
print("step 2:clustering")
# ``asmatrix`` replaces ``mat``, which no longer exists in NumPy 2.0.
dataSet = asmatrix(dataSet)
k = 4
centroids, clusterAssment = kean.kmeans(dataSet, k)

# Step 3: plot the result.
print("step 3:show the result...")
kean.showCluster(dataSet, k, centroids, clusterAssment)
```
运行结果:
![](http://aodi.paic.com.cn/forum.php?mod=attachment&aid=NDc2M3xjZjAxMDdiMnwxNTEzMTU3NzEyfDEzMTF8NTA0MA%3D%3D&noupdate=yes)
### 二、使用scikit-learn库并且使用轮廓系数找出最佳K值计算
```python
# encoding: utf-8
"""
Created on 2017年12月13日
"""
import numpy as np
from sklearn import cluster
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn import metrics
# Load columns 4 and 5 of every data row of the CSV file.
dataSet = []
# Raw string: in a normal literal the "\U" of "D:\Users" starts a unicode
# escape, which is a syntax error on Python 3.
with open(r"D:\Users\zhangjie116\Downloads\Wholesale customers data.csv") as fileIn:
    next(fileIn, None)  # the first line is the header row
    for line in fileIn:
        if not line.strip():
            continue  # ignore blank lines (e.g. a trailing newline)
        lineArr = line.strip().split(",")
        dataSet.append([float(lineArr[4]), float(lineArr[5])])

# Generate a synthetic data set: three groups of 10 two-dimensional points.
# NOTE: this assignment replaces the data loaded from the CSV file above;
# remove it to cluster the CSV data instead.
cluster1 = np.random.uniform(0.5, 1.5, (2, 10))
cluster2 = np.random.uniform(3.5, 4.5, (2, 10))
cluster3 = np.random.uniform(7.5, 8.5, (2, 10))
dataSet = np.hstack((cluster1, cluster2, cluster3)).T

# Number of samples (does not change while trying different K values).
numSamples = len(dataSet)

# Start below any possible score so the first K tried always becomes the
# current best, even if every silhouette coefficient is zero or negative.
max_silhouette_coefficient = float("-inf")
max_k = 0
max_centroids = []
max_labels_ = []

for k in range(2, 10):
    # Fit K-means with the current K.
    clf = KMeans(n_clusters=k)
    clf.fit(dataSet)
    # Cluster centres and the cluster label of every sample.
    centroids = clf.cluster_centers_
    labels_ = clf.labels_
    # Silhouette coefficient of this clustering.
    silhouette_coefficient = metrics.silhouette_score(
        dataSet, clf.labels_, metric="euclidean", sample_size=numSamples)
    print("k:%d ==== silhouette_coefficient:%f" % (k, silhouette_coefficient))
    # The K with the largest silhouette coefficient is considered the best.
    if max_silhouette_coefficient < silhouette_coefficient:
        max_silhouette_coefficient = silhouette_coefficient
        max_k = k
        max_centroids = centroids
        max_labels_ = labels_
    # Inertia reported by the fitted estimator for this K.
    print("k:%d ==== inertia_:%f" % (k, clf.inertia_))

print("max_k:%d ==== max_silhouette_coefficient:%f" % (max_k, max_silhouette_coefficient))

# Plot every sample; samples of the same cluster share one marker style.
mark1 = ["or", "ob", "og", "ok", "^r", "+r", "sr", "dr", "<r", "pr"]
for i in range(numSamples):
    plt.plot(dataSet[i][0], dataSet[i][1], mark1[max_labels_[i]])

# Plot the centroids with a different, larger marker.
mark2 = ["Dr", "Db", "Dg", "Dk", "^b", "+b", "sb", "db", "<b", "pb"]
for i in range(max_k):
    plt.plot(max_centroids[i, 0], max_centroids[i, 1], mark2[i], markersize=12)
plt.show()
```
运行结果:
```
k:2 ==== silhouette_coefficient:0.749433
k:2 ==== inertia_:93.451706
k:3 ==== silhouette_coefficient:0.887454
k:3 ==== inertia_:4.294235
k:4 ==== silhouette_coefficient:0.746558
k:4 ==== inertia_:3.366372
k:5 ==== silhouette_coefficient:0.579589
k:5 ==== inertia_:2.555257
k:6 ==== silhouette_coefficient:0.622791
k:6 ==== inertia_:2.088957
k:7 ==== silhouette_coefficient:0.452267
k:7 ==== inertia_:1.602857
k:8 ==== silhouette_coefficient:0.447269
k:8 ==== inertia_:1.265230
k:9 ==== silhouette_coefficient:0.454158
k:9 ==== inertia_:0.976325
max_k:3 ==== max_silhouette_coefficient:0.887454
```
![](http://aodi.paic.com.cn/forum.php?mod=attachment&aid=NDgwMnw0YWRjNjY0ZHwxNTEzMTU3NzEyfDEzMTF8NTA0MA%3D%3D&noupdate=yes)