import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

dataset = pd.read_csv('F:\\python\\dataset\\watermelon_4.csv', delimiter=",")
data = dataset.values
data  # display the loaded sample array (shows output in a notebook/REPL)
# squared Euclidean distance ||x1 - x2||^2 between two sample vectors
def distance(x1, x2):
    return sum((x1 - x2) ** 2)
# K-means implementation: returns the K mean vectors, the cluster label of every
# sample, and the number of iterations actually used
def Kmeans(D, K, maxIter):
    m, n = np.shape(D)
    if K >= m:
        return D
    initSet = set()
    curK = K  # number of cluster centers still to pick
    while curK > 0:  # randomly choose K distinct samples as the initial mean vectors
        randomInt = random.randint(0, m - 1)  # a random integer in [0, m-1]
        if randomInt not in initSet:
            curK -= 1
            initSet.add(randomInt)  # K indices in total
    U = D[list(initSet), :]
    print('The {} samples randomly chosen as the initial mean vectors:\n{}\n'.format(K, U))
    C = np.zeros(m)
    curIter = maxIter
    while curIter > 0:  # iterate
        curIter -= 1
        """
        4:  for j = 1, 2, ..., m do
        5:      compute the distance between sample xj and each mean vector ui (1 <= i <= k): dji = ||xj - ui||_2
        6:      determine the cluster label of xj from the nearest mean vector: lambda_j = argmin_{i in {1,...,k}} dji
        7:      put xj into the corresponding cluster: C_{lambda_j} = C_{lambda_j} ∪ {xj}
        8:  end for
        """
        for i in range(m):  # 4
            p = 0
            minDistance = distance(D[i], U[0])
            for j in range(1, K):  # 5
                if distance(D[i], U[j]) < minDistance:  # 6
                    minDistance = distance(D[i], U[j])
                    p = j
            C[i] = p  # 7
        newU = np.zeros((K, n))
        cnt = np.zeros(K)
        """
        9:  for i = 1, 2, ..., k do
        10:     compute the new mean vector: ui' = (1/|Ci|) * sum_{x in Ci} x
        11:     if ui' != ui then
        12:         update the current mean vector ui to ui'
        13:     else
        14:         keep the current mean vector unchanged
        15:     end if
        16: end for
        17: until none of the current mean vectors have been updated
        """
        for i in range(m):
            newU[int(C[i])] += D[i]
            cnt[int(C[i])] += 1
        # print('newU:{}'.format(newU))
        # print('cnt:{}'.format(cnt))
        changed = 0  # flag: did any mean vector move in this iteration?
        for i in range(K):
            if cnt[i] == 0:  # empty cluster: keep its old mean vector (avoids division by zero)
                continue
            newU[i] /= cnt[i]  # 10
            for j in range(n):  # 11-12
                if U[i, j] != newU[i, j]:
                    U[i, j] = newU[i, j]
                    changed = 1
        if changed == 0:
            return U, C, maxIter - curIter
    return U, C, maxIter - curIter
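# --- Illustrative aside (not from the original code): the assignment and update
# --- steps above can also be written with NumPy broadcasting instead of explicit
# --- loops. The helper name kmeans_step is hypothetical; it performs a single
# --- pass of steps 4-10 of the textbook pseudocode.
def kmeans_step(D, U):
    # squared distances between every sample and every mean vector, shape (m, K)
    dist = ((D[:, None, :] - U[None, :, :]) ** 2).sum(axis=2)
    C = dist.argmin(axis=1)          # nearest mean vector per sample (steps 4-7)
    newU = U.copy()
    for i in range(U.shape[0]):      # recompute each mean vector (step 10)
        members = D[C == i]
        if len(members) > 0:         # keep the old center if a cluster is empty
            newU[i] = members.mean(axis=0)
    return newU, C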
U, C, nIter = Kmeans(data, 2, 100)
print('Final mean vectors (cluster centers):\n{}\n'.format(U))
print('Final cluster assignment of each sample:\n{}\n'.format(C))
print('Number of iterations used: {}'.format(nIter))
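# --- Optional cross-check (not part of the original script): if scikit-learn is
# --- installed, its KMeans can be used to sanity-check the centers found above.
from sklearn.cluster import KMeans

skKmeans = KMeans(n_clusters=2, n_init=10, random_state=0).fit(data)
print('scikit-learn cluster centers:\n{}\n'.format(skKmeans.cluster_centers_))
print('scikit-learn labels:\n{}\n'.format(skKmeans.labels_))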
f1 = plt.figure(1)
plt.title('watermelon_4')
plt.xlabel('density')
plt.ylabel('ratio')
plt.scatter(data[:,0], data[:,1], marker='o', color='g', s=50)
plt.scatter(U[:,0], U[:,1], marker='o', color='r', s=100)
# plt.xlim(0,1)
# plt.ylim(0,1)
m,n=np.shape(data)
for i in range(m):
plt.plot([data[i,0],U[int(C[i]),0]],[data[i,1],U[int(C[i]),1]],"c--",linewidth=0.3)
plt.show()
The farther apart the initial points chosen for k-means are, the more easily the algorithm converges and the better the clustering tends to be.
Therefore the quality of a k-means result depends heavily on how the initial samples are selected.
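One common way to push the initial centers apart is k-means++-style seeding, which chooses each new center with probability proportional to its squared distance from the centers already picked. The sketch below only illustrates that idea; the helper name kmeans_pp_init is made up here and is not part of the code above.

# Hypothetical helper (illustration only): k-means++-style seeding.
def kmeans_pp_init(D, K):
    m = D.shape[0]
    centers = [D[np.random.randint(m)]]   # first center: a uniformly random sample
    for _ in range(K - 1):
        # squared distance of every sample to its nearest already-chosen center
        d2 = ((D[:, None, :] - np.array(centers)[None, :, :]) ** 2).sum(axis=2).min(axis=1)
        probs = d2 / d2.sum()              # samples far from all centers are more likely
        centers.append(D[np.random.choice(m, p=probs)])
    return np.array(centers)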
Code reference link