from math import sqrt
from dis import dis
from random import random
import numpy as np
import matplotlib.pyplot as plt
import math
'''-----函数定义区-----'''
def loadDataSet(fileName):
    """Load a tab-separated text file of numbers into a list of float rows.

    Each non-blank line is stripped, split on tab characters, and every
    field is converted with ``float``.

    Args:
        fileName: Path of the text file to read.

    Returns:
        A list with one entry per non-blank line; each entry is a list of
        floats in column order.

    Raises:
        OSError: If the file cannot be opened.
        ValueError: If a field cannot be parsed as a float.
    """
    dataMat = []
    # The context manager guarantees the handle is closed even when a field
    # fails to parse.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # A blank (often trailing) line would otherwise reach
                # float('') and raise ValueError.
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat
def distEclud(vecA, vecB):
    """Return the Euclidean distance between two equally shaped vectors.

    The element-wise difference is squared, summed over all elements, and
    the square root of that sum is returned as a Python float.
    """
    delta = vecA - vecB
    squaredTotal = np.square(delta).sum()
    return math.sqrt(squaredTotal)
def randCent(dataSet, k):
    """Create k random centroids inside the bounding box of the data set.

    For every column j, k values are drawn uniformly from
    [min_j, min_j + (max_j - min_j)) using ``np.random.rand``.

    Args:
        dataSet: Two-dimensional NumPy matrix or array; rows are samples,
            columns are dimensions.
        k: Number of centroids to generate.

    Returns:
        A ``np.matrix`` with k rows and one column per dimension of
        ``dataSet``.
    """
    n = np.shape(dataSet)[1]  # number of dimensions
    # np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
    centroids = np.asmatrix(np.zeros((k, n)))
    for j in range(n):
        # Reduce the column with NumPy instead of the builtin min/max: the
        # builtins iterate in Python and, for a matrix column, return 1x1
        # matrices whose float() conversion is deprecated.
        column = np.asarray(dataSet[:, j], dtype=float)
        minJ = float(column.min())
        rangeJ = float(column.max()) - minJ
        # One (k, 1) column of random offsets scaled into the value range.
        centroids[:, j] = minJ + rangeJ * np.random.rand(k, 1)
    return centroids
def kMeans(dataSet, k, distMeas=distEclud, createCent=randCent):
    """Cluster the rows of dataSet into k groups with the k-means algorithm.

    Every pass assigns each sample to its nearest centroid and then moves
    each centroid to the mean of the samples assigned to it. The loop stops
    after the first pass in which no sample changes cluster. The current
    centroids are printed once per pass.

    Args:
        dataSet: Two-dimensional NumPy matrix; rows are samples.
        k: Number of clusters.
        distMeas: Callable ``(centroidRow, sampleRow) -> distance``.
        createCent: Callable ``(dataSet, k) -> initial centroids`` with k
            rows; the returned object is updated in place.

    Returns:
        A tuple ``(centroids, clusterAssment)``. ``clusterAssment`` is a
        ``np.matrix`` with one row per sample: column 0 is the index of the
        assigned centroid, column 1 is the squared distance to it.
    """
    m = np.shape(dataSet)[0]  # number of samples
    # np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # Start with "unassigned" (-1) rather than 0: otherwise a first pass
    # that puts every sample into cluster 0 looks like "nothing changed"
    # and the loop would stop before the centroids have converged.
    clusterAssment[:, 0] = -1
    centroids = createCent(dataSet, k)
    clusterChanged = True
    while clusterChanged:
        clusterChanged = False
        # Assignment step: find the nearest centroid for every sample.
        for i in range(m):
            minDist = float('inf')
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j, :], dataSet[i, :])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        print(centroids)
        # Update step: move every centroid to the mean of its members.
        labels = np.asarray(clusterAssment[:, 0]).ravel()
        for cent in range(k):
            ptsInClust = dataSet[np.nonzero(labels == cent)[0]]
            if ptsInClust.shape[0] == 0:
                # The mean of an empty cluster is NaN; keep the previous
                # centroid so it is not turned into NaN.
                continue
            centroids[cent, :] = np.mean(ptsInClust, axis=0)
    return centroids, clusterAssment
def visualCluster(dataSet, centroids, clusterAssment):
    """Plot a two-dimensional clustering result and show the figure.

    Samples are drawn as a scatter plot, centroids as red '+' markers, and
    around every centroid an unfilled black circle is drawn whose radius is
    the largest sample-to-centroid distance recorded for that cluster.

    Args:
        dataSet: Two-dimensional NumPy matrix or array; columns 0 and 1 are
            used as the x and y coordinates.
        centroids: Matrix or array with one row per centroid; columns 0 and
            1 are used as the x and y coordinates.
        clusterAssment: Matrix or array with one row per sample; column 0
            is the cluster index, column 1 the squared distance to the
            assigned centroid.
    """
    fig = plt.figure(num='聚类收敛结果可视化')
    ax = fig.add_subplot(111)

    # Work on plain ndarrays so indexing yields scalars instead of 1x1
    # matrices.
    points = np.asarray(dataSet)
    centers = np.asarray(centroids)
    assignment = np.asarray(clusterAssment)

    ax.scatter(points[:, 0], points[:, 1])
    ax.scatter(centers[:, 0], centers[:, 1], c='red', marker='+', s=100)

    labels = assignment[:, 0]
    squaredDists = assignment[:, 1]
    for p in range(centers.shape[0]):
        # Squared distances of the samples assigned to centroid p; only
        # positive values can enlarge the radius, which starts at 0.
        members = squaredDists[labels == p]
        members = members[members > 0]
        dist_square = float(members.max()) if members.size else 0.0
        maxR = math.sqrt(dist_square)
        circle_p = plt.Circle((centers[p, 0], centers[p, 1]), maxR,
                              fill=False, color='k')
        ax.add_patch(circle_p)
    plt.show()
'''-----函数定义区-----'''
# Script body: load the sample data, cluster it into 3 groups, print the
# per-sample assignment and display the result.
fileName = 'F:/python_works/about Machine Learning/K-means/testSet.txt'
# np.asmatrix replaces np.mat, which was removed in NumPy 2.0.
dataMat = np.asmatrix(loadDataSet(fileName))
centroids, clusterAssment = kMeans(dataMat, 3, distMeas=distEclud, createCent=randCent)
print(clusterAssment)
visualCluster(dataMat, centroids, clusterAssment)
# 根据书本内容改的代码,作为新手,错误百出,但是能跑。代码运行结果如下: