K-Means 关于 K 的选择,也就是肘部法则;下面我们自己实现并验证这一方法。
#肘部法则
#-*- coding:utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from scipy.spatial.distance import cdist
# Toy 2-D data set: 14 hand-picked points used by the elbow-method demo.
x = np.array([1, 2, 3, 1, 5, 6, 5, 5, 6, 7, 8, 9, 7, 9])
y = np.array([1, 3, 2, 2, 8, 6, 7, 6, 7, 1, 2, 1, 1, 3])
data = np.column_stack((x, y))  # stack coordinates into an (n, 2) sample matrix
# data = np.array(np.random.rand(100,2))
# data = np.array(datmat)
#data = ([1,1],[2,3],[3,2],[1,2],[5,8])
# Elbow method: find the best number of clusters.
# K-Means picks parameters by minimising a cost function; the cost is the sum
# of the per-cluster distortions, where a cluster's distortion is the sum of
# squared distances between its centroid and its members.
def K_choose():
    """Choose K for K-Means on the module-level ``data`` via the elbow method.

    For K in 1..9, fit KMeans and record the mean distance of each sample
    to its nearest centroid (the distortion).  The elbow is the K where
    the distortion curve's slope changes the most.

    Returns:
        int: the selected number of clusters (between 2 and 8).
    """
    ks = range(1, 10)
    distortions = []
    for k in ks:
        model = KMeans(n_clusters=k)
        model.fit(data)
        # mean distance of every sample to its closest centroid
        distortions.append(
            sum(np.min(cdist(data, model.cluster_centers_, 'euclidean'), axis=1)) / data.shape[0]
        )
    plt.figure()
    plt.plot(np.array(ks), distortions, 'bx-')  # x axis: K, y axis: distortion
    plt.show()
    # Second difference of the curve: the largest absolute change in slope
    # marks the elbow point.
    slope_changes = [
        abs((distortions[i] - distortions[i + 1]) - (distortions[i - 1] - distortions[i]))
        for i in range(1, len(distortions) - 1)
    ]
    # +2 because slope_changes[0] corresponds to K = 2.
    return slope_changes.index(max(slope_changes)) + 2
# Scatter-plot the samples and the cluster centers found by K-Means.
def Kmeans():
    """Cluster the module-level data and plot points plus cluster centers.

    The number of clusters comes from K_choose() (elbow method); the
    original computed k but then ignored it and hard-coded 3.
    """
    k = K_choose()
    print(k)
    plt.figure()
    plt.axis([0, 10, 0, 10])
    plt.grid(True)
    plt.plot(x, y, 'k.')
    # Use the K selected by the elbow method (was hard-coded to 3).
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(data)
    # mark each cluster center with a red dot
    plt.plot(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1], 'r.')
    plt.show()
# Guard the demo so importing this module does not trigger clustering/plots.
# (The original commented-out guard used "is" to compare strings, which is
# incorrect; "==" is the right operator.)
if __name__ == '__main__':
    Kmeans()
被注释掉的代码请忽略;中文注释算是比较详细的。相对书上的版本做了修改,因为 Python 已升级到 3.6:
1) map 的返回规则变了(返回迭代器而不是列表);
2) 源代码中矩阵与 list 的定义有点混乱。
'''
Created on Feb 16, 2011
Modify on Mar 27, 2018
k Means Clustering for Ch10 of Machine Learning in Action
@author: Peter Harrington ---++++---YM
'''
from numpy import *
import numpy as np
def loadDataSet(fileName):  # general function to parse tab-delimited floats
    """Parse a tab-delimited text file of 2-D points.

    Each line must contain at least two tab-separated floats (x and y
    coordinates); any further columns are ignored.

    Args:
        fileName: path to the tab-delimited data file.

    Returns:
        numpy.matrix of shape (n, 2), one row per point.
    """
    dataMat = []
    # "with" guarantees the handle is closed; the original leaked it.
    with open(fileName) as fr:
        for line in fr:
            curLine = line.strip().split('\t')  # each line holds coordinates
            dataMat.append([float(curLine[0]), float(curLine[1])])
    # NOTE: in Python 3, map(float, curLine) returns a lazy map object,
    # which is why the book's original line was replaced above.
    return mat(dataMat)
# calculate Euclidean distance
def euclDistance(vector1, vector2):
    """Return the Euclidean distance between two equally-shaped matrices."""
    diff = vector2 - vector1
    return sqrt(sum(multiply(diff, diff)))
# init centroids with random samples
#在样本集中随机选取k个样本点作为初始质心
def initCentroids(dataSet, k):
    """Pick k distinct rows of dataSet at random as the initial centroids.

    The original drew each index independently, so the same sample could
    be chosen twice and two centroids would start at the same point;
    sampling without replacement avoids that degenerate start.

    Args:
        dataSet: (numSamples, dim) matrix/array of samples, numSamples >= k.
        k: number of centroids to pick.

    Returns:
        (k, dim) ndarray of initial centroids.
    """
    numSamples, dim = dataSet.shape  # rows and columns of the sample matrix
    centroids = zeros((k, dim))
    # numpy.random.choice with replace=False yields k distinct indices
    for row, index in enumerate(random.choice(numSamples, k, replace=False)):
        centroids[row, :] = dataSet[index, :]
    return centroids
# k-means cluster
#dataSet为一个矩阵
#k为将dataSet矩阵中的样本分成k个类
# k-means cluster
# dataSet is a matrix; k is the number of clusters to split it into.
def kMeans(dataSet, k):
    """Classic k-means clustering.

    Args:
        dataSet: (n, d) matrix of samples.
        k: number of clusters.

    Returns:
        (centroids, clusterAssment): a (k, d) matrix of final centroids and
        an (n, 2) matrix whose first column is the assigned cluster index
        and whose second column is the squared distance to that centroid.
    """
    numSamples = dataSet.shape[0]  # number of sample rows
    # first column: which cluster the sample belongs to;
    # second column: squared error between the sample and its centroid
    clusterAssment = mat(zeros((numSamples, 2)))
    clusterChanged = True
    ## step 1: pick k random samples as the initial centroids
    centroids = initCentroids(dataSet, k)
    while clusterChanged:
        clusterChanged = False
        ## for each sample
        for i in range(numSamples):
            # inf rather than the original arbitrary magic constant
            # (100000.0), so the first computed distance always wins
            minDist = inf
            minIndex = 0
            ## step 2: find the closest centroid for sample i
            for j in range(k):
                distance = euclDistance(centroids[j, :], dataSet[i, :])
                if distance < minDist:
                    minDist = distance
                    minIndex = j
            ## step 3: record the assignment; iterate again if any sample moved
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2  # ** is squaring
        ## step 4: move each centroid to the mean of its members
        for j in range(k):
            # nonzero(...A == j)[0] gives the row indices assigned to cluster j
            pointsInCluster = dataSet[nonzero(clusterAssment[:, 0].A == j)[0]]
            # Guard: an empty cluster would make mean() return NaN and poison
            # every later iteration; keep its previous centroid instead.
            if len(pointsInCluster) > 0:
                centroids[j, :] = mean(pointsInCluster, axis=0)
    print('Congratulations, cluster complete!')
    print(type(centroids), type(clusterAssment))
    return mat(centroids), clusterAssment
def biKmeans(dataSet, k):
    """Bisecting k-means: repeatedly 2-way-split the cluster whose split
    lowers total SSE the most, until k clusters exist.

    Args:
        dataSet: (n, d) matrix of samples.
        k: target number of clusters.

    Returns:
        (centList, clusterAssment): a (k, d) matrix of centroids and an
        (n, 2) matrix of (cluster index, squared error) per sample.
    """
    m = shape(dataSet)[0]
    clusterAssment = mat(zeros((m,2)))
    # start from a single cluster: the mean of the whole data set
    centroid0 = mean(dataSet, axis=0).tolist()[0]
    centList =[centroid0] # create a list with one centroid
    for j in range(m): # calc initial error for every sample
        clusterAssment[j,1] = euclDistance(mat(centroid0), dataSet[j])**2
    while (len(centList) < k):
        lowestSSE = inf  # track the best (lowest total SSE) candidate split
        for i in range(len(centList)):
            # get the data points currently assigned to cluster i
            ptsInCurrCluster = dataSet[nonzero(clusterAssment[:,0].A==i)[0],:]
            # try splitting cluster i in two with plain k-means
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2)
            sseSplit = sum(splitClustAss[:,1]) # SSE of the split cluster
            # SSE of all the samples NOT in cluster i (unchanged by the split)
            sseNotSplit = sum(clusterAssment[nonzero(clusterAssment[:,0].A!=i)[0],1])
            print("sseSplit, and notSplit: ",sseSplit,sseNotSplit)
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = i
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit
        # relabel the two halves of the winning split: half "1" becomes a
        # brand-new cluster id, half "0" keeps the id of the split cluster
        bestClustAss[nonzero(bestClustAss[:,0].A == 1)[0],0] = len(centList) # change 1 to the next free id
        bestClustAss[nonzero(bestClustAss[:,0].A == 0)[0],0] = bestCentToSplit
        print('the bestCentToSplit is: ',bestCentToSplit)
        print('the len of bestClustAss is: ', len(bestClustAss))
        # replace the split centroid with the two new ones
        centList[bestCentToSplit] = bestNewCents[0,:].tolist()[0]
        centList.append(bestNewCents[1,:].tolist()[0])
        # reassign cluster ids and SSE for the samples that were split
        clusterAssment[nonzero(clusterAssment[:,0].A == bestCentToSplit)[0],:]= bestClustAss
    return mat(centList), clusterAssment
import matplotlib
import matplotlib.pyplot as plt
def clusterClubs(datMat,numClust):
    """Cluster datMat with bisecting k-means and scatter-plot the result.

    Draws each cluster with a distinct marker on top of a background image
    read from 'Portland.png' (must exist in the working directory) and
    marks every centroid with a large '+'.

    Args:
        datMat: (n, 2) matrix of 2-D points.
        numClust: number of clusters to form.
    """
    # datList = []
    # for line in open('places.txt').readlines():
    #     lineArr = line.split('\t')
    #     datList.append([float(lineArr[4]), float(lineArr[3])])
    # datMat = mat(datList)
    myCentroids, clustAssing = biKmeans(datMat, numClust)  # bisecting k-means
    fig = plt.figure()
    rect=[0.1,0.1,0.8,0.8]
    scatterMarkers=['s', 'o', '^', '8', 'p', \
                    'd', 'v', 'h', '>', '<']
    axprops = dict(xticks=[], yticks=[])
    # ax0 carries the background image, without tick marks
    ax0=fig.add_axes(rect, label='ax0', **axprops)
    imgP = plt.imread('Portland.png')
    ax0.imshow(imgP)
    # ax1 is a transparent overlay for the scatter points
    ax1=fig.add_axes(rect, label='ax1', frameon=False)
    for i in range(numClust):
        # samples assigned to cluster i
        ptsInCurrCluster = datMat[nonzero(clustAssing[:,0].A==i)[0],:]
        # cycle through the marker list if numClust exceeds its length
        markerStyle = scatterMarkers[i % len(scatterMarkers)]
        ax1.scatter(ptsInCurrCluster[:,0].flatten().A[0], ptsInCurrCluster[:,1].flatten().A[0], marker=markerStyle, s=90)
    ax1.scatter(myCentroids[:,0].flatten().A[0], myCentroids[:,1].flatten().A[0], marker='+', s=300)
    plt.show()
if __name__ == '__main__':
    # print(type(datmat))
    # load the tab-delimited 2-D points, one sample per line
    datmat = loadDataSet('testSet.txt')
    # print(datmat[:][0])  # minimum of the first column
    # print(distEclud(datmat[0],datmat[1]))
    #---------------------------
    # datmat = mat(np.random.rand(100,2))
    # print(type(datmat))
    # run bisecting k-means with 4 clusters and plot the result
    clusterClubs(datmat,4)
    # myCentroids,clustAssing = kMeans(datmat,4)