根据类别未知(没有被标记)的训练样本解决模式识别中的各种问题,称之为无监督学习。
我的理解:给出样本,我们忽略所有样本的类别标签,而直接让程序自己去分类
用户给出一个正整数k,程序将样本分为k类。
该思路的代码实现对应下文代码的函数kMeans
缺点:k个质心是随机给出的,对结果影响很大。
上述普通的k均值聚类,初始随机质心对结果影响太大。
定义SSE(Sum of Squared Error)为误差平方和(每个样本点到质心的距离的平方,之和),显然:SSE越小,聚类结果越好。
初始所有样本点属于1个簇。
若当前已有x个簇,则选择一个簇i,将簇i使用kMeans函数一分为二,满足划分后的SSE最小。至此我们的簇数变成x+1,重复此过程,直到簇数达到k
实际测试中,我发现二分优化后的K均值聚类并没有进步太多。查阅资料,说是数据集越接近于超球体分布,即不同类别均匀分布于一个超球体上,且间距尽可能大,这时候的k均值聚类才能体现出优越性。
【代码实现python】
import math
import numpy
def loadDataSet(filename):
    """Load a tab-separated numeric data file.

    Each line of the file is one sample: float fields separated by tabs.

    Args:
        filename: path to the text file.

    Returns:
        A list of samples, each a list of floats.
    """
    dataSet = []
    # `with` guarantees the file handle is closed (the original leaked it).
    with open(filename) as fr:
        for line in fr:
            curLine = line.strip().split('\t')
            dataSet.append([float(i) for i in curLine])
    return dataSet
# Euclidean distance between two row vectors.
# math.sqrt requires a scalar and returns a plain Python float, whereas
# numpy.sqrt returns an object matching its argument's type.
def distEclud(vecA, vecB):
    """Return the Euclidean distance between vectors vecA and vecB."""
    diff = vecA - vecB
    # numpy.multiply is elementwise even for numpy.matrix operands
    # (the ** operator on a matrix would mean matrix power instead).
    return math.sqrt(numpy.sum(numpy.multiply(diff, diff)))
# Produce k random initial centroids.
def randCenter(dataMat, k):
    """Draw k random centroids inside the bounding box of the data.

    For each feature column, k values are drawn uniformly between that
    column's minimum and maximum over all samples.
    """
    numFeatures = numpy.shape(dataMat)[1]
    centroids = numpy.mat(numpy.zeros((k, numFeatures)))  # k x n zero matrix
    for col in range(numFeatures):  # one uniform draw range per column
        lo = numpy.min(dataMat[:, col])
        span = numpy.max(dataMat[:, col]) - lo
        centroids[:, col] = lo + span * numpy.random.rand(k, 1)
    return centroids
def kMeans(dataSet, k, distMeas=distEclud, createCent=randCenter):
    """Classic k-means clustering.

    Args:
        dataSet: m x n data (list of lists or matrix), one sample per row.
        k: number of clusters.
        distMeas: distance function between two row vectors.
        createCent: function (dataMat, k) -> k x n initial centroid matrix.

    Returns:
        (centroids, clusterAssment): the k x n centroid matrix, and an
        m x 2 matrix holding per sample [assigned cluster index,
        squared distance to its centroid].
    """
    dataMat = numpy.mat(dataSet)
    m = numpy.shape(dataMat)[0]
    clusterAssment = numpy.mat(numpy.zeros((m, 2)))  # [cluster idx, dist^2]
    centroids = createCent(dataMat, k)  # k random initial centroids
    clusterChanged = True
    while clusterChanged:  # iterate until no sample changes cluster
        clusterChanged = False
        # Assignment step: move each sample to its nearest centroid.
        for i in range(m):
            minDist = numpy.inf
            minIndex = -1
            for j in range(k):
                distJI = distMeas(centroids[j], dataMat[i])
                if distJI < minDist:
                    minDist = distJI
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist ** 2
        # Update step: recompute every centroid as the mean of its members.
        for cent in range(k):
            # .A converts the boolean matrix to a plain ndarray so that
            # nonzero() yields flat row indices (consistent with biKMeans;
            # the original omitted .A here, which breaks fancy indexing
            # on numpy.matrix).
            curIndex = numpy.nonzero(clusterAssment[:, 0].A == cent)[0]
            if len(curIndex) == 0:
                # Empty cluster: keep the old centroid instead of taking
                # the mean of zero rows (which would produce NaN).
                continue
            ptsInCluster = dataMat[curIndex]
            centroids[cent, :] = numpy.mean(ptsInCluster, axis=0)
    # Returns centroids plus, per sample, its cluster index and dist^2.
    return centroids, clusterAssment
def biKMeans(dataSet, k, distMeas=distEclud):
    """Bisecting k-means: repeatedly split the cluster whose 2-way split
    minimizes the total SSE (sum of squared errors) until k clusters exist.

    Args:
        dataSet: m x n data, one sample per row.
        k: target number of clusters.
        distMeas: distance function between two row vectors.

    Returns:
        (centroids, clusterAssment): k x n centroid matrix, and an m x 2
        matrix of per-sample [cluster index, squared distance].
    """
    dataMat = numpy.mat(dataSet)
    m = numpy.shape(dataMat)[0]
    clusterAssment = numpy.mat(numpy.zeros((m, 2)))  # [cluster idx, dist^2]
    # Start with one cluster whose centroid is the global mean.
    centroid0 = numpy.mean(dataMat, axis=0)
    centList = centroid0.tolist()  # plain list of centroid coordinate lists
    for j in range(m):  # initial squared distances to the single centroid
        clusterAssment[j, 1] = distMeas(centroid0, dataMat[j]) ** 2
    while len(centList) < k:
        lowestSSE = numpy.inf
        for i in range(len(centList)):  # try splitting each existing cluster
            # Samples currently assigned to cluster i.
            ptsInCurrCluster = dataMat[numpy.nonzero(clusterAssment[:, 0].A == i)[0], :]
            # Tentatively split cluster i into two via plain k-means.
            centroidMat, splitClusterAss = kMeans(ptsInCurrCluster, k=2)
            sseSplit = numpy.sum(splitClusterAss[:, 1])  # SSE of the split part
            # SSE of all samples outside cluster i (.A for flat indices,
            # consistent with the lookup above; the original omitted it here).
            sseNotSplit = numpy.sum(clusterAssment[numpy.nonzero(clusterAssment[:, 0].A != i)[0], 1])
            if sseSplit + sseNotSplit < lowestSSE:  # new best total SSE
                bestCentToSplit = i       # cluster index to split
                bestNewCents = centroidMat  # the two centroids replacing it
                bestClusterAss = splitClusterAss
                lowestSSE = sseSplit + sseNotSplit
        # Commit the best split: relabel its sub-clusters with global ids.
        # Order matters: relabel sub-cluster 1 first; if 0 were rewritten
        # to bestCentToSplit first, those rows could then wrongly match
        # the `== 1` test below.
        bestClusterAss[numpy.nonzero(bestClusterAss[:, 0].A == 1)[0], 0] = len(centList)
        bestClusterAss[numpy.nonzero(bestClusterAss[:, 0].A == 0)[0], 0] = bestCentToSplit
        clusterAssment[numpy.nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0], :] = bestClusterAss
        # Centroid 0 overwrites the split cluster's slot; centroid 1 is
        # appended as the new cluster (note tolist() converts mat -> list).
        centList[bestCentToSplit] = bestNewCents.tolist()[0]
        centList.append(bestNewCents.tolist()[1])
    return numpy.mat(centList), clusterAssment
# Visualize the clustering result as a scatter plot.
def plotter(dataMat, centroids, k, clusterAss=None):
    """Scatter-plot samples colored by cluster plus the centroids.

    Args:
        dataMat: m x 2 sample matrix (only two features are plotted).
        centroids: k x 2 centroid matrix.
        k: number of clusters.
        clusterAss: m x 2 assignment matrix from kMeans/biKMeans. Defaults
            to the module-level `clusterAssment` for backward compatibility
            (the original read an undeclared global, which raised NameError
            when called before the __main__ block defined it).
    """
    import matplotlib.pyplot as plt
    if clusterAss is None:
        clusterAss = globals()['clusterAssment']  # legacy global lookup
    ax = plt.figure().add_subplot(111)  # single subplot
    col = ['red', 'blue', 'yellow', 'green', 'pink', 'black']
    for i in range(k):
        # .A gives flat row indices for fancy indexing on numpy.matrix.
        arr = numpy.nonzero(clusterAss[:, 0].A == i)[0]
        # Modulo cycles the palette so k > 6 no longer raises IndexError.
        ax.scatter(dataMat[arr, 0].tolist(), dataMat[arr, 1].tolist(),
                   color=col[i % len(col)])
    ax.scatter(centroids[:, 0].flatten().A[0],
               centroids[:, 1].flatten().A[0], s=200)
    plt.show()
if __name__ == "__main__":
dataSet = loadDataSet('testSet2.txt')
centroids, clusterAssment = kMeans(dataSet, 3)
plotter(numpy.mat(dataSet), centroids, 3)
数据集testSet2.txt
3.275154 2.957587
-3.344465 2.603513
0.355083 -3.376585
1.852435 3.547351
-2.078973 2.552013
-0.993756 -0.884433
2.682252 4.007573
-3.087776 2.878713
-1.565978 -1.256985
2.441611 0.444826
-0.659487 3.111284
-0.459601 -2.618005
2.177680 2.387793
-2.920969 2.917485
-0.028814 -4.168078
3.625746 2.119041
-3.912363 1.325108
-0.551694 -2.814223
2.855808 3.483301
-3.594448 2.856651
0.421993 -2.372646
1.650821 3.407572
-2.082902 3.384412
-0.718809 -2.492514
4.513623 3.841029
-4.822011 4.607049
-0.656297 -1.449872
1.919901 4.439368
-3.287749 3.918836
-1.576936 -2.977622
3.598143 1.975970
-3.977329 4.900932
-1.791080 -2.184517
3.914654 3.559303
-1.910108 4.166946
-1.226597 -3.317889
1.148946 3.345138
-2.113864 3.548172
0.845762 -3.589788
2.629062 3.535831
-1.640717 2.990517
-1.881012 -2.485405
4.606999 3.510312
-4.366462 4.023316
0.765015 -3.001270
3.121904 2.173988
-4.025139 4.652310
-0.559558 -3.840539
4.376754 4.863579
-1.874308 4.032237
-0.089337 -3.026809
3.997787 2.518662
-3.082978 2.884822
0.845235 -3.454465
1.327224 3.358778
-2.889949 3.596178
-0.966018 -2.839827
2.960769 3.079555
-3.275518 1.577068
0.639276 -3.412840