【聚类算法】K-means个人实现

关于K的选择

K-Means 关于 K 的选择,常用的方法是肘部法则;对此我们自己实现如下:

#肘部法则
#-*- coding:utf-8 -*-  
import numpy as np  
import matplotlib.pyplot as plt  
from sklearn.cluster import KMeans  
from scipy.spatial.distance import cdist  

# Sample data: x and y hold the coordinates of 14 two-dimensional points.
x = np.array([1, 2, 3, 1, 5, 6, 5, 5, 6, 7, 8, 9, 7, 9])  
y = np.array([1, 3, 2, 2, 8, 6, 7, 6, 7, 1, 2, 1, 1, 3])  
data = np.array(list(zip(x, y)))   # pair x and y up into a (14, 2) array of points

# Alternative inputs kept for experimentation:
# data = np.array(np.random.rand(100,2))
# data = np.array(datmat)   # pack into an array

#data = ([1,1],[2,3],[3,2],[1,2],[5,8])
# Elbow method: find the best number of clusters.
# The optimal K-Means parameters are also found by minimising a cost function.
# The cost is the sum of the distortions of all clusters; a cluster's distortion is the sum of squared distances between its centroid and its members.
def K_choose():
    """Pick a cluster count for the module-level ``data`` with the elbow method.

    Fits ``KMeans`` for k = 1..9, records for each k the mean Euclidean
    distance from every point to its nearest cluster centre, and plots that
    curve. The elbow is taken to be the k at which the absolute difference
    between two consecutive drops of the curve is largest.

    Returns:
        The selected k (between 2 and 8), or None if no entry compares equal
        to the maximum.
    """
    K = range(1, 10)
    aa = []
    for k in K:  # elbow-method sweep
        kmeans = KMeans(n_clusters=k)
        kmeans.fit(data)
        # Distance from each sample to its closest centre.
        nearest = np.min(cdist(data, kmeans.cluster_centers_, 'euclidean'), axis=1)
        aa.append(sum(nearest) / data.shape[0])

    plt.figure()
    plt.plot(np.array(K), aa, 'bx-')  # x axis: k, y axis: mean distortion
    plt.show()

    m = []
    for i in range(1, 8):
        print(i-1, i, i+1)
        dropBefore = aa[i-1] - aa[i]
        dropAfter = aa[i] - aa[i+1]
        m.append(abs(dropAfter - dropBefore))
    t = max(m)
    print(t)
    # m[j] describes the bend at aa[j + 1], which was computed with k = j + 2.
    return next((pos + 2 for pos, bend in enumerate(m) if t == bend), None)

#绘制散点图及聚类结果中心点  
def Kmeans():
    """Cluster the module-level sample points and plot them with their centres.

    The number of clusters is chosen by ``K_choose()`` (elbow method) and
    printed. The sample points are drawn as black dots and the fitted
    cluster centres as red dots on a 0..10 by 0..10 grid.
    """
    k = K_choose()
    print(k)
    plt.figure()
    plt.axis([0, 10, 0, 10])
    plt.grid(True)
    plt.plot(x, y, 'k.')
    # Use the k selected by the elbow method; this was previously hard-coded
    # to 3, so the result of K_choose() was computed and then ignored.
    kmeans = KMeans(n_clusters=k)
    kmeans.fit(data)
    centers = kmeans.cluster_centers_
    plt.plot(centers[:, 0], centers[:, 1], 'r.')
    plt.show()
# Run the demo unconditionally when this script is executed.
# NOTE(review): the guard below is commented out; if it is restored it should
# compare with `==` rather than `is`.
# if __name__ is '__main__':
Kmeans()

被注释掉的代码请忽略,中文注释算是比较详细。下面的代码在书上版本的基础上做了修改,因为 Python 已经到 3.6 了:
1)map 的行为变了(Python 3 中 map 返回的是迭代器,而不是列表)
2)原书代码中矩阵和 list 的定义有点混乱

'''
Created on Feb 16, 2011
Modify on Mar 27, 2018
k Means Clustering for Ch10 of Machine Learning in Action
@author: Peter Harrington ---++++---YM 
'''
from numpy import *
import numpy as np


def loadDataSet(fileName):
    """Parse a tab-delimited text file of 2-D points into a numpy matrix.

    Each non-blank line must contain at least two tab-separated fields; the
    first two are converted to float and any further fields are ignored.
    Blank lines are skipped.

    Args:
        fileName: Path of the text file to read.

    Returns:
        A numpy matrix with one row per data line and two columns.

    Raises:
        ValueError: If one of the first two fields is not a valid float.
        IndexError: If a non-blank line has fewer than two fields.
    """
    dataMat = []
    # The context manager closes the file even if parsing fails.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                continue  # tolerate empty / trailing blank lines
            curLine = stripped.split('\t')
            # In Python 3 map() returns a lazy iterator, so the two
            # coordinates are converted explicitly instead.
            dataMat.append([float(curLine[0]), float(curLine[1])])
    return mat(dataMat)
  
  
# calculate Euclidean distance  
def euclDistance(vector1, vector2):
    """Return the Euclidean distance between two points.

    Both arguments are numpy arrays or matrices that can be subtracted from
    each other. ``sum``, ``power`` and ``sqrt`` are the numpy functions
    brought in by ``from numpy import *``.
    """
    delta = vector2 - vector1
    squaredTotal = sum(power(delta, 2))
    return sqrt(squaredTotal)
  
# init centroids with random samples  
#在样本集中随机选取k个样本点作为初始质心
def initCentroids(dataSet, k):
    """Pick k rows of dataSet at random to serve as initial centroids.

    When k does not exceed the number of samples, the rows are drawn without
    replacement, so the same sample is never used for two centroids (two
    identical centroids would leave one of them with no members). If more
    centroids than samples are requested, rows are drawn with replacement.

    Args:
        dataSet: 2-D numpy array or matrix, one sample per row.
        k: Number of centroids to create.

    Returns:
        A float ndarray of shape (k, number of columns of dataSet).
    """
    numSamples, dim = dataSet.shape
    centroids = zeros((k, dim))
    if k <= numSamples:
        indices = random.choice(numSamples, size=k, replace=False)
    else:
        # Not enough distinct samples: repeats are unavoidable.
        indices = random.randint(0, numSamples, size=k)
    centroids[:] = asarray(dataSet)[indices, :]
    return centroids
  
# k-means cluster 
#dataSet为一个矩阵
#k为将dataSet矩阵中的样本分成k个类 
def kMeans(dataSet, k):
    """Partition the rows of dataSet into k clusters with k-means.

    Args:
        dataSet: 2-D numpy matrix, one sample per row.
        k: Number of clusters.

    Returns:
        A tuple ``(centroids, clusterAssment)``:
        centroids is a matrix with one centroid per row;
        clusterAssment is a matrix with one row per sample whose first
        column is the index of the assigned cluster and whose second column
        is the squared distance from the sample to that cluster's centroid.
    """
    numSamples = dataSet.shape[0]
    clusterAssment = mat(zeros((numSamples, 2)))
    # Start from an invalid cluster index so the first pass always counts as
    # a change; the loop then runs at least once more against the updated
    # centroids, keeping the error column consistent with what is returned.
    clusterAssment[:, 0] = -1
    clusterChanged = True

    # Step 1: use k randomly selected samples as the initial centroids.
    centroids = initCentroids(dataSet, k)

    while clusterChanged:
        clusterChanged = False
        for i in range(numSamples):
            # inf rather than a fixed large number, so arbitrarily distant
            # samples are still matched to their true nearest centroid.
            minDist = inf
            minIndex = 0
            # Step 2: find the centroid closest to sample i.
            for j in range(k):
                distance = euclDistance(centroids[j, :], dataSet[i, :])
                if distance < minDist:
                    minDist = distance
                    minIndex = j
            # Step 3: record the assignment. Iteration stops once no sample
            # changes cluster during a full pass.
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            # Always refresh the squared error: centroids move between
            # passes even when a sample stays in the same cluster.
            clusterAssment[i, :] = minIndex, minDist ** 2

        # Step 4: move every centroid to the mean of its members.
        for j in range(k):
            memberRows = nonzero(clusterAssment[:, 0].A == j)[0]
            pointsInCluster = dataSet[memberRows]
            # An empty cluster keeps its previous centroid instead of
            # becoming NaN (the mean of zero rows).
            if pointsInCluster.shape[0] > 0:
                centroids[j, :] = mean(pointsInCluster, axis=0)

    print ('Congratulations, cluster complete!')
    print(type(centroids),type(clusterAssment))
    return mat(centroids), clusterAssment
  


def biKmeans(dataSet, k):
    """Cluster dataSet into k groups with bisecting k-means.

    Starts with a single cluster centred on the mean of all samples, then
    repeatedly tries a 2-means split of every existing cluster and keeps the
    split that yields the lowest total of the error column, until k
    centroids exist.

    Args:
        dataSet: 2-D numpy matrix, one sample per row.
        k: Number of clusters to produce.

    Returns:
        A tuple ``(centroids, clusterAssment)``: a matrix of centroids (one
        per row) and a matrix with one row per sample holding the cluster
        index in column 0 and the error value in column 1 (initially the
        squared distance to the global mean, later the values reported by
        kMeans for the chosen split).
    """
    numPoints = shape(dataSet)[0]
    clusterAssment = mat(zeros((numPoints, 2)))

    # One initial cluster: every sample belongs to cluster 0.
    globalCentroid = mean(dataSet, axis=0).tolist()[0]
    centList = [globalCentroid]
    globalCentroidMat = mat(globalCentroid)
    for row in range(numPoints):
        clusterAssment[row, 1] = euclDistance(globalCentroidMat, dataSet[row]) ** 2

    while len(centList) < k:
        lowestSSE = inf  # best (smallest) total error seen in this round
        for candidate in range(len(centList)):
            labels = clusterAssment[:, 0].A
            insideRows = nonzero(labels == candidate)[0]
            outsideRows = nonzero(labels != candidate)[0]
            # Trial split of the candidate cluster into two.
            ptsInCurrCluster = dataSet[insideRows, :]
            centroidMat, splitClustAss = kMeans(ptsInCurrCluster, 2)
            sseSplit = sum(splitClustAss[:, 1])
            sseNotSplit = sum(clusterAssment[outsideRows, 1])
            print("sseSplit, and notSplit: ",sseSplit,sseNotSplit)
            if (sseSplit + sseNotSplit) < lowestSSE:
                bestCentToSplit = candidate
                bestNewCents = centroidMat
                bestClustAss = splitClustAss.copy()
                lowestSSE = sseSplit + sseNotSplit

        # Relabel the two halves: sub-cluster 1 gets a brand new index,
        # sub-cluster 0 inherits the index of the cluster that was split.
        secondHalfRows = nonzero(bestClustAss[:, 0].A == 1)[0]
        firstHalfRows = nonzero(bestClustAss[:, 0].A == 0)[0]
        bestClustAss[secondHalfRows, 0] = len(centList)
        bestClustAss[firstHalfRows, 0] = bestCentToSplit
        print('the bestCentToSplit is: ',bestCentToSplit)
        print('the len of bestClustAss is: ', len(bestClustAss))

        # Replace the split centroid with the first new one, append the second.
        centList[bestCentToSplit] = bestNewCents[0, :].tolist()[0]
        centList.append(bestNewCents[1, :].tolist()[0])

        # Write the new labels and errors back for the samples that were split.
        splitRows = nonzero(clusterAssment[:, 0].A == bestCentToSplit)[0]
        clusterAssment[splitRows, :] = bestClustAss
    return mat(centList), clusterAssment


import matplotlib
import matplotlib.pyplot as plt
def clusterClubs(datMat,numClust):
    """Cluster datMat with bisecting k-means and plot the result over a map.

    The image 'Portland.png' is drawn on a tick-less background axes; a
    second, frameless axes occupying the same rectangle shows every cluster
    with its own marker and the centroids as large '+' markers.

    Args:
        datMat: 2-D numpy matrix of points, one per row (x in column 0,
            y in column 1).
        numClust: Number of clusters to compute and draw.
    """
    # Clustering is done with bisecting k-means.
    myCentroids, clustAssing = biKmeans(datMat, numClust)

    figure = plt.figure()
    axesRect = [0.1, 0.1, 0.8, 0.8]
    markerCycle = ['s', 'o', '^', '8', 'p', 'd', 'v', 'h', '>', '<']

    # Background layer: the map image, without ticks.
    backgroundAxes = figure.add_axes(axesRect, label='ax0', xticks=[], yticks=[])
    backgroundAxes.imshow(plt.imread('Portland.png'))

    # Foreground layer: the clustered points.
    overlayAxes = figure.add_axes(axesRect, label='ax1', frameon=False)
    for clusterId in range(numClust):
        memberRows = nonzero(clustAssing[:, 0].A == clusterId)[0]
        members = datMat[memberRows, :]
        xs = members[:, 0].flatten().A[0]
        ys = members[:, 1].flatten().A[0]
        marker = markerCycle[clusterId % len(markerCycle)]
        overlayAxes.scatter(xs, ys, marker=marker, s=90)

    centroidXs = myCentroids[:, 0].flatten().A[0]
    centroidYs = myCentroids[:, 1].flatten().A[0]
    overlayAxes.scatter(centroidXs, centroidYs, marker='+', s=300)
    plt.show()


if __name__ == '__main__':    
    # print(type(datmat))
    # Load the tab-delimited sample points from 'testSet.txt' into a matrix.
    datmat = loadDataSet('testSet.txt')
    # print(datmat[:][0])# original note: "minimum of the first column" -- NOTE(review): this expression indexes the first row
    # print(distEclud(datmat[0],datmat[1]))  
    #---------------------------  
    # Alternative input: 100 random 2-D points.
    # datmat = mat(np.random.rand(100,2))
    # print(type(datmat))
    # Cluster into 4 groups with bisecting k-means and plot the result.
    clusterClubs(datmat,4) 
    # Plain k-means alternative:
    # myCentroids,clustAssing = kMeans(datmat,4)

你可能感兴趣的:(机器学习实战)