二分K-means的python实现

前两天无意间看到“zouxy09”大牛的《机器学习算法与Python实践之(六)二分k均值聚类》讲解,我感觉很受启发,但是又看到下面的评论里指出了一些不足,于是就抱着试一试的心态自己实现了一下。数据还是用的“zouxy09”的,链接在下面贴出来了。下面放代码啦~

#coding:utf-8
#二分K-means算法
import numpy as np
import random
import math
import matplotlib.pyplot as plt

def readText(inputfile):
    """Load 2-D points from a whitespace-separated text file.

    Only the first two fields of each line are used; any further fields
    are ignored. Blank lines are skipped.

    Args:
        inputfile: Path of the text file to read.

    Returns:
        A list of ``[x, y]`` float pairs, one per non-blank line, in file
        order.

    Raises:
        OSError: If the file cannot be opened.
        IndexError: If a non-blank line has fewer than two fields.
        ValueError: If one of the first two fields is not a valid float.
    """
    dataSet = []
    # The context manager closes the handle even when parsing fails.
    with open(inputfile, "r") as handle:
        for line in handle:
            fields = line.split()
            if not fields:
                # Blank lines (e.g. a trailing newline) carry no data.
                continue
            dataSet.append([float(fields[0]), float(fields[1])])
    return dataSet

def transDataSet(dataSet):
    """Return a fresh float ``ndarray`` holding a copy of a 2-D input.

    Args:
        dataSet: A 2-D array-like exposing ``.shape`` (``np.matrix`` or
            ``np.ndarray``).

    Returns:
        A new ``np.ndarray`` with the same shape and values as ``dataSet``.
    """
    n_rows, n_cols = dataSet.shape
    copied = np.empty((n_rows, n_cols))
    # One broadcast assignment fills every cell, so no default fill is needed.
    copied[...] = dataSet
    return copied

def initClusterpoints(dataSet,k):
    """Pick ``k`` rows of ``dataSet`` at random as initial cluster centres.

    Rows are drawn without replacement whenever the data set has at least
    ``k`` rows, so two centres never start on the same sample (which would
    leave one cluster empty). When ``k`` exceeds the number of rows the
    draw falls back to sampling with replacement.

    Args:
        dataSet: 2-D ``np.matrix`` or ``np.ndarray``, one sample per row.
        k: Number of centres to pick.

    Returns:
        A ``(k, n_columns)`` float ``np.ndarray`` of the chosen rows.

    Raises:
        ValueError: If ``k`` is positive but ``dataSet`` has no rows, or if
            ``k`` is negative.
    """
    hang, lie = dataSet.shape
    if k > 0 and hang == 0:
        raise ValueError("cannot choose initial centroids from an empty data set")

    if k <= hang:
        # Distinct row indices: no duplicated starting centres.
        chosen = random.sample(range(hang), k)
    else:
        # More centres requested than samples available; repeats are
        # unavoidable. randrange never returns the upper bound.
        chosen = [random.randrange(hang) for _ in range(k)]

    clusterPoints = np.zeros((k, lie))
    for slot, index in enumerate(chosen):
        clusterPoints[slot, :] = dataSet[index, :]
    return clusterPoints

def calDis(v1,v2):
    """Return the squared Euclidean distance between two vectors.

    No square root is taken: the result is the sum of the squared
    component-wise differences.
    """
    delta = v1 - v2
    squared = delta ** 2
    return sum(squared)

def calSSE(dataSubset,cleaterPoint):
    """Return the sum of squared errors of a cluster around its centre.

    Args:
        dataSubset: 2-D array/matrix holding the cluster's samples, one per
            row.
        cleaterPoint: 2-D array/matrix whose first row is the cluster
            centre.

    Returns:
        The sum over all rows of ``calDis(row, centre)``; ``0`` when
        ``dataSubset`` has no rows.
    """
    samples = transDataSet(dataSubset)
    centres = transDataSet(cleaterPoint)
    # The centre row is looked up per sample, so an empty subset never
    # touches ``centres``.
    return sum(
        calDis(samples[row, :], centres[0, :])
        for row in range(samples.shape[0])
    )

def bisKmeans(dataSet,k):
    """Cluster ``dataSet`` into ``k`` groups with bisecting K-means.

    All samples start in cluster 0. Until ``k`` clusters exist, the cluster
    with the largest SSE is split in two with ``Kmeans(..., 2)``; one half
    keeps the old label and the other half gets the next unused label.

    Args:
        dataSet: 2-D ``np.matrix`` or ``np.ndarray``, one sample per row.
        k: Desired number of clusters. For ``k < 2`` no split is made.

    Returns:
        A ``(cenList, clusterAssment)`` tuple.

        ``cenList`` maps each cluster label to its centroid. Centroids
        produced by a split are 1-D ``np.ndarray`` rows; the initial
        centroid of label 0 (returned only when no split happens) is a
        nested list ``[[c0, c1, ...]]``.

        ``clusterAssment`` is a float array with the same shape as
        ``dataSet`` whose column 0 holds each sample's cluster label; the
        remaining columns stay zero.
    """
    hang, lie = dataSet.shape
    # np.zeros already puts every sample in cluster 0.
    clusterAssment = np.zeros((hang, lie))

    # ravel() yields a flat vector for both np.matrix (mean is 1 x lie) and
    # np.ndarray (mean is already 1-D) input.
    overallMean = np.asarray(np.mean(dataSet, axis=0), dtype=float).ravel()
    # NOTE: wrapped in a list to keep the original return structure for the
    # no-split case.
    cenList = {0: [overallMean.tolist()]}
    clusterNum = 1

    while clusterNum < k:
        # Find the cluster with the largest SSE; default to cluster 0 when
        # every SSE is below the threshold.
        maxSSE = 0.00001
        maxSSEIndex = 0
        for j in range(clusterNum):
            members = dataSet[np.nonzero(clusterAssment[:, 0] == j)[0]]
            # calSSE needs a 2-D centre; stored centroids are either a
            # nested list or a 1-D row.
            centroid = np.atleast_2d(np.asarray(cenList[j], dtype=float))
            currentSSE = calSSE(members, centroid)

            if currentSSE >= maxSSE:
                maxSSE = currentSSE
                maxSSEIndex = j

        # Split the selected cluster in two.
        memberRows = np.nonzero(clusterAssment[:, 0] == maxSSEIndex)[0]
        currentDataSet = dataSet[memberRows]
        currentClusterPoints, currentClusterAssment = Kmeans(currentDataSet, 2)

        # Relabel the two halves. Both masks are computed before writing so
        # the new labels cannot be mistaken for the local 0/1 labels.
        secondHalf = currentClusterAssment[:, 0] == 1
        firstHalf = currentClusterAssment[:, 0] == 0
        currentClusterAssment[secondHalf, 0] = clusterNum
        currentClusterAssment[firstHalf, 0] = maxSSEIndex
        clusterAssment[memberRows, :] = currentClusterAssment

        # Update the cluster centres.
        cenList[clusterNum] = currentClusterPoints[1, :]
        cenList[maxSSEIndex] = currentClusterPoints[0, :]

        clusterNum += 1

    return cenList, clusterAssment

def Kmeans(dataSet,k):
    """Cluster ``dataSet`` into ``k`` groups with plain K-means.

    Centres start on rows chosen by ``initClusterpoints``. Samples are then
    assigned to the nearest centre (squared Euclidean distance, ties go to
    the lower label) and centres are recomputed, until no assignment
    changes.

    Args:
        dataSet: 2-D ``np.matrix`` or ``np.ndarray``, one sample per row.
        k: Number of clusters.

    Returns:
        A ``(clusterPoints, clusterAssment)`` tuple.

        ``clusterPoints`` is a ``(k, n_columns)`` float array of centres.

        ``clusterAssment`` is a float array with the same shape as
        ``dataSet`` whose column 0 holds each sample's cluster label; the
        remaining columns stay zero.
    """
    clusterPoints = initClusterpoints(dataSet, k)
    new_dataSet = transDataSet(dataSet)
    hang, lie = dataSet.shape
    clusterAssment = np.zeros((hang, lie))
    # -1 marks "not assigned yet", so a first pass that puts every sample
    # into cluster 0 still counts as a change and the loop re-checks the
    # assignment against the updated centres.
    clusterAssment[:, 0] = -1
    clusterChanged = True

    while clusterChanged:
        clusterChanged = False
        # Assign each sample to its nearest centre.
        for i in range(hang):
            # Infinity instead of a fixed cap: a finite sentinel misassigns
            # samples that are farther than the cap from every centre.
            min_dis = float("inf")
            min_index = 0
            for j in range(k):
                distance = calDis(new_dataSet[i, :], clusterPoints[j, :])
                if distance < min_dis:
                    min_dis = distance
                    min_index = j
            if clusterAssment[i, 0] != min_index:
                clusterChanged = True
                clusterAssment[i, 0] = min_index

        # Recompute the cluster centres.
        for j in range(k):
            memberRows = np.nonzero(clusterAssment[:, 0] == j)[0]
            if memberRows.size == 0:
                # The mean of no samples is NaN; keep the previous centre
                # so the cluster can still attract samples later.
                continue
            clusterPoints[j, :] = np.mean(new_dataSet[memberRows], axis=0)

    return clusterPoints, clusterAssment

def showCluster(dataSet, k, centroids, clusterAssment):
    plt.rcParams["font.sans-serif"]=["SimHei"]
    plt.rcParams["axes.unicode_minus"]=False
    numSamples, dim = dataSet.shape
    plt.title(u"K值为%d的聚类结果"%k)
    if dim != 2:
        print ("Sorry! I can not draw because the dimension of your data is not 2!")
        return 1

    mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', ' len(mark):
        print ("Sorry! Your k is too large! please contact Zouxy")
        return 1

    # draw all samples
    for i in range(numSamples):
        markIndex = int(clusterAssment[i, 0])
        plt.plot(dataSet[i, 0], dataSet[i, 1], mark[markIndex])#把所有点先画出来,根据聚类结果来标记不同分类的点。

    mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '


你可能感兴趣的:(数据挖掘)