Clustering Algorithms in Python 3.6: K-means in Practice

The clustering code at http://blog.csdn.net/zouxy09/article/details/17589329 targets Python 2.7.5 and does not run unmodified under Python 3.6. After porting it, I also plot the original (unclustered) data so it can be compared against the clustering result. Since the script reads a local testSet.txt, a small sketch for generating such a file comes first; the full ported script follows after it.
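The loading loop in the script expects testSet.txt to contain one 2-D point per line as two whitespace-separated floats. If you do not have the file from the original post, a stand-in can be generated roughly like this; the number of points, the three Gaussian blob centers, and the output path are assumptions of this sketch, not data from the original article.

# make_testset.py : write a synthetic 2-D data set in testSet.txt format
# (two whitespace-separated floats per line). A stand-in, not the original data.
import numpy as np

np.random.seed(0)
centers = [(-3.0, -3.0), (0.0, 3.0), (3.0, -1.0)]  # assumed cluster centers
points = np.vstack([np.random.randn(50, 2) + c for c in centers])
np.savetxt('testSet.txt', points, fmt='%.6f')       # adjust the path to match the script below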

#################################################
# Runs under Python 3.6
# Date   : 2017-05-01
# Author : liouwuzhou
# Email  : [email protected]
#################################################

from numpy import *
import matplotlib.pyplot as plt

# Euclidean distance between two row vectors
def euclDistance(vector1, vector2):
    return sqrt(sum(power(vector2 - vector1, 2)))

# Pick k random samples from the data set as the initial centroids
def initCentroids(dataSet, k):
    numSamples, dim = dataSet.shape
    centroids = zeros((k, dim))
    for i in range(k):
        index = int(random.uniform(0, numSamples))
        centroids[i, :] = dataSet[index, :]
    return centroids

def kmeans(dataSet, k):
    numSamples = dataSet.shape[0]
    # clusterAssment column 0: index of the assigned cluster
    # clusterAssment column 1: squared distance to that cluster's centroid
    clusterAssment = mat(zeros((numSamples, 2)))
    clusterChanged = True
    centroids = initCentroids(dataSet, k)
    while clusterChanged:
        clusterChanged = False
        # Step 1: assign every sample to its nearest centroid
        for i in range(numSamples):
            minDist = 100000.0
            minIndex = 0
            for j in range(k):
                distance = euclDistance(centroids[j, :], dataSet[i, :])
                if distance < minDist:
                    minDist = distance
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
                clusterAssment[i, :] = minIndex, minDist ** 2
        # Step 2: recompute each centroid as the mean of its assigned points
        for j in range(k):
            pointsInCluster = dataSet[nonzero(clusterAssment[:, 0].A == j)[0]]
            if len(pointsInCluster) > 0:  # skip empty clusters to avoid a nan centroid
                centroids[j, :] = mean(pointsInCluster, axis=0)
    print('Congratulations, cluster complete!')
    return centroids, clusterAssment
def showCluster(dataSet, k, centroids, clusterAssment):
    plt.figure()
    numSamples, dim = dataSet.shape
    if dim != 2:
        print("Sorry! I cannot draw because the dimension of your data is not 2!")
        return 1
    # marker styles for the samples: shape + colour, one per cluster
    mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    if k > len(mark):
        print("Sorry! Your k is too large! please contact Zouxy")
        return 1
    # draw every sample with the marker of the cluster it belongs to
    for i in range(numSamples):
        markIndex = int(clusterAssment[i, 0])
        plt.plot(dataSet[i, 0], dataSet[i, 1], mark[markIndex])
    # marker styles for the centroids
    mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
    for i in range(k):
        plt.plot(centroids[i, 0], centroids[i, 1], mark[i], markersize=12)

# Plot the raw, unclustered data for comparison with the clustering result
def 画原图(dataSet):
    plt.figure()
    numSamples = len(dataSet)
    for i in range(numSamples):
        plt.scatter(dataSet[i][0], dataSet[i][1])

# Test code
print("step 1: load data...")
dataSet = []
fileIn = open('C:/Python/pycharmwork/MachineLearning/一聚类算法/例子2/testSet.txt')
for line in fileIn.readlines():
    lineArr = line.strip().split()  # changed from the Python 2.7 version so it runs under Python 3.6
    dataSet.append([float(lineArr[0]), float(lineArr[1])])
fileIn.close()
画原图(dataSet)
print("step 2: clustering...")
dataSet = mat(dataSet)
k = 3
centroids, clusterAssment = kmeans(dataSet, k)
print("step 3: show the result...")
showCluster(dataSet, k, centroids, clusterAssment)
plt.show()
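
Because kmeans() stores each sample's squared distance to its assigned centroid in the second column of clusterAssment, you can also print the within-cluster sum of squared errors (SSE) as a rough quality check. The snippet below is my own addition, not part of the original post; the loop over a few values of k (the usual "elbow" heuristic) is only a sketch and re-runs kmeans with new random initial centroids each time.

# Within-cluster SSE: clusterAssment[:, 1] already holds the squared distances.
sse = sum(clusterAssment[:, 1])
print("within-cluster SSE for k = %d: %f" % (k, sse))

# Optional rough "elbow" check: compare SSE for a few values of k.
# Results vary between runs because the initial centroids are random.
for testK in range(2, 6):
    _, testAssment = kmeans(dataSet, testK)
    print("k = %d, SSE = %f" % (testK, sum(testAssment[:, 1])))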

