在 http://blog.csdn.net/zouxy09/article/details/17589329 上看到了聚类分析算法,但它是基于 Python 2.7.5 版本的,直接移植到 Python 3.6 会有问题。更改代码后,为了增加对比性,增加了绘制原始数据的部分;代码如下:
#################################################
# K-means clustering demo
# Runs on Python 3.6
# time: 20170501
# version
# liouwuzhou
# Email : [email protected]
#################################################
import sys

import numpy as np

# Input file used when no path is given on the command line.
DATA_FILE = 'C:/Python/pycharmwork/MachineLearning/一聚类算法/例子2/testSet.txt'


def euclDistance(vector1, vector2):
    """Return the Euclidean distance between two equally-shaped vectors.

    Both arguments may be anything ``numpy.asarray`` accepts (lists, arrays,
    single-row matrices).
    """
    diff = np.asarray(vector2, dtype=float) - np.asarray(vector1, dtype=float)
    return np.sqrt(np.sum(diff ** 2))


def initCentroids(dataSet, k):
    """Pick ``k`` rows of ``dataSet`` at random as the starting centroids.

    Args:
        dataSet: 2-D array-like of shape (numSamples, dim).
        k: number of centroids to draw.

    Returns:
        A new float ``ndarray`` of shape (k, dim).

    Raises:
        ValueError: if ``dataSet`` contains no samples.
    """
    data = np.asarray(dataSet, dtype=float)
    numSamples = data.shape[0]
    if numSamples == 0:
        raise ValueError("dataSet must contain at least one sample")
    # Draw distinct rows whenever there are enough of them, so that two
    # centroids do not start on the very same sample (which would leave one
    # of the clusters empty from the first iteration on).
    picks = np.random.choice(numSamples, size=k, replace=k > numSamples)
    return data[picks, :]


def kmeans(dataSet, k):
    """Cluster the rows of ``dataSet`` into ``k`` groups with Lloyd's k-means.

    Args:
        dataSet: 2-D array-like (or ``numpy.matrix``) of shape
            (numSamples, dim).
        k: number of clusters; must be at least 1.

    Returns:
        A tuple ``(centroids, clusterAssment)`` where ``centroids`` is an
        ``ndarray`` of shape (k, dim) and ``clusterAssment`` is a
        ``numpy.matrix`` of shape (numSamples, 2): column 0 holds the index
        of the cluster each sample belongs to, column 1 the squared distance
        from the sample to that cluster's centroid.

    Raises:
        ValueError: if ``k`` is smaller than 1 or ``dataSet`` is empty.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    data = np.asarray(dataSet, dtype=float)
    numSamples = data.shape[0]
    centroids = initCentroids(data, k)

    # -1 matches no real cluster, so the first pass always counts as a change.
    labels = np.full(numSamples, -1)
    sampleIndex = np.arange(numSamples)

    while True:
        # Squared distance from every sample to every centroid: (numSamples, k).
        diffs = data[:, np.newaxis, :] - centroids[np.newaxis, :, :]
        sqDist = np.sum(diffs ** 2, axis=2)

        # argmin keeps the lowest cluster index on ties.
        newLabels = np.argmin(sqDist, axis=1)
        # Refresh the distance of every sample, not only of those that moved.
        minSqDist = sqDist[sampleIndex, newLabels]

        clusterChanged = not np.array_equal(newLabels, labels)
        labels = newLabels

        for j in range(k):
            pointsInCluster = data[labels == j]
            # An empty cluster keeps its previous centroid; averaging zero
            # points would turn it into NaN.
            if len(pointsInCluster):
                centroids[j, :] = pointsInCluster.mean(axis=0)

        if not clusterChanged:
            break

    print('Congratulations, cluster complete!')
    # asmatrix (not the `mat` alias, which NumPy 2.0 removed) keeps the
    # matrix return type that existing callers index with `.A`.
    clusterAssment = np.asmatrix(np.column_stack((labels, minSqDist)))
    return centroids, clusterAssment


def showCluster(dataSet, k, centroids, clusterAssment):
    """Plot 2-D samples coloured by cluster, plus the cluster centroids.

    Args:
        dataSet: 2-D array-like of shape (numSamples, 2).
        k: number of clusters (at most 10, the number of marker styles).
        centroids: array-like of shape (k, 2).
        clusterAssment: array-like whose column 0 holds each sample's
            cluster index, as returned by ``kmeans``.

    Returns:
        1 if nothing could be drawn (data is not 2-D or ``k`` is too large),
        otherwise ``None``. The figure is created but not shown; call
        ``matplotlib.pyplot.show()`` afterwards.
    """
    points = np.asarray(dataSet)
    dim = points.shape[1]
    if dim != 2:
        print("Sorry! I can not draw because the dimension of your data is not 2!")
        return 1

    sampleMarks = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    if k > len(sampleMarks):
        print("Sorry! Your k is too large! please contact Zouxy")
        return 1

    # Imported only once there is something to draw, so the clustering
    # functions above stay usable without matplotlib.
    import matplotlib.pyplot as plt

    plt.figure()
    labels = np.asarray(clusterAssment)[:, 0].astype(int)
    # One plot call per cluster instead of one per sample.
    for j in range(k):
        members = points[labels == j]
        plt.plot(members[:, 0], members[:, 1], sampleMarks[j])

    centroidMarks = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
    centers = np.asarray(centroids)
    for j in range(k):
        plt.plot(centers[j, 0], centers[j, 1], centroidMarks[j], markersize=12)


def 画原图(dataSet):
    """Scatter-plot the raw, unclustered samples in a new figure.

    Args:
        dataSet: sequence of ``[x, y]`` pairs (or a 2-D array); only the
            first two columns are drawn.
    """
    import matplotlib.pyplot as plt

    plt.figure()
    points = np.asarray(dataSet, dtype=float)
    if points.size:
        plt.scatter(points[:, 0], points[:, 1])


def main():
    """Load the sample file, cluster it with k = 3 and display both plots.

    The input path is taken from the first command-line argument when one is
    given, otherwise ``DATA_FILE`` is used.
    """
    import matplotlib.pyplot as plt

    path = sys.argv[1] if len(sys.argv) > 1 else DATA_FILE

    print("step 1: load data...")
    dataSet = []
    with open(path) as fileIn:
        for line in fileIn:
            # The original Python 2 code was not compatible with Python 3.6
            # here; splitting on whitespace like this is the required change.
            lineArr = line.strip().split()
            if not lineArr:
                continue  # tolerate blank lines in the data file
            dataSet.append([float(lineArr[0]), float(lineArr[1])])
    画原图(dataSet)

    print("step 2: clustering...")
    dataSet = np.asmatrix(dataSet)
    k = 3
    centroids, clusterAssment = kmeans(dataSet, k)

    print("step 3: show the result...")
    showCluster(dataSet, k, centroids, clusterAssment)
    plt.show()


if __name__ == "__main__":
    main()