问题描述:一堆二维数据,用kmeans算法对其进行聚类,下面例子以分k=3为例。
原数据:
1.5,3.1
2.2,2.9
3,4
2,1
15,25
43,13
32,42
0,0
8,9
12,5
9,12
11,8
22,33
24,25
实现代码:
#coding:utf-8
from numpy import *
import string
import math
def loadDataSet(filename):
dataMat = []
fr = open(filename)
for line in fr.readlines():
element = line.strip('\n').split(',')
number = []
for i in range(len(element)):
number.append(string.atof(element[i]))
dataMat.append(number)
return dataMat
def distEclud(vecA, vecB):
count = len(vecA)
s = 0.0
for i in range(0, count):
s = s + power(vecA[i]-vecB[i], 2)
return sqrt(s)
def clusterOfElement(means, element):
min_dist = distEclud(means[0], element)
lable = 0
for index in range(1, len(means)):
dist = distEclud(means[index], element)
if(dist < min_dist):
min_dist = dist
lable = index
return lable
def getMean(cluster):#cluster=[[[1,2],[1,2],[1,2]....],[[2,1],[2,1],[2,1],[2,1]...]]
num = len(cluster)#1个簇的num,如上为3个
res = []
temp = 0
dim = len(cluster[0])
for i in range(0, dim):
for j in range(0, num):
temp = temp + cluster[j][i]
temp = temp / num
res.append(temp)
return res
def kMeans():
k = 3
data = loadDataSet('data.txt')
print "data is ", data
inite_mean = [[1.1, 1], [1, 1],[1,2]]
count = 0
while(count < 1000):
count = count + 1
clusters = []
means = []
for i in range(k):
clusters.append([])
means.append([])
for index in range(len(data)):
lable = clusterOfElement(inite_mean, data[index])
clusters[lable].append(data[index])
for cluster_index in range(k):
mea = getMean(clusters[cluster_index])
for mean_dim in range(len(mea)):
means[cluster_index].append(mea[mean_dim])
for mm in range(len(means)):
for mmm in range(len(means[mm])):
inite_mean[mm][mmm] = means[mm][mmm]
print "result cluster is ", clusters
print "result means is ", inite_mean
kMeans()