kmeans python版

好久没更新了,这一阵在学python和数据挖掘,晚上下班无聊用python实现了一下kmeans,发上来占个坑吧。

这个版本支持对多个不同维度的向量聚类,但是其坐标必须是整数(读取输入时用int解析)。由于本人刚接触python,对其特性的理解还很欠缺,代码风格也可能还带有很重的java和c的痕迹,还望大伙多多指正。

下面这个是对向量类的定义(保存为 point.py,后面的kmeans代码通过 from point import Point 引用它):

import math

class Point(object):
	"""A vector of numeric coordinates with the operations k-means needs.

	The coordinate sequence is stored unchanged in the ``point`` attribute.
	Points of different dimensionality may coexist; operations that mix
	dimensions use MAX_DIST as a "very far away" sentinel instead of
	raising an error.
	"""

	# Sentinel magnitude used when two points do not share a dimensionality.
	MAX_DIST = 1<<60

	def __init__(self,v):
		"""Wrap the coordinate sequence *v* (for example a tuple of numbers)."""
		self.point = v

	def __repr__(self):
		return 'Point(%r)' % (self.point,)

	def dist(self,other):
		"""Return the Euclidean distance between this point and *other*.

		If the two points have a different number of coordinates, return
		MAX_DIST multiplied by the larger coordinate count instead, so the
		pair always looks extremely far apart.
		"""
		pa = self.point
		pb = other.point
		if len(pa) != len(pb):
			return Point.MAX_DIST*max(len(pa),len(pb))
		return math.sqrt(sum((a-b)*(a-b) for a,b in zip(pa,pb)))

	def __add__(self,other):
		"""Return a new Point holding the coordinate-wise sum.

		When the operands differ in length, the result has the length of the
		longer one and every coordinate missing from the shorter operand is
		filled with MAX_DIST.
		"""
		pa = self.point
		pb = other.point
		# zip stops at the shorter operand, i.e. the shared coordinates
		res = [a+b for a,b in zip(pa,pb)]
		res.extend([Point.MAX_DIST]*(max(len(pa),len(pb))-len(res)))
		return Point(tuple(res))

	def __div__(self,num):
		"""Return a new Point with every coordinate divided by *num*.

		The division is always a float division. A zero *num* raises
		ZeroDivisionError.
		"""
		return Point(tuple(1.0*n/num for n in self.point))

	# Python 3 (and Python 2 under ``from __future__ import division``)
	# dispatches the ``/`` operator to __truediv__ rather than __div__.
	__truediv__ = __div__

if __name__ == '__main__':
	# Small smoke test of the Point operations. The print calls are written
	# so that they emit the same text on Python 2 and Python 3.
	a = Point((1,2))
	b = Point((3,4))
	print('%s %s' % (a.point,b.point))
	print((a+b).point)
	print(a.dist(b))
	print((a/2).point)
kmeans算法的实现:

from point import Point

class KmeansAlgo:
	"""Plain k-means clustering over point objects.

	A point object must support ``a + b``, ``a / number`` and ``a.dist(b)``.
	The current clusters are kept in ``kinds``, a list of lists of points.
	"""

	def __init__(self,K,plist):
		"""Seed K clusters from *plist*.

		The first K points each start a cluster of their own and all
		remaining points are put into cluster 0. If *plist* holds fewer than
		K points a message is printed and ``kinds`` is left empty.

		Raises ValueError if K is smaller than 1.
		"""
		if K < 1:
			raise ValueError('K must be a positive integer')
		self.k = K
		self.plist = plist
		self.kinds = []
		if len(self.plist) < K:
			print('plist member too few!')
			return
		self.kinds = [[p] for p in self.plist[:K]]
		self.kinds[0].extend(self.plist[K:])

	@staticmethod
	def _centroid(members):
		"""Return the mean of *members*, or None for an empty cluster."""
		if not members:
			return None
		# Start from the first member rather than a fixed-size origin so the
		# centroid has the same dimensionality as the points it summarises.
		total = members[0]
		for p in members[1:]:
			total = total+p
		return total/len(members)

	def adjust(self):
		"""Run one reassignment pass and return how many points moved.

		Every point is compared against the centroids computed from the
		clusters as they were at the start of the pass. A point moves only
		to a strictly closer centroid, so ties keep it where it is. Empty
		clusters have no centroid and are never chosen as a target.
		"""
		centroids = [self._centroid(members) for members in self.kinds]
		# Collect the new assignment in fresh lists; removing from a list
		# while iterating over it would skip elements.
		regrouped = [[] for _ in self.kinds]
		readjust = 0
		for current,members in enumerate(self.kinds):
			for x in members:
				idx = current
				minimal = x.dist(centroids[current])
				for imid,center in enumerate(centroids):
					if center is None:
						continue
					tmpdist = x.dist(center)
					if tmpdist < minimal:
						minimal = tmpdist
						idx = imid
				regrouped[idx].append(x)
				if idx != current:
					readjust += 1
		self.kinds = regrouped
		return readjust

	def repeat(self,max_iter=None):
		"""Call adjust() until a pass moves no point.

		After every pass that moved something, the pass number and the
		number of moved points are printed. *max_iter*, when given, caps the
		number of such passes; the default None means no cap.
		"""
		times = 0
		while max_iter is None or times < max_iter:
			readjust = self.adjust()
			if not readjust:
				break
			times += 1
			print('%d %d' % (times,readjust))


if __name__ == '__main__':
	# Read one point per line from points.in: whitespace-separated integers.
	plist = []
	with open('points.in','r') as infile:
		for line in infile:
			fields = line.split()
			if not fields:
				# A blank line would otherwise become a zero-dimensional point
				continue
			plist.append(Point(tuple(int(tok) for tok in fields)))
	km = KmeansAlgo(4,plist)
	km.repeat()
	# Dump every cluster; the print calls emit the same text on Python 2 and 3.
	for i,kind in enumerate(km.kinds):
		print('kind  %d' % i)
		for p in kind:
			print(p.point)
下面是输入文件 points.in 的一个样例(每行一个点,坐标之间用空格分隔):

1 1

0 1

1 0

-1 -1

-1 0

0 -1

100 101

101 100

99 100

100 99

1000 999

1001 999

1000 1000

999 999

1 2 3

2 3 4

2 3 100

1

2

1 2 3 4 5

**********************************over**************************************

你可能感兴趣的:(python,数据挖掘,Data,聚类,kmeans,Mining)