# 层次聚类与KMeans聚类实现

import numpy as np
from sklearn.datasets import load_iris
import pandas as pd
import matplotlib.pyplot as plt
import collections
import copy

"""
第14章聚类算法,分别实现层次聚类和kmeans聚类
使用例题debug,层次聚类用iris数据集作图验证,kmeans只看了iris数据集聚类后每一类的点数量
"""


class hierarchicalClustering():
	"""
	Agglomerative (bottom-up) hierarchical clustering, Algorithm 14.1.

	Every sample starts as its own cluster; the two closest clusters are merged
	repeatedly until only ``k`` clusters remain. The result is stored in
	``self.GList`` as a list of clusters, each one a list of sample (column)
	indices into ``data``.

	Parameters:
		k: number of clusters to stop at; must be at least 1.
		data: array-like of shape (features, samples), one sample per column.
		Dpq: distance between two clusters, computed from the pairwise sample
			distances: 'min' (single linkage, the default), 'max' (complete
			linkage) or 'mean' (average linkage).
		p: order of the Minkowski distance between samples (2 is Euclidean).

	Raises:
		ValueError: if ``k`` is smaller than 1, if ``Dpq`` is not one of the
			supported linkages, or if no pair of clusters has a finite distance.
	"""

	# Reducers applied to the block of pairwise distances between two clusters.
	_LINKAGES = {'min': np.min, 'max': np.max, 'mean': np.mean}

	def __init__(self, k, data, Dpq='min', p=2):
		if k < 1:
			raise ValueError("k must be at least 1, got {}".format(k))
		if Dpq not in self._LINKAGES:
			raise ValueError("Dpq must be one of {}, got {!r}".format(sorted(self._LINKAGES), Dpq))
		self.k = k
		self.data = np.asarray(data)
		# Read the shape from the converted array so that plain nested lists work too.
		self.n = self.data.shape[1]
		self.p = p
		self.Dpq = Dpq
		self.distanceMatrix = self.getDistanceMatrix()
		self.hierarchicalClustering()

	def getDistanceMatrix(self):
		"""Return the symmetric (n, n) matrix of pairwise sample distances."""
		distanceMatrix = np.zeros((self.n, self.n))
		for i in range(self.n):
			# Only the upper triangle is computed; the diagonal stays 0.
			for j in range(i + 1, self.n):
				distance = self.getPointDistance(self.p, i, j)
				distanceMatrix[i][j] = distanceMatrix[j][i] = distance
		return distanceMatrix

	def getPointDistance(self, p, x1Index, x2Index):
		"""Return the order-``p`` Minkowski distance between two samples given by column index."""
		difference = np.abs(self.data[:, x1Index] - self.data[:, x2Index])
		return np.power(np.sum(np.power(difference, p)), 1 / p)

	def getClusterDistance(self, Gp, Gq):
		"""Return the distance between two clusters (lists of sample indices) under ``self.Dpq``."""
		pairwise = self.distanceMatrix[np.ix_(Gp, Gq)]
		return self._LINKAGES[self.Dpq](pairwise)

	def hierarchicalClustering(self):
		"""Merge the closest pair of clusters until ``self.k`` remain; fills ``self.GList``."""
		self.GList = [[i] for i in range(self.n)]
		while len(self.GList) > self.k:
			minGDistance = float('inf')
			GiIndex, GjIndex = -1, -1
			for i in range(len(self.GList)):
				for j in range(i + 1, len(self.GList)):
					distance = self.getClusterDistance(self.GList[i], self.GList[j])
					# Strict comparison: on ties the first pair found wins.
					if distance < minGDistance:
						minGDistance = distance
						GiIndex = i
						GjIndex = j
			if GiIndex < 0:
				# Happens only when every cluster distance is inf or NaN.
				raise ValueError("no pair of clusters has a finite distance")
			mergeG = self.GList[GiIndex] + self.GList[GjIndex]
			# GjIndex > GiIndex, so removing the later entry first keeps GiIndex valid.
			self.GList.pop(GjIndex)
			self.GList.pop(GiIndex)
			self.GList.append(mergeG)


class kMeansClustering():
	"""
	K-means clustering, Algorithm 14.2.

	The first ``k`` distinct samples serve as the initial centres. Samples are
	then assigned to their nearest centre and the centres recomputed as cluster
	means until the assignment stops changing.

	Parameters:
		k: number of clusters; must be at least 1.
		data: array-like of shape (features, samples), one sample per column.
		p: order of the Minkowski distance between a sample and a centre
			(2 is Euclidean).

	Attributes set by the run:
		GList: list of ``k`` clusters, each a list of sample vectors, ordered by
			the first sample that belongs to each cluster (clusters that ended
			up empty come last).
		newCenters: mapping {final centre as tuple: samples of that cluster}.
		oldCenters: the same clusters keyed by the centres of the previous round.

	Raises:
		ValueError: if ``k`` is smaller than 1 or ``data`` holds fewer than ``k``
			distinct samples.
	"""

	def __init__(self, k, data, p=2):
		if k < 1:
			raise ValueError("k must be at least 1, got {}".format(k))
		self.k = k
		self.data = np.asarray(data)
		# Read the shape from the converted array so that plain nested lists work too.
		self.n = self.data.shape[1]
		self.p = p
		self.kMeansClustering()

	def getPointDistance(self, p, x1, x2):
		"""Return the order-``p`` Minkowski distance between the vectors ``x1`` and ``x2``."""
		return np.power(np.sum(np.power(np.abs(x1 - x2), p)), 1 / p)

	def _initialCenters(self):
		"""Return the first ``k`` distinct samples as a (k, features) float array."""
		seen = set()
		centers = []
		for pointIndex in range(self.n):
			point = self.data[:, pointIndex]
			key = tuple(point.tolist())
			if key in seen:
				# A repeated sample would give two identical centres.
				continue
			seen.add(key)
			centers.append(point)
			if len(centers) == self.k:
				return np.array(centers, dtype=float)
		raise ValueError("need at least {} distinct samples, found {}".format(self.k, len(centers)))

	def _assign(self, centers):
		"""Return, for every sample, the index of its nearest centre (lowest index on ties)."""
		# Broadcast to (features, k, n): offsets[:, c, i] is sample i minus centre c.
		offsets = np.abs(self.data[:, np.newaxis, :] - centers.T[:, :, np.newaxis])
		distances = np.power(np.sum(np.power(offsets, self.p), axis=0), 1 / self.p)
		return np.argmin(distances, axis=0)

	def _updateCenters(self, centers, labels):
		"""Return the cluster means; a cluster without samples keeps its current centre."""
		updated = centers.copy()
		for clusterIndex in range(self.k):
			members = self.data[:, labels == clusterIndex]
			if members.shape[1]:
				updated[clusterIndex] = members.mean(axis=1)
		return updated

	def kMeansClustering(self):
		"""Run the assign/update iteration to convergence and store the clusters."""
		centers = self._initialCenters()
		labels = self._assign(centers)
		while True:
			previousCenters = centers
			centers = self._updateCenters(previousCenters, labels)
			newLabels = self._assign(centers)
			converged = np.array_equal(newLabels, labels)
			labels = newLabels
			if converged:
				break

		# Collect the samples of each cluster and note the order in which the
		# clusters first occur, which fixes the order of GList.
		clusters = [[] for _ in range(self.k)]
		order = []
		for pointIndex in range(self.n):
			label = int(labels[pointIndex])
			if not clusters[label]:
				order.append(label)
			clusters[label].append(self.data[:, pointIndex])
		order.extend(label for label in range(self.k) if not clusters[label])

		self.oldCenters = collections.defaultdict(list)
		self.newCenters = collections.defaultdict(list)
		self.GList = []
		for label in order:
			self.oldCenters[tuple(previousCenters[label])].extend(clusters[label])
			self.newCenters[tuple(centers[label])].extend(clusters[label])
			self.GList.append(clusters[label])


# Load the iris measurements into a DataFrame whose feature columns use short,
# unit-free names (the dataset's own feature names carry a '(cm)' suffix),
# then attach the class label as a fifth column.
iris = load_iris()
df = pd.DataFrame(
	iris.data,
	columns=['sepal length', 'sepal width', 'petal length', 'petal width'],
)
df['label'] = iris.target


def plotRawData():
	"""Scatter rows 0-49 and 50-99 of ``df`` on sepal length vs. sepal width and show the figure."""
	xName, yName = 'sepal length', 'sepal width'
	# Per the original notes, rows 0-49 are class 0 and rows 50-99 are class 1;
	# rows 100-149 (class 2) are deliberately not drawn.
	for firstRow, label in ((0, '0'), (50, '1')):
		rows = df.iloc[firstRow:firstRow + 50]
		plt.scatter(rows[xName], rows[yName], label=label)
	plt.xlabel(xName)
	plt.ylabel(yName)
	plt.legend()
	plt.show()


# plotRawData()
# Take the first 100 rows and the four feature columns, transposed so that
# every column of ``data`` is one sample.
data = np.asarray(df.iloc[:100, :4]).transpose()


# The two classes do not share a result format: hierarchicalClustering stores
# sample indices in GList, kMeansClustering stores the sample vectors.
# Alternative run: mycluster = hierarchicalClustering(2, data)
mycluster = kMeansClustering(k=2, data=data, p=2)
cluster1 = mycluster.GList[0]
cluster2 = mycluster.GList[1]
# Original author's note: prints 50 50, which suggests a correct split (each
# class has 50 samples); this was not confirmed with a plot.
print(*(len(cluster) for cluster in (cluster1, cluster2)))


def plotClustering(featureIndex1, featureIndex2):
	"""
	Scatter the module-level ``cluster1`` and ``cluster2`` on two features.

	Both result formats used in this file are accepted: a cluster given as a
	list of sample indices into ``data`` and a cluster given as a list of
	sample vectors.

	Parameters:
		featureIndex1: row of ``data`` (feature) drawn on the x axis.
		featureIndex2: row of ``data`` (feature) drawn on the y axis.
	"""

	def featureValues(cluster, featureIndex):
		# A 2-D array means the cluster holds sample vectors; otherwise its
		# entries are treated as column indices into ``data``.
		members = np.asarray(cluster)
		if members.ndim == 2:
			return members[:, featureIndex]
		# The cast keeps an empty cluster (float dtype by default) usable as an index.
		return data[featureIndex][members.astype(np.intp)]

	for cluster, label in ((cluster1, 'cluster1'), (cluster2, 'cluster2')):
		plt.scatter(
			featureValues(cluster, featureIndex1),
			featureValues(cluster, featureIndex2),
			label=label,
		)
	# Name the plotted features; the labels used to be the same fixed text on every subplot.
	plt.xlabel('feature {}'.format(featureIndex1))
	plt.ylabel('feature {}'.format(featureIndex2))
	plt.legend()

def plothierarchicalClustering():
	"""Draw the clustering on every pair of the four features, one subplot per pair, and show it."""
	fig = plt.figure(figsize=(10, 10))
	# The six unordered feature pairs (i < j), in the order they are plotted.
	featurePairs = [(first, second) for first in range(4) for second in range(first + 1, 4)]
	for position, (first, second) in enumerate(featurePairs, start=1):
		# The new subplot becomes the current axes that plotClustering draws on.
		fig.add_subplot(2, 4, position)
		plotClustering(first, second)
		plt.tight_layout()

	plt.show()

# 你可能感兴趣的:(统计学习方法代码实现,python,机器学习,人工智能)