import numpy as np
from sklearn.datasets import load_iris
import pandas as pd
import matplotlib.pyplot as plt
import collections
import copy
"""
第14章聚类算法,分别实现层次聚类和kmeans聚类
使用例题debuge,层次聚类用iris数据集作图验证,kmeans只看了iris数据集聚类后每一类的点数量
"""
class hierarchicalClustering():
"""
算法14.1
"""
    def __init__(self, k, data, Dpq='min', p=2):
        self.k = k  # target number of clusters
        self.data = np.asarray(data)  # shape (n_features, n_samples): each column is one sample
        self.n = self.data.shape[1]  # number of samples
        self.p = p  # order of the Minkowski distance
        self.Dpq = Dpq  # linkage type; only 'min' (single linkage) is actually implemented
        self.distanceMatrix = self.getDistanceMatrix()
"""
self.distanceMatrix = np.array([
[0, 7, 2, 9, 3],
[7, 0, 5, 4, 6],
[2, 5, 0, 8, 1],
[9, 0, 8, 0, 5],
[3, 6, 1, 5, 0]
])
"""
self.hierarchicalClustering()
    def getDistanceMatrix(self):
        # pairwise Minkowski (Lp) distances between all samples; symmetric with a zero diagonal
        distanceMatrix = np.zeros((self.n, self.n))
for i in range(self.n):
for j in range(i + 1, self.n):
distance = self.getPointDistance(self.p, i, j)
distanceMatrix[i][j] = distanceMatrix[j][i] = distance
return distanceMatrix
    def getPointDistance(self, p, x1Index, x2Index):
        # Minkowski (Lp) distance between samples x1Index and x2Index
        return np.power(np.sum(np.power(np.abs(self.data[:, x1Index] - self.data[:, x2Index]), p)), 1 / p)
    def getClusterDistance(self, Gp, Gq):
        # single linkage: the smallest pairwise distance between a point in Gp and a point in Gq
        return min([self.distanceMatrix[xiIndex, xjIndex] for xiIndex in Gp for xjIndex in Gq])
    def hierarchicalClustering(self):
        # start with every sample as its own cluster, then repeatedly merge the two closest clusters
        self.GList = [[i] for i in range(self.n)]
while len(self.GList) > self.k:
minGDistance = float('inf')
GiIndex, GjIndex = -1, -1
for i in range(len(self.GList)):
for j in range(i + 1, len(self.GList)):
distance = self.getClusterDistance(self.GList[i], self.GList[j])
if distance < minGDistance:
minGDistance = distance
GiIndex = i
GjIndex = j
mergeG = self.GList[GiIndex] + self.GList[GjIndex]
            self.GList.pop(GiIndex)
            # GiIndex < GjIndex, so the later index shifts down by one after the first pop
            self.GList.pop(GjIndex - 1)
self.GList.append(mergeG)
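
# Added sanity check (not part of the original script): a minimal sketch that runs the
# single-linkage clustering above on the five toy points also kept in the k-means debug
# comment below. The three left-hand points should end up in one cluster and the two
# right-hand points in the other, i.e. index groups [0, 1, 2] and [3, 4].
_toyData = np.array([
    [0, 0, 1, 5, 5],
    [2, 0, 0, 0, 2]
])
print('toy single-linkage clusters (sample indices):', hierarchicalClustering(2, _toyData).GList)
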
class kMeansClustering():
"""
算法14.2
"""
    def __init__(self, k, data, p=2):
        self.k = k  # number of clusters
        self.data = np.asarray(data)  # shape (n_features, n_samples): each column is one sample
        self.n = self.data.shape[1]  # number of samples
        self.p = p  # order of the Minkowski distance
"""
self.data = np.array([
[0, 0, 1, 5, 5],
[2, 0, 0, 0, 2]
])
"""
        # initialization: take the first k samples as the initial centers,
        # then assign every remaining sample to its nearest initial center
        self.oldCenters = collections.defaultdict(list)
for i in range(self.k):
self.oldCenters[tuple(self.data[:, i])].append(self.data[:, i])
for pointIndex in range(self.k, self.n):
minDistance = float('inf')
finalCenter = -1
for center in self.oldCenters.keys():
distance = self.getPointDistance(self.p, self.data[:, pointIndex], center)
if distance < minDistance:
minDistance = distance
finalCenter = center
self.oldCenters[finalCenter].append(self.data[:, pointIndex])
self.newCenters = copy.deepcopy(self.oldCenters)
self.kMeansClustering()
    def getPointDistance(self, p, x1, x2):
        # Minkowski (Lp) distance between two points; x2 may be an array or a tuple used as a dict key
        return np.power(np.sum(np.power(np.abs(x1 - x2), p)), 1 / p)
    def kMeansClustering(self):
        notSame = True
        while notSame:
            self.oldCenters = self.newCenters
            self.newCenters = collections.defaultdict(list)
            # recompute every cluster center as the mean of the points currently assigned to it
            centers = []
            for clusterPoints in self.oldCenters.values():
                total = np.zeros(len(clusterPoints[0]))
                for point in clusterPoints:
                    total += point
                centers.append(total / len(clusterPoints))
            # reassign every sample to its nearest center
            for pointIndex in range(self.n):
                minDistance = float('inf')
                finalCenter = -1
                for center in centers:
                    distance = self.getPointDistance(self.p, self.data[:, pointIndex], center)
                    if distance < minDistance:
                        minDistance = distance
                        finalCenter = center
                self.newCenters[tuple(finalCenter)].append(self.data[:, pointIndex])
            # stop once the assignment of points to clusters no longer changes between iterations
            thisLoopIsSame = True
            oldClusters, newClusters = list(self.oldCenters.values()), list(self.newCenters.values())
            for clusterIndex in range(self.k):
                oldCluster = oldClusters[clusterIndex]
                newCluster = newClusters[clusterIndex]
                if len(oldCluster) != len(newCluster):
                    thisLoopIsSame = False
                    break
                for pointIndex in range(len(oldCluster)):
                    if (oldCluster[pointIndex] != newCluster[pointIndex]).any():
                        thisLoopIsSame = False
                        break
                if not thisLoopIsSame:
                    break
            if thisLoopIsSame:
                notSame = False
        # GList stores the points (not the sample indices) of each final cluster
        self.GList = []
        for clusterPoints in self.newCenters.values():
            self.GList.append(clusterPoints)
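
# Added sanity check (not part of the original script): run the k-means implementation above
# on the same five toy points; with k = 2 and the first two samples as the initial centers,
# the expected result is one cluster of 2 points and one cluster of 3 points.
_toyKMeans = kMeansClustering(2, np.array([
    [0, 0, 1, 5, 5],
    [2, 0, 0, 0, 2]
]), 2)
print('toy k-means cluster sizes:', [len(G) for G in _toyKMeans.GList])
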
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)
df['label'] = iris.target
df.columns = ['sepal length', 'sepal width', 'petal length', 'petal width', 'label']
def plotRawData():
plt.scatter(df.iloc[0:50]['sepal length'], df.iloc[0:50]['sepal width'], label='0')
plt.scatter(df.iloc[50:100]['sepal length'], df.iloc[50:100]['sepal width'], label='1')
plt.xlabel('sepal length')
plt.ylabel('sepal width')
plt.legend()
plt.show()
data = np.asarray(df.iloc[:100, :4]).T  # shape (4, 100): each column is one iris sample (setosa or versicolor)
# k-means on iris: only inspect how many samples fall into each of the two clusters
mycluster = kMeansClustering(2, data, 2)
cluster1, cluster2 = mycluster.GList[0], mycluster.GList[1]
print(len(cluster1),len(cluster2))
def plotClustering(featureIndex1, featureIndex2, G1, G2):
    # G1 and G2 are lists of sample indices, as produced by hierarchicalClustering.GList
    plt.scatter(data[featureIndex1][G1], data[featureIndex2][G1], label='cluster1')
    plt.scatter(data[featureIndex1][G2], data[featureIndex2][G2], label='cluster2')
    plt.xlabel(df.columns[featureIndex1])
    plt.ylabel(df.columns[featureIndex2])
    plt.legend()
def plothierarchicalClustering():
    # visual check of hierarchical clustering: one scatter plot per pair of the four iris features
    hcluster = hierarchicalClustering(2, data, 'min', 2)
    G1, G2 = hcluster.GList[0], hcluster.GList[1]
    fig = plt.figure(figsize=(10, 10))
    cnt = 1
    for i in range(4):
        for j in range(i + 1, 4):
            fig.add_subplot(2, 3, cnt)  # C(4, 2) = 6 feature pairs
            cnt += 1
            plotClustering(i, j, G1, G2)
    plt.tight_layout()
    plt.show()
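
# Optional cross-check (not part of the original script): a rough comparison against
# scikit-learn on the same 100 iris samples, assuming sklearn.cluster is available. The
# label numbering is arbitrary, so only the per-cluster counts are comparable with the
# counts printed above.
from sklearn.cluster import KMeans, AgglomerativeClustering
skKMeansLabels = KMeans(n_clusters=2, n_init=10, random_state=0).fit_predict(data.T)
skSingleLinkLabels = AgglomerativeClustering(n_clusters=2, linkage='single').fit_predict(data.T)
print('sklearn KMeans cluster sizes:', sorted(collections.Counter(skKMeansLabels).values()))
print('sklearn single-linkage cluster sizes:', sorted(collections.Counter(skSingleLinkLabels).values()))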