先选取数据集,本实验采用鸢尾花数据集,利用PCA对数据进行降维,得到二维矩阵如图1。把数据分成4类(k=4)。
先初始化k个簇中心,利用欧氏距离计算每个点到各个簇中心的距离,离哪个中心最近就把该点分到哪个类;再对每一类计算平均值,更新簇中心,如此反复,直到簇中心不再发生变化。
初始的聚类中心不同,结果不同。可以用二分k均值算法改进。
import numpy as np
from matplotlib import pyplot as plt
import pandas as pd
from sklearn.datasets import load_iris
from sklearn import decomposition
# Load the iris dataset into a DataFrame and reduce its features to 2-D with PCA.
iris_data = load_iris()
iris = pd.DataFrame(iris_data['data'], columns=iris_data['feature_names'])
iris = pd.merge(
    iris,
    pd.DataFrame(iris_data['target'], columns=['species']),
    left_index=True,
    right_index=True,
)
# Replace the numeric class ids with the species names.
labels = dict(zip([0, 1, 2], iris_data['target_names']))
iris['species'] = iris['species'].apply(lambda x: labels[x])
from sklearn.preprocessing import StandardScaler
# NOTE(review): the arrays returned by the two fit_transform calls below are
# discarded, so the PCA further down runs on the raw, unscaled features.
# Assign the result and pass it to PCA if scaling was intended.
StandardScaler().fit_transform(iris_data.data)
from sklearn.preprocessing import MinMaxScaler
MinMaxScaler().fit_transform(iris_data.data)
pca = decomposition.PCA(n_components=2)
# Every column except the last one ('species') is a feature column.
X = pca.fit_transform(iris.iloc[:, :-1].values)
print("降维后的鸢尾花数据集")
print(X)
def distance(d1, d2):
    """Return the Euclidean distance between two points in the plane.

    Only the first two coordinates of each point are used; any further
    coordinates are ignored.

    Args:
        d1: Indexable holding at least two numeric coordinates.
        d2: Indexable holding at least two numeric coordinates.

    Returns:
        The distance, as returned by ``np.sqrt``.
    """
    dx = d1[0] - d2[0]
    dy = d1[1] - d2[1]
    return np.sqrt(dx ** 2 + dy ** 2)
def init_center(data, k):
    """Pick the initial cluster centres: the first ``k`` entries of ``data``.

    Args:
        data: Indexable collection of points (one point per entry/row).
        k: Number of centres to pick.

    Returns:
        A new list ``[data[0], ..., data[k - 1]]``. The entries are the
        objects returned by indexing ``data``; they are not copied.

    Raises:
        IndexError: If ``data`` holds fewer than ``k`` entries.
    """
    return [data[i] for i in range(k)]
def means(arr):
    """Compute the centre of a cluster of 2-D points.

    Args:
        arr: Non-empty sequence of points; each point needs at least two
            coordinates. It is converted with ``np.array`` first.

    Returns:
        A length-2 ``np.ndarray`` holding the mean of column 0 and the mean
        of column 1. Further columns, if any, are ignored.

    Raises:
        IndexError: If ``arr`` is empty, because the resulting 1-D array
            cannot be indexed by column.
    """
    points = np.array(arr)
    mean_x = np.mean(points[:, 0])
    mean_y = np.mean(points[:, 1])
    return np.array([mean_x, mean_y])
def k_means(data, k):
    """Cluster ``data`` into ``k`` groups with the basic k-means iteration.

    The centres start from ``init_center(data, k)``. Each pass assigns every
    point to its nearest centre (measured with ``distance``) and then
    replaces each centre with ``means`` of its members. Iteration stops
    after a pass in which no point changes cluster.

    Args:
        data: 2-D array with one point per row; it must provide ``shape``
            and row indexing.
        k: Number of clusters.

    Returns:
        A tuple ``(clusterAssment, clusterCenter)``:
        ``clusterAssment`` is an ``(m, 2)`` ``np.matrix`` whose first column
        is the index of the cluster each point belongs to and whose second
        column is the distance from the point to that cluster's centre;
        ``clusterCenter`` is the list of the ``k`` centres.
    """
    clusterCenter = init_center(data, k)
    m = data.shape[0]  # number of points

    # np.asmatrix yields the same np.matrix type as np.mat, which was
    # removed in NumPy 2.0.
    clusterAssment = np.asmatrix(np.zeros((m, 2)))
    # Start from -1 (not a valid cluster index). With a start value of 0 a
    # first pass that puts every point into cluster 0 (e.g. k == 1) would
    # look like "nothing changed" and return before any centre was updated.
    clusterAssment[:, 0] = -1

    clusterChanged = True  # False once a full pass reassigns no point
    while clusterChanged:
        clusterChanged = False

        # Assignment step: attach every point to its nearest centre.
        for i in range(m):
            minDist = np.inf
            minIndex = -1
            for j in range(k):
                dis = distance(data[i], clusterCenter[j])
                if dis < minDist:
                    minDist = dis
                    minIndex = j
            if clusterAssment[i, 0] != minIndex:
                clusterChanged = True
            clusterAssment[i, :] = minIndex, minDist

        # Update step: move every centre to the mean of its members.
        for j in range(k):
            members = [data[i] for i in range(m) if clusterAssment[i, 0] == j]
            # A cluster that lost all its points keeps its previous centre;
            # means() cannot handle an empty list.
            if members:
                clusterCenter[j] = means(members)

    return clusterAssment, clusterCenter
def print_result(data, clusterAssment, clusterCenter, k):
    """Print every cluster's members and centre, then plot the result.

    Figure 1 shows the points coloured by cluster; figure 2 shows all points
    in a single colour. Each figure is displayed with ``plt.show()``.

    Args:
        data: 2-D array with one point per row.
        clusterAssment: Per-point assignment; element ``[j, 0]`` is the
            cluster index of point ``j``.
        clusterCenter: Sequence of the ``k`` cluster centres.
        k: Number of clusters.
    """
    # Group the points by the cluster they were assigned to.
    arrays = []
    for i in range(k):
        members = [
            data[j] for j in range(data.shape[0]) if clusterAssment[j, 0] == i
        ]
        arrays.append(members)

    for i, members in enumerate(arrays, start=1):
        print("\n\n输出第", i, "个簇的对象")
        for e in members:
            print(e, end="")
        print("\n簇的平均值为", clusterCenter[i - 1])

    colors = ['g', 'r', 'y', 'b']
    plt.figure(1)
    for i, members in enumerate(arrays):
        if not members:
            # Nothing to draw; an empty list also cannot be indexed by column.
            continue
        points = np.array(members)
        # Cycle through the palette so k > len(colors) does not raise IndexError.
        plt.scatter(points[:, 0], points[:, 1], c=colors[i % len(colors)])
    plt.show()

    plt.figure(2)
    plt.scatter(data[:, 0], data[:, 1])
    plt.show()
# Script entry point: cluster the PCA-reduced data into 4 groups and show the result.
if __name__ == "__main__":
    data = X
    k = 4
    clusterAssment, clusterCenter = k_means(data, k)
    print("聚类后的矩阵,第一列所属分类,第二类是到每个中心点的距离")
    print(clusterAssment)
    print_result(data, clusterAssment, clusterCenter, k)