实现K-means聚类,并使用它来压缩图像。 我们将从一个简单的2D数据集开始,以了解K-means是如何工作的,然后我们将其应用于图像压缩。 我们还将对主成分分析进行实验,并了解如何使用它来找到面部图像的低维表示。
实施和应用K-means到一个简单的二维数据集,以获得一些直观的工作原理。 K-means是一个迭代的,无监督的聚类算法,将类似的实例组合成簇。 该算法通过猜测每个簇的初始聚类中心开始,然后重复将实例分配给最近的簇,并重新计算该簇的聚类中心。 我们要实现的第一部分是找到数据中每个实例最接近的聚类中心的函数。
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.io as sio
data = sio.loadmat("data/ex7data2.mat")
X = data['X']
fig, ax = plt.subplots(figsize=(12,8))
ax.scatter(X[:,0], X[:,1],c="b")
plt.show()
def find_closest_centroids(X, centroids):
m = X.shape[0]
k = centroids.shape[0]
idx = np.zeros(m)
for i in range(m):
min_dist = 1000000
for j in range(k):
dist = np.sum((X[i,:] - centroids[j,:]) ** 2)
if dist < min_dist:
min_dist = dist
idx[i] = j
return idx
#测试上述函数
initial_centroids = np.array([[3, 3], [6, 2], [8, 5]]) #手动初始化三个聚类中心点
idx = find_closest_centroids(X,initial_centroids)
print(idx[0:3])
[0. 2. 1.]
def compute_centroids(X, idx, k):
m, n = X.shape
centroids = np.zeros((k, n))
for i in range(k):
indices = X[np.where(idx == i)[0]] # np.where找出在idx中与i值相等的值的下标,返回值类型是元组,[0]是取内部数据
centroids[i] = np.mean(indices, 0) # 按列求均值
return centroids
compute_centroids(X, idx, 3)
array([[2.42830111, 3.15792418],
[5.81350331, 2.63365645],
[7.11938687, 3.6166844 ]])
def run_k_means(X, initial_centroids, max_iters):
m,n = X.shape
k = initial_centroids.shape[0]
idx = np.zeros(m)
centroids = initial_centroids
for i in range(max_iters):
idx = find_closest_centroids(X,centroids) # 找到最近质心
centroids = compute_centroids(X,idx,k) # 重新计算质心
return idx,centroids
idx, centroids = run_k_means(X, initial_centroids, 10)
# 可视化
cluster_1 = X[np.where(idx==0)[0],:]
cluster_2 = X[np.where(idx==1)[0],:]
cluster_3 = X[np.where(idx==2)[0],:]
fig,ax = plt.subplots(figsize=(12,8))
ax.scatter(cluster_1[:,0],cluster_1[:,1],c='r', label='cluster_1')
ax.scatter(cluster_2[:,0],cluster_2[:,1],c='g', label='cluster_2')
ax.scatter(cluster_3[:,0],cluster_3[:,1],c='b', label='cluster_3')
ax.legend()
plt.show()
def random_init_centroids(X, k):
m, n = X.shape
centroids = np.zeros((k, n))
idx = np.random.randint(0, m, k) # 随机生成k大小的int,范围(0,m)
for i in range(k):
centroids[i,:] = X[idx[i],:]
return centroids
下一个任务是将K-means应用于图像压缩。 从下面的演示可以看到,我们可以使用聚类来找到最具代表性的少数颜色,并使用聚类分配将原始的24位颜色映射到较低维的颜色空间。
下面是我们要压缩的图像。
from IPython.display import Image
Image(filename='data/bird_small.png')
image_data = sio.loadmat('data/bird_small.mat')
image_data
{'__header__': b'MATLAB 5.0 MAT-file, Platform: GLNXA64, Created on: Tue Jun 5 04:06:24 2012',
'__version__': '1.0',
'__globals__': [],
'A': array([[[219, 180, 103],
[230, 185, 116],
[226, 186, 110],
...,
[ 14, 15, 13],
[ 13, 15, 12],
[ 12, 14, 12]],
[[230, 193, 119],
[224, 192, 120],
[226, 192, 124],
...,
[ 16, 16, 13],
[ 14, 15, 10],
[ 11, 14, 9]],
[[228, 191, 123],
[228, 191, 121],
[220, 185, 118],
...,
[ 14, 16, 13],
[ 13, 13, 11],
[ 11, 15, 10]],
...,
[[ 15, 18, 16],
[ 18, 21, 18],
[ 18, 19, 16],
...,
[ 81, 45, 45],
[ 70, 43, 35],
[ 72, 51, 43]],
[[ 16, 17, 17],
[ 17, 18, 19],
[ 20, 19, 20],
...,
[ 80, 38, 40],
[ 68, 39, 40],
[ 59, 43, 42]],
[[ 15, 19, 19],
[ 20, 20, 18],
[ 18, 19, 17],
...,
[ 65, 43, 39],
[ 58, 37, 38],
[ 52, 39, 34]]], dtype=uint8)}
data = image_data['A']
data.shape
(128, 128, 3)
预处理
#数据归一化 因为每个数据都是0-255之间
# normalize value ranges
data = data / 255.
# reshape the array
X = np.reshape(data, (data.shape[0] * data.shape[1], data.shape[2]))
k = 16
max_iters = 10
X.shape
(16384, 3)
# 随机初始化聚类中心
init_centroids = random_init_centroids(X, k)
# 获取聚类中心
idx, centroids = run_k_means(X, init_centroids, max_iters)
# 将所有数据点,设置归属到对应的聚类中心去
idx = find_closest_centroids(X, centroids)
# 将每一个像素值与聚类结果进行匹配
X_recovered = centroids[idx.astype(int), :] # 将属于一个聚类的像素,设置为聚类中心的值(统一)
# X_recovered.shape (16384, 3)
X_recovered = np.reshape(X_recovered, (data.shape[0], data.shape[1], data.shape[2])) # 再展开为三维数据
X_recovered.shape
(128, 128, 3)
plt.imshow(X_recovered)
plt.show()
from sklearn.cluster import KMeans#导入kmeans库
model = KMeans(n_clusters=16,n_init=100,n_jobs=-1) #n_init设置获取初始簇中心的更迭次数,防止局部最优 n_jobs设置并行(使用CPU数,-1则使用所有CPU)
model.fit(X) #开始聚类
centroids = model.cluster_centers_ #获取聚簇中心
C = model.predict(X) #获取每个数据点的对应聚簇中心的索引
X_recovered = centroids[C].reshape((data.shape[0],data.shape[1],data.shape[2])) #获取新的图像