白化操作的目的是减少冗余信息;准确来说,通过白化操作我们有两个目的:
Principal Components Analysis (PCA) 是一个用来减少特征维度的算法,它通过降低特征的维度来减少冗余信息。
比如说,下图所示是一个特征维度为 2 的点分类问题,我们可以看到数据主要分布在 y=x 这条线周围。
# -*- coding=utf-8 -*-
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
def draw_points(data, label):
    """Scatter-plot 2-D points, one plt.scatter call per distinct class.

    Args:
        data: array of shape (n, 2); column 0 is x, column 1 is y.
        label: array of shape (n,) with integer class ids.
    """
    # One scatter call per class, so matplotlib assigns each class its own
    # color from the default color cycle (no explicit color list needed).
    for idx in np.unique(label):
        mask = label == idx
        plt.scatter(x=data[mask, 0], y=data[mask, 1])
def load_data():
    """Generate a synthetic 2-D dataset of points scattered around y = x.

    Returns:
        data: float32 array of shape (n, 2); each feature is rescaled
            (x by 1/10, y by 1/100) and then centered to zero mean.
        label: int32 array of shape (n,) with class ids 0/1/2, assigned
            by thresholding the raw x coordinate at 20 and 80.
    """
    # Three clusters of integer x values: [0, 30), [40, 70), [75, 100).
    xs_1 = np.random.randint(0, 30, [30, 1])
    xs_2 = np.random.randint(40, 70, [30, 1])
    xs_3 = np.random.randint(75, 100, [40, 1])
    xs = np.concatenate([xs_1, xs_2, xs_3], axis=0)
    # y is x plus or minus a random perturbation, so points lie near y = x.
    ys = []
    for x in xs:
        flag = np.random.random()
        if flag > 0.5:
            ys.append(x + np.random.random() * np.random.randint(30))
        else:
            ys.append(x - np.random.random() * np.random.randint(30))
    ys = np.asarray(ys, np.float32)
    # Class ids from the raw x value. Sized from the data instead of the
    # hard-coded 100 of the original, so changing cluster sizes still works.
    # NOTE(review): the thresholds (20/80) do not line up exactly with the
    # cluster boundaries above, so a few cluster-1/cluster-3 points get
    # label 1 — presumably intentional label noise; confirm.
    x_flat = xs.ravel()
    label = np.zeros(len(xs), np.int32)
    label[np.logical_and(x_flat > 20, x_flat < 80)] = 1
    label[x_flat >= 80] = 2
    data = np.asarray(np.concatenate([xs, ys], axis=-1), np.float32)
    # Per-feature rescaling, then center every feature at zero mean.
    data[:, 0] /= 10
    data[:, 1] /= 100
    data -= np.mean(data, axis=0)
    print('the shape of data is ', np.shape(data))
    return data, label
def PCA(data):
    """Rotate `data` onto its principal axes.

    Returns:
        (data_rot, eigenvalues, eigenvectors): the rotated data as an
        np.matrix, the eigenvalues of the covariance matrix in descending
        order, and the matrix whose columns are the matching eigenvectors.
    """
    # Center every feature so the covariance below is taken about the origin.
    centered = np.mat(data - np.mean(data, axis=0))
    # Covariance matrix of the centered features (np.mat makes `*` a
    # matrix product).
    sigma = centered.T * centered / len(centered)
    print('The shape of Sigma is ', np.shape(sigma))
    print('The Sigma is ', sigma)
    # sigma is symmetric positive semi-definite, so the SVD's U holds its
    # eigenvectors and the singular values are its eigenvalues, already
    # sorted in descending order.
    eigenvectors, eigenvalues, _ = np.linalg.svd(sigma)
    print('the eigenvalues is ', eigenvalues)
    print('the eigenvectors is ', eigenvectors)
    # Project the centered data onto the eigenbasis.
    data_rot = centered * eigenvectors
    return data_rot, eigenvalues, eigenvectors
def PCA_whitening(data, select_k):
    """PCA-whiten `data`, keeping the top `select_k` principal components.

    Each retained component is divided by the square root of its eigenvalue
    so the whitened features have (approximately) unit variance.

    Args:
        data: array of shape (n, d).
        select_k: number of leading principal components to keep (1..d).

    Returns:
        (data_whitening, eigenvalues, eigenvectors): the whitened
        (n, select_k) array, the top-k eigenvalues, and the (d, select_k)
        matrix of the corresponding eigenvectors.
    """
    data_rot, eigenvalues, eigenvectors = PCA(data)
    # Bug fix: the original indexed data_rot[:, select_k] (a single column,
    # so data_whitening[:, 1] below raised IndexError for select_k == 1)
    # and always divided columns 0 and 1 regardless of select_k. Slice the
    # first k components and scale each one by 1/sqrt(its eigenvalue).
    data_whitening = np.asarray(data_rot[:, :select_k]).copy()
    # NOTE(review): no epsilon is added before the division, matching the
    # original; a zero eigenvalue would divide by zero here.
    data_whitening /= np.sqrt(eigenvalues[:select_k])
    return data_whitening, eigenvalues[:select_k], eigenvectors[:, :select_k]
def ZCA_whitening(data, select_k):
    """ZCA-whiten `data`: PCA-whiten, then rotate back toward the original axes."""
    whitened_pca, _, eigenvectors = PCA_whitening(data, select_k)
    print(np.shape(whitened_pca), np.shape(eigenvectors))
    # Multiplying by U^T maps the whitened components back into the original
    # feature space (eigenvectors is an np.matrix, so `*` is a matrix
    # product here), keeping the data decorrelated while staying close to
    # the original coordinate system.
    whitened_zca = whitened_pca * eigenvectors.T
    print(np.shape(whitened_zca))
    return whitened_zca
if __name__ == '__main__':
    # Build the toy dataset, ZCA-whiten it down to a single principal
    # component, and plot the whitened points.
    data, label = load_data()
    data_whitening_zca = ZCA_whitening(data, select_k=1)
    draw_points(np.asarray(data_whitening_zca, np.float32), label)
    plt.show()