# Singular Value Decomposition (SVD): finding a matrix's principal component vectors
X_centered = X - X.mean(axis=0)  # PCA requires the data to be centered at the origin
U, s, Vt = np.linalg.svd(X_centered)  # NumPy returns V already transposed (i.e. V^T)
c1 = Vt.T[:, 0]           # first principal component
c2 = Vt.T[:, 1]           # second principal component
W2 = Vt.T[:, :2]          # W_d with d = 2
X2D = X_centered.dot(W2)  # project the training set onto the first two principal components
The unit vectors that define the principal components are the columns of $V$ (note that `np.linalg.svd` returns $V^T$, which is why the code above takes `Vt.T`):

$$V=\begin{pmatrix} \mid & \mid & & \mid \\ c_1 & c_2 & \cdots & c_n \\ \mid & \mid & & \mid \end{pmatrix}$$
Projecting the (centered) training set down to $d$ dimensions, where $W_d$ contains the first $d$ principal components as columns:

$$X_{d\text{-}proj}=X \cdot W_d$$
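The `X` in the snippet above is whatever training matrix is being reduced; as a placeholder, here is a toy 3D dataset (made up purely for illustration) that the SVD snippet can be run on end to end:

```python
import numpy as np

rng = np.random.RandomState(42)
# made-up 3D data that lies close to a 2D subspace: the third direction has little variance
X = np.c_[rng.randn(200), rng.randn(200), 0.1 * rng.randn(200)] @ rng.randn(3, 3)
```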
from sklearn.decomposition import PCA
pca = PCA(n_components = 2)
X2D = pca.fit_transform(X)
# automatically takes care of centering the data
pca.components_.T[:, 0]  # the first principal component
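As a quick sanity check (a sketch, assuming `X_centered` and `W2` from the SVD snippet and `X2D` from `pca.fit_transform` above), the two routes agree up to possible sign flips of the components:

```python
import numpy as np

# scikit-learn may flip the sign of a principal component relative to the raw SVD,
# so compare the two projections elementwise up to sign
X2D_svd = X_centered.dot(W2)
assert np.allclose(np.abs(X2D_svd), np.abs(X2D))
```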
To choose the number of dimensions, fit a PCA without reducing dimensionality and find the smallest d that preserves enough of the variance:
pca = PCA()
pca.fit(X)
cumsum = np.cumsum(pca.explained_variance_ratio_)
d = np.argmax(cumsum >= 0.95) + 1  # smallest d that preserves at least 95% of the variance
print(pca.explained_variance_ratio_)
array([0.84248607, 0.14631839])
About 84.2% of the variance lies along the first principal component and about 14.6% along the second.
Alternatively, set n_components to the fraction of variance we want to preserve:
pca = PCA(n_components=0.95)
X_reduced = pca.fit_transform(X)
Or plot the cumulative explained variance as a function of the number of dimensions and pick the elbow of the curve.
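A minimal sketch of that curve, assuming matplotlib and the `cumsum` computed above:

```python
import numpy as np
import matplotlib.pyplot as plt

plt.plot(np.arange(1, len(cumsum) + 1), cumsum)
plt.axhline(y=0.95, linestyle="--")   # e.g. a 95% variance target
plt.xlabel("Number of dimensions")
plt.ylabel("Cumulative explained variance")
plt.show()
```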
# Decompress the reduced data back to the original number of dimensions
pca = PCA(n_components = 154)
X_mnist_reduced = pca.fit_transform(X_mnist)
X_mnist_recovered = pca.inverse_transform(X_mnist_reduced)
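(The `X_mnist` used in these snippets is assumed to be the MNIST design matrix loaded earlier; a minimal sketch of loading it with scikit-learn's `fetch_openml`, where the variable names are placeholders:)

```python
from sklearn.datasets import fetch_openml

mnist = fetch_openml("mnist_784", version=1, as_frame=False)
X_mnist = mnist.data[:60000]   # the 60,000 x 784 training images
```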
The reconstruction error is the mean squared distance between the original data and the reconstructed (compressed and then decompressed) data. The inverse transformation is:
$$X_{recovered} = X_{d\text{-}proj} \cdot W_d^T$$
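A quick sketch of computing that error with NumPy, assuming `X_mnist` and `X_mnist_recovered` from the snippet above:

```python
import numpy as np

# mean over all instances of the squared Euclidean distance to the reconstruction
reconstruction_mse = np.mean(np.sum((X_mnist - X_mnist_recovered) ** 2, axis=1))
```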
For training sets too large to fit in memory, Incremental PCA can be fed mini-batches:
from sklearn.decomposition import IncrementalPCA
n_batches = 100
inc_pca = IncrementalPCA(n_components=154)
for X_batch in np.array_split(X_mnist, n_batches):
inc_pca.partial_fit(X_batch)
X_mnist_reduced = inc_pca.transform(X_mnist)
Alternatively, use NumPy's memmap class, which stores the array in binary form on disk and loads into memory only the parts that are needed, on demand:
X_mm = np.memmap(filename, dtype="float32", mode="readonly", shape=(m, n))
batch_size = m // n_batches
inc_pca = IncrementalPCA(n_components=154, batch_size=batch_size)
inc_pca.fit(X_mm)
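The memmap snippet assumes the data has already been written to disk as a raw float32 file; a minimal sketch of doing that once (with `filename` kept as a placeholder):

```python
import numpy as np

m, n = X_mnist.shape
X_mm = np.memmap(filename, dtype="float32", mode="w+", shape=(m, n))
X_mm[:] = X_mnist   # copy the data into the memory-mapped file
del X_mm            # deleting the memmap flushes the pending writes to disk
```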
Randomized PCA uses a stochastic algorithm to quickly find an approximation of the first d principal components:
rnd_pca = PCA(n_components=154, svd_solver="randomized")
X_reduced = rnd_pca.fit_transform(X_mnist)
Kernel PCA uses the kernel trick to perform complex nonlinear projections for dimensionality reduction:
from sklearn.decomposition import KernelPCA
rbf_pca = KernelPCA(n_components = 2, kernel="rbf", gamma=0.04)
X_reduced = rbf_pca.fit_transform(X)
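Other kernels can be compared in the same way; a sketch (the `gamma`/`coef0` values here are illustrative, not tuned):

```python
from sklearn.decomposition import KernelPCA

lin_pca = KernelPCA(n_components=2, kernel="linear")
sig_pca = KernelPCA(n_components=2, kernel="sigmoid", gamma=0.001, coef0=1)
X_lin = lin_pca.fit_transform(X)
X_sig = sig_pca.fit_transform(X)
```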
## Selecting a Kernel and Tuning Hyperparameters
If labels are available, train a downstream classifier and use grid search to pick the kPCA hyperparameters that give the best accuracy:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

clf = Pipeline([
("kpca", KernelPCA(n_components=2)),
("log_reg", LogisticRegression())
])
param_grid = [{
"kpca__gamma": np.linspace(0.03, 0.05, 10),
"kpca__kernel": ["rbf", "sigmoid"]
}]
grid_search = GridSearchCV(clf, param_grid, cv=3)
grid_search.fit(X, y)
print(grid_search.best_params_)
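Once the search finishes, the tuned kPCA step can be pulled out of the refit pipeline and reused (a sketch, assuming the objects defined above):

```python
# the best pipeline holds the KernelPCA step with the kernel/gamma found by the search
best_kpca = grid_search.best_estimator_.named_steps["kpca"]
X_reduced = best_kpca.transform(X)
```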
If there are no labels, train a regression model that uses the original instances as targets (fit_inverse_transform=True) and select the hyperparameters that minimize the reconstruction (pre-image) error:
rbf_pca = KernelPCA(n_components = 2, kernel="rbf", gamma=0.0433,
fit_inverse_transform=True)
X_reduced = rbf_pca.fit_transform(X)
X_preimage = rbf_pca.inverse_transform(X_reduced)
from sklearn.metrics import mean_squared_error
mean_squared_error(X, X_preimage)
Locally Linear Embedding (LLE) works by first measuring how each training instance linearly relates to its closest neighbors (c.n.):
$$\begin{aligned} &\hat{W} = \underset{w}{\operatorname{argmin}} \sum_{i=1}^{m} \left\| x^{(i)}-\sum_{j=1}^{m}\hat{w}_{i,j}x^{(j)} \right\|^2 \\ &\text{subject to} \;\; \begin{cases} \hat{w}_{i,j}=0 & \text{if } x^{(j)} \text{ is not one of the } k \text{ c.n. of } x^{(i)} \\ \sum_{j=1}^{m}\hat{w}_{i,j}=1 & \text{for } i=1,2,\cdots,m \end{cases} \end{aligned}$$
and then looking for a low-dimensional representation of the training set in which these local relationships are best preserved. LLE is particularly good at unrolling rolled-up manifolds such as the Swiss roll:
$$\hat{Z} = \underset{z}{\operatorname{argmin}} \sum_{i=1}^{m} \left\| z^{(i)}-\sum_{j=1}^{m}\hat{w}_{i,j}z^{(j)} \right\|^2$$
from sklearn.manifold import LocallyLinearEmbedding
lle = LocallyLinearEmbedding(n_components=2, n_neighbors=10)
X_reduced = lle.fit_transform(X)
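The `X` here is left unspecified; a minimal sketch on a Swiss-roll dataset (assuming `make_swiss_roll` from scikit-learn), which LLE unrolls into a 2D embedding:

```python
from sklearn.datasets import make_swiss_roll
from sklearn.manifold import LocallyLinearEmbedding

X, t = make_swiss_roll(n_samples=1000, noise=0.2, random_state=42)
lle = LocallyLinearEmbedding(n_components=2, n_neighbors=10)
X_unrolled = lle.fit_transform(X)   # a 2D embedding that "unrolls" the roll
```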