Implementing a PCA Dimensionality Reduction Algorithm in Python

Note: this post only implements my own understanding of the PCA dimensionality reduction algorithm in code. It does not cover the algorithm's theory or optimizations; the implementations are simply verified against generated test data. Theoretical background on PCA is left for the reader to fill in.

# -*- coding:utf-8 -*- 
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from scipy import linalg
from scipy.linalg import svd, eig


def pca_v1(X, k=None):
    """
    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training data, where `n_samples` is the number of samples
        and `n_features` is the number of features.
    k : int, optional
        Number of components to keep. If None, the smallest k that
        captures (almost) all of the variance is selected.
    """
    n_samples, n_features = X.shape
    X_norm = X - X.mean(axis=0)

    # Scatter matrix X^T X: the covariance matrix up to a factor of 1 / (n_samples - 1)
    scatter_matrix = np.dot(X_norm.T, X_norm)   # (n_features, n_features)
    # Eigendecomposition of the scatter matrix (eig returns complex values,
    # so real parts are taken below)
    eig_val, eig_vec = linalg.eig(scatter_matrix)
    eig_val_index = np.abs(eig_val).argsort()[::-1]  # indices, descending by magnitude
    # eig_pairs = [(np.abs(eig_val[i]), eig_vec[:, i]) for i in range(n_features)]
    # eig_pairs.sort(reverse=True)
    # features_topk = np.array([ele[1] for ele in eig_pairs[:k]])
    
    if k is None:
        # Choose the smallest k whose top-k eigenvalues capture (almost) all variance
        k = n_features
        for i in range(1, n_features + 1):
            top_k_val = eig_val[eig_val_index[:i]].sum().real
            if 1 - top_k_val / eig_val.sum().real < 1e-5:
                k = i
                break
    # Select the top-k eigenvectors (columns of eig_vec) as the projection matrix
    features_topk = eig_vec[:, eig_val_index[:k]]   # (n_features, k)
    X_pca = np.dot(X_norm, features_topk).real
    return X_pca
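

# A minimal alternative sketch (my addition, not from the original post):
# numpy's eigh is specialized for symmetric matrices such as the scatter
# matrix; it returns real eigenvalues in ascending order, avoiding the
# complex arithmetic and magnitude sort used in pca_v1.
def pca_v1_eigh(X, k=2):
    X_norm = X - X.mean(axis=0)
    eig_val, eig_vec = np.linalg.eigh(np.dot(X_norm.T, X_norm))
    # eigh sorts eigenvalues ascending, so reverse the columns and take top-k
    features_topk = eig_vec[:, ::-1][:, :k]
    return np.dot(X_norm, features_topk)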


def pca_v2(X, k=2):
    """
    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Training data, where `n_samples` is the number of samples
        and `n_features` is the number of features.
    k : int, default=2
        Number of components to keep.
    """
    n_samples, n_features = X.shape
    X_norm = X - X.mean(axis=0)

    # SVD of the centered data; the rows of Vt are the principal components
    # (what sklearn exposes as PCA.components_)
    U, S, Vt = linalg.svd(X_norm, full_matrices=False)
    components_ = Vt
    X_pca = np.dot(X_norm, components_[:k, :].T)
    # Rank-k reconstruction of the centered data (unused, kept for illustration)
    X_bak = np.dot(U[:, :k] * S[:k], Vt[:k, :])
    return X_pca
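

# Hypothetical helper (my addition, not in the original post): the singular
# values of the centered data also yield the explained variance ratio that
# sklearn's PCA exposes as explained_variance_ratio_, useful for choosing k.
def explained_variance_ratio(X, k=2):
    X_norm = X - X.mean(axis=0)
    _, S, _ = linalg.svd(X_norm, full_matrices=False)
    explained_variance = (S ** 2) / (X.shape[0] - 1)
    return explained_variance[:k] / explained_variance.sum()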


if __name__ == '__main__':
    X = np.array([[-1, -1, 0, 2, 1], [2, 0, 0, -1, -1], [2, 0, 1, 1, 0]])
    # X = np.array([[-1, -1, 3], [-2, -1, 4], [-3, -2, 1], [1, 1, 5], [2, 1, 0], [3, 2, 8]])

    k = 2
    X_pca_v1 = pca_v1(X, k=k)

    # SVD-based PCA
    X_pca_v2 = pca_v2(X, k=k)

    # scikit-learn PCA
    pca = PCA(n_components=k)
    pca.fit(X)
    X_pca_skl = pca.transform(X)
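
    # Added sanity check (not in the original post): each principal component
    # is only determined up to sign, so compare absolute values of the scores.
    print("v1 vs sklearn:", np.allclose(np.abs(X_pca_v1), np.abs(X_pca_skl)))
    print("v2 vs sklearn:", np.allclose(np.abs(X_pca_v2), np.abs(X_pca_skl)))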

    print("*." * 20 + " done " + "*." * 20)
