from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn import datasets
digits = datasets.load_digits()
features = StandardScaler().fit_transform(digits.data)
pca = PCA(n_components=0.99, whiten=True)
features_pca = pca.fit_transform(features)
>>> features.shape[1]
64
>>> features_pca.shape[1]
54
n_components=0.99 keeps enough components to preserve 99% of the variance (information).
whiten=True rescales each principal component so it has zero mean and unit variance.
svd_solver="randomized" uses a stochastic algorithm to find the first principal components, which is often much faster on large matrices.
from sklearn.decomposition import KernelPCA
from sklearn.datasets import make_circles
features, _ = make_circles(n_samples=1000, random_state=1, noise=0.1, factor=0.1)
kpca = KernelPCA(kernel="rbf", gamma=15, n_components=1)
features_kpca = kpca.fit_transform(features)
>>> features.shape[1]
2
>>> features_kpca.shape[1]
1
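To see why the RBF kernel helps here, a small sketch (same make_circles data, this time keeping the class labels) compares the class means along one linear PCA component versus one kernel PCA component; concentric circles are not linearly separable, so linear PCA's class means should nearly coincide while kernel PCA's should not:
import numpy as np
from sklearn.datasets import make_circles
from sklearn.decomposition import PCA, KernelPCA

features, target = make_circles(n_samples=1000, random_state=1, noise=0.1, factor=0.1)

# One linear component cannot separate concentric circles,
# but the RBF kernel maps the points to a space where it can
linear_1d = PCA(n_components=1).fit_transform(features)
kernel_1d = KernelPCA(kernel="rbf", gamma=15, n_components=1).fit_transform(features)

for name, f in [("PCA", linear_1d), ("KernelPCA", kernel_1d)]:
    print(name, f[target == 0].mean(), f[target == 1].mean())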
from sklearn import datasets
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
iris = datasets.load_iris()
features = iris.data
target = iris.target
lda = LinearDiscriminantAnalysis(n_components=1)
features_lda = lda.fit(features, target).transform(features)
>>> features.shape[1]
4
>>> features_lda.shape[1]
1
>>> lda.explained_variance_ratio_
array([0.9912126])
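A common follow-up (a sketch, not part of the recipe above) is to fit LDA with n_components=None, which keeps every discriminant (at most n_classes - 1, so 2 for iris), then read explained_variance_ratio_ to pick the smallest number of discriminants reaching a target variance:
import numpy as np
from sklearn import datasets
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

iris = datasets.load_iris()
features, target = iris.data, iris.target

lda = LinearDiscriminantAnalysis(n_components=None).fit(features, target)

# Smallest component count whose cumulative variance ratio reaches 95%;
# the first discriminant already explains ~99%, so one is enough here
cum_var = np.cumsum(lda.explained_variance_ratio_)
>>> int(np.searchsorted(cum_var, 0.95) + 1)
1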
from sklearn.decomposition import NMF
from sklearn import datasets
digits = datasets.load_digits()
features = digits.data
nmf = NMF(n_components=10, random_state=1)
features_nmf = nmf.fit_transform(features)
Warning (from warnings module):
  File "C:\Users\LX\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\decomposition\_nmf.py", line 1692
    warnings.warn(
ConvergenceWarning: Maximum number of iterations 200 reached. Increase it to improve convergence.
>>> features.shape[1]
64
>>> features_nmf.shape[1]
10
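The warning just means the default 200 iterations of NMF's updates were not enough to converge. A minimal sketch of the usual remedy (max_iter is a real NMF parameter; the value 1000 is an arbitrary choice):
from sklearn import datasets
from sklearn.decomposition import NMF

digits = datasets.load_digits()
features = digits.data  # NMF requires non-negative input; pixel intensities qualify

# More iterations lets the factorization converge and silences the warning
nmf = NMF(n_components=10, random_state=1, max_iter=1000)
features_nmf = nmf.fit_transform(features)
>>> features_nmf.shape[1]
10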
Using Truncated Singular Value Decomposition (TruncatedSVD)
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix
from sklearn import datasets
import numpy as np
digits = datasets.load_digits()
features = StandardScaler().fit_transform(digits.data)
features_sparse = csr_matrix(features)
tsvd = TruncatedSVD(n_components=10)
features_sparse_tsvd = tsvd.fit(features_sparse).transform(features_sparse)
>>> features.shape[1]
64
>>> features_sparse_tsvd.shape[1]
10
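TruncatedSVD also exposes explained_variance_ratio_, so the same "keep enough components for a target variance" idea works on sparse matrices. A sketch (fitting first with the maximum allowed n_features - 1 components):
import numpy as np
from scipy.sparse import csr_matrix
from sklearn import datasets
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler

digits = datasets.load_digits()
features_sparse = csr_matrix(StandardScaler().fit_transform(digits.data))

# TruncatedSVD requires n_components < n_features, so fit with 63 of the 64
tsvd = TruncatedSVD(n_components=features_sparse.shape[1] - 1)
tsvd.fit(features_sparse)
cum_var = np.cumsum(tsvd.explained_variance_ratio_)
n_components = int(np.searchsorted(cum_var, 0.95) + 1)  # components for 95% variance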
from sklearn.feature_selection import VarianceThreshold
from sklearn import datasets
iris = datasets.load_iris()
features = iris.data
target = iris.target
thresholder = VarianceThreshold(threshold=0.5)
features_high_variance = thresholder.fit_transform(features)
print(features_high_variance[0:3])
[[5.1 1.4 0.2]
[4.9 1.4 0.2]
[4.7 1.3 0.2]]
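An aside: the fitted selector stores each feature's variance in its variances_ attribute, which shows why the second iris column (variance below the 0.5 threshold) was dropped; the values should be approximately:
>>> thresholder.variances_
array([0.68112222, 0.18871289, 3.09550267, 0.57713289])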
For binary (Bernoulli) features, the variance is p(1 - p), where p is the proportion of samples in class 0. In the toy data below:
Feature 0 | 80% class 0
Feature 1 | 20% class 0
Feature 2 | 60% class 0
Setting threshold = 0.75 * (1 - 0.75) therefore removes any feature where one value covers more than 75% of the samples.
from sklearn.feature_selection import VarianceThreshold
from sklearn import datasets
features = [[0, 1, 0],
            [0, 1, 1],
            [0, 1, 0],
            [0, 1, 1],
            [1, 0, 0]]
thresholder = VarianceThreshold(threshold=(0.75 * (1 - 0.75)))
>>> thresholder.fit_transform(features)
array([[0],
[1],
[0],
[1],
[0]])
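To see why only the third column survives, a minimal check (plain NumPy, nothing beyond the data above) computes the Bernoulli variance p(1 - p) per column and compares it with the 0.1875 threshold:
import numpy as np

features = np.array([[0, 1, 0],
                     [0, 1, 1],
                     [0, 1, 0],
                     [0, 1, 1],
                     [1, 0, 0]])

# For a 0/1 feature, np.var equals p * (1 - p), where p is the share of ones
p = features.mean(axis=0)
print(p * (1 - p))        # [0.16 0.16 0.24]
print(0.75 * (1 - 0.75))  # threshold = 0.1875; only column 2 exceeds it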