Advanced Dimensionality Reduction (Part 2)

1  Correlation

import numpy as np
import pandas as pd
df = pd.DataFrame(np.random.randint(0, 150, size=(50, 3)), columns=['python', 'En', 'Chinese'])
# Correlation coefficients
# The correlation coefficient ranges over [-1, 1]
# -1 means perfect negative correlation
# 1 means perfect positive correlation
df.corr()

[Figure 1: df.corr() correlation matrix]
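For reference, the Pearson coefficient that df.corr() reports is just the covariance of two columns divided by the product of their standard deviations. A minimal sketch verifying one entry by hand (column names follow the DataFrame above):

x, y = df['python'], df['En']
# Pearson r = cov(x, y) / (std(x) * std(y)), all with sample statistics (ddof=1)
r_manual = ((x - x.mean()) * (y - y.mean())).sum() / (len(x) - 1) / (x.std() * y.std())
print(r_manual, df.corr().loc['python', 'En'])  # the two values agree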

# Add an independent random column; it should be only weakly correlated with the rest
df['Physic'] = np.random.randint(-150, 0, size=50)
df.corr()

[Figure 2: correlation matrix after adding the Physic column]

# Math is python plus small random noise, so the two columns are almost identical
df['Math'] = df['python'].map(lambda x: x + np.random.randint(-10, 10, size=1)[0])
df.corr()
# python and Math are very strongly correlated

[Figure 3: correlation matrix after adding the Math column]
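Because Math is nearly a copy of python, keeping both adds little information. A hedged sketch of correlation-based feature filtering (the 0.9 threshold is an illustrative choice, not from the original):

corr = df.corr().abs()
# Keep only the upper triangle so each pair is inspected once
upper = corr.where(np.triu(np.ones(corr.shape, dtype=bool), k=1))
to_drop = [col for col in upper.columns if (upper[col] > 0.9).any()]
df_reduced = df.drop(columns=to_drop)  # drops Math, the second member of the correlated pair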

2  PCA Dimensionality Reduction

from sklearn.decomposition import PCA
from sklearn import datasets
# iris features with physical meaning (sepal/petal length and width)
X, y = datasets.load_iris(return_X_y=True)
X[:5]
Output:
array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])
pca = PCA(n_components=2, whiten=True)
# n_components=2 (int): keep 2 components after reduction
# n_components=0.95 (float): keep enough components to explain 95% of the variance
# whiten=True: scale the output to unit variance (standardization)
X_pca = pca.fit_transform(X)
X_pca[:5]
Output:
array([[-1.30533786,  0.64836932],
       [-1.31993521, -0.35930856],
       [-1.40496732, -0.29424412],
       [-1.33510889, -0.64613986],
       [-1.32702321,  0.6633044 ]])
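To see how much information the two retained components carry, sklearn also exposes the per-component variance ratio (standard PCA attributes); a quick check:

print(pca.explained_variance_ratio_)        # ≈ [0.9246, 0.0531]
print(pca.explained_variance_ratio_.sum())  # ≈ 0.9777 of the total variance

# The float form of n_components picks the count automatically:
pca95 = PCA(n_components=0.95, whiten=True)
print(pca95.fit_transform(X).shape)         # (150, 2): 2 components reach 95%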

3  How PCA Works (Eigenvalues and Eigenvectors)

pca.explained_variance_  # sklearn exposes the eigenvalues of the covariance matrix directly
Output:
array([4.22824171, 0.24267075])
# 1. Center the data (subtract the mean of each column)
B = X - X.mean(axis=0)

# 2. Covariance matrix
V = np.cov(B, rowvar=False, bias=False)
# rowvar=False: treat columns as variables (the default treats rows as variables)
# bias=False (the default): normalize by n-1 (sample covariance); bias=True would normalize by n

# 3. Eigenvalues and eigenvectors (linear algebra)
eigen, ev = np.linalg.eig(V)
display(eigen, ev)

# 4. Select components by cumulative explained-variance ratio
cond = (eigen / eigen.sum()).cumsum() >= 0.95
print((eigen / eigen.sum()).cumsum())
print(cond)
# The cumulative share of total variance shows each component's importance
index = cond.argmax()  # first position where the cumulative ratio reaches 0.95
index
ev = ev[:, :index + 1]  # keep the eigenvectors of the retained eigenvalues

# 5. Project the centered data: matrix multiplication
pca_result = B.dot(ev)
pca_result[:5]

# 6. Standardize (whiten)
pca_result = (pca_result - pca_result.mean(axis=0)) / pca_result.std(axis=0, ddof=1)
# ddof=1: sample standard deviation, dividing by (n-1) instead of n
pca_result[:5]

Output:
array([4.22824171, 0.24267075, 0.0782095 , 0.02383509])
array([[ 0.36138659, -0.65658877, -0.58202985,  0.31548719],
       [-0.08452251, -0.73016143,  0.59791083, -0.3197231 ],
       [ 0.85667061,  0.17337266,  0.07623608, -0.47983899],
       [ 0.3582892 ,  0.07548102,  0.54583143,  0.75365743]])
[0.92461872 0.97768521 0.99478782 1.        ]
[False  True  True  True]
array([[-1.30533786, -0.64836932],
       [-1.31993521,  0.35930856],
       [-1.40496732,  0.29424412],
       [-1.33510889,  0.64613986],
       [-1.32702321, -0.6633044 ]])
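The hand-rolled result matches sklearn's X_pca up to the sign of each column; an eigenvector's direction is arbitrary, so a flipped sign does not change the component. A quick check:

print(np.allclose(np.abs(pca_result), np.abs(X_pca)))  # True: identical up to sign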

4  PCA in Code (Singular Value Decomposition)

# pip install scipy
# SciPy: advanced scientific-computing library
# NumPy: basic numerical-computing library
from scipy import linalg
n_components_ = 3
X,y = datasets.load_iris(return_X_y = True)

# 1. Center the data
mean_ = np.mean(X, axis=0)
X -= mean_

# 2. Singular value decomposition
U, S, Vt = linalg.svd(X, full_matrices=False)

# 3. Sign flip (if the dominant entry of a column is negative, flip the column to make it positive)
# max_abs_cols = np.argmax(np.abs(U), axis=0)
# signs = np.sign(U[max_abs_cols, range(U.shape[1])])
# U *= signs

# 4. Keep only the first n_components_ columns
U = U[:, :n_components_]

# 5. Standardize (whiten)
U = (U - U.mean(axis = 0))/U.std(axis = 0,ddof = 1)
# U *= np.sqrt(X.shape[0] - 1)
U[:5]

Output:
array([[-1.30533786, -0.64836932,  0.09981716],
       [-1.31993521,  0.35930856,  0.75257299],
       [-1.40496732,  0.29424412, -0.0640073 ],
       [-1.33510889,  0.64613986, -0.11284924],
       [-1.32702321, -0.6633044 , -0.32210314]])
np.sign([1, 2, -2, -0.5])  # np.sign returns the sign of each element
Output:
array([ 1.,  1., -1., -1.])
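This is what the commented-out sign flip above does with np.sign: each column of U is multiplied by the sign of its largest-magnitude entry, so that entry ends up positive. A small standalone demo (toy matrix, not the iris U):

M = np.array([[1., -5.],
              [2.,  3.]])
max_abs_rows = np.argmax(np.abs(M), axis=0)          # row of the largest |value| per column
signs = np.sign(M[max_abs_rows, range(M.shape[1])])  # signs of those entries: [1., -1.]
print(M * signs)                                     # second column flipped so -5 becomes +5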
X_pca[:5]  # sklearn's result for comparison; columns match U up to sign
Output:
array([[-1.30533786,  0.64836932],
       [-1.31993521, -0.35930856],
       [-1.40496732, -0.29424412],
       [-1.33510889, -0.64613986],
       [-1.32702321,  0.6633044 ]])
X[:5]  # note: X was centered in place above (X -= mean_)
Output:
array([[-0.74333333,  0.44266667, -2.358     , -0.99933333],
       [-0.94333333, -0.05733333, -2.358     , -0.99933333],
       [-1.14333333,  0.14266667, -2.458     , -0.99933333],
       [-1.24333333,  0.04266667, -2.258     , -0.99933333],
       [-0.84333333,  0.54266667, -2.358     , -0.99933333]])
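The singular values connect this section to the eigendecomposition in section 3: for centered X with n samples, the covariance eigenvalues equal S**2 / (n - 1). A quick check (reuses eigen computed above):

n = X.shape[0]
print(S ** 2 / (n - 1))  # ≈ [4.2282, 0.2427, 0.0782, 0.0238], matching eigen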

# Build the diagonal matrix Sigma from the singular values
sigma = np.zeros(shape=(4, 4))
for i in range(4):
    for j in range(4):
        if i == j:
            sigma[i, i] = S[i]
sigma
Output:
array([[25.09996044,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  6.01314738,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  3.41368064,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  1.88452351]])
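The nested loop above works, but NumPy builds the same diagonal matrix in one call:

sigma = np.diag(S)  # equivalent to the loop above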
# Re-run the SVD to restore the full U (it was sliced and rescaled above)
U, S, Vt = linalg.svd(X, full_matrices=False)
display(U.shape, S.shape, Vt.shape)
print(S)
# Multiply the factors back together ---> reconstructs X
U.dot(sigma).dot(Vt)[:5]

Output:
(150, 4)
(4,)
(4, 4)
[25.09996044  6.01314738  3.41368064  1.88452351]
array([[-0.74333333,  0.44266667, -2.358     , -0.99933333],
       [-0.94333333, -0.05733333, -2.358     , -0.99933333],
       [-1.14333333,  0.14266667, -2.458     , -0.99933333],
       [-1.24333333,  0.04266667, -2.258     , -0.99933333],
       [-0.84333333,  0.54266667, -2.358     , -0.99933333]])
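A numeric confirmation that the three factors really multiply back to the centered X:

print(np.allclose(U.dot(sigma).dot(Vt), X))  # True: U @ Sigma @ Vt reconstructs X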
