For the theory behind LDA, see this excellent write-up: https://www.cnblogs.com/pinard/p/6244265.html
# -*- coding: utf-8 -*-
# @Date : 2019/12/18
# @File : LDA.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
'''
Within-class scatter matrix S_w and between-class scatter matrix S_b.
From the Lagrangian of the constrained LDA objective:
    S_b * u = lambda * S_w * u  ==>  S_w^(-1) * S_b * u = lambda * u
Reference: https://www.cnblogs.com/pinard/p/6244265.html
'''
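# The eigenproblem above comes from maximizing the Rayleigh quotient
#     J(u) = (u^T S_b u) / (u^T S_w u)
# under the constraint u^T S_w u = 1: the Lagrangian u^T S_b u - lambda * (u^T S_w u - 1)
# is stationary exactly where S_b u = lambda S_w u, i.e. the relation quoted in the docstring.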
class LDA(object):
def __init__(self, num_class=2, out_dim=2):
self.num_class = num_class
self.out_dim = out_dim
        self.W = None  # eigenvectors used as the projection matrix for dimensionality reduction
        self.eig_pairs = None  # (eigenvalue, eigenvector) pairs sorted by eigenvalue in descending order
        self.reduced_X = None  # projected data, shape (n, out_dim)
        self.reduced_data = None  # tuple (reduced_X, y) of the projected data and its labels
def fit(self, X, y):
'''
        :param X: feature matrix of shape (n_samples, n_features)
        :param y: class labels; assumed to be integers starting from 0
        :return: self
'''
m = X.shape[1]
class_mean = self.__calc_class_mean(X, y)
S_b = self.__calc_Sb(X, y, class_mean)
S_w = self.__calc_Sw(X, y, class_mean)
        # eigenvalues eig_vals[i] paired with eigenvectors eig_vecs[:, i]
eig_vals, eig_vecs = np.linalg.eig(np.linalg.inv(S_w).dot(S_b))
        eig_pairs = [(eig_vals[i], eig_vecs[:, i]) for i in range(len(eig_vals))]
        # sort by eigenvalue magnitude, largest first (np.abs also guards against tiny imaginary parts)
        eig_pairs = sorted(eig_pairs, key=lambda x: np.abs(x[0]), reverse=True)
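        # Aside (not in the original post): np.linalg.eig on the non-symmetric matrix
        # S_w^(-1) S_b can return eigenvalues with tiny imaginary parts, which is why the
        # sort key uses np.abs; scipy.linalg.eig(S_b, S_w) would solve the generalized
        # problem S_b u = lambda S_w u directly without forming the inverse explicitly.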
        # keep the eigenvectors belonging to the top `out_dim` eigenvalues (2 by default)
out_vecs = []
for dim in range(self.out_dim):
out_vecs.append(eig_pairs[dim][1].reshape(m, 1))
self.W = np.hstack(out_vecs)
self.eig_pairs = eig_pairs
self.reduced_X = X.dot(self.W)
self.reduced_data = (self.reduced_X, y)
return self
    def predict(self, X):
        # classification is not implemented here; a scikit-learn based sketch is given after the LDA vs PCA comparison below
        pass
    def __calc_class_mean(self, X, y):
        '''mean vector of each class; returned array has shape (num_class, n_features)'''
        class_mean = []
for label in range(self.num_class):
idx = (y == label)
vec = np.mean(X[idx], axis=0)
class_mean.append(vec)
return np.array(class_mean)
def __calc_Sb(self, X, y, class_mean):
        '''between-class scatter matrix: S_b = \sum_k N_k (mean_k - mean)(mean_k - mean)^T; could also be computed via the total scatter matrix (see the note after this method)'''
m = X.shape[1]
S_b = np.zeros((m, m))
all_mean = np.mean(X, axis=0)
for k in range(self.num_class):
class_k_mean = class_mean[k]
n_k = sum(y == k)
all_mean, class_k_mean = all_mean.reshape(m, 1), class_k_mean.reshape(m, 1)
S_b += n_k * (class_k_mean - all_mean).dot((class_k_mean - all_mean).T)
return S_b
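    # Note (an aside, not from the original post): the total scatter matrix
    # S_t = \sum_i (x_i - all_mean)(x_i - all_mean)^T satisfies S_t = S_b + S_w, so the
    # optimization hinted at in the docstring is to compute S_b as S_t - S_w, e.g.
    #     centered = X - np.mean(X, axis=0)
    #     S_t = centered.T.dot(centered)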
def __calc_Sw(self, X, y, class_mean):
        '''within-class scatter matrix: S_w = \sum_k \sum_{x_i in class k} (x_i - mean_k)(x_i - mean_k)^T'''
m = X.shape[1]
S_w = np.zeros((m, m))
for k in range(self.num_class):
class_k_mat = np.zeros((m, m))
class_k_mean = class_mean[k]
for x_i in X[y==k]:
x_i, class_k_mean = x_i.reshape(m, 1), class_k_mean.reshape(m, 1)
# class_k_mat += (x_i.dot(x_i.T) - class_k_mean.dot(class_k_mean.T))
class_k_mat += (x_i - class_k_mean).dot((x_i - class_k_mean).T)
S_w += class_k_mat
return S_w
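    # Aside (not in the original post): the inner loop over samples could be vectorized as
    #     diff = X[y == k] - class_mean[k]
    #     S_w += diff.T.dot(diff)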
def plot(self, y):
        # visualization assumes two classes and a two-dimensional projection
        assert self.num_class == 2 and self.out_dim == 2
for label, marker, color in zip([0, 1], ('^', 's'), ('blue', 'yellow')):
plt.scatter(x=self.reduced_X[:, 0][y == label],
y=self.reduced_X[:, 1][y == label],
marker=marker,
color=color,
alpha=0.5,
label=str(label)
)
plt.xlabel('x1')
plt.ylabel('x2')
plt.title('LDA')
plt.legend()
plt.show()
if __name__ == '__main__':
data = pd.read_csv('F:/dataStructurePrectice/data/breast-cancer-wisconsin.data.csv',
header=None,
names=['ID', 'ct', 'size', 'shape', 'ma', 'secs', 'bn', 'bc', 'nn', 'mit', 'y'])
data['y'] = data['y'].replace({2:0, 4:1})
    # clean the raw data: fill the '?' placeholders with the column mode
for f in data.select_dtypes(include=['object']):
mode = data[f].mode().iloc[0]
data[f] = data[f].replace('?', mode).astype(int)
X = data.drop(columns=['ID', 'y']).values
y = data['y'].values
lda = LDA(num_class=2, out_dim=2)
lda.fit(X, y)
lda.plot(y)
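One thing worth checking after running the script: with only two classes, the class means are tied together through the overall mean, so S_b has rank 1 and S_w^(-1) * S_b has at most one meaningfully non-zero eigenvalue (the k-1 limit discussed below); the second projection axis mostly just spreads the points out for the plot. A quick check on the fitted lda object:

print([round(float(np.real(val)), 6) for val, vec in lda.eig_pairs])
# the first eigenvalue should clearly dominate; the rest should be numerically close to zero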
Visualization of the data projected down to two dimensions:
Finally:
LDA vs PCA
LDA is used for dimensionality reduction and shares a lot with PCA, but also differs from it in important ways, so the two are worth comparing carefully.
Similarities:
1) Both can be used to reduce the dimensionality of the data.
2) Both rely on matrix eigendecomposition.
3) Both assume the data follow a Gaussian distribution.
Differences:
1) LDA is a supervised dimensionality-reduction method, while PCA is unsupervised.
2) LDA can reduce the dimensionality to at most k-1, where k is the number of classes, while PCA has no such limit. This follows mainly from rank(AB) <= min(rank(A), rank(B)): the k class means satisfy one linear constraint through the overall mean, so rank(S_b) <= k-1 and S_w^(-1) * S_b has at most k-1 non-zero eigenvalues.
3) Besides dimensionality reduction, LDA can also be used for classification (see the sketch after this comparison).
4) LDA chooses the projection directions with the best class separability, while PCA chooses the directions of maximum variance of the projected samples.
Under some data distributions LDA gives a better reduction than PCA.
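As a quick cross-check of points 1) to 3), here is a minimal sketch using scikit-learn; this is an assumption for illustration (the post itself only uses the NumPy implementation above), reusing X and y from the example:

from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# supervised: uses y, and n_components is capped at num_classes - 1 (= 1 for this 2-class data)
lda_skl = LinearDiscriminantAnalysis(n_components=1).fit(X, y)
X_lda = lda_skl.transform(X)
# unsupervised: ignores y, only maximizes the projected variance, and has no k-1 limit
X_pca = PCA(n_components=2).fit_transform(X)
# LDA can also be used directly as a classifier
print('LDA training accuracy:', lda_skl.score(X, y))
print('projected shapes:', X_lda.shape, X_pca.shape)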