【机器学习】LDA线性判别分析python实现

理论部分可以看看这个大佬的文章:https://www.cnblogs.com/pinard/p/6244265.html


 

# -*- coding: utf-8 -*-
# @Date   : 2019/12/18
# @File   : LDA.py

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

'''
类内SW 类间SB矩阵
根据拉格朗日:
S_b*u = lambda * S_w * u  ==> S_w^(-1) * S_b * u = lambda*u
https://www.cnblogs.com/pinard/p/6244265.html
'''

class LDA(object):
    """Linear Discriminant Analysis for supervised dimensionality reduction.

    Builds the within-class scatter S_w and between-class scatter S_b and,
    via the Lagrangian condition
        S_b u = lambda * S_w * u  ==>  S_w^{-1} S_b u = lambda * u,
    projects the data onto the eigenvectors with the largest eigenvalues.
    Reference: https://www.cnblogs.com/pinard/p/6244265.html
    """

    def __init__(self, num_class=2, out_dim=2):
        self.num_class = num_class  # number of classes; labels assumed to be 0..num_class-1
        self.out_dim = out_dim      # output dimensionality (only num_class-1 dims are meaningful)
        self.W = None               # (n_features, out_dim) projection matrix
        self.eig_pairs = None       # (|eigenvalue|, eigenvector) pairs, sorted descending
        self.reduced_data = None    # (reduced_X, y) after fit; reduced_X.shape = (n, out_dim)

    def fit(self, X, y):
        """Learn the projection matrix W and project X onto it.

        :param X: ndarray of shape (n_samples, n_features)
        :param y: integer labels, assumed to start from 0
        :return: self
        """
        m = X.shape[1]
        class_mean = self.__calc_class_mean(X, y)
        S_b = self.__calc_Sb(X, y, class_mean)
        S_w = self.__calc_Sw(X, y, class_mean)
        # S_w^{-1} S_b is not symmetric, so np.linalg.eig may return complex
        # eigenvalues with numerically-zero imaginary parts. Sorting raw
        # complex numbers raises TypeError, so rank by magnitude and keep
        # only the real part of each eigenvector. np.linalg.solve replaces
        # the explicit inverse for better numerical stability.
        eig_vals, eig_vecs = np.linalg.eig(np.linalg.solve(S_w, S_b))
        eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i].real) for i in range(len(eig_vals))]
        eig_pairs = sorted(eig_pairs, key=lambda p: p[0], reverse=True)
        # Stack the top `out_dim` eigenvectors as the columns of W.
        out_vecs = [eig_pairs[dim][1].reshape(m, 1) for dim in range(self.out_dim)]
        self.W = np.hstack(out_vecs)
        self.eig_pairs = eig_pairs
        self.reduced_X = X.dot(self.W)
        self.reduced_data = (self.reduced_X, y)
        return self

    def predict(self, X):
        # Not implemented: this class is used for dimensionality reduction only.
        pass

    def __calc_class_mean(self, X, y):
        """Per-class mean vectors, shape (num_class, n_features)."""
        class_mean = []
        for label in range(self.num_class):
            idx = (y == label)
            class_mean.append(np.mean(X[idx], axis=0))
        return np.array(class_mean)

    def __calc_Sb(self, X, y, class_mean):
        """Between-class scatter: sum_k N_k (mean_k - mean)(mean_k - mean)^T."""
        m = X.shape[1]
        S_b = np.zeros((m, m))
        all_mean = np.mean(X, axis=0).reshape(m, 1)
        for k in range(self.num_class):
            n_k = np.sum(y == k)
            diff = class_mean[k].reshape(m, 1) - all_mean
            S_b += n_k * diff.dot(diff.T)
        return S_b

    def __calc_Sw(self, X, y, class_mean):
        """Within-class scatter: sum_k sum_{x in class k} (x - mean_k)(x - mean_k)^T."""
        m = X.shape[1]
        S_w = np.zeros((m, m))
        for k in range(self.num_class):
            # Vectorized form of the per-sample outer-product sum:
            # diffs.T @ diffs == sum_i (x_i - mean_k)(x_i - mean_k)^T
            diffs = X[y == k] - class_mean[k]
            S_w += diffs.T.dot(diffs)
        return S_w

    def plot(self, y):
        """Scatter-plot the 2-D projected data, colored by binary label."""
        # Visualization assumes binary classification and a 2-D projection.
        assert self.num_class == 2

        for label, marker, color in zip([0, 1], ('^', 's'), ('blue', 'yellow')):
            plt.scatter(x=self.reduced_X[:, 0][y == label],
                        y=self.reduced_X[:, 1][y == label],
                        marker=marker,
                        color=color,
                        alpha=0.5,
                        label=str(label)
                        )
        plt.xlabel('x1')
        plt.ylabel('x2')
        plt.title('LDA')
        plt.legend()
        plt.show()



if __name__ == '__main__':

    # Breast Cancer Wisconsin dataset (UCI): 9 integer features, class column
    # 'y' coded 2 = benign, 4 = malignant. Hard-coded local path.
    data = pd.read_csv('F:/dataStructurePrectice/data/breast-cancer-wisconsin.data.csv',
                       header=None,
                       names=['ID', 'ct', 'size', 'shape', 'ma', 'secs', 'bn', 'bc', 'nn', 'mit', 'y'])
    # Remap labels to 0/1 as expected by LDA (labels must start at 0).
    data['y'] = data['y'].replace({2:0, 4:1})
    # Clean the sample data: fill '?' placeholders with the column mode,
    # then cast the column to int.
    for f in data.select_dtypes(include=['object']):
        mode = data[f].mode().iloc[0]
        data[f] = data[f].replace('?', mode).astype(int)

    X = data.drop(columns=['ID', 'y']).values
    y = data['y'].values

    # Project the 9-feature data down to 2 dimensions and visualize it.
    lda = LDA(num_class=2, out_dim=2)
    lda.fit(X, y)

    lda.plot(y)

将数据降维到二维的可视化:

【机器学习】LDA线性判别分析python实现_第1张图片

 

最后:
 LDA vs PCA
    LDA用于降维,和PCA有很多相同,也有很多不同的地方,因此值得好好的比较一下两者的降维异同点。
    相同点:
    1)两者均可以对数据进行降维。
    2)两者在降维时均使用了矩阵特征分解的思想。
    3)两者都假设数据符合高斯分布。
    不同点:
    1)LDA是有监督的降维方法,而PCA是无监督的降维方法
    2)LDA降维最多降到类别数k-1的维数,而PCA没有这个限制。 主要由于Rank(AB) <= Min(Rank(A), Rank(B))
    3)LDA除了可以用于降维,还可以用于分类。
    4)LDA选择分类性能最好的投影方向,而PCA选择样本点投影具有最大方差的方向。
    在某些数据分布下LDA比PCA降维较优。
 

你可能感兴趣的:(机器学习)