线性判别分析LDA的多个python实现

在这里就不说明LDA的原理了,不懂的同学可以百度找相关资料学。这里直接给出楼主的python实现,以及搜索到的其他实现。

楼主实现

#实现线性判别分析算法
#二类n维降至一维
#传入的data,target都是array,target分别是0和1

def lda(data, target):
    """Two-class LDA: find the 1-D projection direction for n-dim samples.

    Parameters
    ----------
    data : (m, n) array of samples.
    target : array of class labels 0/1 (any shape; flattened here).

    Returns
    -------
    (1, n) array, the direction w proportional to (u1 - u2) Sw^{-1}.
    """
    target = target.flatten()  # make target a flat 1-D array
    df1 = data[target == 0]  # class-0 samples
    df2 = data[target == 1]  # class-1 samples
    n = data.shape[1]
    u1 = df1.mean(0).reshape((1, n))  # class means as row vectors
    u2 = df2.mean(0).reshape((1, n))
    # Within-class scatter matrix (broadcasting centres each class).
    d1 = df1 - u1
    d2 = df2 - u2
    Sw = d1.T.dot(d1) + d2.T.dot(d2)
    # w = (u1 - u2) Sw^{-1}.  Solve the linear system instead of forming the
    # explicit inverse; the original np.mat(...).I is deprecated NumPy API.
    w = np.linalg.solve(Sw.T, (u1 - u2).T).T
    return w
#多类别降至K维的实现
def lda_muliti_class(data, target, K):
    """Multi-class LDA: reduce n-dimensional samples to K dimensions.

    Parameters
    ----------
    data : (m, n) array of samples.
    target : (m,) array of class labels.
    K : target dimensionality; must be at most n_classes - 1.

    Returns
    -------
    (n, K) array whose columns are the top-K discriminant directions.

    Raises
    ------
    ValueError if K > n_classes - 1 (the original printed and called exit(0)).
    """
    # Bare `unique` in the original is a NameError without `from numpy import *`.
    clusters = np.unique(target)
    if K > len(clusters) - 1:
        # LDA yields at most n_classes - 1 discriminant directions.
        raise ValueError("K is too much; it must be at most len(classes) - 1")
    n = data.shape[1]
    # Within-class scatter matrix.
    Sw = np.zeros((n, n))
    for i in clusters:
        datai = data[target == i]
        datai = datai - datai.mean(0)
        Sw += datai.T.dot(datai)
    # Between-class scatter matrix.
    SB = np.zeros((n, n))
    u = data.mean(0)  # overall mean of all samples
    for i in clusters:
        Ni = data[target == i].shape[0]
        di = (data[target == i].mean(0) - u).reshape((n, 1))
        SB += Ni * di.dot(di.T)
    # FIX: the original computed np.linalg.inv(Sw) * SB, which is an
    # ELEMENT-WISE product on ndarrays; the LDA eigenproblem needs the
    # matrix product Sw^{-1} SB.
    S = np.linalg.inv(Sw).dot(SB)
    eigVals, eigVects = np.linalg.eig(S)  # eigenvalues / eigenvectors
    # Indices of the K largest eigenvalues (bare `argsort` was also a
    # NameError without a star import).
    order = np.argsort(eigVals)[:(-K - 1):-1]
    return eigVects[:, order]
# Demo call: requires `data_iris` / `target_iris` arrays to be defined first
# (e.g. loaded from sklearn's iris dataset; not shown in this snippet).
w=lda_muliti_class(data_iris,target_iris,2)    
w    

搜索到的其他实现:

#二类n维降至一维
def calulate_w(df):
    """Two-class LDA direction from a pandas DataFrame.

    Expects `df` to have a `label` column (values 0/1) at column position 0
    and two feature columns at positions 1 and 2.

    Returns
    -------
    (1, 2) array, the direction w proportional to (mean0 - mean1) Sw^{-1}.

    NOTE(review): the misspelled name `calulate_w` is kept so existing
    callers keep working.
    """
    X1 = df[df.label == 1].values[:, 1:3]  # class-1 feature rows
    X0 = df[df.label == 0].values[:, 1:3]  # class-0 feature rows
    mean1 = X1.mean(axis=0)
    mean0 = X0.mean(axis=0)
    # Within-class scatter, vectorized instead of the per-row Python loop.
    # The original also used bare `mean` / `shape` / `zeros` / `mat`, which
    # are NameErrors unless `from numpy import *` was executed beforehand.
    d1 = X1 - mean1
    d0 = X0 - mean0
    sw = d1.T.dot(d1) + d0.T.dot(d0)
    # np.mat(...).I is deprecated; use the explicit linear-algebra call.
    w = (mean0 - mean1).reshape(1, -1).dot(np.linalg.inv(sw))
    return w
# Demo call: requires a DataFrame `df` with a 0/1 `label` column at position
# 0 and two feature columns at positions 1-2 (not constructed in this snippet).
w=calulate_w(df)
w
#多类降至二维,注意是二维而已
def read_iris():
    """Load the iris dataset as a worked example; return (features, labels).

    Labels are shifted from 0..2 to 1..3 so they match the 1-based class
    loops used by the scatter-matrix helpers below.
    """
    from sklearn.datasets import load_iris
    from sklearn import preprocessing
    iris = load_iris()
    features = iris.data
    labels = iris.target + 1  # 0..2 -> 1..3
    # Optional standardization, disabled in the original:
    #preprocessing.scale(data_x, axis=0, with_mean=True, with_std=True, copy=False) 
    return features, labels
    # 特征均值,计算每类的均值,返回一个向量
def class_mean(data, label, clusters):
    """Return a list with the per-class feature-mean vector of each of the
    `clusters` classes, whose labels are assumed to be 1..clusters."""
    return [np.mean(data[label == cl], axis=0) for cl in range(1, clusters + 1)]
    # 计算类内散度
def within_class_SW(data, label, clusters):
    """Within-class scatter matrix S_W.

    Parameters
    ----------
    data : (m, n) array; label : (m,) labels taking values 1..clusters.

    Returns
    -------
    (n, n) array: the sum over classes of the centred scatter of each class.

    The original hard-coded 4 features via reshape(4,1); this version works
    for any feature count, and replaces the per-sample outer-product loop
    with one vectorized product per class.
    """
    m = data.shape[1]
    S_W = np.zeros((m, m))
    for cl in range(1, clusters + 1):
        rows = data[label == cl]
        centred = rows - rows.mean(axis=0)  # subtract this class's mean
        # sum_i (row_i - mv)(row_i - mv)^T  ==  centred^T centred
        S_W += centred.T.dot(centred)
    return S_W

def between_class_SB(data, label, clusters):
    """Between-class scatter matrix S_B = sum_cl N_cl (u_cl - u)(u_cl - u)^T.

    Parameters
    ----------
    data : (m, n) array; label : (m,) labels taking values 1..clusters.

    Returns
    -------
    (n, n) array.

    The original hard-coded 4 features via reshape(4,1); this version works
    for any feature count.
    """
    m = data.shape[1]
    overall_mean = np.mean(data, axis=0).reshape(m, 1)  # grand mean, column vector
    S_B = np.zeros((m, m))
    for cl in range(1, clusters + 1):
        rows = data[label == cl]
        n_cl = rows.shape[0]  # class size weights the outer product
        diff = rows.mean(axis=0).reshape(m, 1) - overall_mean
        S_B += n_cl * diff.dot(diff.T)
    return S_B

def lda():
    """Run 3-class LDA on the iris dataset and return the projection matrix.

    Returns
    -------
    (n_features, 2) array whose columns are the two leading discriminant
    directions (for iris, n_features is 4).
    """
    data, label = read_iris()
    clusters = 3
    S_W = within_class_SW(data, label, clusters)
    S_B = between_class_SB(data, label, clusters)
    # FIX: the original used np.linalg.inv(S_W)*S_B, which is an ELEMENT-WISE
    # product on ndarrays; the LDA eigenproblem needs the matrix product.
    eig_vals, eig_vecs = np.linalg.eig(np.linalg.inv(S_W).dot(S_B))
    # (A dead per-eigenvector loop with only commented-out prints was removed.)
    m = data.shape[1]
    # Sort eigenpairs by |eigenvalue| descending and keep the top two
    # (original hard-coded reshape(4,1); m keeps this shape-generic).
    eig_pairs = sorted(
        ((np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))),
        key=lambda pair: pair[0],
        reverse=True,
    )
    W = np.hstack((eig_pairs[0][1].reshape(m, 1), eig_pairs[1][1].reshape(m, 1)))
    return W
# Demo: project the iris samples onto the 2-D LDA subspace.  Requires
# sklearn (via read_iris) and the helper functions defined above.
data,labels = read_iris()
W = lda()
Y = data.dot(W)
print(W)

参考博客
1. http://blog.csdn.net/qunxingvip/article/details/47283293
2. http://blog.csdn.net/wzmsltw/article/details/51037725

你可能感兴趣的:(机器学习,python学习)