I won't go over the theory of LDA (Linear Discriminant Analysis) here; readers who are unfamiliar with it can look up the relevant material. Below is my own Python implementation, followed by other implementations found online.
My implementation:
import numpy as np

# Linear Discriminant Analysis (LDA)
# Two classes, n dimensions reduced to one dimension.
# data and target are both arrays; target holds the labels 0 and 1.
def lda(data, target):
    target = target.flatten()            # flatten target into a 1-D array
    df1 = data[target == 0]              # class 0 samples
    df2 = data[target == 1]              # class 1 samples
    n = data.shape[1]
    u1 = df1.mean(0).reshape((1, n))     # mean vector of class 0
    u2 = df2.mean(0).reshape((1, n))     # mean vector of class 1
    data_mean_1 = df1 - u1               # centered via numpy broadcasting
    data_mean_2 = df2 - u2
    # within-class scatter matrix Sw
    Sw = data_mean_1.T.dot(data_mean_1) + data_mean_2.T.dot(data_mean_2)
    # projection direction: w proportional to (u1 - u2) * Sw^{-1}
    w = (u1 - u2).dot(np.linalg.inv(Sw))
    return w
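As a quick sanity check, here is a minimal usage sketch; the two Gaussian clouds below are made up purely for illustration:

# Minimal usage sketch (synthetic data, for illustration only)
np.random.seed(0)
x0 = np.random.randn(50, 2)              # class 0, centered at the origin
x1 = np.random.randn(50, 2) + [3, 3]     # class 1, shifted away
data = np.vstack([x0, x1])
target = np.array([0] * 50 + [1] * 50)

w = lda(data, target)                    # shape (1, 2)
proj = data.dot(w.T)                     # 1-D projection of every sample
print(proj[:5].ravel(), proj[-5:].ravel())   # the two classes separate along w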
# Multi-class LDA: reduce to K dimensions
def lda_multi_class(data, target, K):
    clusters = np.unique(target)
    # at most (number of classes - 1) discriminant directions exist
    if K > len(clusters) - 1:
        raise ValueError("K must be at most (number of classes - 1)")
    # within-class scatter matrix Sw
    Sw = np.zeros((data.shape[1], data.shape[1]))
    for i in clusters:
        datai = data[target == i]
        datai = datai - datai.mean(0)
        Sw += datai.T.dot(datai)
    # between-class scatter matrix SB
    SB = np.zeros((data.shape[1], data.shape[1]))
    u = data.mean(0)                         # mean of all samples
    for i in clusters:
        Ni = data[target == i].shape[0]      # number of samples in class i
        ui = data[target == i].mean(0)       # mean of class i
        diff = (ui - u).reshape(-1, 1)
        SB += Ni * diff.dot(diff.T)
    # solve Sw^{-1} SB for its leading eigenvectors
    S = np.linalg.inv(Sw).dot(SB)            # matrix product, not element-wise
    eigVals, eigVects = np.linalg.eig(S)     # eigenvalues and eigenvectors
    eigValInd = np.argsort(eigVals.real)     # ascending order of eigenvalues
    eigValInd = eigValInd[:(-K - 1):-1]      # indices of the K largest
    w = eigVects[:, eigValInd].real
    return w
# example usage: project the iris data onto two discriminant directions
w = lda_multi_class(data_iris, target_iris, 2)
w
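Here data_iris and target_iris are assumed to be the iris features and labels; a minimal sketch of preparing them with sklearn and projecting:

# A sketch of how data_iris / target_iris might be prepared (assumes sklearn)
from sklearn.datasets import load_iris

iris = load_iris()
data_iris, target_iris = iris.data, iris.target

w = lda_multi_class(data_iris, target_iris, 2)   # 4 features -> 2 directions
Y = data_iris.dot(w)                             # projected samples, shape (150, 2)
print(Y.shape)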
Other implementations found online:
# Two classes, n dimensions reduced to one dimension
# (expects a pandas DataFrame df with a 'label' column and two feature columns)
import numpy as np

def calulate_w(df):
    df1 = df[df.label == 1]
    df2 = df[df.label == 0]
    X1 = df1.values[:, 1:3].astype(float)    # feature columns of class 1
    X0 = df2.values[:, 1:3].astype(float)    # feature columns of class 0
    mean1 = np.array([np.mean(X1[:, 0]), np.mean(X1[:, 1])])
    mean0 = np.array([np.mean(X0[:, 0]), np.mean(X0[:, 1])])
    sw = np.zeros((2, 2))                    # within-class scatter matrix
    for i in range(X1.shape[0]):
        xsmean = (X1[i, :] - mean1).reshape(1, 2)
        sw += xsmean.T.dot(xsmean)
    for i in range(X0.shape[0]):
        xsmean = (X0[i, :] - mean0).reshape(1, 2)
        sw += xsmean.T.dot(xsmean)
    w = (mean0 - mean1).dot(np.linalg.inv(sw))   # projection direction
    return w
w = calulate_w(df)
w
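The DataFrame df is not defined above; here is a hypothetical one matching the layout the function expects (column 0 an id, columns 1-2 the features, plus a 'label' column; the column names and data are made up):

# A hypothetical DataFrame for calulate_w (toy data, for illustration only)
import pandas as pd

np.random.seed(1)
x_a = np.random.randn(20, 2)             # class 0 samples
x_b = np.random.randn(20, 2) + [2, 2]    # class 1 samples
X = np.vstack([x_a, x_b])
df = pd.DataFrame({
    'id': range(40),
    'f1': X[:, 0],
    'f2': X[:, 1],
    'label': [0] * 20 + [1] * 20,
})
w = calulate_w(df)   # 1-D projection direction, shape (2,)
print(w)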
# Multi-class, reduced to two dimensions (note: two dimensions only)
def read_iris():   # load the iris dataset as an example
    from sklearn.datasets import load_iris
    data_set = load_iris()
    data_x = data_set.data
    label = data_set.target + 1          # relabel the classes as 1, 2, 3
    # optionally standardize the features first:
    # from sklearn import preprocessing
    # preprocessing.scale(data_x, axis=0, with_mean=True, with_std=True, copy=False)
    return data_x, label

# Compute the mean of each class; returns a list of mean vectors
def class_mean(data, label, clusters):
    mean_vectors = []
    for cl in range(1, clusters + 1):
        mean_vectors.append(np.mean(data[label == cl, :], axis=0))
    return mean_vectors

# Within-class scatter matrix
def within_class_SW(data, label, clusters):
    m = data.shape[1]
    S_W = np.zeros((m, m))
    mean_vectors = class_mean(data, label, clusters)
    for cl, mv in zip(range(1, clusters + 1), mean_vectors):
        class_sc_mat = np.zeros((m, m))
        # accumulate the outer product for every sample of class cl
        for row in data[label == cl]:
            row, mv = row.reshape(m, 1), mv.reshape(m, 1)
            class_sc_mat += (row - mv).dot((row - mv).T)
        S_W += class_sc_mat
    return S_W

# Between-class scatter matrix
def between_class_SB(data, label, clusters):
    m = data.shape[1]
    all_mean = np.mean(data, axis=0)
    S_B = np.zeros((m, m))
    mean_vectors = class_mean(data, label, clusters)
    for cl, mean_vec in enumerate(mean_vectors):
        n = data[label == cl + 1, :].shape[0]
        mean_vec = mean_vec.reshape(m, 1)    # make column vector
        all_mean = all_mean.reshape(m, 1)    # make column vector
        S_B += n * (mean_vec - all_mean).dot((mean_vec - all_mean).T)
    return S_B

# note: this lda() (no arguments) is specific to the iris example above
def lda():
    data, label = read_iris()
    clusters = 3
    S_W = within_class_SW(data, label, clusters)
    S_B = between_class_SB(data, label, clusters)
    # matrix product, not element-wise multiplication
    eig_vals, eig_vecs = np.linalg.eig(np.linalg.inv(S_W).dot(S_B))
    # sort (eigenvalue, eigenvector) pairs by eigenvalue magnitude, descending
    eig_pairs = [(np.abs(eig_vals[i]), eig_vecs[:, i]) for i in range(len(eig_vals))]
    eig_pairs = sorted(eig_pairs, key=lambda k: k[0], reverse=True)
    # keep the two leading eigenvectors as the projection matrix W
    W = np.hstack((eig_pairs[0][1].reshape(4, 1), eig_pairs[1][1].reshape(4, 1)))
    return W.real

data, labels = read_iris()
W = lda()
Y = data.dot(W)    # iris data projected to two dimensions
print(W)
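As a sanity check, the result can be compared against sklearn's built-in LDA transformer; the recovered two-dimensional subspace should agree up to sign and scale. A quick sketch:

# Cross-check with sklearn's LinearDiscriminantAnalysis
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

data, labels = read_iris()
sk_lda = LinearDiscriminantAnalysis(n_components=2)
Y_sk = sk_lda.fit_transform(data, labels)
print(Y_sk.shape)    # (150, 2), comparable to Y above up to sign/scale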
Reference blogs:
1. http://blog.csdn.net/qunxingvip/article/details/47283293
2. http://blog.csdn.net/wzmsltw/article/details/51037725