The test set is shown in the figure (note: this is the test set, not the training set).
It was generated with MATLAB, of course.
First, we use the LinearDiscriminantAnalysis class from Python's sklearn package.
The code is as follows:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, recall_score, f1_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import matplotlib.pyplot as plt

# Load the training and test sets (columns: x, y, type)
data = pd.read_csv("train.csv")
y_train = data['type'].to_numpy()
x_train = data[['x', 'y']].to_numpy()
data2 = pd.read_csv("new.csv")
y_test = data2['type'].to_numpy()
x_test = data2[['x', 'y']].to_numpy()

# Fit LDA on the training set and predict the test set
clf = LinearDiscriminantAnalysis()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

con = confusion_matrix(y_test, y_pred)
print(con)
print(accuracy_score(y_test, y_pred),
      precision_score(y_test, y_pred),
      recall_score(y_test, y_pred), f1_score(y_test, y_pred))

# Plot the four outcomes: correct/incorrect prediction for each class
wrong = y_pred != y_test
pos = y_test == 1
plt.scatter(x_test[wrong & pos, 0], x_test[wrong & pos, 1], marker='+', color='green', s=40, label='wrong 1')
plt.scatter(x_test[wrong & ~pos, 0], x_test[wrong & ~pos, 1], marker='x', color='cyan', s=40, label='wrong 0')
plt.scatter(x_test[~wrong & pos, 0], x_test[~wrong & pos, 1], marker='o', color='blue', s=40, label='right 1')
plt.scatter(x_test[~wrong & ~pos, 0], x_test[~wrong & ~pos, 1], marker='v', color='magenta', s=40, label='right 0')
plt.legend(loc='best')
plt.show()
Running this script gives us the final classification results.
In the plot, the blue dots are correctly classified positive examples, the magenta triangles are correctly classified negative examples, the cyan crosses are misclassified negative examples, and the green plus signs are misclassified positive examples. Clearly, some errors are unavoidable; all we can do is look for a learner that is as accurate as possible.
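Since LDA's decision boundary in two dimensions is a straight line, it can be overlaid on this plot directly from the fitted model's coef_ and intercept_ attributes. A minimal sketch, to be run after the fitting code above (it reuses that script's clf and x_test):

# Boundary: a*x + b*y + c = 0, solved for y (assumes b != 0, i.e. a non-vertical line)
a, b = clf.coef_[0]
c = clf.intercept_[0]
xs = np.linspace(x_test[:, 0].min(), x_test[:, 0].max(), 100)
plt.plot(xs, -(a * xs + c) / b, color='black', label='LDA boundary')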
Next we try QDA, using the QuadraticDiscriminantAnalysis class from the same sklearn module:
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, recall_score, f1_score
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
import matplotlib.pyplot as plt

# Load the training and test sets (columns: x, y, type)
data = pd.read_csv("train.csv")
y_train = data['type'].to_numpy()
x_train = data[['x', 'y']].to_numpy()
data2 = pd.read_csv("new.csv")
y_test = data2['type'].to_numpy()
x_test = data2[['x', 'y']].to_numpy()

# Fit QDA on the training set and predict the test set
clf = QuadraticDiscriminantAnalysis()
clf.fit(x_train, y_train)
y_pred = clf.predict(x_test)

con = confusion_matrix(y_test, y_pred)
print(con)
print(accuracy_score(y_test, y_pred),
      precision_score(y_test, y_pred),
      recall_score(y_test, y_pred), f1_score(y_test, y_pred))

# Plot the four outcomes: correct/incorrect prediction for each class
wrong = y_pred != y_test
pos = y_test == 1
plt.scatter(x_test[wrong & pos, 0], x_test[wrong & pos, 1], marker='+', color='green', s=40, label='wrong 1')
plt.scatter(x_test[wrong & ~pos, 0], x_test[wrong & ~pos, 1], marker='x', color='cyan', s=40, label='wrong 0')
plt.scatter(x_test[~wrong & pos, 0], x_test[~wrong & pos, 1], marker='o', color='blue', s=40, label='right 1')
plt.scatter(x_test[~wrong & ~pos, 0], x_test[~wrong & ~pos, 1], marker='v', color='magenta', s=40, label='right 0')
plt.legend(loc='best')
plt.show()
I have to say, this class is quite convenient to use.
The final result turns out slightly better: this time the negative examples are predicted more accurately.
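Unlike LDA, QDA's boundary is a quadratic curve, so it cannot be drawn from a single coefficient vector. One way to visualize it (my own addition, reusing clf and x_test from the script above) is to evaluate the fitted model on a grid and draw the contour where the predicted label flips:

# Evaluate the classifier on a dense grid covering the test data
xx, yy = np.meshgrid(np.linspace(x_test[:, 0].min(), x_test[:, 0].max(), 200),
                     np.linspace(x_test[:, 1].min(), x_test[:, 1].max(), 200))
zz = clf.predict(np.c_[xx.ravel(), yy.ravel()]).reshape(xx.shape)
# The 0.5 level of the 0/1 predictions traces the quadratic boundary
plt.contour(xx, yy, zz, levels=[0.5], colors='black')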
Finally, we implement Fisher's discriminant by hand. The code still borrows a few sklearn functions for the evaluation metrics, but that is beside the point.
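For reference, what the code below computes is the Fisher direction from the watermelon book: with class means $\mu_0, \mu_1$ and within-class scatter matrix $S_w = \sum_k (x_k - \mu_{y_k})(x_k - \mu_{y_k})^T$, the projection direction is $w = S_w^{-1}(\mu_0 - \mu_1)$, and a test point $x$ is assigned to class 0 when its projection $w^T x$ falls on the class-0 side of the midpoint $\frac{1}{2}(w^T \mu_0 + w^T \mu_1)$.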
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, recall_score, f1_score
import matplotlib.pyplot as plt

def meanvector(x, y, num_of_type):
    # Per-class mean vectors; column j holds the mean of class j
    result = np.zeros((num_of_type, x.shape[1]))
    for j in range(num_of_type):
        result[j, :] = x[y == j].mean(axis=0)
    return result.T

def fisher(x, y, num_of_type):
    # Within-class scatter matrix S_w, accumulated sample by sample
    nrow, ncol = x.shape
    mean_vec = meanvector(x, y, num_of_type)
    s_w = np.zeros((ncol, ncol))
    for k in range(nrow):
        d = x[k, :] - mean_vec[:, y[k]]
        s_w += np.outer(d, d)
    # Fisher direction w = S_w^{-1} (mu_0 - mu_1)
    w = np.linalg.inv(s_w).dot(mean_vec[:, 0] - mean_vec[:, 1])
    return w, mean_vec

data = pd.read_csv("train.csv")
y_train = data['type'].to_numpy()
x_train = data[['x', 'y']].to_numpy()
data2 = pd.read_csv("new.csv")
y_test = data2['type'].to_numpy()
x_test = data2[['x', 'y']].to_numpy()

w, mean_vec = fisher(x_train, y_train, 2)
mean0 = w.dot(mean_vec[:, 0])  # projected class-0 mean
mean1 = w.dot(mean_vec[:, 1])  # projected class-1 mean
mid = (mean0 + mean1) / 2      # threshold halfway between the projected means
y_pred = np.zeros(len(y_test), dtype=int)
for i in range(len(y_test)):
    # w points from the class-1 mean toward the class-0 mean, so projections
    # above the midpoint are assigned to class 0
    y_pred[i] = 0 if w.dot(x_test[i, :]) > mid else 1

con = confusion_matrix(y_test, y_pred)
print(con)
print(accuracy_score(y_test, y_pred),
      precision_score(y_test, y_pred),
      recall_score(y_test, y_pred), f1_score(y_test, y_pred))

# Plot the four outcomes: correct/incorrect prediction for each class
wrong = y_pred != y_test
pos = y_test == 1
plt.scatter(x_test[wrong & pos, 0], x_test[wrong & pos, 1], marker='+', color='green', s=40, label='wrong 1')
plt.scatter(x_test[wrong & ~pos, 0], x_test[wrong & ~pos, 1], marker='x', color='cyan', s=40, label='wrong 0')
plt.scatter(x_test[~wrong & pos, 0], x_test[~wrong & pos, 1], marker='o', color='blue', s=40, label='right 1')
plt.scatter(x_test[~wrong & ~pos, 0], x_test[~wrong & ~pos, 1], marker='v', color='magenta', s=40, label='right 0')
plt.legend(loc='best')
plt.show()
We end up with the same predictions as LDA, which suggests that the Fisher discriminant method presented in Zhou Zhihua's watermelon book and sklearn's LDA algorithm are essentially the same. Since I am not deeply familiar with the package internals, all I can say for certain is that their results coincide.
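As a rough sanity check (my own addition, not part of the original experiment): for two classes, sklearn's LDA coefficient vector should be proportional to the hand-computed Fisher direction, because coef_ is built from the inverse pooled covariance times the difference of class means, and S_w differs from the pooled covariance only by a scalar factor. A minimal sketch, assuming x_train, y_train and w from the scripts above are still in scope:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

clf = LinearDiscriminantAnalysis().fit(x_train, y_train)
# coef_ points from class 0 toward class 1, i.e. opposite to our w, so the
# elementwise ratios should be (nearly) identical negative numbers
print(clf.coef_[0] / w)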
Evidently QDA does handle this Gaussian-distributed data somewhat better. The catch is that QDA is not well suited to dimensionality reduction, so LDA keeps its mainstream advantage, and far more material is available for it. Still, the two methods are very close in spirit.
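To illustrate that last point, a small sketch of my own (not part of the experiment above): sklearn's LDA doubles as a supervised dimensionality reducer through its transform method, which is precisely the projection onto the Fisher direction; with two classes it can produce at most one component.

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Project the 2-D points onto the single discriminant axis
lda = LinearDiscriminantAnalysis(n_components=1).fit(x_train, y_train)
x_test_1d = lda.transform(x_test)  # shape (n_samples, 1)
print(x_test_1d[:5])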