"""Gaussian Naive Bayes on the ecoli dataset, written with NumPy.

The module exposes the three modelling steps as importable functions
(``prior``, ``likelihood``, ``naiveBayesClassify``).  When run as a script it
loads ``ecoli.mat``, classifies the test split, writes accuracy / precision /
recall to ``evaluation.txt`` and shows one precision/recall curve per class.
"""
import os

import numpy as np
from scipy import io
from scipy import stats

# Class labels in the dataset are the integers 1..NUM_CLASSES.
NUM_CLASSES = 5

# Directory holding ecoli.mat; evaluation.txt is written next to it.
DATA_DIR = 'F:\\machine_learning\\yaling\\hw_3\\naiveBayes\\naiveBayes'

# Lower bound applied to the per-class feature variances before they are used
# as a divisor, so a feature that is constant within a class does not produce
# a division by zero.
_MIN_VARIANCE = 1e-12


def prior(yTrain, num_classes=NUM_CLASSES):
    '''
    Estimate the prior probability of every class from the training labels.

    :param yTrain: class labels of the training samples, integers in
                   1..num_classes; shape (num_train, 1) or (num_train,)
    :param num_classes: number of classes (defaults to 5)
    :return: numpy.ndarray of shape (num_classes, 1); row c-1 is the fraction
             of training samples labelled c
    :raises ValueError: if yTrain contains no labels
    '''
    labels = np.asarray(yTrain).reshape(-1)
    if labels.size == 0:
        raise ValueError('yTrain must contain at least one label')

    class_ids = np.arange(1, num_classes + 1)
    # counts[c-1] = number of samples whose label equals c
    counts = (labels[:, np.newaxis] == class_ids).sum(axis=0)
    return (counts / labels.size).reshape(-1, 1)


def likelihood(xTrain, yTrain, num_classes=NUM_CLASSES):
    '''
    Fit the class-conditional Gaussian of every feature.

    :param xTrain: training feature matrix, shape (num_train, num_feat)
    :param yTrain: training labels, integers in 1..num_classes;
                   shape (num_train, 1) or (num_train,)
    :param num_classes: number of classes (defaults to 5)
    :return: (M, V), both numpy.ndarray of shape (num_feat, num_classes).
             M[m, c] is the mean and V[m, c] the (population) variance of
             feature m over the training samples of class c+1.
             A class without any training sample keeps mean 0 and variance 1.
    :raises ValueError: if xTrain is not 2-D or the number of labels does not
                        match the number of rows of xTrain
    '''
    features = np.asarray(xTrain, dtype=float)
    labels = np.asarray(yTrain).reshape(-1)
    if features.ndim != 2:
        raise ValueError('xTrain must be a 2-D array')
    if labels.shape[0] != features.shape[0]:
        raise ValueError('xTrain and yTrain must have the same number of samples')

    num_feat = features.shape[1]
    M = np.zeros((num_feat, num_classes))
    V = np.ones((num_feat, num_classes))
    for c in range(num_classes):
        members = features[labels == c + 1]
        if members.shape[0] == 0:
            # Mean/variance of an empty slice would be NaN, and NaN wins every
            # argmax downstream.  Keep the neutral defaults instead.
            continue
        M[:, c] = members.mean(axis=0)
        V[:, c] = members.var(axis=0)
    return M, V


def naiveBayesClassify(xTest, M, V, p):
    '''
    Predict the class of every test sample with Gaussian Naive Bayes.

    :param xTest: test feature matrix, shape (num_test, num_feat)
    :param M: class-conditional means, shape (num_feat, num_classes)
    :param V: class-conditional variances, shape (num_feat, num_classes)
    :param p: class priors, shape (num_classes, 1)
    :return: (nb, prob)
             nb:   predicted labels in 1..num_classes, float array of shape
                   (num_test, 1)
             prob: joint density prior * prod(pdf) of the predicted class,
                   shape (num_test, 1).  This is a density, not a probability,
                   so it may exceed 1 (e.g. a uniform distribution on
                   [0, 0.2] has pdf(x) = 5 for 0 <= x <= 0.2).
    :raises ValueError: if the shapes of xTest, M, V and p are inconsistent
    '''
    samples = np.asarray(xTest, dtype=float)
    means = np.asarray(M, dtype=float)
    variances = np.maximum(np.asarray(V, dtype=float), _MIN_VARIANCE)
    priors = np.asarray(p, dtype=float).reshape(-1)

    if samples.ndim != 2 or means.ndim != 2:
        raise ValueError('xTest and M must be 2-D arrays')
    if means.shape != variances.shape:
        raise ValueError('M and V must have the same shape')
    if samples.shape[1] != means.shape[0]:
        raise ValueError('xTest and M disagree on the number of features')
    if priors.shape[0] != means.shape[1]:
        raise ValueError('p and M disagree on the number of classes')

    num_test = samples.shape[0]
    if num_test == 0:
        return np.zeros((0, 1)), np.zeros((0, 1))

    # diff[i, m, c] = xTest[i, m] - M[m, c]
    diff = samples[:, :, np.newaxis] - means[np.newaxis, :, :]
    # Log of the Gaussian pdf.  Summing logs instead of multiplying densities
    # keeps the comparison between classes meaningful when the plain product
    # would underflow to 0 for every class.
    log_pdf = -0.5 * (diff ** 2 / variances + np.log(2.0 * np.pi * variances))
    with np.errstate(divide='ignore'):
        # log(0) = -inf for a class with zero prior: it can never be selected
        # over a class with a finite score.
        log_joint = log_pdf.sum(axis=1) + np.log(priors)

    best = np.argmax(log_joint, axis=1)
    nb = (best + 1).astype(float).reshape(-1, 1)
    prob = np.exp(log_joint[np.arange(num_test), best]).reshape(-1, 1)
    return nb, prob


def PR_curve():
    '''Placeholder; the precision/recall curves are drawn in ``main``.'''
    return


def _precision_recall(pred, truth, label):
    '''Return (precision, recall) of class ``label``; NaN when undefined.'''
    pred = np.asarray(pred).reshape(-1)
    truth = np.asarray(truth).reshape(-1)
    true_pos = np.sum((pred == label) & (truth == label))
    num_pred = np.sum(pred == label)
    num_truth = np.sum(truth == label)
    # precision = TP / (TP + FP): share of predicted positives that are right
    precision = true_pos / num_pred if num_pred else float('nan')
    # recall = TP / (TP + FN): share of ground-truth positives that are found
    recall = true_pos / num_truth if num_truth else float('nan')
    return precision, recall


def main():
    '''Train on ecoli.mat, evaluate on its test split, report and plot.'''
    # Imported here so that the NumPy-only functions above stay importable
    # without the plotting / scikit-learn stack installed.
    import matplotlib.pyplot as plt
    from sklearn.metrics import precision_recall_curve
    from sklearn.naive_bayes import GaussianNB

    mat = io.loadmat(os.path.join(DATA_DIR, 'ecoli.mat'))
    print(type(mat))
    xTrain = mat['xTrain']
    yTrain = mat['yTrain']
    xTest = mat['xTest']
    yTest = mat['yTest']
    print(type(xTest), xTest.shape)
    print(type(yTest), yTest.shape, list(set(list(np.squeeze(yTest, 1)))))

    # Naive Bayes steps:
    # 1. prior probability of every class
    p = prior(yTrain)

    # 2. class-conditional Gaussian of every feature: M[m, c] / V[m, c] are
    #    the mean / variance of feature m given class c+1
    M, V = likelihood(xTrain, yTrain)

    # 3. pick, for every test sample, the class maximising the joint density
    nb, prob = naiveBayesClassify(xTest, M, V, p)

    # Reference implementation used as a cross-check of the predictions.
    # ravel(): fit() expects a 1-D label vector.
    clf = GaussianNB()
    clf.fit(xTrain, np.ravel(yTrain))
    package_pred = clf.predict(xTest)
    print('sklearn agreement', np.mean(package_pred == nb.ravel()))

    # 4. evaluate on the test split and write the numbers to evaluation.txt
    #    accuracy = correctly predicted samples / all test samples
    accuracy = np.sum(nb == yTest) / yTest.shape[0]
    print(accuracy)

    prec_1, rec_1 = _precision_recall(nb, yTest, 1)
    prec_5, rec_5 = _precision_recall(nb, yTest, 5)

    with open(os.path.join(DATA_DIR, 'evaluation.txt'), 'w') as eval_file:
        eval_file.write('accuracy:%.4f' % (accuracy))
        eval_file.write('\n')
        eval_file.write('precision_1:%.4f' % (prec_1))
        eval_file.write('\n')
        eval_file.write('recall_1:%.4f' % (rec_1))
        eval_file.write('\n')
        eval_file.write('precision_5:%.4f' % (prec_5))
        eval_file.write('\n')
        eval_file.write('recall_5:%.4f' % (rec_5))
        eval_file.write('\n')

    # One precision/recall curve per class (one-vs-rest).
    truth = np.squeeze(yTest, axis=1)
    pred = np.squeeze(nb, axis=1)
    scores_all = np.squeeze(prob, axis=1)
    for i in range(NUM_CLASSES):
        # Binary ground truth: 1 for the current class, 0 otherwise.
        temp_gt = np.where(truth == i + 1, 1, 0)
        # Score of the current class: the joint density when it was the
        # predicted class, 0 otherwise.
        temp_pred = np.where(pred == i + 1, scores_all, 0)
        # precision_recall_curve orders the scores itself; no pre-sorting
        # is needed.
        precision, recall, _ = precision_recall_curve(temp_gt, temp_pred)

        plt.figure()
        plt.title('num_classes=' + str(i + 1) + ' Precision/Recall Curve')
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.plot(recall, precision)  # recall on the x axis, precision on y
        plt.show()


if __name__ == '__main__':
    main()