BreastCancer和wine数据集分别对应单分类和多分类问题。本文主要根据完整的数据分析步骤,使 用 Python 对 数 据 集 进 行 集 合 划 分 、 模 型 训 练 和 预 测 以 及 模 型 评 估 。
目的:熟 悉 理 论 知 识 、pandas/sklearn/matplotlib 等常用库、评估指标计算函数和 ROC 曲线的绘制。本文主要采用逻辑回归模型,对比二分类和多分类问题的异同,以及该模型代码在分类问题中实现的异同。
二分类模型的最终结果可能出现4种情况(混淆矩阵)【P/N 指预测出的类别,T/F 指预测是否与实际值相符】
预测值 | 预测值 | 合计 | ||
---|---|---|---|---|
1 | 0 | |||
实际值 | 1 | True Positive | False Negative | Actual Positive (TP+FN) |
实际值 | 0 | False Positive | True Negative | Actual Negative (FP+TN) |
合计 | Predicted Positive (TP+FP) | Predicted Negative (FN+TN) | P+N |
注:TP真正例/FP假正例/FN假负例/TN真负例
评估指标:
分类问题
模式识别系列(六)多分类策略
逻辑回归多分类的Python实现
机器学习系列(七)——分类问题(classification)
sklearn中绘制 ROC 曲线的函数 roc_curve() 解释
python实现二分类和多分类的ROC曲线
BreastCancer数据集逻辑回归分析(二分类)
'''BreastCancer: binary-classification example'''
# Fixed: the original docstring said "wine", but this section loads the
# breast-cancer dataset (the wine/multi-class section comes later).
from sklearn.datasets import load_breast_cancer
import pandas as pd
BreastCancer = load_breast_cancer()
# Wrap features and target in DataFrames so they are easy to inspect.
x = pd.DataFrame(BreastCancer.data, columns=BreastCancer.feature_names)
y = pd.DataFrame(BreastCancer.target, columns=['Class'])
print(x)
print(y)
# Concatenate column-wise so each row keeps its label.
df = pd.concat([x, y], axis=1)
df.head()
# Slice the combined frame back into features/label, then make a shuffled
# train/test partition (sklearn's default split ratio is 75%/25%).
from sklearn.model_selection import train_test_split
x = df.loc[:, :'worst fractal dimension']
y = df.loc[:, 'Class']
x_train, x_test, y_train, y_test = train_test_split(x, y, shuffle=True)
# Fit a logistic-regression classifier on the training split.
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(solver='liblinear')
model.fit(x_train, y_train)
# score() returns R^2 for regressors and accuracy for classifiers; the
# target here is categorical, so this evaluates test-set accuracy.
# Fixed: the original glued a string literal directly onto this call
# (`model.score(...)'''...'''`), which is a SyntaxError.
model.score(x_test, y_test)
# Predicted class probabilities (one column per class) and hard labels.
pre_ex1 = model.predict_proba(x_test)
print('so the probabilities are:\n', pre_ex1)
pre_ex2 = model.predict(x_test)
print('so the class is :\n', pre_ex2)
# Model evaluation
## Accuracy: fraction of correctly classified samples.
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, pre_ex2)
print('The Accuracy score is {:.4f} %'.format(acc*100))
## Recall under the three averaging schemes.
from sklearn.metrics import recall_score
r1 = recall_score(y_test, pre_ex2, average='micro')
r2 = recall_score(y_test, pre_ex2, average='macro')
r3 = recall_score(y_test, pre_ex2, average='weighted')
print('The recall scores are {:.4f} % for micro, {:.4f} % for macro and {:.4f} % for weighted'.format(r1*100, r2*100, r3*100))
## Precision
from sklearn.metrics import precision_score
p1 = precision_score(y_test, pre_ex2, average='micro')
p2 = precision_score(y_test, pre_ex2, average='macro')
p3 = precision_score(y_test, pre_ex2, average='weighted')
print('The precision scores are {:.4f} % for micro, {:.4f} % for macro and {:.4f} % for weighted'.format(p1*100, p2*100, p3*100))
## F1: harmonic mean of precision and recall.
from sklearn.metrics import f1_score
f1_mi = f1_score(y_test, pre_ex2, average='micro')
f1_ma = f1_score(y_test, pre_ex2, average='macro')
f1_we = f1_score(y_test, pre_ex2, average='weighted')
# Fixed: this message previously said "precision scores" for the F1 values.
print('The F1 scores are {:.4f} % for micro, {:.4f} % for macro and {:.4f} % for weighted'.format(f1_mi*100, f1_ma*100, f1_we*100))
## ROC & AUC
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
# roc_curve()'s third argument is y_score: a probability of the positive
# class or any confidence/decision value.  For plain binary classifiers
# use the positive-class probability (TPR/FPR are defined with respect to
# the positive class); for SVMs use model.decision_function(x_test);
# otherwise model.predict_proba(x_test)[:, 1].
# (Fixed comment typo: the method is predict_proba, not predict_prob.)
fpr, tpr, threshold = roc_curve(y_test, pre_ex1[:, 1])
# pre_ex1 has one probability column per class; keep only class 1's column.
print('The fpr and tpr are:\n and \n', fpr, tpr)
roc_auc = auc(fpr, tpr)
print('The roc_auc is {:.2f}'.format(roc_auc))
# NOTE: feeding the negative-class probability flips the curve and the
# two AUC values are complementary.
plt.plot(fpr, tpr, label='ROC')  # TPR vs FPR across all thresholds
plt.xlabel('FPR')
plt.ylabel('TPR')
# Fixed: the curve carries a label but the legend was never drawn, and the
# figure was never shown when run as a plain script.
plt.legend(loc='lower right')
plt.show()
wine数据集逻辑回归分析(多分类)
'''wine: multi-class classification example (3 classes)'''
# Fixed: the original docstring said "BreastCancer", but this section
# loads the wine dataset.
from sklearn.datasets import load_wine
import pandas as pd
wine = load_wine()
# Wrap features and target in DataFrames so they are easy to inspect.
x = pd.DataFrame(wine.data, columns=wine.feature_names)
y = pd.DataFrame(wine.target, columns=['Class'])
print(x)
print(y)
df = pd.concat([x, y], axis=1)
df.head()
# Split the data, then fit a multinomial logistic-regression model.
from sklearn.model_selection import train_test_split
features = df.loc[:, :'proline']
labels = df.loc[:, 'Class']
x_train, x_test, y_train, y_test = train_test_split(features, labels, shuffle=True)
from sklearn.linear_model import LogisticRegression
# For a true multinomial (softmax) model pass multi_class='multinomial';
# the liblinear solver does not support it, so use lbfgs instead.
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
model.fit(x_train, y_train)
model.score(x_test, y_test)
# Per-class probabilities and hard class predictions on the test set.
pre_ex1 = model.predict_proba(x_test)
print('so the probabilities are:\n', pre_ex1)
pre_ex2 = model.predict(x_test)
print('so the class is :\n', pre_ex2)
# Model evaluation
## Accuracy: fraction of correctly classified samples.
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_test, pre_ex2)
print('The Accuracy score is {:.4f} %'.format(acc*100))
## Recall under the three averaging schemes.
from sklearn.metrics import recall_score
r1 = recall_score(y_test, pre_ex2, average='micro')
r2 = recall_score(y_test, pre_ex2, average='macro')
r3 = recall_score(y_test, pre_ex2, average='weighted')
print('The recall scores are {:.4f} % for micro, {:.4f} % for macro and {:.4f} % for weighted'.format(r1*100, r2*100, r3*100))
## Precision
from sklearn.metrics import precision_score
p1 = precision_score(y_test, pre_ex2, average='micro')
p2 = precision_score(y_test, pre_ex2, average='macro')
p3 = precision_score(y_test, pre_ex2, average='weighted')
print('The precision scores are {:.4f} % for micro, {:.4f} % for macro and {:.4f} % for weighted'.format(p1*100, p2*100, p3*100))
## F1: harmonic mean of precision and recall.
from sklearn.metrics import f1_score
f1_mi = f1_score(y_test, pre_ex2, average='micro')
f1_ma = f1_score(y_test, pre_ex2, average='macro')
f1_we = f1_score(y_test, pre_ex2, average='weighted')
# Fixed: this message previously said "precision scores" for the F1 values.
print('The F1 scores are {:.4f} % for micro, {:.4f} % for macro and {:.4f} % for weighted'.format(f1_mi*100, f1_ma*100, f1_we*100))
## ROC & AUC (multi-class)
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
# Recap: roc_curve() sweeps a threshold over the supplied scores, derives
# 0/1 predictions at each threshold, and computes TPR/FPR against the true
# labels.  It only handles binary targets, so for a multi-class problem the
# labels must first be one-hot encoded with label_binarize().  Two
# aggregation strategies then exist:
#   1) compute a curve per class and average them afterwards -> "macro";
#   2) ravel() the indicator matrix and the score matrix into one long
#      binary problem (1 = positive position, 0 = negative) -> "micro".
# (These match the `average` options of roc_auc_score.)
fpr = {}
tpr = {}
roc_auc = {}
# One-hot indicator labels, one column per class.
from sklearn.preprocessing import label_binarize
y_test = label_binarize(y_test, classes=[0, 1, 2])
n_classes = y_test.shape[1]
# Per-class ROC curve and AUC.
for cls in range(n_classes):
    fpr[cls], tpr[cls], threshold = roc_curve(y_test[:, cls], pre_ex1[:, cls])
    roc_auc[cls] = auc(fpr[cls], tpr[cls])
# Micro-average (method 2): flatten everything into one binary problem.
fpr["micro"], tpr["micro"], threshold = roc_curve(y_test.ravel(), pre_ex1.ravel())
roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
# Macro-average (method 1): interpolate every per-class curve onto a
# common FPR grid, average the TPRs, then compute the AUC.
import numpy as np
# The union of all per-class FPR points serves as the common grid.
all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
mean_tpr = np.zeros_like(all_fpr)
# Fixed: `from scipy import interp` is deprecated and removed in modern
# SciPy; it was an alias for np.interp, which is used directly here.
for i in range(n_classes):
    mean_tpr += np.interp(all_fpr, fpr[i], tpr[i])
# Average over classes and compute the macro AUC.
mean_tpr /= n_classes
fpr["macro"] = all_fpr
tpr["macro"] = mean_tpr
roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
# Draw every ROC curve: micro/macro averages plus one curve per class.
'''Example'''
from itertools import cycle
lw = 2
plt.figure()
plt.plot(fpr["micro"], tpr["micro"],
         label='micro-average ROC curve (area = {0:0.2f})'.format(roc_auc["micro"]),
         color='deeppink', linestyle=':', linewidth=4)
plt.plot(fpr["macro"], tpr["macro"],
         label='macro-average ROC curve (area = {0:0.2f})'.format(roc_auc["macro"]),
         color='navy', linestyle=':', linewidth=4)
# One colour per class, cycling if there are more classes than colours.
colors = cycle(['aqua', 'darkorange', 'cornflowerblue'])
for i, color in zip(range(n_classes), colors):
    plt.plot(fpr[i], tpr[i], color=color, lw=lw,
             label='ROC curve of class {0} (area = {1:0.2f})'.format(i, roc_auc[i]))
# Chance diagonal for reference.
plt.plot([0, 1], [0, 1], 'k--', lw=lw)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Some extension of Receiver operating characteristic to multi-class')
plt.legend(loc="lower right")
plt.show()
深入来讲,多分类还能够采取1VN、1V1和softmax方法等,也可以应用决策树、SVM等模型算法。