步骤:
1.选择并构建训练模型model
2.将训练模型model投入到GridSearchCV中,得到GridSearchCV模型grid_model
3.用grid_model拟合训练集数据,选择在validation_dataset上效果最好的参数的模型best_estimator
4.1.用best_estimator预测训练集(得到的结果应该与之前不同,因为之前用交叉验证等方法对训练集进行了分割)
4.2.用best_estimator预测测试集
5.结果可视化:ROC曲线(曲线下面积为AUC),PR曲线(曲线下面积为AUPR)
一.数据
【数据准备】
数据集 | x的Size | y的Size |
训练集 | (1206, 294) | (1206,) |
测试集 | (64, 294) | (64,) |
二.主模型
【搭建环境】
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.metrics import auc,roc_auc_score,roc_curve,precision_recall_curve
from sklearn.metrics import confusion_matrix,make_scorer
【模型准备】
# Fixed seed so repeated runs of the search are comparable.
seed = 1231
np.random.seed(seed)

# Placeholder for the data-preparation step: the four arrays must already
# exist at this point; evaluating the right-hand side raises NameError early
# if they do not.
x_train,y_train,x_test,y_test = x_train,y_train,x_test,y_test

# Display names, estimators and parameter grids are matched by position.
names = ['Decision Tree', 'Random Forest']
# random_state is passed explicitly: seeding NumPy's global generator alone
# does not reliably reach the worker processes used by n_jobs=-1.
classifiers = [DecisionTreeClassifier(random_state=seed),RandomForestClassifier(random_state=seed)]

# 'auto' is intentionally not listed for max_features: for classifiers it was
# an alias of 'sqrt' (a duplicate candidate) and current scikit-learn releases
# reject it.
parameter_dtc = {'max_features':['sqrt','log2',None],'max_depth':range(3,100,2)}
parameter_rfc = {'n_estimators':range(5,200,5),'max_features':['sqrt','log2',None],'max_depth':range(3,100,2)}

# One grid per entry of `classifiers`. The original list also named
# parameter_ada_dtc and parameter_mlp, which are never defined and raised
# NameError before any model could run.
parameters = [parameter_dtc,parameter_rfc]

# Metrics recorded for every grid-search candidate, keyed by the name used in
# the cv_results_ column names (e.g. 'mean_test_roc_auc').
scoring = {'roc_auc':'roc_auc','accuracy':'accuracy', 'precision':'precision','recall':'recall','f1':'f1'}
【主模型函数】
def gird_search_model(clf,param,name,x_train,y_train,x_test,y_test):
    """Grid-search ``clf`` with 5-fold CV and evaluate the refit best estimator.

    The search records every metric in the module-level ``scoring`` dict and
    refits the candidate with the best cross-validated ``'roc_auc'`` on the
    whole training set. That estimator is then scored on the training and
    test data, and all results are printed.

    Args:
        clf: Unfitted classifier to tune.
        param: Parameter grid passed to ``GridSearchCV``.
        name: Display name of the classifier, used in the printed report.
        x_train, y_train: Training features and labels.
        x_test, y_test: Held-out features and labels.

    Returns:
        A tuple ``(cv_results, score_list, y_train_pred, y_test_pred)``:
        ``cv_results`` is the ``cv_results_`` table indexed by ``params``;
        ``score_list`` is ``[train_scores, test_scores]``, each ordered as
        accuracy, precision, recall, f1, roc_auc, aupr; the last two items
        are the hard class predictions for the training and test sets.
    """
    search = GridSearchCV(clf,param,cv=5,verbose=2,scoring=scoring,refit='roc_auc',n_jobs=-1,return_train_score=True)
    search.fit(x_train,y_train)
    best = search.best_estimator_

    # Hard labels feed the threshold-dependent metrics and are returned as-is.
    y_train_pred = best.predict(x_train)
    y_test_pred = best.predict(x_test)
    # Ranking metrics (ROC-AUC, AUPR) need continuous scores: computing them
    # from 0/1 labels reduces each curve to a single operating point and is
    # not comparable with the cross-validated 'roc_auc' printed below.
    y_train_score = _positive_class_scores(best,x_train)
    y_test_score = _positive_class_scores(best,x_test)

    # Per-candidate CV summary: mean and std of each metric, train then test.
    cv_results = pd.DataFrame(search.cv_results_).set_index(['params'])
    cv_metric_names = ['accuracy','f1','precision','recall','roc_auc']
    mean_columns = ['mean_%s_%s' % (split,metric) for split in ('train','test') for metric in cv_metric_names]
    std_columns = ['std_%s_%s' % (split,metric) for split in ('train','test') for metric in cv_metric_names]
    cv_results_mean = cv_results[mean_columns]
    cv_results_std = cv_results[std_columns]

    # ------------------- report -------------------
    print('MODEL : %r' % name)
    print('Best cv_test_roc_auc: %f using %s' % (search.best_score_,search.best_params_))
    print(cv_results_mean)
    print(cv_results_std)

    # Same order as the score_df columns below.
    label_metrics = [accuracy_score,precision_score,recall_score,f1_score]
    ranking_metrics = [roc_auc_score,aupr]
    train_score_list = ([metric(y_train,y_train_pred) for metric in label_metrics]
                        + [metric(y_train,y_train_score) for metric in ranking_metrics])
    test_score_list = ([metric(y_test,y_test_pred) for metric in label_metrics]
                       + [metric(y_test,y_test_score) for metric in ranking_metrics])
    score_list = [train_score_list,test_score_list]

    score_df = pd.DataFrame(score_list,index = ['train','test'],columns = ['accuracy','precision','recall','f1','roc_auc','aupr'])
    print('EVALUATE_METRICS:')
    print(score_df)
    return cv_results,score_list,y_train_pred,y_test_pred


def _positive_class_scores(estimator,x):
    """Return continuous positive-class scores for ``x``, or hard labels as a last resort."""
    if hasattr(estimator,'decision_function'):
        return estimator.decision_function(x)
    if hasattr(estimator,'predict_proba'):
        # Column 1 is the probability of the second (greater) class label.
        return estimator.predict_proba(x)[:,1]
    return estimator.predict(x)
【单个模型执行过程】
【多个模型循环执行】
# Run the grid search once per (classifier, grid, name) triple and collect
# each model's final scores and predictions for the summary tables and plots.
train_score_list, test_score_list = [], []
y_train_pred_list, y_test_pred_list = [], []

for clf, param, name in zip(classifiers, parameters, names):
    cv_result, score_list, y_train_pred, y_test_pred = gird_search_model(
        clf, param, name, x_train, y_train, x_test, y_test
    )
    # score_list holds the train row first and the test row second.
    train_score_list.append(score_list[0])
    test_score_list.append(score_list[1])
    y_train_pred_list.append(y_train_pred)
    y_test_pred_list.append(y_test_pred)
    # Visual separator between the reports of consecutive models.
    print('-------------------------------------------------------------------------------------------------------------------------------')

# One row per model, one column per evaluation metric.
summary_columns = ['acc','pre','rec','f1','roc_auc','aupr']
train_score_df = pd.DataFrame(train_score_list, index=names, columns=summary_columns)
test_score_df = pd.DataFrame(test_score_list, index=names, columns=summary_columns)

# Train table, a blank line, then the test table.
print('TRAIN_SCORE:', train_score_df, '', 'TEST_SCORE:', test_score_df, sep='\n')
【多个模型执行结果】
三.画AUC和PRC图
【主函数】
# Draw the ROC and precision-recall plots for every model: training split
# first, then the test split.
for clf_name, y_train_pred, y_test_pred in zip(names, y_train_pred_list, y_test_pred_list):
    for y_true, y_pred, is_train in ((y_train, y_train_pred, True), (y_test, y_test_pred, False)):
        show_curve(y_true, y_pred, clf_name, is_train)
【结果】
四.子函数(主程序内的,应该写在最前面,本文为便于理解,放在最后)
1.模型评估函数里有一个aupr(precision-recall-curve的曲线下面积):当正负样本不平衡时使用aupr评估比auc好。
def aupr(y_true,y_pred):
    """Return the area under the precision-recall curve of ``y_pred`` against ``y_true``."""
    precision_values, recall_values, _ = precision_recall_curve(y_true, y_pred)
    # Recall is the x axis and precision the y axis of the PR curve.
    return auc(recall_values, precision_values)
2.如果想使用混淆矩阵作为GridSearchCV模型中的scoring,需要用make_scorer转换一下。
def tn(y_true,y_pred):
    """Return cell [0, 0] of the confusion matrix (true negatives in the binary layout)."""
    matrix = confusion_matrix(y_true, y_pred)
    return matrix[0, 0]
def fp(y_true,y_pred):
    """Return cell [0, 1] of the confusion matrix (false positives in the binary layout)."""
    matrix = confusion_matrix(y_true, y_pred)
    return matrix[0, 1]
def fn(y_true,y_pred):
    """Return cell [1, 0] of the confusion matrix (false negatives in the binary layout)."""
    matrix = confusion_matrix(y_true, y_pred)
    return matrix[1, 0]
def tp(y_true,y_pred):
    """Return cell [1, 1] of the confusion matrix (true positives in the binary layout)."""
    matrix = confusion_matrix(y_true, y_pred)
    return matrix[1, 1]
# Wrap each confusion-matrix cell function with make_scorer so it can be
# passed as a `scoring` entry to GridSearchCV.
make_score = {
    key: make_scorer(cell_func)
    for key, cell_func in (('tp', tp), ('tn', tn), ('fp', fp), ('fn', fn))
}
3.画图_步1:AUC和PRC曲线
import matplotlib.pyplot as plt
def show_roc(roc_auc,fpr,tpr):
    """Plot a ROC curve with its area in the legend and display the figure.

    Args:
        roc_auc: Area under the ROC curve, shown in the legend to two decimals.
        fpr: False-positive rates (x axis).
        tpr: True-positive rates (y axis).
    """
    plt.figure(1)
    # Dashed diagonal as a visual reference; deliberately left out of the legend.
    plt.plot([0, 1], [0, 1], 'k--')
    # The legend entry belongs to the curve itself. It was previously attached
    # to the diagonal, so the legend showed a dashed line labelled "ROC curve".
    plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
    plt.xlabel('False positive rate')
    plt.ylabel('True positive rate')
    plt.title('ROC curve')
    plt.legend(loc='best')
    plt.show()
def show_roc_pr(roc_aupr,recall,precision):
    """Plot a precision-recall curve with its area in the legend and display it.

    Args:
        roc_aupr: Area under the precision-recall curve, shown in the legend
            to two decimals.
        recall: Recall values (x axis).
        precision: Precision values (y axis).
    """
    plt.figure(1)
    # Dashed diagonal kept from the original as a visual reference
    # (precision == recall); deliberately left out of the legend.
    plt.plot([0, 1], [0, 1], 'k--')
    # The legend entry belongs to the PR curve. It was previously attached to
    # the diagonal and mislabelled as "ROC curve".
    plt.plot(recall, precision, label='PR curve (area = %0.2f)' % roc_aupr)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.title('ROC_PR curve')
    plt.legend(loc='best')
    plt.show()
    # Blank line so consecutive plots are separated in the printed output.
    print()
4.画图_步2:AUC和PRC曲线
def show_curve(y_true,y_pred,clf_name,train=True):
    """Print a "<classifier> (train|test)" header, then show the ROC and PR plots.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predictions or scores compared against ``y_true``.
        clf_name: Classifier name printed in the header.
        train: Selects the "train" header when equal to ``True``, "test" otherwise.
    """
    # Curve points and areas are computed before anything is printed.
    fpr, tpr, _ = roc_curve(y_true, y_pred)
    precision, recall, _ = precision_recall_curve(y_true, y_pred)
    area_under_roc = auc(fpr, tpr)
    area_under_pr = auc(recall, precision)

    # Equality with True (not plain truthiness) is kept so the header chosen
    # for non-boolean arguments stays exactly as before.
    split_label = "train" if train == True else "test"
    print('%s (%s)' % (clf_name, split_label))

    show_roc(area_under_roc, fpr, tpr)
    print()
    show_roc_pr(area_under_pr, recall, precision)