# Note: the data file is not uploaded with this script.
import pandas as pd
import matplotlib.pyplot as plt
# Put SimHei first in the sans-serif font list; presumably so the Chinese
# plot title used later in this script renders instead of showing empty boxes.
plt.rcParams["font.sans-serif"]=["SimHei"]
# Draw negative tick labels with an ASCII hyphen rather than the Unicode
# minus sign (a glyph some CJK fonts lack).
plt.rcParams["axes.unicode_minus"]=False
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import f1_score,confusion_matrix
from sklearn.metrics import precision_score,recall_score,roc_auc_score,roc_curve
import warnings
# Suppress every warning for the rest of the script (no category filter is given).
warnings.filterwarnings('ignore')
# (1) Read the aviation dataset, using MEMBER_NO as the index column.
data = pd.read_excel("../datas/aviation.xls", index_col="MEMBER_NO")

# (2) Drop duplicate rows and rows containing missing values.
data.drop_duplicates(inplace=True)
data.dropna(inplace=True)

# (3) Draw a small random subset for model comparison and hyper-parameter
# tuning, because tuning on everything is slow. The cleaned `data` frame is
# deliberately left intact so that step (7) can split the whole dataset.
# Features are every column except the last; the label is the last column.
SAMPLE_SIZE = 500
RANDOM_STATE = 3  # same seed the train/test split below has always used
# Cap the sample size so a dataset with fewer rows does not raise ValueError,
# and seed the draw so the tuning subset is the same on every run.
tune_data = data.sample(n=min(SAMPLE_SIZE, len(data)), random_state=RANDOM_STATE)
X = tune_data.iloc[:, :-1]
Y = tune_data.iloc[:, -1]

# (4) Compare logistic regression and a decision tree with 4-fold
# cross-validation, scored by F1.
Log = LogisticRegression()
DT = DecisionTreeClassifier()
# Scores recorded from an earlier run: [0.94949495 0.97959184 0.98 0.97916667]
model1 = cross_val_score(Log, X, Y, scoring="f1", cv=4)
# Scores recorded from an earlier run: [0.89583333 0.9245283 0.92307692 0.95833333]
model2 = cross_val_score(DT, X, Y, scoring="f1", cv=4)
print(model1)
print(model2)

# (5) Grid-search the hyper-parameters of the algorithm with the higher F1
# score in step (4); the script fixes this to logistic regression.
# No `scoring` is passed, so GridSearchCV ranks candidates with the
# estimator's own `score` method (accuracy for classifiers).
param1 = {
    "C": [0.1, 1, 10, 100, 200],
    "max_iter": [1, 10, 50, 100, 200],
}
searchmodel1 = GridSearchCV(Log, param_grid=param1, cv=2).fit(X, Y)
print(searchmodel1.best_params_)
print(searchmodel1.best_score_*100,"%")

# (6) Build the final model from ALL tuned parameters. Previously only `C`
# was taken from the search and `max_iter` was hard-coded to 100.
model = LogisticRegression(**searchmodel1.best_params_)

# (7) Split the whole cleaned dataset (not just the tuning subset) 60/40.
X_all = data.iloc[:, :-1]
Y_all = data.iloc[:, -1]
X_train, X_test, Y_train, Y_test = train_test_split(
    X_all, Y_all, test_size=0.4, random_state=RANDOM_STATE
)

# (8) Train on the training set, predict on the test set, print the
# confusion matrix.
model.fit(X_train, Y_train)
pred = model.predict(X_test)
preds = model.predict_proba(X_test)
print("混淆矩阵:\n",confusion_matrix(Y_test,pred))

# (9) Print precision, recall, F1 and AUC, then draw the ROC curve.
# The last predict_proba column is used as the positive-class score.
positive_scores = preds[:, -1]
print("精确率:%.2f"%(precision_score(Y_test,pred)*100),"%")
print("召回率:%.2f"%(recall_score(Y_test,pred)*100),"%")
print("F1-Score:%.2f"%(f1_score(Y_test,pred)*100),"%")
print("AUC:%.2f"%(roc_auc_score(Y_test,positive_scores)*100),"%")
# roc_curve returns (fpr, tpr, thresholds) in that order; the names used to
# be swapped, which only produced a correct plot because the plot call
# swapped them back.
FPR, TPR, TH = roc_curve(Y_test, positive_scores)
plt.title("ROC曲线")
plt.plot(FPR, TPR)
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.show()
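# Sample output (结果展示) recorded from one run.
# NOTE: the metric lines below are raw fractions, which does not match the
# two-decimal percentage format of the print statements above, so they
# presumably come from an earlier revision of the script.
# [0.92473118 0.9375 0.98947368 0.95652174]
# [0.92307692 0.90526316 0.92631579 0.94949495]
# {'C': 1, 'max_iter': 50}
# 97.0 %
# 混淆矩阵:
# [[116 1]
# [ 2 81]]
# 精确率: 0.9878048780487805
# 召回率: 0.9759036144578314
# F1-Score: 0.9818181818181818
# AUC: 0.9948511996704767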