用小数据集测试catboost,并画出ROC曲线
数据集用CSV
首先import需要的库
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier,CatBoostRegressor,Pool
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
pandas读入csv
去掉标题的第一行
obj=pd.read_csv('file.csv',header=1)
划分training data和test_data
这里简单地把前N行(N自己定数字)作为训练集,最后一列去掉,倒数第二列是label
注意这里的index是从0开始,to_index是end的下一个index
#0~N-1行,需要指定为0:N
train_data = obj.iloc[0:N, 1:-2]
train_label = obj.iloc[0:N, -2]
test_data = obj.iloc[N:, 1:-2]
test_label = obj.iloc[N:, -2]
由于数据量很少,很容易过拟合,减小迭代次数
model=CatBoostClassifier(iterations=2,depth=2,learning_rate=0.5,loss_function='Logloss', logging_level='Verbose')
训练
model.fit(train_data,train_label)
根据阈值来确定predict label
threshold=0.5
predict_label=np.zeros(prob.shape)
predict_label[prob>threshold]=1
计算准确率,SE,SP
cm = confusion_matrix(train_label,predict_label)
se = cm[0,0]/(cm[0,0]+cm[0,1])
print('SE : ', se)
sp = cm[1,1]/(cm[1,0]+cm[1,1])
print('SP : ', sp)
acc=(cm[0,0]+cm[1,1])/train_label.size
print ('Accuracy : ', acc)
计算AUC
#training data
fpr,tpr,threshold = roc_curve(train_label, prob,pos_label=1)
roc_auc = auc(fpr,tpr)
#test data
prob_test = model.predict_proba(test_data)
prob_test = prob_test[:,1]
fpr_test,tpr_test,threshold_test = roc_curve(test_label, prob_test,pos_label=1)
roc_auc_test = auc(fpr_test,tpr_test)
training data的ROC
plt.figure()
lw = 2
plt.figure(figsize=(6,6))
plt.plot(fpr, tpr, color='red',
lw=lw, label='ACC = %0.2f, SE = %0.2f, SP = %0.2f, AUC = %0.2f' % (acc,se,sp,roc_auc)) ###假正率为横坐标,真正率为纵坐标做曲线
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve')
plt.legend(loc="lower right")
plt.grid()
plt.show()
test data的ROC
plt.figure()
lw = 2
plt.figure(figsize=(10,10))
plt.plot(fpr_test, tpr_test, color='red',
lw=lw, label='ROC curve (area = %0.2f)' % roc_auc_test) ###假正率为横坐标,真正率为纵坐标做曲线
plt.plot([0, 1], [0, 1], color='navy', lw=lw, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve')
plt.legend(loc="lower right")
plt.show()