step:
code:
from sklearn import cross_validation
from sklearn import datasets
from sklearn import metrics
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import numpy as np
from sklearn.model_selection import KFold
# Evaluate the predictions of one cross-validation fold.
def evaluation(y_test, pred):
    """Print and return accuracy, F1 and ROC AUC for a set of predictions.

    Args:
        y_test: Ground-truth labels of the held-out samples.
        pred: Labels predicted for those same samples.

    Returns:
        A ``(accuracy, f1, auc)`` tuple holding the three printed scores, so
        callers can collect per-fold results instead of copying them by hand.
    """
    acc = metrics.accuracy_score(y_test, pred)
    f1 = metrics.f1_score(y_test, pred)
    # NOTE(review): callers in this file pass clf.predict() output, i.e. hard
    # class labels rather than scores/probabilities, so this AUC is based on
    # a single threshold -- confirm that is what is wanted.
    auc = metrics.roc_auc_score(y_test, pred)
    print("Accuracy: ", acc)
    print("F1-score: ", f1)
    print("AUC ROC : ", auc)
    return acc, f1, auc
# Build a synthetic classification dataset: dataset[0] is the feature
# matrix, dataset[1] the label vector.
dataset = datasets.make_classification(n_samples=1500, n_features=10)
X, y = dataset

# Split the dataset with shuffled 10-fold cross-validation.
# The legacy sklearn.cross_validation.KFold(n, n_folds=...) API was removed in
# scikit-learn 0.20; model_selection.KFold takes n_splits and produces the
# (train, test) index pairs through split().
kf = KFold(n_splits=10, shuffle=True)
for index, (train_index, test_index) in enumerate(kf.split(X), start=1):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    print("test ", index)  # label each fold in the output

    # Train and score each algorithm on this fold; the estimators are
    # re-created per fold so every fold starts from an unfitted model.
    classifiers = [
        ("GaussianNB", GaussianNB()),  # (1) naive Bayes
        ("SVC", SVC(C=1e-01, kernel='rbf', gamma=0.1)),  # (2)
        ("RandomForestClassifier",
         RandomForestClassifier(n_estimators=10)),  # (3)
    ]
    for name, clf in classifiers:
        clf.fit(X_train, y_train)
        pred = clf.predict(X_test)
        print(name + "'s evaluation: ")
        evaluation(y_test, pred)
        print("-----------------")
部分output:
test 1
GaussianNB's evaluation:
Accuracy: 0.94
F1-score: 0.9419354838709678
AUC ROC : 0.9406116642958747
-----------------
SVC's evaluation:
Accuracy: 0.9466666666666667
F1-score: 0.948051948051948
AUC ROC : 0.947190611664296
-----------------
RandomForestClassifier's evaluation:
Accuracy: 0.96
F1-score: 0.9594594594594594
AUC ROC : 0.9599928876244666
-----------------
test 6
GaussianNB's evaluation:
Accuracy: 0.92
F1-score: 0.925
AUC ROC : 0.9199999999999999
-----------------
SVC's evaluation:
Accuracy: 0.96
F1-score: 0.9615384615384615
AUC ROC : 0.9600000000000001
-----------------
RandomForestClassifier's evaluation:
Accuracy: 0.9866666666666667
F1-score: 0.9868421052631579
AUC ROC : 0.9866666666666667
-----------------
test 10
GaussianNB's evaluation:
Accuracy: 0.9133333333333333
F1-score: 0.9115646258503401
AUC ROC : 0.9160714285714286
-----------------
SVC's evaluation:
Accuracy: 0.9533333333333334
F1-score: 0.9510489510489512
AUC ROC : 0.9544642857142858
-----------------
RandomForestClassifier's evaluation:
Accuracy: 0.9666666666666667
F1-score: 0.9640287769784173
AUC ROC : 0.9660714285714287
-----------------
通过将10个test的数据绘制成折线图可以更直观评估
import matplotlib.pyplot as plt

# Per-fold scores (tests 1-10) copied from the cross-validation output,
# grouped by classifier so that switching the plotted model is a one-line
# change instead of commenting blocks in and out.
results = {
    "GaussianNB": {
        "acc": [0.94, 0.9333, 0.9333, 0.9466, 0.9333,
                0.92, 0.96, 0.9467, 0.9467, 0.9133],
        "f1": [0.9419, 0.9411, 0.9324, 0.9480, 0.9390,
               0.925, 0.9552, 0.9487, 0.9500, 0.9115],
        "auc": [0.9406, 0.9289, 0.9366, 0.9472, 0.9312,
                0.9199, 0.9610, 0.9466, 0.9455, 0.9160],
    },
    "SVC": {
        "acc": [0.9466, 0.9533, 0.9533, 0.96, 0.9666,
                0.96, 0.9733, 0.9933, 0.9533, 0.9533],
        "f1": [0.9480, 0.9590, 0.9517, 0.9605, 0.9696,
               0.9615, 0.9701, 0.9935, 0.9565, 0.9510],
        "auc": [0.9471, 0.9485, 0.9553, 0.9603, 0.9642,
                0.9600, 0.9745, 0.9935, 0.9519, 0.9544],
    },
    "RandomForestClassifier": {
        "acc": [0.96, 0.9533, 0.9666, 0.98, 0.9666,
                0.9866, 0.9866, 0.98, 0.98, 0.9666],
        "f1": [0.9594, 0.9585, 0.9650, 0.9798, 0.9685,
               0.9868, 0.9846, 0.9806, 0.9808, 0.9640],
        "auc": [0.9599, 0.9497, 0.9678, 0.9800, 0.9669,
                0.9866, 0.9848, 0.9802, 0.9797, 0.9660],
    },
}

# Classifier whose scores are drawn; must be a key of `results`.
model_name = "RandomForestClassifier"
acc = results[model_name]["acc"]
f1 = results[model_name]["f1"]
auc = results[model_name]["auc"]
x = list(range(1, len(acc) + 1))  # test numbers 1..10 on the x axis

plt.title("Evaluation of " + model_name, fontsize=14)
plt.xlabel("test", fontsize=14)
plt.ylabel("evaluation", fontsize=14)
# Plain-text labels: inside $...$ matplotlib's mathtext ignores spaces
# ("AUC ROC" -> "AUCROC") and typesets the hyphen in "F1-score" as a minus.
plt.plot(x, acc, label="Accuracy", color="red")
plt.plot(x, f1, label="F1-score", color="blue")
plt.plot(x, auc, label="AUC ROC", color="green")
plt.legend()
plt.show()
GaussianNB:
SVC:
RandomForestClassifier: