In the second ML assignment you have to compare the performance of three different classification algorithms, namely Naive Bayes, SVM, and Random Forest.
For this assignment you need to generate a random binary classification problem, and then train and test (using 10-fold cross validation) the three algorithms. For some algorithms an inner cross validation (5-fold) for choosing the parameters is needed. Then, show the classification performance (per-fold and averaged) in the report, and briefly discuss the results.
from sklearn import datasets
# NOTE(review): `sklearn.cross_validation` was removed in scikit-learn 0.20;
# the same functionality now lives in `sklearn.model_selection`.
from sklearn import model_selection

# Step 1: generate a random binary classification problem
# (1000 samples, 10 features, 2 classes).
X, y = datasets.make_classification(n_samples=1000, n_features=10, n_classes=2)
print('samples:', X)
print('labels for class membership of each sample:', y)

# Step 2: split the data with shuffled 10-fold cross validation.
# The modern KFold takes n_splits and is iterated via .split(X);
# X_train/X_test etc. are reused by the classifier sections below.
kf = model_selection.KFold(n_splits=10, shuffle=True)
for train_index, test_index in kf.split(X):
    X_train, y_train = X[train_index], y[train_index]
    X_test, y_test = X[test_index], y[test_index]
    print(X_train)
    print(y_train)
    print(X_test)
    print(y_test)
(Because the dataset is large, a screenshot of the code's output is not included.)
GaussianNB
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics

# Gaussian Naive Bayes: fit on the training fold, predict the held-out fold.
# NOTE(review): this evaluates only the LAST fold left over from the KFold
# loop above; for the per-fold and averaged scores the assignment asks for,
# this evaluation should run inside the cross-validation loop.
clf = GaussianNB()
# Train the model
clf.fit(X_train, y_train)
# Predict with the model
pred = clf.predict(X_test)
print('<Predict>', pred)
print('<Test>', y_test)
# Evaluate performance: accuracy, F1, and ROC AUC on the held-out fold.
acc = metrics.accuracy_score(y_test, pred)
print('<Accuracy>', acc)
f1 = metrics.f1_score(y_test, pred)
print('<F1-score>', f1)
auc = metrics.roc_auc_score(y_test, pred)
print('<AUC ROC:>', auc)
[Output]
<Pedict> [0 1 0 1 1 1 1 0 1 1 0 1 1 0 1 0 1 1 1 1 1 0 1 0 0 1 0 1 0 0 1 1 1 0 0 1 0
1 0 1 1 1 0 1 1 0 0 1 0 0 0 1 0 1 0 0 1 1 1 1 1 0 1 1 0 1 1 0 0 1
0 1 1 0
0 0 1 1 0 1 0 0 1 1 0 1 1 1 1 0 0 1 0 0 0 0 1 1 1 0]
<Test> [0 0 1 1 0 1 1 0 1 0 0 1 1 0 1 0 1 1 1 1 1 0 1 0 0 1 0 0 1 0 1 0 1 0 0 1 0
1 0 1 0 0 1 0 0 0 0 1 0 0 0 1 1 1 0 0 1 0 1 1 1 0 1 0 0 1 1 0 0 1
0 0 1 0
0 0 1 1 1 1 0 0 1 1 0 1 0 1 1 0 0 1 0 0 0 0 1 1 1 0]
<Accuracy> 0.82
<F1-score> 0.8269230769230769
<AUC ROC:> 0.8229166666666666
SVC
from sklearn.svm import SVC
from sklearn import metrics

# Sweep the SVM regularization strength C over five orders of magnitude
# (RBF kernel, fixed gamma=0.1) and report metrics for each setting.
# NOTE(review): like the GaussianNB section, this evaluates only the last
# fold from the KFold loop; the 5-fold inner CV for choosing C that the
# assignment mentions is not implemented here.
for C in [1e-02, 1e-01, 1e00, 1e01, 1e02]:
    print('*-- [ C value=', C, '] --*')
    clf = SVC(C=C, kernel='rbf', gamma=0.1)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    print('<Predict>', pred)
    print('<Test>', y_test)
    acc = metrics.accuracy_score(y_test, pred)
    print('<Accuracy>', acc)
    f1 = metrics.f1_score(y_test, pred)
    print('<F1-score>', f1)
    auc = metrics.roc_auc_score(y_test, pred)
    print('<AUC ROC:>', auc)
[Output]
*-- [ C value= 0.01 ] --*
<Pedict> [0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
1 0 0 0
0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0]
<Test> [1 1 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 1 0 0 1 1 0 0 1 0 0 0 1 0 0 1
0 0 1 0 1 0 1 1 0 1 0 1 0 0 1 0 1 1 1 0 0 1 1 0 1 1 1 1 0 0 1 0 0
1 0 1 1
0 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 1 1 0 0 0 1 1 0 0 0]
<Accuracy> 0.5
<F1-score> 0.21875
<AUC ROC:> 0.5511363636363636
*-- [ C value= 0.1 ] --*
<Pedict> [1 1 1 1 1 1 0 1 0 0 1 0 1 1 0 0 0 1 0 1 1 1 0 1 0 1 1 0 1 1 0 0 0 1 0 0 1
0 0 1 0 1 0 1 1 0 0 0 1 0 0 1 0 1 1 0 0 0 1 1 0 1 1 1 1 1 0 1 0 0
1 0 1 1
0 1 1 1 1 1 0 1 0 1 0 0 1 1 1 1 0 1 1 0 0 1 1 0 0 0]
<Test> [1 1 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 1 0 0 1 1 0 0 1 0 0 0 1 0 0 1
0 0 1 0 1 0 1 1 0 1 0 1 0 0 1 0 1 1 1 0 0 1 1 0 1 1 1 1 0 0 1 0 0
1 0 1 1
0 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 1 1 0 0 0 1 1 0 0 0]
<Accuracy> 0.89
<F1-score> 0.9009009009009009
<AUC ROC:> 0.8896103896103895
*-- [ C value= 1.0 ] --*
<Pedict> [1 1 1 1 1 1 0 1 0 0 1 0 1 1 0 0 0 1 0 1 1 1 0 0 0 1 1 0 1 1 0 0 0 1 0 0 1
0 0 1 0 1 0 1 1 0 1 0 1 0 0 1 0 1 1 0 0 0 1 1 0 1 1 1 1 1 0 1 0 0
1 0 1 1
0 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 0 1 1 0 0 1 1 0 0 0]
<Test> [1 1 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 1 0 0 1 1 0 0 1 0 0 0 1 0 0 1
0 0 1 0 1 0 1 1 0 1 0 1 0 0 1 0 1 1 1 0 0 1 1 0 1 1 1 1 0 0 1 0 0
1 0 1 1
0 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 1 1 0 0 0 1 1 0 0 0]
<Accuracy> 0.92
<F1-score> 0.9285714285714286
<AUC ROC:> 0.9188311688311689
*-- [ C value= 10.0 ] --*
<Pedict> [1 1 1 1 1 1 1 1 0 0 1 0 1 0 0 0 0 1 0 1 0 1 0 0 0 1 1 0 1 1 0 0 0 1 0 0 1
0 0 1 0 1 0 1 1 0 1 0 1 0 0 1 0 1 1 0 0 0 1 1 0 1 1 1 1 1 0 1 0 0
1 0 1 1
0 1 1 1 1 1 0 1 0 1 0 0 1 1 1 1 0 1 1 0 0 1 1 0 0 0]
<Test> [1 1 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 1 0 0 1 1 0 0 1 0 0 0 1 0 0 1
0 0 1 0 1 0 1 1 0 1 0 1 0 0 1 0 1 1 1 0 0 1 1 0 1 1 1 1 0 0 1 0 0
1 0 1 1
0 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 1 1 0 0 0 1 1 0 0 0]
<Accuracy> 0.88
<F1-score> 0.8909090909090909
<AUC ROC:> 0.8806818181818181
*-- [ C value= 100.0 ] --*
<Pedict> [1 1 1 1 1 1 0 0 0 0 1 0 1 0 0 0 0 1 0 1 1 1 0 0 0 0 1 0 1 1 0 0 0 1 0 0 1
0 0 1 0 1 1 1 1 0 1 0 0 0 0 1 0 1 1 0 0 0 1 1 0 1 1 1 1 1 0 1 0 0
1 0 0 1
0 1 1 1 1 1 0 1 0 1 0 0 1 1 1 1 0 1 0 0 0 1 1 0 0 0]
<Test> [1 1 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 1 0 1 1 1 1 0 0 1 1 0 0 1 0 0 0 1 0 0 1
0 0 1 0 1 0 1 1 0 1 0 1 0 0 1 0 1 1 1 0 0 1 1 0 1 1 1 1 0 0 1 0 0
1 0 1 1
0 1 1 1 1 1 0 1 1 1 0 0 1 1 1 1 1 1 0 0 0 1 1 0 0 0]
<Accuracy> 0.88
<F1-score> 0.8867924528301886
<AUC ROC:> 0.8855519480519481
RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

# Sweep the ensemble size (number of trees) and report metrics for each
# setting. NOTE(review): as in the previous sections, only the last CV fold
# is evaluated; results also vary run to run because no random_state is set.
for n_estimator in [10, 100, 1000]:
    print('*-- [ n estimator value=', n_estimator, '] --*')
    clf = RandomForestClassifier(n_estimators=n_estimator)
    clf.fit(X_train, y_train)
    pred = clf.predict(X_test)
    print('<Predict>', pred)
    print('<Test>', y_test)
    acc = metrics.accuracy_score(y_test, pred)
    print('<Accuracy>', acc)
    f1 = metrics.f1_score(y_test, pred)
    print('<F1-score>', f1)
    auc = metrics.roc_auc_score(y_test, pred)
    print('<AUC ROC:>', auc)
[Output]
*-- [ n estimator value= 10 ] --*
<Pedict> [0 1 1 0 1 0 0 1 1 0 1 0 0 0 0 1 1 1 0 0 0 0 1 0 1 1 0 0 1 0 1 0 1 0 0 1 1
0 1 0 1 0 1 1 0 0 1 0 0 1 0 1 1 1 0 1 0 1 0 1 1 1 1 1 1 1 0 0 0 1
0 1 0 0
0 0 1 0 1 0 1 1 1 0 1 1 0 1 1 0 0 1 0 1 0 1 1 0 1 1]
<Test> [0 1 1 0 1 0 0 1 1 0 1 0 0 0 0 1 1 1 0 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 0 1 1
0 1 1 1 0 0 1 0 0 0 0 1 1 0 0 1 1 0 1 0 1 0 1 1 1 1 1 1 1 0 0 1 0
1 1 0 0
0 0 0 0 1 0 1 0 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 0 1 1]
<Accuracy> 0.86
<F1-score> 0.8627450980392156
<AUC ROC:> 0.86
*-- [ n estimator value= 100 ] --*
<Pedict> [0 1 1 0 1 0 0 1 1 0 1 0 0 0 0 1 1 1 0 0 0 0 1 0 1 1 0 0 1 0 1 0 1 0 0 1 1
0 1 1 1 0 1 1 0 1 0 0 0 1 1 1 0 1 0 1 0 1 0 1 1 1 1 1 1 1 0 0 1 0
1 1 0 0
0 0 1 0 1 0 1 1 1 0 1 1 0 1 1 0 0 1 1 1 0 1 1 0 1 1]
<Test> [0 1 1 0 1 0 0 1 1 0 1 0 0 0 0 1 1 1 0 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 0 1 1
0 1 1 1 0 0 1 0 0 0 0 1 1 0 0 1 1 0 1 0 1 0 1 1 1 1 1 1 1 0 0 1 0
1 1 0 0
0 0 0 0 1 0 1 0 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 0 1 1]
<Accuracy> 0.89
<F1-score> 0.8952380952380952
<AUC ROC:> 0.89
*-- [ n estimator value= 1000 ] --*
<Pedict> [0 1 1 0 1 0 0 1 1 0 1 1 0 0 0 1 1 1 0 0 0 0 1 0 1 1 0 0 1 0 1 0 1 0 0 1 1
0 1 1 1 0 1 1 0 1 0 0 0 1 1 1 0 1 0 1 0 1 0 1 1 1 1 1 1 1 0 0 1 0
0 1 0 0
0 0 1 0 1 0 1 1 1 0 1 1 0 1 1 0 1 1 1 1 0 1 1 0 1 1]
<Test> [0 1 1 0 1 0 0 1 1 0 1 0 0 0 0 1 1 1 0 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 0 1 1
0 1 1 1 0 0 1 0 0 0 0 1 1 0 0 1 1 0 1 0 1 0 1 1 1 1 1 1 1 0 0 1 0
1 1 0 0
0 0 0 0 1 0 1 0 1 0 1 1 0 0 1 0 1 1 1 1 0 1 1 0 1 1]
<Accuracy> 0.88
<F1-score> 0.8867924528301886
<AUC ROC:> 0.88
1. RandomForestClassifier and SVC generally predict better than GaussianNB.
2. For SVC, a larger C does not always mean better predictive performance; C=1.0 performs best here. Similarly, for RandomForestClassifier, a larger n_estimators is not always better.