from Tools.Plot import plot_confusion_matrix,macro_roc
from sklearn.metrics import classification_report,confusion_matrix,log_loss,auc
from sklearn.preprocessing import (
MinMaxScaler, label_binarize, OneHotEncoder, LabelEncoder)
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV,train_test_split
from imblearn.metrics import classification_report_imbalanced
from itertools import cycle, product
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import cycle
import seaborn as sns
import warnings
from tqdm import tqdm_notebook
warnings.filterwarnings('ignore')
%matplotlib inline
加载数据
# ----------------------------- Load data -----------------------------
train = pd.read_csv('data/train_all.csv')
test = pd.read_csv('data/test_all.csv')

# Label encoding: class name -> integer code.
# NOTE(review): presumably these are the codes stored in `label_num`;
# the CSV contents are not visible here -- confirm.
label_dict = dict(Normal=0, Probe=1, DoS=2, U2R=3, R2L=4)

# Separate the target column from the features for both splits.
y_train, y_test = train['label_num'], test['label_num']
X_train = train.drop('label_num', axis=1)
X_test = test.drop('label_num', axis=1)

print('Shape of training set:', X_train.shape)
print('Shape of testing set:', X_test.shape)
# print('Columns: \n', list(X_train.columns))

# Class names ordered by their integer code, plus the sorted codes.
labels = sorted(label_dict, key=label_dict.get)
labels_number = sorted(label_dict.values())  # [0, 1, 2, 3, 4]
Shape of training set: (125973, 15)
Shape of testing set: (22544, 15)
分类器
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB, ComplementNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import (GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier,
BaggingClassifier, ExtraTreesClassifier, RandomTreesEmbedding)
from Boost import RUSBoost,SMOTEBoost
# Single seed shared by every estimator that accepts `random_state`.
RANDOM_STATE = 0

# Registry of (display name, unfitted estimator) pairs. The order here is
# the order in which the models are trained, plotted and reported.
clfs = [
    # Boosting variants from the project-local `Boost` package
    ("RUSBoost", RUSBoost.RUSBoost(random_state=RANDOM_STATE)),
    ("SMOTEBoost", SMOTEBoost.SMOTEBoost(random_state=RANDOM_STATE)),
    # Gradient-boosted trees (external libraries)
    ("LGBMClassifier", LGBMClassifier(random_state=RANDOM_STATE)),
    ("XGBClassifier", XGBClassifier(random_state=RANDOM_STATE)),
    # Support vector machines
    ("LinearSVC", LinearSVC(random_state=RANDOM_STATE)),
    ("SVC", SVC(random_state=RANDOM_STATE)),
    # Naive Bayes (constructed without a seed)
    ("GaussianNB", GaussianNB()),
    ("ComplementNB", ComplementNB()),
    # Neural network and nearest neighbours
    ("MLPClassifier", MLPClassifier(random_state=RANDOM_STATE)),
    ("KNeighborsClassifier", KNeighborsClassifier()),
    # Linear models
    ("LogisticRegression", LogisticRegression(random_state=RANDOM_STATE)),
    ("SGDClassifier,loss='log'",
     SGDClassifier(loss='log', random_state=RANDOM_STATE)),
    ("SGDClassifier,loss='modified_huber'",
     SGDClassifier(loss='modified_huber', random_state=RANDOM_STATE)),
    # Single trees
    ("DecisionTreeClassifier",
     DecisionTreeClassifier(random_state=RANDOM_STATE)),
    ("ExtraTreeClassifier", ExtraTreeClassifier(random_state=RANDOM_STATE)),
    # scikit-learn ensembles
    ("GradientBoostingClassifier",
     GradientBoostingClassifier(random_state=RANDOM_STATE)),
    ("RandomForestClassifier",
     RandomForestClassifier(random_state=RANDOM_STATE)),
    ("AdaBoostClassifier", AdaBoostClassifier(random_state=RANDOM_STATE)),
    ("BaggingClassifier", BaggingClassifier(random_state=RANDOM_STATE)),
    ("ExtraTreesClassifier", ExtraTreesClassifier(random_state=RANDOM_STATE)),
]
评估
# ------------------------- Train and evaluate -------------------------
# Fit every classifier in `clfs` on the training split, evaluate it on the
# test split, and overlay one macro-averaged ROC curve per model on a
# single figure. Results are collected in `cm` and `clf_report_list` for
# the cells that follow.
plt.figure(figsize=(12, 6))
# Loop-invariant, so set once: it only has to be in effect before the
# first curve is plotted (after the figure above has been created).
sns.set_style('darkgrid')
cm = []               # (classifier name, confusion matrix) pairs
clf_report_list = []  # text reports, in the same order as `clfs`
for label, clf in tqdm_notebook(clfs):
    # Time the fit only; prediction and scoring are not included.
    start = datetime.datetime.now()
    clf.fit(X_train, y_train)
    end = datetime.datetime.now()
    print('[',label,']--done',(end-start))

    # ---------------- Evaluate on the test set ----------------
    y_test_pred = clf.predict(X_test)
    # Choose the score source by capability instead of matching the
    # display name: estimators exposing predict_proba use it, the rest
    # (here LinearSVC and the default SVC) fall back to decision_function.
    # Renaming an entry in `clfs` or adding another margin-only model
    # therefore no longer breaks this cell.
    if hasattr(clf, 'predict_proba'):
        y_test_score = clf.predict_proba(X_test)
    else:
        y_test_score = clf.decision_function(X_test)

    # Classification report
    clf_report = classification_report_imbalanced(
        y_test, y_test_pred, digits=4, target_names=labels)
    clf_report_list.append(clf_report)

    # Confusion matrix
    cnf_matrix = confusion_matrix(y_test, y_test_pred)
    cm.append((label, cnf_matrix))

    # ROC: macro_roc is a project helper; its two return values are used
    # as the x and y coordinates of this model's curve.
    all_fpr, mean_tpr = macro_roc(
        y_test, y_test_score, labels_number)
    roc_auc = auc(all_fpr, mean_tpr)
    plt.plot(all_fpr, mean_tpr, lw=1,
             label='{0} (auc = {1:0.4f})'.format(label, roc_auc))

# Chance diagonal and figure decoration, drawn once after all curves.
plt.plot([0, 1], [0, 1], 'k--', lw=1)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.01])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve')
plt.legend(loc=(1.1, 0))  # place the legend outside the axes, to the right
plt.tight_layout()
plt.show()
[ RUSBoost ]--done 0:00:05.803901
[ SMOTEBoost ]--done 0:00:07.628780
[ LGBMClassifier ]--done 0:00:07.268016
[ XGBClassifier ]--done 0:00:31.144963
[ LinearSVC ]--done 0:00:07.350956
[ SVC ]--done 0:02:07.588954
[ GaussianNB ]--done 0:00:00.052485
[ ComplementNB ]--done 0:00:00.025667
[ MLPClassifier ]--done 0:02:28.444017
[ KNeighborsClassifier ]--done 0:00:11.978950
[ LogisticRegression ]--done 0:00:03.451240
[ SGDClassifier,loss='log' ]--done 0:00:00.949393
[ SGDClassifier,loss='modified_huber' ]--done 0:00:00.639087
[ DecisionTreeClassifier ]--done 0:00:00.610772
[ ExtraTreeClassifier ]--done 0:00:00.048572
[ GradientBoostingClassifier ]--done 0:00:46.638650
[ RandomForestClassifier ]--done 0:00:00.782573
[ AdaBoostClassifier ]--done 0:00:03.450502
[ BaggingClassifier ]--done 0:00:03.894190
[ ExtraTreesClassifier ]--done 0:00:00.506168
# Draw one normalized confusion matrix per trained classifier in a grid.
sns.set_style('white')
# Derive the grid from the number of results instead of hard-coding 5x4,
# so results beyond the 20th are no longer silently dropped by zip().
# With the current 20 classifiers this is still 5 rows x 4 columns on a
# 20x20 inch figure.
n_cols = 4
n_rows = max(1, -(-len(cm) // n_cols))  # ceiling division, at least one row
fig, axes = plt.subplots(n_rows, n_cols,
                         figsize=(20, 4 * n_rows),
                         squeeze=False)  # always 2-D, even for a single row
flat_axes = axes.ravel()
for ax, (name, cnf_matrix) in zip(flat_axes, cm):
    plot_confusion_matrix(cnf_matrix, labels, ax=ax,
                          normalize=True,
                          title=name,
                          cmap=plt.cm.Blues)
# Hide any trailing axes that did not receive a matrix.
for ax in flat_axes[len(cm):]:
    ax.axis('off')
plt.tight_layout()
plt.show()
# Print each stored classification report under its classifier's name.
# Reports were appended in the same order as `clfs`, so the two sequences
# are walked in lockstep; the estimator itself is not needed here.
for (name, _), report in zip(clfs, clf_report_list):
    print(name, '\n', report)
RUSBoost
pre rec spe f1 geo iba sup
Normal 0.5827 0.7450 0.5962 0.6539 0.6665 0.4508 9711
Probe 0.4619 0.2156 0.9698 0.2940 0.4573 0.1933 2421
DoS 0.6661 0.8036 0.8009 0.7284 0.8022 0.6437 7458
U2R 0.0000 0.0000 1.0000 0.0000 0.0000 0.0000 200
R2L 0.0000 0.0000 1.0000 0.0000 0.0000 0.0000 2754
avg / total 0.5210 0.6099 0.7569 0.5542 0.6016 0.4279 22544
SMOTEBoost
pre rec spe f1 geo iba sup
Normal 0.2493 0.3152 0.2818 0.2784 0.2980 0.0891 9711
Probe 0.8357 0.2710 0.9936 0.4092 0.5189 0.2498 2421
DoS 0.2974 0.3713 0.5664 0.3303 0.4586 0.2062 7458
U2R 0.1375 0.0550 0.9969 0.0786 0.2342 0.0497 200
R2L 0.9121 0.0301 0.9996 0.0583 0.1736 0.0272 2754
avg / total 0.4082 0.2919 0.5464 0.2810 0.3591 0.1372 22544
LGBMClassifier
pre rec spe f1 geo iba sup
Normal 0.6757 0.9673 0.6487 0.7956 0.7921 0.6475 9711
Probe 0.8396 0.6010 0.9862 0.7005 0.7699 0.5699 2421
DoS 0.9549 0.8241 0.9808 0.8847 0.8990 0.7956 7458
U2R 0.0474 0.0650 0.9883 0.0549 0.2535 0.0583 200
R2L 0.8400 0.0610 0.9984 0.1137 0.2468 0.0552 2754
avg / total 0.8002 0.7618 0.8405 0.7250 0.7537 0.6105 22544
XGBClassifier
pre rec spe f1 geo iba sup
Normal 0.6563 0.9718 0.6150 0.7835 0.7731 0.6189 9711
Probe 0.7708 0.6113 0.9781 0.6819 0.7733 0.5760 2421
DoS 0.9625 0.8024 0.9846 0.8752 0.8888 0.7756 7458
U2R 1.0000 0.0100 1.0000 0.0198 0.1000 0.0090 200
R2L 0.9630 0.0094 0.9999 0.0187 0.0972 0.0085 2754
avg / total 0.8104 0.7509 0.8267 0.7027 0.7228 0.5862 22544
LinearSVC
pre rec spe f1 geo iba sup
Normal 0.6251 0.9687 0.5604 0.7599 0.7368 0.5650 9711
Probe 0.8171 0.5609 0.9849 0.6652 0.7433 0.5290 2421
DoS 0.9556 0.7473 0.9828 0.8387 0.8570 0.7171 7458
U2R 0.0000 0.0000 1.0000 0.0000 0.0000 0.0000 200
R2L 0.0000 0.0000 0.9999 0.0000 0.0000 0.0000 2754
avg / total 0.6731 0.7247 0.8033 0.6762 0.6807 0.5374 22544
SVC
pre rec spe f1 geo iba sup
Normal 0.6217 0.9681 0.5542 0.7571 0.7325 0.5587 9711
Probe 0.7946 0.5783 0.9820 0.6694 0.7536 0.5449 2421
DoS 0.9544 0.7243 0.9829 0.8236 0.8438 0.6935 7458
U2R 0.0000 0.0000 1.0000 0.0000 0.0000 0.0000 200
R2L 0.0000 0.0000 1.0000 0.0000 0.0000 0.0000 2754
avg / total 0.6689 0.7187 0.8004 0.6705 0.6756 0.5286 22544
GaussianNB
pre rec spe f1 geo iba sup
Normal 0.8816 0.6899 0.9299 0.7741 0.8010 0.6262 9711
Probe 0.6846 0.4725 0.9738 0.5591 0.6783 0.4371 2421
DoS 0.9387 0.5977 0.9807 0.7304 0.7656 0.5638 7458
U2R 0.0248 0.9100 0.6796 0.0483 0.7864 0.6327 200
R2L 0.3758 0.1616 0.9627 0.2260 0.3944 0.1431 2754
avg / total 0.8100 0.5735 0.9532 0.6632 0.7263 0.5263 22544
ComplementNB
pre rec spe f1 geo iba sup
Normal 0.6519 0.9627 0.6110 0.7774 0.7670 0.6089 9711
Probe 0.7739 0.5217 0.9817 0.6232 0.7156 0.4886 2421
DoS 0.8499 0.7489 0.9346 0.7962 0.8366 0.6869 7458
U2R 0.0000 0.0000 1.0000 0.0000 0.0000 0.0000 200
R2L 0.0000 0.0000 1.0000 0.0000 0.0000 0.0000 2754
avg / total 0.6451 0.7185 0.8088 0.6652 0.6840 0.5420 22544
MLPClassifier
pre rec spe f1 geo iba sup
Normal 0.6337 0.9702 0.5755 0.7666 0.7473 0.5805 9711
Probe 0.8523 0.5766 0.9880 0.6879 0.7548 0.5463 2421
DoS 0.9617 0.7505 0.9852 0.8430 0.8599 0.7220 7458
U2R 0.0000 0.0000 1.0000 0.0000 0.0000 0.0000 200
R2L 0.8894 0.0701 0.9988 0.1299 0.2646 0.0635 2754
avg / total 0.7913 0.7367 0.8108 0.6989 0.7197 0.5553 22544
KNeighborsClassifier
pre rec spe f1 geo iba sup
Normal 0.6205 0.9771 0.5478 0.7590 0.7316 0.5583 9711
Probe 0.8583 0.5027 0.9900 0.6340 0.7055 0.4734 2421
DoS 0.9614 0.7473 0.9852 0.8409 0.8580 0.7186 7458
U2R 0.1154 0.0150 0.9990 0.0265 0.1224 0.0135 200
R2L 0.4545 0.0018 0.9997 0.0036 0.0426 0.0016 2754
avg / total 0.7340 0.7225 0.7992 0.6739 0.6810 0.5294 22544
LogisticRegression
pre rec spe f1 geo iba sup
Normal 0.6196 0.9712 0.5488 0.7565 0.7301 0.5555 9711
Probe 0.8604 0.5576 0.9891 0.6767 0.7427 0.5278 2421
DoS 0.9544 0.7356 0.9826 0.8308 0.8502 0.7050 7458
U2R 0.0000 0.0000 1.0000 0.0000 0.0000 0.0000 200
R2L 0.0000 0.0000 0.9997 0.0000 0.0000 0.0000 2754
avg / total 0.6750 0.7216 0.7987 0.6734 0.6755 0.5292 22544
SGDClassifier,loss='log'
pre rec spe f1 geo iba sup
Normal 0.6164 0.9717 0.5424 0.7543 0.7260 0.5497 9711
Probe 0.8677 0.5527 0.9899 0.6752 0.7396 0.5231 2421
DoS 0.9529 0.7274 0.9822 0.8250 0.8453 0.6963 7458
U2R 0.0000 0.0000 1.0000 0.0000 0.0000 0.0000 200
R2L 0.0000 0.0000 0.9999 0.0000 0.0000 0.0000 2754
avg / total 0.6740 0.7186 0.7959 0.6704 0.6718 0.5233 22544
SGDClassifier,loss='modified_huber'
pre rec spe f1 geo iba sup
Normal 0.6207 0.9704 0.5512 0.7571 0.7314 0.5574 9711
Probe 0.8393 0.5523 0.9873 0.6662 0.7384 0.5215 2421
DoS 0.9556 0.7389 0.9830 0.8334 0.8523 0.7087 7458
U2R 0.0000 0.0000 1.0000 0.0000 0.0000 0.0000 200
R2L 0.0000 0.0000 0.9999 0.0000 0.0000 0.0000 2754
avg / total 0.6736 0.7218 0.7997 0.6734 0.6763 0.5305 22544
DecisionTreeClassifier
pre rec spe f1 geo iba sup
Normal 0.6554 0.9690 0.6144 0.7819 0.7716 0.6165 9711
Probe 0.8475 0.6105 0.9868 0.7097 0.7762 0.5798 2421
DoS 0.9617 0.7836 0.9846 0.8635 0.8783 0.7560 7458
U2R 0.3333 0.0200 0.9996 0.0377 0.1414 0.0180 200
R2L 0.7620 0.0977 0.9958 0.1732 0.3119 0.0885 2754
avg / total 0.7875 0.7543 0.8269 0.7202 0.7457 0.5889 22544
ExtraTreeClassifier
pre rec spe f1 geo iba sup
Normal 0.6477 0.9691 0.6011 0.7765 0.7632 0.6040 9711
Probe 0.8605 0.6295 0.9877 0.7271 0.7885 0.5995 2421
DoS 0.9616 0.7884 0.9844 0.8664 0.8810 0.7609 7458
U2R 0.2083 0.0250 0.9991 0.0446 0.1580 0.0225 200
R2L 0.2308 0.0087 0.9960 0.0168 0.0932 0.0078 2754
avg / total 0.7196 0.7472 0.8212 0.7016 0.7177 0.5774 22544
GradientBoostingClassifier
pre rec spe f1 geo iba sup
Normal 0.6432 0.9739 0.5911 0.7747 0.7588 0.5978 9711
Probe 0.8659 0.5708 0.9894 0.6881 0.7515 0.5411 2421
DoS 0.9630 0.7885 0.9850 0.8671 0.8813 0.7615 7458
U2R 0.5000 0.0100 0.9999 0.0196 0.1000 0.0090 200
R2L 0.9318 0.0447 0.9995 0.0852 0.2113 0.0404 2754
avg / total 0.8069 0.7472 0.8177 0.7051 0.7258 0.5725 22544
RandomForestClassifier
pre rec spe f1 geo iba sup
Normal 0.6432 0.9713 0.5922 0.7739 0.7584 0.5970 9711
Probe 0.8607 0.5820 0.9887 0.6944 0.7585 0.5520 2421
DoS 0.9627 0.7778 0.9851 0.8604 0.8753 0.7503 7458
U2R 0.6667 0.0200 0.9999 0.0388 0.1414 0.0180 200
R2L 0.9000 0.0686 0.9989 0.1275 0.2618 0.0622 2754
avg / total 0.8038 0.7468 0.8181 0.7085 0.7310 0.5724 22544
AdaBoostClassifier
pre rec spe f1 geo iba sup
Normal 0.4581 0.7677 0.3129 0.5738 0.4901 0.2511 9711
Probe 0.8131 0.3395 0.9906 0.4790 0.5799 0.3144 2421
DoS 0.5636 0.3628 0.8611 0.4415 0.5590 0.2969 7458
U2R 0.0000 0.0000 0.9997 0.0000 0.0000 0.0000 200
R2L 0.9425 0.1547 0.9987 0.2658 0.3930 0.1414 2754
avg / total 0.5862 0.5061 0.6569 0.4771 0.5063 0.2574 22544
BaggingClassifier
pre rec spe f1 geo iba sup
Normal 0.6558 0.9707 0.6145 0.7828 0.7723 0.6177 9711
Probe 0.8632 0.5993 0.9886 0.7075 0.7697 0.5694 2421
DoS 0.9640 0.8013 0.9852 0.8752 0.8885 0.7749 7458
U2R 0.5000 0.0200 0.9998 0.0385 0.1414 0.0180 200
R2L 0.9046 0.0930 0.9986 0.1686 0.3047 0.0844 2754
avg / total 0.8091 0.7591 0.8277 0.7236 0.7478 0.5941 22544
ExtraTreesClassifier
pre rec spe f1 geo iba sup
Normal 0.6303 0.9727 0.5682 0.7649 0.7434 0.5751 9711
Probe 0.8689 0.5857 0.9894 0.6997 0.7612 0.5561 2421
DoS 0.9598 0.7589 0.9843 0.8476 0.8643 0.7302 7458
U2R 0.1905 0.0200 0.9992 0.0362 0.1414 0.0180 200
R2L 0.5714 0.0015 0.9998 0.0029 0.0381 0.0013 2754
avg / total 0.7538 0.7333 0.8076 0.6857 0.6938 0.5493 22544