Various classifier comparisons on NSL-KDD

from Tools.Plot import plot_confusion_matrix,macro_roc
from sklearn.metrics import classification_report,confusion_matrix,log_loss,auc
from sklearn.preprocessing import (
    MinMaxScaler, label_binarize, OneHotEncoder, LabelEncoder)
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import StratifiedKFold, GridSearchCV,train_test_split

from imblearn.metrics import classification_report_imbalanced
from itertools import cycle, product
import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from itertools import cycle
import seaborn as sns
import warnings
from tqdm import tqdm_notebook
warnings.filterwarnings('ignore')
%matplotlib inline

加载数据

##################### Load data ##########################
# Read the train / test CSV files, split each into a feature frame and the
# `label_num` target column, and derive the class-name ordering that the
# evaluation cells use for report rows and confusion-matrix ticks.
train = pd.read_csv('data/train_all.csv')
test = pd.read_csv('data/test_all.csv')

# Class name -> integer code (presumably the encoding stored in `label_num`;
# verify against the preprocessing step that produced the CSV files).
label_dict = {
    name: code
    for code, name in enumerate(['Normal', 'Probe', 'DoS', 'U2R', 'R2L'])
}

# Targets first, then the feature frames with the target column removed.
y_train = train['label_num']
y_test = test['label_num']
X_train = train.drop(columns=['label_num'])
X_test = test.drop(columns=['label_num'])
print('Shape of training set:', X_train.shape)
print('Shape of testing set:', X_test.shape)

# Integer codes in ascending order, and the class names in that same order.
labels_number = sorted(label_dict.values())  # [0, 1, 2, 3, 4]
labels = sorted(label_dict, key=label_dict.get)

Shape of training set: (125973, 15)
Shape of testing set: (22544, 15)

分类器

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from sklearn.svm import  LinearSVC, SVC
from sklearn.naive_bayes import GaussianNB, ComplementNB
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import (GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier,
BaggingClassifier, ExtraTreesClassifier, RandomTreesEmbedding)
from Boost import RUSBoost,SMOTEBoost

# Seed shared by every estimator that accepts `random_state`.
RANDOM_STATE = 0

# Registry of (display name, unfitted estimator) pairs, in the order they are
# trained, plotted and reported. Written as an ordered mapping so each display
# name appears exactly once, then flattened to the list of tuples that the
# evaluation cells iterate over.
clfs = list({
    # Boosting variants from the project-local `Boost` package.
    "RUSBoost": RUSBoost.RUSBoost(random_state=RANDOM_STATE),
    "SMOTEBoost": SMOTEBoost.SMOTEBoost(random_state=RANDOM_STATE),
    # Gradient-boosting libraries.
    "LGBMClassifier": LGBMClassifier(random_state=RANDOM_STATE),
    "XGBClassifier": XGBClassifier(random_state=RANDOM_STATE),
    # Support vector machines.
    "LinearSVC": LinearSVC(random_state=RANDOM_STATE),
    "SVC": SVC(random_state=RANDOM_STATE),
    # Naive Bayes (constructed without a seed).
    "GaussianNB": GaussianNB(),
    "ComplementNB": ComplementNB(),
    # Neural network and nearest neighbours.
    "MLPClassifier": MLPClassifier(random_state=RANDOM_STATE),
    "KNeighborsClassifier": KNeighborsClassifier(),
    # Linear models.
    "LogisticRegression": LogisticRegression(random_state=RANDOM_STATE),
    "SGDClassifier,loss='log'": SGDClassifier(
        loss='log', random_state=RANDOM_STATE),
    "SGDClassifier,loss='modified_huber'": SGDClassifier(
        loss='modified_huber', random_state=RANDOM_STATE),
    # Single trees.
    "DecisionTreeClassifier": DecisionTreeClassifier(
        random_state=RANDOM_STATE),
    "ExtraTreeClassifier": ExtraTreeClassifier(random_state=RANDOM_STATE),
    # scikit-learn ensembles.
    "GradientBoostingClassifier": GradientBoostingClassifier(
        random_state=RANDOM_STATE),
    "RandomForestClassifier": RandomForestClassifier(
        random_state=RANDOM_STATE),
    "AdaBoostClassifier": AdaBoostClassifier(random_state=RANDOM_STATE),
    "BaggingClassifier": BaggingClassifier(random_state=RANDOM_STATE),
    "ExtraTreesClassifier": ExtraTreesClassifier(random_state=RANDOM_STATE),
}.items())

评估

################### Train and evaluate every classifier ###################
# For each (name, estimator) pair in `clfs`:
#   1. fit on the training split and print the wall-clock fit time,
#   2. predict labels and per-class scores on the test split,
#   3. append the imbalanced classification report to `clf_report_list`
#      and the (name, confusion matrix) pair to `cm`,
#   4. draw the curve returned by the project helper `macro_roc`
#      (presumably the macro-averaged ROC -- confirm in Tools.Plot) on one
#      shared figure, with its AUC in the legend.
plt.figure(figsize=(12, 6))
# The plot style is the same for every classifier, so set it once, before the
# first plt.plot call draws anything.
sns.set_style('darkgrid')
cm = []
clf_report_list = []
for label, clf in tqdm_notebook(clfs):

    start = datetime.datetime.now()
    clf.fit(X_train, y_train)
    end = datetime.datetime.now()
    print('[',label,']--done',(end-start))

    # ---------------- evaluation on the test set ----------------
    y_test_pred = clf.predict(X_test)
    # Choose the score source by capability rather than by display name, so
    # an estimator added to `clfs` that lacks predict_proba (as LinearSVC and
    # SVC without probability=True do) falls back to decision_function
    # instead of raising.
    if hasattr(clf, 'predict_proba'):
        y_test_score = clf.predict_proba(X_test)
    else:
        y_test_score = clf.decision_function(X_test)

    # Classification report
    clf_report = classification_report_imbalanced(
        y_test, y_test_pred, digits=4, target_names=labels)
    clf_report_list.append(clf_report)

    # Confusion matrix (kept with the classifier name for the later grid plot)
    cnf_matrix = confusion_matrix(y_test, y_test_pred)
    cm.append((label, cnf_matrix))

    # ROC curve and its area
    all_fpr, mean_tpr = macro_roc(
        y_test, y_test_score, labels_number)
    roc_auc = auc(all_fpr, mean_tpr)

    plt.plot(all_fpr, mean_tpr, lw=1,
             label='{0} (auc = {1:0.4f})'.format(label, roc_auc))

# Chance-level diagonal for reference.
plt.plot([0, 1], [0, 1], 'k--', lw=1)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.01])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC curve')
# Legend sits to the right of the axes, outside the plotting area.
plt.legend(loc=(1.1, 0))
plt.tight_layout()
plt.show()
[ RUSBoost ]--done 0:00:05.803901
[ SMOTEBoost ]--done 0:00:07.628780
[ LGBMClassifier ]--done 0:00:07.268016
[ XGBClassifier ]--done 0:00:31.144963
[ LinearSVC ]--done 0:00:07.350956
[ SVC ]--done 0:02:07.588954
[ GaussianNB ]--done 0:00:00.052485
[ ComplementNB ]--done 0:00:00.025667
[ MLPClassifier ]--done 0:02:28.444017
[ KNeighborsClassifier ]--done 0:00:11.978950
[ LogisticRegression ]--done 0:00:03.451240
[ SGDClassifier,loss='log' ]--done 0:00:00.949393
[ SGDClassifier,loss='modified_huber' ]--done 0:00:00.639087
[ DecisionTreeClassifier ]--done 0:00:00.610772
[ ExtraTreeClassifier ]--done 0:00:00.048572
[ GradientBoostingClassifier ]--done 0:00:46.638650
[ RandomForestClassifier ]--done 0:00:00.782573
[ AdaBoostClassifier ]--done 0:00:03.450502
[ BaggingClassifier ]--done 0:00:03.894190
[ ExtraTreesClassifier ]--done 0:00:00.506168
![ROC curves of all classifiers](output_6_2.png)
# Grid of normalized confusion matrices: one panel per (name, matrix) pair
# collected in `cm`, four panels per row.
sns.set_style('white')
n_cols = 4
# Ceiling division so the grid always has room for every entry in `cm`;
# at least one row so plt.subplots is never asked for an empty grid.
n_rows = max(1, -(-len(cm) // n_cols))
# 4 inches of height per row keeps the original 20x20 figure for 20 models.
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 4 * n_rows))
panels = axes.ravel()
for ax, (name, cnf_matrix) in zip(panels, cm):
    plot_confusion_matrix(cnf_matrix, labels, ax=ax,
                          normalize=True,
                          title=name,
                          cmap=plt.cm.Blues)
# Blank any trailing panels that have no classifier assigned.
for ax in panels[len(cm):]:
    ax.axis('off')
plt.tight_layout()
plt.show()
![Normalized confusion matrices of all classifiers](output_7_0.png)
# Print each stored report under its classifier's display name. Reports were
# appended in the same order as `clfs`, so the two lists are paired by
# position.
for (name, _clf), report in zip(clfs, clf_report_list):
    print(name, '\n', report)
RUSBoost 
                    pre       rec       spe        f1       geo       iba       sup

     Normal     0.5827    0.7450    0.5962    0.6539    0.6665    0.4508      9711
      Probe     0.4619    0.2156    0.9698    0.2940    0.4573    0.1933      2421
        DoS     0.6661    0.8036    0.8009    0.7284    0.8022    0.6437      7458
        U2R     0.0000    0.0000    1.0000    0.0000    0.0000    0.0000       200
        R2L     0.0000    0.0000    1.0000    0.0000    0.0000    0.0000      2754

avg / total     0.5210    0.6099    0.7569    0.5542    0.6016    0.4279     22544

SMOTEBoost 
                    pre       rec       spe        f1       geo       iba       sup

     Normal     0.2493    0.3152    0.2818    0.2784    0.2980    0.0891      9711
      Probe     0.8357    0.2710    0.9936    0.4092    0.5189    0.2498      2421
        DoS     0.2974    0.3713    0.5664    0.3303    0.4586    0.2062      7458
        U2R     0.1375    0.0550    0.9969    0.0786    0.2342    0.0497       200
        R2L     0.9121    0.0301    0.9996    0.0583    0.1736    0.0272      2754

avg / total     0.4082    0.2919    0.5464    0.2810    0.3591    0.1372     22544

LGBMClassifier 
                    pre       rec       spe        f1       geo       iba       sup

     Normal     0.6757    0.9673    0.6487    0.7956    0.7921    0.6475      9711
      Probe     0.8396    0.6010    0.9862    0.7005    0.7699    0.5699      2421
        DoS     0.9549    0.8241    0.9808    0.8847    0.8990    0.7956      7458
        U2R     0.0474    0.0650    0.9883    0.0549    0.2535    0.0583       200
        R2L     0.8400    0.0610    0.9984    0.1137    0.2468    0.0552      2754

avg / total     0.8002    0.7618    0.8405    0.7250    0.7537    0.6105     22544

XGBClassifier 
                    pre       rec       spe        f1       geo       iba       sup

     Normal     0.6563    0.9718    0.6150    0.7835    0.7731    0.6189      9711
      Probe     0.7708    0.6113    0.9781    0.6819    0.7733    0.5760      2421
        DoS     0.9625    0.8024    0.9846    0.8752    0.8888    0.7756      7458
        U2R     1.0000    0.0100    1.0000    0.0198    0.1000    0.0090       200
        R2L     0.9630    0.0094    0.9999    0.0187    0.0972    0.0085      2754

avg / total     0.8104    0.7509    0.8267    0.7027    0.7228    0.5862     22544

LinearSVC 
                    pre       rec       spe        f1       geo       iba       sup

     Normal     0.6251    0.9687    0.5604    0.7599    0.7368    0.5650      9711
      Probe     0.8171    0.5609    0.9849    0.6652    0.7433    0.5290      2421
        DoS     0.9556    0.7473    0.9828    0.8387    0.8570    0.7171      7458
        U2R     0.0000    0.0000    1.0000    0.0000    0.0000    0.0000       200
        R2L     0.0000    0.0000    0.9999    0.0000    0.0000    0.0000      2754

avg / total     0.6731    0.7247    0.8033    0.6762    0.6807    0.5374     22544

SVC 
                    pre       rec       spe        f1       geo       iba       sup

     Normal     0.6217    0.9681    0.5542    0.7571    0.7325    0.5587      9711
      Probe     0.7946    0.5783    0.9820    0.6694    0.7536    0.5449      2421
        DoS     0.9544    0.7243    0.9829    0.8236    0.8438    0.6935      7458
        U2R     0.0000    0.0000    1.0000    0.0000    0.0000    0.0000       200
        R2L     0.0000    0.0000    1.0000    0.0000    0.0000    0.0000      2754

avg / total     0.6689    0.7187    0.8004    0.6705    0.6756    0.5286     22544

GaussianNB 
                    pre       rec       spe        f1       geo       iba       sup

     Normal     0.8816    0.6899    0.9299    0.7741    0.8010    0.6262      9711
      Probe     0.6846    0.4725    0.9738    0.5591    0.6783    0.4371      2421
        DoS     0.9387    0.5977    0.9807    0.7304    0.7656    0.5638      7458
        U2R     0.0248    0.9100    0.6796    0.0483    0.7864    0.6327       200
        R2L     0.3758    0.1616    0.9627    0.2260    0.3944    0.1431      2754

avg / total     0.8100    0.5735    0.9532    0.6632    0.7263    0.5263     22544

ComplementNB 
                    pre       rec       spe        f1       geo       iba       sup

     Normal     0.6519    0.9627    0.6110    0.7774    0.7670    0.6089      9711
      Probe     0.7739    0.5217    0.9817    0.6232    0.7156    0.4886      2421
        DoS     0.8499    0.7489    0.9346    0.7962    0.8366    0.6869      7458
        U2R     0.0000    0.0000    1.0000    0.0000    0.0000    0.0000       200
        R2L     0.0000    0.0000    1.0000    0.0000    0.0000    0.0000      2754

avg / total     0.6451    0.7185    0.8088    0.6652    0.6840    0.5420     22544

MLPClassifier 
                    pre       rec       spe        f1       geo       iba       sup

     Normal     0.6337    0.9702    0.5755    0.7666    0.7473    0.5805      9711
      Probe     0.8523    0.5766    0.9880    0.6879    0.7548    0.5463      2421
        DoS     0.9617    0.7505    0.9852    0.8430    0.8599    0.7220      7458
        U2R     0.0000    0.0000    1.0000    0.0000    0.0000    0.0000       200
        R2L     0.8894    0.0701    0.9988    0.1299    0.2646    0.0635      2754

avg / total     0.7913    0.7367    0.8108    0.6989    0.7197    0.5553     22544

KNeighborsClassifier 
                    pre       rec       spe        f1       geo       iba       sup

     Normal     0.6205    0.9771    0.5478    0.7590    0.7316    0.5583      9711
      Probe     0.8583    0.5027    0.9900    0.6340    0.7055    0.4734      2421
        DoS     0.9614    0.7473    0.9852    0.8409    0.8580    0.7186      7458
        U2R     0.1154    0.0150    0.9990    0.0265    0.1224    0.0135       200
        R2L     0.4545    0.0018    0.9997    0.0036    0.0426    0.0016      2754

avg / total     0.7340    0.7225    0.7992    0.6739    0.6810    0.5294     22544

LogisticRegression 
                    pre       rec       spe        f1       geo       iba       sup

     Normal     0.6196    0.9712    0.5488    0.7565    0.7301    0.5555      9711
      Probe     0.8604    0.5576    0.9891    0.6767    0.7427    0.5278      2421
        DoS     0.9544    0.7356    0.9826    0.8308    0.8502    0.7050      7458
        U2R     0.0000    0.0000    1.0000    0.0000    0.0000    0.0000       200
        R2L     0.0000    0.0000    0.9997    0.0000    0.0000    0.0000      2754

avg / total     0.6750    0.7216    0.7987    0.6734    0.6755    0.5292     22544

SGDClassifier,loss='log' 
                    pre       rec       spe        f1       geo       iba       sup

     Normal     0.6164    0.9717    0.5424    0.7543    0.7260    0.5497      9711
      Probe     0.8677    0.5527    0.9899    0.6752    0.7396    0.5231      2421
        DoS     0.9529    0.7274    0.9822    0.8250    0.8453    0.6963      7458
        U2R     0.0000    0.0000    1.0000    0.0000    0.0000    0.0000       200
        R2L     0.0000    0.0000    0.9999    0.0000    0.0000    0.0000      2754

avg / total     0.6740    0.7186    0.7959    0.6704    0.6718    0.5233     22544

SGDClassifier,loss='modified_huber' 
                    pre       rec       spe        f1       geo       iba       sup

     Normal     0.6207    0.9704    0.5512    0.7571    0.7314    0.5574      9711
      Probe     0.8393    0.5523    0.9873    0.6662    0.7384    0.5215      2421
        DoS     0.9556    0.7389    0.9830    0.8334    0.8523    0.7087      7458
        U2R     0.0000    0.0000    1.0000    0.0000    0.0000    0.0000       200
        R2L     0.0000    0.0000    0.9999    0.0000    0.0000    0.0000      2754

avg / total     0.6736    0.7218    0.7997    0.6734    0.6763    0.5305     22544

DecisionTreeClassifier 
                    pre       rec       spe        f1       geo       iba       sup

     Normal     0.6554    0.9690    0.6144    0.7819    0.7716    0.6165      9711
      Probe     0.8475    0.6105    0.9868    0.7097    0.7762    0.5798      2421
        DoS     0.9617    0.7836    0.9846    0.8635    0.8783    0.7560      7458
        U2R     0.3333    0.0200    0.9996    0.0377    0.1414    0.0180       200
        R2L     0.7620    0.0977    0.9958    0.1732    0.3119    0.0885      2754

avg / total     0.7875    0.7543    0.8269    0.7202    0.7457    0.5889     22544

ExtraTreeClassifier 
                    pre       rec       spe        f1       geo       iba       sup

     Normal     0.6477    0.9691    0.6011    0.7765    0.7632    0.6040      9711
      Probe     0.8605    0.6295    0.9877    0.7271    0.7885    0.5995      2421
        DoS     0.9616    0.7884    0.9844    0.8664    0.8810    0.7609      7458
        U2R     0.2083    0.0250    0.9991    0.0446    0.1580    0.0225       200
        R2L     0.2308    0.0087    0.9960    0.0168    0.0932    0.0078      2754

avg / total     0.7196    0.7472    0.8212    0.7016    0.7177    0.5774     22544

GradientBoostingClassifier 
                    pre       rec       spe        f1       geo       iba       sup

     Normal     0.6432    0.9739    0.5911    0.7747    0.7588    0.5978      9711
      Probe     0.8659    0.5708    0.9894    0.6881    0.7515    0.5411      2421
        DoS     0.9630    0.7885    0.9850    0.8671    0.8813    0.7615      7458
        U2R     0.5000    0.0100    0.9999    0.0196    0.1000    0.0090       200
        R2L     0.9318    0.0447    0.9995    0.0852    0.2113    0.0404      2754

avg / total     0.8069    0.7472    0.8177    0.7051    0.7258    0.5725     22544

RandomForestClassifier 
                    pre       rec       spe        f1       geo       iba       sup

     Normal     0.6432    0.9713    0.5922    0.7739    0.7584    0.5970      9711
      Probe     0.8607    0.5820    0.9887    0.6944    0.7585    0.5520      2421
        DoS     0.9627    0.7778    0.9851    0.8604    0.8753    0.7503      7458
        U2R     0.6667    0.0200    0.9999    0.0388    0.1414    0.0180       200
        R2L     0.9000    0.0686    0.9989    0.1275    0.2618    0.0622      2754

avg / total     0.8038    0.7468    0.8181    0.7085    0.7310    0.5724     22544

AdaBoostClassifier 
                    pre       rec       spe        f1       geo       iba       sup

     Normal     0.4581    0.7677    0.3129    0.5738    0.4901    0.2511      9711
      Probe     0.8131    0.3395    0.9906    0.4790    0.5799    0.3144      2421
        DoS     0.5636    0.3628    0.8611    0.4415    0.5590    0.2969      7458
        U2R     0.0000    0.0000    0.9997    0.0000    0.0000    0.0000       200
        R2L     0.9425    0.1547    0.9987    0.2658    0.3930    0.1414      2754

avg / total     0.5862    0.5061    0.6569    0.4771    0.5063    0.2574     22544

BaggingClassifier 
                    pre       rec       spe        f1       geo       iba       sup

     Normal     0.6558    0.9707    0.6145    0.7828    0.7723    0.6177      9711
      Probe     0.8632    0.5993    0.9886    0.7075    0.7697    0.5694      2421
        DoS     0.9640    0.8013    0.9852    0.8752    0.8885    0.7749      7458
        U2R     0.5000    0.0200    0.9998    0.0385    0.1414    0.0180       200
        R2L     0.9046    0.0930    0.9986    0.1686    0.3047    0.0844      2754

avg / total     0.8091    0.7591    0.8277    0.7236    0.7478    0.5941     22544

ExtraTreesClassifier 
                    pre       rec       spe        f1       geo       iba       sup

     Normal     0.6303    0.9727    0.5682    0.7649    0.7434    0.5751      9711
      Probe     0.8689    0.5857    0.9894    0.6997    0.7612    0.5561      2421
        DoS     0.9598    0.7589    0.9843    0.8476    0.8643    0.7302      7458
        U2R     0.1905    0.0200    0.9992    0.0362    0.1414    0.0180       200
        R2L     0.5714    0.0015    0.9998    0.0029    0.0381    0.0013      2754

avg / total     0.7538    0.7333    0.8076    0.6857    0.6938    0.5493     22544

你可能感兴趣的:(Various classifier comparisons on NSL-KDD)