阿里云天池大赛赛题(机器学习)——阿里云安全恶意程序检测(完整代码)

目录

  • 赛题背景
  • 全代码(ML 和 DL)
    • 特征工程进阶与方案优化 代码
      • 特征工程进阶部分
      • 基于LightGBM 的模型验证
      • 模型测试
    • 深度学习解决方案:TextCNN建模 代码
      • 数据读取
      • 数据预处理
      • TextCNN网络结构
      • TextCNN训练和预测
      • 结果提交

赛题背景

阿里云作为国内最大的云服务提供商,每天都面临着网络上海量的恶意攻击。
本题目提供了一批恶意文件数据,包括感染性病毒、木马程序、挖矿程序、DDoS木马、勒索病毒等等,总计6亿条数据。每个文件的数据都包含API调用顺序及线程等相关信息。我们需要训练模型,将测试文件正确归类(预测出是哪种病毒),因此这是一个典型的多分类问题。
常见的分类算法:朴素贝叶斯、决策树、支持向量机、KNN、逻辑回归等等;
集成学习:随机森林、GBDT(梯度提升决策树)、AdaBoost、XGBoost、LightGBM、CatBoost等等;
神经网络:MLP(多层神经网络),DL(深度学习)等。

全代码(ML 和 DL)

一个典型的机器学习实战算法基本包括 1) 数据处理,2) 特征选取、优化,和 3) 模型选取、验证、优化。 因为 “数据和特征决定了机器学习的上限,而模型和算法只是逼近这个上限而已。” 所以在解决一个机器学习问题时大部分时间都会花在数据处理和特征优化上。
大家最好在jupyter notebook上一段一段地跑下面的代码,加深理解。
机器学习的基本知识可以康康我的其他文章哦 好康的。

特征工程进阶与方案优化 代码

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from tqdm import tqdm_notebook

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# 内存管理
import numpy as np
import pandas as pd
from tqdm import tqdm  

class _Data_Preprocess:
    def __init__(self):
        self.int8_max = np.iinfo(np.int8).max
        self.int8_min = np.iinfo(np.int8).min

        self.int16_max = np.iinfo(np.int16).max
        self.int16_min = np.iinfo(np.int16).min

        self.int32_max = np.iinfo(np.int32).max
        self.int32_min = np.iinfo(np.int32).min

        self.int64_max = np.iinfo(np.int64).max
        self.int64_min = np.iinfo(np.int64).min

        self.float16_max = np.finfo(np.float16).max
        self.float16_min = np.finfo(np.float16).min

        self.float32_max = np.finfo(np.float32).max
        self.float32_min = np.finfo(np.float32).min

        self.float64_max = np.finfo(np.float64).max
        self.float64_min = np.finfo(np.float64).min

    def _get_type(self, min_val, max_val, types):
        if types == 'int':
            if max_val <= self.int8_max and min_val >= self.int8_min:
                return np.int8
            elif max_val <= self.int16_max <= max_val and min_val >= self.int16_min:
                return np.int16
            elif max_val <= self.int32_max and min_val >= self.int32_min:
                return np.int32
            return None

        elif types == 'float':
            if max_val <= self.float16_max and min_val >= self.float16_min:
                return np.float16
            if max_val <= self.float32_max and min_val >= self.float32_min:
                return np.float32
            if max_val <= self.float64_max and min_val >= self.float64_min:
                return np.float64
            return None

    def _memory_process(self, df):
        init_memory = df.memory_usage().sum() / 1024 ** 2 / 1024
        print('Original data occupies {} GB memory.'.format(init_memory))
        df_cols = df.columns

          
        for col in tqdm_notebook(df_cols):
            try:
                if 'float' in str(df[col].dtypes):
                    max_val = df[col].max()
                    min_val = df[col].min()
                    trans_types = self._get_type(min_val, max_val, 'float')
                    if trans_types is not None:
                        df[col] = df[col].astype(trans_types)
                elif 'int' in str(df[col].dtypes):
                    max_val = df[col].max()
                    min_val = df[col].min()
                    trans_types = self._get_type(min_val, max_val, 'int')
                    if trans_types is not None:
                        df[col] = df[col].astype(trans_types)
            except:
                print(' Can not do any process for column, {}.'.format(col)) 
        afterprocess_memory = df.memory_usage().sum() / 1024 ** 2 / 1024
        print('After processing, the data occupies {} GB memory.'.format(afterprocess_memory))
        return df
# Helper instance used by the pivot-feature functions to down-cast their output.
memory_process = _Data_Preprocess()

### Data loading
path = '../security_data/'
train, test = (
    pd.read_csv(path + csv_name)
    for csv_name in ('security_train.csv', 'security_test.csv')
)
train.head()
def simple_sts_features(df):
    """Build per-file count and distinct-count features.

    For each of the columns ``api``, ``tid`` and ``index`` two features are
    produced per ``file_id``: the number of values (``count``) and the number
    of distinct values (``nunique``).

    The result rows are sorted by ``file_id``; the grouped statistics are
    attached positionally, which lines up because ``groupby`` yields its
    groups in sorted key order.

    Args:
        df: Raw log with columns ``file_id``, ``api``, ``tid`` and ``index``.

    Returns:
        DataFrame with ``file_id`` plus six ``file_id_<col>_<stat>`` columns.
    """
    features = pd.DataFrame({'file_id': df['file_id'].unique()})
    features = features.sort_values('file_id')

    per_file = df.groupby('file_id')
    for column in ('api', 'tid', 'index'):
        grouped_col = per_file[column]
        features['file_id_{}_count'.format(column)] = grouped_col.count().values
        features['file_id_{}_nunique'.format(column)] = grouped_col.nunique().values

    return features
%%time
# Per-file count / distinct-count features for the training log.
simple_train_fea1 = simple_sts_features(train)
%%time
# The same features for the test log.
simple_test_fea1 = simple_sts_features(test)
def simple_numerical_sts_features(df):
    """Build per-file mean/min/std/max features for ``tid`` and ``index``.

    The result rows are sorted by ``file_id``; the grouped statistics are
    attached positionally, which lines up because ``groupby`` yields its
    groups in sorted key order.

    Args:
        df: Raw log with columns ``file_id``, ``tid`` and ``index``.

    Returns:
        DataFrame with ``file_id`` plus eight ``file_id_<col>_<stat>`` columns.
    """
    features = pd.DataFrame({'file_id': df['file_id'].unique()})
    features = features.sort_values('file_id')

    per_file = df.groupby('file_id')
    for column in ('tid', 'index'):
        grouped_col = per_file[column]
        for stat in ('mean', 'min', 'std', 'max'):
            # getattr(...)() is the same call as grouped_col.mean(), .min(), ...
            values = getattr(grouped_col, stat)().values
            features['file_id_{}_{}'.format(column, stat)] = values

    return features
%%time
# Per-file mean/min/std/max features for the training log.
simple_train_fea2 = simple_numerical_sts_features(train)
%%time
# The same features for the test log.
simple_test_fea2 = simple_numerical_sts_features(test)

特征工程进阶部分

def api_pivot_count_features(df):
    """Pivot the number of ``tid`` records per (file, API) into wide form.

    Args:
        df: Raw log with columns ``file_id``, ``api`` and ``tid``.

    Returns:
        DataFrame with a ``file_id`` column and one ``api_pivot_<api>``
        column per API (0 where a file never calls that API), down-cast by
        ``memory_process``.
    """
    counts = df.groupby(['file_id', 'api'])['tid'].count()
    counts = counts.to_frame('api_tid_count').reset_index()

    pivot = pd.pivot_table(
        data=counts,
        index='file_id',
        columns='api',
        values='api_tid_count',
        fill_value=0,
    )

    # The column axis is named after the pivoted field ('api'); reuse it as prefix.
    prefix = pivot.columns.names[0] + '_pivot_'
    pivot.columns = [prefix + str(api_name) for api_name in pivot.columns]
    pivot.reset_index(inplace=True)

    return memory_process._memory_process(pivot)
%%time
# Wide per-API call-count features for the training log.
simple_train_fea3 = api_pivot_count_features(train)
%%time
# The same features for the test log.
simple_test_fea3 = api_pivot_count_features(test)
def api_pivot_nunique_features(df):
    """Pivot the number of distinct ``tid`` values per (file, API) into wide form.

    Args:
        df: Raw log with columns ``file_id``, ``api`` and ``tid``.

    Returns:
        DataFrame with a ``file_id`` column and one ``api_pivot_<api>``
        column per API (0 where a file never calls that API), down-cast by
        ``memory_process``.
    """
    distinct = df.groupby(['file_id', 'api'])['tid'].nunique()
    distinct = distinct.to_frame('api_tid_nunique').reset_index()

    pivot = pd.pivot_table(
        data=distinct,
        index='file_id',
        columns='api',
        values='api_tid_nunique',
        fill_value=0,
    )

    # The column axis is named after the pivoted field ('api'); reuse it as prefix.
    prefix = pivot.columns.names[0] + '_pivot_'
    pivot.columns = [prefix + str(api_name) for api_name in pivot.columns]
    pivot.reset_index(inplace=True)

    return memory_process._memory_process(pivot)
%%time
simple_train_fea4 = api_pivot_count_features(train)
%%time
simple_test_fea4 = api_pivot_count_features(test)
# One row per file: the labelled training files and the files to predict.
train_label = train[['file_id', 'label']].drop_duplicates(subset=['file_id', 'label'], keep='first')
test_submit = test[['file_id']].drop_duplicates(subset=['file_id'], keep='first')

# Left-join the four feature tables onto the per-file rows, in order.
train_data = train_label
for _fea in (simple_train_fea1, simple_train_fea2, simple_train_fea3, simple_train_fea4):
    train_data = train_data.merge(_fea, on='file_id', how='left')

for _fea in (simple_test_fea1, simple_test_fea2, simple_test_fea3, simple_test_fea4):
    test_submit = test_submit.merge(_fea, on='file_id', how='left')
### 评估指标构建
def lgb_logloss(preds,data):
    """Custom LightGBM ``feval``: one-vs-rest log loss averaged over samples.

    For every sample the true class contributes ``log(p)`` and each other
    class contributes ``log(1 - p)``; the negated total is divided by the
    number of samples.

    Args:
        preds: Predicted class probabilities. Either a flat array laid out
            class by class (``preds[k * n:(k + 1) * n]`` belongs to class
            ``k`` for ``n`` samples) or a 2-D ``(n_samples, n_classes)``
            array.
        data: Object exposing ``get_label()`` that returns one class index
            per sample (e.g. a ``lgb.Dataset``).

    Returns:
        Tuple ``('loss is: ', loss, False)``; the last item tells LightGBM
        that lower is better.
    """
    labels_ = np.asarray(data.get_label())
    n_samples = len(labels_)
    preds = np.asarray(preds, dtype=float)

    if preds.ndim == 2:
        prob = preds
    else:
        # Fixed: the class count is taken from the prediction length rather
        # than np.unique(labels), which under-counts whenever a class is
        # absent from this dataset and mis-slices the flat array.
        prob = preds.reshape(-1, n_samples).T

    # Keep log() finite when a probability is exactly 0 or 1.
    eps = 1e-15
    prob = np.clip(prob, eps, 1 - eps)

    # Boolean (n_samples, n_classes) mask marking each sample's true class.
    is_true_class = labels_[:, None] == np.arange(prob.shape[1])[None, :]
    log_lik = np.where(is_true_class, np.log(prob), np.log(1 - prob))

    return 'loss is: ',-1 * (np.sum(log_lik) / n_samples),False

基于LightGBM 的模型验证

# Model inputs: every merged column except the target and the file identifier.
train_features = [col for col in train_data.columns if col not in ['label','file_id']]
# NOTE: rebinds `train_label` (until now the file_id/label DataFrame) to the
# name of the target column.
train_label    = 'label'
%%time
from sklearn.model_selection import StratifiedKFold,KFold
# LightGBM settings for the 8-class classification task.
params = {
        'task':'train', 
        'num_leaves': 255,
        'objective': 'multiclass',
        'num_class': 8,
        'min_data_in_leaf': 50,
        'learning_rate': 0.05,
        'feature_fraction': 0.85,
        'bagging_fraction': 0.85,
        'bagging_freq': 5, 
        'max_bin':128,
        'random_state':100
    }   

# 5-fold shuffled split over the rows of train_data, with a fixed seed.
folds = KFold(n_splits=5, shuffle=True, random_state=15)
# NOTE(review): sized by the raw log `train` rather than `train_data`, and
# never written in this cell.
oof = np.zeros(len(train))

# NOTE(review): not used anywhere in this cell.
predict_res = 0
# One trained booster per fold; they are averaged in the model-testing step.
models = []
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_data)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(train_data.iloc[trn_idx][train_features], label=train_data.iloc[trn_idx][train_label].values)
    val_data = lgb.Dataset(train_data.iloc[val_idx][train_features], label=train_data.iloc[val_idx][train_label].values) 
    
    # Up to 2000 rounds with early stopping (patience 100); lgb_logloss is
    # passed as the custom evaluation function.
    clf = lgb.train(params, trn_data, num_boost_round=2000,valid_sets=[trn_data,val_data], verbose_eval=50, early_stopping_rounds=100, feval=lgb_logloss) 
    models.append(clf)
plt.figure(figsize=[10,8])
# Correlation heat map over the first 10000 rows and columns 1-20 of train_data.
sns.heatmap(train_data.iloc[:10000, 1:21].corr())
### Feature importance analysis
# NOTE: `clf` is whatever the training loop left bound, i.e. the model of the
# last fold only.
feature_importance             = pd.DataFrame()
feature_importance['fea_name'] = train_features
feature_importance['fea_imp']  = clf.feature_importance()
feature_importance             = feature_importance.sort_values('fea_imp',ascending = False)
feature_importance.sort_values('fea_imp',ascending = False)
# Ten most important features. A second, identical plt.figure(...) call that
# preceded this one only produced an extra empty figure and was removed.
plt.figure(figsize=[20, 10,])
sns.barplot(x = feature_importance.iloc[:10]['fea_name'], y = feature_importance.iloc[:10]['fea_imp'])
# All features.
plt.figure(figsize=[20, 10,])
sns.barplot(x = feature_importance['fea_name'], y = feature_importance['fea_imp'])

模型测试

# Average the class probabilities predicted by every fold model.
pred_res = 0
# Fixed: the divisor is taken from the models actually trained instead of a
# hard-coded 5, so the average stays correct if the fold count changes.
flod = len(models)
for model in models:
    pred_res += model.predict(test_submit[train_features]) * 1.0 / flod

prob_cols = ['prob{}'.format(k) for k in range(8)]

# Create the eight probability columns first, then fill them in one assignment.
for prob_col in prob_cols:
    test_submit[prob_col] = 0

test_submit[prob_cols] = pred_res
test_submit[['file_id'] + prob_cols].to_csv('baseline2.csv',index = None)

 

深度学习解决方案:TextCNN建模 代码

数据读取

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelBinarizer,LabelEncoder

import warnings
warnings.filterwarnings('ignore')
%matplotlib inline

# Load the raw train/test logs for the TextCNN pipeline.
path = '../security_data/'
train, test = (
    pd.read_csv(path + csv_name)
    for csv_name in ('security_train.csv', 'security_test.csv')
)

import numpy as np
import pandas as pd
from tqdm import tqdm  

class _Data_Preprocess:
    def __init__(self):
        self.int8_max = np.iinfo(np.int8).max
        self.int8_min = np.iinfo(np.int8).min

        self.int16_max = np.iinfo(np.int16).max
        self.int16_min = np.iinfo(np.int16).min

        self.int32_max = np.iinfo(np.int32).max
        self.int32_min = np.iinfo(np.int32).min

        self.int64_max = np.iinfo(np.int64).max
        self.int64_min = np.iinfo(np.int64).min

        self.float16_max = np.finfo(np.float16).max
        self.float16_min = np.finfo(np.float16).min

        self.float32_max = np.finfo(np.float32).max
        self.float32_min = np.finfo(np.float32).min

        self.float64_max = np.finfo(np.float64).max
        self.float64_min = np.finfo(np.float64).min

    def _get_type(self, min_val, max_val, types):
        if types == 'int':
            if max_val <= self.int8_max and min_val >= self.int8_min:
                return np.int8
            elif max_val <= self.int16_max <= max_val and min_val >= self.int16_min:
                return np.int16
            elif max_val <= self.int32_max and min_val >= self.int32_min:
                return np.int32
            return None

        elif types == 'float':
            if max_val <= self.float16_max and min_val >= self.float16_min:
                return np.float16
            if max_val <= self.float32_max and min_val >= self.float32_min:
                return np.float32
            if max_val <= self.float64_max and min_val >= self.float64_min:
                return np.float64
            return None

    def _memory_process(self, df):
        init_memory = df.memory_usage().sum() / 1024 ** 2 / 1024
        print('Original data occupies {} GB memory.'.format(init_memory))
        df_cols = df.columns

          
        for col in tqdm_notebook(df_cols):
            try:
                if 'float' in str(df[col].dtypes):
                    max_val = df[col].max()
                    min_val = df[col].min()
                    trans_types = self._get_type(min_val, max_val, 'float')
                    if trans_types is not None:
                        df[col] = df[col].astype(trans_types)
                elif 'int' in str(df[col].dtypes):
                    max_val = df[col].max()
                    min_val = df[col].min()
                    trans_types = self._get_type(min_val, max_val, 'int')
                    if trans_types is not None:
                        df[col] = df[col].astype(trans_types)
            except:
                print(' Can not do any process for column, {}.'.format(col)) 
        afterprocess_memory = df.memory_usage().sum() / 1024 ** 2 / 1024
        print('After processing, the data occupies {} GB memory.'.format(afterprocess_memory))
        return df

# Instantiate the down-casting helper defined above.
memory_process = _Data_Preprocess()

# Preview the first rows of the training log.
train.head()

数据预处理

# Encode API names as integers. Indices start at 1; 0 is left free.
unique_api = train['api'].unique()

api2index = {item:(i+1) for i,item in enumerate(unique_api)}
index2api = {(i+1):item for i,item in enumerate(unique_api)}

train['api_idx'] = train['api'].map(api2index)
# Fixed: an API that occurs only in the test set has no entry in api2index, so
# map() yields NaN for it, which turns the column into floats and puts NaN
# into the sequences. Such APIs are sent to the unused index 0 and the integer
# dtype is restored.
test['api_idx']  = test['api'].map(api2index).fillna(0).astype(int)

# 获取每个文件对应的字符串序列
def get_sequence(df,period_idx):
    """Split ``df['api_idx']`` into one array per file.

    Args:
        df: DataFrame with an ``api_idx`` column.
        period_idx: Row positions at which each file starts, in ascending
            order; each sequence runs from one start up to (not including)
            the next, and the last one runs to the end of ``df``.

    Returns:
        List of arrays, one per entry of ``period_idx``.

    Raises:
        IndexError: If ``period_idx`` is empty.
    """
    last_start = period_idx[-1]
    sequences = [
        df.iloc[start:stop]['api_idx'].values
        for start, stop in zip(period_idx[:-1], period_idx[1:])
    ]
    sequences.append(df.iloc[last_start:]['api_idx'].values)
    return sequences

# Index label of the first row of every file; get_sequence() uses these as
# slice boundaries.
# NOTE(review): they are used as positions (iloc), so this presumably relies on
# the default 0..n-1 index and on each file's rows being contiguous — confirm.
train_period_idx = train.file_id.drop_duplicates(keep='first').index.values
test_period_idx  = test.file_id.drop_duplicates(keep='first').index.values

# One row per file.
# NOTE(review): train_df de-duplicates on (file_id, label) while the boundaries
# above de-duplicate on file_id only; the 'seq' assignment below needs both to
# yield the same number of rows.
train_df = train[['file_id','label']].drop_duplicates(keep='first')
test_df  = test[['file_id']].drop_duplicates(keep='first')

# Attach each file's API-index sequence.
train_df['seq'] = get_sequence(train,train_period_idx)
test_df['seq']  = get_sequence(test,test_period_idx)

TextCNN网络结构

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Lambda, Embedding, Dropout, Activation,GRU,Bidirectional
from keras.layers import Conv1D,Conv2D,MaxPooling2D,GlobalAveragePooling1D,GlobalMaxPooling1D, MaxPooling1D, Flatten
from keras.layers import CuDNNGRU, CuDNNLSTM, SpatialDropout1D
from keras.layers.merge import concatenate, Concatenate, Average, Dot, Maximum, Multiply, Subtract, average
from keras.models import Model
from keras.optimizers import RMSprop,Adam
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import SGD
from keras import backend as K
from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation
from keras.layers import SpatialDropout1D
from keras.layers.wrappers import Bidirectional
def TextCNN(max_len,max_cnt,embed_size, num_filters,kernel_size,conv_action, mask_zero):
    """Build and compile the TextCNN classifier.

    Args:
        max_len: Length of the integer input sequences.
        max_cnt: Embedding vocabulary size (``input_dim``).
        embed_size: Embedding dimension.
        num_filters: Number of filters of every convolution branch.
        kernel_size: Iterable of kernel widths, one Conv1D branch per width.
        conv_action: Activation of the convolution layers.
        mask_zero: Passed through to the ``Embedding`` layer.

    Returns:
        A compiled Keras ``Model`` with an 8-way softmax output, trained with
        categorical cross-entropy and the Adam optimizer.
    """
    tokens = Input(shape=(max_len,), dtype='int32')
    embedded = Embedding(max_cnt, embed_size, input_length=max_len, mask_zero=mask_zero)(tokens)
    embedded = SpatialDropout1D(0.15)(embedded)

    def _pooled_branch(width):
        # One convolution of the given width followed by max-over-time pooling.
        conv = Conv1D(filters=num_filters, kernel_size=width, activation=conv_action)(embedded)
        return GlobalMaxPooling1D()(conv)

    pooled_branches = [_pooled_branch(width) for width in kernel_size]

    hidden = concatenate(pooled_branches)
    hidden = Dropout(0.5)(hidden)
    hidden = Dense(256, activation='relu')(hidden)
    hidden = Dropout(0.25)(hidden)
    preds = Dense(8, activation = 'softmax')(hidden)

    model = Model(inputs=tokens, outputs=preds)
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

# One-hot encode the labels (one column per distinct label value).
train_labels = pd.get_dummies(train_df.label).values
# Bring every API sequence to a common length of 6000 so they can be batched.
# NOTE(review): pad_sequences' default padding/truncating side applies here —
# confirm it is the intended one.
train_seq    = pad_sequences(train_df.seq.values, maxlen = 6000)
test_seq     = pad_sequences(test_df.seq.values, maxlen = 6000)

TextCNN训练和预测

from sklearn.model_selection import StratifiedKFold,KFold 
# NOTE(review): no random_state, so the folds differ between runs; weights
# saved by an earlier run then belong to different splits.
skf = KFold(n_splits=5, shuffle=True)

max_len     = 6000
# Fixed: the embedding's input_dim must be larger than the biggest token index.
# API indices run from 1 to len(api2index), so at least len(api2index) + 1 rows
# are needed; the original constant 295 is kept as a lower bound so existing
# weight files still fit whenever it was already large enough.
max_cnt     = max(295, len(api2index) + 1)
embed_size  = 256
num_filters = 64
kernel_size = [2,4,6,8,10,12,14]
conv_action = 'relu'
mask_zero   = False
TRAIN       = True

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
# Out-of-fold predictions for the training files, and the running sum of the
# per-fold predictions for the test files.
meta_train = np.zeros(shape = (len(train_seq),8))
meta_test = np.zeros(shape = (len(test_seq),8))
FLAG = True
# The checkpoints below are written into ./NN; make sure the directory exists.
os.makedirs('./NN', exist_ok=True)
i = 0
for tr_ind,te_ind in skf.split(train_labels):
    i +=1
    # Fixed: the format string had no placeholder, so the fold number was
    # never printed.
    print('FOLD: {}'.format(i))
    print(len(te_ind),len(tr_ind)) 
    model_name = 'benchmark_textcnn_fold_'+str(i)
    X_train,X_train_label = train_seq[tr_ind],train_labels[tr_ind]
    X_val,X_val_label     = train_seq[te_ind],train_labels[te_ind]
    
    model = TextCNN(max_len,max_cnt,embed_size,num_filters,kernel_size,conv_action,mask_zero)
    
    model_save_path = './NN/%s_%s.hdf5'%(model_name,embed_size)
    early_stopping =EarlyStopping(monitor='val_loss', patience=3)
    model_checkpoint = ModelCheckpoint(model_save_path, save_best_only=True, save_weights_only=True)
    if TRAIN and FLAG:
        model.fit(X_train,X_train_label,validation_data=(X_val,X_val_label),epochs=100,batch_size=64,shuffle=True,callbacks=[early_stopping,model_checkpoint] )
    
    # Predict with the best checkpoint of this fold, not the last epoch.
    model.load_weights(model_save_path)
    pred_val = model.predict(X_val,batch_size=128,verbose=1)
    pred_test = model.predict(test_seq,batch_size=128,verbose=1)
    
    meta_train[te_ind] = pred_val
    meta_test += pred_test
    K.clear_session()
# Fixed: average over the folds actually run instead of a hard-coded 5.0.
meta_test /= float(i)

结果提交

prob_cols = ['prob%d' % k for k in range(8)]

# Create the eight probability columns first, then fill them in one assignment.
for prob_col in prob_cols:
    test_df[prob_col] = 0

test_df[prob_cols] = meta_test
test_df[['file_id'] + prob_cols].to_csv('nn_baseline_5fold.csv',index = None)

以上内容和代码全部来自于《阿里云天池大赛赛题解析(机器学习篇)》这本好书,十分推荐大家去阅读原书!

你可能感兴趣的:(人工智能,Python,机器学习,阿里云,安全)