阿里云作为国内最大的云服务提供商,每天都面临着网络上海量的恶意攻击。
本题目提供了一批恶意文件数据,包括感染型病毒、木马程序、挖矿程序、DDoS木马、勒索病毒等,总计6亿条数据。每个文件的数据包含API调用顺序及线程等相关信息,我们需要训练模型,将测试文件正确归类(预测出是哪种病毒),因此这是一个典型的多分类问题。
常见的分类算法:朴素贝叶斯,决策树,支持向量机,KNN,逻辑回归等等;
集成学习:随机森林,GBDT(梯度提升决策树),AdaBoost,XGBoost,LightGBM,CatBoost等等;
神经网络:MLP(多层神经网络),DL(深度学习)等。
一个典型的机器学习实战流程基本包括 1) 数据处理,2) 特征选取、优化,和 3) 模型选取、验证、优化。 因为 “数据和特征决定了机器学习的上限,而模型和算法只是逼近这个上限而已。” 所以在解决一个机器学习问题时大部分时间都会花在数据处理和特征优化上。
大家最好在jupyter notebook上一段一段地跑下面的代码,加深理解。
机器学习的基本知识可以康康我的其他文章哦 好康的。
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm_notebook
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# 内存管理
import numpy as np
import pandas as pd
from tqdm import tqdm
class _Data_Preprocess:
    """Shrink numeric DataFrame columns to the narrowest dtype covering their values."""

    def __init__(self):
        # Cache the representable range of every candidate dtype once.
        self.int8_max = np.iinfo(np.int8).max
        self.int8_min = np.iinfo(np.int8).min
        self.int16_max = np.iinfo(np.int16).max
        self.int16_min = np.iinfo(np.int16).min
        self.int32_max = np.iinfo(np.int32).max
        self.int32_min = np.iinfo(np.int32).min
        self.int64_max = np.iinfo(np.int64).max
        self.int64_min = np.iinfo(np.int64).min
        self.float16_max = np.finfo(np.float16).max
        self.float16_min = np.finfo(np.float16).min
        self.float32_max = np.finfo(np.float32).max
        self.float32_min = np.finfo(np.float32).min
        self.float64_max = np.finfo(np.float64).max
        self.float64_min = np.finfo(np.float64).min

    def _get_type(self, min_val, max_val, types):
        """Pick the smallest dtype of one family that can hold [min_val, max_val].

        Args:
            min_val: smallest value present in the column.
            max_val: largest value present in the column.
            types: 'int' or 'float'; any other value yields None.

        Returns:
            np.int8 / np.int16 / np.int32 for 'int', np.float16 / np.float32 /
            np.float64 for 'float', or None when no listed dtype fits (the
            caller then leaves the column untouched).
        """
        if types == 'int':
            if max_val <= self.int8_max and min_val >= self.int8_min:
                return np.int8
            # The original test was `max_val <= int16_max <= max_val`, which
            # only matched max_val == 32767 and pushed everything else to int32.
            elif max_val <= self.int16_max and min_val >= self.int16_min:
                return np.int16
            elif max_val <= self.int32_max and min_val >= self.int32_min:
                return np.int32
            return None
        elif types == 'float':
            # NOTE: the choice is made on range only; narrowing to float16 or
            # float32 keeps the magnitude but reduces precision.
            if max_val <= self.float16_max and min_val >= self.float16_min:
                return np.float16
            if max_val <= self.float32_max and min_val >= self.float32_min:
                return np.float32
            if max_val <= self.float64_max and min_val >= self.float64_min:
                return np.float64
            return None
        return None

    def _memory_process(self, df):
        """Downcast every float / int column of `df` in place and return `df`.

        Memory usage is printed (in GB) before and after the conversion.
        Columns whose dtype name contains neither 'float' nor 'int' are skipped.
        A column that fails to convert is reported and left as it was.
        """
        init_memory = df.memory_usage().sum() / 1024 ** 2 / 1024
        print('Original data occupies {} GB memory.'.format(init_memory))
        df_cols = df.columns
        for col in tqdm_notebook(df_cols):
            try:
                dtype_name = str(df[col].dtypes)
                if 'float' in dtype_name:
                    family = 'float'
                elif 'int' in dtype_name:
                    family = 'int'
                else:
                    continue
                max_val = df[col].max()
                min_val = df[col].min()
                trans_types = self._get_type(min_val, max_val, family)
                if trans_types is not None:
                    df[col] = df[col].astype(trans_types)
            except Exception:
                # Best effort: keep going, but no longer swallow
                # KeyboardInterrupt / SystemExit as the bare `except:` did.
                print(' Can not do any process for column, {}.'.format(col))
        afterprocess_memory = df.memory_usage().sum() / 1024 ** 2 / 1024
        print('After processing, the data occupies {} GB memory.'.format(afterprocess_memory))
        return df


# Shared instance used by the feature builders in this file.
memory_process = _Data_Preprocess()
### Load data
# Directory that holds the competition CSV files, relative to the working directory.
path = '../security_data/'
# Read the training and test tables; later cells read the columns
# file_id, label (train only), api, tid and index from them.
train = pd.read_csv(path + 'security_train.csv')
test = pd.read_csv(path + 'security_test.csv')
# Notebook display of the first rows of the training table.
train.head()
def simple_sts_features(df):
    """Build per-file count / distinct-count features.

    For each of the columns api, tid and index the result carries
    `file_id_<col>_count` and `file_id_<col>_nunique`, one row per file_id,
    ordered by ascending file_id.
    """
    # Sorting the ids lines the rows up with the groupby output, which is
    # ordered by file_id and assigned positionally through `.values`.
    features = pd.DataFrame({'file_id': df['file_id'].unique()}).sort_values('file_id')
    grouped = df.groupby('file_id')
    for column in ('api', 'tid', 'index'):
        per_file = grouped[column]
        features['file_id_' + column + '_count'] = per_file.count().values
        features['file_id_' + column + '_nunique'] = per_file.nunique().values
    return features
%%time
# Count / distinct-count features per file for the training table.
simple_train_fea1 = simple_sts_features(train)
%%time
# Same features for the test table.
simple_test_fea1 = simple_sts_features(test)
def simple_numerical_sts_features(df):
    """Build per-file mean / min / std / max features of the tid and index columns.

    The result has one row per file_id, ordered by ascending file_id, with
    columns named `file_id_<col>_<stat>`.
    """
    # Sorted ids match the file_id ordering of the groupby results, which are
    # assigned positionally through `.values`.
    stats = pd.DataFrame({'file_id': df['file_id'].unique()}).sort_values('file_id')
    grouped = df.groupby('file_id')
    for column in ('tid', 'index'):
        per_file = grouped[column]
        for stat in ('mean', 'min', 'std', 'max'):
            # getattr keeps this the same call as e.g. per_file.mean().
            stats['file_id_' + column + '_' + stat] = getattr(per_file, stat)().values
    return stats
%%time
# mean / min / std / max of tid and index per file for the training table.
simple_train_fea2 = simple_numerical_sts_features(train)
%%time
# Same features for the test table.
simple_test_fea2 = simple_numerical_sts_features(test)
def api_pivot_count_features(df):
    """Pivot per-(file, API) row counts into one wide row per file.

    Counts the non-null tid values for every (file_id, api) pair, spreads the
    APIs into columns named `api_pivot_<api>` (missing pairs become 0), turns
    file_id back into a regular column, and downcasts the result with
    `memory_process._memory_process`.
    """
    per_pair = (
        df.groupby(['file_id', 'api'])['tid']
        .count()
        .to_frame('api_tid_count')
        .reset_index()
    )
    wide = pd.pivot_table(
        data=per_pair,
        index='file_id',
        columns='api',
        values='api_tid_count',
        fill_value=0,
    )
    # The column axis is named after the pivoted column ('api').
    prefix = wide.columns.names[0]
    wide.columns = [prefix + '_pivot_' + str(api_name) for api_name in wide.columns]
    wide.reset_index(inplace=True)
    return memory_process._memory_process(wide)
%%time
# Wide per-file table of API call counts for the training table.
simple_train_fea3 = api_pivot_count_features(train)
%%time
# Same table for the test table.
simple_test_fea3 = api_pivot_count_features(test)
def api_pivot_nunique_features(df):
    """Pivot per-(file, API) distinct-tid counts into one wide row per file.

    Counts the distinct tid values for every (file_id, api) pair, spreads the
    APIs into columns named `api_pivot_<api>` (missing pairs become 0), turns
    file_id back into a regular column, and downcasts the result with
    `memory_process._memory_process`.
    """
    per_pair = (
        df.groupby(['file_id', 'api'])['tid']
        .nunique()
        .to_frame('api_tid_nunique')
        .reset_index()
    )
    wide = pd.pivot_table(
        data=per_pair,
        index='file_id',
        columns='api',
        values='api_tid_nunique',
        fill_value=0,
    )
    # The column axis is named after the pivoted column ('api').
    prefix = wide.columns.names[0]
    wide.columns = [prefix + '_pivot_' + str(api_name) for api_name in wide.columns]
    wide.reset_index(inplace=True)
    return memory_process._memory_process(wide)
%%time
simple_train_fea4 = api_pivot_count_features(train)
%%time
simple_test_fea4 = api_pivot_count_features(test)
# One row per (file_id, label) for training and one row per file_id for test.
train_label = train[['file_id', 'label']].drop_duplicates(subset=['file_id', 'label'], keep='first')
test_submit = test[['file_id']].drop_duplicates(subset=['file_id'], keep='first')

# Left-join the four feature tables onto the id frames, in order, by file_id.
train_data = train_label
for _fea in (simple_train_fea1, simple_train_fea2, simple_train_fea3, simple_train_fea4):
    train_data = train_data.merge(_fea, on='file_id', how='left')

for _fea in (simple_test_fea1, simple_test_fea2, simple_test_fea3, simple_test_fea4):
    test_submit = test_submit.merge(_fea, on='file_id', how='left')
### Evaluation metric
def lgb_logloss(preds, data):
    """Custom LightGBM eval function: mean per-sample sum of one-vs-rest log losses.

    For every sample the loss adds log(p) for the true class and log(1 - p)
    for every other class; the negated mean over samples is returned.

    Args:
        preds: predicted class probabilities, either flat in class-major order
            (all samples of class 0, then class 1, ...) or as a 2-D array of
            shape (n_samples, n_classes).
        data: object exposing get_label() (the LightGBM Dataset being scored).

    Returns:
        The tuple ('loss is: ', loss, False); the last item tells LightGBM
        that lower values are better.
    """
    labels_ = np.asarray(data.get_label()).astype(int)
    n_samples = len(labels_)
    preds = np.asarray(preds, dtype=float)
    if preds.ndim == 2:
        # (n_samples, n_classes) -> (n_classes, n_samples)
        preds_prob_ = preds.T
    else:
        # Take the class count from the prediction length. np.unique(labels)
        # under-counts whenever a fold happens to miss a class, which made the
        # original slice the flat array at the wrong places.
        preds_prob_ = preds.reshape(-1, n_samples)
    n_classes = preds_prob_.shape[0]
    # Keep log() finite for probabilities of exactly 0 or 1.
    eps = 1e-15
    preds_prob_ = np.clip(preds_prob_, eps, 1 - eps)
    # is_true[j, i] is True when class j is the label of sample i.
    is_true = np.arange(n_classes)[:, None] == labels_[None, :]
    log_terms = np.where(is_true, np.log(preds_prob_), np.log(1 - preds_prob_))
    loss = -1 * (log_terms.sum() / n_samples)
    return 'loss is: ', loss, False
train_features = [col for col in train_data.columns if col not in ['label','file_id']]
train_label = 'label'
%%time
from sklearn.model_selection import StratifiedKFold,KFold
params = {
'task':'train',
'num_leaves': 255,
'objective': 'multiclass',
'num_class': 8,
'min_data_in_leaf': 50,
'learning_rate': 0.05,
'feature_fraction': 0.85,
'bagging_fraction': 0.85,
'bagging_freq': 5,
'max_bin':128,
'random_state':100
}
folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(len(train))
predict_res = 0
models = []
for fold_, (trn_idx, val_idx) in enumerate(folds.split(train_data)):
print("fold n°{}".format(fold_))
trn_data = lgb.Dataset(train_data.iloc[trn_idx][train_features], label=train_data.iloc[trn_idx][train_label].values)
val_data = lgb.Dataset(train_data.iloc[val_idx][train_features], label=train_data.iloc[val_idx][train_label].values)
clf = lgb.train(params, trn_data, num_boost_round=2000,valid_sets=[trn_data,val_data], verbose_eval=50, early_stopping_rounds=100, feval=lgb_logloss)
models.append(clf)
# Correlation heat-map over the first 10,000 rows and columns 1..20 of train_data.
plt.figure(figsize=[10,8])
sns.heatmap(train_data.iloc[:10000, 1:21].corr())
### Feature importance analysis
feature_importance = pd.DataFrame()
feature_importance['fea_name'] = train_features
# clf is the booster left over from the last CV fold, so the importances
# below describe that fold's model only.
feature_importance['fea_imp'] = clf.feature_importance()
feature_importance = feature_importance.sort_values('fea_imp',ascending = False)
# Notebook display of the sorted table; the result is not assigned.
feature_importance.sort_values('fea_imp',ascending = False)
# NOTE(review): two consecutive plt.figure calls; the first figure stays empty.
plt.figure(figsize=[20, 10,])
plt.figure(figsize=[20, 10,])
# Bar chart of the ten most important features.
sns.barplot(x = feature_importance.iloc[:10]['fea_name'], y = feature_importance.iloc[:10]['fea_imp'])
plt.figure(figsize=[20, 10,])
# Bar chart of every feature.
sns.barplot(x = feature_importance['fea_name'], y = feature_importance['fea_imp'])
# Average the class probabilities predicted by every fold model.
pred_res = 0
# Divide by the number of models actually trained instead of a literal 5, so
# the average stays correct if the fold count changes.
flod = len(models)
for model in models:
    pred_res += model.predict(test_submit[train_features]) * 1.0 / flod

# One probability column per class (8 classes).
prob_cols = ['prob{}'.format(k) for k in range(8)]
# Create the columns before the block assignment, as the original did
# (presumably because assigning to a list of missing columns fails on older
# pandas; confirm before removing).
for _col in prob_cols:
    test_submit[_col] = 0
test_submit[prob_cols] = pred_res
test_submit[['file_id'] + prob_cols].to_csv('baseline2.csv', index=None)
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelBinarizer,LabelEncoder
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
# Re-read the raw training and test CSVs (same paths as earlier in this file).
path = '../security_data/'
train = pd.read_csv(path + 'security_train.csv')
test = pd.read_csv(path + 'security_test.csv')
import numpy as np
import pandas as pd
from tqdm import tqdm
class _Data_Preprocess:
    """Shrink numeric DataFrame columns to the narrowest dtype covering their values."""

    def __init__(self):
        # Cache the representable range of every candidate dtype once.
        self.int8_max = np.iinfo(np.int8).max
        self.int8_min = np.iinfo(np.int8).min
        self.int16_max = np.iinfo(np.int16).max
        self.int16_min = np.iinfo(np.int16).min
        self.int32_max = np.iinfo(np.int32).max
        self.int32_min = np.iinfo(np.int32).min
        self.int64_max = np.iinfo(np.int64).max
        self.int64_min = np.iinfo(np.int64).min
        self.float16_max = np.finfo(np.float16).max
        self.float16_min = np.finfo(np.float16).min
        self.float32_max = np.finfo(np.float32).max
        self.float32_min = np.finfo(np.float32).min
        self.float64_max = np.finfo(np.float64).max
        self.float64_min = np.finfo(np.float64).min

    def _get_type(self, min_val, max_val, types):
        """Pick the smallest dtype of one family that can hold [min_val, max_val].

        Args:
            min_val: smallest value present in the column.
            max_val: largest value present in the column.
            types: 'int' or 'float'; any other value yields None.

        Returns:
            np.int8 / np.int16 / np.int32 for 'int', np.float16 / np.float32 /
            np.float64 for 'float', or None when no listed dtype fits (the
            caller then leaves the column untouched).
        """
        if types == 'int':
            if max_val <= self.int8_max and min_val >= self.int8_min:
                return np.int8
            # The original test was `max_val <= int16_max <= max_val`, which
            # only matched max_val == 32767 and pushed everything else to int32.
            elif max_val <= self.int16_max and min_val >= self.int16_min:
                return np.int16
            elif max_val <= self.int32_max and min_val >= self.int32_min:
                return np.int32
            return None
        elif types == 'float':
            # NOTE: the choice is made on range only; narrowing to float16 or
            # float32 keeps the magnitude but reduces precision.
            if max_val <= self.float16_max and min_val >= self.float16_min:
                return np.float16
            if max_val <= self.float32_max and min_val >= self.float32_min:
                return np.float32
            if max_val <= self.float64_max and min_val >= self.float64_min:
                return np.float64
            return None
        return None

    def _memory_process(self, df):
        """Downcast every float / int column of `df` in place and return `df`.

        Memory usage is printed (in GB) before and after the conversion.
        Columns whose dtype name contains neither 'float' nor 'int' are skipped.
        A column that fails to convert is reported and left as it was.
        """
        init_memory = df.memory_usage().sum() / 1024 ** 2 / 1024
        print('Original data occupies {} GB memory.'.format(init_memory))
        df_cols = df.columns
        for col in tqdm_notebook(df_cols):
            try:
                dtype_name = str(df[col].dtypes)
                if 'float' in dtype_name:
                    family = 'float'
                elif 'int' in dtype_name:
                    family = 'int'
                else:
                    continue
                max_val = df[col].max()
                min_val = df[col].min()
                trans_types = self._get_type(min_val, max_val, family)
                if trans_types is not None:
                    df[col] = df[col].astype(trans_types)
            except Exception:
                # Best effort: keep going, but no longer swallow
                # KeyboardInterrupt / SystemExit as the bare `except:` did.
                print(' Can not do any process for column, {}.'.format(col))
        afterprocess_memory = df.memory_usage().sum() / 1024 ** 2 / 1024
        print('After processing, the data occupies {} GB memory.'.format(afterprocess_memory))
        return df


# Shared instance of the downcasting helper.
memory_process = _Data_Preprocess()
train.head()
# Turn API-name strings into integer ids. Ids start at 1, so 0 is never
# assigned to a training API.
unique_api = train['api'].unique()
api2index = {item: idx for idx, item in enumerate(unique_api, start=1)}
index2api = {idx: item for idx, item in enumerate(unique_api, start=1)}
train['api_idx'] = train['api'].map(api2index)
# An API that never occurs in train has no dictionary entry and maps to NaN,
# which turns the column into float and leaks NaN into the sequences. Send
# those rows to the unused id 0 and keep the column integer.
test['api_idx'] = test['api'].map(api2index).fillna(0).astype('int64')
# Collect the api_idx sequence that belongs to each file.
def get_sequence(df,period_idx):
    """Cut the api_idx column into consecutive per-file arrays.

    Args:
        df: frame with an 'api_idx' column.
        period_idx: row positions at which each file starts, in ascending
            order; used as positional (iloc) slice bounds.

    Returns:
        A list with one array per entry of period_idx; the last array runs
        to the end of df.
    """
    # Look the final start up first so an empty period_idx still fails with
    # the IndexError the caller would have seen before.
    final_start = period_idx[-1]
    api_ids = df['api_idx']
    pieces = [
        api_ids.iloc[start:stop].values
        for start, stop in zip(period_idx[:-1], period_idx[1:])
    ]
    pieces.append(api_ids.iloc[final_start:].values)
    return pieces
# Index label of the first row of every file_id. get_sequence uses these as
# positional slice bounds, which assumes the default 0..n-1 index and that
# each file's rows are contiguous. TODO confirm against the data.
train_period_idx = train.file_id.drop_duplicates(keep='first').index.values
test_period_idx = test.file_id.drop_duplicates(keep='first').index.values
# One row per distinct (file_id, label) pair for train, per file_id for test.
train_df = train[['file_id','label']].drop_duplicates(keep='first')
test_df = test[['file_id']].drop_duplicates(keep='first')
# Attach each file's api_idx sequence; the list returned by get_sequence is
# assigned in row order.
train_df['seq'] = get_sequence(train,train_period_idx)
test_df['seq'] = get_sequence(test,test_period_idx)
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Dense, Input, LSTM, Lambda, Embedding, Dropout, Activation,GRU,Bidirectional
from keras.layers import Conv1D,Conv2D,MaxPooling2D,GlobalAveragePooling1D,GlobalMaxPooling1D, MaxPooling1D, Flatten
from keras.layers import CuDNNGRU, CuDNNLSTM, SpatialDropout1D
from keras.layers.merge import concatenate, Concatenate, Average, Dot, Maximum, Multiply, Subtract, average
from keras.models import Model
from keras.optimizers import RMSprop,Adam
from keras.layers.normalization import BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.optimizers import SGD
from keras import backend as K
from sklearn.decomposition import TruncatedSVD, NMF, LatentDirichletAllocation
from keras.layers import SpatialDropout1D
from keras.layers.wrappers import Bidirectional
def TextCNN(max_len, max_cnt, embed_size, num_filters, kernel_size, conv_action, mask_zero):
    """Build and compile a multi-width 1-D convolutional classifier over id sequences.

    Args:
        max_len: length of every input sequence.
        max_cnt: vocabulary size handed to the Embedding layer.
        embed_size: embedding dimension.
        num_filters: number of filters in each convolution branch.
        kernel_size: iterable of convolution widths, one branch per width.
        conv_action: activation used by the convolution layers.
        mask_zero: forwarded to the Embedding layer.

    Returns:
        A compiled Model with an 8-way softmax output, categorical
        cross-entropy loss, the 'adam' optimizer and an accuracy metric.
    """
    sequence_in = Input(shape=(max_len,), dtype='int32')
    embedded = Embedding(max_cnt, embed_size, input_length=max_len, mask_zero=mask_zero)(sequence_in)
    embedded = SpatialDropout1D(0.15)(embedded)

    def _branch(width):
        # One convolution of the given width followed by max-over-time pooling.
        feature_map = Conv1D(filters=num_filters, kernel_size=width, activation=conv_action)(embedded)
        return GlobalMaxPooling1D()(feature_map)

    pooled_branches = [_branch(width) for width in kernel_size]

    hidden = concatenate(pooled_branches)
    hidden = Dropout(0.5)(hidden)
    hidden = Dense(256, activation='relu')(hidden)
    hidden = Dropout(0.25)(hidden)
    class_probs = Dense(8, activation='softmax')(hidden)

    network = Model(inputs=sequence_in, outputs=class_probs)
    network.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network
# One-hot targets, and every API-id sequence brought to a fixed length of 6000.
train_labels = pd.get_dummies(train_df.label).values
train_seq = pad_sequences(train_df.seq.values, maxlen = 6000)
test_seq = pad_sequences(test_df.seq.values, maxlen = 6000)
from sklearn.model_selection import StratifiedKFold,KFold
# Single source of truth for the fold count: it configures the splitter and
# is the divisor of the test-prediction average below.
n_splits = 5
skf = KFold(n_splits=n_splits, shuffle=True)
max_len = 6000
# NOTE(review): handed to Embedding as the vocabulary size. api_idx starts at
# 1, so this presumably has to exceed the largest id. TODO confirm 295
# against len(unique_api).
max_cnt = 295
embed_size = 256
num_filters = 64
kernel_size = [2,4,6,8,10,12,14]
conv_action = 'relu'
mask_zero = False
TRAIN = True
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
# Checkpoints are written below ./NN; create it so saving does not depend on
# the directory already existing.
os.makedirs('./NN', exist_ok=True)
# Out-of-fold predictions for train and accumulated predictions for test.
meta_train = np.zeros(shape = (len(train_seq),8))
meta_test = np.zeros(shape = (len(test_seq),8))
FLAG = True
i = 0
for tr_ind,te_ind in skf.split(train_labels):
    i +=1
    # The original format string had no placeholder, so the fold number was
    # never printed.
    print('FOLD: {}'.format(i))
    print(len(te_ind),len(tr_ind))
    model_name = 'benchmark_textcnn_fold_'+str(i)
    X_train,X_train_label = train_seq[tr_ind],train_labels[tr_ind]
    X_val,X_val_label = train_seq[te_ind],train_labels[te_ind]
    model = TextCNN(max_len,max_cnt,embed_size,num_filters,kernel_size,conv_action,mask_zero)
    model_save_path = './NN/%s_%s.hdf5'%(model_name,embed_size)
    early_stopping =EarlyStopping(monitor='val_loss', patience=3)
    model_checkpoint = ModelCheckpoint(model_save_path, save_best_only=True, save_weights_only=True)
    if TRAIN and FLAG:
        model.fit(X_train,X_train_label,validation_data=(X_val,X_val_label),epochs=100,batch_size=64,shuffle=True,callbacks=[early_stopping,model_checkpoint] )
    # Predict with the checkpointed weights, not the last-epoch weights.
    model.load_weights(model_save_path)
    pred_val = model.predict(X_val,batch_size=128,verbose=1)
    pred_test = model.predict(test_seq,batch_size=128,verbose=1)
    meta_train[te_ind] = pred_val
    meta_test += pred_test
    K.clear_session()
# Average the accumulated test predictions over the folds.
meta_test /= n_splits
# Write the averaged class probabilities next to file_id and save them.
prob_cols = ['prob{}'.format(k) for k in range(8)]
# Create the columns first, then fill all eight at once.
for _col in prob_cols:
    test_df[_col] = 0
test_df[prob_cols] = meta_test
test_df[['file_id'] + prob_cols].to_csv('nn_baseline_5fold.csv', index=None)
以上内容和代码全部来自于《阿里云天池大赛赛题解析(机器学习篇)》这本好书,十分推荐大家去阅读原书!