基于TensorFlow2实现MalConv恶意软件检测

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, Conv1D, multiply, GlobalMaxPool1D, Input, Activation
import tensorflow as tf
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import datetime
%matplotlib inline
import csv
import os
import struct
from sklearn.model_selection import train_test_split

定义一维卷积神经网络

def Malconv(max_len=2000000, win_size=500, vocab_size=256):
    """Build the gated 1-D convolutional MalConv classifier.

    Args:
        max_len: Length of the integer input sequence fed to the model.
        win_size: Used as both kernel size and stride of the two convolutions,
            so consecutive windows do not overlap.
        vocab_size: Number of distinct input tokens for the embedding table.

    Returns:
        An uncompiled Keras ``Model`` mapping a ``(max_len,)`` input to a
        single sigmoid output.
    """
    byte_input = Input((max_len,))
    embedded = Embedding(vocab_size, 8)(byte_input)

    # Both convolution branches share the same hyper-parameters.
    conv_kwargs = dict(
        kernel_size=(win_size),
        filters=128,
        strides=(win_size),
        padding='same',
    )
    features = Conv1D(**conv_kwargs)(embedded)
    gate_logits = Conv1D(**conv_kwargs)(embedded)
    gate = Activation('sigmoid', name='sigmoid')(gate_logits)

    # Gate the feature branch with the sigmoid branch, then rectify.
    gated = multiply([features, gate])
    rectified = Activation('relu', name='relu')(gated)

    # Keep the strongest response of each filter across all windows.
    pooled = GlobalMaxPool1D()(rectified)
    hidden = Dense(64)(pooled)
    prediction = Dense(1, activation='sigmoid')(hidden)

    return Model(byte_input, prediction)

定义变量

# Directories holding the malware and benign PE samples. Raw strings keep the
# Windows backslashes literal; "\m" and "\P" are invalid escape sequences in a
# normal string literal and trigger a warning on recent Python versions.
mal_file_dir = r'E:\malware1500'
ben_file_dir = r'E:\PeBenFileForEXE'
# Maximum number of samples taken from each class when building the CSV.
ben_max_sum = 1000
mal_max_sum = 1000

遍历PE文件夹中的恶意与良性软件,做好标签存入CSV

def _collect_samples(directory, label, limit, max_size=2000000):
    """Return up to ``limit`` ``(filename, label)`` rows for files in ``directory``.

    Only regular files strictly smaller than ``max_size`` bytes are kept, so
    sub-directories (which cannot be read as samples later) are skipped.
    """
    rows = []
    for name in os.listdir(directory):
        # Check the quota first so a limit of 0 yields no rows at all.
        if len(rows) >= limit:
            break
        path = os.path.join(directory, name)
        if os.path.isfile(path) and os.path.getsize(path) < max_size:
            rows.append((name, label))
    return rows


def gen_csv():
    """Write ``MalAndBenFile.csv`` listing the labelled sample files.

    Scans ``mal_file_dir`` (label ``'1'``) and ``ben_file_dir`` (label ``'0'``),
    keeping at most ``mal_max_sum`` / ``ben_max_sum`` files under 2,000,000
    bytes from each, prints the benign and malware counts, and writes the rows
    under a ``filename,label`` header. Malware rows are written first.
    """
    mal_rows = _collect_samples(mal_file_dir, '1', mal_max_sum)
    ben_rows = _collect_samples(ben_file_dir, '0', ben_max_sum)
    print(len(ben_rows), len(mal_rows))

    headers = ['filename', 'label']
    with open('MalAndBenFile.csv', 'w', encoding='utf8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(mal_rows + ben_rows)

执行获取CSV

# Generate MalAndBenFile.csv from the configured malware/benign directories.
gen_csv()

字节填充

def fill_list(my_list: list, length, fill=100):
    """Right-pad ``my_list`` with ``fill`` until it holds ``length`` items.

    A list that already has ``length`` or more items is returned as-is (the
    same object, not truncated); otherwise a new padded list is returned and
    the input is left untouched.
    """
    missing = length - len(my_list)
    if missing <= 0:
        return my_list
    return my_list + [fill] * missing

获取数据

# Load the filename/label table written by gen_csv().
data = pd.read_csv('./MalAndBenFile.csv')

[图片:基于TensorFlow2实现MalConv恶意软件检测_第1张图片]
PE文件转字节列表

def read_bin(file_name):
    """Read a binary file and return its content as a list of byte values.

    Args:
        file_name: Path of the file to read.

    Returns:
        A list with one int in the range 0-255 per byte of the file (an empty
        list for an empty file).
    """
    # Iterating a bytes object already yields ints, so no struct format string
    # needs to be built; the with-block closes the file on exit.
    with open(file_name, "rb") as f:
        return list(f.read())

构造训练与测试数据集

# Turn the DataFrame into an array of (filename, label) rows.
data = data.to_numpy()

# Directory holding the files of each label: 0 = benign, 1 = malware.
label_dirs = {0: ben_file_dir, 1: mal_file_dir}

new_data = []
new_label = []
for row in data:
    sample_dir = label_dirs.get(row[1])
    if sample_dir is None:
        # Rows whose label is neither 0 nor 1 are ignored.
        continue
    sample_path = sample_dir + os.sep + row[0]
    # Pad every byte list to the 2,000,000-value model input length.
    new_data.append(fill_list(read_bin(sample_path), 2000000))
    new_label.append(int(row[1]))

new_data = np.array(new_data, dtype='int32')
new_label = np.array(new_label, dtype='int32')

# Hold out 25% of the samples for evaluation and convert every split to an
# int32 tensor.
x_train, x_test, y_train, y_test = (
    tf.cast(split, tf.int32)
    for split in train_test_split(new_data, new_label, test_size=0.25)
)

# One-hot encoding of the labels (optional)
# train_labels = tf.keras.utils.to_categorical(train_labels,num_classes=10) 
# test_labels = tf.keras.utils.to_categorical(test_labels,num_classes=10)

### Create the Datasets
train_slices = tf.data.Dataset.from_tensor_slices((x_train, y_train))
dataset = train_slices.shuffle(2000).batch(32)
test_slices = tf.data.Dataset.from_tensor_slices((x_test, y_test))
test_dataset = test_slices.batch(32)

模型训练

# Reset any Keras state left over from earlier runs before building the model.
tf.keras.backend.clear_session()
# Build MalConv with its default arguments.
model = Malconv()
# Binary cross-entropy loss with the Adam optimizer; accuracy is reported as 'acc'.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
# Train for 10 epochs, evaluating on the held-out test dataset after each one.
model.fit(dataset,validation_data=test_dataset, epochs=10)
Epoch 1/10
47/47 [==============================] - 321s 7s/step - loss: 0.4718 - acc: 0.7947 - val_loss: 0.2674 - val_acc: 0.8700
Epoch 2/10
47/47 [==============================] - 320s 7s/step - loss: 0.1739 - acc: 0.9353 - val_loss: 0.1817 - val_acc: 0.9260
Epoch 3/10
47/47 [==============================] - 321s 7s/step - loss: 0.0577 - acc: 0.9833 - val_loss: 0.1847 - val_acc: 0.9220
Epoch 4/10
47/47 [==============================] - 322s 7s/step - loss: 0.0191 - acc: 0.9973 - val_loss: 0.2146 - val_acc: 0.9340
Epoch 5/10
47/47 [==============================] - 321s 7s/step - loss: 0.0035 - acc: 1.0000 - val_loss: 0.2066 - val_acc: 0.9280
Epoch 6/10
47/47 [==============================] - 322s 7s/step - loss: 0.0015 - acc: 1.0000 - val_loss: 0.2175 - val_acc: 0.9320
Epoch 7/10
47/47 [==============================] - 320s 7s/step - loss: 9.3740e-04 - acc: 1.0000 - val_loss: 0.2352 - val_acc: 0.9360
Epoch 8/10
47/47 [==============================] - 320s 7s/step - loss: 6.8522e-04 - acc: 1.0000 - val_loss: 0.2359 - val_acc: 0.9360
Epoch 9/10
47/47 [==============================] - 321s 7s/step - loss: 5.1924e-04 - acc: 1.0000 - val_loss: 0.2400 - val_acc: 0.9360
Epoch 10/10
47/47 [==============================] - 321s 7s/step - loss: 4.1028e-04 - acc: 1.0000 - val_loss: 0.2508 - val_acc: 0.9360

模型保存

# Persist the trained model to ./MalConvTF2.h5
# (NOTE(review): the .h5 suffix should select Keras' HDF5 format — confirm for the TF version in use).
model.save("./MalConvTF2.h5")

你可能感兴趣的:(机器学习与深度学习,TensorFlow2,网络安全,tensorflow,python,深度学习)