from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, Conv1D, multiply, GlobalMaxPool1D, Input, Activation
import tensorflow as tf
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
%matplotlib inline
import csv
import os
import struct
from sklearn.model_selection import train_test_split
# 定义一维卷积神经网络 (Define the 1-D convolutional neural network)
def Malconv(max_len=2000000, win_size=500, vocab_size=256):
    """Build the gated 1-D convolutional classifier over a fixed-length sequence.

    Args:
        max_len: Length of the integer input sequence fed to the model.
        win_size: Used as both kernel size and stride of the two convolutions,
            so each convolution window starts where the previous one ended.
        vocab_size: Number of distinct input ids accepted by the embedding.

    Returns:
        An uncompiled Keras ``Model`` mapping a ``(max_len,)`` input to a
        single sigmoid output.
    """
    byte_ids = Input((max_len,))
    embedded = Embedding(vocab_size, 8)(byte_ids)

    # Both convolutions share the same configuration and read the same
    # embedding; one produces features, the other produces the gate.
    conv_config = dict(
        kernel_size=win_size,
        filters=128,
        strides=win_size,
        padding='same',
    )
    features = Conv1D(**conv_config)(embedded)
    gate_input = Conv1D(**conv_config)(embedded)

    # Sigmoid gate multiplied element-wise onto the feature branch.
    gate = Activation('sigmoid', name='sigmoid')(gate_input)
    gated = multiply([features, gate])
    activated = Activation('relu', name='relu')(gated)

    # Collapse the sequence axis, then classify through a 64-unit layer
    # (no activation) followed by the single-unit sigmoid output.
    pooled = GlobalMaxPool1D()(activated)
    hidden = Dense(64)(pooled)
    probability = Dense(1, activation='sigmoid')(hidden)
    return Model(byte_ids, probability)
# 定义变量 (Define variables)
# Directories scanned for malicious and benign PE samples.
# Raw strings keep the Windows backslashes literal: '\m' and '\P' are invalid
# escape sequences in a normal string literal and trigger a warning on
# current Python versions.
mal_file_dir = r'E:\malware1500'
ben_file_dir = r'E:\PeBenFileForEXE'

# Maximum number of files taken from each directory.
ben_max_sum = 1000
mal_max_sum = 1000
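# 遍历PE文件夹中的恶意与良性软件,做好标签存入CSV (Walk the PE folders of malicious and benign software, label each file, and store the result in a CSV)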
def _collect_labelled_files(directory, label, limit, max_size=2000000):
    """Return up to ``limit`` ``(filename, label)`` rows for files in ``directory``.

    Only regular files strictly smaller than ``max_size`` bytes are kept.
    Directory entries are skipped because they are not sample files and
    cannot be opened for reading as one.
    """
    selected = []
    if limit <= 0:
        # A cap of zero means "take nothing", not "take one and then stop".
        return selected
    for name in os.listdir(directory):
        path = os.path.join(directory, name)
        if not os.path.isfile(path):
            continue
        if os.path.getsize(path) < max_size:
            selected.append((name, label))
            if len(selected) >= limit:
                break
    return selected


def gen_csv():
    """Write ``MalAndBenFile.csv`` listing the selected sample files.

    Scans ``mal_file_dir`` (label ``'1'``) and ``ben_file_dir`` (label
    ``'0'``), keeping at most ``mal_max_sum`` / ``ben_max_sum`` files that are
    smaller than 2,000,000 bytes. The CSV is written to the current working
    directory with a ``filename,label`` header, malware rows first. The
    benign and malware counts are printed, in that order.
    """
    mal_rows = _collect_labelled_files(mal_file_dir, '1', mal_max_sum)
    ben_rows = _collect_labelled_files(ben_file_dir, '0', ben_max_sum)
    print(len(ben_rows), len(mal_rows))

    headers = ['filename', 'label']
    # newline='' lets the csv module control line endings itself.
    with open('MalAndBenFile.csv', 'w', encoding='utf8', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(headers)
        writer.writerows(mal_rows + ben_rows)
# 执行获取CSV (Run the CSV generation)
# Build MalAndBenFile.csv from the sample directories before it is read back.
gen_csv()
# 字节填充 (Byte padding)
def fill_list(my_list: list, length, fill=100):
    """Right-pad ``my_list`` with ``fill`` until it holds ``length`` items.

    Args:
        my_list: The list to pad.
        length: Target number of items.
        fill: Value appended for every missing position.

    Returns:
        ``my_list`` itself (same object, never truncated) when it already has
        at least ``length`` items; otherwise a new list of exactly ``length``
        items.
    """
    missing = length - len(my_list)
    if missing <= 0:
        return my_list
    return my_list + [fill] * missing
# 获取数据 (Load the data)
# Load the filename/label index from MalAndBenFile.csv into a DataFrame.
data = pd.read_csv('./MalAndBenFile.csv')
def read_bin(file_name):
    """Read a binary file and return its content as a list of byte values.

    Args:
        file_name: Path of the file to read.

    Returns:
        A list with one int (0-255) per byte of the file, in file order.
        An empty file yields an empty list.
    """
    with open(file_name, "rb") as f:
        # Iterating a bytes object already yields unsigned 8-bit integers, so
        # no per-byte struct format string is needed. The with-block closes
        # the file on exit.
        return list(f.read())
# 构造训练与测试数据集 (Build the training and test datasets)
# Turn the CSV index into a plain array of (filename, label) rows.
data = data.to_numpy()

# Each known label paired with the directory its files live in.
# Rows whose label matches neither entry are skipped.
label_sources = ((0, ben_file_dir), (1, mal_file_dir))

new_data = []
new_label = []
for row in data:
    for label_value, source_dir in label_sources:
        if row[1] == label_value:
            sample_path = source_dir + os.sep + row[0]
            # Read the raw bytes and pad them to the 2,000,000-item input length.
            new_data.append(fill_list(read_bin(sample_path), 2000000))
            new_label.append(label_value)
            break

new_data = np.array(new_data, dtype='int32')
new_label = np.array(new_label, dtype='int32')

# Hold out 25% of the samples for evaluation.
x_train, x_test, y_train, y_test = train_test_split(
    new_data,
    new_label,
    test_size=0.25,
)

# Convert every split to an int32 tensor.
x_train, x_test, y_train, y_test = (
    tf.cast(split, tf.int32)
    for split in (x_train, x_test, y_train, y_test)
)

# One-hot encoding of the labels (optional, left disabled):
# train_labels = tf.keras.utils.to_categorical(train_labels,num_classes=10)
# test_labels = tf.keras.utils.to_categorical(test_labels,num_classes=10)

# Create the Datasets: the training data is shuffled and batched,
# the test data is only batched.
train_slices = tf.data.Dataset.from_tensor_slices((x_train, y_train))
dataset = train_slices.shuffle(2000).batch(32)
test_slices = tf.data.Dataset.from_tensor_slices((x_test, y_test))
test_dataset = test_slices.batch(32)
# 模型训练 (Model training)
# Reset Keras' global state before building a fresh model.
tf.keras.backend.clear_session()

# Build the network with its default sizes and train it as a binary
# classifier, evaluating on the held-out dataset after every epoch.
model = Malconv()
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['acc'],
)
model.fit(
    dataset,
    epochs=10,
    validation_data=test_dataset,
)
# Epoch 1/10
# 47/47 [==============================] - 321s 7s/step - loss: 0.4718 - acc: 0.7947 - val_loss: 0.2674 - val_acc: 0.8700
# Epoch 2/10
# 47/47 [==============================] - 320s 7s/step - loss: 0.1739 - acc: 0.9353 - val_loss: 0.1817 - val_acc: 0.9260
# Epoch 3/10
# 47/47 [==============================] - 321s 7s/step - loss: 0.0577 - acc: 0.9833 - val_loss: 0.1847 - val_acc: 0.9220
# Epoch 4/10
# 47/47 [==============================] - 322s 7s/step - loss: 0.0191 - acc: 0.9973 - val_loss: 0.2146 - val_acc: 0.9340
# Epoch 5/10
# 47/47 [==============================] - 321s 7s/step - loss: 0.0035 - acc: 1.0000 - val_loss: 0.2066 - val_acc: 0.9280
# Epoch 6/10
# 47/47 [==============================] - 322s 7s/step - loss: 0.0015 - acc: 1.0000 - val_loss: 0.2175 - val_acc: 0.9320
# Epoch 7/10
# 47/47 [==============================] - 320s 7s/step - loss: 9.3740e-04 - acc: 1.0000 - val_loss: 0.2352 - val_acc: 0.9360
# Epoch 8/10
# 47/47 [==============================] - 320s 7s/step - loss: 6.8522e-04 - acc: 1.0000 - val_loss: 0.2359 - val_acc: 0.9360
# Epoch 9/10
# 47/47 [==============================] - 321s 7s/step - loss: 5.1924e-04 - acc: 1.0000 - val_loss: 0.2400 - val_acc: 0.9360
# Epoch 10/10
# 47/47 [==============================] - 321s 7s/step - loss: 4.1028e-04 - acc: 1.0000 - val_loss: 0.2508 - val_acc: 0.9360
# 模型保存 (Save the model)
# Persist the trained model to MalConvTF2.h5 in the current directory.
model.save("./MalConvTF2.h5")