这里记录下自己用pytorch复现论文方法的代码:
论文名称:Malware Detection by Eating a Whole EXE
神经网络:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Embedding, Conv1D, multiply, GlobalMaxPool1D, Input, Activation
def Malconv(max_len=200000, win_size=500, vocab_size=256):
    """Build the MalConv architecture (Keras reference implementation).

    A gated 1-D convolution over an 8-dim byte embedding, followed by
    global max pooling and two dense layers ending in a sigmoid score.
    """
    byte_input = Input((max_len,))
    byte_emb = Embedding(vocab_size, 8)(byte_input)
    # Two parallel strided convolutions: one carries the signal, the
    # other (squashed through a sigmoid) acts as a multiplicative gate.
    conv_signal = Conv1D(kernel_size=(win_size), filters=128, strides=(win_size), padding='same')(byte_emb)
    conv_gate = Conv1D(kernel_size=(win_size), filters=128, strides=(win_size), padding='same')(byte_emb)
    gate = Activation('sigmoid', name='sigmoid')(conv_gate)
    gated = multiply([conv_signal, gate])
    activated = Activation('relu', name='relu')(gated)
    pooled = GlobalMaxPool1D()(activated)
    hidden = Dense(64)(pooled)
    score = Dense(1, activation='sigmoid')(hidden)
    return Model(byte_input, score)
自定义Dataset:
# 准备pytorch的数据
from torch.utils.data import Dataset, DataLoader
from OSutils import get_data_path, load_jsondata
from ByteSequencesFeature import byte_sequences_feature
from torch.utils.data import DataLoader
import numpy as np
import torch
def data_loader_multilabel(file_path='', label_dict=None):
    """Load one sample for the multi-label case.

    Returns (byte-sequence feature, label) where the label is looked up
    by the file's basename (expected to be its MD5) in label_dict; the
    label is None when the file is not present in the dict.
    """
    # Fixed: avoid a shared mutable default argument ({}).
    label_dict = {} if label_dict is None else label_dict
    file_md5 = file_path.split('/')[-1]  # filename is assumed to be the MD5
    return byte_sequences_feature(file_path), label_dict.get(file_md5)
def data_loader(file_path='', label_dict=None):
    """Load one sample for the binary (single-label) case.

    The label is 1 when the file's basename (its MD5) appears in
    label_dict (i.e. the sample is known malware), otherwise 0.
    """
    # Fixed: avoid a shared mutable default argument ({}).
    label_dict = {} if label_dict is None else label_dict
    file_md5 = file_path.split('/')[-1]  # filename is assumed to be the MD5
    # int(...) yields the same 1/0 the original if/else returned.
    return byte_sequences_feature(file_path), int(file_md5 in label_dict)
def pred_data_loader(file_path='', *args):
    """Load one sample for prediction: returns (feature, file md5).

    Extra positional arguments (e.g. a label dict) are accepted and
    ignored so the call signature matches the other loaders.
    """
    file_md5 = file_path.split('/')[-1]
    return byte_sequences_feature(file_path), file_md5
class MalconvDataSet(Dataset):
    """Dataset of raw-byte features for MalConv.

    Depending on label_type it serves binary labels ("single"),
    multi-label targets ("multilabel") or file md5s ("predict").
    Because the shuffle is seeded, the valid=False / valid=True views
    of the same directories form a reproducible, disjoint split.
    """

    def __init__(self, black_samples_dir="black_samples/", white_samples_dir='white_samples/',
                 label_dict_path='label_dict.json', label_type="single", valid=False, valid_size=0.2, seed=207):
        self.file_list = get_data_path(black_samples_dir)
        self.loader = data_loader_multilabel
        if label_type == "single":
            self.loader = data_loader
            # The binary task also needs the benign samples.
            self.file_list += get_data_path(white_samples_dir)
        if label_type == "predict":
            self.label_dict = {}
            self.loader = pred_data_loader
        else:
            self.label_dict = load_jsondata(label_dict_path)
        # Deterministic shuffle so the split is identical across runs.
        np.random.seed(seed)
        np.random.shuffle(self.file_list)
        split_point = int((1 - valid_size) * len(self.file_list))
        # Validation view takes the tail of the shuffled list, the
        # training view takes the head.
        self.file_list = self.file_list[split_point:] if valid else self.file_list[:split_point]

    def __getitem__(self, index):
        path = self.file_list[index]
        feature, label = self.loader(path, self.label_dict)
        return np.array(feature), label

    def __len__(self):
        return len(self.file_list)
训练神经网络:
from MalconvData import MalconvDataSet
from MalconvPytorch import MalConv
import os
import time
import sys
import numpy as np
from torch.utils.data import DataLoader
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from OSutils import get_input_args_dict
# ---- configuration (overridable from the command line) ----
# Parse command line arguments into a name -> value dict.
input_args_dict = get_input_args_dict(sys.argv)
# Binary classification task.
task_type = "single"
num_classes = 2
# Multi-class task (uncomment to use):
# task_type="multilabel"
# num_classes=103
total_step = 1
max_step = 300
display_step = 1
test_step = 1000
learning_rate = 0.0001
log_file_path = 'train_log_' + task_type + '.txt'
use_gpu = True
model_path = 'Malconv_' + task_type + '.model'
black_samples_dir = "black_samples/"
white_samples_dir = 'white_samples/'
label_dict_path = 'label_dict.json'
valid_size = 0.2
# Any command line argument overrides the default of the same name.
# NOTE(review): values presumably arrive as strings — confirm that
# get_input_args_dict converts numeric options, otherwise settings
# such as learning_rate would silently become str.
for arg_name, arg_value in input_args_dict.items():
    if arg_value is not None:
        globals()[arg_name] = arg_value
# ---- data, model, optimizer and logging setup ----
train_data_loader = DataLoader(
    MalconvDataSet(black_samples_dir=black_samples_dir, white_samples_dir=white_samples_dir,
                   label_dict_path=label_dict_path, label_type=task_type, valid=False,
                   valid_size=valid_size, seed=207), batch_size=8, shuffle=True, )
test_data_loader = DataLoader(
    MalconvDataSet(black_samples_dir=black_samples_dir, white_samples_dir=white_samples_dir,
                   label_dict_path=label_dict_path, label_type=task_type, valid=True,
                   valid_size=valid_size, seed=207), batch_size=8, shuffle=True, )
malconv = MalConv(num_classes=num_classes)
model_loss = nn.CrossEntropyLoss()
adam_optim = optim.Adam([{'params': malconv.parameters()}], lr=learning_rate)
# Message templates for console / log output.
step_msg = 'step-{}-loss-{:.6f}-acc-{:.4f}'
valid_msg = 'step-{}-tr_loss-{:.6f}-tr_acc-{:.4f}-val_loss-{:.6f}-val_acc-{:.4f}-time-{:.4f}'
log_msg = '{}, {:.6f}, {:.4f}, {:.6f}, {:.4f},{:.4f}'
# Resume from a previous checkpoint when both the log and model exist.
if os.path.exists(log_file_path) and os.path.exists(model_path):
    # Fixed: use `with` so the log file is closed even if a write fails.
    with open(log_file_path, 'a+') as log:
        log.write('读取上一次的模型开始训练\n')
        log.write('step,tr_loss, tr_acc, val_loss, val_acc,cost_time\n')
    # NOTE(review): torch.load of a whole pickled model executes code
    # embedded in the file — only load checkpoints you trust.
    malconv = torch.load(model_path)
else:
    with open(log_file_path, 'w+') as log:
        log.write('step,tr_loss, tr_acc, val_loss, val_acc,cost_time\n')
history = {'tr_loss': [], 'tr_acc': []}
valid_best_acc = 0
if use_gpu:
    malconv = malconv.cuda()
    model_loss = model_loss.cuda()
# ---- training loop ----
# Each outer iteration trains until `break`, then runs a full validation
# pass, logs the averages, and checkpoints the best model so far.
while total_step < max_step:
    malconv.train()
    total_step += 1
    start_time = time.time()
    for step, batch_data in enumerate(train_data_loader):
        exe_input = batch_data[0].cuda() if use_gpu else batch_data[0]
        exe_input = exe_input.long()
        label = batch_data[1].cuda() if use_gpu else batch_data[1]
        label = label.long()
        pred = malconv(exe_input)
        loss = model_loss(pred, label)
        adam_optim.zero_grad()
        loss.backward()
        adam_optim.step()
        # Accuracy bookkeeping does not need gradients.
        with torch.no_grad():
            probs = F.softmax(pred, dim=-1)
            predicted = probs.argmax(1)
            current_acc = (predicted == label).float().mean().item()
        current_loss = loss.item()
        history['tr_loss'].append(current_loss)
        history['tr_acc'].append(current_acc)
        if step % display_step == 0:
            print(step_msg.format(step, np.mean(history['tr_loss']), np.mean(history['tr_acc'])), flush=True)
            print("current_step_acc:{:.4f}".format(current_acc), flush=True)
        if step % test_step == 0:
            # Periodically stop training to validate and save a record.
            # NOTE(review): step 0 satisfies this immediately, so each outer
            # iteration trains on a single batch before validating — confirm
            # this is intended (e.g. `step > 0 and step % test_step == 0`).
            break
    # ---- validation pass ----
    history['val_loss'] = []
    history['val_acc'] = []
    # no_grad avoids building autograd graphs during evaluation.
    # NOTE(review): consider malconv.eval() here if the model ever gains
    # dropout/batch-norm layers; train() is restored at the loop top.
    with torch.no_grad():
        for step, batch_data in enumerate(test_data_loader):
            exe_input = batch_data[0].cuda() if use_gpu else batch_data[0]
            exe_input = exe_input.long()
            label = batch_data[1].cuda() if use_gpu else batch_data[1]
            label = label.long()
            pred = malconv(exe_input)
            loss = model_loss(pred, label)
            predicted = F.softmax(pred, dim=-1).argmax(1)
            current_acc = (predicted == label).float().mean().item()
            current_loss = loss.item()
            # Fixed: original `if step % display_step:` printed only on
            # NON-multiples — the inverse of the training-side condition —
            # and with display_step=1 never printed at all.
            if step % display_step == 0:
                print("step:", step, "current_valid_acc:", current_acc)
            history['val_loss'].append(current_loss)
            history['val_acc'].append(current_acc)
    step_cost_time = time.time() - start_time
    # Fixed: use `with` so the log file is always closed.
    with open(log_file_path, 'a+') as log:
        print(log_msg.format(total_step, np.mean(history['tr_loss']), np.mean(history['tr_acc']),
                             np.mean(history['val_loss']), np.mean(history['val_acc']),
                             step_cost_time),
              file=log, flush=True)
    print(valid_msg.format(total_step, np.mean(history['tr_loss']), np.mean(history['tr_acc']),
                           np.mean(history['val_loss']), np.mean(history['val_acc']),
                           step_cost_time))
    # Keep only the checkpoint with the best validation accuracy so far.
    if valid_best_acc < np.mean(history['val_acc']):
        valid_best_acc = np.mean(history['val_acc'])
        torch.save(malconv, model_path)
        print('model saved at', model_path)
    history['tr_loss'] = []
    history['tr_acc'] = []
预测结果:
from MalconvData import MalconvDataSet
import numpy as np
from torch.utils.data import DataLoader
import torch
import torch.nn.functional as F
from OSutils import get_input_args_dict,write_csv
import sys
def get_test_data_loader(black_samples_dir="../data/black_samples/"):
    """Return a DataLoader over the samples to be scored.

    Uses label_type='predict', so each item is (feature, file md5);
    shuffling is disabled to keep predictions aligned with file order.
    """
    prediction_set = MalconvDataSet(black_samples_dir=black_samples_dir, white_samples_dir='not needed',
                                    label_dict_path='not needed', label_type='predict', valid=False,
                                    valid_size=0, seed=207)
    return DataLoader(prediction_set, batch_size=8, shuffle=False, )
def pred_samples(samples_dir='black_samples/', model_path='Malconv_single.model', save_path="pred.csv", use_gpu=True):
    """Score every sample under samples_dir with a saved MalConv model.

    Writes (file_id, predicted label) rows to save_path and returns the
    list of predicted labels (argmax class indices).
    """
    # NOTE(review): torch.load unpickles arbitrary code — only load trusted models.
    malconv = torch.load(model_path)
    test_data_loader = get_test_data_loader(samples_dir)
    malconv.eval()
    pred_result = []
    file_ids = []
    with torch.no_grad():
        for step, batch_data in enumerate(test_data_loader):
            exe_input = batch_data[0].cuda() if use_gpu else batch_data[0]
            exe_input = exe_input.long()
            pred = malconv(exe_input)
            pred = F.softmax(pred, dim=-1)
            pred = pred.argmax(1)
            pred_result += pred.cpu().numpy().tolist()
            # Bug fix: the original kept only the LAST batch's md5s in a
            # single `label` variable, so file ids and predictions were
            # misaligned (or raised IndexError) with more than one batch.
            # Accumulate the ids batch-by-batch instead.
            file_ids += list(batch_data[1])
    csv_data = list(zip(file_ids, pred_result))
    write_csv(answer_data=csv_data, data_head=[("file_id", "label")], filename=save_path)
    return pred_result
if __name__ == '__main__':
    # Defaults; each can be overridden by a same-named command line argument.
    samples_dir = "black_samples/"
    model_path = 'Malconv_single.model'
    result_path = 'pred.csv'
    input_args_dict = get_input_args_dict(sys.argv)
    for arg_name in input_args_dict:
        arg_value = input_args_dict.get(arg_name)
        if arg_value is not None:
            globals()[arg_name] = arg_value
    pred_samples(samples_dir, model_path, result_path)