A brief walkthrough of the Kaggle 2015 (Microsoft Malware Classification) 1st-place code

The .py files that actually train the models are not covered here.

unique_gram.py
For each class, counts in how many files each byte 4-gram appears, and keeps the most frequent 4-grams together with their counts.

from csv import DictReader
from datetime import datetime
import pickle
import heapq
import sys

#load data
def load_label(path, label):                    # collect the Ids in path whose Class equals label
    result = []
    for row in DictReader(open(path)):
        if int(row['Class']) == label:
            result.append((row['Id']))
    return result


#generate a presence dictionary of the 4-grams in one file
def grams_dict(f_name, N=4): 
    path = "train/%s.bytes"%f_name
    one_list = []
    with open(path, 'rb') as f:
        for line in f:
            one_list += line.rstrip().split(" ")[1:]    # rstrip strips trailing whitespace; [1:] drops the leading address token
    grams_string = [''.join(one_list[i:i+N]) for i in xrange(len(one_list)-N+1)]   # join every N=4 consecutive bytes into one string, i.e. a 4-gram
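    # e.g. one_list = ['8B','44','24','08','5D'] with N=4 gives
    # grams_string = ['8B442408', '4424085D']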
    tree = dict()
    for gram in grams_string:       # record each gram once: presence only, no accumulation
        if gram not in tree:
            tree[gram] = 1   
    return tree


#add up ngram dictionaries
def reduce_dict(f_labels):          # merge the per-file gram dictionaries of all files in f_labels
    result = dict()
    for f_name in f_labels:
        d = grams_dict(f_name)
        for k,v in d.iteritems():   # v is always 1, so result counts how many files contain gram k
            if k in result:
                result[k] += v 
            else:
                result[k] = v
        del d
    #print "this class has %i keys"%len(result)
    #pickle.dump(result, open('gram/ngram_%i'%label,'wb'))
    return result

#heap to get the top 100,000 features.
def Heap_top(dictionary, label, num = 100000):   # keep the num (100,000) grams with the highest file counts
    heap = [(0,'tmp')]* num # initialize the heap
    root = heap[0]
    for ngram,count in dictionary.iteritems():
            if count > root[0]:
                root = heapq.heapreplace(heap, (count, ngram))
    pickle.dump(heap, open('gram/ngram_%i_top%i'%(label,num),'wb'))        
         
if __name__ == '__main__':
    start = datetime.now()
    #for label in range(1,10): # take too much memory
    label = int(sys.argv[1]) # sys.argv[1] is the first command-line argument, a class label from 1 to 9
    print "Gathering 4 grams, Class %i out of 9..."%label
    f_labels = load_label('trainLabels.csv', label)
    Heap_top(reduce_dict(f_labels),label)
    #print datetime.now() - start
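
Heap_top relies on the standard fixed-size-heap top-k trick: heapq keeps its smallest element at heap[0], so replacing that root whenever a bigger count arrives leaves the k largest (count, gram) pairs in the heap. A minimal sketch with toy counts (comparing against heap[0][0], the current minimum, which is the clean form of the same idea):

import heapq

counts = {'AABB': 7, 'CCDD': 3, 'EEFF': 9, '0011': 1, '2233': 5}
k = 3
heap = [(0, 'tmp')] * k                  # seeded the same way as Heap_top
for gram, count in counts.items():
    if count > heap[0][0]:               # beats the current smallest?
        heapq.heapreplace(heap, (count, gram))
print(sorted(heap, reverse=True))        # [(9, 'EEFF'), (7, 'AABB'), (5, '2233')]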

writeasm.py
Filters each sample's .asm file, keeping (by intent) only the code segments, and writes the result to a new .asm file.

import pickle
import sys
xid=pickle.load(open(sys.argv[1])) # sys.argv[1] is the pickled id list, e.g. xid_train.p or xid_test.p
data_path=sys.argv[2]
asm_code_path=sys.argv[3]

for cc,i in enumerate(xid):
    f=open(data_path+'/'+i+'.asm')
    fo=open(asm_code_path+'/'+i+'.asm','w')
    start=True
    for line in f:
        xx=line.split()
        for c,x in enumerate(xx):
            if x=='Pure':
                if xx[c+1]=='data':
                    start=False
                if xx[c+1]=='code':        #如果是代码类型的代码就进行保存
                    start=True
        if True:				# this looks like a typo: it should probably be "if start:"
            xx[0]=xx[0].split(':')[0]            
            fo.write(''.join(xx)+'\n')
    f.close()
    fo.close()          
    print cc*1.0/len(xid)  # print progress

join_grams.py
Generates the binary feature matrix for the selected 4-grams (1 = present in the sample, 0 = absent).

import heapq
import pickle
import math
from csv import DictReader
import glob
import os
import csv

def join_ngrams(num = 100000): # merge the per-class top-num grams: dict_all[gram][c-1] holds the gram's count in class c
    dict_all = dict()
    for c in range(1,10):
        #print "merging %i out of 9"%c
        heap = pickle.load(open('gram/ngram_%i_top%i'%(c,num),'rb'))
        while heap:
            count, gram = heapq.heappop(heap)
            if gram not in dict_all:
                dict_all[gram] = [0]*9
            dict_all[gram][c-1] = count
    return dict_all
    #pickle.dump(dict_all, open('ready_for_selection.pkl','wb'))


# load data
def num_instances(path, label): # count how many rows in path have Class == label (p) and how many do not (n)
    p = 0
    n = 0
    for row in DictReader(open(path)):
        if int(row['Class']) == label:
            p += 1
        else:
            n += 1
    return p,n


def entropy(p,n):               # binary (natural-log) entropy of a p/n split
    p_ratio = float(p)/(p+n)
    n_ratio = float(n)/(p+n)
    return -p_ratio*math.log(p_ratio) - n_ratio * math.log(n_ratio)

def info_gain(p0,n0,p1,n1,p,n): # information gain of splitting (p,n) into the feature-present (p1,n1) and feature-absent (p0,n0) parts
    return entropy(p,n) - float(p0+n0)/(p+n)*entropy(p0,n0) - float(p1+n1)/(p+n)*entropy(p1,n1)
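# Worked example (made-up numbers): with p=10, n=90 overall, a feature present in
# p1=8 positives and n1=2 negatives leaves p0=2, n0=88; entropy(10,90) ~ 0.325 and
# info_gain(2,88,8,2,10,90) ~ 0.179, so such a feature would rank highly below.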

def Heap_gain(p, n, class_label, dict_all, num_features = 750, gain_minimum_bar = -100000): # keep the num_features grams with the highest info gain, again via a fixed-size heap
    heap = [(gain_minimum_bar, 'gain_bar')] * num_features
    root = heap[0]
    for gram, count_list in dict_all.iteritems():
        p1 = count_list[class_label-1]
        n1 = sum(count_list[:(class_label-1)] + count_list[class_label:])
        p0,n0 = p - p1, n - n1         
        if p1*p0*n1*n0 != 0:
            gain = info_gain(p0,n0,p1,n1,p,n)
            if gain > root[0]:
                root = heapq.heapreplace(heap, (gain, gram))
    #return heap
    return [i[1] for i in heap]

def gen_df(features_all, train = True, verbose = False, N = 4):     # emit one binary feature row per sample: 1 = feature present, 0 = absent
    yield ['Id'] + features_all # yield header      # gen_df is a generator: each next() call resumes after the
                                                    # previous yield, runs to the next yield, and returns its value
    if train == True:
        ds = 'train'
    else:
        ds = 'test'
    directory_names = list(set(glob.glob(os.path.join(ds, "*.bytes")))) # collect all .bytes file names under train/ or test/
    for f in directory_names:
        f_id = f.split('/')[-1].split('.')[0]                           # file name without the directory and the .bytes extension
        if verbose == True:
            print 'doing %s'%f_id
        one_list = []
        with open("%s/%s.bytes"%(ds,f_id),'rb') as read_file:
            for line in read_file:
                one_list += line.rstrip().split(" ")[1:]
        grams_string = [''.join(one_list[i:i+N]) for i in xrange(len(one_list)-N)]  # note: -N here vs -N+1 in unique_gram.py (drops the last gram)
        # build a dict for looking up
        
        grams_dict = dict()
        for gram in grams_string:
            if gram not in grams_dict:
                grams_dict[gram] = 1
        
        binary_features = []
        for feature in features_all:
            if feature in grams_dict:
                binary_features.append(1)
            else:
                binary_features.append(0)
        del grams_string
        '''
        ## instead of binary features, do count
        grams_dict = dict()
        for gram in grams_string:
            if gram not in grams_dict:
                grams_dict[gram] = 1
            else:
                grams_dict[gram] += 1 
        
        binary_features = []
        for feature in features_all:
            if feature in grams_dict:
                binary_features.append(grams_dict[feature])
            else:
                binary_features.append(0)
        del grams_string        
        '''
        yield [f_id] + binary_features

if __name__ == '__main__':              
    dict_all = join_ngrams()
    features_all = []
    for i in range(1,10):
        p, n = num_instances('trainLabels.csv', i)
        features_all  += Heap_gain(p,n,i,dict_all) # 750 * 9
    train_data = gen_df(features_all, train = True, verbose = False)
    with open('train_data_750.csv','wb') as outfile:
        wr = csv.writer(outfile, delimiter=',', quoting=csv.QUOTE_ALL)
        for row in train_data:
            wr.writerow(row)
    test_data = gen_df(features_all, train = False,verbose = False)
    with open('test_data_750.csv','wb') as outfile:
        wr = csv.writer(outfile, delimiter=',', quoting=csv.QUOTE_ALL)
        for row in test_data:
            wr.writerow(row)   
    print "DONE 4 gram features!"

instr_freq.py
Counts how often each instruction mnemonic appears in every .asm file.

# -*- coding: utf-8 -*-
## instructions frequency

from multiprocessing import Pool
import os
import csv

paths = ['train','test']

instr_set = set(['mov','xchg','stc','clc','cmc','std','cld','sti','cli','push',  # the set of x86 mnemonics to count
	'pushf','pusha','pop','popf','popa','cbw','cwd','cwde','in','out',
	'add','adc','sub','sbb','div','idiv','mul','imul','inc','dec',
	'cmp','sal','sar','rcl','rcr','rol','ror','neg','not','and'
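	# note: a comma seems to be missing after 'and' above -- 'and' 'or' concatenate to 'andor', so neither instruction is counted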
	'or','xor','shl','shr','nop','lea','int','call','jmp',
	'je','jz','jcxz','jp','jpe','ja','jae','jb','jbe','jna',
	'jnae','jnb','jnbe','jc','jnc','ret','jne','jnz','jecxz',
	'jnp','jpo','jg','jge','jl','jle','jng','jnge','jnl','jnle',
	'jo','jno','js','jns'])

def consolidate(path,instr_set = instr_set):
	Files = os.listdir(path)
	asmFiles = [i for i in Files if '.asm' in i]                    # keep only the .asm file names
	consolidatedFile = path + '_instr_frequency.csv'                # output CSV of per-file instruction frequencies
	with open(consolidatedFile, 'wb') as f:
		fieldnames = ['Id'] + list(instr_set)
		writer = csv.DictWriter(f, fieldnames = fieldnames)
		writer.writeheader()
		for t, fname in enumerate(asmFiles):
			consolidation = dict(zip(instr_set,[0]*len(instr_set)))
			consolidation['Id'] = fname[:fname.find('.asm')]
			with open(path+'/'+fname, 'rb') as f:
				for line in f:
					if 'text' in line and ',' in line and ';' not in line:
						row = line.lower().strip().split('  ')[1:]     # strip() removes leading and trailing whitespace
						if row:
							tmp_list = [x.strip() for x in row if x != '']
							if len(tmp_list) == 2 and tmp_list[0] in consolidation:
								consolidation[tmp_list[0]] += 1        # consolidation counts each mnemonic's occurrences in this .asm file
			writer.writerow(consolidation)
			#if (t+1)%100 == 0:
			#	print str(t+1) + 'files loaded for ' + path

if __name__ == '__main__':
	p = Pool(2)
	p.map(consolidate, paths)
	print "DONE instruction count!"

image_fea.py
Treats each file's raw bytes as a grayscale image and saves the first 1000 pixel values as features.

import numpy,scipy.misc, os, array
def get_feature(data_set = 'train', data_type = 'bytes'):
    files=os.listdir(data_set)  # list of all file and folder names in the data_set directory
    with open('%s_%s_image.csv'%(data_set, data_type),'wb') as f:
        f.write('Id,%s\n'%','.join(['%s_%i'%(data_type,x)for x in xrange(1000)]))
        for cc,x in enumerate(files):           
            if data_type != x.split('.')[-1]:           # only process files whose extension matches data_type
                continue
            file_id = x.split('.')[0]
            tmp = read_image(data_set + '/' +x)
            f.write('%s,%s\n'%(file_id, ','.join(str(v) for v in tmp)))
            #print "finish..." + file_id
def read_image(filename):                       # read the file's raw bytes and return them as a list of pixel values
    f = open(filename,'rb')                     # open in binary mode
    ln = os.path.getsize(filename) # length of file in bytes
    width = 256
    rem = ln%width
    a = array.array("B") # uint8 array
    a.fromfile(f,ln-rem)                # drop the remainder so the bytes reshape evenly into rows of width 256
    f.close()
    g = numpy.reshape(a,(len(a)/width,width))
    g = numpy.uint8(g)
    g.resize((1000,))
    return list(g)                      # return as a list

if __name__ == '__main__':
    #get_feature(data_set = 'train', data_type = 'bytes')
    get_feature(data_set = 'train', data_type = 'asm')
    #get_feature(data_set = 'test', data_type = 'bytes')
    get_feature(data_set = 'test', data_type = 'asm')
    print 'DONE asm image features!'
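
The byte-to-pixel idea in a self-contained sketch (synthetic bytes stand in for a real .bytes file):

import numpy as np

raw = np.random.randint(0, 256, size=70000).astype(np.uint8)  # stand-in for a file's raw bytes
width = 256
trimmed = raw[:len(raw) - len(raw) % width]   # drop the remainder so the bytes reshape evenly
img = trimmed.reshape(-1, width)              # grayscale "image", one byte per pixel
features = img.flatten()[:1000]               # first 1000 pixels, as in image_fea.py
print('%s %s' % (img.shape, features.shape))  # (273, 256) (1000,)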

get_jump_map.py

Builds, for each sample, a map from a jump instruction's address to its target address.

import pickle
import sys

##########################################################
# usage
# pypy get_jump_map.py xid_train.p ../../data/train ./jump_train ./jump_map_train

# xid_train.p is a list like ['loIP1tiwELF9YNZQjSUO', ...] that fixes
# the order of samples in the training data
# ../../data/train is the path of the original train data
# ./jump_train is the local folder of the jmp.p files, {address: jump ins}
# ./jump_map_train is the local folder of the jump maps, {address: target address}
##########################################################



xid=pickle.load(open(sys.argv[1])) # xid_train.p or xid_test.p
data_path=sys.argv[2]              # ../../data/train
jump_path=sys.argv[3]              # ./jump_train
jump_map_path=sys.argv[4]          # ./jump_map_train
def isvalid_address(s):                                   # check whether s is a valid hex address
    # a legal address should contain only these characters
    letters='0123456789ABCDEF'
    if True:
        for i in s:
            if i not in letters :#or s[1] in words:
                return False
        return True
    return False
cou=0

for cc,fx in enumerate(xid):                
    f=open(data_path+'/'+fx+'.asm')
    loc={}  # address jumping dic: start address -> stop address 
    jumpadd=pickle.load(open(jump_path+'/'+fx+'.jmp.p'))
    if len(jumpadd)==0:
        del jumpadd,loc
        continue
    ll=len(jumpadd)
    for line in f:
        if '.text' != line[:5] and '.code' != line[:5]:
            continue
        xx=line.split()
        if len(xx)>2:
            add=xx[0].split(':')[1]  # get address
            if add in jumpadd:  # this is a jump instruction
                for cx,x in enumerate(xx):
                     if x=='jmp' or x=='ja':
                         tid=cx+2  # two patterns: jmp xxx addr or jmp addr
                         if cx+2>=len(xx):
                             tid=cx+1
                         tmpx=xx[tid].split('_') 
                         if len(tmpx)!=2:  # not a valid address
                             break
                         if isvalid_address(tmpx[1]):
                             if len(tmpx[1])<8: # pad the address to 8 hex characters
                                 tmpx[1]='0'*(8-len(tmpx[1]))+tmpx[1]
                             loc[add]=tmpx[1]
                             ll=ll-1
                         else:
                             print fx,line#xx[-1].split('_')[1]
                         break
            if ll==0:
                break                
                #print xx[-1][-8:]
    if len(loc)>0:
        pickle.dump(loc,open(jump_map_path+'/'+fx+'.p','w'))    # save the jump map
    del loc,jumpadd   
    print cc*1.0/len(xid)               # print progress
    f.close()

get_jump.py
Records the addresses that hold a jump (jmp/ja) instruction.

import pickle
import sys

xid=pickle.load(open(sys.argv[1]))
#unconditional_jump=['jmp','j','ja']
ins_path=sys.argv[2]
jump_path=sys.argv[3]

for cc,i in enumerate(xid):
    jmp={}
    tmp=pickle.load(open(ins_path+'/'+i+'.ins.p'))
    for add in tmp:
        if tmp[add] == 'jmp' or tmp[add]=='ja':             # mark addresses whose instruction is a jump
            jmp[add]=1
    del tmp
    pickle.dump(jmp,open(jump_path+'/'+i+'.jmp.p','w'))     
    del jmp

    print cc*1.0/len(xid)   # print progress

get_ins.py
Extracts, for each sample, a map from address to instruction mnemonic.

import pickle
import sys
##########################################################
# usage
# pypy getins.py xid_train.p ../../data/train ./ins_train ./jump_train

# xid_train.p is a list like ['loIP1tiwELF9YNZQjSUO', ...] that fixes
# the order of samples in the training data
# ../../data/train is the path of the original train data
# ./ins_train is the local folder of the ins.p files, {address: ins}
# ./jump_train is the local folder of the jmp.p files, {address: jump ins}
##########################################################

xid=pickle.load(open(sys.argv[1])) #xid_train.p or xid_test.p
data_path=sys.argv[2]  
ins_path=sys.argv[3]
def isvalid(s):                 # check whether the token is an instruction mnemonic rather than a hex byte
    Bytes='0123456789ABCDEF'
    if len(s)==2:
        if s[0] in Bytes :
            return False # a two-character token starting with a hex digit is a byte, not an instruction
    return True
for cc,fx in enumerate(xid):
    f=open(data_path+'/'+fx+'.asm')
    loc={} # address -> instruction
    for line in f:
        if '.text' != line[:5] and '.code' != line[:5]:
            # most of ins are in those two parts
            continue
        xx=line.split()
        if len(xx)>2:
            add=xx[0].split(':')[1] # get the address part
            for i in xx[1:]:
                if isvalid(i): # get the first token that is not a byte
                    loc[add]=i 
                    break      # one instruction per line (address)
    pickle.dump(loc,open(ins_path+'/'+fx+'.ins.p','w'))
    if cc%50==0:    
        print 'progress',cc*1.0/len(xid),len(loc)
    del loc
    f.close() 

get_id.py
Extracts the sample ids from the train/test file names and saves them along with the training labels.

import os
xid=[i.split('.')[0] for i in os.listdir('train') if '.asm' in i]       # ids from the .asm file names in train/
Xt_id=[i.split('.')[0] for i in os.listdir('test') if '.asm' in i]
f=open('trainLabels.csv')
f.readline()
label={}
for line in f:
    xx=line.split(',')
    idx=xx[0][1:-1]                          # strip the quotes around the Id
    label[idx]=int(xx[-1])                   # the last column is the label
f.close()
y=[label[i] for i in xid]                       # labels for all training samples, in xid order
import pickle
pickle.dump(xid,open('xid_train.p','w'))                # save everything
pickle.dump(Xt_id,open('xid_test.p','w'))
pickle.dump(xid,open('xid.p','w'))
pickle.dump(Xt_id,open('Xt_id.p','w'))
pickle.dump(y,open('y.p','w'))

gen_opcount_seg.py
Driver script: creates the working directories and runs the whole opcode-feature pipeline (instruction extraction, jump maps, 2/3/4-gram mining, line heads, feature generation) on the training set via subprocess calls.

import subprocess   
data_path='.'
opcode_path='op_train'
jump_path='jump_train'
jump_map_path='jump_map_train'                              # working directories for the pipeline outputs

cmd='mkdir '+' '.join([opcode_path,jump_path,jump_map_path])	# build the shell command
subprocess.call(cmd,shell=True)          # subprocess.call() runs the command and returns its exit status;
                                         # shell=True lets the shell parse the whole command string
cmd='pypy get_ins.py xid_train.p '+' '.join([data_path+'/train',opcode_path])
subprocess.call(cmd,shell=True)

cmd='pypy get_jump.py xid_train.p '+' '.join([opcode_path,jump_path])
subprocess.call(cmd,shell=True)

cmd='pypy get_jump_map.py xid_train.p '+' '.join([data_path+'/train',jump_path,jump_map_path])
subprocess.call(cmd,shell=True)

cmd='pypy find_new_ins.py xid_train.p '+' '.join([opcode_path,jump_map_path])
subprocess.call(cmd,shell=True)

cmd='pypy filtcmd.py'
subprocess.call(cmd,shell=True)

cmd='pypy find_2g.py xid_train.p '+data_path+'/train'
subprocess.call(cmd,shell=True)

cmd='pypy find_3g.py xid_train.p '+data_path+'/train'
subprocess.call(cmd,shell=True)

cmd='pypy cut3g.py'
subprocess.call(cmd,shell=True)

cmd='pypy cut3g_for_4g.py'
subprocess.call(cmd,shell=True)

cmd='pypy find_4g.py xid_train.p '+data_path+'/train'
subprocess.call(cmd,shell=True)

cmd='pypy cut4g.py'
subprocess.call(cmd,shell=True)

cmd='pypy findhead.py xid_train.p '+data_path+'/train'
subprocess.call(cmd,shell=True)

cmd='pypy rebuild_2g3g4ghead.py xid_train.p '+data_path+'/train'
subprocess.call(cmd,shell=True)

cmd='pypy rebuild_2g3g4ghead.py xid_test.p '+data_path+'/test'
subprocess.call(cmd,shell=True)

cmd='python getfea.py'
subprocess.call(cmd,shell=True)

gen_data.py
Builds a small sanity-check dataset: the first sample seen of each class is copied to train/, the second to test/.

import os
import subprocess
train={}
test={}
path='..'
f=open(path+'/trainLabels.csv')
fo=open('trainLabels.csv','w')
fo.write(f.readline())
f1=open(path+'/sampleSubmission.csv')
fo1=open('sampleSubmission.csv','w')
fo1.write(f1.readline())

for line in f:
    xx=line.split(',')
    label=int(xx[-1])                            # get the label
    if label not in train:                      # first time this label is seen: copy the sample into train/
        size=os.path.getsize(path+'/train/'+xx[0][1:-1]+'.asm')
        #if size>1000000:
        #    continue
        train[label]=1
        idx=xx[0][1:-1]
        cmd='cp '+path+'/train/'+idx+'.* train/'
        subprocess.call(cmd,shell=True)
        fo.write(line)
    elif label not in test:                     # second time this label is seen: copy the sample into test/
        test[label]=1
        idx=xx[0][1:-1]
        cmd='cp '+path+'/train/'+idx+'.* test/'
        subprocess.call(cmd,shell=True)
        fo1.write(idx+','+','.join(['0' if i!=label-1 else '1' for i in range(9)])+'\n')      # one-hot row: '1' in the sample's label column
    
f.close()
fo.close()
f1.close()
fo1.close()

freq_count.py
Counts the frequency of each single-byte value (16*16 = 256 features) in every .bytes file.

# -*- coding: utf-8 -*-
"""
part of the code borrowed from the benchmark in the forum.
Create frequency features for single bytes; 16*16 = 256 features are added to train and test.
"""
from multiprocessing import Pool
import os
from csv import writer


paths = ['train','test']

 
def consolidate(path):              # consolidate = merge all per-file counts into one CSV

    s_path = path
    Files = os.listdir(s_path)          # all file names under path
    byteFiles = [i for i in Files if '.bytes' in i]     # keep only the .bytes files
    consolidatedFile = path + '_frequency.csv'
    
    with open(consolidatedFile, 'wb') as f:
        # Preparing header part
        fw = writer(f)
        colnames = ['Id']
        colnames += ['FR_'+hex(i)[2:] for i in range(16**2)]   # [2:] strips the '0x' prefix of hex()
        fw.writerow(colnames)
        
        for t, fname in enumerate(byteFiles):
            consolidation = []
            f = open(s_path+'/'+fname, 'rb')
            twoByte = [0]*16**2
            for row in f:
                codes = row[:-2].split()[1:]   # drop the line ending and the leading address token
                
                # convert each two-hex-digit code to an integer (0-255)
                twoByteCode = [int(i,16) for i in codes if i != '??']                                     
                # frequency calculation of the byte values
                for i in twoByteCode:
                    twoByte[i] += 1                           # bump this byte value's count
                
            # Row added
            consolidation += [fname[:fname.find('.bytes')]]   # row = file id followed by the 256 byte counts
            consolidation += twoByte                          
            
            fw.writerow(consolidation)
            # Writing rows after every 100 files processed
            #if (t+1)%100==0:
            #    printw(t+1, 'files loaded for ', path)

if __name__ == '__main__':
    p = Pool(2)
    p.map(consolidate, paths)
    print "DONE bytes count!"

findhead.py
Counts, over all .asm files, how often each line head (the segment name before the ':' in the first token of a line) occurs, and saves the counts.

import sys
import pickle

##########################################################
# usage
# pypy findhead.py xid_train.p ../../data/train 

# xid_train.p is a list like ['loIP1tiwELF9YNZQjSUO', ...] that fixes
# the order of samples in the training data
# ../../data/train is the path of the original train data
##########################################################
xid_name=sys.argv[1]
data_path=sys.argv[2]


xid=pickle.load(open(xid_name)) #xid_train.p or xid_test.p

head={}

for c,f in enumerate(xid):                      # count how often each line head occurs
    fo=open(data_path+'/'+f+'.asm')
    tot=0
    for line in fo:
        xx=line.split()
        h=xx[0].split(':')
        if h[0] not in head:                    # first time this head is seen: add it
            head[h[0]]=0
        head[h[0]]+=1                           # count this head
    fo.close()
    if True:                                    #c%10000==0:
        print c*1.0/len(xid),len(head)
print len(head)
pickle.dump(head,open('head.p','w'))

find_new_ins.py
Discovers candidate instruction mnemonics: each sample's mnemonics are counted (walking the jump map when one exists) and those occurring at least 200 times are kept.

import pickle
import os
import sys

##########################################################
# usage
# pypy find_new_ins.py xid_train.p ./ins_train  ./jump_map_train

# xid_train.p is a list like ['loIP1tiwELF9YNZQjSUO', ...] that fixes
# the order of samples in the training data
# ./ins_train is the local folder of the ins.p files, {address: ins}
# ./jump_map_train is the local folder of the jump maps, {address of this ins: address of next ins}
##########################################################

xid_name=sys.argv[1]
ins_path=sys.argv[2]
jump_map_path=sys.argv[3]

xid=pickle.load(open(xid_name))  #xid_train.p or xid_test.p

cmd={}                           # new ins found

files=os.listdir(jump_map_path)  # folder holding the per-sample jump maps
mware_that_has_jump={}
for i in files:
    if '.p' in i:                                       # a sample has a .p file only if get_jump_map.py found jumps in it
        mware_that_has_jump[i.split('.')[0]]=1          # remember the samples that contain jumps

for cc,fx in enumerate(xid):
    tmpcount={}
    ins=pickle.load(open(ins_path+'/'+fx+'.ins.p'))
    insx=[]
    if fx not in mware_that_has_jump: # there is no jump in that malware
        for i in ins:
            if i not in tmpcount:
                tmpcount[i]=0
            tmpcount[i]+=1
        count={}
        for i in tmpcount:
            count[tmpcount[i]]=i
        for j in sorted(count.keys(),reverse=True):
            if j <200:
                break
            if count[j] not in cmd:
                cmd[count[j]]=1 # keep mnemonics that occur at least 200 times in this malware
        del ins,insx,tmpcount,count
        continue
    jump=pickle.load(open(jump_map_path+'/'+fx+'.p'))
    keys= sorted(ins.keys())
    #print keys[:20]
    nextins={}
    for c,j in enumerate(keys[:-1]):
        if j in jump and jump[j] in ins:
            nextins[j]=jump[j]
            #print j,jump[j]
        else:
            nextins[j]=keys[c+1]
    current=keys[0]
    
    while True:
        if ins[current] not in tmpcount:
            tmpcount[ins[current]]=0
        tmpcount[ins[current]]+=1
        if current not in nextins:
            print 'not in'
            break
        if  sum(tmpcount.values())>len(ins)*5:
            print 'loop runs more than 5x'
            break
        current=nextins[current]

    count={}
    for i in tmpcount:
        count[tmpcount[i]]=i
    for j in sorted(count.keys(),reverse=True):
        if j <200:
            break
        if count[j] not in cmd:
            cmd[count[j]]=0
        cmd[count[j]]+=j
        

    
    del current,ins,insx,jump,keys,nextins,tmpcount,count
              
    print 'find',cc*1.0/len(xid),len(cmd)
print cmd
pickle.dump(cmd,open('newcmd.p','w'))
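
The traversal above, reduced to a minimal sketch with made-up addresses: visit instructions in address order, divert through the jump map where one applies, and stop when the map ends or the walk loops too long:

ins  = {'0001': 'push', '0002': 'jmp', '0003': 'mov', '0004': 'ret'}   # made-up address -> mnemonic map
jump = {'0002': '0004'}                  # the jmp at 0002 lands on 0004

keys = sorted(ins)
nextins = {}
for c, addr in enumerate(keys[:-1]):
    # follow the jump if its target is a known instruction, otherwise fall through
    nextins[addr] = jump[addr] if addr in jump and jump[addr] in ins else keys[c + 1]

counts, cur = {}, keys[0]
while True:
    counts[ins[cur]] = counts.get(ins[cur], 0) + 1
    if cur not in nextins or sum(counts.values()) > len(ins) * 5:
        break                            # end of the map, or looping too long
    cur = nextins[cur]
print(counts)                            # push: 1, jmp: 1, ret: 1 -- 'mov' is skipped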

find_2g.py
find_3g.py
find_4g.py

All three mine co-occurrences of adjacent mnemonics: find_2g counts mnemonic pairs, find_3g extends the pruned 2-gram vocabulary to 3-grams, and find_4g extends the pruned 3-grams (the listing shown is find_2g.py).

import sys
import pickle

##########################################################
# usage
# pypy find_2g.py xid_train.p ../../data/train 

# xid_train.p is a list like ['loIP1tiwELF9YNZQjSUO', ...] that fixes
# the order of samples in the training data
# ../../data/train is the path of the original train data
##########################################################
xid_name=sys.argv[1]
data_path=sys.argv[2]


xid=pickle.load(open(xid_name)) #xid_train.p or xid_test.p

newc=pickle.load(open('newc.p'))
cmd2g={}                                                # 2g = 2-gram: one counter per pair of mnemonics
for i in newc:
    for j in newc:
        cmd2g[(i,j)]=0
print newc

for c,f in enumerate(xid):#(files[len(files)/10*a1:len(files)/10*a2]):
    count={}
    for i in cmd2g:
        count[i]=0
    fo=open(data_path+'/'+f+'.asm')
    tot=0
    a=-1
    b=-1
    for line in fo:
        xx=line.split()
        for x in xx:
            if x in newc:
                
                a=b
                b=x
                if (a,b) in cmd2g:
                    count[(a,b)]+=1
                    tot+=1
#                     print (b,a)
    fo.close()
    if c%10==0:
        print c*1.0/len(xid),tot
    for i in cmd2g:
        cmd2g[i]=count[i]+cmd2g[i]
    del count

import pickle
cmd2gx={}
for i in cmd2g:
    if cmd2g[i]>10:
        cmd2gx[i]=cmd2g[i]
print len(cmd2gx)
pickle.dump(cmd2gx,open('cmd2g.p','w'))
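
The a/b shift register in the loop above is a sliding window over the stream of mnemonics; the same count as a compact sketch:

tokens = ['mov', 'push', 'mov', 'push', 'mov', 'call']   # toy mnemonic stream
vocab = {'mov', 'push', 'call'}
counts = {}
a = None
for b in (t for t in tokens if t in vocab):
    if a is not None:
        counts[(a, b)] = counts.get((a, b), 0) + 1
    a = b
print(counts)   # ('mov','push'): 2, ('push','mov'): 2, ('mov','call'): 1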

ensemble.py
Blends three models' class probabilities with a weighted geometric mean. Note: s3[i]*0.5 looks like a typo for s3[i]**0.5, since the exponents 0.1 + 0.4 + 0.5 would then sum to 1.

import pandas as pd
s1=pd.read_csv('model1.csv',index_col=0)
s2=pd.read_csv('model2.csv',index_col=0)
s3=pd.read_csv('model3.csv',index_col=0)
for i in s1.columns.values:
    s1[i]=s1[i]**0.1*s2[i]**0.4*s3[i]*0.5
s1.to_csv('ensemble.csv')

filtcmd.py
Filters the previously discovered mnemonics, dropping tokens that cannot be valid instructions.

import pickle
cmd=pickle.load(open('newcmd.p'))
newc={}
for c in cmd:
    if '_' in c or c[0] in '?1234567890ABCDEF':  # drop tokens that cannot be valid mnemonics
        continue
    else:
        #print c,cmd[c]
        newc[c]=cmd[c]                           # keep the valid mnemonics
print newc
pickle.dump(newc,open('newc.p','w'))

dll.py
Extracts DLL-import (extrn) features from the train and test .asm files, selects the most informative ones by information gain, and saves binary feature files.

import heapq
import pickle
import math
from csv import DictReader
import glob
import os
import csv
from datetime import datetime

# dll call features.

# load file names 
def load_label(path, label):           # collect the Ids whose Class equals label
    result = []
    for row in DictReader(open(path)):
        if int(row['Class']) == label:
            result.append((row['Id']))
    return result

def dll_single_file(f, N = 4):
    pattern_dict = dict()
    f_lines = list()
    with open(f, 'rb') as outfile:
        for line in outfile:
            if 'idata' in line and 'extrn' in line:      # record, lowercased, every line mentioning both idata and extrn
                f_lines.append(line.lower())
    for line in f_lines:                                 
        line = line.strip().split()                      # line is now the list of whitespace-separated tokens
        p = line[line.index('extrn')+1].split(':')[0]    # the token right after 'extrn'
        if p and p not in pattern_dict:                  # extrn declares an identifier defined in another module, i.e. an imported (DLL) symbol
            pattern_dict[p] = 1                          # record that symbol
    return pattern_dict
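# Example (hypothetical line): '.idata:00401000 extrn GetProcAddress:dword'
# -> after lowercasing, the token following 'extrn' is 'getprocaddress:dword',
#    and split(':')[0] leaves the imported symbol 'getprocaddress'.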

def reduce_dict():
    dict_all = dict()
    for c in range(1,10):
        f_labels = load_label('trainLabels.csv', c)
        for f in f_labels:
            f_name = 'train/'+f+'.asm'
            dll = dll_single_file(f_name)
            for feature in dll:
                if feature not in dict_all:
                    dict_all[feature] = [0]*9
                dict_all[feature][c-1] +=1              # per-class document frequency of each imported symbol, e.g. [0,1,0,0,1,1,0,0,0]
        #print "finishing features in class %i"%c
    return dict_all

# load data     
def num_instances(path, label):                         # count rows with Class == label (p) versus the rest (n)
    p = 0
    n = 0
    for row in DictReader(open(path)):
        if int(row['Class']) == label:      
            p += 1
        else:
            n += 1
    return p,n


def entropy(p,n):                                       # binary entropy, same as in join_grams.py
    p_ratio = float(p)/(p+n)
    n_ratio = float(n)/(p+n)
    return -p_ratio*math.log(p_ratio) - n_ratio * math.log(n_ratio)

def info_gain(p0,n0,p1,n1,p,n):                         # information gain, same as in join_grams.py
    return entropy(p,n) - float(p0+n0)/(p+n)*entropy(p0,n0) - float(p1+n1)/(p+n)*entropy(p1,n1)



def Heap_gain(p, n, class_label, dict_all, num_features = 1000, gain_minimum_bar = -100):
    heap = [(gain_minimum_bar, 'gain_bar')] * num_features
    root = heap[0]
    for gram, count_list in dict_all.iteritems():
        p1 = count_list[class_label-1]
        n1 = sum(count_list[:(class_label-1)] + count_list[class_label:])
        p0,n0 = p - p1, n - n1
        if p1*p0*n1*n0 != 0:
            gain = info_gain(p0,n0,p1,n1,p,n)
            if gain > root[0]:
                root = heapq.heapreplace(heap, (gain, gram))
    #return heap
    result = [i[1] for i in heap if i[1] != 'gain_bar']
    #print "the length of dll for class %i is %i"%(class_label, len(result))
    return result

def gen_df(features_all, train = True, verbose = False, N = 4):   # emit the binary DLL-import feature rows for train or test
    yield ['Id'] + features_all # yield header
    if train == True:
        ds = 'train'
    else:
        ds = 'test'
    directory_names = list(set(glob.glob(os.path.join(ds, "*.asm"))))
    for f in directory_names:
        f_id = f.split('/')[-1].split('.')[0]
        if verbose == True:
            print 'doing %s'%f_id

        binary_features = list()
        tmp_pattern = dict()
        f_lines = list()
        with open(f, 'rb') as outfile:
            for line in outfile:
                if 'idata' in line and 'extrn' in line:
                    f_lines.append(line.lower())
        for line in f_lines:
            line = line.strip().split()
            p = line[line.index('extrn')+1].split(':')[0]
            if p and p not in tmp_pattern:
                tmp_pattern[p] = 1

        for fea in features_all:
            if fea in tmp_pattern:
                binary_features.append(1)
            else:
                binary_features.append(0)

        yield [f_id] + binary_features

if __name__ == '__main__':
    start = datetime.now()
    dict_all = reduce_dict()
    features_all = []
    for i in range(1,10):
        p, n = num_instances('trainLabels.csv', i)
        features_all  += Heap_gain(p,n,i,dict_all)
    train_data = gen_df(features_all, train = True, verbose = False)
    with open('train_dll.csv','wb') as outfile:
        wr = csv.writer(outfile, delimiter=',', quoting=csv.QUOTE_ALL)
        for row in train_data:
            wr.writerow(row)
    test_data = gen_df(features_all, train = False,verbose = False)
    with open('test_dll.csv','wb') as outfile:
        wr = csv.writer(outfile, delimiter=',', quoting=csv.QUOTE_ALL)
        for row in test_data:
            wr.writerow(row)
    print "DONE DLL functions!"
    #print datetime.now() - start

rebuild_code.py
Re-reads each filtered .asm file as raw uint8 bytes and keeps the first 2000 values as a per-sample feature vector, saved to one pickle.

import os,array
import pickle
import numpy as np
import sys
xid=pickle.load(open(sys.argv[1]))
asm_code_path=sys.argv[2]
train_or_test=asm_code_path.split('_')[-1]

X = np.zeros((len(xid),2000))
for cc,i in enumerate(xid):
    f=open(asm_code_path+'/'+i+'.asm')
    ln = os.path.getsize(asm_code_path+'/'+i+'.asm') # length of file in bytes
    width = int(ln**0.5)
    rem = ln%width
    a = array.array("B")        # uint8 array 这里的“B”表示是int类型的数据
    a.fromfile(f,ln-rem)        #读取文件中的内容到a中
    f.close()
    a=np.array(a)
    #im = Image.open('asmimage/'+i+'.png')
    a.resize((2000,))
    #im1 = im.resize((64,64),Image.ANTIALIAS); # for faster computation
    #des = leargist.color_gist(im1)
    X[cc] = a#[0,:1000] #des[0:320]
    print cc*1.0/len(xid)
pickle.dump(X,open('Xcode_'+train_or_test+'.p','w'))

cut3g.py
cut4g.py
Prunes the mined 3-gram/4-gram counts, keeping only the frequent ones (the listing shown is the 3-gram variant; cut4g.py does the same for the 4-grams).

import sys
import pickle
newc=pickle.load(open('cmd3g.p'))
nx={}
c=0
for i in newc:
    if newc[i]>100:                      # keep n-grams occurring more than 100 times
        c+=1
        nx[i]=newc[i]
print c,len(nx)                          # report how many 3-grams pass the threshold; grams and counts are saved below
pickle.dump(nx,open('cutcmd3g.p','w'))   

cut3g_for_4g.py
Selects from the 3-grams those frequent enough to seed the 4-gram search.

import sys
import pickle
newc=pickle.load(open('cmd3g.p'))
nx={}
c=0
for i in newc:
    if newc[i]>10000:               # only 3-grams occurring more than 10000 times seed the 4-gram search
        c+=1
        nx[i]=newc[i]
print c,len(newc)
pickle.dump(nx,open('cutcmd3g_for_4g.p','w'))

rebuild_2g3g4ghead.py
Recounts, per sample, the selected 2-gram/3-gram/4-gram and line-head features, producing the final feature matrices.

import sys
import pickle

##########################################################
# usage
# pypy rebuild_2g.py xid_train.p ../../data/train 

# xid_train.p is a list like ['loIP1tiwELF9YNZQjSUO', ...] that fixes
# the order of samples in the training data
# ../../data/train is the path of the original train data
##########################################################
                          # sys.argv holds the command-line arguments as a list
xid_name=sys.argv[1]      # first argument after the script name (see the usage line above)
data_path=sys.argv[2]     # second argument after the script name
xid=pickle.load(open(xid_name)) #xid_train.p or xid_test.p

newc=pickle.load(open('newc.p'))

train_or_test=data_path.split('/')[-1]       # the last path component tells train from test
if train_or_test=='train':
    f=open(data_path[:-5]+'trainLabels.csv') # open the labels file
    f.readline()
    train={}
    for line in f:
        xx=line.split(',')
        train[xx[0][1:-1]]=int(xx[1])        # labels run from 1 to 9; xx[0][1:-1] strips the quotes around the Id
    f.close()
    y=[]

cmd2g={}
for i in newc:
    for j in newc:                           # all mnemonic pairs: the full 2-gram vocabulary
        cmd2g[(i,j)]=1
print newc


cmd3g=pickle.load(open('cutcmd3g.p'))        # load the pruned 3-gram and 4-gram vocabularies
cmd4g=pickle.load(open('cutcmd4g.p'))
head=pickle.load(open('head.p'))

print newc
X2g=[]
X3g=[]
X4g=[]
Xhead=[]
for c,f in enumerate(xid):#(files[len(files)/10*a1:len(files)/10*a2]):
    fo=open(data_path+'/'+f+'.asm')

    count2g={}
    count3g={}
    count4g={}
    for i in cmd2g:
        count2g[i]=0

    for i in cmd3g:
        count3g[i]=0

    for i in cmd4g:
        count4g[i]=0

    counthead={}
    for i in head:
        counthead[i]=0

    tot=0
    a=-1
    b=-1
    d=-1
    e=-1
    for line in fo:

        xx=line.split()

        if xx[0].split(':')[0] in counthead:
            counthead[xx[0].split(':')[0]]+=1  # count line-head occurrences

        for x in xx:
            if x in newc:
                a=b
                b=d
                d=e
                e=x
                if (a,b,d,e) in cmd4g:
                    count4g[(a,b,d,e)]+=1
                    tot+=1

                if (b,d,e) in cmd3g:
                    count3g[(b,d,e)]+=1

                if (d,e) in cmd2g:
                    count2g[(d,e)]+=1

    fo.close()
    name=f.split('.')[0]
    if train_or_test=='train': 
        y.append(train[name])
    if True:#c%10000==0:
        print c*1.0/len(xid),tot
    X4g.append([count4g[i] for i in cmd4g])
    X3g.append([count3g[i] for i in cmd3g])
    X2g.append([count2g[i] for i in cmd2g])
    Xhead.append([counthead[i] for i in head])

    del count4g,count2g,count3g,counthead
train_or_test=data_path.split('/')[-1]
pickle.dump(X4g,open('X4g_'+train_or_test+'.p','w'))
pickle.dump(X3g,open('X3g_'+train_or_test+'.p','w'))
pickle.dump(X2g,open('X2g_'+train_or_test+'.p','w'))
pickle.dump(Xhead,open('Xhead_'+train_or_test+'.p','w'))

if train_or_test=='train':
    pickle.dump(y,open('y.p','w'))
