A brief walkthrough of the Kaggle 2015 (Microsoft Malware Classification) 1st-place code

The .py files that actually train the models are not covered here.

unique_gram.py
For each class, counts in how many files each byte 4-gram appears, and keeps the most frequent 4-grams together with their counts.

from csv import DictReader
from datetime import datetime
import pickle
import heapq
import sys

#load data
def load_label(path, label):                    # collect the Ids in path whose Class equals label
    result = []
    for row in DictReader(open(path)):
        if int(row['Class']) == label:
            result.append((row['Id']))
    return result


#generate a presence dictionary of the 4-grams in one file
def grams_dict(f_name, N=4): 
    path = "train/%s.bytes"%f_name
    one_list = []
    with open(path, 'rb') as f:
        for line in f:
            one_list += line.rstrip().split(" ")[1:]    # rstrip strips trailing whitespace; [1:] drops the leading address token
    grams_string = [''.join(one_list[i:i+N]) for i in xrange(len(one_list)-N+1)]   # join every N=4 consecutive bytes into one string, i.e. a 4-gram
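    # e.g. one_list = ['8B','44','24','08','5D'] with N=4 gives
    # grams_string = ['8B442408', '4424085D']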
    tree = dict()
    for gram in grams_string:       # record each gram once: presence only, no accumulation
        if gram not in tree:
            tree[gram] = 1   
    return tree


#add up ngram dictionaries
def reduce_dict(f_labels):          # merge the per-file gram dictionaries of all files in f_labels
    result = dict()
    for f_name in f_labels:
        d = grams_dict(f_name)
        for k,v in d.iteritems():   # v is always 1, so result counts how many files contain gram k
            if k in result:
                result[k] += v 
            else:
                result[k] = v
        del d
    #print "this class has %i keys"%len(result)
    #pickle.dump(result, open('gram/ngram_%i'%label,'wb'))
    return result

#heap to get the top 100,000 features.
def Heap_top(dictionary, label, num = 100000):   # keep the num (100,000) grams with the highest file counts
    heap = [(0,'tmp')]* num # initialize the heap
    root = heap[0]
    for ngram,count in dictionary.iteritems():
            if count > root[0]:
                root = heapq.heapreplace(heap, (count, ngram))
    pickle.dump(heap, open('gram/ngram_%i_top%i'%(label,num),'wb'))        
         
if __name__ == '__main__':
    start = datetime.now()
    #for label in range(1,10): # take too much memory
    label = int(sys.argv[1]) # sys.argv[1] is the first command-line argument, a class label from 1 to 9
    print "Gathering 4 grams, Class %i out of 9..."%label
    f_labels = load_label('trainLabels.csv', label)
    Heap_top(reduce_dict(f_labels),label)
    #print datetime.now() - start
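
Heap_top relies on the standard fixed-size-heap top-k trick: heapq keeps its smallest element at heap[0], so replacing that root whenever a bigger count arrives leaves the k largest (count, gram) pairs in the heap. A minimal sketch with toy counts (comparing against heap[0][0], the current minimum, which is the clean form of the same idea):

import heapq

counts = {'AABB': 7, 'CCDD': 3, 'EEFF': 9, '0011': 1, '2233': 5}
k = 3
heap = [(0, 'tmp')] * k                  # seeded the same way as Heap_top
for gram, count in counts.items():
    if count > heap[0][0]:               # beats the current smallest?
        heapq.heapreplace(heap, (count, gram))
print(sorted(heap, reverse=True))        # [(9, 'EEFF'), (7, 'AABB'), (5, '2233')]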

writeasm.py
Filters each sample's .asm file, keeping (by intent) only the code segments, and writes the result to a new .asm file.

import pickle
import sys
xid=pickle.load(open(sys.argv[1])) # sys.argv[1] is the pickled id list, e.g. xid_train.p or xid_test.p
data_path=sys.argv[2]
asm_code_path=sys.argv[3]

for cc,i in enumerate(xid):
    f=open(data_path+'/'+i+'.asm')
    fo=open(asm_code_path+'/'+i+'.asm','w')
    start=True
    for line in f:
        xx=line.split()
        for c,x in enumerate(xx):
            if x=='Pure':
                if xx[c+1]=='data':
                    start=False
                if xx[c+1]=='code':        #如果是代码类型的代码就进行保存
                    start=True
        if True:				# this looks like a typo: it should probably be "if start:"
            xx[0]=xx[0].split(':')[0]            
            fo.write(''.join(xx)+'\n')
    f.close()
    fo.close()          
    print cc*1.0/len(xid)  # print progress

join_grams.py
Generates the binary feature matrix for the selected 4-grams (1 = present in the sample, 0 = absent).

import heapq
import pickle
import math
from csv import DictReader
import glob
import os
import csv

def join_ngrams(num = 100000): # merge the per-class top-num grams: dict_all[gram][c-1] holds the gram's count in class c
    dict_all = dict()
    for c in range(1,10):
        #print "merging %i out of 9"%c
        heap = pickle.load(open('gram/ngram_%i_top%i'%(c,num),'rb'))
        while heap:
            count, gram = heapq.heappop(heap)
            if gram not in dict_all:
                dict_all[gram] = [0]*9
            dict_all[gram][c-1] = count
    return dict_all
    #pickle.dump(dict_all, open('ready_for_selection.pkl','wb'))


# load data
def num_instances(path, label): # count how many rows in path have Class == label (p) and how many do not (n)
    p = 0
    n = 0
    for row in DictReader(open(path)):
        if int(row['Class']) == label:
            p += 1
        else:
            n += 1
    return p,n


def entropy(p,n):               # binary (natural-log) entropy of a p/n split
    p_ratio = float(p)/(p+n)
    n_ratio = float(n)/(p+n)
    return -p_ratio*math.log(p_ratio) - n_ratio * math.log(n_ratio)

def info_gain(p0,n0,p1,n1,p,n): # information gain of splitting (p,n) into the feature-present (p1,n1) and feature-absent (p0,n0) parts
    return entropy(p,n) - float(p0+n0)/(p+n)*entropy(p0,n0) - float(p1+n1)/(p+n)*entropy(p1,n1)
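# Worked example (made-up numbers): with p=10, n=90 overall, a feature present in
# p1=8 positives and n1=2 negatives leaves p0=2, n0=88; entropy(10,90) ~ 0.325 and
# info_gain(2,88,8,2,10,90) ~ 0.179, so such a feature would rank highly below.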

def Heap_gain(p, n, class_label, dict_all, num_features = 750, gain_minimum_bar = -100000): # keep the num_features grams with the highest info gain, again via a fixed-size heap
    heap = [(gain_minimum_bar, 'gain_bar')] * num_features
    root = heap[0]
    for gram, count_list in dict_all.iteritems():
        p1 = count_list[class_label-1]
        n1 = sum(count_list[:(class_label-1)] + count_list[class_label:])
        p0,n0 = p - p1, n - n1         
        if p1*p0*n1*n0 != 0:
            gain = info_gain(p0,n0,p1,n1,p,n)
            if gain > root[0]:
                root = heapq.heapreplace(heap, (gain, gram))
    #return heap
    return [i[1] for i in heap]

def gen_df(features_all, train = True, verbose = False, N = 4):     # emit one binary feature row per sample: 1 = feature present, 0 = absent
    yield ['Id'] + features_all # yield header      # gen_df is a generator: each next() call resumes after the
                                                    # previous yield, runs to the next yield, and returns its value
    if train == True:
        ds = 'train'
    else:
        ds = 'test'
    directory_names = list(set(glob.glob(os.path.join(ds, "*.bytes")))) # collect all .bytes file names under train/ or test/
    for f in directory_names:
        f_id = f.split('/')[-1].split('.')[0]                           # file name without the directory and the .bytes extension
        if verbose == True:
            print 'doing %s'%f_id
        one_list = []
        with open("%s/%s.bytes"%(ds,f_id),'rb') as read_file:
            for line in read_file:
                one_list += line.rstrip().split(" ")[1:]
        grams_string = [''.join(one_list[i:i+N]) for i in xrange(len(one_list)-N)]  # note: -N here vs -N+1 in unique_gram.py (drops the last gram)
        # build a dict for looking up
        
        grams_dict = dict()
        for gram in grams_string:
            if gram not in grams_dict:
                grams_dict[gram] = 1
        
        binary_features = []
        for feature in features_all:
            if feature in grams_dict:
                binary_features.append(1)
            else:
                binary_features.append(0)
        del grams_string
        '''
        ## instead of binary features, do count
        grams_dict = dict()
        for gram in grams_string:
            if gram not in grams_dict:
                grams_dict[gram] = 1
            else:
                grams_dict[gram] += 1 
        
        binary_features = []
        for feature in features_all:
            if feature in grams_dict:
                binary_features.append(grams_dict[feature])
            else:
                binary_features.append(0)
        del grams_string        
        '''
        yield [f_id] + binary_features

if __name__ == '__main__':              
    dict_all = join_ngrams()
    features_all = []
    for i in range(1,10):
        p, n = num_instances('trainLabels.csv', i)
        features_all  += Heap_gain(p,n,i,dict_all) # 750 * 9
    train_data = gen_df(features_all, train = True, verbose = False)
    with open('train_data_750.csv','wb') as outfile:
        wr = csv.writer(outfile, delimiter=',', quoting=csv.QUOTE_ALL)
        for row in train_data:
            wr.writerow(row)
    test_data = gen_df(features_all, train = False,verbose = False)
    with open('test_data_750.csv','wb') as outfile:
        wr = csv.writer(outfile, delimiter=',', quoting=csv.QUOTE_ALL)
        for row in test_data:
            wr.writerow(row)   
    print "DONE 4 gram features!"

instr_freq.py
Counts how often each instruction mnemonic appears in every .asm file.

# -*- coding: utf-8 -*-
## instructions frequency

from multiprocessing import Pool
import os
import csv

paths = ['train','test']

instr_set = set(['mov','xchg','stc','clc','cmc','std','cld','sti','cli','push',  # the set of x86 mnemonics to count
	'pushf','pusha','pop','popf','popa','cbw','cwd','cwde','in','out',
	'add','adc','sub','sbb','div','idiv','mul','imul','inc','dec',
	'cmp','sal','sar','rcl','rcr','rol','ror','neg','not','and'
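	# note: a comma seems to be missing after 'and' above -- 'and' 'or' concatenate to 'andor', so neither instruction is counted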
	'or','xor','shl','shr','nop','lea','int','call','jmp',
	'je','jz','jcxz','jp','jpe','ja','jae','jb','jbe','jna',
	'jnae','jnb','jnbe','jc','jnc','ret','jne','jnz','jecxz',
	'jnp','jpo','jg','jge','jl','jle','jng','jnge','jnl','jnle',
	'jo','jno','js','jns'])

def consolidate(path,instr_set = instr_set):
	Files = os.listdir(path)
	asmFiles = [i for i in Files if '.asm' in i]                    # keep only the .asm file names
	consolidatedFile = path + '_instr_frequency.csv'                # output CSV of per-file instruction frequencies
	with open(consolidatedFile, 'wb') as f:
		fieldnames = ['Id'] + list(instr_set)
		writer = csv.DictWriter(f, fieldnames = fieldnames)
		writer.writeheader()
		for t, fname in enumerate(asmFiles):
			consolidation = dict(zip(instr_set,[0]*len(instr_set)))
			consolidation['Id'] = fname[:fname.find('.asm')]
			with open(path+'/'+fname, 'rb') as f:
				for line in f:
					if 'text' in line and ',' in line and ';' not in line:
						row = line.lower().strip().split('  ')[1:]     # strip() removes leading and trailing whitespace
						if row:
							tmp_list = [x.strip() for x in row if x != '']
							if len(tmp_list) == 2 and tmp_list[0] in consolidation:
								consolidation[tmp_list[0]] += 1        # consolidation counts each mnemonic's occurrences in this .asm file
			writer.writerow(consolidation)
			#if (t+1)%100 == 0:
			#	print str(t+1) + 'files loaded for ' + path

if __name__ == '__main__':
	p = Pool(2)
	p.map(consolidate, paths)
	print "DONE instruction count!"

image_fea.py
Treats each file's raw bytes as a grayscale image and saves the first 1000 pixel values as features.

import numpy,scipy.misc, os, array
def get_feature(data_set = 'train', data_type = 'bytes'):
    files=os.listdir(data_set)  # list of all file and folder names in the data_set directory
    with open('%s_%s_image.csv'%(data_set, data_type),'wb') as f:
        f.write('Id,%s\n'%','.join(['%s_%i'%(data_type,x)for x in xrange(1000)]))
        for cc,x in enumerate(files):           
            if data_type != x.split('.')[-1]:           # only process files whose extension matches data_type
                continue
            file_id = x.split('.')[0]
            tmp = read_image(data_set + '/' +x)
            f.write('%s,%s\n'%(file_id, ','.join(str(v) for v in tmp)))
            #print "finish..." + file_id
def read_image(filename):                       # read the file's raw bytes and return them as a list of pixel values
    f = open(filename,'rb')                     # open in binary mode
    ln = os.path.getsize(filename) # length of file in bytes
    width = 256
    rem = ln%width
    a = array.array("B") # uint8 array
    a.fromfile(f,ln-rem)                # drop the remainder so the bytes reshape evenly into rows of width 256
    f.close()
    g = numpy.reshape(a,(len(a)/width,width))
    g = numpy.uint8(g)
    g.resize((1000,))
    return list(g)                      # return as a list

if __name__ == '__main__':
    #get_feature(data_set = 'train', data_type = 'bytes')
    get_feature(data_set = 'train', data_type = 'asm')
    #get_feature(data_set = 'test', data_type = 'bytes')
    get_feature(data_set = 'test', data_type = 'asm')
    print 'DONE asm image features!'
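
The byte-to-pixel idea in a self-contained sketch (synthetic bytes stand in for a real .bytes file):

import numpy as np

raw = np.random.randint(0, 256, size=70000).astype(np.uint8)  # stand-in for a file's raw bytes
width = 256
trimmed = raw[:len(raw) - len(raw) % width]   # drop the remainder so the bytes reshape evenly
img = trimmed.reshape(-1, width)              # grayscale "image", one byte per pixel
features = img.flatten()[:1000]               # first 1000 pixels, as in image_fea.py
print('%s %s' % (img.shape, features.shape))  # (273, 256) (1000,)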

get_jump_map.py

Builds, for each sample, a map from a jump instruction's address to its target address.

import pickle
import sys

##########################################################
# usage
# pypy get_jump_map.py xid_train.p ../../data/train ./jump_train ./jump_map_train

# xid_train.p is a list like ['loIP1tiwELF9YNZQjSUO', ...] that fixes
# the order of samples in the training data
# ../../data/train is the path of the original train data
# ./jump_train is the local folder of the jmp.p files, {address: jump ins}
# ./jump_map_train is the local folder of the jump maps, {address: target address}
##########################################################



xid=pickle.load(open(sys.argv[1])) # xid_train.p or xid_test.p
data_path=sys.argv[2]              # ../../data/train
jump_path=sys.argv[3]              # ./jump_train
jump_map_path=sys.argv[4]          # ./jump_map_train
def isvalid_address(s):                                   # check whether s is a valid hex address
    # a legal address should contain only these characters
    letters='0123456789ABCDEF'
    if True:
        for i in s:
            if i not in letters :#or s[1] in words:
                return False
        return True
    return False
cou=0

for cc,fx in enumerate(xid):                
    f=open(data_path+'/'+fx+'.asm')
    loc={}  # address jumping dic: start address -> stop address 
    jumpadd=pickle.load(open(jump_path+'/'+fx+'.jmp.p'))
    if len(jumpadd)==0:
        del jumpadd,loc
        continue
    ll=len(jumpadd)
    for line in f:
        if '.text' != line[:5] and '.code' != line[:5]:
            continue
        xx=line.split()
        if len(xx)>2:
            add=xx[0].split(':')[1]  # get address
            if add in jumpadd:  # this is a jump instruction
                for cx,x in enumerate(xx):
                     if x=='jmp' or x=='ja':
                         tid=cx+2  # two patterns: jmp xxx addr or jmp addr
                         if cx+2>=len(xx):
                             tid=cx+1
                         tmpx=xx[tid].split('_') 
                         if len(tmpx)!=2:  # not a valid address
                             break
                         if isvalid_address(tmpx[1]):
                             if len(tmpx[1])<8: # pad the address to 8 hex characters
                                 tmpx[1]='0'*(8-len(tmpx[1]))+tmpx[1]
                             loc[add]=tmpx[1]
                             ll=ll-1
                         else:
                             print fx,line#xx[-1].split('_')[1]
                         break
            if ll==0:
                break                
                #print xx[-1][-8:]
    if len(loc)>0:
        pickle.dump(loc,open(jump_map_path+'/'+fx+'.p','w'))    # save the jump map
    del loc,jumpadd   
    print cc*1.0/len(xid)               # print progress
    f.close()

get_jump.py
Records the addresses that hold a jump (jmp/ja) instruction.

import pickle
import sys

xid=pickle.load(open(sys.argv[1]))
#unconditional_jump=['jmp','j','ja']
ins_path=sys.argv[2]
jump_path=sys.argv[3]

for cc,i in enumerate(xid):
    jmp={}
    tmp=pickle.load(open(ins_path+'/'+i+'.ins.p'))
    for add in tmp:
        if tmp[add] == 'jmp' or tmp[add]=='ja':             # mark addresses whose instruction is a jump
            jmp[add]=1
    del tmp
    pickle.dump(jmp,open(jump_path+'/'+i+'.jmp.p','w'))     
    del jmp

    print cc*1.0/len(xid)   # print progress

get_ins.py
Extracts, for each sample, a map from address to instruction mnemonic.

import pickle
import sys
##########################################################
# usage
# pypy getins.py xid_train.p ../../data/train ./ins_train ./jump_train

# xid_train.p is a list like ['loIP1tiwELF9YNZQjSUO', ...] that fixes
# the order of samples in the training data
# ../../data/train is the path of the original train data
# ./ins_train is the local folder of the ins.p files, {address: ins}
# ./jump_train is the local folder of the jmp.p files, {address: jump ins}
##########################################################

xid=pickle.load(open(sys.argv[1])) #xid_train.p or xid_test.p
data_path=sys.argv[2]  
ins_path=sys.argv[3]
def isvalid(s):                 # check whether the token is an instruction mnemonic rather than a hex byte
    Bytes='0123456789ABCDEF'
    if len(s)==2:
        if s[0] in Bytes :
            return False # a two-character token starting with a hex digit is a byte, not an instruction
    return True
for cc,fx in enumerate(xid):
    f=open(data_path+'/'+fx+'.asm')
    loc={} # address -> instruction
    for line in f:
        if '.text' != line[:5] and '.code' != line[:5]:
            # most of ins are in those two parts
            continue
        xx=line.split()
        if len(xx)>2:
            add=xx[0].split(':')[1] # get the address part
            for i in xx[1:]:
                if isvalid(i): # get the first token that is not a byte
                    loc[add]=i 
                    break      # one instruction per line (address)
    pickle.dump(loc,open(ins_path+'/'+fx+'.ins.p','w'))
    if cc%50==0:    
        print 'progress',cc*1.0/len(xid),len(loc)
    del loc
    f.close() 

get_id.py
Extracts the sample ids from the train/test file names and saves them along with the training labels.

import os
xid=[i.split('.')[0] for i in os.listdir('train') if '.asm' in i]       # ids from the .asm file names in train/
Xt_id=[i.split('.')[0] for i in os.listdir('test') if '.asm' in i]
f=open('trainLabels.csv')
f.readline()
label={}
for line in f:
    xx=line.split(',')
    idx=xx[0][1:-1]                          # strip the quotes around the Id
    label[idx]=int(xx[-1])                   # the last column is the label
f.close()
y=[label[i] for i in xid]                       # labels for all training samples, in xid order
import pickle
pickle.dump(xid,open('xid_train.p','w'))                # save everything
pickle.dump(Xt_id,open('xid_test.p','w'))
pickle.dump(xid,open('xid.p','w'))
pickle.dump(Xt_id,open('Xt_id.p','w'))
pickle.dump(y,open('y.p','w'))

gen_opcount_seg.py
Driver script: creates the working directories and runs the whole opcode-feature pipeline (instruction extraction, jump maps, 2/3/4-gram mining, line heads, feature generation) on the training set via subprocess calls.

import subprocess   
data_path='.'
opcode_path='op_train'
jump_path='jump_train'
jump_map_path='jump_map_train'                              # working directories for the pipeline outputs

cmd='mkdir '+' '.join([opcode_path,jump_path,jump_map_path])	# build the shell command
subprocess.call(cmd,shell=True)          # subprocess.call() runs the command and returns its exit status;
                                         # shell=True lets the shell parse the whole command string
cmd='pypy get_ins.py xid_train.p '+' '.join([data_path+'/train',opcode_path])
subprocess.call(cmd,shell=True)

cmd='pypy get_jump.py xid_train.p '+' '.join([opcode_path,jump_path])
subprocess.call(cmd,shell=True)

cmd='pypy get_jump_map.py xid_train.p '+' '.join([data_path+'/train',jump_path,jump_map_path])
subprocess.call(cmd,shell=True)

cmd='pypy find_new_ins.py xid_train.p '+' '.join([opcode_path,jump_map_path])
subprocess.call(cmd,shell=True)

cmd='pypy filtcmd.py'
subprocess.call(cmd,shell=True)

cmd='pypy find_2g.py xid_train.p '+data_path+'/train'
subprocess.call(cmd,shell=True)

cmd='pypy find_3g.py xid_train.p '+data_path+'/train'
subprocess.call(cmd,shell=True)

cmd='pypy cut3g.py'
subprocess.call(cmd,shell=True)

cmd='pypy cut3g_for_4g.py'
subprocess.call(cmd,shell=True)

cmd='pypy find_4g.py xid_train.p '+data_path+'/train'
subprocess.call(cmd,shell=True)

cmd='pypy cut4g.py'
subprocess.call(cmd,shell=True)

cmd='pypy findhead.py xid_train.p '+data_path+'/train'
subprocess.call(cmd,shell=True)

cmd='pypy rebuild_2g3g4ghead.py xid_train.p '+data_path+'/train'
subprocess.call(cmd,shell=True)

cmd='pypy rebuild_2g3g4ghead.py xid_test.p '+data_path+'/test'
subprocess.call(cmd,shell=True)

cmd='python getfea.py'
subprocess.call(cmd,shell=True)

gen_data.py
Builds a small sanity-check dataset: the first sample seen of each class is copied to train/, the second to test/.

import os
import subprocess
train={}
test={}
path='..'
f=open(path+'/trainLabels.csv')
fo=open('trainLabels.csv','w')
fo.write(f.readline())
f1=open(path+'/sampleSubmission.csv')
fo1=open('sampleSubmission.csv','w')
fo1.write(f1.readline())

for line in f:
    xx=line.split(',')
    label=int(xx[-1])                            # get the label
    if label not in train:                      # first time this label is seen: copy the sample into train/
        size=os.path.getsize(path+'/train/'+xx[0][1:-1]+'.asm')
        #if size>1000000:
        #    continue
        train[label]=1
        idx=xx[0][1:-1]
        cmd='cp '+path+'/train/'+idx+'.* train/'
        subprocess.call(cmd,shell=True)
        fo.write(line)
    elif label not in test:                     # second time this label is seen: copy the sample into test/
        test[label]=1
        idx=xx[0][1:-1]
        cmd='cp '+path+'/train/'+idx+'.* test/'
        subprocess.call(cmd,shell=True)
        fo1.write(idx+','+','.join(['0' if i!=label-1 else '1' for i in range(9)])+'\n')      # one-hot row: '1' in the sample's label column
    
f.close()
fo.close()
f1.close()
fo1.close()

freq_count.py
Counts the frequency of each single-byte value (16*16 = 256 features) in every .bytes file.

# -*- coding: utf-8 -*-
"""
part of the code borrowed from the benchmark in the forum.
Create frequency features for single bytes; 16*16 = 256 features are added to train and test.
"""
from multiprocessing import Pool
import os
from csv import writer


paths = ['train','test']

 
def consolidate(path):              # consolidate = merge all per-file counts into one CSV

    s_path = path
    Files = os.listdir(s_path)          # all file names under path
    byteFiles = [i for i in Files if '.bytes' in i]     # keep only the .bytes files
    consolidatedFile = path + '_frequency.csv'
    
    with open(consolidatedFile, 'wb') as f:
        # Preparing header part
        fw = writer(f)
        colnames = ['Id']
        colnames += ['FR_'+hex(i)[2:] for i in range(16**2)]   # [2:] strips the '0x' prefix of hex()
        fw.writerow(colnames)
        
        for t, fname in enumerate(byteFiles):
            consolidation = []
            f = open(s_path+'/'+fname, 'rb')
            twoByte = [0]*16**2
            for row in f:
                codes = row[:-2].split()[1:]   # drop the line ending and the leading address token
                
                # convert each two-hex-digit code to an integer (0-255)
                twoByteCode = [int(i,16) for i in codes if i != '??']                                     
                # frequency calculation of the byte values
                for i in twoByteCode:
                    twoByte[i] += 1                           # bump this byte value's count
                
            # Row added
            consolidation += [fname[:fname.find('.bytes')]]   # row = file id followed by the 256 byte counts
            consolidation += twoByte                          
            
            fw.writerow(consolidation)
            # Writing rows after every 100 files processed
            #if (t+1)%100==0:
            #    printw(t+1, 'files loaded for ', path)

if __name__ == '__main__':
    p = Pool(2)
    p.map(consolidate, paths)
    print "DONE bytes count!"

findhead.py
Counts, over all .asm files, how often each line head (the segment name before the ':' in the first token of a line) occurs, and saves the counts.

import sys
import pickle

##########################################################
# usage
# pypy findhead.py xid_train.p ../../data/train 

# xid_train.p is a list like ['loIP1tiwELF9YNZQjSUO', ...] that fixes
# the order of samples in the training data
# ../../data/train is the path of the original train data
##########################################################
xid_name=sys.argv[1]
data_path=sys.argv[2]


xid=pickle.load(open(xid_name)) #xid_train.p or xid_test.p

head={}

for c,f in enumerate(xid):                      # count how often each line head occurs
    fo=open(data_path+'/'+f+'.asm')
    tot=0
    for line in fo:
        xx=line.split()
        h=xx[0].split(':')
        if h[0] not in head:                    # first time this head is seen: add it
            head[h[0]]=0
        head[h[0]]+=1                           # count this head
    fo.close()
    if True:                                    #c%10000==0:
        print c*1.0/len(xid),len(head)
print len(head)
pickle.dump(head,open('head.p','w'))

find_new_ins.py
Discovers candidate instruction mnemonics: each sample's mnemonics are counted (walking the jump map when one exists) and those occurring at least 200 times are kept.

import pickle
import os
import sys

##########################################################
# usage
# pypy find_new_ins.py xid_train.p ./ins_train  ./jump_map_train

# xid_train.p is a list like ['loIP1tiwELF9YNZQjSUO', ...] that fixes
# the order of samples in the training data
# ./ins_train is the local folder of the ins.p files, {address: ins}
# ./jump_map_train is the local folder of the jump maps, {address of this ins: address of next ins}
##########################################################

xid_name=sys.argv[1]
ins_path=sys.argv[2]
jump_map_path=sys.argv[3]

xid=pickle.load(open(xid_name))  #xid_train.p or xid_test.p

cmd={}                           # new ins found

files=os.listdir(jump_map_path)  # folder holding the per-sample jump maps
mware_that_has_jump={}
for i in files:
    if '.p' in i:                                       # a sample has a .p file only if get_jump_map.py found jumps in it
        mware_that_has_jump[i.split('.')[0]]=1          # remember the samples that contain jumps

for cc,fx in enumerate(xid):
    tmpcount={}
    ins=pickle.load(open(ins_path+'/'+fx+'.ins.p'))
    insx=[]
    if fx not in mware_that_has_jump: # there is no jump in that malware
        for i in ins:
            if i not in tmpcount:
                tmpcount[i]=0
            tmpcount[i]+=1
        count={}
        for i in tmpcount:
            count[tmpcount[i]]=i
        for j in sorted(count.keys(),reverse=True):
            if j <200:
                break
            if count[j] not in cmd:
                cmd[count[j]]=1 # keep mnemonics that occur at least 200 times in this malware
        del ins,insx,tmpcount,count
        continue
    jump=pickle.load(open(jump_map_path+'/'+fx+'.p'))
    keys= sorted(ins.keys())
    #print keys[:20]
    nextins={}
    for c,j in enumerate(keys[:-1]):
        if j in jump and jump[j] in ins:
            nextins[j]=jump[j]
            #print j,jump[j]
        else:
            nextins[j]=keys[c+1]
    current=keys[0]
    
    while True:
        if ins[current] not in tmpcount:
            tmpcount[ins[current]]=0
        tmpcount[ins[current]]+=1
        if current not in nextins:
            print 'not in'
            break
        if  sum(tmpcount.values())>len(ins)*5:
            print 'loop runs more than 5x'
            break
        current=nextins[current]

    count={}
    for i in tmpcount:
        count[tmpcount[i]]=i
    for j in sorted(count.keys(),reverse=True):
        if j <200:
            break
        if count[j] not in cmd:
            cmd[count[j]]=0
        cmd[count[j]]+=j
        

    
    del current,ins,insx,jump,keys,nextins,tmpcount,count
              
    print 'find',cc*1.0/len(xid),len(cmd)
print cmd
pickle.dump(cmd,open('newcmd.p','w'))
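
The traversal above, reduced to a minimal sketch with made-up addresses: visit instructions in address order, divert through the jump map where one applies, and stop when the map ends or the walk loops too long:

ins  = {'0001': 'push', '0002': 'jmp', '0003': 'mov', '0004': 'ret'}   # made-up address -> mnemonic map
jump = {'0002': '0004'}                  # the jmp at 0002 lands on 0004

keys = sorted(ins)
nextins = {}
for c, addr in enumerate(keys[:-1]):
    # follow the jump if its target is a known instruction, otherwise fall through
    nextins[addr] = jump[addr] if addr in jump and jump[addr] in ins else keys[c + 1]

counts, cur = {}, keys[0]
while True:
    counts[ins[cur]] = counts.get(ins[cur], 0) + 1
    if cur not in nextins or sum(counts.values()) > len(ins) * 5:
        break                            # end of the map, or looping too long
    cur = nextins[cur]
print(counts)                            # push: 1, jmp: 1, ret: 1 -- 'mov' is skipped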

find_2g.py
find_3g.py
find_4g.py

All three mine co-occurrences of adjacent mnemonics: find_2g counts mnemonic pairs, find_3g extends the pruned 2-gram vocabulary to 3-grams, and find_4g extends the pruned 3-grams (the listing shown is find_2g.py).

import sys
import pickle

##########################################################
# usage
# pypy find_2g.py xid_train.p ../../data/train 

# xid_train.p is a list like ['loIP1tiwELF9YNZQjSUO', ...] that fixes
# the order of samples in the training data
# ../../data/train is the path of the original train data
##########################################################
xid_name=sys.argv[1]
data_path=sys.argv[2]


xid=pickle.load(open(xid_name)) #xid_train.p or xid_test.p

newc=pickle.load(open('newc.p'))
cmd2g={}                                                # 2g = 2-gram: one counter per pair of mnemonics
for i in newc:
    for j in newc:
        cmd2g[(i,j)]=0
print newc

for c,f in enumerate(xid):#(files[len(files)/10*a1:len(files)/10*a2]):
    count={}
    for i in cmd2g:
        count[i]=0
    fo=open(data_path+'/'+f+'.asm')
    tot=0
    a=-1
    b=-1
    for line in fo:
        xx=line.split()
        for x in xx:
            if x in newc:
                
                a=b
                b=x
                if (a,b) in cmd2g:
                    count[(a,b)]+=1
                    tot+=1
#                     print (b,a)
    fo.close()
    if c%10==0:
        print c*1.0/len(xid),tot
    for i in cmd2g:
        cmd2g[i]=count[i]+cmd2g[i]
    del count

import pickle
cmd2gx={}
for i in cmd2g:
    if cmd2g[i]>10:
        cmd2gx[i]=cmd2g[i]
print len(cmd2gx)
pickle.dump(cmd2gx,open('cmd2g.p','w'))
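
The a/b shift register in the loop above is a sliding window over the stream of mnemonics; the same count as a compact sketch:

tokens = ['mov', 'push', 'mov', 'push', 'mov', 'call']   # toy mnemonic stream
vocab = {'mov', 'push', 'call'}
counts = {}
a = None
for b in (t for t in tokens if t in vocab):
    if a is not None:
        counts[(a, b)] = counts.get((a, b), 0) + 1
    a = b
print(counts)   # ('mov','push'): 2, ('push','mov'): 2, ('mov','call'): 1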

ensemble.py
Blends three models' class probabilities with a weighted geometric mean. Note: s3[i]*0.5 looks like a typo for s3[i]**0.5, since the exponents 0.1 + 0.4 + 0.5 would then sum to 1.

import pandas as pd
s1=pd.read_csv('model1.csv',index_col=0)
s2=pd.read_csv('model2.csv',index_col=0)
s3=pd.read_csv('model3.csv',index_col=0)
for i in s1.columns.values:
    s1[i]=s1[i]**0.1*s2[i]**0.4*s3[i]*0.5
s1.to_csv('ensemble.csv')

filtcmd.py
Filters the previously discovered mnemonics, dropping tokens that cannot be valid instructions.

import pickle
cmd=pickle.load(open('newcmd.p'))
newc={}
for c in cmd:
    if '_' in c or c[0] in '?1234567890ABCDEF':  # drop tokens that cannot be valid mnemonics
        continue
    else:
        #print c,cmd[c]
        newc[c]=cmd[c]                           # keep the valid mnemonics
print newc
pickle.dump(newc,open('newc.p','w'))

dll.py
Extracts DLL-import (extrn) features from the train and test .asm files, selects the most informative ones by information gain, and saves binary feature files.

import heapq
import pickle
import math
from csv import DictReader
import glob
import os
import csv
from datetime import datetime

# dll call features.

# load file names 
def load_label(path, label):           # collect the Ids whose Class equals label
    result = []
    for row in DictReader(open(path)):
        if int(row['Class']) == label:
            result.append((row['Id']))
    return result

def dll_single_file(f, N = 4):
    pattern_dict = dict()
    f_lines = list()
    with open(f, 'rb') as outfile:
        for line in outfile:
            if 'idata' in line and 'extrn' in line:      # record, lowercased, every line mentioning both idata and extrn
                f_lines.append(line.lower())
    for line in f_lines:                                 
        line = line.strip().split()                      # line is now the list of whitespace-separated tokens
        p = line[line.index('extrn')+1].split(':')[0]    # the token right after 'extrn'
        if p and p not in pattern_dict:                  # extrn declares an identifier defined in another module, i.e. an imported (DLL) symbol
            pattern_dict[p] = 1                          # record that symbol
    return pattern_dict
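# Example (hypothetical line): '.idata:00401000 extrn GetProcAddress:dword'
# -> after lowercasing, the token following 'extrn' is 'getprocaddress:dword',
#    and split(':')[0] leaves the imported symbol 'getprocaddress'.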

def reduce_dict():
    dict_all = dict()
    for c in range(1,10):
        f_labels = load_label('trainLabels.csv', c)
        for f in f_labels:
            f_name = 'train/'+f+'.asm'
            dll = dll_single_file(f_name)
            for feature in dll:
                if feature not in dict_all:
                    dict_all[feature] = [0]*9
                dict_all[feature][c-1] +=1              # per-class document frequency of each imported symbol, e.g. [0,1,0,0,1,1,0,0,0]
        #print "finishing features in class %i"%c
    return dict_all

# load data     
def num_instances(path, label):                         # count rows with Class == label (p) versus the rest (n)
    p = 0
    n = 0
    for row in DictReader(open(path)):
        if int(row['Class']) == label:      
            p += 1
        else:
            n += 1
    return p,n


def entropy(p,n):                                       # binary entropy, same as in join_grams.py
    p_ratio = float(p)/(p+n)
    n_ratio = float(n)/(p+n)
    return -p_ratio*math.log(p_ratio) - n_ratio * math.log(n_ratio)

def info_gain(p0,n0,p1,n1,p,n):                         # information gain, same as in join_grams.py
    return entropy(p,n) - float(p0+n0)/(p+n)*entropy(p0,n0) - float(p1+n1)/(p+n)*entropy(p1,n1)



def Heap_gain(p, n, class_label, dict_all, num_features = 1000, gain_minimum_bar = -100):
    heap = [(gain_minimum_bar, 'gain_bar')] * num_features
    root = heap[0]
    for gram, count_list in dict_all.iteritems():
        p1 = count_list[class_label-1]
        n1 = sum(count_list[:(class_label-1)] + count_list[class_label:])
        p0,n0 = p - p1, n - n1
        if p1*p0*n1*n0 != 0:
            gain = info_gain(p0,n0,p1,n1,p,n)
            if gain > root[0]:
                root = heapq.heapreplace(heap, (gain, gram))
    #return heap
    result = [i[1] for i in heap if i[1] != 'gain_bar']
    #print "the length of dll for class %i is %i"%(class_label, len(result))
    return result

def gen_df(features_all, train = True, verbose = False, N = 4):   # emit the binary DLL-import feature rows for train or test
    yield ['Id'] + features_all # yield header
    if train == True:
        ds = 'train'
    else:
        ds = 'test'
    directory_names = list(set(glob.glob(os.path.join(ds, "*.asm"))))
    for f in directory_names:
        f_id = f.split('/')[-1].split('.')[0]
        if verbose == True:
            print 'doing %s'%f_id

        binary_features = list()
        tmp_pattern = dict()
        f_lines = list()
        with open(f, 'rb') as outfile:
            for line in outfile:
                if 'idata' in line and 'extrn' in line:
                    f_lines.append(line.lower())
        for line in f_lines:
            line = line.strip().split()
            p = line[line.index('extrn')+1].split(':')[0]
            if p and p not in tmp_pattern:
                tmp_pattern[p] = 1

        for fea in features_all:
            if fea in tmp_pattern:
                binary_features.append(1)
            else:
                binary_features.append(0)

        yield [f_id] + binary_features

if __name__ == '__main__':
    start = datetime.now()
    dict_all = reduce_dict()
    features_all = []
    for i in range(1,10):
        p, n = num_instances('trainLabels.csv', i)
        features_all  += Heap_gain(p,n,i,dict_all)
    train_data = gen_df(features_all, train = True, verbose = False)
    with open('train_dll.csv','wb') as outfile:
        wr = csv.writer(outfile, delimiter=',', quoting=csv.QUOTE_ALL)
        for row in train_data:
            wr.writerow(row)
    test_data = gen_df(features_all, train = False,verbose = False)
    with open('test_dll.csv','wb') as outfile:
        wr = csv.writer(outfile, delimiter=',', quoting=csv.QUOTE_ALL)
        for row in test_data:
            wr.writerow(row)
    print "DONE DLL functions!"
    #print datetime.now() - start

rebuild_code.py
Re-reads each filtered .asm file as raw uint8 bytes and keeps the first 2000 values as a per-sample feature vector, saved to one pickle.

import os,array
import pickle
import numpy as np
import sys
xid=pickle.load(open(sys.argv[1]))
asm_code_path=sys.argv[2]
train_or_test=asm_code_path.split('_')[-1]

X = np.zeros((len(xid),2000))
for cc,i in enumerate(xid):
    f=open(asm_code_path+'/'+i+'.asm')
    ln = os.path.getsize(asm_code_path+'/'+i+'.asm') # length of file in bytes
    width = int(ln**0.5)
    rem = ln%width
    a = array.array("B")        # uint8 array 这里的“B”表示是int类型的数据
    a.fromfile(f,ln-rem)        #读取文件中的内容到a中
    f.close()
    a=np.array(a)
    #im = Image.open('asmimage/'+i+'.png')
    a.resize((2000,))
    #im1 = im.resize((64,64),Image.ANTIALIAS); # for faster computation
    #des = leargist.color_gist(im1)
    X[cc] = a#[0,:1000] #des[0:320]
    print cc*1.0/len(xid)
pickle.dump(X,open('Xcode_'+train_or_test+'.p','w'))

cut3g.py
cut4g.py
Prunes the mined 3-gram/4-gram counts, keeping only the frequent ones (the listing shown is the 3-gram variant; cut4g.py does the same for the 4-grams).

import sys
import pickle
newc=pickle.load(open('cmd3g.p'))
nx={}
c=0
for i in newc:
    if newc[i]>100:                      # keep n-grams occurring more than 100 times
        c+=1
        nx[i]=newc[i]
print c,len(nx)                          # report how many 3-grams pass the threshold; grams and counts are saved below
pickle.dump(nx,open('cutcmd3g.p','w'))   

cut3g_for_4g.py
Selects from the 3-grams those frequent enough to seed the 4-gram search.

import sys
import pickle
newc=pickle.load(open('cmd3g.p'))
nx={}
c=0
for i in newc:
    if newc[i]>10000:               # only 3-grams occurring more than 10000 times seed the 4-gram search
        c+=1
        nx[i]=newc[i]
print c,len(newc)
pickle.dump(nx,open('cutcmd3g_for_4g.p','w'))

rebuild_2g3g4ghead.py
Recounts, per sample, the selected 2-gram/3-gram/4-gram and line-head features, producing the final feature matrices.

import sys
import pickle

##########################################################
# usage
# pypy rebuild_2g.py xid_train.p ../../data/train 

# xid_train.p is a list like ['loIP1tiwELF9YNZQjSUO', ...] that fixes
# the order of samples in the training data
# ../../data/train is the path of the original train data
##########################################################
                          # sys.argv holds the command-line arguments as a list
xid_name=sys.argv[1]      # first argument after the script name (see the usage line above)
data_path=sys.argv[2]     # second argument after the script name
xid=pickle.load(open(xid_name)) #xid_train.p or xid_test.p

newc=pickle.load(open('newc.p'))

train_or_test=data_path.split('/')[-1]       # the last path component tells train from test
if train_or_test=='train':
    f=open(data_path[:-5]+'trainLabels.csv') # open the labels file
    f.readline()
    train={}
    for line in f:
        xx=line.split(',')
        train[xx[0][1:-1]]=int(xx[1])        # labels run from 1 to 9; xx[0][1:-1] strips the quotes around the Id
    f.close()
    y=[]

cmd2g={}
for i in newc:
    for j in newc:                           # all mnemonic pairs: the full 2-gram vocabulary
        cmd2g[(i,j)]=1
print newc


cmd3g=pickle.load(open('cutcmd3g.p'))        # load the pruned 3-gram and 4-gram vocabularies
cmd4g=pickle.load(open('cutcmd4g.p'))
head=pickle.load(open('head.p'))

print newc
X2g=[]
X3g=[]
X4g=[]
Xhead=[]
for c,f in enumerate(xid):#(files[len(files)/10*a1:len(files)/10*a2]):
    fo=open(data_path+'/'+f+'.asm')

    count2g={}
    count3g={}
    count4g={}
    for i in cmd2g:
        count2g[i]=0

    for i in cmd3g:
        count3g[i]=0

    for i in cmd4g:
        count4g[i]=0

    counthead={}
    for i in head:
        counthead[i]=0

    tot=0
    a=-1
    b=-1
    d=-1
    e=-1
    for line in fo:

        xx=line.split()

        if xx[0].split(':')[0] in counthead:
            counthead[xx[0].split(':')[0]]+=1  # count line-head occurrences

        for x in xx:
            if x in newc:
                a=b
                b=d
                d=e
                e=x
                if (a,b,d,e) in cmd4g:
                    count4g[(a,b,d,e)]+=1
                    tot+=1

                if (b,d,e) in cmd3g:
                    count3g[(b,d,e)]+=1

                if (d,e) in cmd2g:
                    count2g[(d,e)]+=1

    fo.close()
    name=f.split('.')[0]
    if train_or_test=='train': 
        y.append(train[name])
    if True:#c%10000==0:
        print c*1.0/len(xid),tot
    X4g.append([count4g[i] for i in cmd4g])
    X3g.append([count3g[i] for i in cmd3g])
    X2g.append([count2g[i] for i in cmd2g])
    Xhead.append([counthead[i] for i in head])

    del count4g,count2g,count3g,counthead
train_or_test=data_path.split('/')[-1]
pickle.dump(X4g,open('X4g_'+train_or_test+'.p','w'))
pickle.dump(X3g,open('X3g_'+train_or_test+'.p','w'))
pickle.dump(X2g,open('X2g_'+train_or_test+'.p','w'))
pickle.dump(Xhead,open('Xhead_'+train_or_test+'.p','w'))

if train_or_test=='train':
    pickle.dump(y,open('y.p','w'))
