# -*- coding:UTF-8 -*-
"""
access_log 建模:
根据request长度来生成白名单(基于统计的方法)
第一阶段:训练阶段 application profiling分析对象:access log分析方法:
(1) 去重
(2) 筛掉响应码为4xx,5xx的请求 (或者选择2xx,3xx的请求)
(3) 筛掉静态资源请求
第二阶段:检测阶段on-line learning statistical model 目标:
(1)request长度
方法: 若参数值长度在不等式外 |len(request)-D(x)**0.5|<=E(x)标记为可疑***请求
"""
import os,sys,math
import re,string
import numpy
def help():
    """Parse ``sys.argv`` into the module globals ``Operation`` and ``Logfile``.

    Recognised options are ``-o``/``--operation`` (stored upper-cased) and
    ``-f``/``--file``; any other argument is ignored.  With no arguments at
    all a usage text is printed.  The process exits (status 0, as before)
    when the usage text is shown or when either option is missing.
    """
    global Operation, Logfile
    if not sys.argv[1:]:
        print("Usage: python parse_accesslog.py [OPTIONS]")
        print("Options are:")
        # NOTE(review): the option descriptions were truncated in the source
        # (unterminated string literals); the placeholders are reconstructed.
        print("-o, --operation <create|parse>")
        print("-f, --file <access log file>")
        sys.exit(0)
    Operation = 'None'
    Logfile = 'None'
    argc = len(sys.argv)
    i = 1
    while i < argc:
        arg = sys.argv[i]
        if arg in ('-o', '--operation'):
            i += 1
            # A trailing option without a value is left unset so the checks
            # below report it instead of raising IndexError.
            if i < argc:
                Operation = sys.argv[i].upper()
        elif arg in ('-f', '--file'):
            i += 1
            if i < argc:
                Logfile = sys.argv[i]
        i += 1
    if Operation == 'None':
        print("请选择操作方式:create or parse?")
        sys.exit(0)
    if Logfile == 'None':
        print("请提供需要分析的日志文件(绝对路径)!")
        sys.exit(0)


class LogModel(object):
    """Build a request-length model (mean and variance) from an access log.

    The pipeline is ``getRequests`` -> ``filterSuccessRequests`` ->
    ``filterDynamicRequets`` -> ``getMatchVal``; each stage reads the
    attribute written by the previous one.
    """

    # Zero-based positions of the URL and the status code in a
    # whitespace-split log line.  Override on a subclass or instance for a
    # different log layout.
    URL_FIELD = 7
    STATUS_FIELD = 9

    # Status codes treated as successful: 2xx and 3xx.
    _SUCCESS_STATUS = re.compile(r'[23]\d\d$')

    def __init__(self, logfile, rsfile='logModel.ini'):
        """Remember the log to read and the file the model is stored in.

        logfile -- path of the access log to analyse.
        rsfile  -- path of the model file; defaults to ``logModel.ini`` in
                   the current working directory.
        """
        self.logfile = logfile
        self.rsfile = rsfile
        self.reqs = set()
        # File extensions regarded as static resources.
        self.static_elements = ("gif", "jpg", "jpeg", "png", "bmp", "ico",
                                "js", "css", "htm", "html")
        self.success_urls = set()  # URLs whose request succeeded
        self.urls = set()          # URLs left for analysis after filtering

    def getRequests(self):
        """Read the log and store the distinct (url, status) pairs in ``self.reqs``.

        Lines with too few fields to contain both columns are skipped.
        """
        needed = max(self.URL_FIELD, self.STATUS_FIELD) + 1
        seen = set(self.reqs)
        with open(self.logfile, "r") as base_log:
            for line in base_log:
                fields = line.split()
                if len(fields) < needed:
                    continue  # malformed or truncated line
                seen.add((fields[self.URL_FIELD], fields[self.STATUS_FIELD]))
        self.reqs = list(seen)

    def filterSuccessRequests(self):
        """Store in ``self.success_urls`` the URLs answered with a 2xx/3xx status."""
        matches = self._SUCCESS_STATUS.match
        self.success_urls = list({url for url, status in self.reqs
                                  if matches(str(status))})

    def filterDynamicRequets(self):
        """Store in ``self.urls`` the successful URLs that are not static resources.

        A URL is static when its path (the part before ``?``) ends in a dot
        followed by one of ``self.static_elements``.  Requiring the dot keeps
        ``/nodejs`` from being mistaken for a ``.js`` file, and ignoring the
        query string keeps ``/x.php?f=a.png`` in the analysed set.
        """
        extensions = '|'.join(re.escape(ext) for ext in self.static_elements)
        static_path = re.compile(r'\.(?:' + extensions + r')$', re.IGNORECASE)
        kept = set()
        for url in self.success_urls:
            path = url.split('?', 1)[0]
            if not static_path.search(path):
                kept.add(url)
        self.urls = list(kept)

    def getMatchVal(self):
        """Compute E(x) and D(x) of the URL lengths and write them to ``self.rsfile``.

        E(x) is the mean and D(x) the population variance (divisor N).
        Raises ValueError when ``self.urls`` is empty.
        """
        if not self.urls:
            raise ValueError("no URLs left to model after filtering")
        # Float dtype: the statistics must not be truncated by integer division.
        lengths = numpy.array([len(url) for url in self.urls], dtype=float)
        E_value = lengths.mean()
        D_value = lengths.var()
        output = "E(x):%f\nD(x):%f" % (E_value, D_value)
        with open(self.rsfile, 'w') as f:
            f.write(output)


class ParseLog(LogModel):
    """Check the URLs of an access log against a stored length model."""

    def __init__(self, logfile, rsfile='logModel.ini'):
        """Same arguments as ``LogModel``; ``self.badreqs`` collects the findings."""
        LogModel.__init__(self, logfile, rsfile)
        self.badreqs = set()

    def do_parse(self):
        """Fill ``self.badreqs`` with a report line for every suspicious URL.

        E(x) and D(x) are read from ``self.rsfile``.  A URL of length L is
        normal when ``abs(L - sqrt(D(x))) <= E(x)``; otherwise the string
        ``"<url>,<abs value>(<E(x)>)"`` is recorded.
        """
        E_value = 0.0
        D_value = 0.0
        with open(self.rsfile, 'r') as f:
            for line in f:
                if line.startswith('E(x):'):
                    E_value = float(line.replace('E(x):', ''))
                elif line.startswith('D(x):'):
                    D_value = float(line.replace('D(x):', ''))
        # NOTE(review): the rule compares |L - sigma| with the mean, exactly
        # as the module header states; confirm that |L - mean| <= k * sigma
        # was not the intended test.
        sigma = math.sqrt(max(D_value, 0.0))
        flagged = set()
        for url in self.urls:
            deviation = abs(len(url) - sigma)
            if deviation > E_value:
                flagged.add(url + ',' + str(deviation) + '(' + str(E_value) + ')')
        self.badreqs = list(flagged)


def main():
    """Run the operation chosen on the command line (CREATE or PARSE)."""
    help()
    if Operation == 'CREATE':  # build the model from the log
        logmodel = LogModel(Logfile)
        logmodel.getRequests()
        logmodel.filterSuccessRequests()
        logmodel.filterDynamicRequets()
        try:
            logmodel.getMatchVal()
        except ValueError as err:
            print("Error: %s" % err)
            sys.exit(1)
        print("Access Log 建模成功")
    if Operation == 'PARSE':  # analyse the log against the stored model
        parselog = ParseLog(Logfile)
        parselog.getRequests()
        parselog.filterSuccessRequests()
        parselog.filterDynamicRequets()
        parselog.do_parse()
        if len(parselog.badreqs) < 1:
            print("All Requests is normal!")
            sys.exit(0)
        for url in parselog.badreqs:
            print(url)


if __name__ == '__main__':
    main()