# -*- coding:UTF-8 -*-
"""
access_log 建模:
根据request长度来生成白名单(基于统计的方法)
第一阶段:训练阶段 application profiling分析对象:access log分析方法:
(1) 去重
(2) 筛掉响应码为4xx,5xx的请求 (或者选择2xx,3xx的请求)
(3) 筛掉静态资源请求
第二阶段:检测阶段on-line learning statistical model 目标:
(1)request长度
方法: 若 request 长度不满足不等式 |len(request)-D(x)**0.5| <= E(x),则标记为可疑的攻击请求
"""
import os,sys,math
import re,string
import numpy
def help():
    """Parse command-line options into the module globals Operation/Logfile.

    Recognized options:
        -o, --operation <create|parse>  create: build the model; parse: analyze a log
        -f, --file <logfile>            log file to analyze (absolute path)

    Exits via sys.exit(0) after printing usage when no arguments are given
    or when a required option is missing.
    """
    global Operation, Logfile
    if not sys.argv[1:]:
        # Single-argument print(...) behaves identically under Python 2 and 3.
        print("Usage: python parse_accesslog.py [OPTIONS]")
        print("Options are:")
        print("-o, --operation <create|parse> create:建模|parse 分析日志")
        print("-f, --file <logfile> 需要分析的日志文件(绝对路径)")
        sys.exit(0)
    Operation = 'None'
    Logfile = 'None'
    i = 1
    while i < len(sys.argv):
        arg = sys.argv[i]
        if arg in ('-o', '--operation'):
            i += 1
            Operation = sys.argv[i].upper()
        elif arg in ('-f', '--file'):
            i += 1
            Logfile = sys.argv[i]
        # Unrecognized tokens are silently skipped (original behavior).
        i += 1
    if Operation == 'None':
        print("请选择操作方式:create or parse?")
        sys.exit(0)
    if Logfile == 'None':
        print("请提供需要分析的日志文件(绝对路径)!")
        sys.exit(0)
class LogModel(object):
    """Build a statistical whitelist model (mean/variance of request-URL
    length) from an access log.

    Pipeline: getRequests -> filterSuccessRequests -> filterDynamicRequets
    -> getMatchVal, which writes E(x) and D(x) to self.rsfile.
    """

    def __init__(self, logfile):
        self.logfile = logfile
        self.rsfile = 'logModel.ini'   # model output file: E(x)/D(x)
        self.reqs = set()              # deduplicated (url, status) pairs
        self.static_elements = ("gif", "jpeg", "png", "bmp", "ico",
                                "js", "css", "htm", "html")
        self.success_urls = set()      # URLs whose status was 2xx/3xx
        self.urls = set()              # final (dynamic) URLs to model

    def getRequests(self):
        """Read the log, collecting deduplicated (url, status) pairs.

        NOTE(review): assumes that after a whitespace split the requested
        URL is field 7 and the status code is field 9 -- confirm against
        the actual log format.
        """
        with open(self.logfile, "r") as base_log:
            for line in base_log:
                fields = line.split()
                url = fields[7]       # requested URL
                status = fields[9]    # HTTP status code
                self.reqs.add((url, status))
        self.reqs = list(self.reqs)

    def filterSuccessRequests(self):
        """Keep only URLs whose status code is 2xx or 3xx."""
        for url, status in self.reqs:
            if re.match(r'[23]\d\d$', str(status)):
                self.success_urls.add(url)
        self.success_urls = list(self.success_urls)

    def filterDynamicRequets(self):
        """Drop requests for static resources, identified by file extension."""
        # Anchor on a literal dot so a URL like '/getpng' is not mistaken
        # for an image request; only real '.png'-style suffixes are filtered.
        pattern = re.compile(r'\.(' + '|'.join(self.static_elements) + r')$')
        for url in self.success_urls:
            if not pattern.search(url):
                self.urls.add(url)
        self.urls = list(self.urls)

    def getMatchVal(self):
        """Compute E(x) (mean) and D(x) (population variance) of the URL
        lengths and write them to self.rsfile as two lines:
        'E(x):<value>' and 'D(x):<value>'.
        """
        if not self.urls:
            # Empty model: write zeros instead of dividing by zero.
            E_value = 0.0
            D_value = 0.0
        else:
            # float dtype avoids Python 2 integer division truncating the
            # mean and biasing the variance.
            lengths = numpy.array([len(url) for url in self.urls], dtype=float)
            E_value = lengths.mean()
            D_value = lengths.var()   # population variance: E[x^2] - E[x]^2
        output = "E(x):%f\nD(x):%f" % (E_value, D_value)
        with open(self.rsfile, 'w') as f:
            f.write(output)
class ParseLog(LogModel):
    """Detection phase: check each modeled URL's length against the stored
    E(x)/D(x) and collect suspicious requests in self.badreqs."""

    def __init__(self, logfile):
        LogModel.__init__(self, logfile)
        self.badreqs = set()   # suspicious "url,deviation(threshold)" strings

    def do_parse(self):
        """Load E(x)/D(x) from self.rsfile and flag every URL whose length L
        fails |L - sqrt(D(x))| <= E(x), storing it in self.badreqs.

        NOTE(review): the classic Chebyshev-style check would compare
        |L - E(x)| against k*sqrt(D(x)); this implementation follows the
        module docstring's formula as written -- confirm intent.
        """
        E_value = 0.0
        D_value = 0.0
        with open(self.rsfile, 'r') as f:
            for line in f:
                # string.atof() was removed in Python 3; float() is equivalent.
                if line.startswith('E'):
                    E_value = float(line.replace('E(x):', ''))
                elif line.startswith('D'):
                    D_value = float(line.replace('D(x):', ''))
        for url in self.urls:
            L = len(url)
            deviation = abs(L - math.sqrt(D_value))
            if deviation > E_value:
                # Outside the whitelist band: mark as suspicious.
                self.badreqs.add(url + ',' + str(deviation) +
                                 '(' + str(E_value) + ')')
        self.badreqs = list(self.badreqs)
if __name__ == '__main__':
    help()  # populates the module globals Operation and Logfile, or exits
    if Operation == 'CREATE':
        # Model-building phase: dedupe, keep 2xx/3xx, drop static, fit E/D.
        logmodel = LogModel(Logfile)
        logmodel.getRequests()
        logmodel.filterSuccessRequests()
        logmodel.filterDynamicRequets()
        logmodel.getMatchVal()
        print("Access Log 建模成功")
    elif Operation == 'PARSE':
        # Detection phase: same preprocessing, then the length-based check.
        parselog = ParseLog(Logfile)
        parselog.getRequests()
        parselog.filterSuccessRequests()
        parselog.filterDynamicRequets()
        parselog.do_parse()
        if not parselog.badreqs:
            print("All requests are normal!")
            sys.exit(0)
        for url in parselog.badreqs:
            print(url)