进入PyCharm终端:
pip install user-agents pyyaml ua-parser
完整代码:
from pathlib import Path
import datetime
import re
from user_agents import parse
from queue import Queue
import threading
from collections import defaultdict
#192.168.56.1 - - [18/Mar/2019:10:55:04 +0800] "POST /zabbix/jsrpc.php?output=json-rpc HTTP/1.1" 200 64 "-" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"
#日志格式处理(数据清洗)
# Access-log line pattern, compiled once at import time instead of per line.
# The group names `datetime`, `request`, `status`, `size` and `useragent`
# match the keys of the `ops` conversion table; `remote` and `referer` are
# labels for the two fields that have no converter.
LOG_LINE_REGEX = re.compile(
    r'(?P<remote>[\d\.]{7,}) - - \[(?P<datetime>[^\[\]]+)\] '
    r'"(?P<request>[^"]+)" (?P<status>\d+) (?P<size>\d+) '
    r'"(?P<referer>[^"]+)" "(?P<useragent>[^"]+)"'
)


def extract(line: str) -> dict:
    """Parse one access-log line into a dict of converted fields.

    Each named group captured by ``LOG_LINE_REGEX`` is passed through the
    converter registered for that name in ``ops``; groups without a
    converter are kept as the raw matched string.

    :param line: a single line of the log file.
    :return: mapping of field name to converted value, or ``None`` when the
        line does not match the expected format.
    """
    matcher = LOG_LINE_REGEX.match(line)
    if matcher is None:
        return None
    return {k: ops.get(k, lambda x: x)(v) for k, v in matcher.groupdict().items()}
def convers_time(src: str):
    """Convert a log timestamp such as ``18/Mar/2019:10:55:04 +0800``.

    :param src: timestamp text in ``%d/%b/%Y:%H:%M:%S %z`` layout.
    :return: the corresponding timezone-aware ``datetime.datetime``.
    """
    return datetime.datetime.strptime(src, '%d/%b/%Y:%H:%M:%S %z')
def convers_request(src: str):
    """Split a request line into its whitespace-separated parts.

    The parts are paired, in order, with the keys ``method``, ``url`` and
    ``protocol``; pairing stops at the shorter of the two sequences.

    :param src: request text, e.g. ``POST /index.html HTTP/1.1``.
    :return: dict mapping the keys above to the matching parts.
    """
    field_names = ('method', 'url', 'protocol')
    return dict(zip(field_names, src.split()))
def convers_useragent(src: str):
    """Hand the raw User-Agent string to ``user_agents.parse``.

    :param src: the User-Agent header text.
    :return: whatever ``parse`` returns for that string.
    """
    return parse(src)
# Converter table: captured field name -> callable applied to the raw text.
# Fields not listed here are left as strings by `extract`.
ops = {
    'datetime': convers_time,
    'request': convers_request,
    'status': int,
    'size': int,
    'useragent': convers_useragent,
}
def _iter_log_records(file_path):
    """Yield the parsed record for every line of *file_path* that `extract` accepts."""
    with open(file_path) as f:
        for line in f:
            ret = extract(line)
            if ret:
                yield ret
            # TODO: count how many lines fail to parse


def openfile(pathstr: str):
    """Generate parsed log records from a file or from the files in a directory.

    If *pathstr* is a regular file, its lines are parsed. If it is a
    directory, every regular file directly inside it is parsed (entries that
    are not regular files are skipped; there is no recursion). Otherwise a
    "file not found" message is printed and nothing is yielded.

    :param pathstr: path of a log file or of a directory containing log files.
    :return: generator of the dicts produced by `extract`.
    """
    path = Path(pathstr)
    if path.is_file():
        yield from _iter_log_records(path)
    elif path.is_dir():
        for item in path.iterdir():
            # Test and open the directory entry itself, not the directory.
            if item.is_file():
                yield from _iter_log_records(item)
    else:
        print(pathstr, 'file not found')
#####################################################################################################
#滑动窗口
def window(src: Queue, handler, width: int = 20, interval: int = 20):
    """Consume records from *src* and run *handler* over a sliding time window.

    Records are buffered as they arrive. Whenever the ``datetime`` of the
    latest record is at least *interval* seconds past the previous trigger
    point, ``handler(buffer)`` is called, its result is printed, and the
    buffer is trimmed to the records newer than ``width - interval`` seconds
    before the latest record.

    The loop has no exit condition: once the arguments are accepted this
    function blocks on ``src.get()`` forever.

    :param src: queue of record dicts, each carrying a ``datetime`` key.
    :param handler: callable receiving the list of buffered records.
    :param width: window size in seconds.
    :param interval: seconds between handler invocations; must not exceed *width*.
    :raises ValueError: if *width* is smaller than *interval*.
    """
    if width < interval:
        raise ValueError(
            'width ({}) must be >= interval ({})'.format(width, interval)
        )

    # Start far in the past so the first real record triggers the handler.
    start = datetime.datetime.strptime('1970/01/01 01:01:01 +0800', '%Y/%m/%d %H:%M:%S %z')
    # Initial value keeps the comparison below valid if a falsy item arrives
    # before any real record.
    current = datetime.datetime.strptime('1970/01/01 01:01:02 +0800', '%Y/%m/%d %H:%M:%S %z')
    # Overlap between consecutive windows.
    delta = datetime.timedelta(seconds=width - interval)
    buffer = []
    while True:
        data = src.get()  # blocks until an item is available
        if data:
            buffer.append(data)
            current = data['datetime']
        if (current - start).total_seconds() >= interval:
            ret = handler(buffer)
            print(ret)
            start = current
            # Keep only the records that belong to the overlap with the next window.
            buffer = [i for i in buffer if i['datetime'] > current - delta]
#####################################################################################################
#日志分析操作(数据分析)---消费者
#默认操作
def donothing_handler(iterable: list):
    """Default handler: hand back the received records untouched.

    :param iterable: the records collected for the current window.
    :return: the very same object that was passed in.
    """
    return iterable
#状态码分析操作
def status_handler(iterable: list):
    """Compute the percentage share of each status code in a window.

    :param iterable: records, each a mapping with a ``status`` key.
    :return: dict mapping every status seen to its share of the records,
        expressed as a percentage; empty when no records were given.
    """
    hits_by_status = defaultdict(int)
    for record in iterable:
        hits_by_status[record['status']] += 1
    total = sum(hits_by_status.values())
    return {
        status: hits / total * 100
        for status, hits in hits_by_status.items()
    }
#浏览器分析
def browser_handler(iterable: list):
    """Count the records of a window per browser family and version.

    :param iterable: records whose ``useragent`` value exposes
        ``browser.family`` and ``browser.version_string``.
    :return: defaultdict mapping ``(family, version_string)`` to the number
        of records carrying that browser.
    """
    tally = defaultdict(lambda: 0)
    for record in iterable:
        browser = record['useragent'].browser
        tally[(browser.family, browser.version_string)] += 1
    return tally
# gn = window(*['test.log'],handler=donothing_handler)
#分发器
def dispacher(src):
    """Build a fan-out dispatcher over the record source *src*.

    :param src: iterable of records to distribute.
    :return: a ``(reg, run)`` pair. ``reg(handler, width, interval)`` creates
        a queue plus a thread that runs ``window`` on it; ``run()`` starts
        all registered threads and then copies every record of *src* into
        every registered queue.
    """
    queues = []
    threads = []

    def reg(handler, width, interval):
        # One private queue and one consumer thread per registered handler.
        inbox = Queue()
        queues.append(inbox)
        worker = threading.Thread(
            target=window,
            args=(inbox, handler, width, interval),
        )
        threads.append(worker)

    def run():
        for worker in threads:
            worker.start()

        # Every consumer receives every record.
        for record in src:
            for inbox in queues:
                inbox.put(record)

    return reg, run
if __name__ == "__main__":
    import sys

    # Log path comes from the first command-line argument when one is given;
    # otherwise fall back to the sample file 'test.log'.
    path = sys.argv[1] if len(sys.argv) > 1 else 'test.log'
    reg, run = dispacher(openfile(path))
    # Report the status-code distribution over 20-second windows every 20 seconds.
    reg(status_handler, 20, 20)
    run()
运行结果:
{200: 100.0}
{200: 50.0, 403: 50.0}
{200: 50.0, 404: 50.0}
test.log素材内容如下:
192.168.56.1 - - [18/Mar/2019:10:55:04 +0800] "POST /zabbix/jsrpc.php?output=json-rpc HTTP/1.1" 200 64 "http://192.168.56.101:888/zabbix/zabbix.php?action=problem.view&ddreset=1" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"
192.168.56.2 - - [18/Mar/2019:10:55:14 +0800] "POST /zabbix/jsrpc.php?output=json-rpc HTTP/1.1" 403 64 "http://192.168.56.101:888/zabbix/zabbix.php?action=problem.view&ddreset=1" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"
192.168.56.3 - - [18/Mar/2019:10:55:24 +0800] "POST /zabbix/jsrpc.php?output=json-rpc HTTP/1.1" 200 64 "http://192.168.56.101:888/zabbix/zabbix.php?action=problem.view&ddreset=1" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"
192.168.56.4 - - [18/Mar/2019:10:55:34 +0800] "POST /zabbix/jsrpc.php?output=json-rpc HTTP/1.1" 404 64 "http://192.168.56.101:888/zabbix/zabbix.php?action=problem.view&ddreset=1" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"
192.168.56.6 - - [18/Mar/2019:10:55:44 +0800] "POST /zabbix/jsrpc.php?output=json-rpc HTTP/1.1" 200 64 "http://192.168.56.101:888/zabbix/zabbix.php?action=problem.view&ddreset=1" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"
192.168.56.7 - - [18/Mar/2019:10:55:54 +0800] "POST /zabbix/jsrpc.php?output=json-rpc HTTP/1.1" 403 64 "http://192.168.56.101:888/zabbix/zabbix.php?action=problem.view&ddreset=1" "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.84 Safari/537.36"