# A simple script for automatically processing log files

import random
import datetime
import time
from queue import Queue
import threading
import re
from pathlib import Path
from user_agents import parse

# Regular expression for one access-log line.
# The group names datetime, status, size and useragent are the ones the
# converters and handlers in this file look up; remote, method, url and
# protocol are descriptive names for the remaining captures
# (NOTE(review): confirm these four against any external consumer).
# Raw strings keep \d, \S and \[ from being read as string escapes.
pattern = (
    r'(?P<remote>[\d.]{7,15}) \S+ - \[(?P<datetime>[^\[\]]+)\] '
    r'"(?P<method>[^" ]+) (?P<url>[^" ]+) (?P<protocol>[^" ]+)" '
    r'(?P<status>\d+) (?P<size>\d+) \S+ "(?P<useragent>[^"]*)"'
)
regex = re.compile(pattern)

# Converters applied to the raw captured strings, keyed by regex group name.
# Groups without an entry here are left as plain strings by extract().
ops = dict(
    datetime=lambda dstr: datetime.datetime.strptime(dstr, '%d/%b/%Y:%H:%M:%S %z'),
    status=int,
    size=int,
    # Kept as a lambda so `parse` is looked up at call time, as before.
    useragent=lambda ua: parse(ua),
)

def extract(line:str) -> dict:
    """
    Parse one log line into a dict of converted fields.

    Every named group captured by the module-level ``regex`` is run through
    its converter from ``ops``; groups with no converter keep the raw string.

    :param line: one line of log text
    :return: dict keyed by group name, or None when the line does not match
    """
    matcher = regex.match(line)
    if not matcher:
        return None

    fields = {}
    for name, raw in matcher.groupdict().items():
        convert = ops.get(name)
        # No registered converter: pass the captured text through unchanged.
        fields[name] = raw if convert is None else convert(raw)
    return fields

def loadfile(filename:str,encoding='utf-8'):
    """
    Yield the parsed fields of each matching line in one log file.

    :param filename: path of the file to read
    :param encoding: text encoding used to open the file
    :return: generator of dicts produced by ``extract``; lines for which
             ``extract`` does not return a dict are skipped
    """
    with open(filename, encoding=encoding) as handle:
        for raw_line in handle:
            record = extract(raw_line)
            if not isinstance(record, dict):
                continue
            yield record

def load(*paths,encoding='utf-8',ext='*.log',glob=False):
    """
    Load parsed log records from any mix of files and directories.

    :param paths: file or directory paths; paths that are neither are ignored
    :param encoding: text encoding forwarded to ``loadfile`` for every file
    :param ext: a glob pattern, or an iterable of patterns, used to select
                files inside directories
    :param glob: when true, directories are searched recursively (``rglob``);
                 otherwise only their direct children are matched
    :return: generator of the dicts yielded by ``loadfile``
    """
    # Normalise a single pattern to a list once, instead of inside the loop.
    patterns = [ext] if isinstance(ext, str) else ext

    for p in paths:
        path = Path(p)
        if path.is_dir():
            for e in patterns:
                logs = path.rglob(e) if glob else path.glob(e)
                for log in logs:
                    # A pattern can also match sub-directories; opening one would fail.
                    if not log.is_file():
                        continue
                    yield from loadfile(str(log.absolute()), encoding=encoding)

        elif path.is_file():
            yield from loadfile(str(path.absolute()), encoding=encoding)

# Simulated data source
def source(seconds=1):
    """
    Endlessly generate random sample records.

    Each record is a dict with the current time in UTC+8 under 'datetime'
    and a random integer from 1 to 100 under 'value'.

    :param seconds: pause between consecutive records, in seconds
    """
    tz = datetime.timezone(datetime.timedelta(hours=8))
    while True:
        sample = {
            'datetime': datetime.datetime.now(tz),
            'value': random.randint(1, 100),
        }
        yield sample
        time.sleep(seconds)

# Sliding time-window function
def window(q:Queue,handler,width:int,interval:int):
    """
    Feed queued records to a handler over a sliding time window.

    Loops forever: blocks on ``q.get()``, buffers every truthy record, and
    whenever the newest record's timestamp is more than ``interval`` seconds
    past the previous processing point, calls ``handler`` with the buffered
    records and prints the result. Afterwards only records newer than
    ``width - interval`` seconds before the newest one are kept.

    The first processing point is taken from the data itself (the first
    record triggers the handler immediately) rather than from a fixed
    calendar date, so records with any timestamp are handled.

    :param q: data source, a Queue of dicts that carry a 'datetime' key
    :param handler: callable that receives the list of buffered records
    :param width: width of the time window, in seconds
    :param interval: time between two handler runs, in seconds
    """
    buf = []
    start = None    # timestamp of the last handler run; None before the first run
    current = None  # timestamp of the newest record seen so far
    delta = datetime.timedelta(seconds=width - interval)
    while True:
        data = q.get()
        if data:
            buf.append(data)  # keep in the buffer until it falls out of the window
            current = data['datetime']

        if current is None:
            # Nothing with a timestamp has arrived yet.
            continue

        if start is None or (current - start).total_seconds() > interval:
            ret = handler(buf)
            print(ret)
            start = current

            # Records overlapping the next window stay; older ones are dropped.
            cutoff = current - delta
            buf = [x for x in buf if x['datetime'] > cutoff]


def avg_handler(iterable):
    """
    Return the arithmetic mean of the 'value' entries of the given records.

    :param iterable: sized collection of dicts with a 'value' key
    :raises ZeroDivisionError: when the collection is empty
    """
    total = sum(record['value'] for record in iterable)
    return total / len(iterable)

# Status code analysis
def status_handler(iterable):
    """
    Return the share of records per status code.

    :param iterable: sized collection of dicts with a 'status' key
    :return: dict mapping each status to its count divided by the number
             of records; empty when there are no records
    """
    counts = {}
    for record in iterable:
        code = record['status']
        if code in counts:
            counts[code] += 1
        else:
            counts[code] = 1

    n_records = len(iterable)
    return {code: hits / n_records for code, hits in counts.items()}

# Running totals per (browser family, version), accumulated across all calls
# to browser_handler.
allbrowsers = {}

# Browser analysis
def browser_handler(iterable):
    """
    Count browsers in the given records and update the running totals.

    Each record's 'useragent' value must expose ``browser.family`` and
    ``browser.version_string``. The ten most frequent entries of the
    module-level ``allbrowsers`` totals are printed on every call.

    :param iterable: records, each a dict with a 'useragent' key
    :return: dict mapping (family, version_string) to its count in this call
    """
    seen = {}
    for record in iterable:
        browser = record['useragent'].browser
        ident = (browser.family, browser.version_string)
        # Bump both the per-call tally and the global running tally.
        for tally in (seen, allbrowsers):
            tally[ident] = tally.get(ident, 0) + 1

    ranking = sorted(allbrowsers.items(), key=lambda pair: pair[1], reverse=True)
    print(ranking[:10])
    return seen

def dispatcher(src):
    """
    Build a fan-out that sends every item of ``src`` to each registered handler.

    :param src: iterable of records to distribute
    :return: ``(reg, run)``; call ``reg`` once per handler, then ``run``
    """
    workers = []   # one window thread per registered handler
    channels = []  # the queue feeding each worker, in registration order

    def reg(handler,width:int,interval:int):
        """Register ``handler`` with its own queue and a ``window`` thread."""
        channel = Queue()
        channels.append(channel)
        worker = threading.Thread(target=window, args=(channel, handler, width, interval))
        workers.append(worker)

    def run():
        """
        Start all worker threads, then copy each item of ``src`` to every queue.

        NOTE(review): the threads are created without a daemon flag and
        ``window`` loops on its queue, so they appear to keep running after
        ``src`` is exhausted; confirm whether that is intended.
        """
        for worker in workers:
            worker.start()

        for item in src:
            for channel in channels:
                channel.put(item)

    return reg, run

if __name__ == "__main__":
    # Stream the parsed records of the sample log into two window analyses:
    # status shares (10 s window, every 5 s) and browser counts (5 s, every 5 s).
    log_path = 'test.log'
    register, start = dispatcher(load(log_path))
    for handler, width, interval in ((status_handler, 10, 5), (browser_handler, 5, 5)):
        register(handler, width, interval)
    start()

 

# You may also be interested in: (python)