用 Python 分析 nginx 的 access 日志

最近项目需要通过访问日志来确认每秒并发量、处理时间超过 60ms 的请求量、每小时处理量等数据,所以花了一点时间用 Python 分析 access 日志,生成相应的数据报表。下面直接上代码:实现简单、运行快速,一天十几个 G 的 access 日志文件,几分钟内就可以得到相应的报表。

#coding=utf-8

import os
import xlwt
import time

# Path of the nginx access log that the script reads.
FILE_NAME = "/alidata1/wwwlogs/rtb_2017/03/access-rtb_20170311.log"
#FILE_NAME = "access-rtb.log"
# Log format: '$remote_addr - $remote_user [$time_local][$request_time] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent" $http_x_forwarded_for'
# Example line:
#100.109.192.22 - - [09/Mar/2017:00:00:16 +0800][0.002] "POST /d_iqiyi HTTP/1.0" 204 0 "-" "-" 123.125.118.42

# Aggregation tables filled while the log is scanned. Except for
# proccess_time_statistic, each maps a key to
# {"index": <request count>, "time_out_index": <requests flagged by time_out_check>}.
time_second_statistic = {}    # keyed by the timestamp down to the second, e.g. "09/Mar/2017:00:00:16"
time_min_statistic = {}       # keyed by the timestamp truncated to the minute
time_hour_statistic = {}      # keyed by the timestamp truncated to the hour
remote_ip_statistic = {}      # keyed by the last space-separated field of the line
proccess_time_statistic = {}  # maps the request-time text (e.g. "0.002") to a plain request count
platform_statistic = {}       # keyed by the 7th space-separated field (the request path, e.g. "/d_iqiyi")

def from_this_dir(filename):
    """Return the path of `filename` joined onto the directory holding this script.

    The directory is taken from the absolute path of ``__file__``, so the
    result does not depend on the current working directory.
    """
    script_path = os.path.abspath(__file__)
    script_dir, _script_name = os.path.split(script_path)
    return os.path.join(script_dir, filename)

def time_out_check(proccess_time, threshold_ms=60):
    """Tell whether a request's processing time exceeds the timeout threshold.

    Args:
        proccess_time: processing time in seconds as text taken from the log
            line, e.g. "0.002".
        threshold_ms: limit in milliseconds; only a time strictly greater
            than this counts as a timeout. Defaults to 60.

    Returns:
        1 if the request timed out, otherwise 0. An int is returned (rather
        than a bool) so callers can add it straight onto a counter. An empty
        string, "0", or text that is not a number yields 0.
    """
    if proccess_time == "" or proccess_time == "0":
        return 0
    try:
        elapsed_ms = float(proccess_time) * 1000
    except ValueError:
        # A non-numeric field (e.g. "-") must not abort a multi-gigabyte run;
        # treat it as "not timed out".
        return 0
    if elapsed_ms > threshold_ms:
        return 1
    return 0

# ---------------------------------------------------------------------------
# Main script: scan FILE_NAME once and write one .xls report per log hour.
#
# Each report is named after the timestamp of the first request seen in that
# hour (YYYY-mm-dd-HH-MM-SS.xls), is stored next to this script, and holds six
# sheets: per second, per minute, per hour, per remote address, per processing
# time and per platform (request path).
# ---------------------------------------------------------------------------

# (lookup key, sheet title, header row), in the order the sheets are added to
# the workbook.
_SHEET_LAYOUT = (
    ("second", '每秒', ('处理时间', '处理请求数', '处理超时数')),
    ("min", '每分钟', ('处理时间', '处理请求数', '处理超时数')),
    ("hour", '每小时', ('处理时间', '处理请求数', '处理超时数')),
    ("remoteip", '请求地址', ('请求地址', '处理请求数', '处理超时数')),
    ("proccess", '处理时长', ('处理时长', '处理请求数')),
    ("platform", 'ADX平台', ('请求平台', '处理请求数', '处理超时数')),
)

# strs[6] (the request path) is the highest field index read from a line.
_MIN_FIELDS = 7


def _count_request(table, key, timed_out):
    """Add one request, and its 0/1 timeout flag, to `table[key]`."""
    entry = table.get(key)
    if entry is None:
        table[key] = {"index": 1, "time_out_index": timed_out}
    else:
        entry["index"] += 1
        entry["time_out_index"] += timed_out


def _write_ranked_rows(sheet, table):
    """Write `table` below the header row, busiest key first.

    Columns are: key, request count, timeout count.
    """
    ranked = sorted(table.items(), key=lambda item: item[1]["index"], reverse=True)
    # Row 0 holds the titles, so data starts at row 1.
    for row, (key, counts) in enumerate(ranked, 1):
        sheet.write(row, 0, key)
        sheet.write(row, 1, counts["index"])
        sheet.write(row, 2, counts["time_out_index"])


def _write_duration_rows(sheet, table):
    """Write the processing-time histogram below the header, most frequent first."""
    ranked = sorted(table.items(), key=lambda item: item[1], reverse=True)
    for row, (duration, count) in enumerate(ranked, 1):
        sheet.write(row, 0, duration)
        sheet.write(row, 1, count)


def _open_workbook():
    """Create a workbook holding the six report sheets with bold header rows.

    Returns:
        (workbook, sheets) where `sheets` maps the lookup keys of
        _SHEET_LAYOUT to the corresponding worksheet.
    """
    book = xlwt.Workbook(encoding='utf-8', style_compression=0)

    # Bold style for the title row.
    style = xlwt.XFStyle()
    font = xlwt.Font()
    font.bold = True
    style.font = font

    sheets = {}
    for key, title, headers in _SHEET_LAYOUT:
        sheet = book.add_sheet(title, cell_overwrite_ok=True)
        for col, header in enumerate(headers):
            sheet.write(0, col, header, style)
        sheets[key] = sheet
    return book, sheets


def _flush_report(book, sheets, path):
    """Write the aggregated tables into `book`, save it to `path`, reset the tables."""
    _write_ranked_rows(sheets["second"], time_second_statistic)
    _write_ranked_rows(sheets["min"], time_min_statistic)
    _write_ranked_rows(sheets["hour"], time_hour_statistic)
    _write_ranked_rows(sheets["remoteip"], remote_ip_statistic)
    _write_ranked_rows(sheets["platform"], platform_statistic)
    _write_duration_rows(sheets["proccess"], proccess_time_statistic)

    book.save(path)

    # Start the next hour from empty tables.
    for table in (time_second_statistic, time_min_statistic, time_hour_statistic,
                  remote_ip_statistic, proccess_time_statistic, platform_statistic):
        table.clear()


old_time_hour = ""
time_hour = ""
file_name = ""
wbk = None
report_sheets = None

with open(FILE_NAME) as file_handle:
    for line in file_handle:
        strs = line.strip('\n').split(' ')
        if len(strs) < _MIN_FIELDS:
            # Blank or truncated line: nothing usable to count.
            continue

        # strs[3] looks like "[09/Mar/2017:00:00:16"; drop the leading "[".
        time_second = strs[3][1:]
        time_min = time_second[:-3]   # strip ":SS"
        time_hour = time_min[:-3]     # strip ":MM"
        # strs[4] looks like "+0800][0.002]"; characters 7..11 are the request time.
        proccess_time = strs[4][7:12]
        remote_ip = strs[-1]
        platform = strs[6]

        # Roll over BEFORE counting, so the first request of an hour lands in
        # that hour's report rather than in the previous one.
        if wbk is None or time_hour != old_time_hour:
            if wbk is not None:
                _flush_report(wbk, report_sheets, from_this_dir(file_name))

            # A new Excel file begins; it is named after this first request.
            file_name = time.strftime(
                '%Y-%m-%d-%H-%M-%S',
                time.strptime(time_second, "%d/%b/%Y:%H:%M:%S")) + ".xls"
            # Create the output file up front: an unwritable target fails here
            # instead of after a whole hour of log has been aggregated.
            with open(from_this_dir(file_name), 'w'):
                pass

            wbk, report_sheets = _open_workbook()
            old_time_hour = time_hour

        retv = time_out_check(proccess_time)
        _count_request(time_second_statistic, time_second, retv)
        _count_request(time_min_statistic, time_min, retv)
        _count_request(time_hour_statistic, time_hour, retv)
        _count_request(remote_ip_statistic, remote_ip, retv)
        _count_request(platform_statistic, platform, retv)
        proccess_time_statistic[proccess_time] = \
            proccess_time_statistic.get(proccess_time, 0) + 1

# End of log: write out the hour that was still being aggregated.
if wbk is not None:
    _flush_report(wbk, report_sheets, from_this_dir(file_name))
    wbk = None
    report_sheets = None

代码中使用了 xlwt 模块,下载地址:https://pypi.python.org/pypi/xlwt 。可以使用 Python 自带的 pip 工具安装(`pip install xlwt`),安装完成后 import 即可。

你可能感兴趣的:(Python)