A recent project needed numbers pulled from the access log: requests per second, how many requests took longer than 60 ms to process, hourly request totals, and so on. So I spent a little time writing a Python script that analyzes the access log and produces a spreadsheet report. Getting straight to the point, the code is below; it is simple and fast, and a day's worth of access logs (a dozen or so GB) can be turned into the corresponding reports within a few minutes.
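The core of it is nothing more than dictionaries keyed by the timestamp truncated to the second, minute, or hour, plus a counter for requests slower than 60 ms. In miniature (a sketch only, assuming the log format shown in the header comment of the script; the file name here is illustrative):

per_second = {}
with open("access-rtb.log") as f:
    for line in f:
        fields = line.split(' ')
        second = fields[3][1:]             # e.g. "09/Mar/2017:00:00:16"
        request_time = fields[4][7:12]     # e.g. "0.002", i.e. $request_time in seconds
        slow = 1 if float(request_time) * 1000 > 60 else 0
        stat = per_second.setdefault(second, {"index": 0, "time_out_index": 0})
        stat["index"] += 1
        stat["time_out_index"] += slow

The full script below does the same for per-minute, per-hour, per-IP, per-path and per-processing-time buckets, and writes everything to an .xls workbook for each hour of the log.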
#coding=utf-8
import os
import xlwt
import time
FILE_NAME = "/alidata1/wwwlogs/rtb_2017/03/access-rtb_20170311.log"
#FILE_NAME = "access-rtb.log"
#Log format: '$remote_addr - $remote_user [$time_local][$request_time] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent" $http_x_forwarded_for'
#100.109.192.22 - - [09/Mar/2017:00:00:16 +0800][0.002] "POST /d_iqiyi HTTP/1.0" 204 0 "-" "-" 123.125.118.42
time_second_statistic = {}
time_min_statistic = {}
time_hour_statistic = {}
remote_ip_statistic = {}
proccess_time_statistic = {}
platform_statistic = {}
def from_this_dir(filename):
    # Resolve filename relative to the directory this script lives in
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), filename)

def time_out_check(proccess_time):
    # Return 1 if the request took longer than 60 ms ($request_time is in seconds), else 0
    if proccess_time == "" or proccess_time == "0":
        return 0
    if 60 < (float(proccess_time) * 1000):
        return 1
    return 0
file_seek_index = 0
del_index = 0
old_time_hour = ""
old_time_day = ""
file_name = ""
time_hour = ""
time_day = ""
line_index = 0
cell_index = 0
file_handle = None
wbk = None
second_sheet = None
min_sheet = None
hour_sheet = None
remoteip_sheet = None
proccess_sheet = None
platform_sheet = None
quit_flag = False
file_name_time_str = ""
while not quit_flag:
    if file_handle == None:
        file_handle = open(FILE_NAME)
        file_handle.seek(file_seek_index)
    line = file_handle.readline()
    file_seek_index = file_handle.tell()
    if line == '':
        # End of file: clear time_hour so the final flush below is triggered
        quit_flag = True
        time_hour = ""
        time_day = ""
    if quit_flag == False:
        line = line.strip('\n')
        strs = line.split(' ')
        time_second = strs[3][1:]                        # e.g. "09/Mar/2017:00:00:16"
        file_name_time_str = time_second
        time_min = time_second[0:len(time_second)-3]     # e.g. "09/Mar/2017:00:00"
        time_hour = time_min[0:len(time_min)-3]          # e.g. "09/Mar/2017:00"
        time_day = time_hour[0:len(time_hour) - 3]
        proccess_time = strs[4][7:12]                    # $request_time, e.g. "0.002"
        remote_ip = strs[len(strs)-1]                    # $http_x_forwarded_for
        platform = strs[6]                               # request path
        retv = time_out_check(proccess_time)
        if time_second not in time_second_statistic:
            time_second_statistic[time_second] = {"index":1, "time_out_index":retv}
        else:
            time_second_statistic[time_second] = \
                {"index":time_second_statistic[time_second]["index"] + 1, "time_out_index":time_second_statistic[time_second]["time_out_index"] + retv}
        if time_min not in time_min_statistic:
            time_min_statistic[time_min] = {"index":1, "time_out_index":retv}
        else:
            time_min_statistic[time_min] = \
                {"index":time_min_statistic[time_min]["index"] + 1, "time_out_index":time_min_statistic[time_min]["time_out_index"] + retv}
        if time_hour not in time_hour_statistic:
            time_hour_statistic[time_hour] = {"index":1, "time_out_index":retv}
        else:
            time_hour_statistic[time_hour] = \
                {"index":time_hour_statistic[time_hour]["index"] + 1, "time_out_index":time_hour_statistic[time_hour]["time_out_index"] + retv}
        if remote_ip not in remote_ip_statistic:
            remote_ip_statistic[remote_ip] = {"index":1, "time_out_index":retv}
        else:
            remote_ip_statistic[remote_ip] = \
                {"index":remote_ip_statistic[remote_ip]["index"] + 1, "time_out_index":remote_ip_statistic[remote_ip]["time_out_index"] + retv}
        if platform not in platform_statistic:
            platform_statistic[platform] = {"index":1, "time_out_index":retv}
        else:
            platform_statistic[platform] = \
                {"index":platform_statistic[platform]["index"] + 1, "time_out_index":platform_statistic[platform]["time_out_index"] + retv}
        if proccess_time not in proccess_time_statistic:
            proccess_time_statistic[proccess_time] = 1
        else:
            proccess_time_statistic[proccess_time] = proccess_time_statistic[proccess_time] + 1
    if time_hour != old_time_hour:
        # Hour boundary (or end of file): dump what has been collected so far,
        # then start a fresh workbook for the new hour
        old_time_hour = time_hour
        line_index = 0
        cell_index = 0
        if wbk != None and file_name != None and file_name != "":
            # Insert the collected data starting from the second row
            temp = sorted(time_second_statistic.items(), key=lambda asd: asd[1]["index"], reverse=True)
            line_index = 1
            for value in temp:
                second_sheet.write(line_index, cell_index, value[0])
                second_sheet.write(line_index, cell_index + 1, value[1]["index"])
                second_sheet.write(line_index, cell_index + 2, value[1]["time_out_index"])
                line_index = line_index + 1
            temp = sorted(time_min_statistic.items(), key=lambda asd: asd[1]["index"], reverse=True)
            line_index = 1
            for value in temp:
                min_sheet.write(line_index, cell_index, value[0])
                min_sheet.write(line_index, cell_index + 1, value[1]["index"])
                min_sheet.write(line_index, cell_index + 2, value[1]["time_out_index"])
                line_index = line_index + 1
            temp = sorted(time_hour_statistic.items(), key=lambda asd: asd[1]["index"], reverse=True)
            line_index = 1
            for value in temp:
                hour_sheet.write(line_index, cell_index, value[0])
                hour_sheet.write(line_index, cell_index + 1, value[1]["index"])
                hour_sheet.write(line_index, cell_index + 2, value[1]["time_out_index"])
                line_index = line_index + 1
            temp = sorted(remote_ip_statistic.items(), key=lambda asd: asd[1]["index"], reverse=True)
            line_index = 1
            for value in temp:
                remoteip_sheet.write(line_index, cell_index, value[0])
                remoteip_sheet.write(line_index, cell_index + 1, value[1]["index"])
                remoteip_sheet.write(line_index, cell_index + 2, value[1]["time_out_index"])
                line_index = line_index + 1
            temp = sorted(platform_statistic.items(), key=lambda asd: asd[1]["index"], reverse=True)
            line_index = 1
            for value in temp:
                platform_sheet.write(line_index, cell_index, value[0])
                platform_sheet.write(line_index, cell_index + 1, value[1]["index"])
                platform_sheet.write(line_index, cell_index + 2, value[1]["time_out_index"])
                line_index = line_index + 1
            temp = sorted(proccess_time_statistic.items(), key=lambda asd: asd[1], reverse=True)
            line_index = 1
            for value in temp:
                proccess_sheet.write(line_index, cell_index, value[0])
                proccess_sheet.write(line_index, cell_index + 1, value[1])
                line_index = line_index + 1
            # Write the workbook to disk, then clear the cached dictionaries
            wbk.save(from_this_dir(file_name))
            time_second_statistic = {}
            time_min_statistic = {}
            time_hour_statistic = {}
            remote_ip_statistic = {}
            proccess_time_statistic = {}
            platform_statistic = {}
            file_handle.close()
            file_handle = None
            wbk = None
            second_sheet = None
            min_sheet = None
            hour_sheet = None
            remoteip_sheet = None
            proccess_sheet = None
            platform_sheet = None
        if quit_flag == False:
            # Start a new Excel file, named after the first timestamp of this hour
            file_name = time.strftime('%Y-%m-%d-%H-%M-%S', time.strptime(file_name_time_str, "%d/%b/%Y:%H:%M:%S")) + ".xls"
            f = open(from_this_dir(file_name), 'w')
            f.close()
            # Create the workbook and its sheets
            wbk = xlwt.Workbook(encoding='utf-8', style_compression=0)
            second_sheet = wbk.add_sheet('Per second', cell_overwrite_ok=True)
            min_sheet = wbk.add_sheet('Per minute', cell_overwrite_ok=True)
            hour_sheet = wbk.add_sheet('Per hour', cell_overwrite_ok=True)
            remoteip_sheet = wbk.add_sheet('Request IP', cell_overwrite_ok=True)
            proccess_sheet = wbk.add_sheet('Processing time', cell_overwrite_ok=True)
            platform_sheet = wbk.add_sheet('ADX platform', cell_overwrite_ok=True)
            # Bold style for the header row
            style = xlwt.XFStyle()
            font = xlwt.Font()
            font.bold = True
            style.font = font
            line_index = 0
            cell_index = 0
            second_sheet.write(line_index, cell_index, 'Time', style)
            second_sheet.write(line_index, cell_index + 1, 'Request count', style)
            second_sheet.write(line_index, cell_index + 2, 'Timeout count', style)
            min_sheet.write(line_index, cell_index, 'Time', style)
            min_sheet.write(line_index, cell_index + 1, 'Request count', style)
            min_sheet.write(line_index, cell_index + 2, 'Timeout count', style)
            hour_sheet.write(line_index, cell_index, 'Time', style)
            hour_sheet.write(line_index, cell_index + 1, 'Request count', style)
            hour_sheet.write(line_index, cell_index + 2, 'Timeout count', style)
            remoteip_sheet.write(line_index, cell_index, 'Request IP', style)
            remoteip_sheet.write(line_index, cell_index + 1, 'Request count', style)
            remoteip_sheet.write(line_index, cell_index + 2, 'Timeout count', style)
            platform_sheet.write(line_index, cell_index, 'Request platform', style)
            platform_sheet.write(line_index, cell_index + 1, 'Request count', style)
            platform_sheet.write(line_index, cell_index + 2, 'Timeout count', style)
            proccess_sheet.write(line_index, cell_index, 'Processing time', style)
            proccess_sheet.write(line_index, cell_index + 1, 'Request count', style)
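As a quick sanity check of the slicing above, feeding the sample line from the header comment through the same split gives the following (a sketch; if your nginx log_format differs, the indices will shift):

sample = ('100.109.192.22 - - [09/Mar/2017:00:00:16 +0800][0.002] '
          '"POST /d_iqiyi HTTP/1.0" 204 0 "-" "-" 123.125.118.42')
strs = sample.split(' ')
print(strs[3][1:])    # 09/Mar/2017:00:00:16 -> per-second key (minute/hour keys drop 3 chars each)
print(strs[4][7:12])  # 0.002                -> $request_time in seconds
print(strs[6])        # /d_iqiyi             -> request path, used as the ADX platform key
print(strs[-1])       # 123.125.118.42       -> $http_x_forwarded_for, used as the request IP

Each hour of log data ends up in its own .xls file, named after the first timestamp seen in that hour and written next to the script itself (via from_this_dir).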
The code uses the xlwt module, available at https://pypi.python.org/pypi/xlwt. Install it with Python's own packaging tools, then just import it.
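For reference, a minimal xlwt sketch (the sheet name, cell values and output file name here are made up) covering everything the script above relies on: a workbook, a sheet, a bold header style and save():

import xlwt

wbk = xlwt.Workbook(encoding='utf-8')
sheet = wbk.add_sheet('demo', cell_overwrite_ok=True)

# Bold style for the header row, same pattern as in the script above
style = xlwt.XFStyle()
font = xlwt.Font()
font.bold = True
style.font = font

sheet.write(0, 0, 'Time', style)              # row 0: header
sheet.write(0, 1, 'Request count', style)
sheet.write(1, 0, '09/Mar/2017:00:00:16')     # row 1: one data row
sheet.write(1, 1, 1234)
wbk.save('demo.xls')                          # xlwt writes the legacy .xls format, not .xlsx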