I threw this script together in a hurry to help a colleague on our R&D team process nginx logs. It is rough, but it gets the job done, and throughput is acceptable: a 1 GB file is processed in under a minute. Posted here for reference.
#!/usr/bin/env python
import os
import re
logdir = r"/data/log20140904/nginx/192.168.1.50"
# Regex fragments for each log field. Each fragment is a named-group body;
# the parentheses that turn them into capture groups are added when the
# full pattern is assembled below.
ip = r"?P<ip>[\d.]*"
timeP = r"?P<time>\[[^\[\]]*\]"
request = r"?P<request>\"[^\"]*\""
status = r"?P<status>\d+"
bodyBytesSent = r"?P<bodyBytesSent>\d+"
refer = r"?P<refer>[^\"]*\"[^\"]*\""
userAgent = r"?P<userAgent>\S*"
forwardr = r"?P<forwardr>[^\"]*"
# Defined but not referenced in the compiled pattern; kept for log formats
# that append request/upstream timing fields.
request_time = r"?P<request_time>[^\"]*"
response_time = r"?P<response_time>[^\"]*"
# Assemble the full pattern. re.VERBOSE ignores unescaped whitespace, so the
# literal spaces between log fields are written as '\ '.
p = re.compile(r"(%s)\ -\ -\ (%s)\ (%s)\ (%s)\ (%s)\ (%s)(%s)\ (%s)"
               % (ip, timeP, request, status, bodyBytesSent, refer, userAgent, forwardr),
               re.VERBOSE)
# Walk every file in the given directory, parse each access-log line, and
# write the extracted fields to a companion .txt file next to the source.
def logfiledir(filedir):
    for file in os.listdir(filedir):
        backfile = filedir + '/' + file.split('.')[0] + '.txt'
        print(backfile)
        logfile = filedir + '/' + file
        if os.path.isfile(logfile):
            # Stream the log line by line instead of readlines(), so a 1 GB
            # file is never loaded into memory at once; both handles are
            # closed automatically when the with-block exits.
            with open(logfile, 'r') as lf, open(backfile, 'w') as bf:
                for line in lf:
                    # Requests from the internal 10.168.x.x range are only
                    # echoed to the console, not written to the result file.
                    tline = re.match(r'^10\.168\..*', line, re.M | re.I)
                    if tline:
                        print(tline.group())
                    else:
                        matchs = p.match(line)
                        if matchs is not None:
                            allGroups = matchs.groups()
                            ip = allGroups[0]
                            # '[04/Sep/2014:16:05:12 +0800]' -> '04/Sep/2014:16:05:12'
                            time = allGroups[1].split()[0][1:]
                            # '"GET /uri HTTP/1.1"' -> '/uri'
                            request = allGroups[2].split()[1]
                            status = allGroups[3]
                            bodyBytesSent = allGroups[4]
                            # drop the scheme prefix; assumes the referer contains a ':'
                            refer = allGroups[5].split(':', 1)[1]
                            userAgent = allGroups[6]
                            forwardr = allGroups[7]
                            bf.write('%s %s %s %s\n' % (ip, time, request, refer))
# Entry point: parse everything under the log directory defined above.
if __name__ == "__main__":
    logfiledir(logdir)
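
Before pointing the script at a production directory, the compiled pattern can be sanity-checked against a single hand-written line. The snippet below is a minimal sketch that reuses the `p` compiled above (drop it in right after the re.compile call, or run it in the same interpreter session). The sample line is fabricated in the standard combined format with an X-Forwarded-For field appended, not taken from real data, so adjust it to match your actual log_format:

# Quick sanity check of the compiled pattern against a fabricated sample line.
# The IP, URL, and user agent below are invented for illustration only.
sample = ('192.168.1.50 - - [04/Sep/2014:16:05:12 +0800] '
          '"GET /index.html HTTP/1.1" 200 612 '
          '"http://www.example.com/" "Mozilla/5.0" "-"')
m = p.match(sample)
if m is not None:
    # groupdict() exposes every named group defined in the fragments above
    print(m.groupdict())
else:
    print('no match - adjust the regex fragments to your log_format')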