#-*-coding:utf-8-*-
# Written for Python 2.7.3; tested on Windows and CentOS.
# If the CentOS box only ships Python 2.4.3, datetime.datetime.strptime is
# not available: add `import time` and change _parse_timestamp as described
# in its comment below.
# by:Z-Ping
# mail:[email protected]
import re
import datetime


def _parse_timestamp(text, fmt):
    """Parse ``text`` with the strptime format ``fmt`` into a naive datetime."""
    # Python 2.4.3 has no datetime.datetime.strptime; there, use instead:
    #   datetime.datetime.fromtimestamp(time.mktime(time.strptime(text, fmt)))
    return datetime.datetime.strptime(text, fmt)


class WebLogFormat(object):
    """Open a web access log and provide the regex used to parse its lines.

    The file is opened in ``__init__`` and kept in ``self.WebFile``.  Call
    ``close()`` (or use the instance in a ``with`` block) when done.
    """

    # Compiled once for the class; still reachable as ``self.LogFormat``.
    # Named groups: origin, identd, auth, date, time, tz, method, path,
    # protocol, status, bytes, referrer, client, cookie.
    LogFormat = re.compile(
        r'(?P<origin>\d+\.\d+\.\d+\.\d+) '
        r'(?P<identd>-|\w+) (?P<auth>-|\w+) '
        r'\[(?P<date>[^\[\]:]+):(?P<time>\d+:\d+:\d+) (?P<tz>[\-\+]?\d\d\d\d)\] '
        r'"(-|((?P<method>\w+) (?P<path>[\S]+) (?P<protocol>[^"]+))|[^"]+)" (?P<status>\d+) (?P<bytes>-|\d+)'
        r'( (?P<referrer>-|"[^"]*")( (?P<client>-|"[^"]*")( (?P<cookie>-|"[^"]*"))?)?)?\s*\Z')

    def __init__(self, filename):
        """Open ``filename`` for reading; raises IOError/OSError if it cannot be opened."""
        self.WebFile = open(filename, 'r')

    def close(self):
        """Close the underlying log file."""
        self.WebFile.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def _matched_lines(self):
        """Yield ``(line, match)`` for every line that fits ``LogFormat``.

        The file is rewound first so that several queries can be run on the
        same instance.  Lines that do not match the pattern are skipped
        instead of aborting the whole scan.
        """
        self.WebFile.seek(0)
        search = self.LogFormat.search
        for line in self.WebFile:
            match = search(line)
            if match is not None:
                yield line, match


class about_ip(WebLogFormat):
    """Queries over an access log: hits per IP, field filter, time window."""

    def get_ipdict(self):
        """Return ``[(ip, hits), ...]`` sorted by hit count, highest first."""
        hits = {}
        for _line, match in self._matched_lines():
            origin = match.group('origin')
            hits[origin] = hits.get(origin, 0) + 1
        return sorted(hits.items(), key=lambda pair: pair[1], reverse=True)

    def get_errorip(self, status, sss):
        """Print every log line whose field ``status`` equals ``sss``.

        ``status`` is the name of a regex group (e.g. ``'status'``) and
        ``sss`` the string to compare against.  Lines are printed rather than
        collected because the result may be hundreds of MB or more.

        Raises:
            KeyError: if ``status`` is not a group name of ``LogFormat``.
        """
        for line, match in self._matched_lines():
            if match.groupdict()[status] == sss:
                print(line)

    def get_time(self, starttime, endtime):
        """Count the log records whose timestamp lies in ``[starttime, endtime]``.

        Both bounds are strings in ``YYYYmmddHHMMSS`` form and are inclusive.
        The log's timezone field is not taken into account.  The current time
        is printed before and after the scan, and the record count is printed
        and returned.

        Raises:
            ValueError: if a bound or a log timestamp cannot be parsed.
        """
        print(datetime.datetime.now())
        m_format = '%Y%m%d%H%M%S'
        # Explicit %H:%M:%S instead of the locale-dependent %X.
        time_format = '%d/%b/%Y:%H:%M:%S'
        starttime = _parse_timestamp(starttime, m_format)
        endtime = _parse_timestamp(endtime, m_format)
        total = 0
        for _line, match in self._matched_lines():
            stamp = match.group('date') + ':' + match.group('time')
            ptime = _parse_timestamp(stamp, time_format)
            if starttime <= ptime <= endtime:
                total += 1
        print('总共%s条记录' % total)
        print(datetime.datetime.now())
        return total


if __name__ == '__main__':
    x = about_ip(r'F:\access_20130508.log')
    try:
        x.get_time(starttime='20130508000000', endtime='20130508005959')
    finally:
        x.close()
# 备注:在生产环境里一次性跑了 64G 的日志,终于跑顺了,真累啊!