用Python写一个Nginx和Apache日志分析脚本

#-*-coding:utf-8-*-
#python-2.7.3写的  win下和centos下测试都能通过
#如果centos的python版本只有2.4.3的话,要载入time模块,并把时间格式转换的函数换一下,下面有介绍
#by:Z-Ping
#mail:[email protected]
import re
#import time   #python版本只有2.4.3 去掉import前面的注释
import datetime
class WebLogFormat:
    """Open an access log and hold the regex used to parse its lines.

    The pattern expects lines shaped like::

        ip identd auth [date:time tz] "request" status bytes

    optionally followed by a referrer, a client (user agent) and a cookie
    field.  Named groups: origin, identd, auth, date, time, tz, method,
    path, protocol, status, bytes, referrer, client, cookie.  ``origin``
    only matches dotted-quad (IPv4) addresses.

    Attributes:
        WebFile: the log file, opened for reading in text mode.
        LogFormat: the compiled regular expression described above.

    The instance owns ``WebFile``; call ``close()`` when done, or use the
    instance as a context manager (``with WebLogFormat(path) as log:``).
    """

    def __init__(self, filename):
        """Compile the line pattern and open ``filename`` for reading.

        Raises:
            IOError/OSError: if the file cannot be opened.
        """
        # Compile first so a failure here can never leave an open file behind.
        self.LogFormat = re.compile(
            r'(?P<origin>\d+\.\d+\.\d+\.\d+) '
            r'(?P<identd>-|\w+) (?P<auth>-|\w+) '
            r'\[(?P<date>[^\[\]:]+):(?P<time>\d+:\d+:\d+) (?P<tz>[\-\+]?\d\d\d\d)\] '
            r'"(-|((?P<method>\w+) (?P<path>[\S]+) (?P<protocol>[^"]+))|[^"]+)" (?P<status>\d+) (?P<bytes>-|\d+)'
            r'( (?P<referrer>-|"[^"]*")( (?P<client>-|"[^"]*")( (?P<cookie>-|"[^"]*"))?)?)?\s*\Z')
        self.WebFile = open(filename, 'r')

    def close(self):
        """Close the underlying log file (safe to call more than once)."""
        self.WebFile.close()

    def __enter__(self):
        """Return the instance itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the log file on leaving the ``with`` block.

        Returns False so that an exception raised inside the block is
        propagated rather than swallowed.
        """
        self.close()
        return False
                                                                       
class about_ip(WebLogFormat):
    """Queries over an access log: per-IP hit counts, field filter, time window.

    Every query streams the file line by line, so nothing proportional to
    the log size is kept in memory (only the per-IP counters in
    ``get_ipdict``).  Each query rewinds the file first, so several queries
    can be run on the same instance.  Lines that do not match
    ``self.LogFormat`` are skipped instead of aborting the run.
    """

    def __init__(self, filename):
        """Open ``filename`` and set up the line parser (see WebLogFormat)."""
        WebLogFormat.__init__(self, filename)

    def _matched_lines(self):
        """Yield ``(line, match)`` for each parsable line, from the file start."""
        try:
            # A previous query leaves the file at EOF; start over.
            self.WebFile.seek(0)
        except (IOError, OSError):
            # Non-seekable input (e.g. a pipe): read from the current position.
            pass
        search = self.LogFormat.search
        for line in self.WebFile:
            match = search(line)
            if match is not None:
                yield line, match

    def get_ipdict(self):
        """Count requests per client IP.

        Returns:
            A list of ``(ip, hits)`` tuples sorted by ``hits``, highest first.
        """
        hits = {}
        for _line, match in self._matched_lines():
            ip = match.group('origin')
            hits[ip] = hits.get(ip, 0) + 1
        # .items() works on both Python 2 and 3 (iteritems() is 2.x only).
        return sorted(hits.items(), key=lambda item: item[1], reverse=True)

    def get_errorip(self, status, sss):
        """Print every log line whose field ``status`` equals ``sss``.

        Matching lines are printed rather than collected because the result
        set of a large log can itself be hundreds of megabytes.

        Args:
            status: name of a group in the line pattern (e.g. ``'status'``,
                ``'origin'``, ``'method'``).
            sss: the exact string value the field must have.

        Raises:
            KeyError: if ``status`` is not a group name of the pattern.
        """
        for line, match in self._matched_lines():
            if match.groupdict()[status] == sss:
                print(line)

    def get_time(self, starttime, endtime):
        """Count the log lines whose timestamp lies in ``[starttime, endtime]``.

        Prints the wall-clock time before and after the scan and the number
        of matching lines.  The ``tz`` field of the log line is not used:
        timestamps are compared as written in the log.

        Args:
            starttime: inclusive lower bound, formatted ``YYYYmmddHHMMSS``.
            endtime: inclusive upper bound, formatted ``YYYYmmddHHMMSS``.

        Returns:
            The number of matching lines.

        Raises:
            ValueError: if a bound, or the timestamp of a matched log line,
                cannot be parsed.
        """
        print(datetime.datetime.now())
        m_format = '%Y%m%d%H%M%S'
        # Explicit %H:%M:%S instead of %X, whose layout depends on the locale.
        time_format = '%d/%b/%Y:%H:%M:%S'

        # Author's note for Python 2.4.3, which lacks datetime.strptime:
        # enable "import time" and replace each strptime(text, fmt) call with
        # datetime.datetime.fromtimestamp(time.mktime(time.strptime(text, fmt)))
        starttime = datetime.datetime.strptime(starttime, m_format)
        endtime = datetime.datetime.strptime(endtime, m_format)

        total = 0
        for _line, match in self._matched_lines():
            stamp = match.group('date') + ':' + match.group('time')
            ptime = datetime.datetime.strptime(stamp, time_format)
            if starttime <= ptime <= endtime:
                total += 1

        print('总共%s条记录' % total)
        print(datetime.datetime.now())
        return total
                                                       
# Example run: count the requests logged between 00:00:00 and 00:59:59
# on 2013-05-08 (bounds are given as YYYYmmddHHMMSS strings).
x = about_ip(r'F:\access_20130508.log')
x.get_time('20130508000000', '20130508005959')

在生产环境里面一次跑了64G的日志,终于跑顺了。尼玛累啊!!

你可能感兴趣的:(python,日志分析)