日志格式为:(需获取标红的内容) - - [28/Nov/2013:04:41:26 +0800] "GET /dj.gif?id=15904e7e858c438614266404a0b9e3dc&dm=news.master.com&ul=http%253A%252F%252Fnews.master.com%252Fsalon%252Fyuedu%252F&fs=1&tm=1385584886&os=WinXP&br=Firefox6.0&rf=http%253A%252F%252Fnews.master.com%252F&uid=1385584391326354&pt=BODY%3A0-DIV%3A1-DIV%3A1-DIV%3A0-DIV%3A1-DIV%3A0-UL%3A0-LI%3A3-A%3A0&tl=http%3A%2F%2Fnews.master.com%2Fstudy%2Fbilingual%2F1564112.shtml&co=%E8%82%AF%E5%B0%BC%E8%BF%AA%E9%81%87%E5%88%BA50%E5%B9%B4%EF%BC%9A%E7%89%B9%E5%B7%A5%E6%9B%9D%E5%85%89%E7%BB%86%E8%8A%82&HTTP/1.1" 200 46 "http://news.master.com/salon/yuedu/" "Mozilla/5.0 (WindowsNT 5.1; rv:6.0) Gecko/20100101 Firefox/6.0"
根据参数dm(每条日志的dm有可能不一样)的值 取得路径为 /data/app/click.master.com/logs/dlogs/news.master.com/20131027/2013102705.log
(其中 /data/app/click.master.com/logs/dlogs/ 为固定路径,news.master.com 为参数 dm 的值,20131027 为上一小时所在日期的年月日,2013102705 为上一小时的年月日小时)
存文件的时候 文件的格式为:
tm #*# uid #*# os #*# br #*# ip #*# ul #*# pt #*# pm #*# tl #*# co
(注释:每个文本第一行为:tm #*# uid #*# os #*# br #*# ip #*# ul #*# pt #*# pm #*# tl #*# co)然后才添加解析后的信息。下列为解析后信息:
1385584886 #*# 1385584391326354 #*# WinXP #*# Firefox6.0 #*# #*#http%253A%252F%252Fnews.master.com%252Fsalon%252Fyuedu%252F&fs=1 #*#BODY:0-DIV:1-DIV:1-DIV:0-DIV:1-DIV:0-UL:0-LI:3-A:0 #*#cddc3281aa7b73a83ac9b42ad08c34a7 #*#http://news.master.com/study/bilingual/1564112.shtml #*#肯尼迪遇刺50年:特工曝光细节
(注释:这是一行数据,数据以" #*# "隔开,依次顺序为:请求参数中的tm、uid、os、br、请求访问的ip、参数中的ul、pt、参数中pt的md5值、参数中tl的url反编码的值、参数中的co)
处理日志文件的时候,保存路径中的时间不是服务器的当前时间,而是上一个小时的时间。
#!/usr/bin/python
# coding: utf-8
"""Parse the previous hour's nginx click log and archive it per domain.

Every request line that matches ``LINE_RE`` is reduced to the fields
``tm, uid, os, br, ip, ul, pt, pm, tl, co`` (joined with " #*# ") and
appended to ``<save_dir>/<dm>/<YYYYmmdd>/<YYYYmmddHH>.log``, where ``dm`` is
the request's ``dm`` parameter and both timestamps belong to the previous
hour.  A freshly created output file starts with a header line naming the
fields.

The module runs unchanged on Python 2 and Python 3.

Author by Qfeian @20131130
"""

import collections
import datetime
import hashlib
import os
import re
import sys

try:
    from urllib import unquote  # Python 2
except ImportError:
    from urllib.parse import unquote  # Python 3

# Directory holding the hourly nginx access logs to analyse.
LOG_DIR = "/usr/local/nginx/logs/click.master.com_log/"
# Root directory under which the parsed logs are stored.
SAVE_DIR = "/data/app/click.master.com/logs/dlogs/"
# Separator placed between the fields of an output record.
FIELD_SEP = " #*# "
# First line of every newly created output file.
HEADER = "tm #*# uid #*# os #*# br #*# ip #*# ul #*# pt #*# pm #*# tl #*# co\n"

# Groups, in order: ip, dm, ul, tm, os, br, uid, pt, tl, co.
LINE_RE = re.compile(
    r"(^\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})"
    r".*&dm=(.*?)"
    r"&ul=(.*?)"
    r"&tm=(.*?)"
    r"&os=(.*?)"
    r"&br=(.*?)"
    r"&.*?&uid=(\d+)"
    r"&pt=(.*?)"
    r"&tl=(.*?)"
    r"&co=(.*?)&"
)

# ``dm`` comes from the request and becomes a directory name, so only plain
# host-name characters are accepted (no "/", no leading ".", never empty).
SAFE_DM_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]*\Z")


def _open_text(path, mode):
    """Open *path* for text I/O in a way that works on Python 2 and 3.

    Python 2 keeps the original byte-oriented behaviour.  Python 3 uses
    UTF-8 with ``errors="replace"`` so stray bytes in an access log cannot
    abort the run.
    """
    if sys.version_info[0] < 3:
        return open(path, mode)
    return open(path, mode, encoding="utf-8", errors="replace")


def hour(n=0):
    """Return ``(YYYYmmdd, YYYYmmddHH)`` for the local time shifted by *n* hours.

    With the default ``n=0`` the current time is used; ``n=-1`` yields the
    previous hour.
    """
    moment = datetime.datetime.now() + datetime.timedelta(hours=n)
    return moment.strftime('%Y%m%d'), moment.strftime('%Y%m%d%H')


def hash_str(str):
    """Return the hexadecimal MD5 digest of *str*.

    Text that is not already a byte string is encoded as UTF-8 first,
    because ``hashlib`` only accepts bytes on Python 3.
    """
    data = str if isinstance(str, bytes) else str.encode("utf-8")
    return hashlib.md5(data).hexdigest()


def unquote_str(str):
    """Return *str* with its URL percent-escapes decoded."""
    return unquote(str)


def dir_query(str):
    """Create the directory *str* (including parents) if it does not exist."""
    if os.path.isdir(str):
        return
    try:
        os.makedirs(str)
    except OSError:
        # Another process may have created it in the meantime; only re-raise
        # when the directory is really missing.
        if not os.path.isdir(str):
            raise


def path_query(paths, str):
    """Write *str* as the first line of *paths* when the file does not exist yet."""
    if not os.path.isfile(paths):
        with _open_text(paths, "a") as handle:
            handle.write(str)


def parse_line(line):
    """Turn one nginx access-log line into ``(dm, record)``.

    *record* is the newline-terminated output line: tm, uid, os, br, ip, ul,
    the URL-decoded pt, the MD5 of that decoded pt, the URL-decoded tl and
    the URL-decoded co, joined with ``FIELD_SEP``.

    Returns ``None`` when the line does not match ``LINE_RE`` or when its
    ``dm`` value is not a safe directory name.
    """
    match = LINE_RE.search(line)
    if match is None:
        return None
    ip, dm, ul, tm, os_name, br, uid, pt, tl, co = match.groups()
    if not SAFE_DM_RE.match(dm):
        # Refuse values such as "../.." or "/etc" that would escape save_dir.
        return None
    pt = unquote_str(pt)
    fields = (tm, uid, os_name, br, ip, ul, pt, hash_str(pt),
              unquote_str(tl), unquote_str(co))
    return dm, FIELD_SEP.join(fields) + "\n"


def main(log_dir=LOG_DIR, save_dir=SAVE_DIR):
    """Parse the previous hour's access log and append the records per domain.

    Reads ``<log_dir>/click.master.com_access.log-<YYYYmmddHH>`` and appends
    the parsed records to ``<save_dir>/<dm>/<YYYYmmdd>/<YYYYmmddHH>.log``,
    creating directories and the header line as needed.  Raises the usual
    ``IOError``/``OSError`` when the input log cannot be opened.
    """
    day, day_hour = hour(-1)
    log_path = os.path.join(log_dir, 'click.master.com_access.log-' + day_hour)

    # Stream the input and collect the records per domain so that every
    # output file is opened once instead of once per matching line.
    records = collections.defaultdict(list)
    with _open_text(log_path, 'r') as logfile:
        for line in logfile:
            parsed = parse_line(line)
            if parsed is not None:
                dm, record = parsed
                records[dm].append(record)

    for dm, lines in sorted(records.items()):
        out_path = os.path.join(save_dir, dm, day, day_hour) + '.log'
        dir_query(os.path.dirname(out_path))
        path_query(out_path, HEADER)
        with _open_text(out_path, 'a') as out:
            out.writelines(lines)


if __name__ == "__main__":
    main()