1 分析日志的 Python 框架 awk.py
#
# Custom awk.py module
#


class controller:
    """Drive a set of line handlers over a file, awk-style.

    Each subscribed handler must provide ``begin()``, ``process_line(s)``,
    ``end()``, ``description()`` and ``result()``.  The controller itself
    keeps no statistics; it only feeds lines to the handlers and prints
    what they report.
    """

    def __init__(self, f):
        """Remember the input stream.

        f -- any object with a ``readline()`` method (e.g. ``sys.stdin``).
        """
        self.m_file = f
        self.m_handlers = []

    def subscribe(self, o):
        """Register handler ``o``; handlers are called in subscription order."""
        self.m_handlers.append(o)

    def run(self):
        """Call ``begin()`` on every handler, feed them each line, then ``end()``.

        Lines are passed to ``process_line`` exactly as ``readline()``
        returned them (including the trailing newline, if any).
        """
        for o in self.m_handlers:
            o.begin()

        while True:
            s = self.m_file.readline()
            # readline() returns an empty value at EOF.  Testing for
            # falsiness (rather than != "") also terminates on binary
            # streams, where EOF is b"" and b"" != "" is always true.
            if not s:
                break
            for o in self.m_handlers:
                o.process_line(s)

        for o in self.m_handlers:
            o.end()

    def print_results(self):
        """Print every handler's description and result to stdout."""
        separator = "------------------------------------------------------"

        # Parenthesised single-argument prints emit the same bytes under
        # both the Python 2 print statement and the Python 3 function;
        # print("") stands in for the bare ``print`` (one empty line).
        print("")
        print("Results:")
        print("")
        for o in self.m_handlers:
            print(separator)
            print(o.description())
            print(separator)
            print(o.result())
"""count_lines.py: count the lines of a log file read from stdin."""

# Standard sys module
import sys


class count_lines:
    """Handler that counts how many lines the controller feeds it."""

    def begin(self):
        """Reset the counter before the first line."""
        self.m_count = 0

    def process_line(self, s):
        """Count one line; the content of ``s`` is not inspected."""
        self.m_count += 1

    def end(self):
        """Nothing to finalise."""
        pass

    def description(self):
        """Return the heading printed above the result."""
        return "# of lines in the file"

    def result(self):
        """Return the number of lines seen since ``begin()``."""
        return self.m_count


def main():
    """Run the line counter over stdin and print the result."""
    # Custom awk.py module.  Imported here because only the command-line
    # entry point needs the controller; the handler class above can then
    # be imported without awk.py on the path.
    import awk

    #
    # Step 1: Create the Awk controller
    #
    ac = awk.controller(sys.stdin)

    #
    # Step 2: Subscribe the handler
    #
    ac.subscribe(count_lines())

    #
    # Step 3: Run
    #
    ac.run()

    #
    # Step 4: Print the results
    #
    ac.print_results()


# Guard the entry point so importing this file does not consume stdin.
if __name__ == "__main__":
    main()
使用方法：在 shell 中执行以下命令
# cat apachelog.log|python count_lines.py
统计浏览次数超过n次的访问者 visitors.py
How many people have returned to the site more than N times?
"""visitors.py: how many IP addresses visited on more than N distinct days."""

import re
import sys


class return_visitors:
    """Handler counting IP addresses seen on more than ``n`` distinct days.

    For every line, the first whitespace-separated field is taken as the
    visitor's IP address and characters 1..6 of the fourth field as the
    day key.
    NOTE(review): this presumably targets Apache-style access logs, where
    field 3 looks like ``[10/Oct/2000:13:55:36`` and the key is
    ``10/Oct`` (no year) -- confirm against the real log format.
    """

    def __init__(self, n):
        """n -- threshold; an IP is counted when it has more than n day keys."""
        self.m_n = n
        # Maps ip -> list of distinct day keys, in first-seen order.
        self.m_ip_days = {}

    def begin(self):
        """Nothing to prepare."""
        pass

    def process_line(self, s):
        """Record the (ip, day) pair found in log line ``s``.

        Lines with fewer than four whitespace-separated fields are
        silently skipped.
        """
        array = s.split()
        try:
            ip = array[0]
            day = array[3][1:7]
        except IndexError:
            # Malformed/short line: best-effort parsing, ignore it.
            return

        # ``in`` + setdefault replace dict.has_key(), which no longer
        # exists in Python 3.
        days = self.m_ip_days.setdefault(ip, [])
        if day not in days:
            days.append(day)

    def end(self):
        """Compute how many IPs exceeded the threshold; stores ``m_count``."""
        self.m_count = sum(
            1 for days in self.m_ip_days.values() if len(days) > self.m_n
        )

    def description(self):
        """Return the heading printed above the result."""
        return "# of IP addresses that visited more than %s days" % self.m_n

    def result(self):
        """Return the count computed by ``end()``."""
        return self.m_count


def main():
    """Run the returning-visitor report over stdin and print it."""
    # Custom awk.py module.  Imported here because only the command-line
    # entry point needs the controller.
    import awk

    ac = awk.controller(sys.stdin)
    ac.subscribe(return_visitors(2))
    ac.run()
    ac.print_results()


# Guard the entry point so importing this file does not consume stdin.
if __name__ == "__main__":
    main()
# cat apachelog.log|python visitors.py
"""Report how often each referring domain appears in a log read from stdin."""

import functools
import re
import sys


class referring_domains:
    """Handler tallying the domains found in each line's referrer field.

    The referrer is taken from the 11th whitespace-separated field.
    NOTE(review): index 10 presumably corresponds to the quoted referrer
    of an Apache "combined" log line -- confirm against the real format.
    """

    # "//host.tld/" where tld is 2-3 ASCII letters.  The original class
    # ``[a-zA-z]`` was a typo: the range A-z also admits [ \ ] ^ _ and `.
    # NOTE(review): longer TLDs (".info"), ports and bare IPs do not
    # match and are therefore not counted -- same as before.
    _DOMAIN_RE = re.compile(r'//[a-zA-Z0-9\-\.]*\.[a-zA-Z]{2,3}/')

    def __init__(self):
        # Maps domain -> number of lines whose referrer named it.
        self.m_domains = {}

    def begin(self):
        """Nothing to prepare."""
        pass

    def process_line(self, line):
        """Count the referring domain of ``line``, if one can be extracted.

        Lines with fewer than 11 fields, or whose referrer field holds no
        ``//host.tld/`` pattern, are silently skipped.
        """
        array = line.split()
        try:
            referrer = array[10]
        except IndexError:
            return

        m = self._DOMAIN_RE.search(referrer)
        if m is None:
            return

        # Strip the leading "//" and the trailing "/".
        domain = m.group(0)[2:-1]
        # dict.get replaces has_key(), which no longer exists in Python 3.
        self.m_domains[domain] = self.m_domains.get(domain, 0) + 1

    def end(self):
        """Nothing to finalise."""
        pass

    def description(self):
        """Return the heading printed above the result."""
        return "Referring domains"

    def sort(self, key1, key2):
        """Comparator ordering domains by descending hit count.

        Returns -1, 0 or 1 in the classic ``cmp`` convention.
        """
        if self.m_domains[key1] > self.m_domains[key2]:
            return -1
        elif self.m_domains[key1] == self.m_domains[key2]:
            return 0
        else:
            return 1

    def result(self):
        """Return one "domain count" line per domain, busiest first.

        The text ends with two extra newlines, as in the original report.
        """
        # list.sort(cmpfunc) is gone in Python 3; cmp_to_key adapts the
        # existing comparator and works on Python 2.7 as well.
        keys = sorted(self.m_domains, key=functools.cmp_to_key(self.sort))
        lines = ["%s %s\n" % (domain, self.m_domains[domain]) for domain in keys]
        return "".join(lines) + "\n\n"


def main():
    """Run the referring-domain report over stdin and print it."""
    # Custom awk.py module.  Imported here because only the command-line
    # entry point needs the controller.
    import awk

    ac = awk.controller(sys.stdin)
    ac.subscribe(referring_domains())
    ac.run()
    ac.print_results()


# Guard the entry point so importing this file does not consume stdin.
if __name__ == "__main__":
    main()