Preface
Quite a while has passed since the previous post, so I have done my best to reconstruct the solution process from memory. I am getting more and more fed up with CSDN's blog platform: what a mess, it even sticks an ad banner under everyone's personal profile. Abandoning it is only a matter of time.
My Zhihu profile: http://www.zhihu.com/people/liu-shuai-82.
The Problem
Well, the problem took me a while to think through, mostly weighing the efficiency of the different ways to process the data. I then searched for how this kind of data processing is usually handled in industry and what to watch out for; by and large, large-scale data like this is processed with hashing. That settled my approach:
Walk through the log records day by day, creating one temporary file per day. In memory, keep a hash table keyed by user id, whose value is the "/topic/***" path that user was seen visiting. For each record whose path is "/topic/***", check whether the hash table already contains that record's user id; if it does, compare the stored topic path with the one in the current record, and if they differ, append the id to that day's temporary file.
Once that pass is done, we have 30 temporary files. Each file lists the ids of users who, on that day, visited at least two different "/topic/***" paths. What remains is to find the ids that appear in every one of the 30 files; those ids are the final answer.
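Before the full code, here is a minimal sketch of the per-day hash idea; the function name and the sample records are made up for illustration, while the real script below reads hourly log files and writes temporary files instead:

def users_with_two_topics(records):
    first_topic = {}   # uid -> first /topic/ path seen today
    qualified = set()  # uids that visited at least two distinct topics
    for uid, topic in records:
        if uid in qualified:
            continue
        if uid in first_topic and first_topic[uid] != topic:
            qualified.add(uid)
        else:
            first_topic[uid] = topic
    return qualified

print users_with_two_topics([('7', '/topic/1'), ('7', '/topic/1'),
                             ('7', '/topic/2'), ('8', '/topic/3')])
# prints set(['7'])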
As for the second question, producing a list of paths, a similar hash-based pass does the job.
With this plan, the code is not hard to write:
import sys, re, os

# Log lines look like:
#   [I 130403 17:26:40] 123745187 200 GET /topic/034581 (8.8.9.9) 200.39ms
# group(1) is the user id; group(2) is the topic sub-path (re_exp_1)
# or the full request path (re_exp_2).
re_exp_1 = re.compile(r'\[.+\] (\d+) \d+ [A-Za-z]+ /topic/(\d+) \(((2[0-4]\d|25[0-5]|[01]?\d\d?)\.){3}(2[0-4]\d|25[0-5]|[01]?\d\d?)\) [\d\.]+[a-zA-Z]+')
re_exp_2 = re.compile(r'\[.+\] (\d+) \d+ [A-Za-z]+ ([/a-zA-Z\d]+) \(((2[0-4]\d|25[0-5]|[01]?\d\d?)\.){3}(2[0-4]\d|25[0-5]|[01]?\d\d?)\) [\d\.]+[a-zA-Z]+')

def get_id_topic_path(log_line):
    '''Return (uid, topic sub-path) as a tuple if the line matches, else None.'''
    m = re_exp_1.match(log_line)
    return (m.group(1), m.group(2)) if m else None

def get_id_path(log_line):
    '''Return (uid, full request path) as a tuple if the line matches, else None.'''
    m = re_exp_2.match(log_line)
    return (m.group(1), m.group(2)) if m else None
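# A quick sanity check of the parsers, using the sample log line that is
# also tried (commented out) in main() below:
#   line = '[I 130403 17:26:40] 123745187 200 GET /topic/034581 (8.8.9.9) 200.39ms'
#   get_id_topic_path(line)  ->  ('123745187', '034581')
#   get_id_path(line)        ->  ('123745187', '/topic/034581')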
def get_usr_list(log_path):
    # Pass 1: for each day, write the ids of users who visited at least
    # two different /topic/ paths into a temporary file <log_path><day>.txt.
    for day in range(1, 31):
        day_qualified_file = open(''.join((log_path, str(day), '.txt')), 'w')
        day_qualified_table = {}  # uid -> first topic path seen that day
        for hour in range(24):
            log_name = '%s%s-%s-%s-%s.log' % (log_path, '2013', '1', str(day), str(hour))
            print log_name
            log_file = open(log_name, 'r')
            for log_line in log_file:
                id_path = get_id_topic_path(log_line)
                if id_path:
                    if id_path[0] in day_qualified_table:
                        if day_qualified_table[id_path[0]] != id_path[1]:
                            day_qualified_file.write('%s%s' % (id_path[0], '\n'))
                            del day_qualified_table[id_path[0]]
                    else:
                        day_qualified_table[id_path[0]] = id_path[1]
            log_file.close()
        day_qualified_file.close()
        del day_qualified_table
    # Pass 2: read the 30 temporary files back, deduplicate each day's ids,
    # then intersect the 30 sets to keep the users who qualify every day.
    id_set_list = []
    for day in range(1, 31):
        day_qualified_file_name = ''.join((log_path, str(day), '.txt'))
        day_qualified_file = open(day_qualified_file_name, 'r')
        id_list = []
        for line in day_qualified_file:
            id_list.append(line.strip())
        day_qualified_file.close()
        os.remove(day_qualified_file_name)
        id_set = set({}.fromkeys(id_list).keys())  # remove duplicate user ids within one day
        id_set_list.append(id_set)
    result = id_set_list[0]
    for id_set in id_set_list:
        result = result & id_set
    # print result
    result_file = open(r'usr.txt', 'w+')  # write the final result into usr.txt
    for uid in result:
        result_file.write('%s%s' % (uid, '\n'))
    result_file.close()
    return result
def get_url_list(log_path, usr_list):
    # Same two-pass scheme, but keyed by path: for each day, record the
    # paths visited by at least two different qualifying users.
    for day in range(1, 31):
        day_qualified_file = open(''.join((log_path, str(day), '.txt')), 'w')
        day_qualified_table = {}  # path -> first qualifying uid seen that day
        for hour in range(24):
            log_name = '%s%s-%s-%s-%s.log' % (log_path, '2013', '1', str(day), str(hour))
            print log_name
            log_file = open(log_name, 'r')
            for log_line in log_file:
                id_path = get_id_path(log_line)
                if id_path:
                    if id_path[0] in usr_list:
                        if id_path[1] in day_qualified_table:
                            if day_qualified_table[id_path[1]] != id_path[0]:
                                day_qualified_file.write('%s%s' % (id_path[1], '\n'))
                                del day_qualified_table[id_path[1]]
                        else:
                            day_qualified_table[id_path[1]] = id_path[0]
            log_file.close()
        day_qualified_file.close()
        del day_qualified_table
    url_set_list = []
    for day in range(1, 31):
        day_qualified_file_name = ''.join((log_path, str(day), '.txt'))
        day_qualified_file = open(day_qualified_file_name, 'r')
        url_list = []
        for line in day_qualified_file:
            url_list.append(line.strip())
        day_qualified_file.close()
        os.remove(day_qualified_file_name)
        url_set = set({}.fromkeys(url_list).keys())  # deduplicate within one day
        url_set_list.append(url_set)
    result = url_set_list[0]
    for url_set in url_set_list:
        result = result & url_set
    # print result
    result_file = open(r'url.txt', 'w+')  # write the final result into url.txt
    for url in result:
        result_file.write('%s%s' % (url, '\n'))
    result_file.close()
    return result
def main():
    if len(sys.argv) < 2:
        print '\nUse like this:\n\t$python find.py [log_file_path]\nThe result will be output to usr.txt and url.txt.\n'
        return
    log_path = sys.argv[1]
    # print get_id_topic_path('[I 130403 17:26:40] 123745187 200 GET /topic/034581 (8.8.9.9) 200.39ms')
    usr_list = get_usr_list(log_path)
    get_url_list(log_path, usr_list)

if __name__ == '__main__':
    main()
Note that a given day's temporary file may contain duplicate ids, so deduplicate first. Here is a neat Python trick for deduplicating a list:
tmp_list = [1, 1, 2, 3, 3, 4, 4]
new_list = {}.fromkeys(tmp_list).keys()
From there it is just a matter of language facilities. To find the elements that appear in all 30 deduplicated lists, I turn each list into a set and use set intersection; the surviving user ids are written to the result file.
result = id_set_list[0]
for id_set in id_set_list:
result = result & id_set
# print result
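Putting the two tricks together on toy data (the ids are made up, purely to show the mechanics):

day1 = {}.fromkeys(['7', '7', '8', '9']).keys()  # ['7', '8', '9'], order may vary
day2 = {}.fromkeys(['7', '9', '9']).keys()       # ['7', '9'], order may vary
print set(day1) & set(day2)                      # set(['7', '9']): ids seen on both days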
Appendix: the test script
I happened to be studying SQA at the time, so I also wrote a test script for my code. "Test script" is a bit generous: it randomly generates a batch of log files, plants a few records that satisfy the conditions, and lets me check whether the script above finds them. The code:
import random, sys

if len(sys.argv) < 2:
    print '\nUse like this:\n\t$python create_test_log.py [log_path]\nAnd the 30 * 24 log files for testing will be created in the log_path.\n'
    sys.exit(0)

# These four users are planted so that they qualify on every single day.
usr_list = ['19930418', '19930715', '20130607', '19920212']

for day in range(1, 31):
    for hour in range(0, 24):
        log_num = random.randint(1000, 10000)
        print 'Create %s logs in 2013-1-%s-%s' % (log_num, str(day), str(hour))
        log_file = open('%s%s-%s-%s-%s.log' % (sys.argv[1], 2013, 1, str(day), str(hour)), 'w')
        day_str = '%02d' % day
        hour_str = '%02d' % hour
        for i in range(log_num):
            # A random record: random user, path, status and method.
            level = random.randint(0, 2)  # unused in the output
            uid = random.randint(0, 9999999)
            path_base = random.randint(0, 2)
            path = random.randint(0, 9999999)
            status = random.randint(0, 2)
            method = random.randint(0, 2)
            log = '[I 1301%s %s:%02d:%02d] %s %s %s /%s/%s (%s.%s.%s.%s) %sms\n' % (
                day_str,
                hour_str,
                random.randint(0, 59),
                random.randint(0, 59),
                str(uid),
                ['200', '302', '404'][status],
                ['POST', 'GET', 'DELETE'][method],
                ['topic', 'answer', 'question'][path_base],
                str(path),
                str(random.randint(0, 255)),
                str(random.randint(0, 255)),
                str(random.randint(0, 255)),
                str(random.randint(0, 255)),
                str(random.random() * 100))
            log_file.write(log)
        # Plant two records with different topic paths for each target user,
        # so every target user qualifies in every day's logs.
        for usr in usr_list:
            for topic in ('0101010101', '00000000'):
                log = '[I 1301%s %s:%02d:%02d] %s %s %s /topic/%s (%s.%s.%s.%s) %sms\n' % (
                    day_str,
                    hour_str,
                    random.randint(0, 59),
                    random.randint(0, 59),
                    usr,
                    ['200', '302', '404'][random.randint(0, 2)],
                    ['POST', 'GET', 'DELETE'][random.randint(0, 2)],
                    topic,
                    str(random.randint(0, 255)),
                    str(random.randint(0, 255)),
                    str(random.randint(0, 255)),
                    str(random.randint(0, 255)),
                    str(random.random() * 100))
                log_file.write(log)
        log_file.close()
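To test end to end, generate the fake logs and then run the finder over them; usr.txt should come out containing the four planted ids. The directory name below is only an example, it must already exist, and the trailing slash matters because both scripts concatenate the path directly with the file names:

$python create_test_log.py ./logs/
$python find.py ./logs/
$cat usr.txt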
Epilogue
At this point, silence says more than words.