Zhihu Written Test (2): Log Processing

Preface

Quite some time has passed since the previous post, so I have done my best to reconstruct from memory how I solved this at the time. I am also getting more and more fed up with CSDN's blog platform; what a piece of work, it even sticks an ad banner under your personal profile. Dropping it is only a matter of time.
My Zhihu profile: http://www.zhihu.com/people/liu-shuai-82.

The Problem

(The problem statement was given as an image in the original post.)
Uh, after getting the problem I did spend a while thinking, mostly about the efficiency of the various ways the data could be processed. I then searched for how this kind of data processing is commonly done in industry and what the caveats are; in essence, big-data processing like this calls for hashing. I settled on the following approach:
Walk through the log records day by day, creating one temporary file per day. In memory, keep a hash table that uses the user id as the key and the visited "/topic/***" path as the value. Whenever a record whose path is "/topic/***" is read, check whether that user's id is already in the hash table. If it is, compare the topic path stored as its value with the topic path of the current record; if the two differ, write the id to that day's temporary file (the entry is also dropped at that point, since the user already qualifies; any duplicate ids are cleaned up later).
Once that pass finishes, we have 30 temporary files, each recording the ids of users who, within that day, visited at least two different "/topic/***" paths. What remains is to find the ids that appear in every one of the 30 files; those ids are the final answer.
For the second question, producing a list of paths, a similar hash-based treatment does the job.
Following this line of thought, the code is not hard to write:
import sys, re, os

# Matches lines like: [I 130403 17:26:40] 123745187 200 GET /topic/034581 (8.8.9.9) 200.39ms
# group(1) is the user id and group(2) the topic id; the IP octets occupy the remaining groups.
re_exp_1 = re.compile(r'\[.+\] (\d+) \d+ [A-Za-z]+ /topic/(\d+) \(((2[0-4]\d|25[0-5]|[01]?\d\d?)\.){3}(2[0-4]\d|25[0-5]|[01]?\d\d?)\) [\d\.]+[a-zA-Z]+')
# Same shape, but group(2) captures the full request path rather than only topic paths.
re_exp_2 = re.compile(r'\[.+\] (\d+) \d+ [A-Za-z]+ ([/a-zA-Z\d]+) \(((2[0-4]\d|25[0-5]|[01]?\d\d?)\.){3}(2[0-4]\d|25[0-5]|[01]?\d\d?)\) [\d\.]+[a-zA-Z]+')

def get_id_topic_path(log_line):
	'''
	Return the uid and the topic subpath as a tuple if the line matches, or None if it does not.
	'''
	m = re_exp_1.match(log_line)
	return (m.group(1), m.group(2)) if m else None

def get_id_path(log_line):
	'''
	Return the uid and the full request path as a tuple if the line matches, or None if it does not.
	'''
	m = re_exp_2.match(log_line)
	return (m.group(1), m.group(2)) if m else None

def get_usr_list(log_path):
	for day in range(1, 31):
		day_qualified_file = open(''.join((log_path, str(day), '.txt')), 'w')
		day_qualified_table = {}
		for hour in range(24):
			log_name = '%s%s-%s-%s-%s.log' % (log_path, '2013', '1', str(day), str(hour))
			print log_name
			log_file = open(log_name, 'r')
			for log_line in log_file:
				id_path = get_id_topic_path(log_line)
				if id_path:
					if id_path[0] in day_qualified_table:
						if day_qualified_table[id_path[0]] != id_path[1]:
							day_qualified_file.write('%s%s' % (id_path[0], '\n'))
							del day_qualified_table[id_path[0]]
					else:
						day_qualified_table[id_path[0]] = id_path[1]
			log_file.close()
		day_qualified_file.close()

	id_set_list = []

	for day in range(1, 31):
		day_qualified_file_name = ''.join((log_path, str(day), '.txt'))
		day_qualified_file = open(day_qualified_file_name, 'r')
		id_list = []
		for line in day_qualified_file:
			id_list.append(line.strip())
		day_qualified_file.close()
		os.remove(day_qualified_file_name)
		id_set = set({}.fromkeys(id_list).keys()) # deduplicate the user ids collected for this day
		id_set_list.append(id_set)

	result = id_set_list[0]
	for id_set in id_set_list:
		result = result & id_set
		# print result

	result_file = open(r'usr.txt', 'w+') # write final result into usr.txt
	for uid in result:
		result_file.write('%s%s' % (uid, '\n'))
	result_file.close()
	return result

def get_url_list(log_path, usr_list):
	for day in range(1, 31):
		day_qualified_file = open(''.join((log_path, str(day), '.txt')), 'w')
		day_qualified_table = {}
		for hour in range(24):
			log_name = '%s%s-%s-%s-%s.log' % (log_path, '2013', '1', str(day), str(hour))
			print log_name
			log_file = open(log_name, 'r')
			for log_line in log_file:
				id_path = get_id_path(log_line)
				if id_path:
					if id_path[0] in usr_list:
						if id_path[1] in day_qualified_table:
							if day_qualified_table[id_path[1]] != id_path[0]:
								day_qualified_file.write('%s%s' % (id_path[1], '\n'))
								del day_qualified_table[id_path[1]]
						else:
							day_qualified_table[id_path[1]] = id_path[0]
			log_file.close()
		day_qualified_file.close()

	url_set_list = []

	for day in range(1, 31):
		day_qualified_file_name = ''.join((log_path, str(day), '.txt'))
		day_qualified_file = open(day_qualified_file_name, 'r')
		url_list = []
		for line in day_qualified_file:
			url_list.append(line.strip())
		day_qualified_file.close()
		os.remove(day_qualified_file_name)
		url_set = set({}.fromkeys(url_list).keys())
		url_set_list.append(url_set)

	result = url_set_list[0]
	for url_set in url_set_list:
		result = result & url_set
		# print result

	result_file = open(r'url.txt', 'w+')
	for url in result:
		result_file.write('%s%s' % (url, '\n'))
	result_file.close()
	return result


def main():
	if len(sys.argv) < 2:
		print '\nUse like this:\n\t$python find.py [log_file_path]\nThe result will be output to usr.txt and url.txt.\n'
		return
	log_path = sys.argv[1]
	# print get_id_topic_path('[I 130403 17:26:40] 123745187 200 GET /topic/034581 (8.8.9.9) 200.39ms')
	usr_list = get_usr_list(log_path)
	get_url_list(log_path, usr_list)

if __name__ == '__main__':
	main()
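
As a quick sanity check, the helpers can be exercised on the sample log line that is commented out in main(); the expected tuples below follow directly from the capture groups defined earlier:
sample = '[I 130403 17:26:40] 123745187 200 GET /topic/034581 (8.8.9.9) 200.39ms'
print get_id_topic_path(sample)  # ('123745187', '034581')
print get_id_path(sample)        # ('123745187', '/topic/034581')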

Note that each day's temporary file may contain duplicate ids, so the first step is deduplication. Here is a neat Python trick for deduplicating a list:
tmp_list = [1, 1, 2, 3, 3, 4, 4]
new_list = {}.fromkeys(tmp_list).keys()
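
For what it is worth, building a set directly gives the same deduplication in one step, and is what the main script wraps around the fromkeys result anyway:
new_set = set(tmp_list)  # same elements, no duplicates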

After that it is just a matter of language technique. To find the elements present in all 30 deduplicated lists, I turn each list into a set and use set intersection to obtain the user ids that ultimately qualify, then write them to a file.
	result = id_set_list[0]
	for id_set in id_set_list:
		result = result & id_set
		# print result
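
The same fold can also be written as a single call; a minimal equivalent, assuming id_set_list is non-empty:
result = set.intersection(*id_set_list)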

Appendix: Test Script

It happened that I was studying SQA at the time, so I also wrote a test script for my code. Actually "test script" is not quite accurate: it simply generates some random log files, plants a few records that satisfy the conditions, and checks whether the finder script can dig them out. The code is as follows:
import random, sys

if len(sys.argv) < 2:
	print '\nUse like this:\n\t$python create_test_log.py [log_path]\nAnd the 30 * 24 log files for testing will be created in the log_path.\n'
	sys.exit(0)

usr_list = ['19930418', '19930715', '20130607', '19920212']

for day in range(1, 31):
	for hour in range(0, 24):
		log_num = random.randint(1000, 10000)
		print 'Create %s logs in 2013-1-%s-%s' % (log_num, str(day), str(hour))
		log_file = open('%s%s-%s-%s-%s.log' % (sys.argv[1], 2013, 1, str(day), str(hour)), 'w')
		for i in range(log_num):
			level = random.randint(0, 2)
			uid = random.randint(0, 9999999)
			path_base = random.randint(0, 2)
			path = random.randint(0, 9999999)
			status = random.randint(0, 2)
			method = random.randint(0, 2)
			day_str = '0' + str(day) if day < 10 else str(day)
			hour_str = '0' + str(hour) if hour < 10 else str(hour)
			log = '[I 1301%s %s:%s:%s] %s %s %s /%s/%s (%s.%s.%s.%s) %sms\n' % \
				(day_str, \
					hour_str, \
					str(random.randint(0, 59)), \
					str(random.randint(0, 59)), \
					str(uid), \
					['200', '302', '404'][status], \
					['POST', 'GET', 'DELETE'][method], \
					['topic', 'answer', 'question'][path_base], \
					str(path), \
					str(random.randint(0, 255)), \
					str(random.randint(0, 255)), \
					str(random.randint(0, 255)), \
					str(random.randint(0, 255)), \
					str(random.random() * 100))
			log_file.write(log)
		for usr in usr_list:
			# Plant two visits to two different topics for every target user,
			# so each of them qualifies on every day of the month.
			for topic in ('0101010101', '00000000'):
				log = '[I 1301%s %s:%s:%s] %s %s %s /topic/%s (%s.%s.%s.%s) %sms\n' % \
					(day_str, \
						hour_str, \
						str(random.randint(0, 59)), \
						str(random.randint(0, 59)), \
						usr, \
						['200', '302', '404'][status], \
						['POST', 'GET', 'DELETE'][method], \
						topic, \
						str(random.randint(0, 255)), \
						str(random.randint(0, 255)), \
						str(random.randint(0, 255)), \
						str(random.randint(0, 255)), \
						str(random.random() * 100))
				log_file.write(log)
		log_file.close()
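
To exercise the whole pipeline end to end, assuming the generator above is saved as create_test_log.py and the finder as find.py (the names used in their usage messages), run for example:
$ mkdir logs
$ python create_test_log.py logs/
$ python find.py logs/

usr.txt should then contain exactly the four planted ids from usr_list: the random uids are drawn from 0 to 9999999, so they can never collide with the eight-digit planted ones, and the odds of a random user qualifying on all 30 days are negligible.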

Postscript

At this point, silence speaks louder than words.

