现有两个文本文件ids.txt和md5s.txt
ids.txt有1万行,按行存储了一些id
md5s.txt有50万行,每行存储一个id的md5值及其对应的类型(部分行可能没有类型),其中包含了ids.txt中各id的md5值
现在要求你写一个python程序:对ids.txt中的每个id计算md5值,在md5s.txt中查出其对应的类型,并统计每个类型在ids.txt的id中出现的次数
示例:
ids.txt
8pud48wb4grzd799c53u
4krayxlclgjsh7v05rc7
rozc9fiiqpuzjs3kkerd
twzsjoopfa6gsky72dbe
j2z86z3e35h25i2k2ili
md5s.txt
2115f3ec9cec4d9473adde5622f8af2a baidu
dd7406385609c73568dc61638a820342 baidu
bd5d56c8248c49db8368cdf2637d294d toutiao
025aaa4b83735c7d1e37a3e72797903f
202cb962ac59075b964b07152d234b70 toutiao
ffbbfcd692e84d6b82af1b5c0e6f5446 tencent
3f86578312012b52573bc5c279ac1dcb baidu
96a0143692631211a7e497c162de22de toutiao
048369068d08922144a6b188e8af304a
40978eb7ce7e36d9b1b0d2a30a1afbb8 baidu
输出
baidu 3
toutiao 1
import hashlib
import os
import time
class Find():
    """Count, per type, how many ids of an id file are listed in an md5 file.

    The id file holds one id per line.  The md5 file holds one record per
    line: the hex MD5 digest of an id, optionally followed by whitespace and
    a type name.  Each id is hashed and looked up in the md5 file; every hit
    increments the counter of the type found there.
    """

    def __init__(self):
        # Public attribute kept only for backward compatibility; nothing in
        # this class reads or writes it.
        self.list = []
        self.__ids_list = []
        self.__md5_types = {}
        self.__ids_md5s = {}

    def __md5_str(self, text):
        """Return the hex MD5 digest of ``text`` encoded as UTF-8."""
        return hashlib.md5(text.encode('utf8')).hexdigest()

    def __open_file(self, path):
        """Read ``path`` as UTF-8 text and return its lines without '\\n'.

        Raises:
            OSError: if the file cannot be opened.
        """
        with open(path, 'r', encoding='utf-8') as file:
            return [line.rstrip('\n') for line in file]

    def __dispose_md5s(self, md5s_list):
        """Build an ``md5 -> type`` lookup table from raw md5 file lines.

        A line contributes only when it has at least two whitespace-separated
        fields (digest, then type); lines without a type are skipped.  Each
        record is parsed on its own, so an incomplete line can never borrow
        a field from the following line.
        """
        md5_types = {}
        for line in md5s_list:
            fields = line.split()
            if len(fields) < 2:
                continue
            # First occurrence wins when a digest is listed more than once.
            md5_types.setdefault(fields[0], fields[1])
        return md5_types

    def __find(self, ids_list, md5_types):
        """Return ``{type: count}`` for the ids whose digest is in ``md5_types``.

        Every entry of ``ids_list`` is counted, duplicates included.  The
        result is rebuilt from scratch on each call so repeated runs do not
        accumulate.
        """
        counts = {}
        for item in ids_list:
            # Constant-time dict lookup instead of scanning the whole md5
            # list once per id.
            kind = md5_types.get(self.__md5_str(item))
            if kind is not None:
                counts[kind] = counts.get(kind, 0) + 1
        self.__ids_md5s = counts
        return counts

    def main(self, path, file1, file2):
        """Count type occurrences for the ids stored in ``file1``.

        Args:
            path: directory containing both files.
            file1: name of the id file (one id per line).
            file2: name of the md5 file (``<md5> <type>`` per line).

        Returns:
            A dict mapping each type name to the number of ids in ``file1``
            whose MD5 digest is listed with that type in ``file2``.

        Raises:
            OSError: if either file cannot be opened.
        """
        path1 = os.path.join(path, file1)
        path2 = os.path.join(path, file2)
        self.__ids_list = self.__open_file(path1)
        self.__md5_types = self.__dispose_md5s(self.__open_file(path2))
        return self.__find(self.__ids_list, self.__md5_types)
if __name__ == '__main__':
    # Script entry point: count the types for the two data files that sit
    # next to this script, print one "type count" line per type, then print
    # the elapsed wall-clock time.
    began = time.time()
    ids_name, md5s_name = 'ids.txt', 'md5s.txt'
    base_dir = os.path.dirname(os.path.abspath(__file__))
    counts = Find().main(base_dir, ids_name, md5s_name)
    for kind in counts:
        print(kind, counts[kind])
    print("总共用时:", time.time() - began)
"""
结果:
toutiao 3838
taobao 2258
tencent 2375
baidu 507
总共用时: 57.6369833946228
"""