python实现对数据比较多的两个txt文件的对比处理(题目)

现有两个文本文件ids.txt和md5s.txt
ids.txt有1万行,按行存储了一些id
md5s.txt有50万行,每行存储一个id的md5值及其对应的类型(部分行可能只有md5值而没有类型),这些md5值覆盖了ids.txt中的id
现在要求你写一个python程序,统计ids.txt中的id在md5s.txt里对应的各个类型分别出现了多少次

示例:
ids.txt
8pud48wb4grzd799c53u
4krayxlclgjsh7v05rc7
rozc9fiiqpuzjs3kkerd
twzsjoopfa6gsky72dbe
j2z86z3e35h25i2k2ili

md5s.txt
2115f3ec9cec4d9473adde5622f8af2a baidu
dd7406385609c73568dc61638a820342 baidu
bd5d56c8248c49db8368cdf2637d294d toutiao
025aaa4b83735c7d1e37a3e72797903f
202cb962ac59075b964b07152d234b70 toutiao
ffbbfcd692e84d6b82af1b5c0e6f5446 tencent
3f86578312012b52573bc5c279ac1dcb baidu
96a0143692631211a7e497c162de22de toutiao
048369068d08922144a6b188e8af304a
40978eb7ce7e36d9b1b0d2a30a1afbb8 baidu

输出
baidu 3
toutiao 1


import hashlib
import os
import time


class Find():
    """Count, per type, how many ids of an id file are listed in an md5 file.

    The id file holds one id per line. The md5 file holds one record per
    line: the md5 hex digest of an id, optionally followed by whitespace and
    a type name. Each id is hashed with md5 and looked up among the records
    that carry a type; every hit increments the counter of that type.
    """

    def __init__(self):
        # Not used by this class; kept so the public attribute still exists.
        self.list = []
        self.__ids_list = []
        # md5 hex digest -> type name, built from the md5 file.
        self.__md5_types = {}
        # type name -> number of matching ids (result of the last run).
        self.__ids_md5s = {}

    def __md5_str(self, text):
        """Return the md5 hex digest of *text* encoded as UTF-8."""
        return hashlib.md5(text.encode('utf8')).hexdigest()

    def __open_file(self, path):
        """Return the lines of the UTF-8 text file *path* without newlines."""
        with open(path, 'r', encoding='utf-8') as file:
            return [line.rstrip('\n') for line in file]

    def __dispose_md5s(self, md5s_list):
        """Build an md5 -> type mapping from the raw md5 file lines.

        Lines with fewer than two whitespace-separated fields (blank lines or
        an md5 without a type) are skipped. Splitting, rather than checking
        the line length, keeps records whose type name is very short.
        """
        md5_types = {}
        for line in md5s_list:
            fields = line.split()
            if len(fields) < 2:
                continue
            # setdefault: when an md5 is listed several times, the first
            # record that carries a type wins.
            md5_types.setdefault(fields[0], fields[1])
        return md5_types

    def __find(self, ids_list, md5_types):
        """Count the types of all ids whose md5 is a key of *md5_types*.

        Args:
            ids_list: iterable of id strings.
            md5_types: mapping of md5 hex digest to type name.

        Returns:
            A dict mapping each matched type name to its number of hits, in
            order of first match. Ids without a match are ignored.
        """
        counts = {}
        for item in ids_list:
            # Dict lookup is O(1); no linear scan of the md5 records per id.
            type_name = md5_types.get(self.__md5_str(item))
            if type_name is None:
                continue
            counts[type_name] = counts.get(type_name, 0) + 1
        # Start from a fresh dict on every run so repeated calls to main()
        # do not accumulate counts from earlier runs.
        self.__ids_md5s = counts
        return counts

    def main(self, path, file1, file2):
        """Read both files from directory *path* and return the type counts.

        Args:
            path: directory containing both files.
            file1: name of the id file (one id per line).
            file2: name of the md5 file (``<md5> [<type>]`` per line).

        Returns:
            A dict mapping type name to the number of ids from *file1* whose
            md5 appears in *file2* with that type.

        Raises:
            OSError: if either file cannot be opened.
        """
        path1 = os.path.join(path, file1)
        path2 = os.path.join(path, file2)
        self.__ids_list = self.__open_file(path1)
        self.__md5_types = self.__dispose_md5s(self.__open_file(path2))
        return self.__find(self.__ids_list, self.__md5_types)


if __name__ == '__main__':
    # Script entry point: count the types for ids.txt / md5s.txt located next
    # to this script, print one "type count" line per type, then print the
    # elapsed wall-clock time.
    start_time = time.time()
    file1, file2 = 'ids.txt', 'md5s.txt'
    path = os.path.dirname(os.path.abspath(__file__))
    result_dict = Find().main(path, file1, file2)
    for type_name, count in result_dict.items():
        print(type_name, count)
    print("总共用时:", time.time() - start_time)

"""
结果:
toutiao 3838
taobao 2258
tencent 2375
baidu 507
总共用时: 57.6369833946228
"""

你可能感兴趣的:(python)