整体思路:
首先是循环遍历文件夹下的文件;
通过建立字典:key:文件内容的哈希值,value:文件名
遇到一个文件,得到哈希值,并比较是否在字典中存在,如果存在就输出文件名,否则加入字典
# -*- coding: utf-8 -*-
import os
import fnmatch
import hashlib
def is_file_match(filename, patterns):
    """Return True if *filename* matches at least one pattern.

    Args:
        filename: File name to test.
        patterns: Iterable of shell-style wildcard patterns such as
            ``['*.py', '*.txt']``.

    Returns:
        True when any pattern matches; False otherwise, including when
        *patterns* is empty.
    """
    return any(fnmatch.fnmatch(filename, pattern) for pattern in patterns)
def find_specific_files(root, patterns=('*.py',), exclude_dirs=()):
    """Yield paths of files under *root* whose names match *patterns*.

    Args:
        root: Directory at which the recursive walk starts.
        patterns: Iterable of shell-style wildcard patterns tested against
            each file name. Defaults to Python source files.
        exclude_dirs: Directory names (not paths) that must not be
            descended into, at any depth.

    Yields:
        The directory path joined with the file name for every matching
        file.
    """
    for dirpath, dirnames, filenames in os.walk(root):
        # Prune in place: a top-down os.walk only descends into the names
        # that are still present in ``dirnames``.
        dirnames[:] = [name for name in dirnames if name not in exclude_dirs]
        for filename in filenames:
            if is_file_match(filename, patterns):
                yield os.path.join(dirpath, filename)
# Directory to scan, typed in by the user (the prompt asks for a file path).
d=input('输入文件地址:')
# Maps a content checksum to the path of the first file seen with that content.
a={}
def get_chunk(filename):
    """Yield the lines of a UTF-8 text file one at a time.

    Args:
        filename: Path of the file to read.

    Yields:
        Each line as ``str``, including its trailing newline when present.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the content is not valid UTF-8.
    """
    with open(filename, encoding='UTF-8') as f:
        # A text file object iterates line by line and stops at end of file.
        yield from f
def get_file_checksum(filename):
    """Return the MD5 hex digest of a file's raw bytes.

    The file is read in binary mode so that files in any encoding (or with
    no text encoding at all) can be hashed, and so that newline translation
    never alters the bytes being compared. MD5 serves here only as a
    content fingerprint for duplicate detection, not as a security measure.

    Args:
        filename: Path of the file to hash.

    Returns:
        The 32-character lowercase hexadecimal MD5 digest.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    h = hashlib.md5()
    with open(filename, 'rb') as f:
        # Fixed-size reads keep memory use bounded for large files;
        # read() returns b'' at end of file, which ends the iteration.
        for chunk in iter(lambda: f.read(65536), b''):
            h.update(chunk)
    return h.hexdigest()
# Report every file whose content checksum has already been seen; the first
# file with a given checksum is remembered as the reference copy.
for item in find_specific_files(d):
    try:
        key = get_file_checksum(item)
    except (OSError, UnicodeDecodeError) as exc:
        # One unreadable file (permissions, broken symlink, undecodable
        # content) should not abort the whole scan.
        print(item, 'skipped:', exc)
        continue
    if key in a:
        print(item, 'is the same with', a[key])
    else:
        a[key] = item