只需要稍微查看一下文件遍历的方法 os.walk() 和计算文件 md5 的方法,
之后再稍作修改即可。
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import hashlib
import os
import sys
import tempfile
def print_dedu(dedu=None):
    """Print every group of duplicate files to standard output.

    For each digest that maps to two or more paths this prints a
    ``md5:<digest>`` header, one path per line, and then an empty line.
    Digests with fewer than two paths are not duplicates and are skipped.

    Args:
        dedu: mapping of digest -> list of file paths.  When omitted, the
            module-level ``dedu_dict`` is used.
    """
    if dedu is None:
        dedu = dedu_dict
    for file_md5, filelist in dedu.items():
        if len(filelist) < 2:
            continue
        # Single-argument print(...) behaves the same on Python 2 and 3.
        print("md5:{}".format(file_md5))
        for filename in filelist:
            print("{}".format(filename))
        print("")
def exec_dedu(dedu=None):
    """Delete duplicate files, keeping one file per digest.

    For each digest that maps to two or more paths, every path except the
    last one in the list is removed from disk.  Each removal is echoed as
    ``rm <path>``.  The mapping itself is left untouched.

    Args:
        dedu: mapping of digest -> list of file paths.  When omitted, the
            module-level ``dedu_dict`` is used.
    """
    if dedu is None:
        dedu = dedu_dict
    for file_md5, filelist in dedu.items():
        if len(filelist) < 2:
            continue
        print("md5:{}".format(file_md5))
        # Slice instead of pop(): the last entry is the survivor, and the
        # caller's lists must still describe the full group afterwards.
        for filename in filelist[:-1]:
            print("rm {}".format(filename))
            try:
                os.remove(filename)
            except OSError as err:
                # One undeletable file should not abort the remaining work.
                print("[WARN]cannot remove {}: {}".format(filename, err))
        print("")
# ---- command line handling -------------------------------------------------
parser = argparse.ArgumentParser(description="This is a de-duplicate tool")
# nargs="?" makes the positional optional; without it argparse always
# requires the argument and default="." can never take effect.
parser.add_argument("dir", nargs="?", default=".",
                    help="target directory")
parser.add_argument("-s", "--safe", "--scan",
                    action="store_true", dest="not_delete",
                    help="scan directory only ,don't delete file")
parser.add_argument("-o", "--output", type=argparse.FileType("w"), default=None,
                    help="output of scan result")
args = parser.parse_args()

print("[INFO]dir:{}".format(args.dir))
if args.not_delete:
    print("[INFO]we are in safe mode.")
not_delete = args.not_delete

# check parameter
# Validate the target before creating a temporary log, so that a bad path
# does not leave a stray dedu-*.log file behind.
if not os.path.isdir(args.dir):
    print("dir{} is not exists!".format(args.dir))
    if args.output:
        # FileType("w") already opened the file while parsing.
        args.output.close()
    sys.exit(-1)
target_dir = args.dir

# Report destination: the file given with -o, otherwise a fresh temp file.
if args.output:
    output_fifo = args.output
    output_filename = args.output.name
else:
    fd, output_filename = tempfile.mkstemp(prefix="dedu-", suffix=".log")
    output_fifo = os.fdopen(fd, "w")
print("[INFO]output filename:{}".format(output_filename))

# let's traverse_directory
def md5(fname):
""" from http://stackoverflow.com/quest- ions/3431825/generating-a-md5-checksum-of-a-file """
hash = hashlib.md5()
with open(fname, "rb") as f:
for chunk in iter(lambda: f.read(4096), b""):
hash.update(chunk)
return hash.hexdigest()
# ---- scan: group the files under target_dir by content digest --------------
dedu_dict = {}
# The report file may live inside the scanned tree; it must never be
# treated as a duplicate candidate.
report_path = os.path.abspath(output_filename)
for dirpath, subdirList, subfileList in os.walk(target_dir):
    for filename in subfileList:
        full_filename = os.path.join(dirpath, filename)
        # A symlink hashes the same as its target; pairing them as
        # duplicates could delete the real file and leave a dangling link.
        if os.path.islink(full_filename):
            continue
        if os.path.abspath(full_filename) == report_path:
            continue
        try:
            file_md5 = md5(full_filename)
        except (IOError, OSError) as err:
            # An unreadable file should not abort the whole scan.
            print("[WARN]skip {}: {}".format(full_filename, err))
            continue
        dedu_dict.setdefault(file_md5, []).append(full_filename)

# ---- report ----------------------------------------------------------------
print_dedu()
# argparse.FileType maps "-" to sys.stdout.  In that case the result has
# already been printed above, and closing stdout would break the prints
# issued later by exec_dedu().
if output_fifo is not sys.stdout:
    try:
        # Same layout as the console output: header, paths, blank line.
        for file_md5, filelist in dedu_dict.items():
            if len(filelist) < 2:
                continue
            output_fifo.write("md5:{}\n".format(file_md5))
            for filename in filelist:
                output_fifo.write("{}\n".format(filename))
            output_fifo.write("\n")
    finally:
        output_fifo.close()

# ---- delete (skipped in safe mode) -----------------------------------------
if not not_delete:
    exec_dedu()