Python文件去重工具

只需要先了解一下遍历文件的方法 os.walk() 和计算文件 MD5 的方法,
然后在此基础上稍作修改即可。

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import argparse
import hashlib
import os
import sys
import tempfile


def print_dedu(groups=None):
    """Print every group of files that share the same MD5 digest.

    Groups that hold fewer than two paths are skipped because they contain
    no duplicate. For each remaining group the digest is printed first, then
    one path per line, then an empty separator line.

    Args:
        groups: Optional mapping of MD5 hex digest -> list of file paths.
            When omitted, the module-level ``dedu_dict`` is used.
    """
    if groups is None:
        groups = dedu_dict
    for file_md5, filelist in groups.items():
        if len(filelist) < 2:
            continue
        # Single-argument print(...) produces the same output on Python 2
        # and Python 3, unlike the Python 2-only print statement.
        print("md5:{}".format(file_md5))
        for filename in filelist:
            print("{}".format(filename))
        print("")

def exec_dedu(groups=None):
    """Delete duplicate files, keeping exactly one file per MD5 group.

    For every group with at least two paths, the last path in the list is
    kept and every other path is removed from disk with ``os.remove``. Each
    removal is echoed as ``rm <path>``.

    Args:
        groups: Optional mapping of MD5 hex digest -> list of file paths.
            When omitted, the module-level ``dedu_dict`` is used. The
            mapping and its lists are not modified.

    Raises:
        OSError: propagated from ``os.remove`` if a file cannot be deleted.
    """
    if groups is None:
        groups = dedu_dict
    for file_md5, filelist in groups.items():
        if len(filelist) < 2:
            continue
        print("md5:{}".format(file_md5))
        # Slice instead of pop(): the survivor is still the last entry, but
        # the shared list keeps describing the whole group afterwards.
        for filename in filelist[:-1]:
            print("rm {}".format(filename))
            os.remove(filename)
        print("")

# Command-line interface: target directory, scan-only switch and an optional
# report file.
parser = argparse.ArgumentParser(description="This is a de-duplicate tool")

# nargs="?" makes the positional optional; without it argparse always demands
# a directory and default="." can never take effect.
parser.add_argument("dir", nargs="?", default=".",
                    help="target directory")
parser.add_argument("-s", "--safe", "--scan",
                    action="store_true", dest="not_delete",
                    help="scan directory only ,don't delete file")
parser.add_argument("-o", "--output", type=argparse.FileType("w"), default=None,
                    help="output of scan result")
args = parser.parse_args()

# Single-argument print(...) behaves identically on Python 2 and Python 3.
print("[INFO]dir:{}".format(args.dir))
if args.not_delete:
    print("[INFO]we are in safe mode.")
not_delete = args.not_delete

# Validate the target before any temporary log is created, so an invalid
# path does not leave a stray "dedu-*.log" file behind.
if not os.path.isdir(args.dir):
    print("dir{} is not exists!".format(args.dir))
    sys.exit(-1)
target_dir = args.dir

# Report destination: the file given with -o/--output, otherwise a fresh
# temporary file named "dedu-*.log".
if args.output:
    output_fifo = args.output
    output_filename = args.output.name
else:
    fd, output_filename = tempfile.mkstemp(prefix="dedu-", suffix=".log")
    output_fifo = os.fdopen(fd, "w")

print("[INFO]output filename:{}".format(output_filename))
# NOTE(review): this looks like a leftover placeholder line in the report;
# kept as-is so the report's first line is unchanged.
output_fifo.write("Hello World\n")
# let's traverse_directory
def md5(fname):
    """Return the hexadecimal MD5 digest of the file at ``fname``.

    The file is opened in binary mode and consumed in 4096-byte pieces, so
    the whole content is never held in memory at once. Adapted from a Stack
    Overflow answer on generating an MD5 checksum of a file.
    """
    digest = hashlib.md5()
    with open(fname, "rb") as stream:
        while True:
            piece = stream.read(4096)
            if not piece:
                # An empty read means end of file.
                break
            digest.update(piece)
    return digest.hexdigest()

# Group every file below target_dir by the MD5 digest of its content:
# digest -> list of full paths.
dedu_dict = {}
# Resolved once so the report file can be recognised inside the walk.
report_path = os.path.realpath(output_filename)
for dirpath, subdir_names, file_names in os.walk(target_dir):
    for filename in file_names:
        full_filename = os.path.join(dirpath, filename)
        # Never hash (and later possibly delete) the tool's own report file
        # when it happens to live inside the scanned tree.
        if os.path.realpath(full_filename) == report_path:
            continue
        dedu_dict.setdefault(md5(full_filename), []).append(full_filename)

print_dedu()

# Persist the scan result to the report file as well, in the same layout
# that print_dedu() shows on stdout, then always release the handle.
try:
    for file_md5, filelist in dedu_dict.items():
        if len(filelist) < 2:
            continue
        output_fifo.write("md5:{}\n".format(file_md5))
        for filename in filelist:
            output_fifo.write("{}\n".format(filename))
        output_fifo.write("\n")
finally:
    output_fifo.close()

# Deletion only happens when the scan-only flag was not given.
if not not_delete:
    exec_dedu()

你可能感兴趣的:(python)