# This script removes duplicated data from backups to reduce wasted storage space.
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import os
import datetime
import platform
import sys
import shutil
import hashlib
def mkdir(path):
    """Create directory *path*, including missing parents, if it is absent.

    Leading and trailing whitespace is stripped from *path* before use.
    A status line is printed when the directory is created or when the
    path already exists.

    :param path: directory path to create
    :return: True if the path already existed or was created, False if
        creation failed
    """
    path = path.strip()
    if os.path.exists(path):
        # Nothing to create; the existing path is reported as success
        # (no check is made that it is actually a directory).
        print(path + "is exist!")
        return True
    try:
        os.makedirs(path)
    except OSError:
        # Only filesystem errors are treated as "could not create";
        # KeyboardInterrupt and programming errors still propagate.
        return False
    print(path + " create successed!")
    return True
def CalcFileSha256(filname, bufsize=1024 * 1024 * 16):
    """Return the SHA-256 hex digest of a file's contents.

    The file is read in chunks of *bufsize* bytes so that large files
    are never loaded into memory in one piece.

    :param filname: path of the file to hash
    :param bufsize: number of bytes requested per read (default 16 MiB)
    :return: hexadecimal digest string
    :raises OSError: if the file cannot be opened or read
    """
    sha256obj = hashlib.sha256()
    with open(filname, "rb") as f:
        # read() returns b"" at end of file, which stops the iteration.
        for data in iter(lambda: f.read(bufsize), b""):
            sha256obj.update(data)
    return sha256obj.hexdigest()
class CoSailTextFile():
    """Thin wrapper around a text file handle that reports errors as values.

    ``open``, ``writeLine``, ``writeLines`` and ``close`` print the error
    and return False instead of raising; ``readLine`` returns None on
    failure. ``readAll`` and ``readLines`` delegate directly to the
    underlying handle and therefore raise if the file is not open.

    The object can also be used in a ``with`` statement; the handle is
    closed when the block exits.
    """

    def __init__(self, filename=None):
        self.filename = filename
        # Underlying file object, or None while the file is not open.
        self.f = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never suppress an exception raised inside the with-block.
        return False

    def open(self, m):
        """Open ``self.filename`` with mode *m*; return True on success."""
        if self.f is not None:
            # Do not leak a handle that is still open from an earlier call.
            self.close()
        try:
            self.f = open(self.filename, m)
        except (OSError, TypeError, ValueError) as e:
            print(e)
            self.f = None
            return False
        return True

    def writeLine(self, line):
        """Write *line* followed by a newline; return True on success."""
        if self.f is None:
            print("file is not open")
            return False
        try:
            self.f.write(line + "\n")
        except (OSError, TypeError, ValueError) as e:
            print(e)
            return False
        return True

    def writeLines(self, lines):
        """Write every item of *lines*; stop and return False on the first failure."""
        for line in lines:
            if not self.writeLine(line):
                return False
        return True

    def readLine(self):
        """Return the next line (including its newline), or None on failure."""
        if self.f is None:
            print("file is not open")
            return None
        try:
            return self.f.readline()
        except (OSError, ValueError) as e:
            print(e)
            return None

    def readAll(self):
        """Return the remaining file content; raises if the file is not open."""
        return self.f.read()

    def readLines(self):
        """Return the remaining lines as a list; raises if the file is not open."""
        return self.f.readlines()

    def close(self):
        """Close the file if it is open; return False only if closing failed."""
        if self.f is None:
            return True
        try:
            self.f.close()
        except (OSError, ValueError) as e:
            print(e)
            return False
        self.f = None
        return True

    def isOpen(self):
        """Return True while a file handle is held."""
        return self.f is not None
def getFilePathExtend(fileName):
    """Split a path into its directory, extension and base name.

    :param fileName: file path to split
    :return: tuple ``(directory, extension, name)`` where *directory* is
        ``os.path.dirname(fileName)``, *extension* is the suffix without
        its leading dot ("" when there is none) and *name* is the last
        path component including the extension
    """
    directory, name = os.path.split(fileName)
    suffix = os.path.splitext(fileName)[1]
    # splitext keeps the leading dot; callers get the bare extension.
    return directory, suffix[1:], name
# Module-level index filled by scansamenamefile: maps a grouping key
# (file name or SHA-256 digest) to a list of per-file attribute dicts.
filesdict = {}
def scansamenamefile(path, type="filename"):
    """Recursively index the files below *path*, grouping duplicates.

    Each non-directory entry is described by a dict with the keys
    "filepath", "size", "sha256" and "filename", which is appended to
    the list stored under its grouping key in the module-level
    ``filesdict``. Directories that are symbolic links are skipped.

    NOTE: entries accumulate in ``filesdict`` across calls; clear it
    before calling if a fresh result is wanted.

    :param path: directory to scan
    :param type: how duplicates are recognised: "filename" groups by
        file name, "sha256" groups by content digest; any other value
        puts every file under the key ""
    :return: the shared ``filesdict``, or None if *path* does not exist
        or cannot be listed
    """
    if not os.path.exists(path):
        print("Store Path :" + path + " isn't exist!")
        return None
    try:
        files = os.listdir(path)
    except FileNotFoundError:
        print("File Not FoundError")
        return None
    except OSError as e:
        # E.g. permission denied, or *path* is a file rather than a directory.
        print(e)
        return None
    for fi in files:
        fi_d = os.path.join(path, fi)
        if os.path.isdir(fi_d):
            # Do not follow directory symlinks.
            if not os.path.islink(fi_d):
                scansamenamefile(fi_d, type)
            continue
        file_path, _, file_name = getFilePathExtend(fi_d)
        tmpfullpath = file_path + "/" + file_name
        # Hash once and reuse the digest for both the key and the record.
        sha256 = CalcFileSha256(tmpfullpath)
        if type == "sha256":
            samekey = sha256
        elif type == "filename":
            samekey = file_name
        else:
            samekey = ""
        filesdict.setdefault(samekey, []).append({
            "filepath": tmpfullpath,
            "size": os.path.getsize(tmpfullpath),
            "sha256": sha256,
            "filename": file_name,
        })
    return filesdict
def statallfiles(d):
    """Return the total number of file records stored in *d*.

    :param d: dict mapping a grouping key to a list of file records
    :return: sum of the lengths of all lists in *d* (0 for an empty dict)
    """
    return sum(len(records) for records in d.values())
def statsamenamefiles(d):
    """Return the number of keys in *d* that hold two or more file records.

    :param d: dict mapping a grouping key to a list of file records
    :return: count of groups containing at least two records
    """
    return sum(1 for records in d.values() if len(records) >= 2)
def outduplicatefilemsg(d, textFile):
    """Write a report of every group in *d* that has two or more files.

    For each such group a summary line (name and digest of the first
    record plus the group size) is written, followed by one line per
    file in the group.

    :param d: dict mapping a grouping key to a list of file records;
        each record needs the keys "filename", "sha256", "filepath"
        and "size"
    :param textFile: object providing ``writeLine(str)``
    """
    textFile.writeLine("")
    textFile.writeLine("**************Duplicate file list*****************")
    for group in d.values():
        if len(group) < 2:
            continue
        first = group[0]
        textFile.writeLine("")
        textFile.writeLine(
            "file name : " + first["filename"]
            + " sha256 : " + first["sha256"]
            + " Duplicate : " + str(len(group))
        )
        for obj in group:
            textFile.writeLine(
                "fileputh :" + obj["filepath"]
                + " size : " + str(obj["size"])
                + " SHA256 = " + obj["sha256"]
            )
def outonefilemsg(d, textFile):
    """Write a report of every group in *d* that holds exactly one file.

    :param d: dict mapping a grouping key to a list of file records;
        each record needs the keys "filepath" and "size"
    :param textFile: object providing ``writeLine(str)``
    """
    textFile.writeLine("")
    textFile.writeLine("**************Unique file list*****************")
    for group in d.values():
        if len(group) != 1:
            continue
        obj = group[0]
        textFile.writeLine(
            "fileputh :" + obj["filepath"] + " size : " + str(obj["size"])
        )
def statpathfileattr(pathlistfilename, logpath, type="filename"):
    """Produce a duplicate-file report for every directory in a list file.

    :param pathlistfilename: text file naming the directories to analyse,
        one directory per line
    :param logpath: directory in which the report files are written; when
        None, the directory containing *pathlistfilename* is used
    :param type: duplicate criterion passed on to the scan
        ("filename" or "sha256")
    :return: None
    """
    if logpath is None:
        logpath, _, _ = getFilePathExtend(pathlistfilename)
        # A bare file name has an empty directory part, which would fail
        # the existence check below; fall back to the current directory.
        logpath = logpath or "."
    if not os.path.exists(pathlistfilename):
        print("path list filename isn't exist!")
        return
    if not os.path.exists(logpath):
        print("log path isn't exist")
        return
    f = CoSailTextFile(pathlistfilename)
    if not f.open("r"):
        # open() has already printed the reason.
        return
    try:
        lines = f.readLines()
    finally:
        f.close()
    for line in lines:
        line = line.strip('\n')
        if not line:
            # Blank lines name no directory.
            continue
        statpathfileattrbypath(line, logpath, type)
def backuppathfile(pathlistfilename, descpath, type="filename"):
    """Back up the directories named in a list file to a destination directory.

    Each listed directory is scanned and one file per duplicate group is
    copied below *descpath*.

    :param pathlistfilename: text file naming the directories to back up,
        one directory per line
    :param descpath: backup destination directory; must already exist
    :param type: duplicate criterion passed on to the scan
        ("filename" or "sha256")
    :return: None
    """
    if not os.path.exists(descpath):
        print("backup descpath isn't exist")
        return
    if not os.path.exists(pathlistfilename):
        print("path list filename isn't exist!")
        return
    f = CoSailTextFile(pathlistfilename)
    if not f.open("r"):
        # open() has already printed the reason.
        return
    try:
        lines = f.readLines()
    finally:
        f.close()
    for line in lines:
        line = line.strip('\n')
        if not line:
            # Blank lines name no directory.
            continue
        d = scansamenamefile(line, type)
        if d is None:
            continue
        backupfiletodest(descpath, line, d)
def statpathfileattrbypath(path, logpath, type="filename"):
    """Scan *path* and write file-count and duplicate statistics to a log file.

    The totals are also printed. The log file is created in *logpath*
    and named after *path* (separators replaced by "-", colons removed)
    followed by a timestamp and the ".log" extension.

    :param path: directory to analyse
    :param logpath: directory in which the log file is written
    :param type: duplicate criterion passed on to the scan
        ("filename" or "sha256")
    :return: None
    """
    d = scansamenamefile(path, type)
    if d is None:
        return
    total = statallfiles(d)
    duplicated = statsamenamefiles(d)
    print("allfiles", total)
    print("sameallfiles", duplicated)
    t = path.replace("/", "-").replace("\\", "-").replace(":", "")
    if platform.system() == 'Linux' and t.startswith("-"):
        # Drop the "-" left over from the leading "/" of an absolute path;
        # a relative path has no such prefix and is kept intact.
        t = t[1:]
    stamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    lonfilename = logpath + "/" + t + stamp + ".log"
    lonfilename = lonfilename.replace("//", "/")
    f = CoSailTextFile(lonfilename)
    if not f.open("w"):
        # open() has already printed the reason.
        return
    try:
        f.writeLine("stat path : " + path)
        f.writeLine("all files : " + str(total))
        f.writeLine("Duplicate files : " + str(duplicated))
        outduplicatefilemsg(d, f)
        outonefilemsg(d, f)
    finally:
        f.close()
def backupfiletodest(destpath, sourcepath, filedict=None):
    """Copy the first file of every group in *filedict* below *destpath*.

    The directory of each file relative to *sourcepath* is recreated
    under *destpath*. A file whose directory is not below *sourcepath*
    keeps its whole directory path underneath *destpath*. Copy errors
    are printed and do not stop the remaining copies.

    :param destpath: backup destination directory
    :param sourcepath: directory that was scanned to build *filedict*
    :param filedict: dict mapping a grouping key to a list of file
        records (each with a "filepath" key); None or empty means
        there is nothing to copy
    :return: None
    """
    if not filedict:
        return
    # Compare without a trailing separator so "src" and "src/" behave alike.
    prefix = sourcepath.rstrip("/\\") or sourcepath
    for obj in filedict.values():
        if len(obj) == 0:
            continue
        filefullpath = obj[0]["filepath"]
        srcdir, _, filename = getFilePathExtend(filefullpath)
        # Remove sourcepath only as a leading prefix; a plain
        # str.replace would also delete later occurrences in the path.
        if srcdir == prefix or srcdir.startswith((prefix + "/", prefix + "\\")):
            rel = srcdir[len(prefix):]
        else:
            rel = srcdir
        if rel and not rel.startswith(("/", "\\")):
            rel = "/" + rel
        destfullpath = (destpath + rel).replace("//", "/")
        if not os.path.exists(destfullpath) and not mkdir(destfullpath):
            print("create " + destfullpath + " failed")
            continue
        try:
            shutil.copyfile(filefullpath, os.path.join(destfullpath, filename))
        except OSError as e:
            # shutil.SameFileError is an OSError subclass as well.
            print(e)
if __name__ == '__main__':
    # Start from an empty index; the scan functions fill the shared dict.
    filesdict.clear()
    argv = sys.argv
    print(argv)
    # Disabled statistics entry point, kept for reference:
    #   if (len(argv) == 3):
    #       statpathfileattr(argv[1], argv[2])
    #   elif (len(argv) == 2):
    #       statpathfileattr(argv[1], None)
    #   else:
    #       statpathfileattr("E:/1.txt", "E:/")
    #
    # "E:/1.txt"      : directory list
    # "E:/wiredtirge" : backup destination directory
    # "sha256"        : how files are matched for copying, by SHA-256
    #                   digest, or "filename" for file name
    backuppathfile("E:/1.txt", "E:/wiredtirge", "sha256")
    #backupfiletodest("E:/wiredtirge", "E:/data",filesdict )
    # Ad-hoc digest comparison of two hard-coded files.
    print(type(CalcFileSha256("E:/gsl2.4.zip")))
    sha2561 = CalcFileSha256("E:/gsl2.4.zip")
    sha2562 = CalcFileSha256("E:/1.txt")
    if sha2561 == sha2562:
        print("true")
    print(sha2562)
    print(sha2561)
    #d = buildfilenamedictformfilelist("/home/hadoop/tmp/dir-and-files.list")
    # Root directory of the storage destination.
    # Disabled manual test, kept for reference:
    #   destrootpath = "/home/hadoop/tmp4"
    #   path = "E:/data"
    #   d = scansamenamefile(path)
    #   print(d)
    #   for k in d:
    #       print("key =", k)
    #       print("files = ", len(d[k]))
    #   print("allfiles", statallfiles(d))
    #   print("sameallfiles", statsamenamefiles(d))
    #   f = CoSailTextFile("e:/sss.log")
    #   f.open("w")
    #   f.writeLine("stat path : " + path)
    #   f.writeLine("all files : " + str(statallfiles(d)))
    #   f.writeLine("Duplicate files : " +str(statsamenamefiles(d)))
    #   outduplicatefilemsg(d, f)
    #   outonefilemsg(d, f)
    #   f.close()
    #   f = CoSailTextFile("E:/1.txt")
    #   f.open("r")
    #   text = f.readLines()
    #   print((text))