python 按日期整理照片视频 自动归档 去除重复节省空间

随着手机拍照越来越方便,记录日常生活的照片和视频越来越多。每次手机满了就备份到电脑里,一家人的手机备份加起来已经有 300 多 GB 了,而且我还要备份两份以防丢失。我总觉得里面的照片和视频有重复,只是苦于没时间写代码。最近学了一下 Python,觉得挺简单,又有这么多库可以使用,就花了点时间把多年来想整理却一直没做的事情完成了。代码很粗陋,请大家多多指教。

1、把“文件夹”里的文件移动到“文件夹_新”和“文件夹_新_重复”两个文件夹里:“文件夹_新”里是需要保留的内容,“文件夹_新_重复”里是可以删除的重复文件。

2、按时间归档,依次尝试:图片 EXIF 时间、视频拍摄时间、文件名里的时间戳、文件名里的日期,最后才使用文件备份到电脑的时间(文件创建时间)。归档后的目录结构为:年/月/图片 或 年/月/视频。

import hashlib
from logging import error
import os
import multiprocessing
import shutil
import re
import time
import exifread
import imghdr
import datetime
import pytz
import filetype
from win32com.propsys import propsys, pscon
from PIL import Image

# Folder to organise; overwritten with the user's input when run as a script.
GL_MYDIR = ""


# Patterns that locate a year/month/day triple in a file name or metadata
# string: YYYYMMDD, YYYY_MM_DD, YYYY:MM:DD and YYYY-MM-DD, each followed by one
# non-digit character.  Groups 1-3 capture year, month and day.
# The character classes contain no commas on purpose: "[1,2]" would also accept
# a literal "," as the first character of the year.
GL_DATE_RE = [r"([12]\d{3})([01]\d)([0-3]\d)[^\d]",
              r"([12]\d{3})_([01]\d)_([0-3]\d)[^\d]",
              r"([12]\d{3}):([01]\d):([0-3]\d)[^\d]",
              r"([12]\d{3})-([01]\d)-([0-3]\d)[^\d]"]

# Accepted year range for a detected date: 2000 up to the current local year.
GL_YAER_MAX = time.localtime(time.time()).tm_year
GL_YAER_MIN = 2000
# =========================计算哈希码===========================


def calcmd5(q, filepath):
    """Fingerprint one file and push the result onto a queue.

    The fingerprint is the MD5 of the file's first line (the bytes up to and
    including the first newline byte) followed by the file size.  The file is
    not read in full because that was too slow, so two files sharing both
    values get the same fingerprint even if they differ further on.

    Args:
        q: Queue-like object; receives ``{"hash": <hex digest>, "path": filepath}``.
        filepath: Path of the file to fingerprint.

    Raises:
        OSError: If the file cannot be inspected or read.  Nothing is put on
            the queue in that case.
    """
    file_size = os.path.getsize(filepath)
    # The size used to be packed into exactly 4 bytes, which raised
    # OverflowError for files of 4 GiB and more, so no result was ever queued
    # for them.  Keep 4 bytes while they suffice (digests of smaller files are
    # unchanged) and widen only when the size needs it.
    size_width = max(4, (file_size.bit_length() + 7) // 8)
    size_bytes = file_size.to_bytes(size_width, 'big')
    with open(filepath, 'rb') as f:
        first_line = f.readline()
    md5obj = hashlib.md5()
    md5obj.update(first_line)
    md5obj.update(size_bytes)
    q.put({"hash": md5obj.hexdigest(), "path": filepath})


# =========================进度提示===========================
def rate(q, k):
    """Collect ``k`` results from the queue, printing progress for each one.

    Args:
        q: Queue whose items are dicts carrying at least a ``"hash"`` key.
        k: Number of items to wait for.

    Returns:
        list: The received items in arrival order; empty when ``k`` is 0.
    """
    data_list = []
    for i in range(1, k + 1):
        # A blocking get sleeps until the next item arrives, instead of
        # spinning on q.empty() and keeping a CPU core busy.
        value = q.get(True)
        data_list.append(value)
        print(i, 'Get %s ' % value["hash"], end='')
        print(" 总进度:%.2f %%,%d" % ((i*100/k), k))
    return data_list
# ==========================格式化时间=======================


def TimeStampToTime(timestamp):
    """Format an epoch timestamp as a ``YYYY年/MM月`` folder fragment (local time)."""
    # The non-ASCII characters are filled in after strftime via str.format.
    template = time.strftime("%Y{y}/%m{m}", time.localtime(timestamp))
    return template.format(y='年', m='月')


# =========================图片exif信息=======================
def image_exif_date2(path):
    """Read a date string from an image's EXIF tags using ``exifread``.

    Every tag whose name matches ``.*Date.*`` is considered; the value of the
    last such tag in iteration order is returned as a string.

    Args:
        path: Path of the image file.

    Returns:
        str: The tag value converted with ``str``, or ``""`` if no tag matched.
    """
    with open(path, 'rb') as image_file:
        exif_tags = exifread.process_file(image_file)
        dated_values = [str(tag_value)
                        for tag_name, tag_value in exif_tags.items()
                        if re.match('.*Date.*', tag_name)]
    return dated_values[-1] if dated_values else ""


def image_exif_date(path):
    """Return the EXIF capture date of an image as a string (best effort).

    Pillow is tried first, reading EXIF tag 36867 (DateTimeOriginal in the
    EXIF standard).  If that fails for any reason (no EXIF data, tag missing,
    unreadable image), ``image_exif_date2`` is used as a fallback.

    Args:
        path: Path of the image file.

    Returns:
        str: The date text, or ``""`` when the file does not exist or no date
        could be read by either method.
    """
    date = ""
    try:
        if os.path.exists(path):
            # The context manager closes the file handle right away; an image
            # left open can keep the file locked when it is moved afterwards.
            with Image.open(path) as img:
                exif_data = img._getexif()
            date = exif_data[36867]
        return date
    except Exception:
        # Pillow raises many different exception types here; any failure just
        # means "try the other reader".
        pass
    try:
        return image_exif_date2(path)
    except Exception:
        # A file that neither library can parse must not abort the whole run;
        # the caller falls back to other date sources on "".
        return ""
# =========================视频exif信息=========================


def mp4_exif_date(path):
    """Read the "date encoded" media property of a video via the Windows property store.

    Args:
        path: Path of the video file.

    Returns:
        The property value unchanged if it is already a ``datetime`` or is
        falsy; otherwise the value converted with ``datetime.fromtimestamp``
        and tagged with the UTC timezone.  ``""`` if anything raises.
    """
    try:
        store = propsys.SHGetPropertyStoreFromParsingName(path)
        encoded = store.GetValue(pscon.PKEY_Media_DateEncoded).GetValue()
        if isinstance(encoded, datetime.datetime) or not encoded:
            return encoded
        # NOTE(review): fromtimestamp() yields local time, which is then
        # labelled UTC -- confirm this is intended.
        stamped = datetime.datetime.fromtimestamp(int(encoded))
        return stamped.replace(tzinfo=pytz.timezone('UTC'))
    except Exception:
        return ""


# =========================判断是否为图片========================
def imge_flag(path):
    """Tell whether ``path`` has one of the image extensions this script handles.

    The decision is made from the file extension only (via ``file_flag``).
    """
    image_suffixes = ['jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'gif']
    return file_flag(path, image_suffixes)

# =========================判断是否为视频========================


def mp4_flag(path):
    """Tell whether ``path`` has one of the video extensions this script handles.

    The decision is made from the file extension only (via ``file_flag``).
    """
    video_suffixes = ['mp4', 'm4v', 'mkv', 'webm',
                      'mov', 'wmv', 'avi', 'mpg', 'flv', '3gp']
    return file_flag(path, video_suffixes)

# =========================判断后缀========================


def file_flag(path, type_list):
    """Check a path's extension against a collection of allowed suffixes.

    Args:
        path: File path; only its final component is inspected.
        type_list: Allowed extensions, lower-case and without the dot.

    Returns:
        bool: True if the lower-cased extension (dot removed) is in ``type_list``.
    """
    _, dotted_suffix = os.path.splitext(os.path.basename(path))
    return dotted_suffix.replace(".", "").lower() in type_list
# ==========================获得文件日期=======================


def _valid_year_month(year, month):
    """Return True if the year/month strings are numeric and within the accepted range."""
    try:
        year_no, month_no = int(year), int(month)
    except ValueError:
        # Non-numeric capture (e.g. a stray punctuation character): not a date.
        return False
    return int(GL_YAER_MIN) <= year_no <= int(GL_YAER_MAX) and 1 <= month_no <= 12


def _date_dir_from_text(text):
    """Return ``YYYY年/MM月`` for the first GL_DATE_RE pattern giving a plausible date, else ``""``."""
    for item_re in GL_DATE_RE:
        date_match = re.search(item_re, text)
        if date_match is None:
            continue
        year, month = date_match.group(1), date_match.group(2)
        if _valid_year_month(year, month):
            return "%s年/%s月" % (year, month)
    return ""


def getfiledate(path):
    """Work out the dated sub-folder a file belongs in.

    Sources are tried in this order until one yields a plausible date
    (year between GL_YAER_MIN and GL_YAER_MAX, month 1-12):

    1. image EXIF date (files with an image extension only),
    2. video "date encoded" property (files with a video extension only),
    3. a 13-digit millisecond timestamp in the file name,
    4. a date in the file name matching one of GL_DATE_RE,
    5. the file's creation time as reported by ``os.path.getctime``.

    Args:
        path: Path of the file to classify.

    Returns:
        str: ``"YYYY年/MM月"``, followed by ``"/图片"`` (pictures) or
        ``"/视频"`` (videos) when the extension is a known image or video
        type; ``""`` when no source produced a plausible date.
    """
    is_image = imge_flag(path)
    is_video = mp4_flag(path)
    filename_str = os.path.basename(path)
    date_str = ""

    # 1. EXIF date of an image.
    if is_image:
        date_str = _date_dir_from_text(str(image_exif_date(path)))

    # 2. Media property of a video.
    if not date_str and is_video:
        video_date = mp4_exif_date(path)
        if video_date:
            date_str = _date_dir_from_text(str(video_date))

    # 3. Millisecond epoch timestamp embedded in the file name.
    if not date_str:
        stamp_match = re.search(r"[^\d](\d{13})[^\d]", filename_str)
        if stamp_match is not None:
            try:
                candidate = TimeStampToTime(int(stamp_match.group(1)) // 1000)
            except (OverflowError, OSError, ValueError):
                # Timestamp outside what the platform can convert.
                candidate = ""
            if candidate and _valid_year_month(candidate[0:4], candidate[-3:-1]):
                date_str = candidate

    # 4. Calendar date embedded in the file name.
    if not date_str:
        date_str = _date_dir_from_text(filename_str)

    # 5. Creation time of the file on this computer.
    if not date_str:
        candidate = TimeStampToTime(os.path.getctime(path))
        if _valid_year_month(candidate[0:4], candidate[-3:-1]):
            date_str = candidate

    if date_str:
        if is_image:
            date_str = date_str + "/图片"
        elif is_video:
            date_str = date_str + "/视频"
    return date_str

# ==========================得到新文件地址=======================
def getfilenewpath(newdir, oldpath):
    """Build the destination path for a file and make sure its folder exists.

    Args:
        newdir: Root of the output tree.
        oldpath: Current path of the file.

    Returns:
        str: ``newdir`` + dated sub-folder from ``getfiledate`` (omitted when
        it is empty) + the original file name.
    """
    date_str = getfiledate(oldpath)
    # Compare by value: `is not ""` tests object identity, which is not
    # guaranteed for equal strings and triggers a SyntaxWarning.
    if date_str != "":
        newdir = os.path.join(newdir, date_str)
    newpath = os.path.join(newdir, os.path.basename(oldpath))
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(os.path.dirname(newpath), exist_ok=True)
    return newpath


# =========================移动文件===========================
def _unused_path(taken_path):
    """Return a sibling of ``taken_path`` that does not exist yet (``new_``, ``new_2_``, ... prefix)."""
    folder, name = os.path.split(taken_path)
    candidate = os.path.join(folder, "new_" + name)
    serial = 2
    while os.path.exists(candidate):
        candidate = os.path.join(folder, "new_%d_%s" % (serial, name))
        serial += 1
    return candidate


def movefile(newdata_list, newdir):
    """Move every listed file into its dated folder below ``newdir``.

    Entries whose source path no longer exists are skipped.  If the
    destination name is taken, the file is stored under a ``new_`` prefix;
    further collisions get ``new_2_``, ``new_3_`` and so on, so an earlier
    file is never overwritten.

    Args:
        newdata_list: Iterable of dicts with a ``"path"`` key.
        newdir: Root of the output tree; created if missing.
    """
    os.makedirs(newdir, exist_ok=True)
    moved = 0
    for item in newdata_list:
        source = item["path"]
        if not os.path.exists(source):
            continue
        target = getfilenewpath(newdir, source)
        if os.path.exists(target):
            target = _unused_path(target)
        moved += 1
        # shutil.copy(source, target) would keep the original in place.
        shutil.move(source, target)
        print(moved, "move", source)
#  =========================主函数===========================


def main():
    """Fingerprint every file below GL_MYDIR, then move uniques and duplicates apart.

    Files are fingerprinted by ``calcmd5`` in a pool of four processes while
    ``rate`` collects the results.  The first file seen for each hash value is
    moved to ``GL_MYDIR + "_新"``; the remaining files of every hash that
    occurs more than once are moved to ``GL_MYDIR + "_新_重复"``.  Files whose
    fingerprint could not be computed are reported and left where they are.
    """
    failed_marker = "HASH_FAILED"

    # Walk the tree once so that the number of submitted jobs and the number
    # of results `rate` waits for can never disagree.
    paths = []
    for root, dirs, files in os.walk(GL_MYDIR):
        for file in files:
            hashfile = os.path.join(root, file)
            if os.path.exists(hashfile):
                paths.append(hashfile)
            else:
                print("no filename")

    # Process pool plus a manager queue shared with the workers.
    po = multiprocessing.Pool(4)
    manager = multiprocessing.Manager()
    q = manager.Queue()
    data = po.apply_async(rate, args=(q, len(paths)))

    def report_failure(path):
        # A worker that raises never queues a result; queue a marker instead
        # so that `rate` still receives one item per file and cannot hang.
        def callback(exc):
            print("哈希计算失败,已跳过:", path, exc)
            q.put({"hash": failed_marker, "path": path})
        return callback

    for hashfile in paths:
        po.apply_async(calcmd5, args=(q, hashfile),
                       error_callback=report_failure(hashfile))

    po.close()
    # get() blocks until `rate` has finished (the old `if data.successful:`
    # tested the bound method itself, which is always truthy).
    data_list = [item for item in data.get() if item["hash"] != failed_marker]
    po.join()

    seen_hashes = set()
    repeated_hashes = set()
    newdata_list = []
    print("非重复的文件:")
    for item in data_list:
        if item["hash"] not in seen_hashes:
            seen_hashes.add(item["hash"])
            newdata_list.append(item)
            print(len(newdata_list), "only", item["hash"])
        else:
            repeated_hashes.add(item["hash"])

    # Every file of a repeated hash is listed here, including the first copy;
    # that one has already been moved by the time this list is processed and
    # is skipped by movefile because its source path is gone.
    print("重复的文件:")
    newdata2_list = [item for item in data_list
                     if item["hash"] in repeated_hashes]
    for i, item in enumerate(newdata2_list, 1):
        print(i, "repeat", item["hash"])

    print("开始移动非重复的文件【%d】:" % len(newdata_list))
    movefile(newdata_list, GL_MYDIR+"_新")
    print("开始移动重复的文件【%d】:" % len(newdata2_list))
    movefile(newdata2_list, GL_MYDIR+"_新_重复")
    print("文件移动到%s成功!" % (GL_MYDIR+"_新"))
        


if __name__ == "__main__":
    # Drop surrounding whitespace and double quotes (e.g. from a pasted,
    # quoted path) and normalise the path: with a trailing separator the
    # "_新" output folders would otherwise be created inside the source folder.
    _raw_dir = input("请输入文件夹完整路径:").strip().strip('"')
    GL_MYDIR = os.path.normpath(_raw_dir) if _raw_dir else _raw_dir
    # Wall-clock timing of the whole run.
    begin_time = time.time()
    main()
    run_time = time.time() - begin_time
    print("运行时间", round(run_time, 2), "秒")
    # Runs the shell's "pause" command so the console stays open afterwards.
    os.system("pause")

测试发现,单个文件大于 4GB 时程序会出现卡死现象,暂时我还没弄懂原因,呵呵。刚开始我以为是列表一次存储 4 万多个文件、占用内存太大才卡死的,于是又写了个数据库版,结果一样卡死,可见原因是单个文件太大。

数据库版

按日期整理照片视频【数据库版】.py

import hashlib
import os
import multiprocessing
import shutil
import re
import time
import exifread
import imghdr
import datetime
import pytz
import filetype
from win32com.propsys import propsys, pscon
from PIL import Image
import sqlite3


# Folder to organise; overwritten with the user's input when run as a script.
GL_MYDIR = ""


# Patterns that locate a year/month/day triple in a file name or metadata
# string: YYYYMMDD, YYYY_MM_DD, YYYY:MM:DD and YYYY-MM-DD, each followed by one
# non-digit character.  Groups 1-3 capture year, month and day.
# The character classes contain no commas on purpose: "[1,2]" would also accept
# a literal "," as the first character of the year.
GL_DATE_RE = [r"([12]\d{3})([01]\d)([0-3]\d)[^\d]",
              r"([12]\d{3})_([01]\d)_([0-3]\d)[^\d]",
              r"([12]\d{3}):([01]\d):([0-3]\d)[^\d]",
              r"([12]\d{3})-([01]\d)-([0-3]\d)[^\d]"]

# Accepted year range for a detected date: 2000 up to the current local year.
GL_YAER_MAX = time.localtime(time.time()).tm_year
GL_YAER_MIN = 2000
# Module-wide SQLite connection; opens (or creates) test.db in the working directory.
CONN = sqlite3.connect('test.db')
# =========================计算哈希码===========================


def calcmd5(q, filepath):
    """Fingerprint one file and push the result onto a queue.

    The fingerprint is the MD5 of the file's first line (the bytes up to and
    including the first newline byte) followed by the file size.  The file is
    not read in full because that was too slow, so two files sharing both
    values get the same fingerprint even if they differ further on.

    Args:
        q: Queue-like object; receives ``{"hash": <hex digest>, "path": filepath}``.
        filepath: Path of the file to fingerprint.

    Raises:
        OSError: If the file cannot be inspected or read.  Nothing is put on
            the queue in that case.
    """
    file_size = os.path.getsize(filepath)
    # The size used to be packed into exactly 4 bytes, which raised
    # OverflowError for files of 4 GiB and more, so no result was ever queued
    # for them.  Keep 4 bytes while they suffice (digests of smaller files are
    # unchanged) and widen only when the size needs it.
    size_width = max(4, (file_size.bit_length() + 7) // 8)
    size_bytes = file_size.to_bytes(size_width, 'big')
    with open(filepath, 'rb') as f:
        first_line = f.readline()
    md5obj = hashlib.md5()
    md5obj.update(first_line)
    md5obj.update(size_bytes)
    q.put({"hash": md5obj.hexdigest(), "path": filepath})


# =========================进度提示===========================
def rate(q, k):
    """Store ``k`` results from the queue in table ``data_list``, printing progress.

    The table is created, or emptied if it already exists, before the first
    row is written; all rows are committed together at the end.

    Args:
        q: Queue whose items are dicts with ``"hash"`` and ``"path"`` keys.
        k: Number of items to wait for.

    Returns:
        bool: Always True once ``k`` items have been stored and committed.
    """
    cur = CONN.cursor()
    create_table("data_list")
    for i in range(1, k + 1):
        # A blocking get sleeps until the next item arrives, instead of
        # spinning on q.empty() and keeping a CPU core busy.
        value = q.get(True)
        # Bound parameters: a path containing a quote character would break
        # (or alter) an SQL statement assembled with string formatting.
        cur.execute("INSERT INTO data_list VALUES(?, ?)",
                    (value["hash"], value["path"]))
        print(i, 'Get %s ' % value["hash"], end='')
        print(" 总进度:%.2f %%,%d" % (i*100/k, k))
    CONN.commit()
    print('hash完成')
    return True


def create_table(table_name):
    """Make sure an empty two-column table (hash TEXT, path TEXT) exists.

    If a table of that name is already listed in ``sqlite_master`` its rows
    are deleted; otherwise the table is created.  No commit is issued here.

    Args:
        table_name: Name of the table; interpolated into the SQL text as is.
    """
    cursor = CONN.cursor()
    cursor.execute('''SELECT tbl_name FROM sqlite_master WHERE type = 'table' ''')
    existing_tables = [row[0] for row in cursor.fetchall()]

    if table_name in existing_tables:
        # Table is already there: empty it for the new run.
        CONN.cursor().execute("Delete from %s" % table_name)
        print(table_name + ' 已经存在')
        return

    # Table is missing: create it.
    cursor.execute("CREATE TABLE %s(hash TEXT,path TEXT)" % table_name)
    print(table_name + ' 创建成功')
# ==========================格式化时间=======================


def TimeStampToTime(timestamp):
    """Format an epoch timestamp as a ``YYYY年/MM月`` folder fragment (local time)."""
    # The non-ASCII characters are filled in after strftime via str.format.
    template = time.strftime("%Y{y}/%m{m}", time.localtime(timestamp))
    return template.format(y='年', m='月')


# =========================图片exif信息=======================
def image_exif_date2(path):
    """Read a date string from an image's EXIF tags using ``exifread``.

    Every tag whose name matches ``.*Date.*`` is considered; the value of the
    last such tag in iteration order is returned as a string.

    Args:
        path: Path of the image file.

    Returns:
        str: The tag value converted with ``str``, or ``""`` if no tag matched.
    """
    with open(path, 'rb') as image_file:
        exif_tags = exifread.process_file(image_file)
        dated_values = [str(tag_value)
                        for tag_name, tag_value in exif_tags.items()
                        if re.match('.*Date.*', tag_name)]
    return dated_values[-1] if dated_values else ""


def image_exif_date(path):
    """Return the EXIF capture date of an image as a string (best effort).

    Pillow is tried first, reading EXIF tag 36867 (DateTimeOriginal in the
    EXIF standard).  If that fails for any reason (no EXIF data, tag missing,
    unreadable image), ``image_exif_date2`` is used as a fallback.

    Args:
        path: Path of the image file.

    Returns:
        str: The date text, or ``""`` when the file does not exist or no date
        could be read by either method.
    """
    date = ""
    try:
        if os.path.exists(path):
            # The context manager closes the file handle right away; an image
            # left open can keep the file locked when it is moved afterwards.
            with Image.open(path) as img:
                exif_data = img._getexif()
            date = exif_data[36867]
        return date
    except Exception:
        # Pillow raises many different exception types here; any failure just
        # means "try the other reader".
        pass
    try:
        return image_exif_date2(path)
    except Exception:
        # A file that neither library can parse must not abort the whole run;
        # the caller falls back to other date sources on "".
        return ""
# =========================视频exif信息=========================


def mp4_exif_date(path):
    """Read the "date encoded" media property of a video via the Windows property store.

    Args:
        path: Path of the video file.

    Returns:
        The property value unchanged if it is already a ``datetime`` or is
        falsy; otherwise the value converted with ``datetime.fromtimestamp``
        and tagged with the UTC timezone.  ``""`` if anything raises.
    """
    try:
        store = propsys.SHGetPropertyStoreFromParsingName(path)
        encoded = store.GetValue(pscon.PKEY_Media_DateEncoded).GetValue()
        if isinstance(encoded, datetime.datetime) or not encoded:
            return encoded
        # NOTE(review): fromtimestamp() yields local time, which is then
        # labelled UTC -- confirm this is intended.
        stamped = datetime.datetime.fromtimestamp(int(encoded))
        return stamped.replace(tzinfo=pytz.timezone('UTC'))
    except Exception:
        return ""


# =========================判断是否为图片========================
def imge_flag(path):
    """Tell whether ``path`` has one of the image extensions this script handles.

    The decision is made from the file extension only (via ``file_flag``).
    """
    image_suffixes = ['jpg', 'bmp', 'png', 'jpeg', 'rgb', 'tif', 'gif']
    return file_flag(path, image_suffixes)

# =========================判断是否为视频========================


def mp4_flag(path):
    """Tell whether ``path`` has one of the video extensions this script handles.

    The decision is made from the file extension only (via ``file_flag``).
    """
    video_suffixes = ['mp4', 'm4v', 'mkv', 'webm',
                      'mov', 'wmv', 'avi', 'mpg', 'flv', '3gp']
    return file_flag(path, video_suffixes)

# =========================判断后缀========================


def file_flag(path, type_list):
    """Check a path's extension against a collection of allowed suffixes.

    Args:
        path: File path; only its final component is inspected.
        type_list: Allowed extensions, lower-case and without the dot.

    Returns:
        bool: True if the lower-cased extension (dot removed) is in ``type_list``.
    """
    _, dotted_suffix = os.path.splitext(os.path.basename(path))
    return dotted_suffix.replace(".", "").lower() in type_list
# ==========================获得文件日期=======================


def _valid_year_month(year, month):
    """Return True if the year/month strings are numeric and within the accepted range."""
    try:
        year_no, month_no = int(year), int(month)
    except ValueError:
        # Non-numeric capture (e.g. a stray punctuation character): not a date.
        return False
    return int(GL_YAER_MIN) <= year_no <= int(GL_YAER_MAX) and 1 <= month_no <= 12


def _date_dir_from_text(text):
    """Return ``YYYY年/MM月`` for the first GL_DATE_RE pattern giving a plausible date, else ``""``."""
    for item_re in GL_DATE_RE:
        date_match = re.search(item_re, text)
        if date_match is None:
            continue
        year, month = date_match.group(1), date_match.group(2)
        if _valid_year_month(year, month):
            return "%s年/%s月" % (year, month)
    return ""


def getfiledate(path):
    """Work out the dated sub-folder a file belongs in.

    Sources are tried in this order until one yields a plausible date
    (year between GL_YAER_MIN and GL_YAER_MAX, month 1-12):

    1. image EXIF date (files with an image extension only),
    2. video "date encoded" property (files with a video extension only),
    3. a 13-digit millisecond timestamp in the file name,
    4. a date in the file name matching one of GL_DATE_RE,
    5. the file's creation time as reported by ``os.path.getctime``.

    Args:
        path: Path of the file to classify.

    Returns:
        str: ``"YYYY年/MM月"``, followed by ``"/图片"`` (pictures) or
        ``"/视频"`` (videos) when the extension is a known image or video
        type; ``""`` when no source produced a plausible date.
    """
    is_image = imge_flag(path)
    is_video = mp4_flag(path)
    filename_str = os.path.basename(path)
    date_str = ""

    # 1. EXIF date of an image.
    if is_image:
        date_str = _date_dir_from_text(str(image_exif_date(path)))

    # 2. Media property of a video.
    if not date_str and is_video:
        video_date = mp4_exif_date(path)
        if video_date:
            date_str = _date_dir_from_text(str(video_date))

    # 3. Millisecond epoch timestamp embedded in the file name.
    if not date_str:
        stamp_match = re.search(r"[^\d](\d{13})[^\d]", filename_str)
        if stamp_match is not None:
            try:
                candidate = TimeStampToTime(int(stamp_match.group(1)) // 1000)
            except (OverflowError, OSError, ValueError):
                # Timestamp outside what the platform can convert.
                candidate = ""
            if candidate and _valid_year_month(candidate[0:4], candidate[-3:-1]):
                date_str = candidate

    # 4. Calendar date embedded in the file name.
    if not date_str:
        date_str = _date_dir_from_text(filename_str)

    # 5. Creation time of the file on this computer.
    if not date_str:
        candidate = TimeStampToTime(os.path.getctime(path))
        if _valid_year_month(candidate[0:4], candidate[-3:-1]):
            date_str = candidate

    if date_str:
        if is_image:
            date_str = date_str + "/图片"
        elif is_video:
            date_str = date_str + "/视频"
    return date_str

# ==========================得到新文件地址=======================


def getfilenewpath(newdir, oldpath):
    """Build the destination path for a file and make sure its folder exists.

    Args:
        newdir: Root of the output tree.
        oldpath: Current path of the file.

    Returns:
        str: ``newdir`` + dated sub-folder from ``getfiledate`` (omitted when
        it is empty) + the original file name.
    """
    date_str = getfiledate(oldpath)
    # Compare by value: `is not ""` tests object identity, which is not
    # guaranteed for equal strings and triggers a SyntaxWarning.
    if date_str != "":
        newdir = os.path.join(newdir, date_str)
    newpath = os.path.join(newdir, os.path.basename(oldpath))
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(os.path.dirname(newpath), exist_ok=True)
    return newpath


# =========================移动文件===========================
def _unused_path(taken_path):
    """Return a sibling of ``taken_path`` that does not exist yet (``new_``, ``new_2_``, ... prefix)."""
    folder, name = os.path.split(taken_path)
    candidate = os.path.join(folder, "new_" + name)
    serial = 2
    while os.path.exists(candidate):
        candidate = os.path.join(folder, "new_%d_%s" % (serial, name))
        serial += 1
    return candidate


def movefile(newdata_list, newdir):
    """Move every listed file into its dated folder below ``newdir``.

    Entries whose source path no longer exists are skipped.  If the
    destination name is taken, the file is stored under a ``new_`` prefix;
    further collisions get ``new_2_``, ``new_3_`` and so on, so an earlier
    file is never overwritten.

    Args:
        newdata_list: Iterable of (hash, path) rows; index 1 is the file path.
        newdir: Root of the output tree; created if missing.
    """
    os.makedirs(newdir, exist_ok=True)
    moved = 0
    for item in newdata_list:
        source = item[1]
        if not os.path.exists(source):
            continue
        target = getfilenewpath(newdir, source)
        if os.path.exists(target):
            target = _unused_path(target)
        moved += 1
        # shutil.copy(source, target) would keep the original in place.
        shutil.move(source, target)
        print(moved, "move", source)

#  =========================主函数===========================


def main():
    """Fingerprint every file below GL_MYDIR, then move uniques and duplicates apart.

    Files are fingerprinted by ``calcmd5`` in a pool of four processes while
    ``rate`` stores the results in table ``data_list``.  The first file seen
    for each hash value is moved to ``GL_MYDIR + "_新"``; the remaining files
    of every hash that occurs more than once are moved to
    ``GL_MYDIR + "_新_重复"``.  Files whose fingerprint could not be computed
    are reported and left where they are.
    """
    failed_marker = "HASH_FAILED"

    # Walk the tree once so that the number of submitted jobs and the number
    # of results `rate` waits for can never disagree.
    paths = []
    for root, dirs, files in os.walk(GL_MYDIR):
        for file in files:
            hashfile = os.path.join(root, file)
            if os.path.exists(hashfile):
                paths.append(hashfile)
            else:
                print("no filename")

    # Process pool plus a manager queue shared with the workers.
    po = multiprocessing.Pool(4)
    manager = multiprocessing.Manager()
    q = manager.Queue()
    data = po.apply_async(rate, args=(q, len(paths)))

    def report_failure(path):
        # A worker that raises never queues a result; queue a marker instead
        # so that `rate` still receives one item per file and cannot hang.
        def callback(exc):
            print("哈希计算失败,已跳过:", path, exc)
            q.put({"hash": failed_marker, "path": path})
        return callback

    for hashfile in paths:
        po.apply_async(calcmd5, args=(q, hashfile),
                       error_callback=report_failure(hashfile))

    po.close()
    # get() blocks until `rate` has committed all rows (the old
    # `if data.successful:` tested the bound method itself, always truthy).
    data.get()
    po.join()

    # Read the table once; rows are (hash, path) tuples.
    cur = CONN.cursor()
    sql_text = "SELECT * FROM data_list "
    data_list = [row for row in cur.execute(sql_text).fetchall()
                 if row[0] != failed_marker]

    seen_hashes = set()
    repeated_hashes = set()
    newdata_list = []
    print("非重复的文件:")
    for item in data_list:
        if item[0] not in seen_hashes:
            seen_hashes.add(item[0])
            newdata_list.append(item)
            print(len(newdata_list), "only", item[0])
        else:
            repeated_hashes.add(item[0])

    # Every file of a repeated hash is listed here, including the first copy;
    # that one has already been moved by the time this list is processed and
    # is skipped by movefile because its source path is gone.
    print("重复的文件:")
    newdata2_list = [item for item in data_list if item[0] in repeated_hashes]
    for i, item in enumerate(newdata2_list, 1):
        print(i, "repeat", item[0])

    print("开始移动非重复的文件【%d】:" % len(newdata_list))
    movefile(newdata_list, GL_MYDIR+"_新")
    print("开始移动重复的文件【%d】:" % len(newdata2_list))
    movefile(newdata2_list, GL_MYDIR+"_新_重复")
    print("文件移动到%s成功!" % (GL_MYDIR+"_新"))


if __name__ == "__main__":
    # Drop surrounding whitespace and double quotes (e.g. from a pasted,
    # quoted path) and normalise the path: with a trailing separator the
    # "_新" output folders would otherwise be created inside the source folder.
    _raw_dir = input("请输入文件夹完整路径:").strip().strip('"')
    GL_MYDIR = os.path.normpath(_raw_dir) if _raw_dir else _raw_dir
    # Wall-clock timing of the whole run.
    begin_time = time.time()
    main()
    run_time = time.time() - begin_time
    print("运行时间", round(run_time, 2), "秒")
    # Runs the shell's "pause" command so the console stays open afterwards.
    os.system("pause")

安装pypiwin32后成功解决win32api不能安装问题:

pip install pypiwin32

你可能感兴趣的:(python)