利用python去除pdf水印

本方法只适用于 PDF 文件中的图片水印。
思路:根据给定的水印样图,在 PDF 内嵌的图片中找出与之相似的图片(即水印)并将其删除。

def is_same_img(area_chart, target_img, VPT):
    """
    Decide whether two image files look alike by comparing image hashes.

    The image at ``target_img`` is converted to RGB and resized, in memory,
    to the dimensions of the image at ``area_chart``.  Four hashes
    (perceptual, average, difference and wavelet) are then computed for both
    images.  Every hash pair gives a similarity of
    ``1 - hash_difference / bit_count``; the best of the four scores is
    compared against ``VPT``.

    Both paths are opened exactly as given and neither file is modified.
    Inputs are not filtered by file extension.

    :param area_chart: path of the first image; its size is the size both
        images are compared at
    :param target_img: path of the second image; it is only read, the
        resized copy is never written back to disk
    :param VPT: similarity threshold; a match requires a score strictly
        greater than this value
    :return: True when the best similarity score exceeds ``VPT``, else False
    """
    hash_size = 32  # side length of the hash grid
    highfreq_factor = 4  # resize factor used by the perceptual hash
    image_scale = 64  # working image size for the wavelet hash

    def compute_hashes(img):
        # Same four algorithms, in a fixed order, for both images.
        return (
            imagehash.phash(img, hash_size=hash_size, highfreq_factor=highfreq_factor),
            imagehash.average_hash(img, hash_size=hash_size),
            imagehash.dhash(img, hash_size=hash_size),
            imagehash.whash(img, image_scale=image_scale, hash_size=hash_size, mode='db4'),
        )

    # Context managers make sure both file handles are released.
    with Image.open(area_chart) as first_img, Image.open(target_img) as second_img:
        # Image.LANCZOS is the filter formerly also exposed as Image.ANTIALIAS.
        resized = second_img.convert('RGB').resize(first_img.size, Image.LANCZOS)
        first_hashes = compute_hashes(first_img)
        second_hashes = compute_hashes(resized)

    # Normalise each hash difference by the bit count (grid side squared).
    value_hash = max(
        1 - (left - right) / len(left.hash) ** 2
        for left, right in zip(first_hashes, second_hashes)
    )

    if value_hash > VPT:
        # Diagnostic output: both files with their on-disk size, then the score.
        print(area_chart, str(os.path.getsize(area_chart) / 1024) + 'KB')
        print(target_img, str(os.path.getsize(target_img) / 1024) + 'KB')
        print(value_hash)
        print('***********************')
        return True
    return False


def delete_wartermark(target_path, area_chart, VPT=0.9):
    """
    Remove images resembling a watermark sample from a PDF file.

    Every image listed for every page is exported to a temporary PNG in the
    current working directory and compared with ``area_chart`` through
    ``is_same_img``.  Matching image objects are deleted from the document.
    The result is saved next to the input with ``1`` appended to the file
    name stem (``a.pdf`` -> ``a1.pdf``), and the input file is then removed.

    Failures are reported with ``print`` rather than raised.

    :param target_path: path of the PDF to clean; paths whose extension is
        not ``.pdf`` (case-insensitive) are ignored
    :param area_chart: path of the watermark sample image
    :param VPT: similarity threshold forwarded to ``is_same_img``
    :return: path of the saved PDF, or ``''`` when nothing was saved
    """
    save_pdf_path = ''
    try:
        # Check the real extension: a substring test would accept names such
        # as "a.pdf.bak" and reject "A.PDF".
        stem, extension = os.path.splitext(target_path)
        if extension.lower() != '.pdf':
            return save_pdf_path

        with fitz.open(target_path) as pdf_document:
            # The same xref may be listed for several pages; examine each one
            # only once so an object that was already deleted is never read
            # again.
            checked_xrefs = set()
            for current_page in range(len(pdf_document)):
                for image in pdf_document.getPageImageList(current_page):
                    xref = image[0]
                    if xref in checked_xrefs:
                        continue
                    checked_xrefs.add(xref)

                    pix = fitz.Pixmap(pdf_document, xref)
                    if pix.n >= 4:
                        # Only GRAY or RGB pixmaps (fewer than 4 components)
                        # are exported and compared.
                        continue

                    save_path = "page%s_%s.png" % (current_page, xref)
                    try:
                        pix.writePNG(save_path)
                        if is_same_img(save_path, area_chart, VPT):
                            pdf_document._deleteObject(xref)
                    finally:
                        # Remove the temporary PNG even if the comparison fails.
                        if os.path.exists(save_path):
                            os.remove(save_path)

            output_path = stem + '1' + extension
            pdf_document.save(output_path)
            # Only report the path once the file has actually been written.
            save_pdf_path = output_path
            print('成功----删除水印')

        # The cleaned copy replaces the input, which is removed after saving.
        if os.path.exists(target_path):
            os.remove(target_path)
    except Exception as e:
        # Best-effort tool: report the problem and return what was saved.
        print(e)
        print('失败----删除水印')
    return save_pdf_path


# Example run: strip every image that resembles area_chart.png from the PDF
# below, using a similarity threshold of 0.73.
pdf_document = r'C:\Users\Administrator\OneDrive\all_huaqiu\huaqiu_spider\test\input.pdf'
delete_wartermark(
    target_path=pdf_document,
    area_chart='area_chart.png',
    VPT=0.73,
)

本文参考资料:
Python操作PDF-文本和图片提取(使用PyPDF2和PyMuPDF)
Python处理PDF的实用姿势
使用PyPDF2在PDF上去除水印

你可能感兴趣的:(利用python去除pdf水印)