python修改pdf文件(去掉文件中部分内容)

需求

最近有个编辑修改pdf文件的需求:去掉每页等页眉、页脚(里面包含图片),修改后的文件还得保存为pdf。

初步方案

使用python的库:PyPDF2。发现只有功能:简单的读写、拆分、合并等功能。不能去掉文件的部分内容。

解决过程

  1. 先拆分pdf为单页pdf文件;
  2. 将单页pdf转化为png图片;
  3. 编辑切割png图片;
  4. png图片转为pdf文件并合并。

实现过程

import sys
import time
from PyPDF2 import PdfFileReader, PdfFileWriter
from PIL import Image as pilImage
from reportlab.lib.pagesizes import portrait
from reportlab.pdfgen import canvas
from wand.image import Image

#=====================
class DelLogo_pdf(object):
    def __init__(self):
        #原pdf文件名
        self.old_pdf_name = "MBC.pdf"
        #新pdf文件名
        self.new_pdf_name = "new_noLogo.pdf"
        #pdf总页数
        self.pdf_total_pages = 0
        # 图片后缀
        self.image_suffix = ".png"
        #图片精度
        self.resolution = 180
        #宽度和高度 1488*2105
        #距离图片左边界距离x, 距离图片上边界距离y,裁剪框宽度w,裁剪框高度h
        self.gx = 0
        self.gy = 120
        self.gw = 1488
        self.gh = 1600

        print("init ok")

    def run(self):
        #1原pdf文件,按单页,存单页pdf文件,
        self.split_pdf(self.old_pdf_name)
        print("pdf_total_pages:", self.pdf_total_pages)

        #2单页pdf文件,分别转为单个图片
        pages = 0
        while pages < self.pdf_total_pages:
            pdfFileName = "./tmp/"+str(pages)+".pdf"
            imageFileName = "./tmp/"+str(pages)+self.image_suffix
            self.pdf_to_image(pdfFileName, imageFileName, self.resolution)
            pages += 1

        #3图片切去logo, 并保存为新图片
        pages = 0
        while pages < self.pdf_total_pages:
            oldImageName = "./tmp/"+str(pages)+self.image_suffix
            newImageName = "./tmp/"+str(pages)+"_new"+self.image_suffix
            self.del_image_logo(oldImageName, newImageName, self.gx,self.gy,self.gw,self.gh)
            pages += 1

        #4、单个新图片,转为单页pdf
        pages = 0
        while pages < self.pdf_total_pages:
            pdfName = "./tmp/"+str(pages)+"_new.pdf"
            imageName = "./tmp/"+str(pages)+"_new"+self.image_suffix
            self.image_to_pdf(imageName, pdfName)
            pages += 1

        #5 合成
        infnList = []
        pages = 0
        while pages < self.pdf_total_pages:
            pdfName = "./tmp/"+str(pages)+"_new.pdf"
            infnList.append(pdfName)
            pages += 1

        self.merge_pdf(infnList, self.new_pdf_name)
        print("over.....")
        return

    # 1、原pdf文件,按单页,存单页pdf文件,
    def split_pdf(self, readFileName):

        print("原pdf文件,按单页,存单页pdf文件")
        # 获取一个 PdfFileReader 对象
        pdfReader = PdfFileReader(open(readFileName, 'rb'))
        # 获取 PDF 的页数
        pageCount = pdfReader.getNumPages()
        #self.pdf_total_pages = pageCount
        self.pdf_total_pages = pageCount
        print("原pdf文件总页数:", pageCount)
        pages = 0
        while pages < self.pdf_total_pages:
            # 返回一个 PageObject
            page = pdfReader.getPage(pages)
            # 获取一个 PdfFileWriter 对象
            pdfWriter = PdfFileWriter()
            # 将一个 PageObject 加入到 PdfFileWriter 中
            pdfWriter.addPage(page)
            writeFileName = "./tmp/"+str(pages)+".pdf"
            # 输出到文件中
            pdfWriter.write(open(writeFileName, 'wb'))
            pages += 1


    # 2、单页pdf文件,分别转为单个图片;
    def pdf_to_image(self, pdfFileName, imageFileName, imgResolution):
        with Image(filename=pdfFileName, resolution=imgResolution) as img :
            with img.convert('png') as converted:
                converted.save(filename=imageFileName)

        print("pdf_to_image:", pdfFileName, imageFileName)

    #3  图片切去logo, 并保存为新图片。
    def del_image_logo(self, oldImageName, newImageName, x, y, w, h):
        im = pilImage.open(oldImageName)
        # 图片的宽度和高度
        img_size = im.size
        print("图片"+oldImageName+" 图片宽度和高度分别是{}".format(img_size))
        '''
        裁剪:传入一个元组作为参数
        元组里的元素分别是:(距离图片左边界距离x, 距离图片上边界距离y,裁剪框宽度w,裁剪框高度h)
        '''
        region = im.crop((x, y, x+w, y+h))
        region.save(newImageName)

        #==== 图片背景色设为白色
        im = pilImage.open(newImageName)
        x,y = im.size
        p = pilImage.new('RGBA', im.size, (255,255,255))
        p.paste(im, (0, 0, x, y), im)
        p.save(newImageName)

    #4、单个新图片,转为单页pdf。
    def image_to_pdf(self, imageName, pdfName):
        print("image_to_pdf:", imageName, pdfName)
        (maxw, maxh) = pilImage.open(imageName).size
        c = canvas.Canvas(pdfName, pagesize=portrait((maxw, maxh)))
        c.drawImage(imageName, 0, 0, maxw, maxh)
        c.showPage()
        c.save()

    #5、合成所有单页pdf,转为完整新pdf文件
    # infnList:单个pdf文件名数组。
    # outfn: 新pdf文件
    def merge_pdf(self, infnList, outfn):
        print("合成pdf:", infnList, outfn)
        pdf_output = PdfFileWriter()
        for infn in infnList:
            pdf_input = PdfFileReader(open(infn, 'rb'))
            # 获取 pdf 共用多少页
            page_count = pdf_input.getNumPages()
            #print(page_count)
            for i in range(page_count):
                pdf_output.addPage(pdf_input.getPage(i))
        pdf_output.write(open(outfn, 'wb'))



if __name__ == '__main__':
    localtime = time.asctime( time.localtime(time.time()) )
    print("开始时间为 :", localtime)
    delLogo_pdf = DelLogo_pdf()
    delLogo_pdf.run()
    localtime = time.asctime( time.localtime(time.time()) )
    print("结束时间为 :", localtime)

复制代码

注释:

  1. 效率问题,pdf与png之间转化时间较长(1-2s)。

转载于:https://juejin.im/post/5c1fa58f5188257d9937609e

你可能感兴趣的:(python修改pdf文件(去掉文件中部分内容))