使用pypdf2把原始pdf转换成kindle看着舒适的pdf

文章目录

  • 裁剪pdf
  • 使用脚本拆分页面并转成kindle可见的大小
  • 压缩pdf
  • (可选) 拆分pdf

由于买了个kindle,所以想要最大效率地利用它。而在kindle上看pdf是很难受的,因为kindle屏幕太小,展示一个页面字体基本看不清。
因此,我写了个python脚本,配合acrobat使用能够把原始的pdf,尤其是A4页面格式的pdf转换成kindle看着舒适的pdf。使用方法如下

裁剪pdf

使用acrobat等软件对相应的pdf进行裁剪,尽量多地去除白边。这样在kindle上才能看到尽量大的文字

对于文字版pdf,白边界限很清楚,可以使用pdf-xchange editor,然后文档->裁剪页面->设为空白外边框,然后点全部页面。此时能够自动去除白边,效果非常好

使用脚本拆分页面并转成kindle可见的大小

脚本如下:

import os
import PyPDF2
from multiprocessing import Process
import shutil

"""
将pdf文件切割成kindle可看的大小
"""

class PDF2Kindle(object):
    """
    Re-paginate a PDF into pages sized for a Kindle screen.

    Every source page is scaled so that its crop box is as wide as the target
    screen, then cut from top to bottom into slightly overlapping slices of
    the target height. Each source page gets one bookmark that points at its
    first slice.
    """

    # Fraction of the slice height advanced per slice; 0.97 leaves a 3%
    # overlap between two consecutive slices.
    SLICE_STEP = 0.97

    def __init__(self):
        # Handles of the temp files backing the slices queued in the output
        # writer. They have to stay open until the output has been written.
        self._open_files = []

    def _CloseOpenFiles(self):
        """
        Close every temp-file handle opened by CropPage and forget them.
        """
        while self._open_files:
            self._open_files.pop().close()

    def FillPage(self,page,ws,hs):
        """
        Placeholder for padding a short trailing slice up to the full target
        size. Merging the slice onto a blank page was tried but never worked,
        so the page is returned unchanged.
        params:
            page,page object
            ws,int,target width
            hs,int,target height
        return:
            new_page,the page that was passed in
        """
        return page

    def CropPage(self,pdf_writer,page,page_idx):
        """
        Scale one page to the target width, cut it into slices of the target
        height and append the slices to pdf_writer.
        params:
            pdf_writer,PyPDF2.PdfFileWriter that receives the slices
            page,page object of the source PDF; it is modified in place
            page_idx,int,0-based index of the page, used for logging and for
                the temp file names
        return:
            pdf_writer,the writer that was passed in
            page_num,int,number of slices appended for this page
        """
        # When reading crosswise the page is rotated, so the roles of the
        # target width and height are swapped.
        if self.crosswise:
            ws = self.hs
            hs = self.ws
        else:
            ws = self.ws
            hs = self.hs

        w = float(page.mediaBox.getUpperRight_x())
        h = float(page.mediaBox.getUpperRight_y())
        cwbeg = float(page.cropBox.getLowerLeft_x())
        chbeg = float(page.cropBox.getLowerLeft_y())
        cwend = float(page.cropBox.getUpperRight_x())
        chend = float(page.cropBox.getUpperRight_y())

        print("Generating %s page %d, shape=(%d,%d) ..." % (self.read_full_name,page_idx+1,w,h))

        # Scale the page so that the crop box becomes exactly ws wide, and
        # express the crop box in the scaled coordinates.
        scale = ws/(cwend-cwbeg)
        page.scaleBy(scale)

        cwbeg *= scale
        chbeg *= scale
        cwend *= scale
        chend *= scale

        page.cropBox.lowerLeft=(cwbeg,chbeg)
        page.cropBox.upperRight=(cwend,chend)

        if self.crosswise:
            page.rotateCounterClockwise(90)

        # The temp directory is needed by every slice; create it once.
        os.makedirs(self.tmppath, exist_ok=True)

        # Walk down the page from the top edge of the crop box.
        all_crops = []
        hend = chend
        cnt = 0
        while hend > chbeg:
            # new_page is the same object as page: only its crop box moves.
            new_page = page
            new_page.cropBox.upperRight= (cwend,hend)
            new_page.cropBox.lowerLeft = (cwbeg,max(chbeg,hend-hs))
            if hend-chbeg<=hs:
                new_page = self.FillPage(new_page,ws,hs)
            hend -= hs*self.SLICE_STEP

            # Because every slice shares the one page object, the current
            # crop box is frozen by writing the slice to its own temp PDF and
            # reading it back as an independent page.
            tmp_fn = '%s/%s_%03d_%03d.pdf' % (self.tmppath,self.read_fn,page_idx,cnt)
            tmp_writer = PyPDF2.PdfFileWriter()
            tmp_writer.addPage(new_page)
            with open(tmp_fn, "wb") as tmp_out:
                tmp_writer.write(tmp_out)

            # The handle must stay open until the output writer is flushed;
            # BatchConvertPDF closes it afterwards.
            tmp_in = open(tmp_fn, "rb")
            self._open_files.append(tmp_in)
            all_crops.append(PyPDF2.PdfFileReader(tmp_in).getPage(0))
            cnt += 1

        # All slices are ready; queue them in the output PDF.
        page_num = len(all_crops)
        for p in all_crops:
            pdf_writer.addPage(p)

        return pdf_writer,page_num

    def BatchConvertPDF(self,std_shape,crosswise,read_full_name,writepath,tmppath):
        """
        Convert one PDF file into a Kindle-sized PDF written to
        writepath/<source file name without extension>.pdf.
        params:
            std_shape,tuple,target page size as (width,height); both values
                must be positive
            crosswise,bool,whether the result is meant to be read with the
                device turned sideways
            read_full_name,str,path of the PDF to convert
            writepath,str,directory that receives the result; created when
                missing
            tmppath,str,directory for the temporary one-slice PDFs
        raises:
            ValueError,when std_shape contains a non-positive value
        """
        ws,hs = std_shape
        if ws <= 0 or hs <= 0:
            # A non-positive slice height would never advance the slicing
            # loop in CropPage.
            raise ValueError("std_shape must contain positive values, got %r" % (std_shape,))

        read_file_full_name = os.path.split(read_full_name)[1]
        read_fn = os.path.splitext(read_file_full_name)[0]
        write_fn = writepath + '/' + read_fn

        (self.ws,self.hs) = (ws,hs)
        self.crosswise = crosswise
        self.read_full_name = read_full_name
        self.read_fn = read_fn
        self.tmppath = tmppath
        self.write_fn = write_fn

        try:
            with open(self.read_full_name, "rb") as src:
                pdf_reader = PyPDF2.PdfFileReader(src)
                pdf_writer = PyPDF2.PdfFileWriter()
                pdf_reader_page_num = pdf_reader.getNumPages()

                # page_cnt is the index of the first slice of the current
                # source page in the output.
                page_cnt = 0
                for i in range(pdf_reader_page_num):
                    page = pdf_reader.getPage(i)
                    _,page_num = self.CropPage(pdf_writer,page,i)

                    # One bookmark per source page, named by its 1-based number.
                    bookmark_str = '%03d' % (i+1)
                    pdf_writer.addBookmark(bookmark_str,page_cnt)
                    page_cnt +=page_num

                # Nothing is written for a source without pages.
                if pdf_reader_page_num > 0:
                    if writepath:
                        os.makedirs(writepath, exist_ok=True)
                    full_name = write_fn + '.pdf'
                    print("Write %s..." % full_name)
                    with open(full_name, "wb") as out:
                        pdf_writer.write(out)
        finally:
            self._CloseOpenFiles()

def ProcessPDF2Kindle(args):
    """
    Child-process entry point: run one PDF-to-Kindle conversion.
    params:
        args,dict with the keys std_shape, crosswise, read_full_name,
            writepath and tmppath; they are handed to
            PDF2Kindle.BatchConvertPDF as keyword arguments
    """
    converter = PDF2Kindle()
    option_names = ('std_shape','crosswise','read_full_name','writepath','tmppath')
    options = {name: args[name] for name in option_names}
    converter.BatchConvertPDF(**options)

def DeleteDIR(dirname):
    """
    Remove a directory tree; do nothing when the path does not exist.
    params:
        dirname,str,path of the directory to remove
    """
    if os.path.exists(dirname):
        shutil.rmtree(dirname)

if __name__ == '__main__':
    # Settings:
    #   std_shape      tuple (width,height) of the target page, e.g. (525,640)
    #                  for a kindle and (1072,1440) for a kpw3
    #   crosswise      whether to read sideways; the text comes out larger
    #   read_full_name path of the PDF to convert
    #   writepath      directory the result is written to
    #   tmppath        directory used to cache temporary PDFs
    args = dict(
        std_shape=(536,720),      # target resolution; this one suits a kindle well
        crosswise=True,           # read sideways for larger text
        read_full_name='bbb.pdf', # the PDF to convert
        writepath='write',
        tmppath='tmp',            # a tmp folder is needed to cache PDFs
    )

    # The conversion runs in a child process. The original author's stated
    # reason: the PDF reading code opens files by hand, and ending the process
    # closes them without closing each one explicitly.
    worker = Process(target=ProcessPDF2Kindle,args=(args,))
    worker.start()
    worker.join()

    # Remove the tmp folder once the conversion process has ended.
    DeleteDIR(args['tmppath'])

修改上面代码中的read_full_name参数,然后运行该文件,得到裁剪后的pdf,放在write文件夹中

注意:对于某些pdf在转换时会出现编码问题,这个目前还无法解决

压缩pdf

此时pdf文件大小偏大,可以使用acrobat打开该文件,然后,文件->另存为其他->缩小大小的pdf,有效降低pdf文件的大小

(可选) 拆分pdf

如果文件还是偏大,或者页面偏多,则可以选择对pdf进行拆分,拆分代码如下:

import os
import PyPDF2
from multiprocessing import Process

"""
对于使用pdf_to_kindle的文件,使用该文件来进行分割
"""

class SlicePDF(object):
    """
    Split a bookmarked PDF into several smaller PDF files.

    The input is expected to carry one top-level bookmark per original page,
    each pointing at the first of the pages that belong to it. Splitting only
    happens at bookmark boundaries, so the pages of one original page always
    stay together.
    """

    def GetBookmark(self,pdf_reader):
        """
        Return the page index each top-level bookmark points at.
        params:
            pdf_reader,PyPDF2.PdfFileReader of the PDF to split
        return:
            all_bookmarks,list of int,0-based page index per bookmark, in
                outline order
        """
        # Map the object id of every page to its position. Only the kids of
        # the first page's parent node are looked at, so this assumes all
        # pages hang directly under that one node.
        kids = pdf_reader.getPage(0)['/Parent']['/Kids']
        page_of_id = {ref.idnum: pos for pos,ref in enumerate(kids)}

        # Resolve every bookmark to the position of its target page.
        return [page_of_id[item.page.idnum] for item in pdf_reader.outlines]

    def BatchConvertPDF(self,read_full_name,writepath,con_num):
        """
        Split one PDF into files holding con_num bookmarked groups each.

        Output files are named <source name>_<first>.pdf when con_num is 1
        and <source name>_<first>_<last>.pdf otherwise, where first and last
        are 1-based, zero-padded bookmark numbers.
        params:
            read_full_name,str,path of the PDF to split
            writepath,str,directory that receives the parts; created when
                missing
            con_num,int,number of bookmarked groups per output file, >= 1
        raises:
            ValueError,when con_num is smaller than 1
        """
        if con_num <1:
            raise ValueError("con_num must >=1")

        read_file_full_name = os.path.split(read_full_name)[1]
        read_fn = os.path.splitext(read_file_full_name)[0]
        write_fn = writepath + '/' + read_fn

        self.read_full_name = read_full_name
        self.read_fn = read_fn
        self.write_fn = write_fn

        # The source stays open for the whole loop because the writers pull
        # page data from the reader when they are written.
        with open(self.read_full_name, "rb") as src:
            pdf_reader = PyPDF2.PdfFileReader(src)
            all_bookmarks = self.GetBookmark(pdf_reader)
            all_reader_num = pdf_reader.getNumPages()
            ori_page_num = len(all_bookmarks)

            if ori_page_num and writepath:
                os.makedirs(writepath, exist_ok=True)

            pdf_writer = PyPDF2.PdfFileWriter()
            # sub_page: source index of the first page of the current part.
            # start_page: 1-based number of the first bookmark of the part.
            sub_page = 0
            start_page = 1
            for i,beg_page in enumerate(all_bookmarks):
                is_last = i == ori_page_num-1

                # A group runs up to the next bookmark, or to the end of the
                # document for the last bookmark.
                end_page = all_reader_num if is_last else all_bookmarks[i+1]
                for page_idx in range(beg_page,end_page):
                    pdf_writer.addPage(pdf_reader.getPage(page_idx))

                # Bookmark positions are relative to the current part.
                pdf_writer.addBookmark('%03d' % (i+1),beg_page-sub_page)

                # Flush the part once it is full or the input is exhausted.
                if (i+1)%con_num==0 or is_last:
                    if con_num == 1:
                        full_name = '%s_%03d.pdf' % (write_fn,start_page)
                    else:
                        full_name = '%s_%03d_%03d.pdf' % (write_fn,start_page,i+1)
                    print("Write %s..." % full_name)
                    with open(full_name, "wb") as out:
                        pdf_writer.write(out)

                    if not is_last:
                        start_page = i+2
                        sub_page = all_bookmarks[i+1]

                    # Start the next part with an empty writer.
                    pdf_writer = PyPDF2.PdfFileWriter()

def ProcessSlicePDF(args):
    """
    Child-process entry point: split one PDF with SlicePDF.
    params:
        args,dict with the keys read_full_name, writepath and con_num; they
            are handed to SlicePDF.BatchConvertPDF as keyword arguments
    """
    slicer = SlicePDF()
    option_names = ('read_full_name','writepath','con_num')
    options = {name: args[name] for name in option_names}
    slicer.BatchConvertPDF(**options)

if __name__ == '__main__':
    # Settings:
    #   read_full_name str, path of the PDF to split
    #   writepath      str, directory the parts are written to
    #   con_num        int, number of original pages per output PDF; must be
    #                  >= 1 (smaller values are rejected)
    args = {
        'read_full_name':'write/bbb.pdf', # change this to the input file
        'writepath':'write',
        'con_num':10 # change this to the number of original pages per part
    }

    # Run the split in a child process and wait for it to finish.
    p = Process(target=ProcessSlicePDF,args=(args,))
    p.start()
    p.join()

注意:读入的pdf一定是第一个脚本处理过的pdf,否则可能报错

修改上面代码中的read_full_name为要读入的pdf文件,con_num表示每多少页拆分成一个文件,其中每页代表最原始的pdf的页面,然后运行该文件,则得到拆分后的pdf

你可能感兴趣的:(日常脚本,python,pypdf2,pdf,kindle)