由于买了个kindle,所以想要最大效率地利用它。而在kindle上看pdf是很难受的,因为kindle屏幕太小,展示一个页面字体基本看不清。
因此,我写了个python脚本,配合acrobat使用能够把原始的pdf,尤其是A4页面格式的pdf转换成kindle看着舒适的pdf。使用方法如下
使用acrobat等软件对相应的pdf进行裁剪,尽量多地去除白边。这样在kindle上才能看到尽量大的文字。
对于文字版pdf,白边界限很清楚,可以使用pdf-xchange editor,然后文档->裁剪页面->设为空白外边框,然后点全部页面。此时能够自动去除白边,效果非常好
脚本如下:
import os
import PyPDF2
from multiprocessing import Process
import shutil
"""
将pdf文件切割成kindle可看的大小
"""
class PDF2Kindle(object):
    """
    Re-slice the pages of a PDF into Kindle-sized pages.

    Every source page is scaled so that its crop box matches the target
    width, then cut from top to bottom into slices of the target height.
    Consecutive slices overlap slightly so no text line is lost at a cut.
    """

    # Each slice starts this fraction of a slice height below the previous
    # one, i.e. consecutive slices overlap by 3%.
    SLICE_STEP_RATIO = 0.97

    @staticmethod
    def _slice_bounds(top, bottom, slice_height, step_ratio=0.97):
        """
        Compute the vertical extents of the slices covering [bottom, top].

        params:
            top, float, upper y coordinate of the region to cover
            bottom, float, lower y coordinate of the region to cover
            slice_height, float, height of one slice, must be > 0
            step_ratio, float, vertical advance per slice as a fraction of
                slice_height
        return:
            list of (upper, lower) tuples, ordered from top to bottom
        raises:
            ValueError, if slice_height is not positive (the walk would
                never terminate)
        """
        if slice_height <= 0:
            raise ValueError("slice_height must be > 0")
        bounds = []
        upper = top
        while upper > bottom:
            lower = max(bottom, upper - slice_height)
            bounds.append((upper, lower))
            if lower <= bottom:
                # This slice already reaches the bottom; another one would
                # only repeat the overlap strip as a tiny extra page.
                break
            upper -= slice_height * step_ratio
        return bounds

    def FillPage(self, page, ws, hs):
        """
        Placeholder for padding a short final slice up to the full size.

        The intended behaviour (merge the slice onto a blank ws x hs page)
        was never made to work, so the page is returned unchanged.

        params:
            page, page handle
            ws, int, standard width
            hs, int, standard height
        return:
            new_page, the page handle (currently the same object as page)
        """
        return page

    def CropPage(self, pdf_writer, page, page_idx):
        """
        Scale one source page, cut it into slices and append them to
        pdf_writer.

        Reads self.ws, self.hs, self.crosswise, self.read_full_name,
        self.read_fn and self.tmppath. The file handles of the temporary
        slice files are collected in self._tmp_handles; they must stay open
        until pdf_writer has been written and are closed by BatchConvertPDF.

        params:
            pdf_writer, PdfFileWriter that receives the slices
            page, handle of the source page (modified in place)
            page_idx, int, zero-based index of the source page
        return:
            pdf_writer, the same writer that was passed in
            page_num, int, number of slices appended
        """
        # When reading crosswise the slice is rotated afterwards, so the
        # target width and height swap roles.
        if self.crosswise:
            ws, hs = self.hs, self.ws
        else:
            ws, hs = self.ws, self.hs
        w = float(page.mediaBox.getUpperRight_x())
        h = float(page.mediaBox.getUpperRight_y())
        cwbeg = float(page.cropBox.getLowerLeft_x())
        chbeg = float(page.cropBox.getLowerLeft_y())
        cwend = float(page.cropBox.getUpperRight_x())
        chend = float(page.cropBox.getUpperRight_y())
        print("Generating %s page %d, shape=(%d,%d) ..." % (self.read_full_name,page_idx+1,w,h))
        # Scale so that the crop box width equals the target width, and
        # move the crop box coordinates into the scaled space.
        scale = ws / (cwend - cwbeg)
        page.scaleBy(scale)
        cwbeg *= scale
        chbeg *= scale
        cwend *= scale
        chend *= scale
        page.cropBox.lowerLeft = (cwbeg, chbeg)
        page.cropBox.upperRight = (cwend, chend)
        if self.crosswise:
            page.rotateCounterClockwise(90)

        os.makedirs(self.tmppath, exist_ok=True)
        handles = self.__dict__.setdefault('_tmp_handles', [])

        all_crops = []
        bounds = self._slice_bounds(chend, chbeg, hs, self.SLICE_STEP_RATIO)
        for cnt, (upper, lower) in enumerate(bounds):
            # Every slice reuses the same page object with a different crop
            # box, so each one is written to its own temporary file and read
            # back to obtain an independent page.
            new_page = page
            new_page.cropBox.upperRight = (cwend, upper)
            new_page.cropBox.lowerLeft = (cwbeg, lower)
            if upper - chbeg <= hs:
                new_page = self.FillPage(new_page, ws, hs)
            tmp_page_idx = "%03d" % page_idx
            tmp_cnt_idx = "%03d" % cnt
            tmp_fn = self.tmppath + '/' + self.read_fn + '_' + tmp_page_idx + '_' + tmp_cnt_idx + '.pdf'
            tmp_writer = PyPDF2.PdfFileWriter()
            tmp_writer.addPage(new_page)
            with open(tmp_fn, "wb") as tmp_out:
                tmp_writer.write(tmp_out)
            # The page read back refers to this stream, so the handle is
            # kept open and only remembered here for later closing.
            tmp_in = open(tmp_fn, "rb")
            handles.append(tmp_in)
            all_crops.append(PyPDF2.PdfFileReader(tmp_in).getPage(0))

        for crop in all_crops:
            pdf_writer.addPage(crop)
        return pdf_writer, len(all_crops)

    def BatchConvertPDF(self, std_shape, crosswise, read_full_name, writepath, tmppath):
        """
        Convert every page of one PDF and write the result to
        writepath/<input name>.pdf.

        One bookmark per source page ('001', '002', ...) is added, pointing
        at the first slice of that page. Nothing is written when the input
        has no pages.

        params:
            std_shape, tuple, target (width, height)
            crosswise, bool, whether the result is read in landscape
            read_full_name, str, path of the PDF to convert
            writepath, str, output directory (created if missing)
            tmppath, str, directory for the temporary slice files
        """
        read_file_full_name = os.path.split(read_full_name)[1]
        read_fn = os.path.splitext(read_file_full_name)[0]
        write_fn = writepath + '/' + read_fn
        (self.ws, self.hs) = std_shape
        self.crosswise = crosswise
        self.read_full_name = read_full_name
        self.read_fn = read_fn
        self.tmppath = tmppath
        self.write_fn = write_fn
        self._tmp_handles = []

        if writepath:
            os.makedirs(writepath, exist_ok=True)

        try:
            with open(self.read_full_name, "rb") as src:
                pdf_reader = PyPDF2.PdfFileReader(src)
                pdf_writer = PyPDF2.PdfFileWriter()
                pdf_reader_page_num = pdf_reader.getNumPages()
                page_cnt = 0
                for i in range(pdf_reader_page_num):
                    page = pdf_reader.getPage(i)
                    _, page_num = self.CropPage(pdf_writer, page, i)
                    # Bookmark the first slice of this source page.
                    bookmark_str = '%03d' % (i+1)
                    pdf_writer.addBookmark(bookmark_str, page_cnt)
                    page_cnt += page_num
                if pdf_reader_page_num > 0:
                    full_name = write_fn + '.pdf'
                    print("Write %s..." % full_name)
                    with open(full_name, "wb") as out:
                        pdf_writer.write(out)
        finally:
            # The slice files are only needed until the output is written.
            for handle in self._tmp_handles:
                handle.close()
            self._tmp_handles = []
def ProcessPDF2Kindle(args):
    """
    Child-process entry point: run one PDF2Kindle conversion.

    params:
        args, dict, must contain the keys 'std_shape', 'crosswise',
            'read_full_name', 'writepath' and 'tmppath'; they are forwarded
            to PDF2Kindle.BatchConvertPDF as keyword arguments
    """
    option_names = ('std_shape', 'crosswise', 'read_full_name', 'writepath', 'tmppath')
    options = {}
    for option in option_names:
        options[option] = args[option]
    converter = PDF2Kindle()
    converter.BatchConvertPDF(**options)
def DeleteDIR(dirname):
    """
    Delete a directory tree; do nothing if it does not exist.

    params:
        dirname, str, path of the directory to remove
    """
    # Attempt the removal directly instead of checking os.path.exists
    # first: the check-then-delete form can race with another process
    # removing the directory in between.
    try:
        shutil.rmtree(dirname)
    except FileNotFoundError:
        pass
if __name__ == '__main__':
    # Settings for the conversion:
    #   std_shape, tuple, target shape as (width, height),
    #       eg. (525,640) in kindle, and (1072,1440) in kpw3
    #   crosswise, bool, read in landscape (text appears larger)
    #   read_full_name, str, path of the PDF to convert
    #   writepath, str, output directory
    #   tmppath, str, directory used to cache temporary PDFs
    args = dict(
        std_shape=(536, 720),      # target resolution, a good fit for kindle
        crosswise=True,            # landscape gives larger text than portrait
        read_full_name='bbb.pdf',  # the PDF to convert
        writepath='write',
        tmppath='tmp',             # scratch folder for temporary PDFs
    )
    # The conversion opens files that are not all closed one by one; running
    # it in a child process releases them when that process exits.
    worker = Process(target=ProcessPDF2Kindle, args=(args,))
    worker.start()
    worker.join()
    # Remove the temporary folder once the child has finished.
    DeleteDIR(args['tmppath'])
修改上面代码中的read_full_name参数,然后运行该文件,得到裁剪后的pdf,放在write文件夹中
注意:对于某些pdf在转换时会出现编码问题,这个目前还无法解决
此时pdf文件大小偏大,可以使用acrobat打开该文件,然后,文件->另存为其他->缩小大小的pdf,有效降低pdf文件的大小
如果文件还是偏大,或者页面偏多,则可以选择对pdf进行拆分,拆分代码如下:
import os
import PyPDF2
from multiprocessing import Process
"""
对于使用pdf_to_kindle的文件,使用该文件来进行分割
"""
class SlicePDF(object):
    """
    Split a bookmarked PDF into several smaller PDFs.

    The input is expected to carry one top-level bookmark per original
    page, as produced by the pdf_to_kindle script. Each output file holds
    the slices of con_num consecutive original pages.
    """

    def GetBookmark(self, pdf_reader):
        """
        Get the page index that each top-level bookmark points to.

        The page index is looked up through the '/Kids' array of the first
        page's parent node, so the lookup covers the pages listed there.

        params:
            pdf_reader, PdfFileReader handle
        return:
            all_bookmarks, list of int, zero-based page index per bookmark,
                in outline order
        """
        # Map the object id of every page to its position in the document.
        page_refs = pdf_reader.getPage(0)['/Parent']['/Kids']
        index_by_id = {}
        for position, ref in enumerate(page_refs):
            index_by_id[str(ref.idnum)] = position
        # Resolve every bookmark to the position of its target page.
        return [index_by_id[str(item.page.idnum)] for item in pdf_reader.outlines]

    def BatchConvertPDF(self, read_full_name, writepath, con_num):
        """
        Split read_full_name into files of con_num original pages each.

        Output files are named <name>_<start>.pdf when con_num is 1 and
        <name>_<start>_<end>.pdf otherwise, where start and end are
        one-based original page numbers formatted with three digits.

        params:
            read_full_name, str, path of the PDF to split
            writepath, str, output directory (created if missing)
            con_num, int, number of original pages per output file, >= 1
        raises:
            SystemExit, if con_num < 1
        """
        read_file_full_name = os.path.split(read_full_name)[1]
        read_fn = os.path.splitext(read_file_full_name)[0]
        write_fn = writepath + '/' + read_fn
        self.read_full_name = read_full_name
        self.read_fn = read_fn
        self.write_fn = write_fn
        if con_num < 1:
            print("con_num must >=1")
            # Raise SystemExit directly: quit() is only defined when the
            # site module is loaded and it also closes sys.stdin.
            raise SystemExit(1)

        if writepath:
            os.makedirs(writepath, exist_ok=True)

        # The source stream must stay open until the last part is written.
        with open(self.read_full_name, "rb") as src:
            pdf_reader = PyPDF2.PdfFileReader(src)
            all_bookmarks = self.GetBookmark(pdf_reader)
            all_reader_num = pdf_reader.getNumPages()
            ori_page_num = len(all_bookmarks)

            pdf_writer = PyPDF2.PdfFileWriter()
            # Index (in the source) of the first page of the current part.
            # Starting from the first bookmark keeps bookmark targets right
            # even if that bookmark is not on page 0.
            sub_page = all_bookmarks[0] if all_bookmarks else 0
            start_page = 1
            for i in range(ori_page_num):
                is_last = i == ori_page_num - 1
                # Slices of original page i run up to the next bookmark.
                beg_page = all_bookmarks[i]
                end_page = all_reader_num if is_last else all_bookmarks[i+1]
                for page_idx in range(beg_page, end_page):
                    pdf_writer.addPage(pdf_reader.getPage(page_idx))
                # Bookmark indices are relative to the current output file.
                bookmark_str = '%03d' % (i+1)
                pdf_writer.addBookmark(bookmark_str, beg_page - sub_page)

                # Flush once con_num original pages are collected, or at
                # the end of the document.
                if (i+1) % con_num == 0 or is_last:
                    start_str = '%03d' % start_page
                    end_str = '%03d' % (i+1)
                    if con_num == 1:
                        full_name = write_fn + '_' + start_str + '.pdf'
                    else:
                        full_name = write_fn + '_' + start_str + '_' + end_str + '.pdf'
                    print("Write %s..." % full_name)
                    with open(full_name, "wb") as out:
                        pdf_writer.write(out)
                    if not is_last:
                        start_page = i + 2
                        sub_page = all_bookmarks[i+1]
                    pdf_writer = PyPDF2.PdfFileWriter()
def ProcessSlicePDF(args):
    """
    Child-process entry point: run one SlicePDF split.

    params:
        args, dict, must contain the keys 'read_full_name', 'writepath'
            and 'con_num'; they are forwarded to SlicePDF.BatchConvertPDF
            as keyword arguments
    """
    option_names = ('read_full_name', 'writepath', 'con_num')
    options = {}
    for option in option_names:
        options[option] = args[option]
    slicer = SlicePDF()
    slicer.BatchConvertPDF(**options)
if __name__ == '__main__':
    # Settings for the split:
    #   read_full_name, str, path of the PDF to read
    #   writepath, str, output directory
    #   con_num, int, number of original pages per output PDF; must be
    #       >= 1 (smaller values make BatchConvertPDF exit)
    args = {
        'read_full_name':'write/bbb.pdf', # change this to the input file
        'writepath':'write',
        'con_num':10 # change this to the number of original pages per part
    }
    # Run the split in a child process and wait for it to finish.
    p = Process(target=ProcessSlicePDF,args=(args,))
    p.start()
    p.join()
注意:读入的pdf一定是第一个脚本处理过的pdf,否则可能报错
修改上面代码中的read_full_name为要读入的pdf文件,con_num表示每多少页拆分成一个文件,其中每页代表最原始的pdf的页面,然后运行该文件,则得到拆分后的pdf。