所用技术
1. python编程基础
2. 使用pyPdf
3. 使用python操作word
4. 正则表达式的使用
5. windows的bat编程
下面是一个pyPdf库使用的示例:
from pyPdf import PdfFileWriter, PdfFileReader

output = PdfFileWriter()

# Keep both input streams open until output.write() has finished (pyPdf may
# still read page data from them while writing); `with` closes them afterwards.
with open("document1.pdf", "rb") as inputStream:
    with open("watermark.pdf", "rb") as watermarkStream:
        input1 = PdfFileReader(inputStream)

        # add page 1 from input1 to output document, unchanged
        output.addPage(input1.getPage(0))

        # add page 2 from input1, but rotated clockwise 90 degrees
        output.addPage(input1.getPage(1).rotateClockwise(90))

        # add page 3 from input1, rotated the other way:
        output.addPage(input1.getPage(2).rotateCounterClockwise(90))
        # alt: output.addPage(input1.getPage(2).rotateClockwise(270))

        # add page 4 from input1, but first add a watermark from another pdf:
        page4 = input1.getPage(3)
        watermark = PdfFileReader(watermarkStream)
        page4.mergePage(watermark.getPage(0))
        # The merged page must be added explicitly, otherwise it never
        # reaches the output document.
        output.addPage(page4)

        # add page 5 from input1, but crop it to half size:
        page5 = input1.getPage(4)
        page5.mediaBox.upperRight = (
            page5.mediaBox.getUpperRight_x() / 2,
            page5.mediaBox.getUpperRight_y() / 2
        )
        output.addPage(page5)

        # print how many pages input1 has:
        print("document1.pdf has %s pages." % input1.getNumPages())

        # finally, write "output" to document-output.pdf
        with open("document-output.pdf", "wb") as outputStream:
            output.write(outputStream)
有了该库,就可以很容易将现有的pdf做分割。
我的需求是将pdf每一页中的关键字提取出来,用它来作为文件名。pyPdf提供了将pdf页面中的文字全部提取出来的方法:
# Extract the text of the first page (index 0) of the PDF.
inputfile.getPage(0).extractText()
这里返回的是unicode,需要转为str:
# Same extraction, with the result encoded as UTF-8.
inputfile.getPage(0).extractText().encode("utf-8")
然后将每页的关键字提取出来,增加函数如下:
# Captures the text between "Blattname: " and the next "project" in the text
# extracted from a page.
p_sheetName = re.compile(r'Blattname: (.+?)project')


def getSheetName(str):
    """Return the sheet name found in the page text, or None.

    The sheet name is the shortest non-empty run of characters that follows
    "Blattname: " and precedes "project". None is returned when the text
    does not contain that pattern.
    """
    m = p_sheetName.search(str)
    if m:
        return m.group(1)
    return None
最终代码如下:
import os
import re

from pyPdf import PdfFileWriter, PdfFileReader

# Captures the text between "Blattname: " and the next "project" in the text
# extracted from a page.
p_sheetName = re.compile(r'Blattname: (.+?)project')


def getSheetName(str):
    """Return the sheet name found in the page text, or None if absent."""
    m = p_sheetName.search(str)
    if m:
        return m.group(1)
    return None


def splitpdf(srcFile):
    """Split srcFile into one single-page PDF per page.

    The pages are written into a directory named after srcFile without its
    extension (created if missing). Each file is named
    "<page number> <sheet name>.pdf"; when no sheet name is found on a page
    the file is named "<page number>.pdf" instead of embedding "None".
    """
    # `with` guarantees the source file is closed even if a page fails.
    with open(srcFile, "rb") as input1:
        inputfile = PdfFileReader(input1)
        numofpages = inputfile.getNumPages()
        print("pages: %d" % numofpages)

        # new directory
        folderName, ext_ = os.path.splitext(srcFile)
        if not os.path.isdir(folderName):
            os.makedirs(folderName)

        for page_index in range(1, numofpages + 1):
            page = inputfile.getPage(page_index - 1)
            output = PdfFileWriter()
            output.addPage(page)
            sheetName = getSheetName(page.extractText().encode("utf-8"))

            # save file
            if sheetName is None:
                fileName = "%d.pdf" % page_index
            else:
                fileName = "%d %s.pdf" % (page_index, sheetName)
            saveFileName = os.path.join(folderName, fileName)
            print(saveFileName)
            with open(saveFileName, "wb") as outputFile:
                output.write(outputFile)


splitpdf("E:\\test.pdf")
下一步,将pdf文件名参数化(通过命令行参数传入):
import os
import re
import string
import sys

from pyPdf import PdfFileWriter, PdfFileReader


def translator(frm='', to='', delete='', keep=None):
    """Build a function that maps and/or deletes characters of a byte string.

    frm/to  -- characters in frm are replaced by the character at the same
               position in to; a single-character to is repeated to cover
               every character of frm.
    delete  -- characters to remove.
    keep    -- when given, every character that is not in keep (after the
               delete characters have been removed from it) is removed.

    Returns a one-argument function that applies the translation.
    """
    if len(to) == 1:
        to = to * len(frm)
    trans = string.maketrans(frm, to)
    if keep is not None:
        allchars = string.maketrans('', '')
        # Everything except (keep minus delete) ends up in the delete set.
        delete = allchars.translate(allchars, keep.translate(allchars, delete))

    def translate(s):
        return s.translate(trans, delete)
    return translate


# Removes the characters / : \ ? * > < | from a string before it is used
# as part of a file name.
delete_some_speicl = translator(delete="/:\\?*><|")

# Captures the text between "Blattname: " and the next "project" in the text
# extracted from a page.
p_sheetName = re.compile(r'Blattname: (.+?)project')


def getSheetName(str):
    """Return the sanitized sheet name found in the page text, or None.

    None is returned when the text does not contain the pattern, instead of
    failing on the missing match object.
    """
    m = p_sheetName.search(str)
    if m is None:
        return None
    return delete_some_speicl(m.group(1))


def splitpdf(srcFile):
    """Split srcFile into one single-page PDF per page.

    The pages are written into a directory named after srcFile without its
    extension (created if missing), as "<page number> <sheet name>.pdf", or
    "<page number>.pdf" when a page has no sheet name. Any error is reported
    by printing the exception; nothing is re-raised.
    """
    try:
        folderName, ext_ = os.path.splitext(srcFile)
        # Compare case-insensitively so "X.PDF" is accepted too.
        if ext_.lower() != '.pdf':
            raise Exception(os.path.basename(srcFile) + " is not pdf!")

        # `with` guarantees the source file is closed even if a page fails.
        with open(srcFile, "rb") as input1:
            inputfile = PdfFileReader(input1)
            numofpages = inputfile.getNumPages()
            print("pages: %d" % numofpages)

            # new directory
            if not os.path.isdir(folderName):
                os.makedirs(folderName)

            for page_index in range(1, numofpages + 1):
                page = inputfile.getPage(page_index - 1)
                output = PdfFileWriter()
                output.addPage(page)
                sheetName = getSheetName(page.extractText().encode("utf-8"))

                # save file
                if sheetName is None:
                    fileName = "%d.pdf" % page_index
                else:
                    fileName = "%d %s.pdf" % (page_index, sheetName)
                saveFileName = os.path.join(folderName, fileName)
                print(saveFileName)
                with open(saveFileName, "wb") as outputFile:
                    output.write(outputFile)

        print("Split success!")
        print("please find them at " + folderName)
    except Exception as e:
        print(e)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('usage: %s filename' % os.path.basename(sys.argv[0]))
        # A missing argument is a usage error: exit with a non-zero status.
        sys.exit(1)
    splitpdf(sys.argv[1])
这里translator函数用来将关键字中的特殊字符(/:\?*><|)过滤掉,因为文件名中含有这些字符时,新建文件可能会出错。
其实pdf分开之后还需要一些手动操作,不然还得用vba把它们导入到word中。我想直接用python干完这些事,于是就用到了win32com来操作word。
下面是使用win32com操作word的一个示例:
import win32com
from win32com.client import Dispatch, constants

# NOTE: filenamein, filenameout, OldStr and NewStr are placeholders; define
# them before running this example.
# NOTE(review): the wd* names in `constants` are only available once makepy
# support for Word has been generated (e.g. via
# win32com.client.gencache.EnsureDispatch('Word.Application')) -- confirm on
# the target machine.

w = win32com.client.Dispatch('Word.Application')
# Alternatively, start Word as a separate process:
# w = win32com.client.DispatchEx('Word.Application')

# Run in the background: no window, no alerts
w.Visible = 0
w.DisplayAlerts = 0

# Open an existing file
doc = w.Documents.Open( FileName = filenamein )
# worddoc = w.Documents.Add() # create a new document

# Insert text
myRange = doc.Range(0,0)
myRange.InsertBefore('Hello from Python!')

# Apply a style. Range.Select() returns nothing, so the style is set on the
# application's Selection object after selecting the range.
myRange.Select()
wordSel = w.Selection
wordSel.Style = constants.wdStyleHeading1

# Find and replace in the body text
w.Selection.Find.ClearFormatting()
w.Selection.Find.Replacement.ClearFormatting()
w.Selection.Find.Execute(OldStr, False, False, False, False, False, True, 1, True, NewStr, 2)

# Find and replace in the page header
w.ActiveDocument.Sections[0].Headers[0].Range.Find.ClearFormatting()
w.ActiveDocument.Sections[0].Headers[0].Range.Find.Replacement.ClearFormatting()
w.ActiveDocument.Sections[0].Headers[0].Range.Find.Execute(OldStr, False, False, False, False, False, True, 1, False, NewStr, 2)

# Table operations
doc.Tables[0].Rows[0].Cells[0].Range.Text ='123123'
doc.Tables[0].Rows.Add() # append a row

# Convert to html
wc = win32com.client.constants
w.ActiveDocument.WebOptions.RelyOnCSS = 1
w.ActiveDocument.WebOptions.OptimizeForBrowser = 1
w.ActiveDocument.WebOptions.BrowserLevel = 0 # constants.wdBrowserLevelV4
w.ActiveDocument.WebOptions.OrganizeInFolder = 0
w.ActiveDocument.WebOptions.UseLongFileNames = 1
w.ActiveDocument.WebOptions.RelyOnVML = 0
w.ActiveDocument.WebOptions.AllowPNG = 1
w.ActiveDocument.SaveAs( FileName = filenameout, FileFormat = wc.wdFormatHTML )

# Print
doc.PrintOut()

# Close
# doc.Close()
w.Documents.Close(wc.wdDoNotSaveChanges)
w.Quit()
仿照上例,修改前面的代码如下:
import os
import re
import string
import sys

import win32com
from pyPdf import PdfFileWriter, PdfFileReader
from win32com.client import Dispatch, constants

# Make sure makepy support for Word is generated; the wd* names read from
# `constants` below come from it.
win32com.client.gencache.EnsureDispatch('Word.Application')


def translator(frm='', to='', delete='', keep=None):
    """Build a function that maps and/or deletes characters of a byte string.

    frm/to  -- characters in frm are replaced by the character at the same
               position in to; a single-character to is repeated to cover
               every character of frm.
    delete  -- characters to remove.
    keep    -- when given, every character that is not in keep (after the
               delete characters have been removed from it) is removed.

    Returns a one-argument function that applies the translation.
    """
    if len(to) == 1:
        to = to * len(frm)
    trans = string.maketrans(frm, to)
    if keep is not None:
        allchars = string.maketrans('', '')
        # Everything except (keep minus delete) ends up in the delete set.
        delete = allchars.translate(allchars, keep.translate(allchars, delete))

    def translate(s):
        return s.translate(trans, delete)
    return translate


# Removes the characters / : \ ? * > < | from a string before it is used
# as part of a file name.
delete_some_speicl = translator(delete="/:\\?*><|")

# Captures the text between "Blattname: " and the next "project" in the text
# extracted from a page.
p_sheetName = re.compile(r'Blattname: (.+?)project')


def getSheetName(str):
    """Return the raw sheet name found in the page text, or None if absent."""
    m = p_sheetName.search(str)
    if m is None:
        return None
    return m.group(1)


def splitPdfToWord(srcFile, oleClassType="AcroExch.Document.11"):
    """Split srcFile into single-page PDFs and embed them in a Word document.

    Each page is saved as "<page number> <sheet name>.pdf" (or
    "<page number>.pdf" when the page has no sheet name) in a directory named
    after srcFile without its extension. A new Word document gets, per page,
    a Heading 1 line, the page PDF embedded as an OLE object, and a page
    break; it is saved next to the directory as "<name>.doc".

    srcFile      -- path of the PDF to split.
    oleClassType -- OLE class used to embed each PDF; the default is the
                    value the script has always used.

    Any error is reported by printing the exception; nothing is re-raised.
    Word is quit at the end if it was started.
    """
    # Stays None until Word is started, so `finally` never touches an
    # unbound name when an earlier step fails.
    wordApp = None
    try:
        # Word is a separate process and does not share this script's
        # working directory, so hand it absolute paths only.
        srcFile = os.path.abspath(srcFile)
        folderName, ext_ = os.path.splitext(srcFile)
        # Compare case-insensitively so "X.PDF" is accepted too.
        if ext_.lower() != '.pdf':
            raise Exception(os.path.basename(srcFile) + " is not pdf!")

        # `with` guarantees the source file is closed even if a page fails.
        with open(srcFile, "rb") as input1:
            inputfile = PdfFileReader(input1)
            numofpages = inputfile.getNumPages()
            print("Total Pages: %d" % numofpages)

            wordApp = win32com.client.Dispatch('Word.Application')
            wordApp.Visible = False
            wordApp.DisplayAlerts = 0
            doc = wordApp.Documents.Add()
            sel = wordApp.Selection

            # new directory
            if not os.path.isdir(folderName):
                os.makedirs(folderName)

            for page_index in range(1, numofpages + 1):
                page = inputfile.getPage(page_index - 1)
                output = PdfFileWriter()
                output.addPage(page)
                sheetName = getSheetName(page.extractText().encode("utf-8"))

                if sheetName is None:
                    heading = "Page%d" % page_index
                    fileName = "%d.pdf" % page_index
                else:
                    heading = "Page%d %s" % (page_index, sheetName)
                    # Only the file name needs the forbidden characters
                    # removed; the heading keeps the original text.
                    fileName = "%d %s.pdf" % (
                        page_index, delete_some_speicl(sheetName))

                sel.Style = constants.wdStyleHeading1
                sel.TypeText(heading)

                # save file
                saveFileName = os.path.join(folderName, fileName)
                print("Add Page %d" % page_index)
                with open(saveFileName, "wb") as outputFile:
                    output.write(outputFile)

                sel.TypeParagraph()
                sel.Style = constants.wdStyleBodyText
                sel.InlineShapes.AddOLEObject(ClassType=oleClassType,
                                              FileName=saveFileName)
                sel.InsertBreak(Type=constants.wdPageBreak)

        doc.SaveAs(folderName + ".doc")
        print("Split success!")
        print("please find them at " + folderName)
        print("create word document success!")
        print("Location:" + folderName + ".doc")
    except Exception as e:
        print(e)
    finally:
        if wordApp is not None:
            # The document is already saved on success; on failure the
            # partial document is discarded rather than left pending.
            wordApp.Quit(constants.wdDoNotSaveChanges)


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print('usage: %s filename' % os.path.basename(sys.argv[0]))
        sys.exit(1)
    splitPdfToWord(sys.argv[1])