import zipfile
import os
import shutil
import pytesseract
import PIL
from PIL import Image
from docx import Document
from docx.shared import Pt
from docx.oxml.ns import qn
class get_result:
    """Extract the images embedded in .docx files and OCR them into new documents.

    A .docx file is a zip archive; ``get_image`` unpacks its image members
    under ``<b>/image/<name>/`` and ``insert_word`` runs Tesseract OCR over
    the files found in ``<b>/image/<name>/word/media`` and saves the
    recognised text as ``<b>/<name>.docx``.

    Args:
        dir_str: Format string with one ``{0}`` placeholder that yields the
            path of the source document for each entry of ``file_list``.
        file_list: Iterable of document names (without extension).
        b: Base directory that receives the ``image`` folder and the
            generated documents.
    """

    # Archive members whose lower-cased name ends with one of these suffixes
    # are treated as images.
    _IMAGE_SUFFIXES = (".jpeg", ".jpg", ".png", ".gif")

    def __init__(self, dir_str, file_list, b):
        self.dir_str = dir_str
        self.file_list = file_list
        self.b = b

    def get_image(self):
        """Unpack every image member of each source document.

        Any previous extraction folder for a document is removed first, but
        only once the archive has been opened successfully. Failures are
        reported with ``print`` and do not stop the remaining documents.
        """
        for s in self.file_list:
            # "{0}".format(...) keeps non-string entries (e.g. ints) working,
            # as they did with the original string formatting.
            name = "{0}".format(s)
            file_path = self.dir_str.format(s)
            r_path = os.path.join(self.b, "image", name)
            try:
                # The context manager guarantees the archive handle is closed.
                with zipfile.ZipFile(file_path) as doc:
                    if os.path.exists(r_path):
                        shutil.rmtree(r_path)
                    # makedirs also creates the parent "image" directory,
                    # which does not exist on a first run.
                    os.makedirs(r_path)
                    for info in doc.infolist():
                        if info.filename.lower().endswith(self._IMAGE_SUFFIXES):
                            doc.extract(info, r_path)
            except Exception as e:
                # Best effort: report the problem and carry on with the next file.
                print(e)
        print("图片提取完成")

    def insert_word(self):
        """OCR the extracted images and save the text as a Word document.

        For each document the files in ``<b>/image/<name>/word/media`` are
        recognised in sorted file-name order using Tesseract's ``chi_sim``
        language data; every result is preceded by a newline. The text is
        written to ``<b>/<name>.docx``; when the media folder is missing the
        saved document contains an empty paragraph. Failures are reported
        with ``print`` and do not stop the remaining documents.
        """
        for s in self.file_list:
            name = "{0}".format(s)
            r_path = os.path.join(self.b, "image", name, "word", "media")
            docx_path = os.path.join(self.b, name + ".docx")
            try:
                t = ""
                if os.path.exists(r_path):
                    pieces = []
                    # Sorted so the output order does not depend on the
                    # arbitrary order returned by os.listdir.
                    for filename in sorted(os.listdir(r_path)):
                        # Close each image file as soon as it has been read.
                        with Image.open(os.path.join(r_path, filename)) as img:
                            pieces.append(
                                "\n" + str(pytesseract.image_to_string(img, lang="chi_sim"))
                            )
                    t = "".join(pieces)

                # Write the recognised text into a new Word document.
                doc = Document()
                doc.styles["Normal"].font.name = u"微软雅黑"
                doc.styles["Normal"].font.size = Pt(14)
                # The East Asian font is set separately from the Latin font name.
                doc.styles['Normal']._element.rPr.rFonts.set(qn('w:eastAsia'), u'微软雅黑')
                doc.add_paragraph(t)
                doc.save(docx_path)
            except Exception as e:
                # Best effort: report the problem and carry on with the next file.
                print(e)
        print('数据写入完成')
def r(dir_str, file_list, b):
    """Run both processing stages for the given documents.

    Builds a ``get_result`` from the three arguments, then calls its
    ``get_image`` method followed by its ``insert_word`` method.

    Args:
        dir_str: Passed through to ``get_result`` as ``dir_str``.
        file_list: Passed through to ``get_result`` as ``file_list``.
        b: Passed through to ``get_result`` as ``b``.
    """
    pipeline = get_result(dir_str, file_list, b)
    # Stage order matters: insert_word is called only after get_image.
    pipeline.get_image()
    pipeline.insert_word()
if __name__ == "__main__":
    # Source documents are looked up as "<name>.docx" relative to the
    # current working directory.
    dir_str = '{0}.docx'
    # A list (rather than a set) keeps the processing order stable between runs.
    file_list = [
        '201310',
        '201410',
        '201510',
    ]
    # Output (the "image" folder and the generated documents) goes to the
    # current working directory.
    b = os.getcwd()
    # NOTE(review): with these settings the generated "<b>/<name>.docx" is the
    # same path as the source document, so the source is overwritten by the
    # OCR result - confirm this is intended.
    r(dir_str, file_list, b)