Python读取PDF、WORD、EXCEL、PPT里文本

文章目录

  • 场景
  • PDF
  • WORD
    • 读取段落
    • 读取表格
  • EXCEL
  • PPT

场景

获取文件中的文本内容(只读不写)

PDF

安装:pip install pdfminer3k

from io import StringIO
from pdfminer.converter import TextConverter
from pdfminer.layout import LAParams
from pdfminer.pdfinterp import PDFResourceManager, process_pdf

def read_pdf(path_pdf):
    """Extract the text of a PDF file and return it as a single string.

    The file is opened in binary mode and fed through pdfminer's
    ``process_pdf`` with a ``TextConverter`` that writes into an in-memory
    ``StringIO`` buffer.

    Args:
        path_pdf: Path of the PDF file to read.

    Returns:
        The text accumulated in the buffer after processing.

    Raises:
        OSError: If the file cannot be opened.
        Any exception raised by pdfminer while processing is propagated
        unchanged; the converter and the buffer are still closed.
    """
    # Both the input file and the output buffer are context-managed so they
    # are released even when processing fails part-way through.
    with open(path_pdf, 'rb') as pdf, StringIO() as outfp:
        # Resource manager shared between the interpreter and the converter.
        rsrcmgr = PDFResourceManager()
        # Converter that writes extracted text into the in-memory buffer,
        # using default layout-analysis parameters.
        device = TextConverter(rsrcmgr, outfp, laparams=LAParams())
        try:
            process_pdf(rsrcmgr, device, pdf)
            # Read the buffer before the device is closed, as the original did.
            return outfp.getvalue()
        finally:
            device.close()

if __name__ == '__main__':
    # Demo: extract the text of a sample PDF and dump it to stdout.
    pdf_text = read_pdf('P020190716349644060705.pdf')
    print(pdf_text)

WORD

安装:pip install python-docx

读取段落

from docx import Document
# Open the Word document.
document = Document('a.docx')
# Print the text of every paragraph the document exposes.
for para in document.paragraphs:
    print(para.text)

读取表格

from docx import Document
# Open the Word document.
document = Document('a.docx')
# Lazily walk every table, row by row, yielding each cell in turn.
all_cells = (
    cell
    for table in document.tables
    for row in table.rows
    for cell in row.cells
)
# Print the text of each cell.
for cell in all_cells:
    print(cell.text)

EXCEL

from pandas import read_excel

def xlsx2df(fname, sheet_name=0):
    """Load a sheet of an Excel file via pandas ``read_excel``.

    Args:
        fname: Forwarded to ``read_excel`` as its first argument
            (e.g. a file path).
        sheet_name: Sheet selector forwarded to ``read_excel``;
            defaults to ``0``.

    Returns:
        Whatever ``read_excel`` returns for the given arguments.
    """
    frame = read_excel(fname, sheet_name=sheet_name)
    return frame

PPT

安装:pip install python-pptx

import pptx
# Open the presentation.
p = pptx.Presentation('a.pptx')
# Visit every shape on every slide.
for slide in p.slides:
    for shape in slide.shapes:
        # Text: ask the shape whether it carries a text frame instead of
        # testing for one concrete class, so that plain text boxes and auto
        # shapes are printed as well as placeholders.
        if shape.has_text_frame:
            for paragraph in shape.text_frame.paragraphs:
                print(paragraph.text)
        # Tables: only touch ``shape.table`` when the shape reports that it
        # actually holds a table (a graphic frame may hold a chart instead).
        # ``getattr`` with a False default keeps this safe for shape classes
        # that do not expose ``has_table``.
        if getattr(shape, 'has_table', False):
            for cell in shape.table.iter_cells():
                print(cell.text)

你可能感兴趣的:(数据处理)