【python+pdf】使用python进行pdf中文字,表格和图片的提取(输出为txt,excel和png)

使用python进行pdf中文字,表格和图片的提取(输出为txt,excel和png)

此代码提供了以下几个功能:

  • 提取某个PDF中的全部文字和全部表格并输出
  • 提取某个PDF中全部的图片并依序输出
  • 提取某个PDF中某页的文字和表格并输出

其他一些关于PDF的操作可以在Acrobat上更容易地实现

我使用的例子是一份中期票据的募集说明书(前一百页)
你可以在上海清算所很容易地获得 https://www.shclearing.com.cn/xxpl/fxpl/mtn/202203/t20220314_1020626.html

# -*- coding: utf-8 -*-
"""
Created on Sat Jul 16 21:30:00 2022

@author: KX0507
"""

"""
I hope this script is helpful to you.
During an internship I found that PDF-related work comes up often. Merging and splitting PDFs is easy with Acrobat,
but an easier way to extract text, tables and even pictures should be useful to many people working in industry.
You can do all of that here by changing only the path of the input file and the path of the output folder.
"""

""" 
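NOTE: if you are using Python for the first time,
first learn how to install the packages imported below with pip (pandas, numpy, pdfplumber, and PyMuPDF, which provides the fitz module).
The tutorial below makes this easy; go directly to its 4th part: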
https://cloud.tencent.com/developer/article/1921353 
"""

"""
此代码提供了从PDF文件中解析出文字,表格及图片的功能,并直接以txt,excel和png的方式输出
你只需要更改path和out_path为你自己的文件路径即可  
"""
import pandas as pd 
import numpy as np 
import pdfplumber  
import os 
import fitz 
import re 



# Input PDF to parse; change this to the location of your own file.
path = r'C:\Users\KX0507\Desktop\使用python进行PDF操作\2022年募集说明书.pdf'
# Directory that receives the generated output files; change this as well.
out_path  = r'C:\Users\KX0507\Desktop\使用python进行PDF操作'



def extract_tables(filepath, outpath, name='name'):
    """Extract every table of a PDF and write them all into one Excel file.

    The tables are stacked vertically in page order, each one preceded by
    two blank rows so that neighbouring tables stay visually separated.
    The result is saved as ``tables of <name>.xlsx`` inside ``outpath``.
    If the PDF contains no table, an empty workbook is written.

    @filepath : path of the PDF file to parse
    @outpath  : directory that receives the Excel file (created if missing)
    @name     : label used in the output file name
    """
    os.makedirs(outpath, exist_ok=True)

    # Two rows of empty values inserted ahead of every table.
    separator = pd.DataFrame([np.nan, np.nan])

    # Collect all frames first and concatenate once at the end; growing a
    # DataFrame with concat inside the loop re-copies it on every table.
    frames = []
    with pdfplumber.open(filepath) as pdf:
        for page in pdf.pages:
            # extract_tables() returns one entry per table found on the page,
            # each entry being a list of rows (lists of cell values).
            for table in page.extract_tables():
                frames.append(separator)
                frames.append(pd.DataFrame(table))

    # pd.concat rejects an empty list, so fall back to an empty frame.
    df = pd.concat(frames) if frames else pd.DataFrame()

    # os.path.join instead of a hard-coded '\\' keeps the path valid on any OS.
    df.to_excel(os.path.join(outpath, 'tables of ' + str(name) + '.xlsx'))

def extract_words(filepath, outpath, name='name'):
    """Extract the text of every page of a PDF into a single UTF-8 txt file.

    The text is saved as ``texts of <name>.txt`` inside ``outpath``, with the
    pages written in order and separated by a newline.

    @filepath : path of the PDF file to parse
    @outpath  : directory that receives the txt file (created if missing)
    @name     : label used in the output file name
    """
    os.makedirs(outpath, exist_ok=True)

    # Read from the ``filepath`` argument (not the module-level ``path``) so
    # the function works for whichever file the caller passes in.
    with pdfplumber.open(filepath) as pdf:
        # A page without extractable text may yield None; treat it as empty
        # so the write below cannot fail on it.
        page_texts = [page.extract_text() or '' for page in pdf.pages]

    # Write into the ``outpath`` argument (not the module-level ``out_path``).
    target = os.path.join(outpath, 'texts of ' + str(name) + '.txt')
    with open(target, 'w', encoding='utf-8') as f:
        # Separate pages with a newline; otherwise the last line of one page
        # runs straight into the first line of the next.
        f.write('\n'.join(page_texts))
                
def single_extract(filepath, outpath, page_number=1, name='name'):
    """Extract the tables and the text of one page of a PDF.

    Writes two files into ``outpath``:
      * ``tables of page <page_number> of <name>.xlsx`` - every table on the
        page, stacked vertically, each preceded by one blank row;
      * ``texts of page <page_number> of <name>.txt`` - the page text (UTF-8).

    @filepath    : path of the PDF file to parse
    @outpath     : directory that receives both files (created if missing)
    @page_number : 1-based page number, must be an int
    @name        : label used in the output file names

    Raises IndexError when ``page_number`` is outside 1..number of pages.
    """
    os.makedirs(outpath, exist_ok=True)

    # Open the ``filepath`` argument rather than the module-level ``path``.
    with pdfplumber.open(filepath) as pdf:
        page_count = len(pdf.pages)
        # Reject 0 and negative numbers explicitly: page_number - 1 would
        # otherwise index from the end and silently return the wrong page.
        if not 1 <= page_number <= page_count:
            raise IndexError(
                'page_number must be between 1 and ' + str(page_count)
                + ', got ' + str(page_number)
            )
        page = pdf.pages[page_number - 1]
        tables = page.extract_tables()
        # A page without extractable text may yield None; treat it as empty.
        text = page.extract_text() or ''

    # One row of empty values ahead of every table keeps adjacent tables apart.
    separator = pd.DataFrame([np.nan])
    frames = []
    for table in tables:
        frames.append(separator)
        frames.append(pd.DataFrame(table))
    # pd.concat rejects an empty list, so fall back to an empty frame.
    df = pd.concat(frames) if frames else pd.DataFrame()

    suffix = 'page ' + str(page_number) + ' of ' + str(name)
    df.to_excel(os.path.join(outpath, 'tables of ' + suffix + '.xlsx'))

    # Write next to the Excel file, i.e. into ``outpath`` (not the global
    # ``out_path``), and in a single call instead of character by character.
    with open(os.path.join(outpath, 'texts of ' + suffix + '.txt'), 'w', encoding='utf-8') as f:
        f.write(text)

def extract_pic(filepath, pic_dirpath, cutoff=0):
    """
    Extract the images embedded in a PDF and save them as PNG files.

    Every PDF object whose definition declares ``/Subtype /Image`` is rendered
    to ``img_<k>.png`` inside ``pic_dirpath``, where ``k`` counts the image
    objects in the order they are found.

    @param filepath: path of the PDF file to parse
    @param pic_dirpath: directory that receives the PNG files (created if missing)
    @param cutoff: images whose pixmap size is below this value are skipped;
                   a skipped image still consumes its number
    """
    os.makedirs(pic_dirpath, exist_ok=True)

    # Only image objects can be turned into a Pixmap. Matching on /Type
    # /XObject as well would also let through non-image XObjects (forms).
    check_Image = re.compile(r"/Subtype(?= */Image)")
    img_count = 0

    pdf_info = fitz.open(filepath)
    try:
        xref_len = pdf_info.xref_length()
        print("文件名:{}, 页数: {}, 对象: {}".format(filepath, len(pdf_info), xref_len-1))

        # xref numbers start at 1; entry 0 is reserved.
        for index in range(1, xref_len):
            text = pdf_info.xref_object(index)
            if not check_Image.search(text):
                continue

            img_count += 1
            # Build the image from the object with this xref number.
            pix = fitz.Pixmap(pdf_info, index)

            if pix.size < cutoff:
                continue
            # PNG only supports gray and RGB. Anything with 4 or more colour
            # components (CMYK, with or without alpha) is converted to RGB first.
            if pix.n - pix.alpha >= 4:
                pix = fitz.Pixmap(fitz.csRGB, pix)

            pix.save(os.path.join(pic_dirpath, 'img_' + str(img_count) + '.png'))
    finally:
        # Release the file handle even if an image fails to convert or save.
        pdf_info.close()


# Run the demo on the configured sample file: text of the whole document,
# tables of the whole document, text + tables of page 32, then all images.
extract_words(filepath=path, outpath=out_path, name='Sample')
extract_tables(filepath=path, outpath=out_path, name='Sample')
single_extract(filepath=path, outpath=out_path, page_number=32, name='Sample')
extract_pic(filepath=path, pic_dirpath=out_path)

【python+pdf】使用python进行pdf中文字,表格和图片的提取(输出为txt,excel和png)_第1张图片

你可能感兴趣的:(Python与文件处理,python,金融)