# 此代码提供了以下功能:从PDF文件中解析出文字、表格及图片,并分别以txt、excel和png的方式输出。
# 其他一些关于PDF的操作(例如合并和拆分)可以在Acrobat上更容易地实现。
# 我使用的例子是一份中期票据的募集说明书(前一百页),
# 你可以在上海清算所很容易地获得: https://www.shclearing.com.cn/xxpl/fxpl/mtn/202203/t20220314_1020626.html
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 16 21:30:00 2022
@author: KX0507
"""
"""
Hope this is helpful for you.
During an internship I found that PDF-related work comes up often; merging and splitting are easy with Acrobat,
but an easier way to extract text, tables and even pictures may be useful to many people working in industry.
Here you can do all of that by changing only the path of the input file and the path of the output folder.
"""
"""
!!!!NOTE: if you are using Python for the first time,
learn how to `pip install` the packages imported below.
The following tutorial makes this easy; go directly to its 4th part:
https://cloud.tencent.com/developer/article/1921353
"""
"""
此代码提供了从PDF文件中解析出文字,表格及图片的功能,并直接以txt,excel和png的方式输出
你只需要更改path和out_path为你自己的文件路径即可
"""
import pandas as pd
import numpy as np
import pdfplumber
import os
import fitz
import re
# Input PDF to parse -- point this at your own file.
path = r"C:\Users\KX0507\Desktop\使用python进行PDF操作\2022年募集说明书.pdf"
# Folder that receives every generated file (txt / xlsx / png).
out_path = r"C:\Users\KX0507\Desktop\使用python进行PDF操作"
def extract_tables(filepath, outpath, name='name'):
    """Extract every table of a PDF and write them all to one Excel workbook.

    Args:
        filepath: Path of the PDF file to parse.
        outpath: Directory that receives the workbook; created if missing.
        name: Label used in the output file name, "tables of <name>.xlsx".

    The tables are stacked vertically in page order, each one preceded by
    two empty rows that act as a visual separator. When the PDF contains no
    table at all, an empty workbook is written.
    """
    os.makedirs(outpath, exist_ok=True)

    # Two rows of empty values placed in front of every table to keep
    # consecutive tables apart in the sheet.
    separator = pd.DataFrame([np.nan, np.nan])

    frames = []
    with pdfplumber.open(filepath) as pdf:
        for page in pdf.pages:
            # extract_tables() returns one entry per table found on the
            # page; each entry is a list of rows (lists of cell values).
            for table in page.extract_tables():
                frames.append(separator)
                frames.append(pd.DataFrame(table))

    # Concatenate once at the end instead of once per table: repeated
    # concat re-copies the growing frame every time. pd.concat rejects an
    # empty list, hence the explicit fallback.
    df = pd.concat(frames) if frames else pd.DataFrame()

    # os.path.join instead of a hard-coded backslash so the function is not
    # tied to Windows path separators.
    df.to_excel(os.path.join(outpath, 'tables of ' + str(name) + '.xlsx'))
def extract_words(filepath, outpath, name='name'):
    """Extract the text of every page of a PDF into one UTF-8 text file.

    Args:
        filepath: Path of the PDF file to parse.
        outpath: Directory that receives the text file; created if missing.
        name: Label used in the output file name, "texts of <name>.txt".

    The page texts are written back to back, in page order, with no extra
    separator between pages.
    """
    os.makedirs(outpath, exist_ok=True)

    # Read from the `filepath` argument (the previous code opened the
    # module-level `path`, silently ignoring what the caller passed in).
    with pdfplumber.open(filepath) as pdf:
        # NOTE(review): extract_text() appears to return None for a page
        # without text in some pdfplumber versions -- fall back to '' so the
        # write below cannot fail on such pages.
        page_texts = [page.extract_text() or '' for page in pdf.pages]

    # Write into the `outpath` argument (previously the module-level
    # `out_path`), built with os.path.join rather than a literal backslash.
    target = os.path.join(outpath, 'texts of ' + str(name) + '.txt')
    with open(target, 'w', encoding='utf-8') as f:
        f.writelines(page_texts)
def single_extract(filepath, outpath, page_number=1, name='name'):
    """Extract the tables and the text of one particular page of a PDF.

    Args:
        filepath: Path of the PDF file to parse.
        outpath: Directory that receives the output files; created if
            missing.
        page_number: 1-based number of the page to extract.
        name: Label used in the output file names.

    Writes "tables of page <n> of <name>.xlsx" (all tables of the page
    stacked vertically, each preceded by one empty separator row; empty
    workbook when the page has no table) and
    "texts of page <n> of <name>.txt" (UTF-8).

    Raises:
        IndexError: if page_number is below 1 or beyond the last page.
    """
    # Without this guard 0 or a negative number would wrap around and
    # silently select a page counted from the end of the document.
    if page_number < 1:
        raise IndexError('page_number must be >= 1, got ' + str(page_number))

    os.makedirs(outpath, exist_ok=True)
    label = 'page ' + str(page_number) + ' of ' + str(name)

    # One row of empty values to separate possibly adjacent tables.
    separator = pd.DataFrame([np.nan])

    # Read from the `filepath` argument (the previous code opened the
    # module-level `path`). Everything that touches the page is done while
    # the document is still open.
    with pdfplumber.open(filepath) as pdf:
        page = pdf.pages[page_number - 1]
        frames = []
        for table in page.extract_tables():
            frames.append(separator)
            frames.append(pd.DataFrame(table))
        # NOTE(review): extract_text() appears to return None for a page
        # without text in some pdfplumber versions -- fall back to ''.
        text = page.extract_text() or ''

    # pd.concat rejects an empty list, hence the explicit fallback.
    df = pd.concat(frames) if frames else pd.DataFrame()
    df.to_excel(os.path.join(outpath, 'tables of ' + label + '.xlsx'))

    # Write into the `outpath` argument (previously the module-level
    # `out_path`), in one call instead of character by character.
    with open(os.path.join(outpath, 'texts of ' + label + '.txt'),
              'w', encoding='utf-8') as f:
        f.write(text)
def extract_pic(filepath, pic_dirpath, cutoff=0):
    """Extract the images embedded in a PDF and save each one as a PNG file.

    Args:
        filepath: Path of the PDF file to parse.
        pic_dirpath: Directory that receives the pictures; created if
            missing.
        cutoff: Images whose ``Pixmap.size`` is below this value are
            skipped (size filter for tiny images).

    Files are named "img_<k>.png", where k counts every image object found,
    including the ones dropped by ``cutoff`` (so the numbering may have
    gaps).
    """
    os.makedirs(pic_dirpath, exist_ok=True)

    # An image object is recognised by "/Subtype /Image" in its definition.
    # Compiled once, outside the loop over all objects.
    check_Image = re.compile(r"/Subtype(?= */Image)")

    img_count = 0
    # Context manager so the document is closed even if saving fails.
    with fitz.open(filepath) as pdf_info:
        xref_len = pdf_info.xref_length()
        print("文件名:{}, 页数: {}, 对象: {}".format(filepath, len(pdf_info), xref_len - 1))

        # xref numbers start at 1; 0 is reserved.
        for index in range(1, xref_len):
            text = pdf_info.xref_object(index)

            # Only image objects can be turned into a Pixmap. The previous
            # test also let through any "/Type /XObject" (e.g. form
            # XObjects), which fitz.Pixmap(doc, xref) rejects.
            if not check_Image.search(text):
                continue
            img_count += 1

            # Build the pixmap from the object's xref number.
            pix = fitz.Pixmap(pdf_info, index)
            if pix.size < cutoff:
                continue

            # PNG cannot store CMYK: convert to RGB first. Subtracting the
            # alpha channel also catches CMYK without alpha (n == 4), which
            # the former `pix.n >= 5` test missed.
            if pix.n - pix.alpha >= 4:
                pix = fitz.Pixmap(fitz.csRGB, pix)

            pic_filepath = os.path.join(pic_dirpath, 'img_' + str(img_count) + '.png')
            pix.save(pic_filepath)
# Demo run on the sample PDF configured above; every output goes to out_path.
# 1) text of the whole document -> txt
extract_words(filepath=path, outpath=out_path, name='Sample')
# 2) tables of the whole document -> xlsx
extract_tables(filepath=path, outpath=out_path, name='Sample')
# 3) tables and text of page 32 only
single_extract(filepath=path, outpath=out_path, page_number=32, name='Sample')
# 4) embedded images -> png (no size filter)
extract_pic(filepath=path, pic_dirpath=out_path)