13. Python 利用 pdfplumber 库提取 PDF 文字以及表格内容

pip install pypdf2
pip install pdfplumber==0.5.14
利用pdfplumber提取文字

import pdfplumber

# Open the earnings-release PDF and print the text of its first page.
with pdfplumber.open("Netease Q2 2019 Earnings Release-Final.pdf") as document:
    # pages is indexed from 0, so index 0 selects the first page.
    page_text = document.pages[0].extract_text()
    print(page_text)

利用pdfplumber提取表格

import pdfplumber

# Print the result of extract_table() for the first page of simple_1.pdf.
with pdfplumber.open("simple_1.pdf") as document:
    page = document.pages[0]
    extracted = page.extract_table()
    print(extracted)

利用pdfplumber提取多个简单的表格

import pdfplumber

# Print every table that extract_tables() returns for the first page.
with pdfplumber.open("simple_1.pdf") as pdf:
    table_page = pdf.pages[0]
    for table in table_page.extract_tables():
        # Indented with spaces only; the original mixed a tab into this line.
        print(table)

对于结构较复杂的表格,需要设置 .extract_table() 方法的 table_settings 参数(这里把纵向和横向的识别策略都设为 "text")

import pdfplumber

# Both the vertical and the horizontal strategy are set to "text".
text_strategy_settings = {
    "vertical_strategy": "text",
    "horizontal_strategy": "text",
}

with pdfplumber.open("Netease Q2 2019 Earnings Release-Final.pdf") as document:
    # Index 9 selects the tenth page (pages are indexed from 0).
    table = document.pages[9].extract_table(
        table_settings=text_strategy_settings
    )
    print(table)

将获取的数据写到Excel中

import pdfplumber
from openpyxl import Workbook

# Extract the table from the tenth page using the "text" strategies.
with pdfplumber.open("Netease Q2 2019 Earnings Release-Final.pdf") as pdf:
    table_page = pdf.pages[9]
    table = table_page.extract_table(
        table_settings={
            "vertical_strategy": "text",
            "horizontal_strategy": "text",
        }
    )

# Fail with a clear message instead of a TypeError in the loop below.
# NOTE(review): recent pdfplumber releases return None when no table is
# detected; confirm the behaviour of the pinned version.
if table is None:
    raise ValueError("no table was found on the selected page")

# Write each extracted row to the active worksheet, then save the workbook.
workbook = Workbook()
sheet = workbook.active
for row in table:
    sheet.append(row)
workbook.save(filename="Netease Q2 2019 Earnings Release-Final.xlsx")

去除空行

# Collect only the rows that contain some text.
new_table = []
for row in table:
    # None cells are left out of the join: str(None) is "None", which would
    # otherwise make a row made only of None cells look non-empty.
    if "".join([str(item) for item in row if item is not None]) != "":
        new_table.append(row)

下面这个代码可以背下来,日后很有用的,完成的功能就是将列表中的每个元素转换成字符串,再拼接成一个完整的字符串

"".join([str(item) for item in row])

运行去除空行的代码,然后重新把数据写入

# Write the table again, this time skipping rows that contain no text.
workbook = Workbook()
sheet = workbook.active
for row in table:
    # None cells are left out of the join: str(None) is "None", which would
    # otherwise make a row made only of None cells look non-empty.
    if "".join([str(item) for item in row if item is not None]) != "":
        sheet.append(row)
workbook.save(filename="Netease Q2 2019 Earnings Release-Final.xlsx")

以下代码实现的功能是:将一行中前三列(row[:3])里非空的内容合并为一个字符串,作为新行的第一个元素,再接上该行其余各列,最后把新行添加到新的列表里面

# Concatenate the first three cells of the row into one string; falsy cells
# (None or "") contribute nothing.
merged_head = "".join([str(cell) if cell else "" for cell in row[:3]])
# The merged string becomes the first cell, followed by the remaining cells.
new_row = [merged_head]
new_row.extend(row[3:])
new_table.append(new_row)

全部代码如下:

import pdfplumber
from openpyxl import Workbook

# Extract the table from the tenth page using the "text" strategies.
with pdfplumber.open("Netease Q2 2019 Earnings Release-Final.pdf") as pdf:
    table_page = pdf.pages[9]
    table = table_page.extract_table(
        table_settings={
            "vertical_strategy": "text",
            "horizontal_strategy": "text",
        }
    )

# Fail with a clear message instead of a TypeError in the loop below.
# NOTE(review): recent pdfplumber releases return None when no table is
# detected; confirm the behaviour of the pinned version.
if table is None:
    raise ValueError("no table was found on the selected page")

workbook = Workbook()
sheet = workbook.active
for row in table:
    # Skip rows without text. None cells are left out of the join because
    # str(None) is "None", which would make an empty row look non-empty.
    if "".join([str(item) for item in row if item is not None]) != "":
        # Merge the first three cells into one string (falsy cells add
        # nothing) and keep the remaining cells unchanged.
        new_row = ["".join([str(item) if item else "" for item in row[:3]])]
        new_row += row[3:]
        sheet.append(new_row)

workbook.save(filename="Netease Q2 2019 Earnings Release-Final.xlsx")

综合应用


import pdfplumber
from openpyxl import Workbook

workbook = Workbook()
sheet = workbook.active


# Extract the table from the page at index 13 using the "text" strategies.
with pdfplumber.open("Netease Q2 2019 Earnings Release-Final.pdf") as pdf:
    table_page = pdf.pages[13]
    table = table_page.extract_table(
        table_settings={
            "vertical_strategy": "text",
            "horizontal_strategy": "text",
        }
    )

# Fail with a clear message instead of a TypeError in the deletions below.
# NOTE(review): recent pdfplumber releases return None when no table is
# detected; confirm the behaviour of the pinned version.
if table is None:
    raise ValueError("no table was found on the selected page")


# Discard the first six extracted rows and the last one.
del table[:6]
del table[-1]


new_table = []
for row in table:
    # A row holding a cell equal to 'and': join its first three cells with
    # spaces and keep the remaining cells.
    if 'and' in row:
        row = [" ".join(str(x) for x in row[:3])] + row[3:]
    # A row holding a cell equal to 'argi': collapse it to the concatenation
    # of its first three cells only.
    if 'argi' in row:
        row = ["".join(str(x) for x in row[:3])]

    # Drop None/empty cells and strip "(" from the rest.
    new_row = [x.replace("(", "") for x in row if x is not None and x != ""]
    if new_row:
        # Drop cells that became empty, then strip ")" from what is left.
        new_row = [x.replace(")", "") for x in new_row if x != ""]
        new_table.append(new_row)

# Hand-written fix-ups for specific rows of this particular page.
# Row 1: split its first cell on a run of seven spaces; the second piece is
# prepended to the row's second cell.
first_cell_parts = new_table[1][0].split("       ")
new_table[1] = (
    [first_cell_parts[0]]
    + ["".join((str(first_cell_parts[1]), new_table[1][1]))]
    + new_table[1][2:]
)
# Row 5: merge its second and third cells into one.
new_table[5] = (
    [new_table[5][0]]
    + ["".join((str(new_table[5][1]), new_table[5][2]))]
    + new_table[5][3:]
)


# Rows 1-5: split the second cell on a row-specific run of spaces; the first
# piece stays in place and the second piece (spaces removed) is inserted
# right after it.
space_ls = ['        ','          ','             ','          ','        ']
for row_index, separator in enumerate(space_ls, start=1):
    ls_loc_1_2 = new_table[row_index][1].split(separator)
    new_table[row_index][1] = ls_loc_1_2[0]
    new_table[row_index].insert(2, ls_loc_1_2[1].replace(" ", ""))


# Rows 13-17: remove spaces inside the second and third cells.
for i in range(13, 18):
    new_table[i][1:3] = [x.replace(" ", "") for x in new_table[i][1:3]]

# Last four rows: insert a copy of the final cell just before it.
for row in new_table[-4:]:
    row.insert(-1, row[-1])

for row in new_table:
    sheet.append(row)


workbook.save(filename="综合应用.xlsx")
print('finnished!')


你可能感兴趣的:(Python职场实用技能)