pip install pypdf2
pip install pdfplumber==0.5.14
利用pdfplumber提取文字
import pdfplumber
# Open the PDF and print the text extracted from its first page.
with pdfplumber.open("Netease Q2 2019 Earnings Release-Final.pdf") as pdf:
    first_page = pdf.pages[0]  # index 0 is the first page
    print(first_page.extract_text())
利用pdfplumber提取表格
import pdfplumber
# Open the PDF and print the table extracted from its first page.
# extract_table() yields the table as a list of rows, each row a list of cells.
with pdfplumber.open("simple_1.pdf") as pdf:
    first_page = pdf.pages[0]  # index 0 is the first page
    print(first_page.extract_table())
利用pdfplumber提取多个简单的表格
import pdfplumber
# Print every table found on the first page, one table per print call.
with pdfplumber.open("simple_1.pdf") as pdf:
    table_page = pdf.pages[0]  # index 0 is the first page
    for table in table_page.extract_tables():
        print(table)
提取复杂表格时,需要设置一下.extract_table()方法里面的table_settings参数
import pdfplumber
# Extract the table on page index 9 (the tenth page), locating both the
# column and the row boundaries with the "text" strategy.
with pdfplumber.open("Netease Q2 2019 Earnings Release-Final.pdf") as pdf:
    table_page = pdf.pages[9]
    table = table_page.extract_table(
        table_settings={
            "vertical_strategy": "text",
            "horizontal_strategy": "text",
        }
    )

# The extracted table is a plain Python object, so it can be printed after
# the PDF has been closed.
print(table)
将获取的数据写到Excel中
import pdfplumber
from openpyxl import Workbook

# Step 1: extract the table on page index 9 (the tenth page) using the
# "text" strategy for both column and row boundaries.
with pdfplumber.open("Netease Q2 2019 Earnings Release-Final.pdf") as pdf:
    table_page = pdf.pages[9]
    table = table_page.extract_table(
        table_settings={
            "vertical_strategy": "text",
            "horizontal_strategy": "text",
        }
    )

# Step 2: write every extracted row, unchanged, to the active sheet of a new
# workbook and save it next to the script.
workbook = Workbook()
sheet = workbook.active
for row in table:
    sheet.append(row)
workbook.save(filename="Netease Q2 2019 Earnings Release-Final.xlsx")
去除空行
# Accumulator list; a later snippet in this file appends rebuilt rows to it.
new_table = []
# Walk the rows of the previously extracted `table`.
for row in table:
# A row is treated as blank when the str() of all its cells concatenates to
# the empty string. Note that str(None) is "None", so a None cell makes the
# row count as non-blank under this test.
# NOTE(review): the body of this `if` is not present in this snippet and the
# original indentation was lost -- confirm against the complete listing.
if not "".join([str(item) for item in row]) == "":
下面这行代码可以背下来,日后很有用:它把列表中的每个元素都转换成字符串,并拼接成一个完整的字符串
"".join([str(item) for item in row])
运行去除空行的代码,然后重新把数据写入
# Write the extracted rows to a fresh workbook, skipping blank rows.
workbook = Workbook()
sheet = workbook.active
for row in table:
    # Keep a row only if at least one cell holds text. Testing each cell
    # against None and "" (instead of joining str(cell)) matters because
    # str(None) is "None", which would make an all-None row look non-blank.
    if any(cell not in (None, "") for cell in row):
        sheet.append(row)
workbook.save(filename="Netease Q2 2019 Earnings Release-Final.xlsx")
以下代码将每一行前三列中非空(非None)的内容合并为一个字符串,再与该行其余各列一起组成新的一行,追加到new_table列表里面
# Concatenate the first three cells of `row` into one string; falsy cells
# (None or "") contribute nothing to the result.
merged_head = "".join([str(item) if item else "" for item in row[:3]])
# The rebuilt row is the merged cell followed by the untouched remaining cells.
new_row = [merged_head, *row[3:]]
new_table.append(new_row)
全部代码如下:
import pdfplumber
from openpyxl import Workbook
# Step 1: extract the table on page index 9 (the tenth page) using the
# "text" strategy for both column and row boundaries.
with pdfplumber.open("Netease Q2 2019 Earnings Release-Final.pdf") as pdf:
    table_page = pdf.pages[9]
    table = table_page.extract_table(
        table_settings={
            "vertical_strategy": "text",
            "horizontal_strategy": "text",
        }
    )

# Step 2: write the cleaned rows to a new workbook.
workbook = Workbook()
sheet = workbook.active
for row in table:
    # Skip blank rows. Cells are compared against None and "" directly
    # because str(None) is "None", which would hide an all-None row from a
    # test based on joining str(cell).
    if not any(cell not in (None, "") for cell in row):
        continue
    # Merge the first three cells into one string (falsy cells contribute
    # nothing) and keep the remaining cells as they are.
    merged_head = "".join([str(item) if item else "" for item in row[:3]])
    new_row = [merged_head]
    new_row += row[3:]
    sheet.append(new_row)
workbook.save(filename="Netease Q2 2019 Earnings Release-Final.xlsx")
综合应用
import pdfplumber
from openpyxl import Workbook
# Combined example: extract the table on page index 13 (the fourteenth page),
# clean it up with fix-ups written for that specific page, and save the
# result to an Excel file.
workbook = Workbook()
sheet = workbook.active

with pdfplumber.open("Netease Q2 2019 Earnings Release-Final.pdf") as pdf:
    table_page = pdf.pages[13]
    table = table_page.extract_table(
        table_settings={
            "vertical_strategy": "text",
            "horizontal_strategy": "text",
        }
    )

# Discard the first six extracted rows and the last one.
del table[:6]
del table[-1]

new_table = []
for row in table:
    # A row with a cell that is exactly 'and' gets its first three cells
    # joined with spaces into a single leading cell.
    if 'and' in row:
        row = [" ".join(str(x) for x in row[:3])] + row[3:]
    # A row with a cell that is exactly 'argi' is reduced to one cell made by
    # concatenating its first three cells; the remaining cells are dropped.
    if 'argi' in row:
        row = ["".join(str(x) for x in row[:3])]
    # Drop None/empty cells and strip "(" from the ones that remain.
    new_row = [x.replace("(", "") for x in row if x is not None and x != ""]
    if new_row:
        # Strip ")" as well; cells that became empty after the "(" removal
        # are filtered out here.
        new_row = [x.replace(")", "") for x in new_row if x != ""]
        new_table.append(new_row)

# The fix-ups below address rows by hard-coded index.
# NOTE(review): they presumably match the layout of this particular page --
# confirm before reusing them with another document.

# Row 1: keep the first word of cell 0 and glue its second word onto cell 1.
first_cell_parts = new_table[1][0].split(" ")
new_table[1] = (
    [first_cell_parts[0]]
    + ["".join((str(first_cell_parts[1]), new_table[1][1]))]
    + new_table[1][2:]
)
# Row 5: merge cells 1 and 2 into a single cell.
new_table[5] = (
    [new_table[5][0]]
    + ["".join((str(new_table[5][1]), new_table[5][2]))]
    + new_table[5][3:]
)

# Rows 1-5: split cell 1 on the matching separator, keep the first piece in
# place and insert the second piece (spaces removed) as a new cell 2.
# NOTE(review): all five separators are a single space as written; the
# original widths may have been collapsed when the formatting was lost --
# confirm.
space_ls = [' ',' ',' ',' ',' ']
for i, separator in enumerate(space_ls):
    target = new_table[i + 1]
    pieces = target[1].split(separator)
    target[1] = pieces[0]
    target.insert(2, pieces[1].replace(" ", ""))

# Rows 13-17: remove spaces inside cells 1 and 2.
for i in range(13, 18):
    new_table[i][1:3] = [x.replace(" ", "") for x in new_table[i][1:3]]

# Last four rows: duplicate the final cell by inserting a copy just before it.
for row in new_table[-4:]:
    row.insert(-1, row[-1])

for row in new_table:
    sheet.append(row)
workbook.save(filename="综合应用.xlsx")
print('finnished!')