该部分利用 selenium 进行 PDF 下载。
二、对指定字段进行筛选写入csv中
引入库
import pdfplumber
import os
import csv
代码如下
import pdfplumber
import os
import csv
def _scan_report(pdf, file_name, path, table_keyword, inside_keyword, outside_keyword):
    """Scan the second half of one open PDF and return its hits.

    Returns a list of ``(token, value, page_index)`` tuples, at most one per
    page. Each hit is also written as a row through the module-level
    ``parase_out_writer``.
    """
    hits = []
    page_count = len(pdf.pages)
    previous_page_matched = False
    # Only the second half of the document is scanned, as in the original code.
    for page_index in range(page_count // 2, page_count):
        # extract_text() may return None for a page without a text layer.
        text = pdf.pages[page_index].extract_text() or ''
        # A page qualifies when it contains every table keyword (vacuously true
        # for an empty list) ...
        page_matched = all(keyword in text for keyword in table_keyword)
        # ... or when the page before it qualified (tables can span two pages).
        scan_this_page = page_matched or previous_page_matched
        previous_page_matched = page_matched
        if not scan_this_page:
            continue

        tokens = text.split()
        # Pair every token with its successor; the last token has no value after it.
        for token, raw_value in zip(tokens, tokens[1:]):
            if inside_keyword and not any(keyword in token for keyword in inside_keyword):
                continue
            print('run here')
            if any(keyword in token for keyword in outside_keyword):
                continue
            try:
                value = float(raw_value.replace(',', ''))
            except ValueError:
                # The following token is not a number; keep looking on this page.
                continue

            hits.append((token, value, page_index))
            try:
                parase_out_writer.writerows(
                    [[file_name, token, str(value), raw_value, str(page_index)]]
                )
                parase_out.flush()
            except Exception as exc:
                # Writing is best-effort: report the failure but keep the hit.
                print('could not write result row: {}'.format(exc))
            print("*****find******{} value is {} and {}".format(token, raw_value, value))
            print("*************find in page {}*******************".format(page_index))
            print("*************find in {}*******************".format(path))
            break  # only one result per page
    return hits


def parase_pdf(table_keyword, inside_keyword, outside_keyword):
    """Extract the number that follows a keyword from every queued PDF.

    Drains the module-level ``file_names`` list. Every entry ending in
    ``.pdf``/``.PDF`` is opened from ``OUT_DIR`` with pdfplumber and scanned;
    other entries are skipped. Result rows are written through the
    module-level ``parase_out_writer`` / ``parase_out``.

    Args:
        table_keyword: strings that must all occur in a page's text for the
            page to be scanned (the page after such a page is scanned too).
            An empty list makes every page qualify.
        inside_keyword: a token must contain at least one of these strings.
            An empty list accepts every token.
        outside_keyword: a token must contain none of these strings.

    Returns:
        ``(name_find, value_find, page_find)``: three parallel lists with the
        matched token, the parsed float and the 0-based page index of every
        hit, accumulated over all processed files.
    """
    name_find = []
    value_find = []
    page_find = []
    # Stop once the queue is empty instead of spinning forever.
    while file_names:
        print('--------{}---------'.format(len(file_names)))
        file_name = file_names.pop(0)
        if not file_name.endswith(('.PDF', '.pdf')):
            continue
        path = os.path.join(OUT_DIR, file_name)
        print('get pdf address')
        try:
            pdf = pdfplumber.open(path, password='')
        except Exception as exc:
            # An unreadable file must not abort the batch or reuse a stale handle.
            print("*************open pdf error*******************")
            print('skip {}: {}'.format(path, exc))
            continue
        print("*************open pdf*******************")
        try:
            hits = _scan_report(
                pdf, file_name, path, table_keyword, inside_keyword, outside_keyword
            )
        finally:
            pdf.close()
        for name, value, page_index in hits:
            name_find.append(name)
            value_find.append(value)
            page_find.append(page_index)
        print('****time to processing PDF file is ')
    # The return sits outside the loop so that every queued file is processed.
    return name_find, value_find, page_find
# Driver for the revenue extraction: queue every file found in OUT_DIR and
# write the hits to parase_out_file2.csv inside the same directory.
OUT_DIR = r'公告pdf'
table_keyword = ['利润表']
inside_keyword = ['营业收入']
outside_keyword = ['收到']
# other candidate keywords: '营业利润', '资产负债率'
file_names = os.listdir(OUT_DIR)
parase_out_file_path = os.path.join(OUT_DIR, 'parase_out_file2.csv')
# The with-block guarantees the CSV is flushed and closed even if parsing fails.
with open(parase_out_file_path, 'w', newline='', encoding='utf-8') as parase_out:
    parase_out_writer = csv.writer(parase_out)
    parase_pdf(table_keyword, inside_keyword, outside_keyword)
import pdfplumber
import os
import csv
# Keyword whose following token is extracted (a percentage in the reports).
inside_keyword = '资产负债率'


def parase_pdf(inside_keyword):
    """Write the token that follows *inside_keyword* for every queued PDF.

    Drains the module-level ``file_names`` list. Every entry ending in
    ``.pdf``/``.PDF`` is opened from ``OUT_DIR`` and all of its pages are
    scanned. For each whitespace-separated token that contains
    *inside_keyword* and is shorter than 7 characters, one row
    ``[file_name, token, next_token]`` is written through the module-level
    ``parase_out_writer`` / ``parase_out``.

    Args:
        inside_keyword: substring to look for in the page tokens.

    Returns:
        None. Results are only written to the CSV.
    """
    # Stop once the queue is empty instead of spinning forever.
    while file_names:
        print('--------{}---------'.format(len(file_names)))
        file_name = file_names.pop(0)
        if not file_name.endswith(('.PDF', '.pdf')):
            continue
        path = os.path.join(OUT_DIR, file_name)
        print('get pdf address')
        try:
            pdf = pdfplumber.open(path, password='')
        except Exception as exc:
            # An unreadable file must not abort the batch or reuse a stale handle.
            print("*************open pdf error*******************")
            print('skip {}: {}'.format(path, exc))
            continue
        print("*************open pdf*******************")
        try:
            for page in pdf.pages:
                # Extract once per page; None means the page has no text layer.
                text = page.extract_text() or ''
                if inside_keyword not in text:
                    continue
                tokens = text.split()
                for position, token in enumerate(tokens):
                    if inside_keyword not in token:
                        continue
                    print(f'提取{inside_keyword}中')
                    # Long tokens are skipped by the length filter; the last
                    # token of a page has no value after it.
                    if len(token) >= 7 or position + 1 >= len(tokens):
                        continue
                    next_token = tokens[position + 1]
                    print(token, next_token)
                    try:
                        parase_out_writer.writerows([[file_name, token, next_token]])
                        parase_out.flush()
                    except Exception as exc:
                        # Writing is best-effort: report the failure and go on.
                        print('could not write result row: {}'.format(exc))
        finally:
            pdf.close()
# Driver for the ratio extraction: queue every file found in OUT_DIR and
# write the hits to parase_out_file5.csv inside the same directory.
OUT_DIR = r'公告pdf'
file_names = os.listdir(OUT_DIR)
parase_out_file_path = os.path.join(OUT_DIR, 'parase_out_file5.csv')
# The with-block guarantees the CSV is flushed and closed even if parsing fails.
with open(parase_out_file_path, 'w', newline='', encoding='utf-8') as parase_out:
    parase_out_writer = csv.writer(parase_out)
    parase_pdf(inside_keyword)
总结
提示:这里对pdf内容提取进行总结:
由于博主要提取的资产负债率是百分比,与营业收入、营业利润的格式有区别,所以将二者分开提取。如果需要使用这些代码,请将路径改为自己的文件路径。
三、对csv里的数据进行可视化分析
(1)
我们从 PDF 里面提取的数据仍然是不规整的,这时候怎么办呢?
这就需要对它进行数据清洗和数据规整了。
直接上代码
第一步
import pandas as pd
import re
# Add a header: the extraction step writes rows without one. The rows are
# [file name, matched token, parsed value, raw value, page index]; the names
# 'name' and 'number1' are the ones the later steps read.
df = pd.read_csv(
    r'C:\Users\13252\PycharmProjects\pythonProject\巨潮资讯\公告pdf\parase_out_file2.csv',
    header=None,
    names=['name', 'earning', 'number1', 'number2', 'page'],
)
第二步
# Clean-up pass over the extracted CSV files: add a year column, sort by year
# and drop duplicates. Each file is rewritten in place.
CSV_DIR = r'C:\Users\13252\PycharmProjects\pythonProject\巨潮资讯\公告pdf'
FILE2_CSV = CSV_DIR + '\\parase_out_file2.csv'
FILE3_CSV = CSV_DIR + '\\parase_out_file3.csv'
FILE4_CSV = CSV_DIR + '\\parase_out_file4.csv'

# New year column: the first 4-digit run in the name. Names without such a
# run get NaN instead of raising IndexError, and sort to the end.
df['year'] = df['name'].astype(str).str.findall(r'\d{4}').str[0]
df.sort_values("year", ascending=True, inplace=True)  # sort by year
df.to_csv(FILE2_CSV, index=False)
df

# parase_out_file3.csv is expected to already contain a 'year' column.
df1 = pd.read_csv(FILE3_CSV)
df1.sort_values("year", ascending=True, inplace=True)  # sort by year
df1.to_csv(FILE3_CSV, index=False)
df1

# NOTE(review): assumes parase_out_file4.csv has 'name' and 'data' columns;
# its header is not produced by the code shown here, so confirm.
df3 = pd.read_csv(FILE4_CSV)
# Here the last 4-digit run in the name is used as the year.
df3['year'] = df3['name'].astype(str).str.findall(r'\d{4}').str[-1]
df3.sort_values("year", ascending=True, inplace=True)  # sort by year
# De-duplicate on the value column; the other tables follow the same approach.
df3.drop_duplicates('data', keep='first', inplace=True)
df3.to_csv(FILE4_CSV, index=False)
df3
这些数据虽然不多,但是有点乱,需要排序、去重,还需要添加行和列。
(2)
第二步我们需要开始进行图表的绘制
引入我们所需要的库
import matplotlib
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import MultipleLocator
因为每个数据都是一个原理所以博主就不一一举例了
折线图的绘制
# Line chart of the 'number1' column against 'year' from the cleaned CSV.
df = pd.read_csv(r'C:\Users\13252\PycharmProjects\pythonProject\巨潮资讯\公告pdf\parase_out_file2.csv')
plt.figure(figsize=(16,6))
ax = plt.gca()
x_major_locator = MultipleLocator(1)  # tick spacing of 1 on the x axis
ax.xaxis.set_major_locator(x_major_locator)
plt.plot(df['year'], df['number1'], color='#A0522D', marker='o', label="营业收入", linewidth=2, linestyle="--")
# The series label is only drawn when a legend is requested.
# NOTE: the label is Chinese, so a CJK-capable font must be configured for it to render.
plt.legend()
plt.show()
散点图的绘制
引入我们需要的库
import matplotlib
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.pyplot import MultipleLocator
# Scatter chart of the 'number1' column against 'year' from the cleaned CSV.
df = pd.read_csv(r'C:\Users\13252\PycharmProjects\pythonProject\巨潮资讯\公告pdf\parase_out_file2.csv')
plt.figure(figsize=(16,6))
x = df['year']
y = df['number1']
ax = plt.gca()
x_major_locator = MultipleLocator(1)  # tick spacing of 1 on the x axis
ax.xaxis.set_major_locator(x_major_locator)
plt.scatter(x, y, alpha=0.5, marker='*', c='r', label="五角")
# The series label is only drawn when a legend is requested.
# NOTE: the label is Chinese, so a CJK-capable font must be configured for it to render.
plt.legend()
plt.show()
柱状图的绘制
引入库
import pandas as pd
import matplotlib.pyplot as plt
from numpy import arange
from matplotlib.font_manager import FontProperties
# Bar chart of the 'number1' column against 'year' from the cleaned CSV.
plt.style.use('fivethirtyeight')
# font_set was used below without ever being defined. SimHei is an assumption
# (a Chinese font commonly present on Windows); matplotlib falls back to its
# default font with a warning if it is not installed. Adjust as needed.
font_set = FontProperties(family='SimHei')
df = pd.read_csv(r'C:\Users\13252\PycharmProjects\pythonProject\巨潮资讯\公告pdf\parase_out_file2.csv')
date = df['year']
data = df['number1']
plt.bar(date, data, width=0.5, alpha=0.5)
plt.title("主营业务收入和营业收入", fontproperties=font_set)
plt.show()
总结
每种图表我只给出了用一个 csv 数据绘制的示例,其余的照葫芦画瓢即可。
萌新写代码,可能有的地方没标注清楚或者没写出来,希望各位大神多多指出。
有问题可以私聊我,欢迎大家多多指正。