Scraping SSE and SZSE annual-report inquiry letters into Excel

Notes

You need to install a few packages, such as pdfminer, pdfminer3k and pdfplumber;
pdfminer cannot parse the SSE inquiry letters; pdfplumber, whose parsing is more capable, can, although individual characters in the extracted text may occasionally be duplicated;
pdfminer3k and pdfplumber can conflict and stop the scripts from running. Parsing the SSE letters uses pdfplumber; if that script fails, check whether pdfminer is installed (a quick way to see what is installed is sketched after this list), or try uninstalling pdfminer3k and reinstalling pdfplumber;
Parsing the SZSE letters uses pdfminer3k; if that script still fails after installing it, try uninstalling pdfplumber and reinstalling pdfminer3k;
To scrape all inquiry letters, change the loop bounds in the code to match the actual number of listing pages.
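
A minimal diagnostic sketch (requires Python 3.8+) for the package-conflict issue above: list which of the PDF-related distributions are actually installed in the current environment. Note that pdfplumber itself depends on pdfminer.six, which is yet another distribution that imports as pdfminer.

from importlib.metadata import version, PackageNotFoundError

# Probe the distributions mentioned in the notes; pdfminer3k and pdfminer.six both
# install a package that imports as `pdfminer`, which is the usual source of conflicts.
for dist in ("pdfplumber", "pdfminer.six", "pdfminer3k", "pdfminer"):
    try:
        print(dist, version(dist))
    except PackageNotFoundError:
        print(dist, "not installed")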

Shanghai Stock Exchange (SSE)

Step 1: scrape the links to the SSE inquiry letters

# -*- coding: utf-8 -*-
# @Time    : 2020/8/6 18:16
# @Author  : 马拉小龙虾
# @FileName: 上交所一条龙.py
# @Software: PyCharm Community Edition
# @Blog    :https://blog.csdn.net/weixin_43636302



import requests
import csv
import re

def downlourl(currentpage):
    # build the query URL for one page of the SSE inquiry-letter listing
    url = "http://query.sse.com.cn/commonSoaQuery.do?siteId=28&sqlId=BS_GGLL&extGGLX=&stockcode=&channelId=10743%2C10744%2C10012&extGGDL=&order=createTime%7Cdesc%2Cstockcode%7Casc&isPagination=true&pageHelp.pageSize=15&pageHelp.pageNo=" + repr(currentpage) + "&pageHelp.beginPage=" + repr(currentpage) +"&pageHelp.cacheSize=1"
    return url

headers = {
    'Referer':'http://www.sse.com.cn/disclosure/credibility/supervision/inquiries/',
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}

with open('sh.csv',"w",newline='') as f:
    writer = csv.writer(f, delimiter=',')
    title=['时间2','标题','公司代码','函件类别','公司简称','函件类型','时间1','网址','函件编码']
    writer.writerow(title)
    for page in range(1,101):   # adjust to the actual page count (see the notes above and the sketch after this block)
        r = requests.get(downlourl(page), headers=headers)
        for i in r.json()['result']:
            # extract the numeric document id from the PDF URL, e.g. .../c/123456.pdf
            result = re.search(r'c/(\d+)\.pdf', i['docURL'])
            print(result.group(1))
            writer.writerow([i['cmsOpDate'],i['docTitle'],i['stockcode'],i['extWTFL'],i['extGSJC'],i['docType'],i['createTime'],i['docURL'],result.group(1)])
        print('完成爬取第%d页'%page)
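
The loop above is hard-coded to 100 pages. A small sketch of how you might derive the page count from the first response instead: the pageHelp.pageCount key is an assumption inferred from the pageHelp.* query parameters in the URL, so check it against a real response before relying on it.

# Sketch (assumed field name): read the total page count from the first response
first = requests.get(downlourl(1), headers=headers).json()
page_count = int(first.get('pageHelp', {}).get('pageCount', 100))  # fall back to 100 pages
for page in range(1, page_count + 1):
    ...  # same body as the loop above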

Step 2: download the inquiry letters, parse them, and save each one to its own txt file

import pandas as pd
import time
from urllib.request import urlopen
from urllib.request import Request
from urllib.request import quote
import requests
import pdfplumber
import re

data = pd.read_csv("sh.csv",encoding='GBK')   # sh.csv was written with the system default encoding (GBK on a Chinese-locale Windows)


函件编码 = data.loc[:,'函件编码']
网址 = data.loc[:,'网址']
函件类型 = data.loc[:,'函件类型']

headers = {'content-type': 'application/json',
           'Accept-Encoding': 'gzip, deflate',
           'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0'}

baseurl = "http://reportdocs.static.szse.cn/UpFiles/fxklwxhj/"

def parse(docucode):

    _path = "http://" + quote(docucode)
    print(_path)

    # download the PDF to the local folder
    resource = requests.get(_path, stream=True)
    with open('E:\\py_shiyan\\Pycharm备份\\询证函\\上交所/' + re.search(r'c/(\d+\.pdf)', docucode).group(1), 'wb') as fd:
        for y in resource.iter_content(102400):
            fd.write(y)
        print(re.search(r'c/(\d+\.pdf)', docucode).group(1), '完成下载')
    # re-open the downloaded file and extract its text page by page
    path = 'E:\\py_shiyan\\Pycharm备份\\询证函\\上交所/' + re.search(r'c/(\d+\.pdf)', docucode).group(1)
    pdf = pdfplumber.open(path)
    f = open("E:\\py_shiyan\\Pycharm备份\\询证函\\上交所/" + re.search(r'c/(\d+)\.pdf', docucode).group(1) + '.txt', 'w')
    f2 = open("E:\\py_shiyan\\Pycharm备份\\询证函\\上交所/" + '异常记录.txt', 'a')
    n = 0
    for page in pdf.pages:
        n += 1
        print(page.extract_text())
        try:
            f.writelines(page.extract_text())
        except:
            print(f'========={docucode}第{n}页解码异常==========')
            f2.write(f'\n ========={docucode}第{n}页解码异常==========')
    f.close()
    f2.close()
    pdf.close()

for i in range(len(网址)):
    函件名称 = re.search(r'c/(\d+\.pdf)', 网址[i]).group(1)
    print(函件名称)
    开始爬取时间 = "这是第%d个公告"%i
    print(开始爬取时间)
    print(time.strftime('%Y.%m.%d.%H:%M:%S',time.localtime(time.time())))
    try:
        if 函件类型[i]=="pdf":
            parse(网址[i])
            print(函件名称 + "爬取成功")
        else:
            with open("E:\py_shiyan\Pycharm备份\询证函\上交所\%s"%函件名称,'wb') as f:
                _path = baseurl + quote(函件名称) +"?random=0.3006649122149502"
                request = requests.get(url=_path, headers=headers)  # download the non-PDF attachment directly
                f.write(request.content)
        结束爬取时间 = time.strftime('%Y.%m.%d.%H:%M:%S', time.localtime(time.time()))
        print(结束爬取时间)
        print("第%d个公告爬取完成" % i)
    except:
        time.sleep(5)
        try:
            if 函件类型[i]=="pdf":
                parse(网址[i])
                print(函件名称 + "爬取成功")
            else:
                with open("E:\py_shiyan\Pycharm备份\询证函\上交所\%s"%函件名称,'wb') as f:
                    _path = baseurl + quote(函件名称) +"?random=0.3006649122149502"
                    request = requests.get(url=_path, headers=headers)  # download the non-PDF attachment directly
                    f.write(request.content)
            结束爬取时间 = time.strftime('%Y.%m.%d.%H:%M:%S', time.localtime(time.time()))
            print(结束爬取时间)
            print("第%d个公告爬取完成" % i)
        except:
            with open("E:\py_shiyan\Pycharm备份\询证函\上交所/" + '异常记录.txt', 'a') as f2:
                f2.write(f'\n ========={函件名称}爬取时网络异常==========')

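The try/except above repeats the entire download body just to retry once after a network error. A small helper, sketched here (not part of the original script), expresses the same retry-once-after-5-seconds behaviour without the duplication:

import time

def with_retry(func, *args, retries=1, wait=5):
    """Call func(*args); on failure, sleep `wait` seconds and retry up to `retries` more times."""
    for attempt in range(retries + 1):
        try:
            return func(*args)
        except Exception as exc:
            if attempt == retries:
                raise
            print(f'attempt {attempt + 1} failed ({exc}), retrying in {wait}s')
            time.sleep(wait)

# e.g. with_retry(parse, 网址[i]) inside the loop above
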
Step 3: collect the contents of all the individual txt files into one Excel workbook

import os
import docx2txt
from openpyxl import Workbook

content_list = []

wb = Workbook()
sheet = wb.active
sheet['A1'].value = '公告编码'
sheet['B1'].value = '公告内容'

def readdocx(filepath):
    content = docx2txt.process(filepath)  # extract the text of the .docx at this path
    docucode = filepath.split('/')[-1]
    content_list.append([docucode.split('.')[0],content])

def readtxt(filepath):
    content = open(filepath, "r").read()
    docucode = filepath.split('/')[-1]
    content_list.append([docucode.split('.')[0],content])

def eachFile(filepath):
    pathDir = os.listdir(filepath)  # list the entries under this directory
    for s in pathDir:
        newDir = os.path.join(filepath, s)  # join the entry name onto the current path
        if os.path.isfile(newDir):          # it is a file
            doctype = os.path.splitext(newDir)[1]
            if doctype == ".txt":           # read txt files
                readtxt(newDir)
            elif doctype == ".docx":        # read docx files
                readdocx(newDir)
            else:
                pass
        else:
            eachFile(newDir)  # it is a directory: recurse into it

filepath = "E:\py_shiyan\Pycharm备份\询证函\上交所/"  # keep the trailing slash: file names are later recovered by splitting on '/'
eachFile(filepath)
a = 2  # start below the header row
for doc in content_list:
    sheet['A%d'%a].value = doc[0]
    print(doc[0])
    sheet['B%d'%a].value = doc[1]
    a += 1
wb.save('上交所.xlsx')
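
One practical caveat when dumping long PDF text into xlsx: Excel caps a single cell at 32,767 characters, and openpyxl rejects some control characters that occasionally survive PDF extraction. A small guard, added here as a sketch (ILLEGAL_CHARACTERS_RE ships with recent openpyxl versions), keeps the workbook valid:

from openpyxl.cell.cell import ILLEGAL_CHARACTERS_RE

def safe_cell(text, limit=32767):
    # strip characters the xlsx format forbids, then stay within Excel's per-cell limit
    return ILLEGAL_CHARACTERS_RE.sub('', text)[:limit]

# usage in the loop above:  sheet['B%d' % a].value = safe_cell(doc[1])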

Shenzhen Stock Exchange (SZSE) inquiry letters

Step 1: download the SZSE inquiry letter metadata.

import requests
import csv
import re

def downlourl(currentpage):
    # url = "http://www.szse.cn/api/report/ShowReport/data?SHOWTYPE=JSON&CATALOGID=main_wxhj&loading=first&random=0.8973760391132892"
    url="http://www.szse.cn/api/report/ShowReport/data?SHOWTYPE=JSON&CATALOGID=main_wxhj&TABKEY=tab1&PAGENO="+ str(currentpage)+"&random=0.7562589043142469"
    return(url)

headers = {
    'Referer':'http://www.szse.cn/disclosure/supervision/inquire/index.html',
    'User-Agent':'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
}

with open('sz.csv',"w",newline='') as f:
    title=['公司代码','公司简称','发函日期','函件类别','函件编码','公司回复','函件类型']
    writer = csv.writer(f, delimiter=',')
    writer.writerow(title)
    for page in range(1,2):   # only the first page; raise the upper bound to the actual page count (see the notes above)
        r = requests.get(downlourl(page), headers=headers)
        for i in r.json()[0]['data']:
            # 'ck' holds an HTML link whose encode-open attribute points at the letter file
            ck = re.search("encode-open=(.*?)>", i['ck'])
            hfck = re.search(">(.*?)<", i['hfck'])
            lx = (ck.group(1)).split(".")[1][:-1]   # file extension, e.g. pdf or doc
            if hfck:
                hfck = hfck.group(1)
            try:
                writer.writerow([i['gsdm'],i['gsjc'],i['fhrq'],i['hjlb'],(ck.group(1))[19:-5],hfck,lx])
            except:
                with open("E:\py_shiyan\Pycharm备份\询证函\深交所/" + '异常记录.txt', 'a') as f2:
                    f2.write(f'\n ========={(ck.group(1))[19:-5]}解码异常==========')
        print('完成爬取第%d页'%page)

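For reference, the slicing above ((ck.group(1))[19:-5] and .split(".")[1][:-1]) assumes the encode-open attribute holds a quoted path under /UpFiles/fxklwxhj/ ending in the file extension. A hypothetical sample (the real HTML may differ; inspect an actual 'ck' value) illustrates what the two expressions pull out:

import re

sample_ck = '<a attachformat="pdf" encode-open="/UpFiles/fxklwxhj/123456789.pdf">函件标题</a>'  # hypothetical
ck = re.search("encode-open=(.*?)>", sample_ck)
print(ck.group(1)[19:-5])              # 123456789  -> the 函件编码 column
print(ck.group(1).split(".")[1][:-1])  # pdf        -> the 函件类型 column
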
Step 2: build each inquiry letter's URL from the metadata and parse the PDF online into its own txt file.

import pandas as pd
import time
from urllib.request import urlopen
from urllib.request import Request
from urllib.request import quote
import requests
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import LTTextBoxHorizontal, LAParams
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter
from pdfminer.pdfinterp import PDFTextExtractionNotAllowed
from pdfminer.pdfparser import PDFParser, PDFDocument


data = pd.read_csv("sz.csv",encoding='GBK')   # sz.csv was written with the system default encoding (GBK on a Chinese-locale Windows)


函件编码 = data.loc[:,'函件编码']
函件类型 = data.loc[:,'函件类型']

headers = {'content-type': 'application/json',
           'Accept-Encoding': 'gzip, deflate',
           'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:53.0) Gecko/20100101 Firefox/53.0'}

baseurl = "http://reportdocs.static.szse.cn/UpFiles/fxklwxhj/"

def parse(docucode):
    # open the PDF over HTTP
    _path = baseurl + quote(docucode) + "?random=0.3006649122149502"
    request = Request(url=_path, headers=headers)
    fp = urlopen(request)
    # (to read a local file instead: fp = open('./2015.pdf', 'rb'))
    # create a PDF parser from the file object
    praser_pdf = PDFParser(fp)
    # create a PDF document object
    doc = PDFDocument()
    # connect the parser and the document
    praser_pdf.set_document(doc)
    doc.set_parser(praser_pdf)
    # if the PDF is password-protected, pass the password: doc.initialize("123456")
    doc.initialize()
    # give up if the document does not allow text extraction
    if not doc.is_extractable:
        raise PDFTextExtractionNotAllowed
    else:
        # create a PDF resource manager to handle shared resources
        rsrcmgr = PDFResourceManager()
        # layout-analysis parameters
        laparams = LAParams()
        # create an aggregator device and a page interpreter
        device = PDFPageAggregator(rsrcmgr, laparams=laparams)
        interpreter = PDFPageInterpreter(rsrcmgr, device)
        # process the document one page at a time; doc.get_pages() yields the pages
        for page in doc.get_pages():
            # let the interpreter render the page into the aggregator
            interpreter.process_page(page)
            # layout is an LTPage holding the objects parsed from this page
            # (LTTextBox, LTFigure, LTImage, LTTextBoxHorizontal, ...);
            # the text is obtained from an object's get_text() method
            layout = device.get_result()
            for out in layout:
                docname = "E:\py_shiyan\Pycharm备份\询证函\深交所/"+str(docucode).split('.')[0]+'.txt'
                with open(docname,'a') as f:
                    # only horizontal text boxes carry extractable text; images and figures do not
                    if isinstance(out, LTTextBoxHorizontal):
                        results = out.get_text()
                        print(results)
                        try:
                            f.write(results)
                        except:
                            print(f'========={docucode}解码异常==========')
                            with open("E:\py_shiyan\Pycharm备份\询证函\深交所/" + '异常记录.txt', 'a') as f3:
                                f3.write(f'\n ========={docucode}页解码异常==========')


for i in range(len(函件编码)):
    函件名称 = (函件编码[i] + '.' + 函件类型[i])
    print(函件名称)
    开始爬取时间 = "这是第%d个公告"%i
    print(开始爬取时间)
    print(time.strftime('%Y.%m.%d.%H:%M:%S',time.localtime(time.time())))
    if 函件类型[i]=="pdf":
        parse(函件名称)
        print(函件名称 + "爬取成功")
    else:
        with open("E:\py_shiyan\Pycharm备份\询证函\深交所\%s"%函件名称,'wb') as f:
            _path = baseurl + quote(函件名称) +"?random=0.3006649122149502"
            request = requests.get(url=_path, headers=headers)  # download the non-PDF attachment directly
            f.write(request.content)
    结束爬取时间 = time.strftime('%Y.%m.%d.%H:%M:%S', time.localtime(time.time()))
    print(结束爬取时间)
    print("第%d个公告爬取完成" % i)

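The imports above target pdfminer3k, where PDFDocument still lives in pdfminer.pdfparser. If your environment ends up with pdfminer.six instead (pdfplumber installs it as a dependency), a minimal fallback for the same per-letter extraction could look like the sketch below; it reuses the baseurl and headers defined above and is not the original author's method, just an alternative via pdfminer.six's high-level API.

from io import BytesIO
from urllib.request import Request, urlopen
from urllib.parse import quote
from pdfminer.high_level import extract_text  # pdfminer.six high-level API

def parse_with_pdfminer_six(docucode):
    # fetch the PDF into memory and extract all pages in one call
    url = baseurl + quote(docucode) + "?random=0.3006649122149502"
    pdf_bytes = urlopen(Request(url=url, headers=headers)).read()
    text = extract_text(BytesIO(pdf_bytes))
    # write the text next to the script; adjust the path to match the folders used above
    with open(docucode.split('.')[0] + '.txt', 'w', encoding='utf-8') as f:
        f.write(text)
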
Step 3: gather the information from all the txt files into one Excel workbook.

import os
import docx2txt
from openpyxl import Workbook

content_list = []

wb = Workbook()
sheet = wb.active
sheet['A1'].value = '公告编码'
sheet['B1'].value = '公告内容'

def readdocx(filepath):
    content = docx2txt.process(filepath)  # extract the text of the .docx at this path
    docucode = filepath.split('/')[-1]
    content_list.append([docucode.split('.')[0],content])

def readtxt(filepath):
    content = open(filepath, "r").read()     # read the whole txt file at this path
    docucode = filepath.split('/')[-1]
    content_list.append([docucode.split('.')[0],content])

def eachFile(filepath):
    pathDir = os.listdir(filepath)  # list the entries under this directory
    for s in pathDir:
        newDir = os.path.join(filepath, s)  # join the entry name onto the current path
        if os.path.isfile(newDir):          # it is a file
            doctype = os.path.splitext(newDir)[1]
            if doctype == ".txt":           # read txt files
                readtxt(newDir)
            elif doctype == ".docx":        # read docx files
                readdocx(newDir)
            else:
                pass
        else:
            eachFile(newDir)  # it is a directory: recurse into it


filepath = "E:\py_shiyan\Pycharm备份\询证函\深交所/"  # keep the trailing slash: file names are later recovered by splitting on '/'
eachFile(filepath)
a = 2  # start below the header row
for doc in content_list:
    sheet['A%d'%a].value = doc[0]
    print(doc[0])
    sheet['B%d'%a].value = doc[1]
    a += 1
wb.save('深交所.xlsx')

PS: to speed this up further, you can also use multithreading; a sketch follows.

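A minimal sketch of that idea (not in the original scripts): download the PDFs concurrently with a small thread pool, then run the PDF-to-txt parsing sequentially. Here names stands for whatever list of 函件名称 you built in step 1, and baseurl/headers are the ones defined above.

from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import quote
import requests

def fetch(name, dest_dir='.'):
    # download one letter from the static file server
    url = baseurl + quote(name) + "?random=0.3006649122149502"
    r = requests.get(url, headers=headers, timeout=30)
    with open(f'{dest_dir}/{name}', 'wb') as f:
        f.write(r.content)
    return name

with ThreadPoolExecutor(max_workers=8) as pool:
    futures = [pool.submit(fetch, name) for name in names]  # names: hypothetical list of file names
    for fut in as_completed(futures):
        print(fut.result(), 'downloaded')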