Python爬取上交所年报并下载

批量下载上交所年报及相关细节处理

一、使用工具

Python 3.6、PyCharm

二、准备工作

import json
import requests
import re
import datetime
import csv 
# Audit log: a CSV file that collects the stock code of every company whose
# annual report gets downloaded, so missing downloads can be spotted later.
# mode='w' truncates any existing stkcd.csv; newline='' leaves line-ending
# handling to the csv module.
f = open('stkcd.csv', mode='w', encoding='gbk', newline='')
head = ['stkcd']
writer = csv.writer(f)
writer.writerow(head)  # header row

PS:新建 csv 文件,用于记录已下载年报的公司股票代码,便于事后检查年报下载是否有遗漏。

三、主要代码

# Date range of the reports to download; adjust as needed. Every day in the
# range (both ends inclusive) is queried separately.
# (Original author's advice: keep begin and end at most ~30 days apart.)
begin = datetime.date(2019,1,19)
end = datetime.date(2019,6,21)

# Query endpoint for the bulletin list; {date} is used as both beginDate and
# endDate, so each request covers exactly one day.
# NOTE(review): only page 1 with pageSize=25 is requested. If a single day can
# carry more than 25 bulletins, the extra ones are not fetched -- confirm
# against the API's paging behaviour.
QUERY_URL = (
    'http://query.sse.com.cn/infodisplay/queryLatestBulletinNew.do?'
    '&jsonCallBack=jsonpCallback43752&productId=&reportType2=DQGG'
    '&reportType=YEARLY&beginDate={date}&endDate={date}'
    '&pageHelp.pageSize=25&pageHelp.pageCount=50&pageHelp.pageNo=1'
    '&pageHelp.beginPage=1&pageHelp.cacheSize=1&pageHelp.endPage=5'
    '&_=1561094157400'
)
QUERY_HEADERS = {'Referer': 'http://www.sse.com.cn/disclosure/listedinfo/regular/'}
PDF_BASE_URL = 'http://static.sse.com.cn/'
REQUEST_TIMEOUT = 30  # seconds; without a timeout a stalled connection hangs forever
CHUNK_SIZE = 102400  # bytes written to disk per iteration while streaming a PDF


def _strip_jsonp(text):
    """Return the JSON payload wrapped in a JSONP response ``callback(...)``.

    Slices between the first '(' and the last ')' instead of relying on a
    fixed callback-name length, so a trailing ';' or newline is tolerated.

    Raises:
        ValueError: if *text* contains no parenthesised payload.
    """
    start = text.find('(')
    stop = text.rfind(')')
    if start == -1 or stop < start:
        raise ValueError('unexpected JSONP response: %r' % text[:80])
    return text[start + 1:stop]


def _safe_filename(name):
    """Replace characters that are not allowed in file names with '_'."""
    return re.sub(r'[\\/:*?"<>|]', '_', name)


def _download(url, path):
    """Stream *url* to the local file *path* and report completion."""
    # The context manager releases the connection even if writing fails.
    with requests.get(url, stream=True, timeout=REQUEST_TIMEOUT) as resource:
        with open(path, 'wb') as fd:
            for chunk in resource.iter_content(CHUNK_SIZE):
                fd.write(chunk)
    print(path, '完成下载')


try:
    for day_offset in range((end - begin).days + 1):
        searchDate = str(begin + datetime.timedelta(days=day_offset))
        response = requests.get(
            QUERY_URL.format(date=searchDate),
            headers=QUERY_HEADERS,
            timeout=REQUEST_TIMEOUT,
        )
        # Fail with a clear HTTP error rather than a confusing JSON parse error.
        response.raise_for_status()
        data = json.loads(_strip_jsonp(response.text))
        for report in data['result']:
            title = report['title']
            # Keep full annual reports only; skip summaries and other bulletins.
            if '年度报告' not in title or '摘要' in title:
                continue
            code = report['security_Code']
            filename = code + title + searchDate + '.pdf'
            print(filename)
            # Record the company code in the CSV so downloads can be counted
            # and checked afterwards (optional bookkeeping).
            writer.writerow([code])
            if 'ST' in title:
                # Titles of ST / *ST companies may contain '*', which is not
                # a legal file-name character, so the title is left out.
                filename = code + '-ST' + searchDate + '.pdf'
            else:
                filename = _safe_filename(filename)
            _download(PDF_BASE_URL + report['URL'], filename)
finally:
    # Flush and close the CSV audit log even if a request fails midway.
    f.close()

你可能感兴趣的:(Python)