python爬虫篇2——爬取深圳证券交易所股票金融数据

程序运行截图:

python爬虫篇2——爬取深圳证券交易所股票金融数据_第1张图片

MySQL 建表语句:

-- Create the `financial` schema and select it, so that the CREATE TABLE
-- statements that follow are created inside it.
create database financial;
use financial;
-- Table `lrb`: income-statement (利润表) rows written by the Python scraper's
-- 'lrb' table mapping. Every figure is stored as text (varchar(45)); only the
-- two date columns are datetime.
-- Column glossary:
--   报告期 = reporting period          股票名 = stock name
--   股票代码 = stock code              净利润 = net profit
--   扣非归母净利润 = net profit attributable to the parent, excluding
--                    non-recurring items
--   营业总收入 = total operating revenue
--   营业支出 = operating expenses      营业总支出 = total operating expenses
--   销售费用 / 管理费用 / 财务费用 = selling / administrative / finance expenses
--   营业利润 = operating profit        利润总额 = total profit
--   公告日期 = announcement date       "同比" suffix = year-over-year change
-- NOTE(review): AUTO_INCREMENT=452 looks like a leftover from dumping a
-- populated table; a fresh install presumably does not need it -- confirm.
CREATE TABLE `lrb` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `报告期` datetime DEFAULT NULL,
  `股票名` varchar(45) DEFAULT NULL,
  `股票代码` varchar(45) DEFAULT NULL,
  `净利润` varchar(45) DEFAULT NULL,
  `净利润同比` varchar(45) DEFAULT NULL,
  `扣非归母净利润` varchar(45) DEFAULT NULL,
  `扣非归母净利润同比` varchar(45) DEFAULT NULL,
  `营业总收入` varchar(45) DEFAULT NULL,
  `营业总收入同比` varchar(45) DEFAULT NULL,
  `营业支出` varchar(45) DEFAULT NULL,
  `营业支出同比` varchar(45) DEFAULT NULL,
  `销售费用` varchar(45) DEFAULT NULL,
  `管理费用` varchar(45) DEFAULT NULL,
  `财务费用` varchar(45) DEFAULT NULL,
  `营业总支出` varchar(45) DEFAULT NULL,
  `营业总支出同比` varchar(45) DEFAULT NULL,
  `营业利润` varchar(45) DEFAULT NULL,
  `营业利润同比` varchar(45) DEFAULT NULL,
  `利润总额` varchar(45) DEFAULT NULL,
  `公告日期` datetime DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=452 DEFAULT CHARSET=utf8;

-- Table `ybmx`: research-report details (研报明细) written by the Python
-- scraper's 'ybmx' table mapping.
-- Column glossary:
--   报告期 = report date (filled from the API field `publishDate`)
--   股票名 = stock name      股票代码 = stock code
--   报告名称 = report title  评级类别 = rating category
--   作者 = author(s)         机构 = issuing institution
-- NOTE(review): AUTO_INCREMENT=513 looks like a leftover from dumping a
-- populated table -- confirm before using on a fresh database.
CREATE TABLE `ybmx` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `报告期` datetime DEFAULT NULL,
  `股票名` varchar(45) DEFAULT NULL,
  `股票代码` varchar(45) DEFAULT NULL,
  `报告名称` varchar(100) DEFAULT NULL,
  `评级类别` varchar(45) DEFAULT NULL,
  `作者` varchar(100) DEFAULT NULL,
  `机构` varchar(45) DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=513 DEFAULT CHARSET=utf8;

-- Table `yjb`: earnings-report (业绩报表) rows written by the Python scraper's
-- 'yjb' table mapping. Every figure is stored as text; only the three date
-- columns are datetime.
-- Column glossary:
--   报告期 = reporting period        股票名 = stock name
--   股票代码 = stock code            每股收益 = earnings per share
--   每股收益扣除 = earnings per share after deductions (扣除)
--   营业收入 = operating revenue     净利润 = net profit
--   同比增长 = year-over-year growth 季度环比增长 = quarter-over-quarter growth
--   每股净资产 = net assets per share
--   净资产收益率 = return on equity
--   每股经营现金流量 = operating cash flow per share
--   销售毛利率 = gross margin        利润分配 = profit distribution
--   股息率 = dividend yield
--   首次公告日期 / 最新公告日期 = first / latest announcement date
-- NOTE(review): AUTO_INCREMENT=545 looks like a leftover from dumping a
-- populated table -- confirm before using on a fresh database.
CREATE TABLE `yjb` (
  `id` int(11) NOT NULL AUTO_INCREMENT,
  `报告期` datetime DEFAULT NULL,
  `股票名` varchar(45) DEFAULT NULL,
  `股票代码` varchar(45) DEFAULT NULL,
  `每股收益` varchar(45) DEFAULT NULL,
  `每股收益扣除` varchar(45) DEFAULT NULL,
  `营业收入` varchar(45) DEFAULT NULL,
  `营业收入同比增长` varchar(45) DEFAULT NULL,
  `营业收入季度环比增长` varchar(45) DEFAULT NULL,
  `净利润` varchar(45) DEFAULT NULL,
  `净利润同比增长` varchar(45) DEFAULT NULL,
  `净利润季度环比增长` varchar(45) DEFAULT NULL,
  `每股净资产` varchar(45) DEFAULT NULL,
  `净资产收益率` varchar(45) DEFAULT NULL,
  `每股经营现金流量` varchar(45) DEFAULT NULL,
  `销售毛利率` varchar(45) DEFAULT NULL,
  `利润分配` varchar(90) DEFAULT NULL,
  `股息率` varchar(45) DEFAULT NULL,
  `首次公告日期` datetime DEFAULT NULL,
  `最新公告日期` datetime DEFAULT NULL,
  PRIMARY KEY (`id`)
) ENGINE=InnoDB AUTO_INCREMENT=545 DEFAULT CHARSET=utf8;

Python 代码:

import json
import re
import sys
import time
from ast import literal_eval

from 股吧数据抓取.MysqlHelper import MysqlHelper
import requests

# Module-wide MySQL helper; both ForumBaseData and FinancialData below run
# their queries through it (helper.all / helper.execute).
# NOTE(review): the second argument (8080) is presumably the server port;
# MySQL's default port is 3306 -- confirm against the MysqlHelper signature
# and the local server configuration.
# NOTE(review): credentials are hard-coded here; consider loading them from
# configuration before using this outside a local experiment.
"""mysql数据库类"""
helper = MysqlHelper('localhost', 8080, 'root', '123', 'financial', 'utf8')

# The bare string below is a no-op expression that acts as a heading for the
# ForumBaseData class: "fetch a company's basic information from the Shenzhen
# Stock Exchange for a given stock code".
"""深圳证券交易所获取指定股票代码公A司基本信息"""


class ForumBaseData:
    """Fetch one company's overview from the SZSE API and store it in MySQL.

    Constructing an instance performs the HTTP request and immediately saves
    the parsed record via the module-level ``helper``. ``str(instance)``
    returns the ``message`` field of the API response.
    """

    # Default status text, so ``str(obj)`` is defined before any response has
    # been parsed. Each instance overwrites it with its own API message.
    result = ""

    def __init__(self, code):
        """Request the company overview for *code* and persist it.

        Args:
            code: Stock code string; it is appended to the API URL as the
                ``secCode`` query parameter.

        Raises:
            requests.exceptions.ConnectionError: if the request fails twice.
        """
        self.url = "http://www.szse.cn/api/report/index/companyGeneralization?random=0.23242534566812312&secCode=" + code
        # The attribute keeps its historical spelling ("respose") so existing
        # callers that read it continue to work.
        try:
            self.respose = requests.get(self.url)
        except requests.exceptions.ConnectionError:
            # Retry exactly once; a second failure propagates to the caller.
            self.respose = requests.get(self.url)
        self.saveBaseData()

    def saveBaseData(self):
        """Parse the stored response and insert the company row if it is new.

        The API message is kept on the instance (``self.result``) instead of a
        module-level global, so several instances no longer overwrite each
        other's status. Nothing is written unless the message is "成功".
        """
        # Parse the body once and reuse it for both the status and the data.
        payload = json.loads(self.respose.text)
        self.result = payload["message"]
        if self.result != "成功":
            return

        data = payload["data"]
        # Skip the insert when a row with the same A-share/B-share codes exists.
        sql = 'select agdm,bgdm from forumBaseData_Sz where agdm=%s and bgdm=%s'
        point = helper.all(sql, [data["agdm"], data["bgdm"]])
        if len(point) == 0:
            sql = "insert into forumBaseData_Sz(gsqc,ywqc,zcdz,agdm,agjc,agssrq,agzgb,agltgb,bgdm,bgjc,bgssrq,bgzgb,bgltgb,dldq,sheng,shi,sshymc,http)values(%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"
            params = [data["gsqc"], data["ywqc"], data["zcdz"], data["agdm"], data["agjc"], data["agssrq"],
                      data["agzgb"], data["agltgb"], data["bgdm"], data["bgjc"], data["bgssrq"], data["bgzgb"],
                      data["bgltgb"], data["dldq"], data["sheng"], data["shi"], data["sshymc"], data["http"], ]
            helper.execute(sql, params)
        else:
            print("此条公司股票基本信息已存在数据库中!")

    def __str__(self):
        """Return the API status message recorded for this instance."""
        return str(self.result)


class FinancialData:
    """Download one stock's Eastmoney financial tables and store them in MySQL.

    Three tables are handled, always in this order: ``yjb`` (earnings
    report), ``lrb`` (income statement) and ``ybmx`` (research-report
    details, which the API serves in pages of 50 rows).

    After ``getFinancialData()`` the layout of ``data_list`` is:
    index 0 = yjb rows, index 1 = lrb rows, index 2 onward = one list of
    ybmx rows per page.
    """

    def __init__(self, code):
        """Prepare URLs, column mappings and request headers for *code*.

        Args:
            code: Stock code as a string (it is concatenated into URLs).
        """
        self.code = code
        # Landing page for the stock (kept for reference; not requested here).
        self.mainpage = "http://data.eastmoney.com/bbsj/yjbb/%s.html" % self.code
        self.base = "http://dcfm.eastmoney.com//em_mutisvcexpandinterface/api/js/get?type="
        self.tail = "&token=70f12f2f4f091e459a279469fe49eca5&filter=(scode=" + self.code + ")&st=reportdate&sr=1&p=1&ps=50&js=var%20AOADDBxl={pages:(tp),data:%20(x),font:(font)}&rt=52235310"
        # Earnings-report endpoint.
        self.yjserver = self.base + "YJBB21_YJBB" + self.tail
        # Income-statement endpoint.
        self.lrserver = self.base + "CWBB_LRB20" + self.tail
        # Research-report endpoint (paged); starts at page 1.
        self.page = 1
        self.pagenum = 0
        self.ybmxserver = self._report_url(self.page)
        # API field -> database column, earnings report.
        self.yjb_dict = {'reportdate': '报告期', 'sname': '股票名', 'scode': '股票代码',
                         'basiceps': '每股收益', 'cutbasiceps': '每股收益扣除', 'totaloperatereve': '营业收入',
                         'ystz': '营业收入同比增长', 'yshz': '营业收入季度环比增长', 'parentnetprofit': '净利润',
                         'sjltz': '净利润同比增长', 'sjlhz': '净利润季度环比增长', 'bps': '每股净资产',
                         'roeweighted': '净资产收益率', 'mgjyxjje': '每股经营现金流量', 'xsmll': '销售毛利率',
                         'assigndscrpt': '利润分配', 'gxl': '股息率', 'firstnoticedate': '首次公告日期',
                         'latestnoticedate': '最新公告日期'}
        # API field -> database column, income statement.
        self.lrb_dict = {'reportdate': '报告期', 'sname': '股票名', 'scode': '股票代码',
                         'parentnetprofit': '净利润', 'sjltz': '净利润同比', 'kcfjcxsyjlr': '扣非归母净利润',
                         'sjlktz': '扣非归母净利润同比', 'totaloperatereve': '营业总收入', 'tystz': '营业总收入同比',
                         'operateexp': '营业支出', 'operateexp_tb': '营业支出同比', 'saleexp': '销售费用',
                         'manageexp': '管理费用', 'financeexp': '财务费用', 'totaloperateexp': '营业总支出',
                         'totaloperateexp_tb': '营业总支出同比', 'operateprofit': '营业利润', 'yltz': '营业利润同比',
                         'sumprofit': '利润总额', 'noticedate': '公告日期'}
        # API field -> database column, research-report details.
        self.ybmx_dict = {'publishDate': '报告期', 'stockName': '股票名', 'stockCode': '股票代码',
                          'title': '报告名称', 'emRatingName': '评级类别', 'researcher': '作者', 'orgSName': '机构'}
        # Table name -> its field mapping.
        self.table_dict = {'yjb': self.yjb_dict, 'lrb': self.lrb_dict, 'ybmx': self.ybmx_dict}
        # Table names, in the same order as url_list.
        self.table_name = ['yjb', 'lrb', 'ybmx']
        self.url_list = [self.yjserver, self.lrserver, self.ybmxserver]
        # Downloaded rows; see the class docstring for the layout.
        self.data_list = []
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3722.400 QQBrowser/10.5.3738.400'}

    def _report_url(self, page):
        """Return the research-report list URL for *page* of this stock."""
        return "http://reportapi.eastmoney.com/report/list?pageNo=%s&pageSize=50&code=%s&industryCode=*&industry=*&rating=*&ratingchange=*&beginTime=&endTime=&fields=&qType=0" % (
            page, self.code)

    @staticmethod
    def _page_count(hits, page_size=50):
        """Return how many pages are needed for *hits* rows (ceiling division).

        The previous ``hits // 50 + 1`` asked for one page too many whenever
        *hits* was an exact multiple of the page size.
        """
        if not hits:
            return 0
        return (hits + page_size - 1) // page_size

    @staticmethod
    def _build_insert(table, row):
        """Return ``(sql, params)`` for one parameterised INSERT of *row*.

        Column names come from this class's own mapping dicts and are
        backtick-quoted; the values are passed as parameters so the database
        driver escapes them.
        """
        columns = list(row)
        column_sql = ",".join("`%s`" % column for column in columns)
        placeholders = ",".join(["%s"] * len(columns))
        sql = "insert into %s (%s) values (%s)" % (table, column_sql, placeholders)
        return sql, [row[column] for column in columns]

    def _map_row(self, table, data):
        """Translate one API record into a ``{column: value}`` dict for *table*.

        The three identifying columns are read explicitly first, so a record
        that lacks them fails with ``KeyError`` instead of being stored
        half-empty. API fields without a mapping are ignored.
        """
        mapping = self.table_dict[table]
        row = {'报告期': data['reportdate'], '股票名': data['sname'], '股票代码': data['scode']}
        for key, value in data.items():
            if key in mapping:
                row[mapping[key]] = value
        return row

    def _get_json(self, url):
        """GET *url* with the instance headers and parse the body as JSON."""
        req = requests.get(url=url, headers=self.headers)
        req.encoding = 'utf-8'
        return json.loads(req.text)

    def _fetch_statement(self, url):
        """Download and decode one font-obfuscated table (yjb or lrb)."""
        req = requests.get(url=url, headers=self.headers)
        req.encoding = 'utf-8'
        text = req.text
        # The endpoint answers with a JavaScript assignment using unquoted
        # keys; strip the variable name and quote the keys to get valid JSON.
        for old, new in (("var AOADDBxl=", ""), ("var CiBQPcHj=", ""),
                         ('pages:', '"pages":'), ('data:', '"data":'), ('font:', '"font":')):
            text = text.replace(old, new)
        data = json.loads(text)
        # Substitute the characters listed in data['font']['FontMapping'] in
        # the stringified rows, then turn the string back into Python objects.
        self.realdata = literal_eval(
            keymap_replace(str(data['data']), MappingToDict(data['font']['FontMapping'])))
        return self.realdata

    def _fetch_report_pages(self, first_url):
        """Download every research-report page, appending each to data_list."""
        data = self._get_json(first_url)
        self.data_list.append(data['data'] or [])
        self.pagenum = self._page_count(data['hits'])
        for page in range(2, self.pagenum + 1):
            self.page = page
            self.changeybmxserver(page)
            data = self._get_json(self.ybmxserver)
            self.data_list.append(data['data'] or [])
        # Restore the paging state so the instance can be reused.
        self.page = 1
        self.changeybmxserver(1)

    def getFinancialData(self):
        """Download all three tables into ``self.data_list``.

        The list is rebuilt from scratch on every call; previously a second
        call appended to the old data and broke the index-based layout that
        ``saveFinancialData`` relies on.
        """
        self.data_list = []
        # Dispatch on the table name rather than comparing URLs: the report
        # URL attribute changes while paging, which made URL equality fragile.
        for table, url in zip(self.table_name, self.url_list):
            if table == 'ybmx':
                self._fetch_report_pages(url)
            else:
                self.data_list.append(self._fetch_statement(url))

    def changeybmxserver(self, page):
        """Point ``self.ybmxserver`` at the given research-report *page*."""
        self.ybmxserver = self._report_url(page)

    def _show_progress(self, table, done, total):
        """Write the progress line for *table*, plus a final notice at 100%."""
        sys.stdout.write('\r   %s:[正在下载 %.2f%%]   ' % (table, (done / total) * 100))
        sys.stdout.flush()
        # Short pause so the progress line stays readable on a terminal.
        time.sleep(0.1)
        if done == total:
            print(table + '表下载完成')

    def _save_statement_rows(self, table, rows):
        """Insert the yjb/lrb *rows* that are not yet present in *table*."""
        exists_sql = 'select `报告期` from ' + table + ' where `报告期`=%s and `股票名`=%s and `股票代码`=%s'
        total = len(rows)
        for j, data in enumerate(rows):
            if j == 0:
                print("股票名称:%s" % data['sname'])
                print('开始下载' + table + '表')
            self._show_progress(table, j + 1, total)
            row = self._map_row(table, data)
            point = helper.all(exists_sql, [data['reportdate'], data['sname'], data['scode']])
            if len(point) == 0:
                # One parameterised INSERT carrying every column. The old code
                # inserted three columns and then ran one UPDATE per column
                # filtered only by report date, which also overwrote rows that
                # belong to other stocks with the same date.
                insert_sql, params = self._build_insert(table, row)
                helper.execute(insert_sql, params)

    def _save_report_rows(self, table, rows, page_no):
        """Insert the research-report *rows* of one page that are new."""
        exists_sql = 'select `报告名称` from ybmx where `报告名称`=%s and `作者`=%s and `报告期`=%s'
        insert_sql = "insert into ybmx (`报告期`,`股票名`,`股票代码`,`报告名称`,`评级类别`,`作者`,`机构`) values (%s,%s,%s,%s,%s,%s,%s)"
        total = len(rows)
        for j, data in enumerate(rows):
            if j == 0:
                print("股票名称:%s" % data['stockName'])
                print(
                    '开始下载' + table + '表' + '(第%i页/共%i页)' % (page_no, self.pagenum))
            self._show_progress(table, j + 1, total)
            point = helper.all(exists_sql, [data['title'], data['researcher'], data['publishDate']])
            if len(point) == 0:
                params = [data['publishDate'], data['stockName'], data['stockCode'],
                          data['title'], data['emRatingName'], data['researcher'], data['orgSName']]
                helper.execute(insert_sql, params)

    def saveFinancialData(self):
        """Validate the stock code, download every table and store new rows.

        Prints an error and returns without any network or database access
        when the code is not a string of at most six digits.
        """
        if not (len(self.code) <= 6 and self.code.isdigit()):
            print("股票代码输入有误!")
            return
        self.getFinancialData()
        for i, table in enumerate(self.table_name):
            if i < 2:
                # yjb and lrb each occupy exactly one slot of data_list.
                self._save_statement_rows(table, self.data_list[i])
            else:
                # Every remaining slot is one page of research reports.
                for page_no, rows in enumerate(self.data_list[i:], start=1):
                    self._save_report_rows(table, rows, page_no)
        print("金融数据下载完成!")

# Convert a list of font-mapping entries into a lookup dict.
def MappingToDict(mapplist):
    """Return ``{entry["code"]: str(entry["value"])}`` for each entry.

    When the same code occurs more than once, the last entry wins.
    """
    return {entry["code"]: str(entry["value"]) for entry in mapplist}


# Replace substrings of a string according to a mapping dict.
def keymap_replace(
        string: str,
        mappings: dict,
        lower_keys=False,
        lower_values=False,
        lower_string=False,
) -> str:
    """Apply every ``old -> new`` pair of *mappings* to *string*, in order.

    The replacements run one after another in the dict's iteration order, so
    the output of an earlier pair can be matched by a later one.

    Args:
        string: Text to transform.
        mappings: Substring -> replacement pairs.
        lower_keys: Lower-case each key before searching for it.
        lower_values: Lower-case each replacement before inserting it.
        lower_string: Lower-case *string* before any replacement is made.

    Returns:
        The transformed text.
    """
    text = string
    if lower_string:
        text = text.lower()
    for old, new in mappings.items():
        if lower_keys:
            old = old.lower()
        if lower_values:
            new = new.lower()
        text = text.replace(old, new)
    return text


if __name__ == '__main__':
    # Interactive entry point: show a banner, ask for a stock code, then
    # download and store that stock's financial tables.
    # (ForumBaseData("002555") can be called here to store company info too.)
    divider = '*' * 60
    print(divider)
    print('\t\t\t\t\t金融数据下载工具')
    print(divider)
    code = input('请输入股票代码:')
    print(divider)
    downloader = FinancialData(code)
    downloader.saveFinancialData()
    print(divider)

程序可能存在部分bug,欢迎交流指正。

你可能感兴趣的:(python)