python抓取动态数据 A股上市公司基本信息

1.背景

之前写的抓取A股所有上市公司信息的小程序在上交所网站改版后,需要同步修改

python 2.7.9

2.分析过程

以抓取宇通客车【600066】信息为例

打开网址http://www.sse.com.cn/assortment/stock/list/info/company/index.shtml?COMPANY_CODE=600066

python抓取动态数据 A股上市公司基本信息_第1张图片

红框中的内容是需要抓取的信息,查看网页源码

python抓取动态数据 A股上市公司基本信息_第2张图片

可以看到公司信息并没有直接写到html中,使用chrome “开发者工具”快捷键F12,查看浏览器与服务器的交互过程(在这一步走了弯路,使用selenium+phantomjs模拟浏览器然后分析html以及使用ghost.py+beautifulsoup都没有成功)

python抓取动态数据 A股上市公司基本信息_第3张图片

可以在标红线的url上看到返回的公司信息,剩下的就是模拟浏览器请求这个url了,request header中的Referer一定不能省略,不然会报403

python抓取动态数据 A股上市公司基本信息_第4张图片

返回的信息是json格式的,可以使用python自带的json库转换为dict,可以参考searchJ.js来获得想要的信息

具体见github网址https://github.com/shenyanf/AShareListedCompanyList

# -*- coding: utf-8 -*- 
'''
Created on 2016年4月19日
@author: a
'''
import urllib2
import json
from time import sleep


class JSONObject:
    '''Expose the keys of a decoded JSON object as instance attributes.

    Intended for use as the ``object_hook`` of ``json.loads``: every JSON
    object in the document becomes one ``JSONObject`` whose attributes are
    the object's members.
    '''

    def __init__(self, d):
        '''Adopt ``d`` as this instance's attribute dictionary.

        The mapping is used as-is rather than copied, so the instance and
        ``d`` share the same storage.
        '''
        self.__dict__ = d

class AchieveSSEStockInfo:
    '''Fetch company and share-capital data for one Shanghai Stock Exchange code.

    The data comes from the JSONP endpoints on query.sse.com.cn.  The
    basic-info record and the capital record are downloaded on first use and
    cached on the instance; the two IPO-date getters issue a request on every
    call.  Field getters return '-' when the record could not be retrieved.
    '''

    # Getter names in report-column order.  The script entry point iterates
    # this list, so the order must not be changed.
    __public__ = ['getCompanyCode', 'getCompanyShortName', 'getCompanyName', 'getCompanyEnlishName', 'getIpoAddress', 'getASharesCode',
                  'getASharesShortName', 'getASharesIPODate', 'getASharesTotalCapital', 'getASharesOutstandingCaptial', 'getBSharesCode',
                  'getBSharesShortName', 'getBSharesIPODate', 'getBSharesTotalCapital', 'getBSharesOutstandingCaptial', 'getArea', 'getProvince', 'getCity', 'getTrade', 'getWebsite']

    # NOTE(review): presumably the field names returned by the basicURLA
    # endpoint; the list is not referenced anywhere else in this class.
    achieveIndexFromURLA = ['CHANGEABLE_BOND_ABBR', 'OFFICE_ZIP', 'AREA_NAME_DESC', 'FULL_NAME_IN_ENGLISH', 'COMPANY_CODE', 'CSRC_MIDDLE_CODE_DESC', 'SECURITY_ABBR_A', 'COMPANY_ADDRESS', 'SECURITY_CODE_A', 'SECURITY_CODE_B', 'SECURITY_30_DESC', 'COMPANY_ABBR', 'OFFICE_ADDRESS', 'CHANGEABLE_BOND_CODE', 'ENGLISH_ABBR', 'LEGAL_REPRESENTATIVE', 'REPR_PHONE', 'E_MAIL_ADDRESS', 'FOREIGN_LISTING_ADDRESS', 'STATE_CODE_A_DESC', 'SSE_CODE_DESC', 'FOREIGN_LISTING_DESC', 'SECURITY_CODE_A_SZ', 'CSRC_GREAT_CODE_DESC', 'WWW_ADDRESS', 'CSRC_CODE_DESC', 'STATE_CODE_B_DESC', 'FULLNAME']

    # Reported fields and their meaning:
    #   companyCode                 company code
    #   companyShortName            company short name
    #   companyName                 company full name
    #   companyEnlishName           English name
    #   ipoAddress                  registered address
    #   aSharesCode                 A-share code
    #   aSharesShortName            A-share short name
    #   aSharesIPODate              A-share listing date
    #   aSharesTotalCapital         A-share total capital
    #   aSharesOutstandingCaptial   A-share outstanding capital
    #   bSharesCode                 B-share code
    #   bSharesShortName            B-share short name
    #   bSharesIPODate              B-share listing date
    #   bSharesTotalCapital         B-share total capital
    #   bSharesOutstandingCaptial   B-share outstanding capital
    #   area                        region
    #   province                    province
    #   city                        city
    #   trade                       industry
    #   website                     company website
    #
    #   status                      A-share status / B-share status

    def __init__(self, stockCode):
        '''Build the query URLs for ``stockCode``; no request is sent here.'''
        self.stockCode = stockCode
        self.basicURLA = self.__mergeBasicURL('COMMON_SSE_ZQPZ_GP_GPLB_C', stockCode)
        self.basicURLB = self.__mergeBasicURL('COMMON_SSE_ZQPZ_GP_GPLB_AGSSR_C', stockCode)
        self.basicURLC = self.__mergeBasicURL('COMMON_SSE_ZQPZ_GP_GPLB_BGSSR_C', stockCode)
        self.basicURLD = self.__mergeBasicURL('COMMON_SSE_ZQPZ_GP_GPLB_MSXX_C', stockCode)
        self.basicURLE = r'http://query.sse.com.cn/commonSoaQuery.do?jsonCallBack=jsonpCallback46644&isPagination=true&stockCode=' + str(stockCode) + '&tradeBeginDate=19700101&tradeEndDate=20161001&order=tradeBeginDate%7Cdesc&sqlId=PL_SCRL_SCRLB&pageHelp.pageNo=1&pageHelp.beginPage=1&pageHelp.cacheSize=1&pageHelp.endPage=1&pageHelp.pageSize=5&_=1475720975596'
        self.capitalURL = 'http://query.sse.com.cn/security/stock/queryCompanyStockStruct.do?jsonCallBack=jsonpCallback86976&isPagination=false&companyCode=' + str(stockCode) + '&_=1475732919742'

        # Lazily filled caches for the basic-info and capital records.
        self.stockBasicInfo = None
        self.stockCapitalInfo = None

    def getCompanyCode(self):
        '''Return the COMPANY_CODE field.'''
        return self.__getBasicValue('COMPANY_CODE')

    def getStatus(self):
        '''Return False when both share classes report '-' or either is delisted.

        The A-share and B-share state descriptions are joined as 'A/B'; the
        result is False for '-/-' or when the text contains the delisting
        marker, True otherwise.
        '''
        v = self.__joinBasicValues('STATE_CODE_A_DESC', 'STATE_CODE_B_DESC')
        return not (v == '-/-' or u'摘牌' in v)

    def getCompanyShortName(self):
        '''Return 'COMPANY_ABBR/ENGLISH_ABBR'.'''
        return self.__joinBasicValues('COMPANY_ABBR', 'ENGLISH_ABBR')

    def getCompanyName(self):
        '''Return the FULLNAME field.'''
        return self.__getBasicValue('FULLNAME')

    def getCompanyEnlishName(self):
        '''Return the FULL_NAME_IN_ENGLISH field.'''
        return self.__getBasicValue('FULL_NAME_IN_ENGLISH')

    def getIpoAddress(self):
        '''Return the COMPANY_ADDRESS field.'''
        return self.__getBasicValue('COMPANY_ADDRESS')

    def getASharesCode(self):
        '''Return the SECURITY_CODE_A field.'''
        return self.__getBasicValue('SECURITY_CODE_A')

    def getASharesShortName(self):
        '''Return 'COMPANY_ABBR/ENGLISH_ABBR' (same value as the company short name).'''
        # NOTE(review): SECURITY_ABBR_A looks like the dedicated A-share
        # abbreviation field; confirm whether it should be used here instead.
        return self.__joinBasicValues('COMPANY_ABBR', 'ENGLISH_ABBR')

    def getASharesIPODate(self):
        '''Return the LISTINGDATEA field of the A-share listing record, or '-'.'''
        return self.__getListingDate(self.basicURLB, 'LISTINGDATEA')

    def getTotalCapital(self):
        '''Return the totalShares field of the capital record.'''
        return self.__getCapitalValue('totalShares')

    def getASharesTotalCapital(self):
        '''Return non-tradable shares plus outstanding A shares as a float.

        Components that are missing, empty or '-' contribute nothing, so the
        result is 0.0 when neither value is available.
        '''
        total = 0.0
        components = (self.__getCapitalValue('totalNonFlowShare'),
                      self.getASharesOutstandingCaptial())
        for value in components:
            if value and value != '-':
                total += float(value)
        return total

    def getASharesOutstandingCaptial(self):
        '''Return the AShares field of the capital record.'''
        return self.__getCapitalValue('AShares')

    def getBSharesTotalCapital(self):
        '''Return the same value as getBSharesOutstandingCaptial().'''
        return self.getBSharesOutstandingCaptial()

    def getBSharesOutstandingCaptial(self):
        '''Return the BShares field of the capital record.'''
        return self.__getCapitalValue('BShares')

    def getBSharesCode(self):
        '''Return the SECURITY_CODE_B field.'''
        return self.__getBasicValue('SECURITY_CODE_B')

    def getBSharesShortName(self):
        '''Return the short name when a B-share code exists, otherwise ''.

        A code that is missing or contains '-' is treated as "no B share".
        '''
        code = self.getBSharesCode()
        if not code or '-' in code:
            return ''
        return self.getASharesShortName()

    def getBSharesIPODate(self):
        '''Return the LISTINGDATEB field of the B-share listing record, or '-'.'''
        return self.__getListingDate(self.basicURLC, 'LISTINGDATEB')

    def getArea(self):
        '''Return the AREA_NAME_DESC field.'''
        return self.__getBasicValue('AREA_NAME_DESC')

    def getProvince(self):
        '''Return the same value as getArea().'''
        return self.getArea()

    def getCity(self):
        '''Return the same value as getArea().'''
        return self.getArea()

    def getTrade(self):
        '''Return the SSE_CODE_DESC field.'''
        # Alternative: the CSRC industry (category/major/middle class) is
        # available as CSRC_CODE_DESC / CSRC_GREAT_CODE_DESC /
        # CSRC_MIDDLE_CODE_DESC.
        return self.__getBasicValue('SSE_CODE_DESC')

    def getWebsite(self):
        '''Return the WWW_ADDRESS field.'''
        return self.__getBasicValue('WWW_ADDRESS')

    @staticmethod
    def __unwrapJsonp(text):
        '''Return the text between the first '(' and the last ')' of ``text``.

        Using the outermost pair keeps payloads intact when a JSON value
        itself contains parentheses.  Raises ValueError when either
        delimiter is missing.
        '''
        start = text.index('(')
        end = text.rindex(')')
        return text[start + 1:end]

    @staticmethod
    def __asDict(obj):
        '''Copy the non-dunder attributes of ``obj`` into a plain dict.'''
        return dict((name, value) for name, value in vars(obj).items()
                    if not name.startswith('__'))

    def __joinBasicValues(self, firstKey, secondKey):
        '''Return two basic-info fields joined by '/', using '-' for None.'''
        values = [self.__getBasicValue(key) for key in (firstKey, secondKey)]
        return '/'.join('-' if value is None else value for value in values)

    def __getListingDate(self, url, key):
        '''Fetch ``url`` and return field ``key`` of its first record, or '-'.'''
        try:
            rsDict = self.__getDatas(url)
            if rsDict == '-' or rsDict is None:
                return '-'
            return self.__asDict(rsDict[0]).get(key)
        except Exception:
            # Best effort: any network or parsing problem is reported as '-'.
            return '-'

    def __getDatas(self, url, basicInfo=True):
        '''Request ``url`` and return the ``result`` member of its JSONP payload.

        ``basicInfo`` selects which page is sent as Referer: the company page
        when True, the capital page when False.

        Returns '-' when ``result`` is empty and None when every attempt to
        open the URL failed.  Errors while reading or parsing the response
        propagate to the caller.
        '''
        request = urllib2.Request(url)

        request.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8')
        # No Accept-Encoding header: urllib2 does not decompress responses,
        # so advertising gzip/deflate/sdch could yield an unparseable body.
        request.add_header('Accept-Language', 'zh-CN,zh;q=0.8,en;q=0.6')
        request.add_header('Cache-Control', 'max-age=0')
        request.add_header('Connection', 'keep-alive')
        request.add_header('Host', 'query.sse.com.cn')
        request.add_header('Upgrade-Insecure-Requests', '1')
        if basicInfo:
            request.add_header('Referer', 'http://www.sse.com.cn/assortment/stock/list/info/company/index.shtml?COMPANY_CODE=' + str(self.stockCode))
        else:
            request.add_header('Referer', 'http://www.sse.com.cn/assortment/stock/list/info/capital/index.shtml?COMPANY_CODE=' + str(self.stockCode))
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36')

        # Try up to five times; when every attempt fails, print a notice and
        # return None.
        maxNum = 5
        for _ in range(maxNum):
            try:
                response = urllib2.urlopen(url=request, timeout=15)
            except Exception:
                # Exception (not a bare except) so Ctrl-C still interrupts.
                continue
            break
        else:
            print('URLError:  All times is failed ')
            return None

        try:
            raw = response.read()
        finally:
            response.close()

        # Slow down between requests to avoid being blocked by the site.
        sleep(5)

        payload = self.__unwrapJsonp(raw.decode('utf-8'))
        pythonObjData = json.loads(payload, object_hook=JSONObject)

        if not pythonObjData.result:
            return '-'
        return pythonObjData.result

    def __getBasicValue(self, key):
        '''Return field ``key`` of the company's basic-info record.

        The record is downloaded on the first call and cached in
        ``stockBasicInfo``.  Returns '-' when the record cannot be obtained
        and None when the record has no such field.
        '''
        try:
            if self.stockBasicInfo is None:
                rsDict = self.__getDatas(self.basicURLA)
                if rsDict == '-' or rsDict is None:
                    return '-'
                self.stockBasicInfo = self.__asDict(rsDict[0])
            return self.stockBasicInfo.get(key)
        except Exception:
            # Best effort: any network or parsing problem is reported as '-'.
            return '-'

    def __getCapitalValue(self, key):
        '''Return field ``key`` of the company's share-capital record.

        The record is downloaded on the first call and cached in
        ``stockCapitalInfo``.  Returns '-' when the record cannot be obtained
        and None when the record has no such field.
        '''
        try:
            if self.stockCapitalInfo is None:
                rsDict = self.__getDatas(self.capitalURL, basicInfo=False)
                if rsDict == '-' or rsDict is None:
                    return '-'
                self.stockCapitalInfo = self.__asDict(rsDict)
            return self.stockCapitalInfo.get(key)
        except Exception:
            # Best effort: any network or parsing problem is reported as '-'.
            return '-'

    def __mergeBasicURL(self, sqlId, stockCode):
        '''Return the commonQuery.do URL for ``sqlId`` and ``stockCode``.'''
        return 'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback12345&isPagination=false&sqlId=' + sqlId + '&productid=' + str(stockCode) + '&_=14555555555552'
    

if __name__ == '__main__':
    # Demo: print every public field for each stock code in the range.
    # The loop variable is passed to the constructor; previously a fixed
    # code was used, so every iteration queried the same company.
    for stockCode in range(600001, 600003):
        info = AchieveSSEStockInfo(stockCode)
        for methodName in info.__public__:
            getter = getattr(info, methodName)
            # Single-argument print() behaves the same on Python 2 and 3.
            print('%s %s' % (methodName, getter()))


附录:

1.使用requests库抓取页面的时候的编码问题 https://segmentfault.com/q/1010000000341014
2.openpyxl参考手册 http://openpyxl.readthedocs.io/en/default/   http://openpyxl.readthedocs.io/en/default/usage.html
3.urllib2使用 http://zhuoqiang.me/python-urllib2-usage.html#http
4.读写json数据 http://python3-cookbook.readthedocs.io/zh_CN/latest/c06/p02_read-write_json_data.html
5.python中 class 或对象属性转化成dict 、dict转换成对象 http://blog.csdn.net/chenyulancn/article/details/8203763
6.【原创】说说JSON和JSONP,也许你会豁然开朗,含jQuery用例 http://www.cnblogs.com/dowinning/archive/2012/04/19/json-jsonp-jquery.html

7.Applying borders to a cell in OpenPyxl   http://stackoverflow.com/questions/24917201/applying-borders-to-a-cell-in-openpyxl


后记:

目前上交所已经提供A股上市公司xls的下载了,虽然信息不太完整,链接地址http://query.sse.com.cn/security/stock/downloadStockListFile.do?csrcCode=&stockCode=&areaName=&stockType=1

你可能感兴趣的:(爬虫)