1.背景
之前写的抓取A股所有上市公司信息的小程序在上交所网站改版后,需要同步修改
Python 2.7.9
2.分析过程
以抓取宇通客车【600066】信息为例
打开网址http://www.sse.com.cn/assortment/stock/list/info/company/index.shtml?COMPANY_CODE=600066
红框中的内容是需要抓取的信息,查看网页源码
可以看到公司信息并没有直接写到html中,使用chrome “开发者工具”(快捷键F12),查看浏览器与服务器的交互过程(在这一步走了弯路,使用selenium+PhantomJS模拟浏览器然后分析html,以及使用ghost.py+beautifulsoup,都没有成功)
可以在标红线的url上看到返回的公司信息,剩下的就是模拟浏览器请求这个url了,request header中的Referer一定不能省略,不然会报403
返回的信息是json格式的,可以使用python自带的json库转换为dict,可以参考searchJ.js来获得想要的信息
具体见github网址https://github.com/shenyanf/AShareListedCompanyList
# -*- coding: utf-8 -*-
'''
Created on 2016年4月19日
@author: a
'''
import urllib2
import json
from time import sleep
class JSONObject:
    '''Expose the keys of a decoded JSON object as instance attributes.

    Intended for use as ``object_hook`` of ``json.loads``: every JSON object
    becomes a ``JSONObject`` whose attributes are the object's members.
    '''

    def __init__(self, d):
        '''Adopt ``d`` as this instance's attribute dictionary.

        The mapping is aliased, not copied, so later changes to ``d`` are
        visible through the instance and vice versa.
        '''
        self.__dict__ = d
class AchieveSSEStockInfo:
    '''Fetch listed-company information from the Shanghai Stock Exchange.

    One instance describes one stock code.  The basic-information and the
    capital-structure records are each requested lazily on first use and then
    cached on the instance (``stockBasicInfo`` / ``stockCapitalInfo``).  The
    listing-date getters send their own request on every call.

    Getters return ``'-'`` when the data could not be retrieved.
    '''

    # Getter names in output order; callers iterate over this list, so the
    # order must not be changed.
    __public__ = ['getCompanyCode', 'getCompanyShortName', 'getCompanyName', 'getCompanyEnlishName', 'getIpoAddress', 'getASharesCode',
                  'getASharesShortName', 'getASharesIPODate', 'getASharesTotalCapital', 'getASharesOutstandingCaptial', 'getBSharesCode',
                  'getBSharesShortName', 'getBSharesIPODate', 'getBSharesTotalCapital', 'getBSharesOutstandingCaptial', 'getArea', 'getProvince', 'getCity', 'getTrade', 'getWebsite']

    # Field names listed for the basic-information record (basicURLA).
    achieveIndexFromURLA = ['CHANGEABLE_BOND_ABBR', 'OFFICE_ZIP', 'AREA_NAME_DESC', 'FULL_NAME_IN_ENGLISH', 'COMPANY_CODE', 'CSRC_MIDDLE_CODE_DESC', 'SECURITY_ABBR_A', 'COMPANY_ADDRESS', 'SECURITY_CODE_A', 'SECURITY_CODE_B', 'SECURITY_30_DESC', 'COMPANY_ABBR', 'OFFICE_ADDRESS', 'CHANGEABLE_BOND_CODE', 'ENGLISH_ABBR', 'LEGAL_REPRESENTATIVE', 'REPR_PHONE', 'E_MAIL_ADDRESS', 'FOREIGN_LISTING_ADDRESS', 'STATE_CODE_A_DESC', 'SSE_CODE_DESC', 'FOREIGN_LISTING_DESC', 'SECURITY_CODE_A_SZ', 'CSRC_GREAT_CODE_DESC', 'WWW_ADDRESS', 'CSRC_CODE_DESC', 'STATE_CODE_B_DESC', 'FULLNAME']

    # Indicators exposed by the getters:
    #   companyCode                company code
    #   companyShortName           company short name
    #   companyName                company full name
    #   companyEnlishName          English name
    #   ipoAddress                 registered address
    #   aSharesCode                A-share code
    #   aSharesShortName           A-share short name
    #   aSharesIPODate             A-share listing date
    #   aSharesTotalCapital        A-share total capital
    #   aSharesOutstandingCaptial  A-share outstanding capital
    #   bSharesCode                B-share code
    #   bSharesShortName           B-share short name
    #   bSharesIPODate             B-share listing date
    #   bSharesTotalCapital        B-share total capital
    #   bSharesOutstandingCaptial  B-share outstanding capital
    #   area                       area
    #   province                   province
    #   city                       city
    #   trade                      industry
    #   website                    company website
    #   status                     A-share status / B-share status

    def __init__(self, stockCode):
        '''Build the query URLs for ``stockCode``; no request is sent here.'''
        self.stockCode = stockCode
        self.basicURLA = self.__mergeBasicURL('COMMON_SSE_ZQPZ_GP_GPLB_C', stockCode)
        self.basicURLB = self.__mergeBasicURL('COMMON_SSE_ZQPZ_GP_GPLB_AGSSR_C', stockCode)
        self.basicURLC = self.__mergeBasicURL('COMMON_SSE_ZQPZ_GP_GPLB_BGSSR_C', stockCode)
        self.basicURLD = self.__mergeBasicURL('COMMON_SSE_ZQPZ_GP_GPLB_MSXX_C', stockCode)
        self.basicURLE = r'http://query.sse.com.cn/commonSoaQuery.do?jsonCallBack=jsonpCallback46644&isPagination=true&stockCode=' + str(stockCode) + '&tradeBeginDate=19700101&tradeEndDate=20161001&order=tradeBeginDate%7Cdesc&sqlId=PL_SCRL_SCRLB&pageHelp.pageNo=1&pageHelp.beginPage=1&pageHelp.cacheSize=1&pageHelp.endPage=1&pageHelp.pageSize=5&_=1475720975596'
        self.capitalURL = 'http://query.sse.com.cn/security/stock/queryCompanyStockStruct.do?jsonCallBack=jsonpCallback86976&isPagination=false&companyCode=' + str(stockCode) + '&_=1475732919742'
        # Lazily filled caches for the two record types.
        self.stockBasicInfo = None
        self.stockCapitalInfo = None

    def getCompanyCode(self):
        '''Return the ``COMPANY_CODE`` field of the basic record.'''
        return self.__getBasicValue('COMPANY_CODE')

    def getStatus(self):
        '''Return False when neither share class has a status or one is delisted.

        The A-share and B-share status descriptions are joined with ``/``;
        ``'-/-'`` or any occurrence of the delisting marker yields False,
        everything else True.
        '''
        v = self.__getBasicValue('STATE_CODE_A_DESC') + '/' + self.__getBasicValue('STATE_CODE_B_DESC')
        return not (v == '-/-' or u'摘牌' in v)

    def getCompanyShortName(self):
        '''Return ``COMPANY_ABBR`` and ``ENGLISH_ABBR`` joined with ``/``.'''
        return self.__getBasicValue('COMPANY_ABBR') + '/' + self.__getBasicValue('ENGLISH_ABBR')

    def getCompanyName(self):
        '''Return the ``FULLNAME`` field of the basic record.'''
        return self.__getBasicValue('FULLNAME')

    def getCompanyEnlishName(self):
        '''Return the ``FULL_NAME_IN_ENGLISH`` field of the basic record.'''
        return self.__getBasicValue('FULL_NAME_IN_ENGLISH')

    def getIpoAddress(self):
        '''Return the ``COMPANY_ADDRESS`` field of the basic record.'''
        return self.__getBasicValue('COMPANY_ADDRESS')

    def getASharesCode(self):
        '''Return the ``SECURITY_CODE_A`` field of the basic record.'''
        return self.__getBasicValue('SECURITY_CODE_A')

    def getASharesShortName(self):
        '''Return ``COMPANY_ABBR`` and ``ENGLISH_ABBR`` joined with ``/``.'''
        return self.__getBasicValue('COMPANY_ABBR') + '/' + self.__getBasicValue('ENGLISH_ABBR')

    def getASharesIPODate(self):
        '''Return ``LISTINGDATEA`` from the basicURLB record, or ``'-'``.'''
        return self.__getIpoDate(self.basicURLB, 'LISTINGDATEA')

    def getTotalCapital(self):
        '''Return the ``totalShares`` field of the capital record.'''
        return self.__getCapitalValue('totalShares')

    def getASharesTotalCapital(self):
        '''Return ``totalNonFlowShare`` plus ``AShares`` as a float.

        Components that are empty or ``'-'`` are skipped, so the result is
        ``0.0`` when neither value is available.
        '''
        components = (self.__getCapitalValue('totalNonFlowShare'),
                      self.getASharesOutstandingCaptial())
        total = 0.0
        for shares in components:
            if shares != '-' and shares:
                total += float(shares)
        return total

    def getASharesOutstandingCaptial(self):
        '''Return the ``AShares`` field of the capital record.'''
        return self.__getCapitalValue('AShares')

    def getBSharesTotalCapital(self):
        '''Return the same value as ``getBSharesOutstandingCaptial``.'''
        return self.getBSharesOutstandingCaptial()

    def getBSharesOutstandingCaptial(self):
        '''Return the ``BShares`` field of the capital record.'''
        return self.__getCapitalValue('BShares')

    def getBSharesCode(self):
        '''Return the ``SECURITY_CODE_B`` field of the basic record.'''
        return self.__getBasicValue('SECURITY_CODE_B')

    def getBSharesShortName(self):
        '''Return '' when the B-share code contains ``-``, else the A-share short name.'''
        if self.getBSharesCode().find('-') != -1:
            return ''
        return self.getASharesShortName()

    def getBSharesIPODate(self):
        '''Return ``LISTINGDATEB`` from the basicURLC record, or ``'-'``.'''
        return self.__getIpoDate(self.basicURLC, 'LISTINGDATEB')

    def getArea(self):
        '''Return the ``AREA_NAME_DESC`` field of the basic record.'''
        return self.__getBasicValue('AREA_NAME_DESC')

    def getProvince(self):
        '''Return the same value as ``getArea``.'''
        return self.getArea()

    def getCity(self):
        '''Return the same value as ``getArea``.'''
        return self.getArea()

    def getTrade(self):
        '''Return the ``SSE_CODE_DESC`` field of the basic record.'''
        # The CSRC industry (category / major class / middle class) is carried
        # by CSRC_CODE_DESC, CSRC_GREAT_CODE_DESC and CSRC_MIDDLE_CODE_DESC.
        return self.__getBasicValue('SSE_CODE_DESC')

    def getWebsite(self):
        '''Return the ``WWW_ADDRESS`` field of the basic record.'''
        return self.__getBasicValue('WWW_ADDRESS')

    @staticmethod
    def __extractJsonpPayload(text):
        '''Return the text between the first ``(`` and the last ``)``.

        Slicing up to the *last* closing parenthesis keeps payloads intact
        when a value itself contains parentheses.  Raises ``ValueError`` when
        either parenthesis is missing.
        '''
        start = text.index('(') + 1
        end = text.rindex(')')
        return text[start:end]

    @staticmethod
    def __toDict(record):
        '''Copy every attribute of ``record`` not starting with ``__`` into a dict.'''
        return dict((name, getattr(record, name))
                    for name in dir(record) if not name.startswith('__'))

    def __getIpoDate(self, url, key):
        '''Request ``url`` and return field ``key`` of its first record.

        Returns ``'-'`` when nothing was retrieved or any error occurs.
        '''
        try:
            rsDict = self.__getDatas(url)
            if rsDict == '-' or rsDict is None:
                return '-'
            ipoDate = self.__toDict(rsDict[0])
            # Echo the raw record to stdout before picking the field.
            print(ipoDate)
            return ipoDate.get(key)
        except Exception:
            return '-'

    def __getDatas(self, url, basicInfo=True):
        '''Request ``url`` and return the ``result`` member of its JSONP payload.

        ``basicInfo`` selects which page is sent as ``Referer`` (company page
        when True, capital page when False).

        Returns ``None`` when all attempts fail, ``'-'`` when ``result`` is
        empty, otherwise ``result`` decoded with ``JSONObject`` as object hook.
        '''
        request = urllib2.Request(url)
        request.add_header('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8')
        # NOTE(review): compressed encodings are advertised here but the body
        # is read below without decompression - confirm the endpoint always
        # answers uncompressed.
        request.add_header('Accept-Encoding', 'gzip, deflate, sdch')
        request.add_header('Accept-Language', 'zh-CN,zh;q=0.8,en;q=0.6')
        request.add_header('Cache-Control', 'max-age=0')
        request.add_header('Connection', 'keep-alive')
        request.add_header('Host', 'query.sse.com.cn')
        request.add_header('Upgrade-Insecure-Requests', '1')
        if basicInfo:
            request.add_header('Referer', 'http://www.sse.com.cn/assortment/stock/list/info/company/index.shtml?COMPANY_CODE=' + str(self.stockCode))
        else:
            request.add_header('Referer', 'http://www.sse.com.cn/assortment/stock/list/info/capital/index.shtml?COMPANY_CODE=' + str(self.stockCode))
        request.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.97 Safari/537.36')

        # Try up to five times; if every attempt fails, report it and give up.
        maxNum = 5
        response = None
        for _ in range(maxNum):
            try:
                response = urllib2.urlopen(url=request, timeout=15)
            except Exception:
                # Best effort: any request failure just triggers a retry.
                continue
            # Slow down after a successful request to avoid being blocked.
            sleep(5)
            break
        if response is None:
            print('URLError: All times is failed ')
            return None

        try:
            result = response.read()
        finally:
            response.close()

        str2JsonData = self.__extractJsonpPayload(str(result))
        pythonObjData = json.loads(str2JsonData, object_hook=JSONObject)
        if not pythonObjData.result:
            return '-'
        return pythonObjData.result

    def __getBasicValue(self, key):
        '''Return field ``key`` of the company's basic-information record.

        The record is requested on first use and cached in
        ``self.stockBasicInfo``.  Returns ``'-'`` when it cannot be retrieved.
        '''
        try:
            if self.stockBasicInfo is None:
                rsDict = self.__getDatas(self.basicURLA)
                if rsDict == '-' or rsDict is None:
                    return '-'
                self.stockBasicInfo = self.__toDict(rsDict[0])
            return self.stockBasicInfo.get(key)
        except Exception:
            return '-'

    def __getCapitalValue(self, key):
        '''Return field ``key`` of the company's capital-structure record.

        The record is requested on first use and cached in
        ``self.stockCapitalInfo``.  Returns ``'-'`` when it cannot be retrieved.
        '''
        try:
            if self.stockCapitalInfo is None:
                rsDict = self.__getDatas(self.capitalURL, basicInfo=False)
                if rsDict == '-' or rsDict is None:
                    return '-'
                self.stockCapitalInfo = self.__toDict(rsDict)
            return self.stockCapitalInfo.get(key)
        except Exception:
            return '-'

    def __mergeBasicURL(self, sqlId, stockCode):
        '''Return the commonQuery.do URL for ``sqlId`` and ``stockCode``.'''
        return 'http://query.sse.com.cn/commonQuery.do?jsonCallBack=jsonpCallback12345&isPagination=false&sqlId=' + sqlId + '&productid=' + str(stockCode) + '&_=14555555555552'
if __name__ == '__main__':
    # Demo: print every public indicator, one "<getter name> <value>" per line.
    # NOTE(review): the loop variable is never used - each pass queries the
    # hard-coded stock code 600013; confirm whether the loop value was meant
    # to be passed to the constructor instead.
    for _ in range(600001, 600003):
        a = AchieveSSEStockInfo(600013)
        for m in a.__public__:
            f = getattr(a, m)
            # Single-argument print form gives the same output as the
            # Python 2 statement ``print m, f()``.
            print('%s %s' % (m, f()))
附录:
1.使用requests库抓取页面的时候的编码问题 https://segmentfault.com/q/1010000000341014
7.Applying borders to a cell in OpenPyxl http://stackoverflow.com/questions/24917201/applying-borders-to-a-cell-in-openpyxl
后记:
目前上交所已经提供A股上市公司xls的下载了,虽然信息不太完整,链接地址http://query.sse.com.cn/security/stock/downloadStockListFile.do?csrcCode=&stockCode=&areaName=&stockType=1