使用Python爬取港交所股票行情数据——附Python源码

GitHub地址:https://github.com/yangwohenmai/TEST/blob/master/GetAPIData/GetHKData/%E8%AF%BB%E5%8F%96%E6%B8%AF%E4%BA%A4%E6%89%80%E6%95%B0%E6%8D%AE.py

爬取港交所数据最大的问题是如何获取港交所页面的Token,有了Token之后就可以从港交所接口请求数据了。

下面这段Python代码首先请求并解析港交所页面,从页面中提取Token值,然后携带Token去请求行情接口。接口返回的数据格式类似于Json,但不能直接解析:从代码看,需要先去掉开头的5个字符和结尾的1个字符,剩下的部分就可以用Json解析了。

完整的json数据如下:

{{
  "data": {
    "responsecode": "000",
    "responsemsg": "",
    "quote": {
      "hi": "74.350",
      "rs_stock_flag": false,
      "fiscal_year_end": "31 Dec 2018",
      "hist_closedate": "30 May 2019",
      "replication_method": null,
      "amt_os": "3,856,240,500",
      "primaryexch": "HKEX",
      "ric": "0001.HK",
      "product_subtype": null,
      "db_updatetime": "31 May 2019 09:36",
      "mkt_cap_u": "B",
      "am_u": "M",
      "ew_sub_right": "",
      "secondary_listing": false,
      "ew_amt_os_cur": null,
      "ccy": "HKD",
      "management_fee": "",
      "ew_underlying_code": null,
      "trdstatus": "N",
      "nav": "",
      "original_offer_price": "",
      "issue": "",
      "asset_class": null,
      "eps": 10.1109,
      "inline_upper_strike_price": "",
      "sedol": "BW9P816",
      "am": "697.27",
      "iv": "",
      "ew_strike": "",
      "as": "74.100",
      "geographic_focus": null,
      "incorpin": "Cayman Islands",
      "etp_baseCur": null,
      "ew_amt_os": "",
      "bd": "74.050",
      "registrar": "Computershare Hong Kong Investor Services Ltd.",
      "depositary": null,
      "exotic_type": null,
      "callput_indicator": null,
      "primary_market": null,
      "underlying_index": null,
      "lot": "500",
      "lo52": "72.800",
      "shares_issued_date": "30 Apr 2019",
      "premium": "",
      "strike_price_ccy": null,
      "yield": "",
      "vo_u": "M",
      "base_currency": null,
      "coupon": "",
      "expiry_date": "",
      "chairman": "Li Tzar Kuoi Victor",
      "underlying_ric": "0001.HK",
      "hi52": "92.500",
      "issuer_name": "CK Hutchison Holdings Ltd.",
      "h_share_flag": false,
      "ew_sub_per_from": "",
      "div_yield": "4.28",
      "interest_payment_date": "-",
      "updatetime": "31 May 2019 16:08",
      "aum_date": "",
      "lo": "73.050",
      "mkt_cap": "285.55",
      "f_aum_hkd": null,
      "ew_sub_per_to": "",
      "ls": "74.050",
      "nav_date": "",
      "csic_classification": null,
      "floating_flag": false,
      "issued_shares_note": null,
      "eff_gear": "",
      "board_lot_nominal": "",
      "hsic_ind_classification": "Conglomerates - Conglomerates",
      "ew_desc": null,
      "inception_date": "",
      "nc": "+1.050",
      "aum": "",
      "vo": "9.41",
      "secondary_listing_flag": false,
      "listing_date": "1 Nov 1972",
      "as_at_label": "as at",
      "ew_amt_os_dat": "",
      "nm": "CK Hutchison Holdings Ltd.",
      "nm_s": "CKH HOLDINGS",
      "sym": "1",
      "inline_lower_strike_price": "",
      "listing_category": "Primary Listing",
      "ew_strike_cur": null,
      "exotic_warrant_indicator": null,
      "investment_focus": null,
      "call_price": "",
      "tck": "0.050",
      "strike_price": "",
      "summary": "CK Hutchison Holdings Limited is an investment holding company mainly engaged in the retail business. Along with subsidiaries, the Company operates its business through five segments: the Retail segment, the Telecommunications segment, the Infrastructure segment, the Ports and Related Services segment, and the Husky Energy segment. The Retail segment is involved in the manufacturing and sale of health and beauty products, as well as consumer electronics and electrical appliances. It also operates supermarkets, as well as manufactures and distributes bottled water and beverage products. The Telecommunications segment provides mobile telecommunications and data services by 3 Group Europe, Hutchison Telecommunications Hong Kong Holdings, and Hutchison Asia Telecommunications. The Infrastructure segment is involved in the energy infrastructure, transportation infrastructure, water infrastructure, waste management, waste-to-energy and infrastructure related businesses.",
      "op": "73.050",
      "aum_u": "",
      "nav_ccy": null,
      "os": "",
      "wnt_gear": "",
      "transfer_of_listing_date": "",
      "hsic_sub_sector_classification": "Conglomerates",
      "amt_ccy": null,
      "domicile_country": null,
      "entitlement_ratio": "",
      "product_type": "EQTY",
      "office_address": "48th Floor\nCheung Kong Center\n2 Queen's Road Central\nHong Kong",
      "pc": "+1.44",
      "days_to_expiry": null,
      "underlying_code": null,
      "pe": "7.32",
      "eps_ccy": "HKD",
      "hdr": false,
      "launch_date": "",
      "hc": "73.000",
      "isin": "KYG217651051",
      "moneyness": ""
    }
  },
  "qid": "NULL"
}}

程序里我随便挑了几个字段输出出来

#coding=utf-8
#!/usr/bin/python
# 导入requests库
import requests
# 导入文件操作库
import os
import re
import bs4
from bs4 import BeautifulSoup
import sys
import json



# Helpers used by main() to pull the access token out of the HKEX page and
# to unwrap the callback-wrapped response.
def _extract_token(page_text):
    """Return the token that follows 'Base64-AES-Encrypted-Token' in *page_text*.

    The lookup mirrors the fixed offsets of the original scraper: take the
    120 characters after the marker, find 'return' inside them, skip the two
    characters after it (presumably a space and the opening quote -- verify
    against the live page) and read up to the next double quote.

    Raises:
        ValueError: if the marker, the 'return' keyword or the closing
            quote cannot be found.
    """
    marker = re.search('Base64-AES-Encrypted-Token', page_text)
    if marker is None:
        raise ValueError('Base64-AES-Encrypted-Token marker not found in page')
    window = page_text[marker.end():marker.end() + 120]
    keyword = re.search('return', window)
    if keyword is None:
        raise ValueError("'return' not found after the token marker")
    candidate = window[keyword.end() + 2:keyword.end() + 100]
    token, closing_quote, _ = candidate.partition('"')
    if not closing_quote:
        raise ValueError('closing quote of the token not found')
    return token


def _strip_jsonp(payload):
    """Return the JSON text inside a ``callback(...)`` wrapper.

    A payload that already starts with '{' is returned unchanged.

    Raises:
        ValueError: if no enclosing pair of parentheses is found.
    """
    stripped = payload.strip()
    if stripped.startswith('{'):
        return stripped
    start = stripped.find('(')
    end = stripped.rfind(')')
    if start == -1 or end <= start:
        raise ValueError('response is not wrapped in callback(...)')
    return stripped[start + 1:end]


# Main entry point
def main(sym=1):
    """Fetch one equity quote from the HKEX widget endpoint and print a few fields.

    The function downloads the HKEX home page, extracts the access token
    embedded in it, requests the quote for *sym* and prints the token, the
    request URL, the decoded response and five selected quote fields.

    Args:
        sym: Stock code to query. It is converted with ``int()``, so leading
            zeros are dropped (``'00001'`` and ``1`` are equivalent).
            Defaults to 1.

    Raises:
        ValueError: if the token cannot be located in the page or the
            response is not in the expected wrapped form.
        requests.HTTPError: if either HTTP request returns an error status.
    """
    # Send a Chrome-like User-Agent header with every request.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'}
    home_url = 'https://www.hkex.com.hk/?sc_lang=EN'
    res = requests.get(home_url, headers=headers, timeout=30)
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html.parser')

    # Try the parsed page text first (the original behaviour). Newer
    # BeautifulSoup releases may leave <script> contents out of .text, so
    # fall back to the raw HTML when the token is not found there.
    try:
        token = _extract_token(soup.text)
    except ValueError:
        token = _extract_token(res.text)
    print('找到了token:' + token)

    # The token is inserted as-is, exactly as before; it is not re-encoded.
    result = 'https://www1.hkex.com.hk/hkexwidget/data/getequityquote?sym=%s&token=%s&lang=eng&qid=NULL&callback=NULL' % (int(sym), token)
    print(result)

    res = requests.get(result, headers=headers, timeout=30)
    res.raise_for_status()
    # Decode the raw response body once; running it through an HTML parser
    # first could alter the payload.
    jsonstr = json.loads(_strip_jsonp(res.text))
    print(jsonstr)

    quote = jsonstr['data']['quote']
    for field in ('hi', 'db_updatetime', 'amt_os', 'ric', 'primaryexch'):
        print(quote[field])
    # To persist the response, pass jsonstr to WriteTxt(message, path, filmname).

        
def WriteTxt(message, path, filmname):
    """Append *message* to the text file ``_<filmname>.txt`` inside *path*.

    Each call appends a newline, a colon, another newline and then the
    string form of *message*. The file is opened in append mode with UTF-8
    encoding, so it is created if it does not exist.

    Args:
        message: Content to store; any object, converted with ``str()``.
        path: Directory in which the file is written.
        filmname: File name without the leading underscore and the
            ``.txt`` extension.
    """
    # str() rather than '%s' % message: the %-operator treats a tuple as an
    # argument list, so a tuple message would raise TypeError or be unwrapped.
    # The leading "\n:\n" is kept so existing files keep the same layout
    # (a timestamp before the colon was commented out in the original).
    entry = '\n:\n' + str(message)
    file_name = os.path.join(path, '_' + filmname + '.txt')
    with open(file_name, 'a', encoding='utf-8') as f:
        f.write(entry)



# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

输出结果的最后几行(即所选几个字段的值)如下:

74.350
31 May 2019 09:36
3,856,240,500
0001.HK
HKEX

这中间有个问题要说一下,文中我只是拿00001这个代码做了个例子,所以向接口发出一次请求,只返回了一个代码的数据。如果你想每天批量爬取港股所有的行情数据,首先你要建立一个港股所有股票的码表,通过遍历这个码表,把每个股票代码对应的数据取出来。

取数据的核心请求链接是:

https://www1.hkex.com.hk/hkexwidget/data/getequityquote?sym=1&token=%s&lang=eng&qid=NULL&callback=NULL

其中链接里sym=1这个地方就是对应的股票代码。这里的股票代码是00001,在链接里要把前面的0都去掉。同理,如果你想获取00002这个股票代码的数据,那么链接里就要写sym=2。

每次替换sym后面对应的数字,就能获取相应股票的行情数据。

C#版本:使用C#爬取港交所股票行情数据——附C#源码

你可能感兴趣的:(小技术_Python)