最近在对日线进行分析回测时,需要用到股票的复权因子,因为TuShare的复权因子并没有提供公开调用API,这里考虑从Sina下载并解析。
主要参考了这篇文章:用Python从sina下载复权因子
新浪财经的复权因子数据例子,例如600000股票2017年第一季度复权因子在这里:http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_FuQuanMarketHistory/stockid/600000.phtml?year=2017&jidu=1
print '\xc0\xfa\xca\xb7\xbd\xbb\xd2\xd7'.decode('GB2312') #将GB2312编码打印出汉字
import numpy as np
import pandas as pd
import tushare as ts
import datetime
import time
import tushare as ts
import os
import urllib2 #urllib2是Python的一个获取URLs的组件
import re #python的正则表达式模块
#global variables
data_dir = 'D:\\python_study\\stock_hist_data\\' #directory where downloaded factor files are cached
sleep_time=2 #inter-request delay in seconds; default 2s, adapted up/down by get_former_right_factor
#Locate the first line that contains a given substring.
def find_str(str_list, str):
    """Return the index of the first element of str_list that contains
    the substring str.

    If no element matches (including when str_list is empty), the value
    returned is len(str_list) -- callers rely on that sentinel.
    """
    for row, line in enumerate(str_list):
        if line.find(str) != -1:
            return row
    return len(str_list)
#Parser for the second page layout.
def deco_url2(url_list, str):
    """Extract adjustment factors from the lines of a Sina quarterly page.

    url_list -- the page source as a list of lines (GB2312-encoded bytes)
    str      -- the stock symbol, appended to every record

    Returns a list of 'date,factor,symbol' strings; empty when the quarter
    has no data.
    """
    begin_mark = '\xc0\xfa\xca\xb7\xbd\xbb\xd2\xd7begin-->'  # 'historical trades begin-->' in GB2312 bytes
    end_mark = '\xc0\xfa\xca\xb7\xbd\xbb\xd2\xd7end-->'      # 'historical trades end-->'
    begin_row = find_str(url_list, begin_mark)
    end_row = find_str(url_list, end_mark)
    factor = []
    # A begin->end span of exactly 67 (= 966-899) lines marks an empty table.
    if end_row - begin_row == 966 - 899:
        return factor
    # Find the table header line ('quarterly adjusted historical trades' in GB2312).
    header_row = find_str(url_list, '\xbc\xbe\xb6\xc8\xb8\xb4\xc8\xa8\xc0\xfa\xca\xb7\xbd\xbb\xd2\xd7')
    # Offsets below were counted by hand from the rendered page source:
    # first date line, first factor line, and the spacing between records.
    date_row = header_row + 17
    value_row = header_row + 26
    step = 14
    while value_row <= end_row - 7:
        day = url_list[date_row][-14:-4]  # yyyy-mm-dd slice of the date line
        match = re.search('\d+(\.\d*)?', url_list[value_row])
        factor.append((day + ',' + match.group() + ',' + str).strip())
        date_row += step
        value_row += step
    return factor
#Parser for the first page layout.
def deco_url1(url_list, str):
    """Extract adjustment factors from a Sina quarterly page whose records
    are spaced differently from the deco_url2 layout.

    url_list -- the page source as a list of lines (GB2312-encoded bytes)
    str      -- the stock symbol, appended to every record

    Returns a list of 'date,factor,symbol' strings; empty when the quarter
    has no data.  NOTE(review): the dispatch between this parser and
    deco_url2 is commented out at the call site -- presumably this variant
    handles pages from 2006 Q2 onward; confirm before re-enabling.
    """
    s='\xc0\xfa\xca\xb7\xbd\xbb\xd2\xd7begin-->' #start marker: 'historical trades begin-->' (GB2312 bytes)
    e='\xc0\xfa\xca\xb7\xbd\xbb\xd2\xd7end-->' #end marker: 'historical trades end-->'
    s_idx = find_str(url_list, s)
    e_idx = find_str(url_list, e)
    factor=[]
    if e_idx - s_idx == 966-899: #a span of exactly 67 lines means the quarter holds no data
        return factor
    else:
        x='\xbc\xbe\xb6\xc8\xb8\xb4\xc8\xa8\xc0\xfa\xca\xb7\xbd\xbb\xd2\xd7' #table header: 'quarterly adjusted historical trades'
        x_idx=find_str(url_list,x)
        time_row=x_idx+18 #hand-counted offset of the first date line in the page source
        factor_row=x_idx+26 #hand-counted offset of the first factor line
        delta = 14 #lines between consecutive records in this layout
        while factor_row <= e_idx-7:
            #the date slice position depends on which spacing variant is active
            if delta == 14:
                t = url_list[time_row-1][5:15]
            if delta == 11:
                t = url_list[time_row-1][6:16]
            #print t, url_list[factor_row-1]
            f_re = re.search('\d+(\.\d*)?',url_list[factor_row-1])
            if f_re == None:
                #no factor found at the expected line: the page switches to a
                #tighter 11-line spacing mid-table, so re-anchor and retry once
                time_row -= 2
                factor_row -= 3
                delta = 11
                ti = url_list[time_row-1].find('2') #date starts at the first '2' (years 2xxx)
                t = url_list[time_row-1][ti:(10+ti)]
                #print url_list[factor_row-1]
                f_re = re.search('\d+(\.\d*)?',url_list[factor_row-1])
            temp = t+','+f_re.group()+','+str
            factor.append(temp.strip())
            time_row += delta
            factor_row += delta
        return factor
#Collect the ticker symbols of interest (CSI 300 constituents only).
def get_all_stock_id():
    """Return the stock codes of all CSI 300 constituents via TuShare."""
    hs300 = ts.get_hs300s()
    return hs300['code'].values
#sample: get_former_right_factor('000001', 2017, '1')
def get_former_right_factor(sym, year, jidu):
global sleep_time,data_dir
l1="http://vip.stock.finance.sina.com.cn/corp/go.php/vMS_FuQuanMarketHistory/stockid/"
l2=".phtml?year="
l3="&jidu="
str_year = str(year)
file_path = data_dir + sym + '\\' + str(year) + '\\'
file_name = str(year) + '_' + jidu + '_factor.txt'
if not os.path.exists(file_path):
os.makedirs(file_path)
if not os.path.exists(file_path+file_name):
link=l1 + sym + l2 + str(year) + l3 + jidu
try:
url_test = urllib2.urlopen(link)
except IOError, msg:
print str(msg).decode('UTF-8')
sleep_time=min(sleep_time*2, 512)#每次下载失败后sleep_time翻倍,但是最大512s (新浪为防止恶意查询设置的封禁间隔为6min=360s)
print 'Get factor data error: symbol: '+ sym + ', year: ' + str_year + ', jidu: ' + jidu + ', sleep time is: '+str(sleep_time)
return True
else:
url_content = url_test.readlines()
time = str(year)+jidu
#if time >= '20062':
# s = deco_url1(url_content,sym)
#else:
s = deco_url2(url_content,sym)
#save to files
fobj= open(file_path + file_name, 'w')
for ss in s:
fobj.write(ss+'\n')
fobj.close()
sleep_time=max(sleep_time/2, 2) #每次成功下载后sleep_time变为一半,但是至少2s
return True
#Main driver: fetch factors for every CSI 300 stock, years 2010-2016, all quarters.
years = range(2010, 2017, 1)
stocks = get_all_stock_id()
for stock in stocks:
    for year in years:
        for jd in ('4', '3', '2', '1'):  #the four quarters, newest first
            get_former_right_factor(stock, year, jd)
            time.sleep(sleep_time)  #throttle requests to stay under Sina's ban threshold