http://www.quantstart.com/articles/Basics-of-Statistical-Mean-Reversion-Testing
trend following/momentum和mean-reversion是两种最基本的设计策略的思路,对于后者而言,首先我们要知道time-series是不是满足mean-reversion的。从数学上说,连续的均值回复过程就是随机过程中的OU过程(Ornstein-Uhlenbeck),不同于布朗运动。我们可以利用pandas和statsmodel进行ADF检验来判断均值回复性(均值回复性是平稳性的必要条件),以判断序列是不是具有均值回复性。如果我们得到的统计量大于临界值,那么不能拒绝原假设,即序列是非均值回复的,而是随机游走。如果是从网络数据库抓取数据,上面的过程可以写为
# Download the GOOG OHLCV series used by the ADF test below.
# (Originally collapsed onto a single line; reformatted one statement per line.)

# Import the Time Series library
import statsmodels.tsa.stattools as ts

# Import Datetime and the Pandas DataReader
from datetime import datetime
# NOTE(review): pandas.io.data was removed in pandas >= 0.19; modern code
# should use the pandas_datareader package instead -- kept as-is here to
# match the rest of the file.
from pandas.io.data import DataReader

# Download the Google OHLCV data from 1/1/2000 to 1/1/2013
goog = DataReader("GOOG", "yahoo", datetime(2000, 1, 1), datetime(2013, 1, 1))
首先对时间序列进行单位根检验
# BUG FIX: the original `ts.adfuller(,1)` is a syntax error -- the series to
# test was missing. Per the note further down ("the Adjusted Close price for
# the ADF test given above"), the intended input is goog['Adj Close'];
# the second positional argument is maxlag=1.
ts.adfuller(goog['Adj Close'], 1)
我们还可以用Hurst Exponent(赫斯特指数)对平稳性进行检验
H<0.5 - The time series is mean reverting
H=0.5 - The time series is a Geometric Brownian Motion
H>0.5 - The time series is trending
# Hurst-exponent utilities (originally collapsed onto a single line;
# reformatted, Python-2 `print` statements converted to the function form,
# and the demo code guarded so importing this module has no side effects).
from numpy import cumsum, log, polyfit, sqrt, std, subtract
from numpy.random import randn


def hurst(ts):
    """Return the Hurst Exponent of the time series vector ts.

    Interpretation: H < 0.5 -> mean reverting, H ~ 0.5 -> (geometric)
    Brownian motion, H > 0.5 -> trending.
    """
    # Create the range of lag values
    lags = range(2, 100)
    # Calculate the array of the variances of the lagged differences
    tau = [sqrt(std(subtract(ts[lag:], ts[:-lag]))) for lag in lags]
    # Use a linear fit to estimate the Hurst Exponent:
    # log(tau) ~ (H/2) * log(lag), so twice the slope is H.
    poly = polyfit(log(lags), log(tau), 1)
    # Return the Hurst exponent from the polyfit output
    return poly[0] * 2.0


if __name__ == "__main__":
    # Create a Geometric Brownian Motion, Mean-Reverting and Trending series
    gbm = log(cumsum(randn(100000)) + 1000)
    mr = log(randn(100000) + 1000)
    tr = log(cumsum(randn(100000) + 1) + 1000)

    # Output the Hurst Exponent for each of the above series and for the
    # price of Google (the Adjusted Close price) from the ADF snippet above.
    print("Hurst(GBM): %s" % hurst(gbm))
    print("Hurst(MR): %s" % hurst(mr))
    print("Hurst(TR): %s" % hurst(tr))
    # Assuming you have run the earlier snippet to obtain `goog`!
    print("Hurst(GOOG): %s" % hurst(goog['Adj Close']))

# Next topic: the Cointegrated Augmented Dickey-Fuller (CADF) test.
# Two series are cointegrated when some linear combination of them is
# stationary.
#做统计套利时我们常常要作散点图以及进行协整检验
# cadf.py
import datetime
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import pandas as pd
import pandas.io.data as web
import pprint
import statsmodels.tsa.stattools as ts
from pandas.stats.api import ols
def plot_price_series(df, ts1, ts2):
    """Plot the daily price series df[ts1] and df[ts2] on a shared axes."""
    fig, axes = plt.subplots()
    # Draw both series in the given order so legend order matches arguments.
    for column in (ts1, ts2):
        axes.plot(df.index, df[column], label=column)
    # One major tick per month (DayLocator / HourLocator are alternatives).
    axes.xaxis.set_major_locator(mdates.MonthLocator())
    axes.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
    axes.set_xlim(datetime.datetime(2012, 1, 1), datetime.datetime(2013, 1, 1))
    axes.grid(True)
    fig.autofmt_xdate()

    plt.xlabel('Month/Year')
    plt.ylabel('Price ($)')
    plt.title('%s and %s Daily Prices' % (ts1, ts2))
    plt.legend()
    plt.show()
import datetime
#import numpy as np
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
#import pandas as pd
#import pandas.io.data as web
#import pprint
import statsmodels.tsa.stattools as ts
from pandas.stats.api import ols
#我们的main函数里面的DataFrame就是d,而程序处理后的df就是mergeDF和Pnl等结果
def plot_price_series(df, ts1, ts2):
    """Plot the intraday price series df[ts1] and df[ts2] on a shared axes.

    The DataFrame index must already be reset to plain timestamps -- the raw
    data is indexed by (pTime, sym_).
    """
    hours = mdates.HourLocator()  # could also be DayLocator / MonthLocator
    fig, ax = plt.subplots()
    ax.plot(df.index, df[ts1], label=ts1)
    ax.plot(df.index, df[ts2], label=ts2)
    ax.xaxis.set_major_locator(hours)
    # BUG FIX: with an HourLocator the original '%b %Y' format printed the
    # same month/year on every tick; show the hour of day as well.
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:%M %d %b %Y'))
    # BUG FIX: the original limits were reversed (start 2014-01-02 after end
    # 2014-01-01). Per the original note, these must match the sample period
    # -- TODO confirm the actual sample dates against the data.
    ax.set_xlim(datetime.datetime(2014, 1, 1), datetime.datetime(2014, 1, 2))
    ax.grid(True)
    fig.autofmt_xdate()

    plt.xlabel('Hour/Year')
    plt.ylabel('TrdPriceLast ($)')
    plt.title('%s and %s Prices' % (ts1, ts2))
    plt.legend()
    plt.show()
def plot_scatter_series(df, ts1, ts2):
    """Draw a scatter plot of df[ts1] against df[ts2]."""
    plt.scatter(df[ts1], df[ts2])
    plt.xlabel('%s TrdPriceLast ($)' % ts1)
    plt.ylabel('%s TrdPriceLast ($)' % ts2)
    plt.title('%s and %s Price Scatterplot' % (ts1, ts2))
    plt.show()
def plot_residuals(df):
    """Plot the residual series df["res"] of the cointegrating regression."""
    hours = mdates.HourLocator()
    fig, ax = plt.subplots()
    ax.plot(df.index, df["res"], label="Residuals")
    # BUG FIX: the original passed the undefined name `Hours` (NameError);
    # the locator variable is lowercase `hours`.
    ax.xaxis.set_major_locator(hours)
    ax.xaxis.set_major_formatter(mdates.DateFormatter('%b %Y'))
    # BUG FIX: the original limits were reversed (start after end) --
    # TODO confirm the actual sample period against the data.
    ax.set_xlim(datetime.datetime(2014, 1, 1), datetime.datetime(2014, 1, 2))
    ax.grid(True)
    fig.autofmt_xdate()

    plt.xlabel('Hour/Year')
    plt.ylabel('TrdPriceLast ($)')
    plt.title('Residual Plot')
    plt.legend()
    # NOTE(review): a trailing plt.plot(df["res"]) drew the same data a second
    # time against integer x-positions on the current axes; removed as
    # redundant.
    plt.show()
if __name__ == "__main__":
    # BUG FIX: start/end were reversed in the original (DataReader requires
    # start before end).
    start = datetime.datetime(2014, 1, 1)
    end = datetime.datetime(2014, 1, 2)
    arex = web.DataReader("AREX", "yahoo", start, end)
    wll = web.DataReader("WLL", "yahoo", start, end)

    # NOTE(review): `d` is assumed to be a DataFrame loaded elsewhere with a
    # (pTime, sym) index and a TrdPriceLast column (see the comment near the
    # imports) -- confirm how it is produced upstream.
    d = d.reset_index()
    d = d.set_index(["pTime"])

    # BUG FIX: the frame must share d's pTime index -- the original used
    # arex.index, so the assignments below would not have aligned. Also,
    # `TrdPriceLast` was an unquoted bare name (NameError); column names
    # must be strings.
    df = pd.DataFrame(index=d.index.unique())
    df["D"] = d[d["sym"] == "D"]["TrdPriceLast"]
    df["E"] = d[d["sym"] == "E"]["TrdPriceLast"]

    # Plot the two time series
    plot_price_series(df, "D", "E")

    # Display a scatter plot of the two time series
    plot_scatter_series(df, "D", "E")

    # Calculate optimal hedge ratio "beta" by regressing D on E
    res = ols(y=df['D'], x=df["E"])
    beta_hr = res.beta.x

    # Calculate the residuals of the linear combination
    df["res"] = df["D"] - beta_hr * df["E"]

    # Plot the residuals
    plot_residuals(df)

    # Calculate and output the CADF test on the residuals
    cadf = ts.adfuller(df["res"])
    pprint.pprint(cadf)