1、加载Python包
import pandas as pd
import math
import numpy as np
import datetime, time
from sklearn import preprocessing, cross_validation, svm
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')
2、读入数据集
# Path of the AU9999 historical trading-data export. The file is decoded
# with the GBK codec that is passed to read_csv below.
data_path = '/Users/MichaelDeng/PycharmProjects/Python_finance/AU9999历史交易数据(2011-12-01至2017-08-18).csv'
finance_data = pd.read_csv(data_path, encoding="gbk")
3、查看数据集
# Quick look at the raw frame: container type, row index, column labels and
# summary statistics. Single-argument print(...) parses on Python 3 and
# prints the same text on Python 2.
print(type(finance_data))
print(finance_data.index)
print(finance_data.columns)
print(finance_data.describe())
# Bare expression: only rendered by an interactive shell / notebook.
finance_data.head()
4、对数据集处理
# Keep the date, open/high/low/close and volume columns. .copy() makes df an
# independent frame, so the column assignment below does not write into a
# slice of finance_data (which pandas reports as SettingWithCopyWarning).
df = finance_data[[u'日期', u'开盘价', u'最高价', u'最低价', u'收盘价', u'成交量(公斤)']].copy()
# NOTE(review): 'Adj. Hign' looks like a typo for 'Adj. High'. It is kept
# because later code in this file selects the column by this exact label;
# rename both places together.
df.columns = ['Date', 'Adj. Open', 'Adj. Hign', 'Adj. Low', 'Adj. Close', 'Adj. Volume']
# Parse the dates, use them as the index, and order the rows oldest-first.
df['Date'] = pd.to_datetime(df['Date'])
df.index = df['Date']
df = df.sort_index(ascending=True)
# The dates now live in the index, so the column is redundant.
del df['Date']
df.head()
| Date | Adj. Open | Adj. Hign | Adj. Low | Adj. Close | Adj. Volume |
|---|---|---|---|---|---|
| 2011-12-01 | 353.38 | 359.00 | 353.00 | 357.50 | 27,800.00 |
| 2011-12-02 | 358.50 | 358.70 | 356.00 | 357.74 | 35,874.00 |
| 2011-12-05 | 360.00 | 360.00 | 356.00 | 357.50 | 43,370.00 |
| 2011-12-06 | 357.50 | 357.60 | 351.21 | 351.50 | 45,684.00 |
| 2011-12-07 | 353.48 | 354.99 | 350.00 | 354.60 | 39,684.00 |
5、处理异常行数据
# Index entries for the 2014-05-02 row that this step removes. The lookup is
# done once and reused for the drop.
bad_rows = df[df.index == '2014-05-02'].index
df.drop(bad_rows, inplace=True)
# Re-inspect the frame after the removal. Single-argument print(...) parses
# on Python 3 and prints the same text on Python 2.
print(df.index)
print(df.columns)
print(df.describe())
df.head()
DatetimeIndex(['2011-12-01', '2011-12-02', '2011-12-05', '2011-12-06',
               '2011-12-07', '2011-12-08', '2011-12-09', '2011-12-12',
               '2011-12-13', '2011-12-14',
               ...
               '2017-08-07', '2017-08-08', '2017-08-09', '2017-08-10',
               '2017-08-11', '2017-08-14', '2017-08-15', '2017-08-16',
               '2017-08-17', '2017-08-18'],
              dtype='datetime64[ns]', name=u'Date', length=1397, freq=None)
Index([u'Adj. Open', u'Adj. Hign', u'Adj. Low', u'Adj. Close', u'Adj. Volume'], dtype='object')
         Adj. Open    Adj. Hign     Adj. Low   Adj. Close
count  1397.000000  1397.000000  1397.000000  1397.000000
mean    276.691918   277.820988   274.407158   276.071790
std      38.338578    38.187169    38.099712    38.020006
min     210.790000   218.500000   167.500000   216.900000
25%     245.990000   247.000000   244.000000   245.700000
50%     265.980000   267.000000   264.000000   265.350000
75%     291.990000   293.990000   288.200000   290.800000
max     362.990000   363.000000   360.500000   362.000000
| Date | Adj. Open | Adj. Hign | Adj. Low | Adj. Close | Adj. Volume |
|---|---|---|---|---|---|
| 2011-12-01 | 353.38 | 359.00 | 353.00 | 357.50 | 27,800.00 |
| 2011-12-02 | 358.50 | 358.70 | 356.00 | 357.74 | 35,874.00 |
| 2011-12-05 | 360.00 | 360.00 | 356.00 | 357.50 | 43,370.00 |
| 2011-12-06 | 357.50 | 357.60 | 351.21 | 351.50 | 45,684.00 |
| 2011-12-07 | 353.48 | 354.99 | 350.00 | 354.60 | 39,684.00 |
6、某些列进行数据类型转换
# 'Adj. Volume' holds text with thousands separators (e.g. "27,800.00").
# Clean the whole column in one pass instead of assigning cell by cell
# through chained indexing:
#   1. encode/decode with errors='ignore' drops any non-ASCII characters,
#      as the per-cell .encode('ascii', 'ignore') did, but yields text
#      rather than bytes, so the comma removal also works on Python 3;
#   2. remove the comma separators;
#   3. convert the cleaned text to float64.
volume_text = df['Adj. Volume'].str.encode('ascii', 'ignore').str.decode('ascii')
df['Adj. Volume'] = volume_text.str.replace(',', '').astype(np.float64)
df.describe()
df.head()
| Date | Adj. Open | Adj. Hign | Adj. Low | Adj. Close | Adj. Volume |
|---|---|---|---|---|---|
| 2011-12-01 | 353.38 | 359.00 | 353.00 | 357.50 | 27800.0 |
| 2011-12-02 | 358.50 | 358.70 | 356.00 | 357.74 | 35874.0 |
| 2011-12-05 | 360.00 | 360.00 | 356.00 | 357.50 | 43370.0 |
| 2011-12-06 | 357.50 | 357.60 | 351.21 | 351.50 | 45684.0 |
| 2011-12-07 | 353.48 | 354.99 | 350.00 | 354.60 | 39684.0 |
7、计算其他特征指标
# High relative to the close, as a percentage of the close.
# NOTE(review): the name HL_PCT suggests high-minus-low, but the formula is
# high-minus-close -- confirm which one is intended.
df['HL_PCT'] = (df['Adj. Hign'] - df['Adj. Close']) / df['Adj. Close'] * 100.0
# Close relative to the open, as a percentage of the open.
df['CO_PCT'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0
# Keep only the model inputs. .copy() makes the result an independent frame,
# so the in-place fillna and the new 'label' column added later in this file
# do not operate on a slice (SettingWithCopyWarning).
df = df[['Adj. Close', 'HL_PCT', 'CO_PCT', 'Adj. Volume']].copy()
df.head()
| Date | Adj. Close | HL_PCT | CO_PCT | Adj. Volume |
|---|---|---|---|---|
| 2011-12-01 | 357.50 | 0.419580 | 1.165884 | 27800.0 |
| 2011-12-02 | 357.74 | 0.268351 | -0.211994 | 35874.0 |
| 2011-12-05 | 357.50 | 0.699301 | -0.694444 | 43370.0 |
| 2011-12-06 | 351.50 | 1.735420 | -1.678322 | 45684.0 |
| 2011-12-07 | 354.60 | 0.109983 | 0.316850 | 39684.0 |
8、构造预测指标
# Horizon, in rows: each row is labelled with the target value found
# forecast_out rows further down the frame.
forecast_out = 7
forecast_col = 'Adj. Close'
# Replace missing values with a sentinel before building the label.
df.fillna(value=-99999, inplace=True)
# A negative shift moves later values up to earlier rows, which leaves the
# last forecast_out labels as NaN.
df['label'] = df[forecast_col].shift(periods=-forecast_out)
df.head()
| Date | Adj. Close | HL_PCT | CO_PCT | Adj. Volume | label |
|---|---|---|---|---|---|
| 2011-12-01 | 357.50 | 0.419580 | 1.165884 | 27800.0 | 346.35 |
| 2011-12-02 | 357.74 | 0.268351 | -0.211994 | 35874.0 | 339.30 |
| 2011-12-05 | 357.50 | 0.699301 | -0.694444 | 43370.0 | 338.00 |
| 2011-12-06 | 351.50 | 1.735420 | -1.678322 | 45684.0 | 322.30 |
| 2011-12-07 | 354.60 | 0.109983 | 0.316850 | 39684.0 | 329.30 |
9、特征指标数据缩放,预测数据分割
# Feature matrix: every column except the target. axis is passed by keyword
# because pandas no longer accepts it positionally in drop().
X = np.array(df.drop(['label'], axis=1))
X = preprocessing.scale(X)
# Discard the last forecast_out rows, whose label is NaN after the shift.
X = X[:-forecast_out]
# NOTE(review): this slice is taken after the truncation above, so it holds
# the last forecast_out rows that still have labels, not the unlabeled tail
# of the data -- confirm this is the intended prediction input.
X_lately = X[-forecast_out:]
# Drop the unlabeled rows from df as well, so y lines up with X.
df.dropna(inplace=True)
y = np.array(df['label'])
# Single-argument print(...) parses on Python 3 and prints the same text on
# Python 2.
print(X)
print(y)
[[ 2.14248705 -0.23638346 1.47967723 -1.47731955]
[ 2.14880178 -0.39770846 -0.00316564 -1.38496861]
[ 2.14248705 0.06201065 -0.52236799 -1.29922888]
...,
[-0.07082465 0.18229388 -0.60315191 -0.4283689 ]
[-0.07713937 -0.52775306 0.08725379 -0.62432583]
[-0.0505649 0.30437763 0.09167618 -0.34186281]]
[ 346.35 339.3 338. ..., 274.67 277. 278.5 ]
10、模型建立与模型预测
# Hold out 20% of the labelled rows for evaluation. No random_state is
# passed, so the split (and the score below) varies from run to run.
# NOTE(review): sklearn.cross_validation was removed in scikit-learn 0.20;
# sklearn.model_selection.train_test_split is the replacement, but using it
# also requires changing the import at the top of this file.
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.2)
clf = LinearRegression(n_jobs=-1)
clf.fit(X_train, y_train)
# For a regressor, score() is the coefficient of determination (R^2) on the
# held-out rows, despite the variable name.
accuracy = clf.score(X_test, y_test)
print(accuracy)
forecast_set = clf.predict(X_lately)
print(forecast_set, accuracy, forecast_out)
# Empty column that the plotting step fills with the predicted values.
df['Forecast'] = np.nan
0.96681672249
(array([ 275.02644769, 274.79433411, 273.48581788, 275.07236144,
273.26221787, 273.02901971, 274.17157259]), 0.96681672249024297, 7)
11、预测结果可视化
# Restrict the plot to 2017. .loc selects the rows by partial date string,
# and .copy() lets the forecast rows be appended without writing into a
# slice of the full frame.
df = df.loc['2017'].copy()
last_date = df.iloc[-1].name
last_date
# Step in whole calendar days with timedelta. Adding 86400 seconds to a
# local-time Unix timestamp lands on 23:00 or 01:00 after a DST change.
day_step = datetime.timedelta(days=1)
next_date = last_date + day_step
# Append one row per predicted value. Every column is NaN except the last
# one, which is 'Forecast' because it was the most recently added column.
# NOTE(review): dates advance by calendar day, so weekend dates are included
# even though the source data skips them.
for predicted_close in forecast_set:
    df.loc[next_date] = [np.nan] * (len(df.columns) - 1) + [predicted_close]
    next_date += day_step
df['Adj. Close'].plot()
df['Forecast'].plot()
plt.legend(loc=4)
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()
![使用Python预测黄金AU9999收盘价_第1张图片](s3://crabby-images/27d14/27d145fd1c0ebb6adf250ba60e4a16c58319b182)
历史数据
# Rows with no forecast value are the observed history; plot their close.
is_history = df['Forecast'].isnull()
df_his = df[is_history]
df_his['Adj. Close'].plot()
plt.xlabel('Date')
plt.ylabel('Price')
plt.legend(loc=4)
plt.show()
![使用Python预测黄金AU9999收盘价_第2张图片](s3://crabby-images/b20fd/b20fda8933937bf5d18adf4e4f82addda07c1eca)
预测数据
# Rows that carry a forecast value are the appended predictions; plot them
# on their own.
is_forecast = df['Forecast'].notnull()
df_for = df[is_forecast]
df_for['Forecast'].plot()
plt.xlabel('Date')
plt.ylabel('Price')
plt.legend(loc=4)
plt.show()
![使用Python预测黄金AU9999收盘价_第3张图片](s3://crabby-images/1ba34/1ba3452501bd360338cab4d2c538221dc200f32f)