使用Python预测黄金AU9999收盘价

1、加载Python包

# - coding: utf-8 -*-
import pandas as pd
import math
import numpy as np
import datetime, time
from sklearn import preprocessing, cross_validation, svm
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
from matplotlib import style
style.use('ggplot')

2、读入数据集

# Load the AU9999 trading history from a local CSV file, decoding it as GBK.
# TODO (author's open question): how to convert the .xls export to .csv.
finance_data = pd.read_csv('/Users/MichaelDeng/PycharmProjects/Python_finance/AU9999历史交易数据(2011-12-01至2017-08-18).csv',encoding="gbk")

3、查看数据集

# Quick look at the raw frame: its type, row index, column labels and the
# summary statistics of its numeric columns.
for summary in (
    type(finance_data),
    finance_data.index,
    finance_data.columns,
    finance_data.describe(),  # .astype(np.float64)
):
    print(summary)
finance_data.head()

4、对数据集处理

# Keep only the columns used downstream and give them ASCII names.
# .copy() detaches the selection from finance_data, so the assignments below
# modify an independent frame instead of raising SettingWithCopyWarning.
df = finance_data[[u'日期', u'开盘价', u'最高价', u'最低价', u'收盘价', u'成交量(公斤)']].copy()
# NOTE: 'Adj. Hign' (sic) is kept as spelled because later steps look the
# column up by exactly this name.
df.columns = ['Date', 'Adj. Open', 'Adj. Hign', 'Adj. Low', 'Adj. Close', 'Adj. Volume']
df['Date'] = pd.to_datetime(df['Date'])
# Use the trading date as the index (the 'Date' column is consumed by
# set_index) and order the rows oldest first.
# If 'Date' were kept as a regular column, df.sort_values('Date') would be
# the equivalent sort.
df = df.set_index('Date').sort_index(ascending=True)
df.head()
Adj. Open Adj. Hign Adj. Low Adj. Close Adj. Volume
Date
2011-12-01 353.38 359.00 353.00 357.50 27,800.00
2011-12-02 358.50 358.70 356.00 357.74 35,874.00
2011-12-05 360.00 360.00 356.00 357.50 43,370.00
2011-12-06 357.50 357.60 351.21 351.50 45,684.00
2011-12-07 353.48 354.99 350.00 354.60 39,684.00

5、处理异常行数据

# Remove the row dated 2014-05-02, which this step treats as an abnormal record.
# For reference, matching on a regular column instead of the index
# (this presumes a column named 'date'):
#   df[df['date'] == '20161111']           -> the rows that satisfy the condition
#   df[df['date'] == '20161111'].index[0]  -> index label of the first such row
anomalous_dates = df.index[df.index == '2014-05-02']  # index labels to discard
df.drop(anomalous_dates, inplace=True)
print(df.index)
print(df.columns)
print(df.describe())  # .astype(np.float64)
df.head()
DatetimeIndex(['2011-12-01', '2011-12-02', '2011-12-05', '2011-12-06',
               '2011-12-07', '2011-12-08', '2011-12-09', '2011-12-12',
               '2011-12-13', '2011-12-14',
               ...
               '2017-08-07', '2017-08-08', '2017-08-09', '2017-08-10',
               '2017-08-11', '2017-08-14', '2017-08-15', '2017-08-16',
               '2017-08-17', '2017-08-18'],
              dtype='datetime64[ns]', name=u'Date', length=1397, freq=None)
Index([u'Adj. Open', u'Adj. Hign', u'Adj. Low', u'Adj. Close', u'Adj. Volume'], dtype='object')
         Adj. Open    Adj. Hign     Adj. Low   Adj. Close
count  1397.000000  1397.000000  1397.000000  1397.000000
mean    276.691918   277.820988   274.407158   276.071790
std      38.338578    38.187169    38.099712    38.020006
min     210.790000   218.500000   167.500000   216.900000
25%     245.990000   247.000000   244.000000   245.700000
50%     265.980000   267.000000   264.000000   265.350000
75%     291.990000   293.990000   288.200000   290.800000
max     362.990000   363.000000   360.500000   362.000000
Adj. Open Adj. Hign Adj. Low Adj. Close Adj. Volume
Date
2011-12-01 353.38 359.00 353.00 357.50 27,800.00
2011-12-02 358.50 358.70 356.00 357.74 35,874.00
2011-12-05 360.00 360.00 356.00 357.50 43,370.00
2011-12-06 357.50 357.60 351.21 351.50 45,684.00
2011-12-07 353.48 354.99 350.00 354.60 39,684.00

6、某些列进行数据类型转换

# Convert 'Adj. Volume' from thousands-separated text (e.g. "27,800.00") to
# float64.
# The column is cleaned in a single vectorized pass. Writing cell by cell via
# df['Adj. Volume'][i] = ... is chained assignment: pandas may apply the write
# to a temporary copy (SettingWithCopyWarning), and it is slow on long frames.
# Assumes commas are the only non-numeric characters in the column.
df['Adj. Volume'] = df['Adj. Volume'].str.replace(',', '').astype(np.float64)
df.describe()  # .astype(np.float64)
df.head()
# Alternative, valid only when the data contains no other stray characters:
# df = df.apply(lambda x: pd.to_numeric(x, errors='ignore'), axis=0)  # or df.convert_objects(convert_numeric=True).dtypes
Adj. Open Adj. Hign Adj. Low Adj. Close Adj. Volume
Date
2011-12-01 353.38 359.00 353.00 357.50 27800.0
2011-12-02 358.50 358.70 356.00 357.74 35874.0
2011-12-05 360.00 360.00 356.00 357.50 43370.0
2011-12-06 357.50 357.60 351.21 351.50 45684.0
2011-12-07 353.48 354.99 350.00 354.60 39684.0

7、计算其他特征指标

# Derive two percentage features from the daily prices:
#   HL_PCT - gap between the day's high and its close, as a percentage of the close
#   CO_PCT - move from the open to the close, as a percentage of the open
# NOTE: neither of these is the True Range (TR). TR is built from
# high - low, high - previous close and low - previous close, and none of
# those quantities is computed here.
# Available columns: 'Adj. Open', 'Adj. Hign', 'Adj. Low', 'Adj. Close', 'Adj. Volume'
df['HL_PCT'] = (df['Adj. Hign'] - df['Adj. Close']) / df['Adj. Close'] * 100.0
df['CO_PCT'] = (df['Adj. Close'] - df['Adj. Open']) / df['Adj. Open'] * 100.0
# Keep only the model inputs. .copy() makes the reduced frame independent, so
# adding columns to it later does not raise SettingWithCopyWarning.
df = df[['Adj. Close', 'HL_PCT', 'CO_PCT', 'Adj. Volume']].copy()
df.head()
Adj. Close HL_PCT CO_PCT Adj. Volume
Date
2011-12-01 357.50 0.419580 1.165884 27800.0
2011-12-02 357.74 0.268351 -0.211994 35874.0
2011-12-05 357.50 0.699301 -0.694444 43370.0
2011-12-06 351.50 1.735420 -1.678322 45684.0
2011-12-07 354.60 0.109983 0.316850 39684.0

8、构造预测指标

forecast_col = 'Adj. Close'
# Replace missing values with a sentinel instead of dropping the rows.
# fillna is used without inplace=True so that df is rebound to a fresh frame;
# filling and then adding a column in place on a column-subset frame raises
# SettingWithCopyWarning.
df = df.fillna(-99999)
# Forecast horizon, in rows.
# Alternative: forecast_out = int(math.ceil(0.01 * len(df)))  # 1% of the rows, rounded up
forecast_out = 7
# shift(-forecast_out) moves the close prices forecast_out rows earlier, so
# each row's label is the close forecast_out rows later; the last forecast_out
# rows have no future value and get NaN.
df['label'] = df[forecast_col].shift(-forecast_out)
# df.dropna(inplace=True)  # would drop the rows that contain NaN
df.head()
Adj. Close HL_PCT CO_PCT Adj. Volume label
Date
2011-12-01 357.50 0.419580 1.165884 27800.0 346.35
2011-12-02 357.74 0.268351 -0.211994 35874.0 339.30
2011-12-05 357.50 0.699301 -0.694444 43370.0 338.00
2011-12-06 351.50 1.735420 -1.678322 45684.0 322.30
2011-12-07 354.60 0.109983 0.316850 39684.0 329.30

9、特征指标数据缩放,预测数据分割

# Feature matrix: every column except the label.
X = np.array(df.drop(['label'], axis=1))
# preprocessing.scale standardizes each column to zero mean and unit variance.
X = preprocessing.scale(X)
# Take the most recent forecast_out rows (the ones without a label) BEFORE
# trimming X. Slicing X_lately after the trim would pick the last *labelled*
# rows, and the forecast would then be made from stale training inputs.
X_lately = X[-forecast_out:]
X = X[:-forecast_out]
# Drop the rows whose label is NaN (the trailing forecast_out rows) so that
# y lines up with X.
df.dropna(inplace=True)
y = np.array(df['label'])
print(X)
print(y)
[[ 2.14248705 -0.23638346  1.47967723 -1.47731955]
 [ 2.14880178 -0.39770846 -0.00316564 -1.38496861]
 [ 2.14248705  0.06201065 -0.52236799 -1.29922888]
 ..., 
 [-0.07082465  0.18229388 -0.60315191 -0.4283689 ]
 [-0.07713937 -0.52775306  0.08725379 -0.62432583]
 [-0.0505649   0.30437763  0.09167618 -0.34186281]]
[ 346.35  339.3   338.   ...,  274.67  277.    278.5 ]

10、模型建立与模型预测

# sklearn.cross_validation is deprecated in favour of sklearn.model_selection
# and was removed in later scikit-learn releases; prefer the new location and
# fall back to the old one only when model_selection is unavailable.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    train_test_split = cross_validation.train_test_split

# Hold out 20% of the labelled rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf = LinearRegression(n_jobs=-1)  # n_jobs=-1: use all CPUs
# clf = svm.SVR(kernel='poly')
clf.fit(X_train, y_train)
# For a regressor, score() returns the R^2 coefficient of determination on
# the held-out rows (it is not a classification accuracy).
accuracy = clf.score(X_test, y_test)
print(accuracy)
# Predict the label for the rows held back in X_lately.
forecast_set = clf.predict(X_lately)
print(forecast_set, accuracy, forecast_out)
# Placeholder column; the forecast values are filled in by the plotting step.
df['Forecast'] = np.nan
0.96681672249
(array([ 275.02644769,  274.79433411,  273.48581788,  275.07236144,
        273.26221787,  273.02901971,  274.17157259]), 0.96681672249024297, 7)

11、预测结果可视化

# Plot only the 2017 rows. .loc is the supported way to select rows by a
# partial date string; .copy() lets the frame be extended below without
# SettingWithCopyWarning.
df = df.loc['2017'].copy()
last_date = df.index[-1]
# Every column except the last one ('Forecast') stays empty on forecast rows.
blank_cells = [np.nan] * (len(df.columns) - 1)
# Append one row per predicted value on consecutive calendar days after the
# last existing row (weekends are not skipped). Adding whole days to the date
# avoids the round trip through local-time Unix timestamps, which can land on
# the wrong day across a DST change.
for day_offset, predicted in enumerate(forecast_set, 1):
    next_date = last_date + datetime.timedelta(days=day_offset)
    df.loc[next_date] = blank_cells + [predicted]
df['Adj. Close'].plot()
df['Forecast'].plot()
plt.legend(loc=4)
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()

使用Python预测黄金AU9999收盘价_第1张图片

历史数据

# Historical rows are the ones that carry no forecast value.
is_history = df['Forecast'].isnull()
df_his = df[is_history]
df_his['Adj. Close'].plot()
plt.legend(loc=4)
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()

使用Python预测黄金AU9999收盘价_第2张图片

预测数据

# Forecast rows are the ones whose 'Forecast' cell is populated.
has_forecast = df['Forecast'].notnull()
df_for = df[has_forecast]
df_for['Forecast'].plot()
plt.legend(loc=4)
plt.xlabel('Date')
plt.ylabel('Price')
plt.show()

使用Python预测黄金AU9999收盘价_第3张图片

你可能感兴趣的:(数据挖掘)