股票预测模型的改良(二)

import numpy as np
import pandas as pd
from sklearn import svm
from sklearn.linear_model import LogisticRegression
import tushare as ts
from sklearn import cross_validation
# Build the feature matrix ``x`` and the label vector ``y``.
#
# Features (one row per trading day): the stock's open, close, high, low and
# volume, plus a 0/1 flag telling whether the index closed at or above its
# open that day.  Every column is min-max scaled.
# Label: 1 if the stock closed at or above its open, else 0.  Labels are
# shifted by one row so that the features of day t are paired with the
# label of day t+1.
data = ts.get_k_data('600000', start='2007-01-01', end='2018-04-13')
data_SZ_index = ts.get_k_data('000001', index=True,
                              start='2007-01-01', end='2018-04-13')
# Import kept in place so the name ``datetime`` stays available to the rest
# of the script; the date parsing itself is delegated to pandas below.
from datetime import datetime
data['date'] = pd.to_datetime(data['date'], format='%Y-%m-%d')
data_SZ_index['date'] = pd.to_datetime(data_SZ_index['date'],
                                       format='%Y-%m-%d')

# Align stock and index rows on the trading date.  An inner join keeps only
# the days present in BOTH tables, so a day missing from either side can no
# longer shift the two series against each other.
aligned = pd.merge(data, data_SZ_index[['date', 'open', 'close']],
                   on='date', how='inner', suffixes=('', '_index'))
num_data = len(aligned)
if num_data < 2:
    # At least two days are required to pair a feature row with the
    # following day's label.
    raise ValueError('need at least two aligned trading days, got %d'
                     % num_data)

# 1 when the index closed at or above its open, 0 otherwise.
col_index = (aligned['close_index'] >= aligned['open_index']).astype(int).values
# 1 when the stock closed at or above its open, 0 otherwise.
y = (aligned['close'] >= aligned['open']).astype(int).values

x_data = aligned[['open', 'close', 'high', 'low', 'volume']].values.astype(float)
# Append the index up/down flag as an extra feature column.
x = np.c_[x_data, col_index]

# Min-max scale every column.
# NOTE(review): the minima/maxima are taken over all rows, i.e. before the
# train/test split performed later in the script.
data_col_min = x.min(axis=0)
data_col_range = x.max(axis=0) - data_col_min
# A constant column would give 0/0; map it to all zeros instead of NaN.
data_col_range[data_col_range == 0] = 1.0
x = (x - data_col_min) / data_col_range

# Pair the features of day t with the label of day t+1, whatever the number
# of rows that was downloaded.
x = x[:-1]
y = y[1:]
# Grid search over C and gamma for an RBF-kernel SVM.
#
# Both hyper-parameters are tried at 2**-5 ... 2**5.  Each candidate is fitted
# on the training split and scored by the number of correctly predicted
# samples in the held-out split.  ``result`` collects [C, gamma, hits] rows;
# ``result1`` is the same list sorted by ascending hit count, so the best
# candidates are printed last.
# NOTE(review): candidates are ranked on the same held-out split that is
# called the test set here, so the winning score is an optimistic estimate.

# Hold out 20% of the samples for scoring the candidates.
x_train, x_test, y_train, y_test = \
    cross_validation.train_test_split(x, y, test_size=0.2)
# Convert once so the element-wise comparison below also works when the
# split returns plain lists.
y_test_arr = np.asarray(y_test)

exponents = range(-5, 6)
result = []
for c_exp in exponents:
    C = 2 ** c_exp
    for g_exp in exponents:
        G = 2 ** g_exp
        clf1 = svm.SVC(kernel='rbf', gamma=G, C=C).fit(x_train, y_train)
        y_predictions1 = clf1.predict(x_test)
        # Number of held-out samples predicted correctly.
        k = int(np.sum(y_predictions1 == y_test_arr))
        result.append([C, G, k])
result1 = sorted(result, key=lambda row: row[2])

for row in result1:
    print(row)
[0.03125, 0.03125, 276]
[0.03125, 0.0625, 276]
[0.03125, 0.125, 276]
[0.03125, 0.25, 276]
[0.03125, 0.5, 276]
[0.03125, 1, 276]
[0.03125, 2, 276]
[0.03125, 4, 276]
[0.03125, 8, 276]
[0.03125, 16, 276]
[0.03125, 32, 276]
[0.0625, 0.03125, 276]
[0.0625, 0.0625, 276]
[0.0625, 0.125, 276]
[0.0625, 0.25, 276]
[0.0625, 0.5, 276]
[0.125, 0.03125, 276]
[0.125, 0.0625, 276]
[0.125, 0.125, 276]
[0.125, 0.25, 276]
[0.25, 0.03125, 276]
[0.25, 0.0625, 276]
[0.25, 0.125, 276]
[0.5, 0.03125, 276]
[0.5, 0.0625, 276]
[1, 0.03125, 276]
[0.0625, 32, 277]
[2, 32, 284]
[16, 32, 287]
[8, 16, 288]
[32, 32, 290]
[16, 16, 291]
[1, 32, 292]
[4, 16, 292]
[16, 8, 292]
[8, 32, 293]
[16, 0.25, 293]
[16, 4, 293]
[32, 0.25, 293]
[32, 16, 293]
[4, 32, 295]
[16, 2, 295]
[0.0625, 16, 296]
[8, 0.5, 296]
[16, 0.5, 296]
[32, 8, 296]
[0.5, 0.25, 297]
[0.5, 32, 297]
[1, 1, 297]
[2, 16, 297]
[4, 0.5, 297]
[0.5, 2, 298]
[2, 1, 298]
[8, 4, 298]
[32, 4, 298]
[0.0625, 8, 299]
[2, 4, 299]
[2, 8, 299]
[8, 2, 299]
[16, 1, 299]
[0.5, 0.125, 300]
[0.5, 8, 300]
[1, 2, 300]
[2, 0.5, 300]
[4, 0.25, 300]
[4, 1, 300]
[4, 2, 300]
[4, 8, 300]
[8, 0.125, 300]
[8, 8, 300]
[16, 0.0625, 300]
[32, 0.0625, 300]
[32, 0.5, 300]
[0.125, 0.5, 301]
[0.125, 1, 301]
[0.125, 2, 301]
[0.25, 0.5, 301]
[0.5, 0.5, 301]
[0.5, 1, 301]
[0.5, 4, 301]
[0.5, 16, 301]
[1, 0.125, 301]
[1, 0.25, 301]
[2, 0.03125, 301]
[2, 0.0625, 301]
[2, 0.25, 301]
[2, 2, 301]
[4, 0.03125, 301]
[4, 0.0625, 301]
[4, 0.125, 301]
[4, 4, 301]
[8, 0.03125, 301]
[8, 0.0625, 301]
[8, 0.25, 301]
[8, 1, 301]
[16, 0.03125, 301]
[16, 0.125, 301]
[32, 0.03125, 301]
[32, 0.125, 301]
[32, 1, 301]
[0.0625, 1, 302]
[0.0625, 4, 302]
[0.125, 4, 302]
[0.25, 1, 302]
[1, 0.5, 302]
[1, 8, 302]
[1, 16, 302]
[2, 0.125, 302]
[0.0625, 2, 303]
[0.125, 8, 303]
[0.25, 0.25, 303]
[1, 0.0625, 303]
[1, 4, 303]
[0.125, 16, 304]
[0.125, 32, 304]
[0.25, 2, 304]
[0.25, 8, 305]
[0.25, 16, 305]
[0.25, 32, 305]
[32, 2, 305]
[0.25, 4, 307]
# Compare the tuned RBF SVM (gamma=4, C=0.25) with a default logistic
# regression.  Both models are refitted on five independent random 80/20
# splits, and the fraction of correct test predictions is recorded per split.
clf_SVM = svm.SVC(kernel='rbf', gamma=4, C=0.25)
clf_LgR = LogisticRegression()
result_SVM = []
result_LgR = []
for _ in range(5):
    # Fresh random split: 80% for training, 20% for testing.
    x_train, x_test, y_train, y_test = cross_validation.train_test_split(
        x, y, test_size=0.2)
    # Fit and score the SVM first, then the logistic regression.
    for model, scores in ((clf_SVM, result_SVM), (clf_LgR, result_LgR)):
        model.fit(x_train, y_train)
        # Share of test samples whose prediction matches the true label.
        hits = y_test == model.predict(x_test)
        scores.append(np.mean(hits))
print("svm classifier accuacy:")
print(result_SVM)
print("LogisticRegression classifier accuacy:")
print(result_LgR)
svm classifier accuacy:
[0.56133828996282531, 0.55018587360594795, 0.55204460966542745, 0.52416356877323422, 0.54646840148698883]
LogisticRegression classifier accuacy:
[0.55018587360594795, 0.5204460966542751, 0.55204460966542745, 0.51858736059479549, 0.53345724907063197]

通过对SVM模型超参数(C和gamma)进行网格搜索优化,提高了SVM的预测准确率:在上面5次随机划分的结果中,SVM的准确率约为52%~56%,整体略高于逻辑回归的约52%~55%。

你可能感兴趣的:(Python量化交易,python)