建模与调参
4.1 学习目标
1. 学习在金融风控领域常用的机器学习模型
2. 学习机器学习模型的建模过程与调参流程
4.2 内容介绍
建立获取数据文件 tools.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
# tools.py -- reads data, prints model results, draws charts, etc.
from __future__ import print_function

import pickle
import sqlite3

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas.io.sql as sqlHd
# NOTE(review): pandas.io.json.json_normalize moved to pandas.json_normalize
# in pandas >= 1.0 -- confirm the pinned pandas version.
from pandas.io.json import json_normalize

# Percent threshold above which a day's move is labelled 1 in cleanData().
FinaleRange = 2.0
# 清洗数据
# Clean the raw data
def cleanData(df):
    """Clean a raw daily-bar DataFrame and build the classification label.

    Drops intraday columns later than 14:30 (keeping only the first *_925
    column), derives binary M60_*_Range features from the previous close,
    and converts next-day 'high' gain vs. FinaleRange into a 0/1 'today'
    label. Mutates *df* in place and returns it.
    """
    delFlg = False
    # Iterate a snapshot of the column names so in-loop drops are safe.
    # Columns without '_' carry no time component and are kept.
    for cName in df.columns.values:
        if cName.find('_') == -1:
            continue
        t = int(cName.split('_')[1])
        if t == 925 and not delFlg:
            delFlg = True          # keep the first *_925 column
        elif t == 925 and delFlg:
            df.drop(cName, axis=1, inplace=True)  # drop duplicated *_925
        if t <= 1430:
            continue
        df.drop(cName, axis=1, inplace=True)      # drop anything after 14:30
    # Label: next day's high vs. today's 14:30 close, in percent.
    df['high'] = df['high'].shift(-1)
    df['today'] = (df['high'] - df['M15_1430_close']) / df['M15_1430_close'] * 100.0
    # New features: gain of each hourly high vs. previous close, binarised
    # to up (1) / down (0). (Replaces five copy-pasted sections.)
    lastClose = df['close'].shift(1)
    for hour in ('1000', '1100', '1200', '1300', '1400'):
        col = 'M60_%s_Range' % hour
        df[col] = (df['M60_%s_high' % hour] - lastClose) / lastClose * 100.0
        df.loc[df[col] >= 0.0, col] = 1
        df.loc[df[col] < 0.0, col] = 0
    df = df.fillna(0.0)
    df.loc[df['today'] < FinaleRange, 'today'] = 0
    df.loc[df['today'] >= FinaleRange, 'today'] = 1
    return df
# Drop the columns whose cycle prefix appears in *cycle*
def getCycleData(df, cycle=None):
    """Drop every column whose name prefix (text before '_') is in *cycle*.

    cycle defaults to ['M5'] (mutable-default bug fixed). The drop is done
    in place; *df* is also returned for chaining.
    """
    if cycle is None:
        cycle = ['M5']
    delColList = [cName for cName in df.columns.values
                  if cName.split('_')[0] in cycle]
    df.drop(delColList, axis=1, inplace=True)
    return df
# Remove features not in the whitelist
def delInvalidFeature(df, featureList):
    """Keep only the columns listed in *featureList* (in-place drop).

    Returns *df* for chaining.
    """
    delColList = [cName for cName in df.columns.values
                  if cName not in featureList]
    df.drop(delColList, axis=1, inplace=True)
    return df
def getDBConn(dbName):
    """Open and return a sqlite3 connection to the database file *dbName*."""
    return sqlite3.connect(dbName)
# Load one instrument's data from the DB
def getDataToDB(code, conn, sqlParams=''):
    """Load the 'set_<code>' table through *conn* and return it cleaned.

    *sqlParams*, when non-empty, is appended verbatim as a WHERE clause.
    NOTE(review): the SQL is built by string formatting -- acceptable for
    trusted local codes, but never feed user-controlled input into
    code/sqlParams (SQL injection).
    """
    tableName = 'set_' + code
    if sqlParams:
        sql = 'select * from %s where %s' % (tableName, sqlParams)
    else:
        sql = 'select * from %s ' % tableName
    df = sqlHd.read_sql_query(sql, conn)
    df.rename(columns={'date': 'Date'}, inplace=True)
    return cleanData(df)
# Split the data into train/test sets (supervised learning)
def splitDataSuper(df):
    """Split *df* into train / two intermediate test sets / a final test set.

    Returns a dict with keys 'train', 'trainTest_1', 'trainTest_2' and
    'finalTest'; each test entry also carries a 'sample' frame with
    Date/close/today for later reporting.
    NOTE(review): the original comment claims a 4:3:2:1 split, but 'train'
    ([:7*offset]) overlaps 'trainTest_1' ([4*offset:7*offset]) -- confirm
    this overlap is intended.
    """
    data = {}
    offset = int(len(df) * 0.1) + 1  # one step = ~10% of the rows
    testLen = offset * 7
    data['finalTest'] = {'sample': df[testLen:][['Date', 'close', 'today']].reset_index()}
    data['trainTest_1'] = {'sample': df[offset*4:offset*7][['Date', 'close', 'today']].reset_index()}
    data['trainTest_2'] = {'sample': df[offset*7:offset*9][['Date', 'close', 'today']].reset_index()}
    snpret = np.sign(df['today'])
    # Strip raw price/indicator columns; whatever remains are the features.
    delColList = ['Date', 'open', 'close', 'high', 'low', 'buy', 'sell', 'cash',
                  'vol', 'today', 'BBUP', 'BBLOW', 'FI', 'CCI', 'EVM', 'SMA', 'ROC']
    df = df.drop(delColList, axis=1)
    # X_train, X_test, y_train, y_test = train_test_split(df, snpret, test_size=0.3, random_state=42)
    # Training set
    data['train'] = {'x_train': df[:offset*7], 'y_train': snpret[:offset*7]}
    # Intermediate test sets carved out of the training range
    data['trainTest_1']['x_test'] = df[offset*4:offset*7]
    data['trainTest_1']['y_train'] = snpret[offset*4:offset*7]
    data['trainTest_2']['x_test'] = df[offset*7:offset*9]
    data['trainTest_2']['y_train'] = snpret[offset*7:offset*9]
    # Final hold-out set
    data['finalTest']['x_test'] = df[testLen:]
    data['finalTest']['y_train'] = snpret[testLen:]
    return data
# Print model scores.
# conMean: CV confidence mean; accuracy; modelScore; r2Score: R-squared.
def printScore(modelName='', conMean=0.0, accuracy=0.0, modelScore=0.0, r2Score=0.0):
    """Print one formatted line of model scores; returns None."""
    msg = '%s conMean: %.2f accuracy: %.2f modelScore: %.2f r2Score: %.2f ' % (
        modelName, conMean, accuracy, modelScore, r2Score)
    print(msg)
----------------------------------------------------------------------------------------------------
建立模型库文件 models.py
#!/usr/bin/python
# -*- coding: utf-8 -*-
# models.py -- model factories and helpers for finding good hyper-parameters.
from __future__ import print_function

import warnings
# BUG FIX: warnings.simplefilter() expects a Warning subclass, not a list of
# strings; and catch_warnings() only silences warnings raised inside its
# with-block, so the import-time filter below had no lasting effect.
warnings.filterwarnings("ignore", category=DeprecationWarning)
try:
    from sklearn.exceptions import UndefinedMetricWarning
    warnings.filterwarnings("ignore", category=UndefinedMetricWarning)
except ImportError:  # older sklearn layouts
    pass

import numpy as np
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import AdaBoostClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.metrics import confusion_matrix, r2_score, accuracy_score
from sklearn.svm import LinearSVC, SVC, SVR, OneClassSVM
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import class_weight
from tools import drawImg

# The two class labels used throughout this module.
CLASSES = np.array([0, 1])
# Unsupervised learning
# Novelty detection
def testSVMone():
    """One-class SVM (novelty detection) with a linear kernel."""
    return OneClassSVM(nu=0.05, kernel='linear', gamma=0.1)
def testSVMone2():
    """One-class SVM (novelty detection) with an RBF kernel."""
    return OneClassSVM(nu=0.0005, kernel='rbf', tol=0.0008)
# Supervised learning
# SVM classifier factory
def testSvm(params=None):
    """Return an SVC built from *params* (mutable-default bug fixed)."""
    return SVC(**(params or {}))
# Linear models
def testLogReg(params=None):
    """Return a LogisticRegression built from *params*."""
    return LogisticRegression(**(params or {}))
def testLineReg(params=None):
    """Return a LinearRegression built from *params*."""
    return LinearRegression(**(params or {}))
# K-nearest neighbours
def testKNN(params=None):
    """Return a KNeighborsClassifier built from *params*."""
    return KNeighborsClassifier(**(params or {}))
# Random forest
def testRF(params=None):
    """Return a RandomForestClassifier built from *params*."""
    return RandomForestClassifier(**(params or {}))
# Linear SVM
def testLinearSVC(params=None):
    """Return a LinearSVC built from *params*."""
    return LinearSVC(**(params or {}))
# LDA model
def testLDA(params=None):
    """Return a LinearDiscriminantAnalysis built from *params*."""
    return LDA(**(params or {}))
# Decision tree model
def testDtree(params=None):
    """Return a DecisionTreeClassifier built from *params*."""
    return DecisionTreeClassifier(**(params or {}))
# AdaBoost
def testADB(params=None):
    """Return an AdaBoostClassifier built from *params*."""
    return AdaBoostClassifier(**(params or {}))
# Gradient Tree Boosting
def testGBC(params=None):
    """Return a GradientBoostingClassifier built from *params*."""
    return GradientBoostingClassifier(**(params or {}))
# Feature selection via SelectFromModel
def selectFeatureFromModel(x, y, model, modelName):
    """Select features with sklearn's SelectFromModel; returns transformed x.

    *modelName* is currently unused -- kept for interface compatibility.
    """
    from sklearn.feature_selection import SelectFromModel
    return SelectFromModel(model).fit_transform(x, y)
# Recursive feature elimination (RFE)
def selectFeatureRFE(x, y, model, selectFeature=1):
    """Rank features with RFE; return the 20 best (rank, name) pairs,
    lowest rank first."""
    from sklearn.feature_selection import RFE
    featureNames = x.columns.values.tolist()
    res = RFE(model, n_features_to_select=selectFeature).fit(x, y)
    # lambda arg renamed so it no longer shadows parameter x
    return sorted(zip(map(lambda r: round(r, 4), res.ranking_), featureNames))[:20]
# Rank features by random-forest mean-decrease-impurity / mean-decrease-accuracy
def selectFeatureRandomTree(x, y):
    """Rank features with a random forest.

    Plots mean-decrease-impurity scores in batches of 10 via drawImg,
    prints a rough "mean decrease accuracy" listing, and returns the
    impurity ranking as (score, name) pairs, best first.
    """
    featureNames = x.columns.values.tolist()
    # Mean decrease impurity -- biased towards high-cardinality features.
    param = {'n_estimators': 500, 'max_features': 10, 'max_depth': 4}
    model = testRF(param)
    model.fit(x, y)
    print('Mean decrease impurity')
    offset = 10
    featureList = sorted(zip(map(lambda imp: round(imp * 100, 2),
                                 model.feature_importances_), featureNames),
                         reverse=True)
    temp = zip(*sorted(zip(map(lambda imp: round(imp * 100, 3),
                               model.feature_importances_), featureNames),
                       reverse=True))
    # Build chart data: scores and their matching (sorted) feature names.
    y_data, lableName = [list(t) for t in temp]
    for i in range(0, len(lableName), offset):
        # BUG FIX: plot the sorted names (lableName) so the labels line up
        # with the sorted y_data, not the unsorted featureNames.
        drawImg(lableName[i:i+offset], y_data[i:i+offset])
    # "Mean decrease accuracy"
    # NOTE(review): this loop only re-fits on three random splits and tags
    # each score with the first three feature names -- it never permutes a
    # feature, so it is not the classic mean-decrease-accuracy procedure.
    # Confirm the intended algorithm.
    from collections import defaultdict
    from sklearn.model_selection import train_test_split
    scores = defaultdict(list)
    for i in range(3):
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=0.2, random_state=0, stratify=y)
        rf = model.fit(x_train, y_train)
        acc = r2_score(y_test, rf.predict(x_test))
        scores[featureNames[i]].append(acc)
    print('Mean decrease accuracy',
          sorted([(round(np.mean(score), 4), feat)
                  for feat, score in scores.items()], reverse=True))
    return featureList
# Fit the classifier and predict
def runModel(model, X_train, X_test, y_train):
    """Fit *model* on the training data and predict X_test.

    Returns (pred, predPro) where predPro is the class-probability matrix
    when the model supports predict_proba, else None.
    """
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    predPro = None
    # BUG FIX: predict_proba is a class attribute, so it is never found in
    # vars(model) (the instance __dict__) -- predPro was always None.
    if hasattr(model, 'predict_proba'):
        predPro = model.predict_proba(X_test)
    return pred, predPro
# Cross-validated confidence estimate for the model
def getConMean(model, X_train, y_train):
    """Return the mean 5-fold cross-validated f1_macro score of *model*."""
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_macro')
    return scores.mean()
----------------------------------------------------------------------------------------------------
#!/usr/bin/python
# -*- coding: utf-8 -*-
# features.py -- hyper-parameter search and model evaluation.
from __future__ import print_function

import warnings
# BUG FIX: category must be a Warning subclass, not a list of strings, and
# catch_warnings() would undo the filter on leaving the with-block.
warnings.filterwarnings("ignore", category=DeprecationWarning)

import copy
import time

import numpy as np

# local modules
import tools
import models
# Combine values into the existing parameter dicts, producing the
# cross-product of parameter choices.
def subFun(key, keyList, result):
    """Extend *result* (a list of param dicts) with every value of *keyList*
    under *key* and return the new list.

    NOTE(review): temp = result.copy() is shallow, so pre-existing dicts are
    mutated in place; callers always feed the return value back in, so the
    aliasing is harmless here.
    """
    temp = result.copy()
    if not temp:
        for val in keyList:
            temp.append({key: val})
    else:
        for val in keyList:
            for i, item in enumerate(result):
                if key in item:
                    # key already set on this combo: clone it and override
                    temp.append(copy.deepcopy(item))
                    temp[-1][key] = val
                    continue
                temp[i][key] = val
    return temp
# K-nearest neighbours parameter grid
def getKNNParams():
    """Return every KNeighborsClassifier parameter combination."""
    paramsList = []
    params = {'n_neighbors': [1, 2, 3, 4, 5], 'algorithm': ['kd_tree', 'ball_tree']}
    for key in params.keys():
        paramsList = subFun(key, params[key], paramsList)
    return paramsList
# SVM parameter grid
def getSvmParams():
    """Return every SVC parameter combination."""
    paramsList = []
    params = {'C': [10000.0], 'cache_size': [400], 'class_weight': ['balanced'],
              'gamma': [0.001, 0.0001], 'kernel': ['rbf'], 'max_iter': [-1],
              'probability': [True], 'random_state': [None], 'shrinking': [True],
              'tol': [0.001, 0.00001]}
    for key in params.keys():
        paramsList = subFun(key, params[key], paramsList)
    return paramsList
# Linear (logistic regression) parameter grid
def getLineParams():
    """Return every LogisticRegression parameter combination."""
    params = {'penalty': ['l2'], 'C': [1000000.0], 'solver': ['liblinear'],
              'max_iter': [1000000], 'dual': [True],
              'class_weight': [{0: 0.25, 1: 0.7}], 'tol': [0.0001], 'n_jobs': [1]}
    paramsList = []
    for key in params.keys():
        paramsList = subFun(key, params[key], paramsList)
    return paramsList
def getRFParams(maxFeatures):
    """Return every RandomForestClassifier parameter combination;
    *maxFeatures* caps max_features (usually the feature count)."""
    params = {'n_estimators': [10000], 'criterion': ['gini'], 'max_depth': [4],
              'max_features': [maxFeatures], 'bootstrap': [True],
              'oob_score': [True], 'n_jobs': [-1],
              'class_weight': ['balanced', {0: 0.32, 1: 0.68}]}
    paramsList = []
    for key in params.keys():
        paramsList = subFun(key, params[key], paramsList)
    return paramsList
# Linear SVM parameter grid
def getLinearSVCParams():
    """Return every LinearSVC parameter combination."""
    params = {'loss': ['hinge'], 'intercept_scaling': [0.0005, 0.0001],
              'C': [1000.0, 2000.0], 'class_weight': ['balanced']}
    paramsList = []
    for key in params.keys():
        paramsList = subFun(key, params[key], paramsList)
    return paramsList
# Decision tree model
def testDtree(X_train, X_test, y_train, y_test, testDate):
    """Fit a shallow decision tree and return its predictions on X_test.

    NOTE(review): the original body never fitted the model and returned the
    undefined name 'pred'; reconstructed to fit/predict -- confirm against
    the intended version. The computed class weight was unused in the
    original and is kept only for parity.
    """
    weight = class_weight.compute_class_weight('balanced', CLASSES, y_train)  # unused -- confirm
    model = DecisionTreeClassifier(max_depth=2, splitter='random',
                                   min_weight_fraction_leaf=0.25)
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    return pred
# AdaBoost parameter grid
def getADBParams(maxFeatures):
    """Return every AdaBoostClassifier parameter combination, using a
    random-forest base estimator capped at *maxFeatures* features."""
    from sklearn.ensemble import RandomForestClassifier
    params = {'base_estimator': [RandomForestClassifier(
                  n_estimators=10000, criterion='gini', max_depth=4, n_jobs=-1,
                  class_weight='balanced', max_features=maxFeatures,
                  bootstrap=True, oob_score=True)],
              'algorithm': ['SAMME'], 'n_estimators': [500], 'learning_rate': [0.1]}
    paramsList = []
    for key in params.keys():
        paramsList = subFun(key, params[key], paramsList)
    return paramsList
# Gradient Tree Boosting parameter grid
def getGBCParams():
    """Return every GradientBoostingClassifier parameter combination."""
    params = {'n_estimators': [600, 1000, 2000, 5000, 10000],
              'max_depth': [3, 4, 5, 6],
              'learning_rate': [0.2, 0.5, 0.8, 1.0]}
    paramsList = []
    for key in params.keys():
        paramsList = subFun(key, params[key], paramsList)
    return paramsList
def modelSet(X_train, X_test, y_train, y_test, testDate, n_components=3):
    """Run the first-stage (weak) classifiers and collect each model's
    result into one DataFrame column.

    NOTE(review): models.testDtree/testLDA/... in models.py are factories
    that take a params dict, not (X_train, ...) runners; models.testLine
    does not exist (probably models.testLineReg); and pd is not imported in
    this file. This stage is out of sync with models.py -- reconcile before
    running.
    """
    dTreeResult = models.testDtree(X_train, X_test, y_train, y_test, testDate)
    ldaResult = models.testLDA(X_train, X_test, y_train, y_test, testDate)
    lsvcResult = models.testLinearSVC(X_train, X_test, y_train, y_test, testDate)
    svmResult = models.testSvm(X_train, X_test, y_train, y_test, testDate)
    rfResult = models.testRF(X_train, X_test, y_train, y_test, testDate)
    lrResult = models.testLine(X_train, X_test, y_train, y_test, testDate)
    data = {'a': dTreeResult, 'b': ldaResult, 'c': lsvcResult,
            'd': svmResult, 'e': rfResult, 'f': lrResult}
    return pd.DataFrame(data, columns=['a', 'b', 'c', 'd', 'e', 'f'])
def ensembleSet(X_train, X_test, y_train, y_test, testDate, n_components=6):
    """Run the second-stage ensemble models on normalised, PCA-reduced
    features, one result per DataFrame column.

    NOTE(review): getDataStandard and the 'standard' module are not defined
    or imported in this file, and the local testADB/testGBC here are
    parameter-grid getters rather than runners -- this stage cannot run as
    written; reconcile with the missing 'standard' module.
    """
    # L1-normalise, then reduce dimensionality with PCA
    X_train, X_test = getDataStandard(X_train, X_test)
    # X_train, X_test = standard.dataNormalize(x_train=X_train, x_test=X_test)  # L2 normalisation
    X_train, X_test = standard.getDataPCA(X_train, X_test, n_components=n_components)
    adbResult = testADB(X_train, X_test, y_train, y_test, testDate)
    gbcResult = testGBC(X_train, X_test, y_train, y_test, testDate)
    vcResult = testVC(X_train, X_test, y_train, y_test, testDate)
    rfResult = testRF(X_train, X_test, y_train, y_test, testDate)
    data = {'a': adbResult, 'b': gbcResult, 'c': vcResult, 'd': rfResult}
    return pd.DataFrame(data, columns=['a', 'b', 'c', 'd'])
def batchTest():
    """Three-round stacked training: weak classifiers, then ensembles on
    their predictions, then a final model set; prints total wall time.

    NOTE(review): readfileData is not defined in this file -- presumably a
    loader returning the dict shape produced by tools.splitDataSuper;
    confirm against the original project.
    """
    start = time.time()
    data = readfileData('f:/newData/601336')
    # Round 1: weak learners on the raw training data.
    X_train = data['train']['x_train']
    y_train = data['train']['y_train']
    testKey = 'trainTest_1'
    X_test = data[testKey]['x_test']
    y_test = data[testKey]['y_train']
    testDate = data[testKey]['sample']
    New_train = modelSet(X_train, X_test, y_train, y_test, testDate, n_components=200)
    y_train = y_test
    # Round 2: ensemble learners stacked on round-1 predictions.
    testKey = 'trainTest_2'
    X_test = data[testKey]['x_test']
    y_test = data[testKey]['y_train']
    testDate = data[testKey]['sample']
    New_train = ensembleSet(New_train, X_test, y_train, y_test, testDate, n_components=6)
    y_train = y_test
    # Final round on the held-out test set.
    print('Final Turn.', y_train.shape, New_train.shape)
    X_test = data['finalTest']['x_test']
    y_test = data['finalTest']['y_train']
    testDate = data['finalTest']['sample']
    modelSet(New_train, X_test, y_train, y_test, testDate, n_components=4)
    end = time.time()
    print(end - start)
def searchBestParams(data):
    """Grid-search loop: split the data, pick a parameter list, then train
    and report one model per parameter combination."""
    # Split into train/test sets (supervised learning)
    data = tools.splitDataSuper(data)
    # NOTE(review): a stray '"""' in the original source suggests the next
    # two lines were quoted out (featureList is overwritten just below);
    # kept disabled here -- confirm.
    # resFeatureList = models.selectFeatureRandomTree(data['train']['x_train'], data['train']['y_train'])
    # featureList = [item[1] for item in resFeatureList if item[0] >= 0.5]
    featureList = ['M60_1300_EVM', 'M60_1400_EVM', 'cashDiff']
    X_train = data['train']['x_train']
    X_test = data['finalTest']['x_test']
    # X_train = tools.delInvalidFeature(X_train, featureList)
    # X_test = tools.delInvalidFeature(data['finalTest']['x_test'], featureList)
    y_train = data['train']['y_train']
    # paramsList = getADBParams(len(X_train.columns.values))
    # NOTE(review): the next two assignments are dead -- getKNNParams() wins.
    paramsList = getLineParams()
    paramsList = getSvmParams()
    # paramsList = getRFParams(len(X_train.columns.values))
    paramsList = getKNNParams()
    for params in paramsList:
        # model = models.testLine(params)
        # model = models.testSvm(params)
        # model = models.testRF(params)
        print(params)
        model = models.testKNN(params)
        yPred, yPredPro = models.runModel(model, X_train, X_test, y_train)
        # NOTE(review): tools.py above defines printScore, not printRes --
        # confirm which reporter is intended.
        tools.printRes(yPred, data['finalTest']['sample'])
if __name__ == "__main__":
    conn = tools.getDBConn('f:/stockDB/stock.db')
    # NOTE(review): filePath is undefined at this point -- presumably a list
    # of data-file paths whose basename is the stock code; confirm against
    # the original script.
    data = tools.getDataToDB(filePath[0].split('/')[-1], conn, sqlParams='Date > 20170101')
    conn.close()
    data = tools.getCycleData(data, cycle=['M5', 'M15', 'M30'])
    # NOTE(review): tools.createCurstomData is not defined in tools.py above.
    data = tools.createCurstomData(data, cycle='M60')
    searchBestParams(data)