Datawhale Cohort 17 Study Notes, Task 4

Model Building and Parameter Tuning

4.1 Learning Objectives

1. Learn the machine learning models commonly used in financial risk control

2. Learn the model-building process and the parameter-tuning workflow

4.2 Content Overview

Create the data-access file tools.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
# tools.py: reads data, prints model results, draws plots, etc.
from __future__ import print_function

import pickle
import sqlite3

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pandas.io.sql as sqlHd
from pandas.io.json import json_normalize

FinaleRange = 2.0

# Clean the data
def cleanData(df):
    delFlg = False
    # Keep only the first 09:25 column and drop any column stamped later than 14:30
    colName = df.columns.values
    for cName in colName:
        if cName.find('_') == -1:
            continue
        if int(cName.split('_')[1]) == 925 and not delFlg:
            delFlg = True
        elif int(cName.split('_')[1]) == 925 and delFlg:
            df.drop(cName, axis=1, inplace=True)
        if int(cName.split('_')[1]) <= 1430:
            continue
        df.drop(cName, axis=1, inplace=True)

    # Create the label column, used for classification
    df['high'] = df['high'].shift(-1)
    df['today'] = (df['high'] - df['M15_1430_close']) / df['M15_1430_close'] * 100.0

    # Create new features: today's change up to each hour before 14:00
    lastClose = df['close'].shift(1)
    for t in ('1000', '1100', '1200', '1300', '1400'):
        col = 'M60_%s_Range' % t
        df[col] = (df['M60_%s_high' % t] - lastClose) / lastClose * 100.0
        df.loc[df[col] >= 0.0, col] = 1
        df.loc[df[col] < 0.0, col] = 0

    df = df.fillna(0.0)
    df.loc[df['today'] < FinaleRange, 'today'] = 0
    df.loc[df['today'] >= FinaleRange, 'today'] = 1
    return df
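# Worked example (hypothetical column names): given the columns ['Date',
# 'M15_0925_open', 'M60_0925_open', 'M60_1400_high', 'M15_1445_close'],
# cleanData keeps the first 09:25 column, drops the later 09:25 duplicate,
# drops M15_1445_close (stamped after 14:30), and leaves M60_1400_high intact.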

# Drop the columns whose period prefix appears in the cycle list
def getCycleData(df, cycle=['M5']):
    colName = df.columns.values
    delColList = []
    for cName in colName:
        if cName.split('_')[0] in cycle:
            delColList.append(cName)
    df.drop(delColList, axis=1, inplace=True)
    return df

# Drop invalid features (anything not in featureList)
def delInvalidFeature(df, featureList):
    colName = df.columns.values
    delColList = []
    for cName in colName:
        if cName not in featureList:
            delColList.append(cName)
    df.drop(delColList, axis=1, inplace=True)
    return df

def getDBConn(dbName):
    return sqlite3.connect(dbName)

# Fetch an instrument's data from the DB
def getDataToDB(code, conn, sqlParams=''):
    tableName = 'set_' + code
    if sqlParams:
        sql = 'select * from %s where %s' % (tableName, sqlParams)
    else:
        sql = 'select * from %s' % tableName
    df = sqlHd.read_sql_query(sql, conn)
    df.rename(columns={'date': 'Date'}, inplace=True)
    return cleanData(df)

# Split the data 4:3:2:1 into train and test sets, for supervised learning
def splitDataSuper(df):
    data = {}
    offset = int(len(df) * 0.1) + 1  # unit step
    testLen = offset * 7
    data['finalTest'] = {'sample': df[testLen:][['Date', 'close', 'today']].reset_index()}
    data['trainTest_1'] = {'sample': df[offset*4:offset*7][['Date', 'close', 'today']].reset_index()}
    data['trainTest_2'] = {'sample': df[offset*7:offset*9][['Date', 'close', 'today']].reset_index()}
    snpret = np.sign(df['today'])
    delColList = ['Date', 'open', 'close', 'high', 'low', 'buy', 'sell', 'cash', 'vol',
                  'today', 'BBUP', 'BBLOW', 'FI', 'CCI', 'EVM', 'SMA', 'ROC']
    df = df.drop(delColList, axis=1)
    # X_train, X_test, y_train, y_test = train_test_split(df, snpret, test_size=0.3, random_state=42)
    # Training set
    data['train'] = {'x_train': df[:offset*7], 'y_train': snpret[:offset*7]}
    # Test sets carved out of the training window
    data['trainTest_1']['x_test'] = df[offset*4:offset*7]
    data['trainTest_1']['y_train'] = snpret[offset*4:offset*7]
    data['trainTest_2']['x_test'] = df[offset*7:offset*9]
    data['trainTest_2']['y_train'] = snpret[offset*7:offset*9]
    # Final test set
    data['finalTest']['x_test'] = df[testLen:]
    data['finalTest']['y_train'] = snpret[testLen:]
    return data
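# Example of the split arithmetic: with len(df) == 100, offset = int(100*0.1)+1 = 11
# and testLen = 77, so 'train' covers rows [0:77], 'trainTest_1' rows [44:77],
# 'trainTest_2' rows [77:99], and 'finalTest' rows [77:] (note it overlaps trainTest_2).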

# Print the model's scores
# conMean: mean cross-validation score    accuracy: accuracy    modelScore: model score    r2Score: R-squared
def printScore(modelName='', conMean=0.0, accuracy=0.0, modelScore=0.0, r2Score=0.0):
    msg = '%s  conMean: %.2f  accuracy: %.2f  modelScore: %.2f  r2Score: %.2f' % \
          (modelName, conMean, accuracy, modelScore, r2Score)
    print(msg)
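For reference, here is a minimal usage sketch of tools.py. The DB path and instrument code are illustrative; it assumes a SQLite database with a set_601336 table whose columns follow the <period>_<HHMM>_<field> naming that cleanData expects:

import tools

conn = tools.getDBConn('f:/stockDB/stock.db')      # hypothetical path
df = tools.getDataToDB('601336', conn, sqlParams='Date > 20170101')
conn.close()
data = tools.splitDataSuper(df)
print(data['train']['x_train'].shape, data['finalTest']['x_test'].shape)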

----------------------------------------------------------------------------------------------------

Create the model library file models.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
# models.py: builds the models and searches for their best parameters
from __future__ import print_function

import warnings
from sklearn.exceptions import UndefinedMetricWarning
# `category` takes a warning class, not a list of strings,
# so one filter is registered per class
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=UndefinedMetricWarning)

import numpy as np

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import AdaBoostClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis as QDA
from sklearn.metrics import confusion_matrix, r2_score, accuracy_score
from sklearn.svm import LinearSVC, SVC, SVR, OneClassSVM
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.utils import class_weight

from tools import drawImg  # plotting helper from tools.py (not shown above)

CLASSES = np.array([0, 1])

# Unsupervised learning
# Novelty detection
def testSVMone():
    model = OneClassSVM(nu=0.05, kernel='linear', gamma=0.1)
    return model

def testSVMone2():
    model = OneClassSVM(nu=0.0005, kernel='rbf', tol=0.0008)
    return model

# Supervised learning
# Support vector machine
def testSvm(params={}):
    model = SVC(**params)
    return model

# Linear models
def testLogReg(params={}):
    model = LogisticRegression(**params)
    return model

def testLineReg(params={}):
    model = LinearRegression(**params)
    return model

# K-nearest neighbors
def testKNN(params={}):
    model = KNeighborsClassifier(**params)
    return model

# Random forest
def testRF(params={}):
    model = RandomForestClassifier(**params)
    return model

# Linear support vector machine
def testLinearSVC(params={}):
    model = LinearSVC(**params)
    return model

# LDA model
def testLDA(params={}):
    model = LDA(**params)
    return model

# Decision tree model
def testDtree(params={}):
    model = DecisionTreeClassifier(**params)
    return model

# AdaBoost
def testADB(params={}):
    model = AdaBoostClassifier(**params)
    return model

# Gradient tree boosting
def testGBC(params={}):
    model = GradientBoostingClassifier(**params)
    return model

# Feature selection with SelectFromModel
def selectFeatureFromModel(x, y, model, modelName):
    from sklearn.feature_selection import SelectFromModel
    return SelectFromModel(model).fit_transform(x, y)

# Recursive feature elimination with RFE
def selectFeatureRFE(x, y, model, selectFeature=1):
    from sklearn.feature_selection import RFE
    featureNames = x.columns.values.tolist()
    res = RFE(model, n_features_to_select=selectFeature).fit(x, y)
    return sorted(zip(map(lambda r: round(r, 4), res.ranking_), featureNames))[:20]

# Select features by the random forest's mean decrease impurity and mean decrease accuracy
def selectFeatureRandomTree(x, y):
    featureNames = x.columns.values.tolist()
    # Mean decrease impurity; this measure is biased toward features with more categories
    param = {'n_estimators': 500, 'max_features': 10, 'max_depth': 4}
    model = testRF(param)
    model.fit(x, y)
    print('Mean decrease impurity')
    offset = 10
    featureList = sorted(zip(map(lambda v: round(v * 100, 2), model.feature_importances_),
                             featureNames), reverse=True)
    # Build the plot data: importances and the matching sorted feature names
    y_data, labelName = [list(t) for t in zip(*featureList)]
    for i in range(0, len(labelName), offset):
        drawImg(labelName[i:i+offset], y_data[i:i+offset])

    # Mean decrease accuracy
    from collections import defaultdict
    from sklearn.model_selection import train_test_split
    scores = defaultdict(list)
    for i in range(3):
        x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2,
                                                            random_state=0, stratify=y)
        rf = model.fit(x_train, y_train)
        acc = r2_score(y_test, rf.predict(x_test))
        scores[featureNames[i]].append(acc)
    print('Mean decrease accuracy',
          sorted([(round(np.mean(score), 4), feat) for feat, score in scores.items()],
                 reverse=True))
    return featureList

# Run the classifier and predict
def runModel(model, X_train, X_test, y_train):
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    predPro = None
    # vars(model) only lists instance attributes, so check for the method with hasattr
    if hasattr(model, 'predict_proba'):
        predPro = model.predict_proba(X_test)
    return pred, predPro

# Get the model's confidence measure: the mean 5-fold cross-validated f1_macro score
def getConMean(model, X_train, y_train):
    scores = cross_val_score(model, X_train, y_train, cv=5, scoring='f1_macro')
    return scores.mean()
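A minimal sketch of how these factory helpers combine with runModel, getConMean, and tools.printScore. The random-forest parameters are illustrative, and data is the dict returned by tools.splitDataSuper:

from sklearn.metrics import accuracy_score
import models
import tools

# data as returned by tools.splitDataSuper(df)
model = models.testRF({'n_estimators': 500, 'max_depth': 4, 'class_weight': 'balanced'})
pred, predPro = models.runModel(model, data['train']['x_train'],
                                data['finalTest']['x_test'], data['train']['y_train'])
conMean = models.getConMean(model, data['train']['x_train'], data['train']['y_train'])
acc = accuracy_score(data['finalTest']['y_train'], pred)
tools.printScore('RandomForest', conMean=conMean, accuracy=acc)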

----------------------------------------------------------------------------------------------------

Create the parameter-tuning file features.py

#!/usr/bin/python
# -*- coding: utf-8 -*-
# features.py: model parameter tuning and model evaluation
from __future__ import print_function

import warnings
from sklearn.exceptions import UndefinedMetricWarning
# `category` takes a warning class, not a list of strings
warnings.filterwarnings('ignore', category=DeprecationWarning)
warnings.filterwarnings('ignore', category=UndefinedMetricWarning)

import copy
import time

import numpy as np
import pandas as pd  # used by modelSet / ensembleSet below

# Local modules
import tools
import models
import standard  # the author's preprocessing module, used by ensembleSet (not shown)

# Combine the values of a nested params dict one key at a time,
# producing the cross-product list of parameter combinations
def subFun(key, keyList, result):
    temp = result.copy()
    if not temp:
        for val in keyList:
            temp.append({key: val})
    else:
        for val in keyList:
            for i, item in enumerate(result):
                if key in item:
                    temp.append(copy.deepcopy(item))
                    temp[-1][key] = val
                    continue
                temp[i][key] = val
    return temp
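# Worked example of the cross-product expansion:
#   subFun('n_neighbors', [1, 2], [])
#       -> [{'n_neighbors': 1}, {'n_neighbors': 2}]
#   subFun('algorithm', ['kd_tree', 'ball_tree'], <the list above>)
#       -> [{'n_neighbors': 1, 'algorithm': 'kd_tree'},
#           {'n_neighbors': 2, 'algorithm': 'kd_tree'},
#           {'n_neighbors': 1, 'algorithm': 'ball_tree'},
#           {'n_neighbors': 2, 'algorithm': 'ball_tree'}]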

# K-nearest neighbors
def getKNNParams():
    paramsList = []
    params = {'n_neighbors': [1, 2, 3, 4, 5], 'algorithm': ['kd_tree', 'ball_tree']}
    for key in params.keys():
        paramsList = subFun(key, params[key], paramsList)
    return paramsList

# Support vector machine
def getSvmParams():
    paramsList = []
    params = {'C': [10000.0], 'cache_size': [400], 'class_weight': ['balanced'],
              'gamma': [0.001, 0.0001], 'kernel': ['rbf'], 'max_iter': [-1],
              'probability': [True], 'random_state': [None], 'shrinking': [True],
              'tol': [0.001, 0.00001]}
    for key in params.keys():
        paramsList = subFun(key, params[key], paramsList)
    return paramsList

# Linear model (logistic regression)
def getLineParams():
    params = {'penalty': ['l2'], 'C': [1000000.0], 'solver': ['liblinear'],
              'max_iter': [1000000], 'dual': [True], 'class_weight': [{0: 0.25, 1: 0.7}],
              'tol': [0.0001], 'n_jobs': [1]}
    paramsList = []
    for key in params.keys():
        paramsList = subFun(key, params[key], paramsList)
    return paramsList

# Random forest
def getRFParams(maxFeatures):
    params = {'n_estimators': [10000], 'criterion': ['gini'], 'max_depth': [4],
              'max_features': [maxFeatures], 'bootstrap': [True], 'oob_score': [True],
              'n_jobs': [-1], 'class_weight': ['balanced', {0: 0.32, 1: 0.68}]}
    paramsList = []
    for key in params.keys():
        paramsList = subFun(key, params[key], paramsList)
    return paramsList

# Linear support vector machine
def getLinearSVCParams():
    params = {'loss': ['hinge'], 'intercept_scaling': [0.0005, 0.0001],
              'C': [1000.0, 2000.0], 'class_weight': ['balanced']}
    paramsList = []
    for key in params.keys():
        paramsList = subFun(key, params[key], paramsList)
    return paramsList

# Decision tree model
def testDtree(X_train, X_test, y_train, y_test, testDate):
    from sklearn.tree import DecisionTreeClassifier
    from sklearn.utils import class_weight
    weight = class_weight.compute_class_weight('balanced', models.CLASSES, y_train)
    # assumption: the balanced class weights are meant to be passed to the model
    model = DecisionTreeClassifier(max_depth=2, splitter='random',
                                   min_weight_fraction_leaf=0.25,
                                   class_weight=dict(zip(models.CLASSES, weight)))
    model.fit(X_train, y_train)
    pred = model.predict(X_test)
    return pred

# AdaBoost
def getADBParams(maxFeatures):
    from sklearn.ensemble import RandomForestClassifier
    params = {'base_estimator': [RandomForestClassifier(n_estimators=10000, criterion='gini',
                                                        max_depth=4, n_jobs=-1,
                                                        class_weight='balanced',
                                                        max_features=maxFeatures,
                                                        bootstrap=True, oob_score=True)],
              'algorithm': ['SAMME'], 'n_estimators': [500], 'learning_rate': [0.1]}
    paramsList = []
    for key in params.keys():
        paramsList = subFun(key, params[key], paramsList)
    return paramsList

# Gradient tree boosting
def getGBCParams():
    params = {'n_estimators': [600, 1000, 2000, 5000, 10000],
              'max_depth': [3, 4, 5, 6],
              'learning_rate': [0.2, 0.5, 0.8, 1.0]}
    paramsList = []
    for key in params.keys():
        paramsList = subFun(key, params[key], paramsList)
    return paramsList
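# Note: once expanded through subFun, this grid is 5 * 4 * 4 = 80 parameter
# combinations, i.e. a full sweep fits 80 GradientBoostingClassifier models.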

# First-stage model set. These five-argument calls assume train-and-predict
# variants of the test* helpers in the author's full models.py (the factory
# versions shown above only build the model from a params dict).
def modelSet(X_train, X_test, y_train, y_test, testDate, n_components=3):
    dTreeResult = models.testDtree(X_train, X_test, y_train, y_test, testDate)
    ldaResult = models.testLDA(X_train, X_test, y_train, y_test, testDate)
    lsvcResult = models.testLinearSVC(X_train, X_test, y_train, y_test, testDate)
    svmResult = models.testSvm(X_train, X_test, y_train, y_test, testDate)
    rfResult = models.testRF(X_train, X_test, y_train, y_test, testDate)
    lrResult = models.testLine(X_train, X_test, y_train, y_test, testDate)
    data = {'a': dTreeResult, 'b': ldaResult, 'c': lsvcResult,
            'd': svmResult, 'e': rfResult, 'f': lrResult}
    return pd.DataFrame(data, columns=['a', 'b', 'c', 'd', 'e', 'f'])

# Second-stage model set: ensemble learners on the first-stage predictions
def ensembleSet(X_train, X_test, y_train, y_test, testDate, n_components=6):
    # Get the PCA-reduced feature series
    X_train, X_test = standard.getDataStandard(X_train, X_test)  # L1-normalize the data
    # X_train, X_test = standard.dataNormalize(x_train=X_train, x_test=X_test)  # L2-normalize the data
    X_train, X_test = standard.getDataPCA(X_train, X_test, n_components=n_components)
    # assumption: these helpers live in models.py, like the ones used by modelSet
    adbResult = models.testADB(X_train, X_test, y_train, y_test, testDate)
    gbcResult = models.testGBC(X_train, X_test, y_train, y_test, testDate)
    vcResult = models.testVC(X_train, X_test, y_train, y_test, testDate)
    rfResult = models.testRF(X_train, X_test, y_train, y_test, testDate)
    data = {'a': adbResult, 'b': gbcResult, 'c': vcResult, 'd': rfResult}
    return pd.DataFrame(data, columns=['a', 'b', 'c', 'd'])

def batchTest():
    start = time.time()
    # readfileData is assumed to be a local data loader (not shown) that returns
    # the same dict layout as tools.splitDataSuper
    data = readfileData('f:/newData/601336')

    # Round 1: fit the weak (first-stage) classifiers
    X_train = data['train']['x_train']
    y_train = data['train']['y_train']
    testKey = 'trainTest_1'
    X_test = data[testKey]['x_test']
    y_test = data[testKey]['y_train']
    testDate = data[testKey]['sample']
    New_train = modelSet(X_train, X_test, y_train, y_test, testDate, n_components=200)
    y_train = y_test

    # Round 2: ensemble learning on the first-stage predictions
    testKey = 'trainTest_2'
    X_test = data[testKey]['x_test']
    y_test = data[testKey]['y_train']
    testDate = data[testKey]['sample']
    New_train = ensembleSet(New_train, X_test, y_train, y_test, testDate, n_components=6)
    y_train = y_test

    # Final round
    print('Final Turn.', y_train.shape, New_train.shape)
    X_test = data['finalTest']['x_test']
    y_test = data['finalTest']['y_train']
    testDate = data['finalTest']['sample']
    modelSet(New_train, X_test, y_train, y_test, testDate, n_components=4)
    end = time.time()
    print(end - start)

def searchBestParams(data):
    # Split the data into train and test sets (for supervised learning)
    data = tools.splitDataSuper(data)
    """
    resFeatureList = models.selectFeatureRandomTree(data['train']['x_train'], data['train']['y_train'])
    featureList = [item[1] for item in resFeatureList if item[0] >= 0.5]
    """
    featureList = ['M60_1300_EVM', 'M60_1400_EVM', 'cashDiff']
    X_train = data['train']['x_train']
    X_test = data['finalTest']['x_test']
    # X_train = tools.delInvalidFeature(X_train, featureList)
    # X_test = tools.delInvalidFeature(data['finalTest']['x_test'], featureList)
    y_train = data['train']['y_train']

    # Pick one parameter grid; only the last assignment takes effect,
    # so the unused grids are commented out to match the KNN model below
    # paramsList = getADBParams(len(X_train.columns.values))
    # paramsList = getLineParams()
    # paramsList = getSvmParams()
    # paramsList = getRFParams(len(X_train.columns.values))
    paramsList = getKNNParams()
    for params in paramsList:
        # model = models.testLine(params)
        # model = models.testSvm(params)
        # model = models.testRF(params)
        print(params)
        model = models.testKNN(params)
        yPred, yPredPro = models.runModel(model, X_train, X_test, y_train)
        tools.printRes(yPred, data['finalTest']['sample'])

if __name__ == "__main__":
    conn = tools.getDBConn('f:/stockDB/stock.db')
    # `filePath` was undefined here; assume the instrument code used in batchTest
    code = '601336'
    data = tools.getDataToDB(code, conn, sqlParams='Date > 20170101')
    conn.close()
    data = tools.getCycleData(data, cycle=['M5', 'M15', 'M30'])
    data = tools.createCurstomData(data, cycle='M60')
    searchBestParams(data)
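searchBestParams above hand-rolls the grid search; for comparison, scikit-learn's GridSearchCV runs the same sweep with cross-validation built in. A minimal sketch for the KNN grid from getKNNParams (X_train and y_train as built inside searchBestParams):

from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsClassifier

param_grid = {'n_neighbors': [1, 2, 3, 4, 5], 'algorithm': ['kd_tree', 'ball_tree']}
search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, scoring='f1_macro')
search.fit(X_train, y_train)
print(search.best_params_, search.best_score_)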
