Feature classification with the Scikit-learn machine learning library for vnpy futures quantitative trading (code)

The full code is below; I have also uploaded it to my GitHub.

Comments have been added. Warning messages may appear when you run it. For the surrounding framework, see the companion article that explains it.


# encoding: UTF-8
import warnings
warnings.filterwarnings("ignore")
from pymongo import MongoClient, ASCENDING
import pandas as pd
import numpy as np
from datetime import datetime
import talib
import matplotlib.pyplot as plt
import scipy.stats as st
from sklearn.model_selection import train_test_split
# LogisticRegression: logistic regression
from sklearn.linear_model import LogisticRegression
# DecisionTreeClassifier: decision tree
from sklearn.tree import DecisionTreeClassifier
# SVC: support vector classification
from sklearn.svm import SVC
# MLPClassifier: multi-layer perceptron (neural network)
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
class DataAnalyzerforSklearn(object):
    """
    This class prepares data for the SVM (and the other classifiers). The class label
    is derived from a linear regression on the slope of the next 6 bars.
    Rather than feeding raw HLOC prices, it uses the following features, which are
    independent of absolute price levels:
    1. Percentage change
    2. Standard deviation
    3. MACD
    4. CCI
    5. ATR
    6. Slope of the moving average before the bar
    7. RSI
    """
    def __init__(self, exportpath="C:\\Project\\", datformat=['datetime', 'high', 'low', 'open', 'close','volume']):
        self.mongohost = None
        self.mongoport = None
        self.db = None
        self.collection = None
        self.df = pd.DataFrame()
        self.exportpath = exportpath
        self.datformat = datformat
        self.startBar = 2
        self.endBar = 12
        self.step = 2
        self.pValue = 0.015
    #----------------------------------------- Data import -------------------------------------------------
    def db2df(self, db, collection, start, end, mongohost="localhost", mongoport=27017, export2csv=False):
        """读取MongoDB数据库行情记录,输出到Dataframe中"""
        self.mongohost = mongohost
        self.mongoport = mongoport
        self.db = db
        self.collection = collection
        dbClient = MongoClient(self.mongohost, self.mongoport, connectTimeoutMS=500)
        db = dbClient[self.db]
        cursor = db[self.collection].find({'datetime':{'$gte':start, '$lt':end}}).sort("datetime",ASCENDING)
        self.df = pd.DataFrame(list(cursor))
        self.df = self.df[self.datformat]
        self.df = self.df.reset_index(drop=True)
        path = self.exportpath + self.collection + ".csv"
        if export2csv == True:
            self.df.to_csv(path, index=True, header=True)
        return self.df
    def csv2df(self, csvpath, dataname="csv_data", export2csv=False):
        """读取csv行情数据,输入到Dataframe中"""
        csv_df = pd.read_csv(csvpath)
        self.df = csv_df[self.datformat]
        self.df["datetime"] = pd.to_datetime(self.df['datetime'])
        self.df = self.df.reset_index(drop=True)
        path = self.exportpath + dataname + ".csv"
        if export2csv == True:
            self.df.to_csv(path, index=True, header=True)
        return self.df
    def df2Barmin(self, inputdf, barmins, crossmin=1, export2csv=False):
        """输入分钟k线dataframe数据,合并多多种数据,例如三分钟/5分钟等,如果开始时间是9点1分,crossmin = 0;如果是9点0分,crossmin为1"""
        dfbarmin = pd.DataFrame()
        highBarMin = 0
        lowBarMin = 0
        openBarMin = 0
        volumeBarmin = 0
        datetime = 0
        for i in range(0, len(inputdf) - 1):
            bar = inputdf.iloc[i, :].to_dict()
            if openBarMin == 0:
                openBarMin = bar["open"]
            if highBarMin == 0:
                highBarMin = bar["high"]
            else:
                highBarMin = max(bar["high"], highBarMin)
            if lowBarMin == 0:
                lowBarMin = bar["low"]
            else:
                lowBarMin = min(bar["low"], lowBarMin)
            closeBarMin = bar["close"]
            datetime = bar["datetime"]
            volumeBarmin += int(bar["volume"])
            # The X-minute window is complete once the minute count is divisible by barmins
            if not (bar["datetime"].minute + crossmin) % barmins:
                # Emit the aggregated X-minute bar
                barMin = {'datetime': datetime, 'high': highBarMin, 'low': lowBarMin, 'open': openBarMin,
                          'close': closeBarMin, 'volume': volumeBarmin}
                # DataFrame.append was removed in pandas 2.0, so use concat instead
                dfbarmin = pd.concat([dfbarmin, pd.DataFrame([barMin])], ignore_index=True)
                highBarMin = 0
                lowBarMin = 0
                openBarMin = 0
                volumeBarmin = 0
        if export2csv == True:
            dfbarmin.to_csv(self.exportpath + "bar" + str(barmins)+ str(self.collection) + ".csv", index=True, header=True)
        return dfbarmin
    #----------------------------------------- Indicator calculation -------------------------------------------------
    def dfcci(self, inputdf, n, export2csv=True):
        """调用talib方法计算CCI指标,写入到df并输出"""
        dfcci = inputdf
        dfcci["cci"] = None
        for i in range(n, len(inputdf)):
            df_ne = inputdf.loc[i - n + 1:i, :]
            cci = talib.CCI(np.array(df_ne["high"]), np.array(df_ne["low"]), np.array(df_ne["close"]), n)
            dfcci.loc[i, "cci"] = cci[-1]
        dfcci = dfcci.fillna(0)
        dfcci = dfcci.replace(np.inf, 0)
        if export2csv == True:
            dfcci.to_csv(self.exportpath + "dfcci" + str(self.collection) + ".csv", index=True, header=True)
        return dfcci
    def dfatr(self, inputdf, n, export2csv=True):
        """调用talib方法计算ATR指标,写入到df并输出"""
        dfatr = inputdf
        for i in range((n+1), len(inputdf)):
            df_ne = inputdf.loc[i - n :i, :]
            atr = talib.ATR(np.array(df_ne["high"]), np.array(df_ne["low"]), np.array(df_ne["close"]), n)
            dfatr.loc[i, "atr"] = atr[-1]
        dfatr = dfatr.fillna(0)
        dfatr = dfatr.replace(np.inf, 0)
        if export2csv == True:
            dfatr.to_csv(self.exportpath + "dfatr" + str(self.collection) + ".csv", index=True, header=True)
        return dfatr
    def dfrsi(self, inputdf, n, export2csv=True):
        """调用talib方法计算ATR指标,写入到df并输出"""
        dfrsi = inputdf
        dfrsi["rsi"] = None
        for i in range(n+1, len(inputdf)):
            df_ne = inputdf.loc[i - n :i, :]
            rsi = talib.RSI(np.array(df_ne["close"]), n)
            dfrsi.loc[i, "rsi"] = rsi[-1]
        dfrsi = dfrsi.fillna(0)
        dfrsi = dfrsi.replace(np.inf, 0)
        if export2csv == True:
            dfrsi.to_csv(self.exportpath + "dfrsi" + str(self.collection) + ".csv", index=True, header=True)
        return dfrsi
    def Percentage(self, inputdf, export2csv=True):
        """调用talib方法计算CCI指标,写入到df并输出"""
        dfPercentage = inputdf
        # dfPercentage["Percentage"] = None
        for i in range(1, len(inputdf)):
            # if dfPercentage.loc[i,"close"]>dfPercentage.loc[i,"open"]:
            #     percentage = ((dfPercentage.loc[i,"high"] - dfPercentage.loc[i-1,"close"])/ dfPercentage.loc[i-1,"close"])*100
            # else:
            #     percentage = (( dfPercentage.loc[i,"low"] - dfPercentage.loc[i-1,"close"] )/ dfPercentage.loc[i-1,"close"])*100
            if dfPercentage.loc[ i - 1, "close"] == 0.0:
                percentage = 0
            else:
                percentage = ((dfPercentage.loc[i, "close"] - dfPercentage.loc[i - 1, "close"]) / dfPercentage.loc[ i - 1, "close"]) * 100.0
            dfPercentage.loc[i, "Perentage"] = percentage
        dfPercentage = dfPercentage.fillna(0)
        dfPercentage = dfPercentage.replace(np.inf, 0)
        if export2csv == True:
            dfPercentage.to_csv(self.exportpath + "Percentage_" + str(self.collection) + ".csv", index=True, header=True)
        return dfPercentage
    def dfMACD(self, inputdf, n, export2csv=False):
        """调用talib方法计算MACD指标,写入到df并输出"""
        dfMACD = inputdf
        for i in range(n, len(inputdf)):
            df_ne = inputdf.loc[i - n + 1:i, :]
            macd,signal,hist = talib.MACD(np.array(df_ne["close"]),12,26,9)
            dfMACD.loc[i, "macd"] = macd[-1]
            dfMACD.loc[i, "signal"] = signal[-1]
            dfMACD.loc[i, "hist"] = hist[-1]
        dfMACD = dfMACD.fillna(0)
        dfMACD = dfMACD.replace(np.inf, 0)
        if export2csv == True:
            dfMACD.to_csv(self.exportpath + "macd" + str(self.collection) + ".csv", index=True, header=True)
        return dfMACD
    def dfSTD(self, inputdf, n, export2csv=False):
        """调用talib方法计算MACD指标,写入到df并输出"""
        dfSTD = inputdf
        for i in range(n, len(inputdf)):
            df_ne = inputdf.loc[i - n + 1:i, :]
            std = talib.STDDEV(np.array(df_ne["close"]),n)
            dfSTD.loc[i, "std"] = std[-1]
        dfSTD = dfSTD.fillna(0)
        dfSTD = dfSTD.replace(np.inf, 0)
        if export2csv == True:
            dfSTD.to_csv(self.exportpath + "dfSTD" + str(self.collection) + ".csv", index=True, header=True)
        return dfSTD
    #----------------------------------------- Trend labelling -------------------------------------------------
    def addTrend(self, inputdf,  trendsetp=6, export2csv=False):
        """以未来6个bar的斜率线性回归为判断分类是否正确"""
        dfTrend = inputdf
        for i in range(1, len(dfTrend) - trendsetp-1):
            histRe = np.array(dfTrend["close"])[i:i+trendsetp]
            xAixs = np.arange(trendsetp) + 1
            res = st.linregress(y=histRe, x=xAixs)
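            # Label 1 for a statistically significant up-slope, -1 for a significant
            # down-slope; bars with no clear trend stay NaN and are filled with 0 below.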
            if res.pvalue < self.pValue+0.01:
                if res.slope > 0.5:
                    dfTrend.loc[i,"tradeindictor"] = 1
                elif res.slope < -0.5:
                    dfTrend.loc[i, "tradeindictor"] = -1
        dfTrend = dfTrend.fillna(0)
        dfTrend = dfTrend.replace(np.inf, 0)
        if export2csv == True:
            dfTrend.to_csv(self.exportpath + "addTrend" + str(self.collection) + ".csv", index=True, header=True)
        return dfTrend
def GirdValuate(X_train, y_train):
    """1)LogisticRegression
    逻辑回归
    2)DecisionTreeClassifier
    决策树
    3)SVC
    支持向量分类
    4)MLP
    神经网络"""
    clf_DT=DecisionTreeClassifier()
    param_grid_DT= {'max_depth': [1,2,3,4,5,6]}
    clf_Logit=LogisticRegression()
    param_grid_logit= {'solver': ['liblinear','lbfgs','newton-cg','sag']}
    clf_svc=SVC()
    param_grid_svc={'kernel':('linear', 'poly', 'rbf', 'sigmoid'),
                    'C':[1, 2, 4],
                    'gamma':[0.125, 0.25, 0.5 ,1, 2, 4]}
    clf_mlp = MLPClassifier()
    param_grid_mlp= {"hidden_layer_sizes": [(100,), (100, 30)],
                                 "solver": ['adam', 'sgd', 'lbfgs'],
                                 "max_iter": [20],
                                 "verbose": [False]
                                 }
    #打包参数集合
    clf=[clf_DT,clf_Logit,clf_mlp,clf_svc]
    param_grid=[param_grid_DT,param_grid_logit,param_grid_mlp,param_grid_svc]
    from sklearn.model_selection import StratifiedKFold  # cross-validation splitter
    kflod = StratifiedKFold(n_splits=10, shuffle=True, random_state=7)  # split the data into 10 mutually exclusive stratified folds; convenient for parallel evaluation
    # Grid search over each classifier
    for i in range(0,4):
        grid=GridSearchCV(clf[i], param_grid[i], scoring='accuracy',n_jobs = -1,cv = kflod)
        grid.fit(X_train, y_train)
        print (grid.best_params_,': ',grid.best_score_)
if __name__ == '__main__':
    # Load the data
    # exportpath = "C:\\Users\shui0\OneDrive\Documents\Project\\"
    exportpath = "C:\Project\\"
    DA = DataAnalyzerforSklearn(exportpath)
    # Import from the MongoDB database
    start = datetime.strptime("20180501", '%Y%m%d')
    end = datetime.strptime("20190501", '%Y%m%d')
    df = DA.db2df(db="VnTrader_1Min_Db", collection="rb8888", start = start, end = end)
    df5min = DA.df2Barmin(df, 5)
    df5minAdd = DA.addTrend(df5min, export2csv=True)
    df5minAdd = DA.dfMACD(df5minAdd, n=34, export2csv=True)
    df5minAdd = DA.dfatr(df5minAdd, n=25, export2csv=True)
    df5minAdd = DA.dfrsi(df5minAdd, n=35, export2csv=True)
    df5minAdd = DA.dfcci(df5minAdd,n = 30,export2csv=True)
    df5minAdd = DA.dfSTD(df5minAdd, n=30, export2csv=True)
    df5minAdd = DA.Percentage(df5minAdd,export2csv = True)
    # Split into training and test sets
    df_test = df5minAdd.loc[60:, :]        # start from row 60; earlier rows are mostly zero-filled
    y = np.array(df_test["tradeindictor"]) # keep only the trend label, converted to an array
    X = df_test.drop(["tradeindictor","close","datetime","high","low","open","volume"], axis=1).values # drop HLOC etc. and keep only the feature columns, converted to an array
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0) # 70/30 split
    print("Training set size: %s, test set size: %s" % (len(X_train), len(X_test)))
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import SelectPercentile
    from sklearn.feature_selection import mutual_info_classif
    # Feature selection: SelectPercentile keeps the top 70% of features by score; SelectKBest can be used instead to keep a fixed number of features.
    print(X_train.shape)
    selectPer = SelectPercentile(mutual_info_classif, percentile=70)
    # selectPer = SelectKBest(mutual_info_classif, k=7)
    X_train = selectPer.fit_transform(X_train, y_train)
    print(X_train.shape)
    X_test = selectPer.transform(X_test)
    # Alternatively, select features by false positive rate (SelectFpr)
    # selectFea=SelectFpr(alpha=0.01)
    # X_train_new = selectFea.fit_transform(X_train, y_train)
    # X_test_new = selectFea.transform(X_test)
    # Evaluate the candidate models below, tuning hyper-parameters with a grid search
    GirdValuate(X_train,y_train)
    # Take the best model from the grid search and evaluate it on the test set
    # - Prediction: model.predict()
    # - Accuracy:   metrics.accuracy_score()
    # - Precision:  metrics.precision_score()
    # - Recall:     metrics.recall_score()
    from sklearn import metrics
    clf_selected = MLPClassifier(hidden_layer_sizes=(100, 30), max_iter=20, solver='adam') # fill in the best model and parameters found by the grid search
    # {'hidden_layer_sizes': (100, 30), 'max_iter': 20, 'solver': 'adam', 'verbose': False} :  0.9897016507648039
    clf_selected.fit(X_train, y_train)
    y_pred = clf_selected.predict(X_test)
    #accuracy
    accuracy=metrics.accuracy_score(y_true=y_test, y_pred=y_pred)
    print ('accuracy:',accuracy)
    #precision
    precision=metrics.precision_score(y_true=y_test, y_pred=y_pred,average="micro")
    print ('precision:',precision)
    #recall
    recall=metrics.recall_score(y_true=y_test, y_pred=y_pred,average="micro")
    print ('recall:',recall)
    # Actual vs. predicted values
    print (y_test)
    print (y_pred)
    dfresult = pd.DataFrame({'Actual':y_test,'Predict':y_pred})
    dfresult.to_csv(exportpath + "result"  + ".csv", index=True, header=True)
    import joblib  # in older scikit-learn: from sklearn.externals import joblib
    # Save the model to disk
    joblib.dump(clf_selected, 'clf_selected.m')
    # Restore the model
    clf_tmp = joblib.load('clf_selected.m')


Sample run output:

Training set size: 11673, test set size: 5003
(11673, 8)
(11673, 5)
accuracy: 0.7833300019988008
precision: 0.7833300019988008
recall: 0.7833300019988008
[ 1.  0.  0. ...  0.  0. -1.]
[0. 0. 0. ... 0. 0. 0.]
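The shape change from (11673, 8) to (11673, 5) shows that SelectPercentile kept 5 of the 8 feature columns. To see which columns survived, scikit-learn's get_support() returns the selection mask; a small snippet (not part of the original script, assumed to run right after the fit_transform call above) could look like this:

    # Map the boolean selection mask back to the original feature column names
    feature_names = df_test.drop(["tradeindictor", "close", "datetime", "high",
                                  "low", "open", "volume"], axis=1).columns
    selected = feature_names[selectPer.get_support()]
    print("Selected features:", list(selected))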


To use this in vnpy, briefly: in the strategy's init method, load the model with clf_tmp = joblib.load('clf_selected.m'); then, in the onXminBar method, compute the same feature values with ArrayManager and call clf_selected.predict() to get the predicted class. Open a long position on 1, a short position on -1, and do nothing on 0. A minimal sketch of such a strategy follows.
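The sketch below is only an illustration: it assumes the vnpy 1.x-style CtaTemplate/ArrayManager API (onInit, onXminBar, am.macd/atr/rsi/cci/std), a hypothetical strategy class name MLSignalStrategy, and omits the usual BarGenerator wiring and order management. In practice you would also need to persist and re-apply the SelectPercentile transformer so that the live feature vector matches the 5 columns the model was trained on. Adjust the names and import path to your vnpy version.

# Illustrative sketch only -- vnpy 1.x naming is assumed; adapt to your installation.
from vnpy.trader.app.ctaStrategy.ctaTemplate import CtaTemplate, ArrayManager  # vnpy 1.x import path
import joblib   # older scikit-learn: from sklearn.externals import joblib
import numpy as np

class MLSignalStrategy(CtaTemplate):                # hypothetical strategy class
    def __init__(self, ctaEngine, setting):
        super(MLSignalStrategy, self).__init__(ctaEngine, setting)
        self.am = ArrayManager(size=100)            # ring buffer for indicator calculation
        self.clf = None

    def onInit(self):
        """Load the trained classifier (and, ideally, the saved feature selector)."""
        self.clf = joblib.load('clf_selected.m')
        # self.selector = joblib.load('selector.m')   # if selectPer was also dumped
        self.putEvent()

    def onXminBar(self, bar):
        """Rebuild the training features for the latest bar and trade on the prediction."""
        am = self.am
        am.updateBar(bar)
        if not am.inited or self.clf is None:
            return
        # Feature order must match the training DataFrame after drop():
        # macd, signal, hist, atr, rsi, cci, std, percentage
        macd, signal, hist = am.macd(12, 26, 9)
        atr = am.atr(25)
        rsi = am.rsi(35)
        cci = am.cci(30)
        std = am.std(30)
        close = am.close
        percentage = (close[-1] - close[-2]) / close[-2] * 100.0
        features = np.array([[macd, signal, hist, atr, rsi, cci, std, percentage]])
        # features = self.selector.transform(features)   # re-apply the saved feature selection
        pred = self.clf.predict(features)[0]
        if pred == 1 and self.pos == 0:
            self.buy(bar.close + 1, 1)              # prediction 1: open a long position
        elif pred == -1 and self.pos == 0:
            self.short(bar.close - 1, 1)            # prediction -1: open a short position
        # prediction 0: do nothing
        self.putEvent()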

Source: http://blog.itpub.net/22259926/viewspace-2648828/