Multi-output XGBoost parameter tuning: selecting the optimal parameters via cross-validation (predicting multiple y at once)

import numpy as np
import pandas as pd
from sklearn.multioutput import MultiOutputRegressor
import matplotlib.pyplot as plt

# Widen pandas' display limits so printed frames are not truncated with ellipses
pd.set_option('display.max_columns', 1000)
pd.set_option('display.width', 1000)
pd.set_option('display.max_colwidth', 1000)

# Number of folds for cross-validation. Note: KFold only accepts random_state
# when shuffle=True, so the shuffled variant is kept here (kf is the plain
# alternative; the time-series split below is what cv_mae actually uses).
from sklearn.model_selection import cross_val_score, KFold
kf = KFold(n_splits=10, random_state=42, shuffle=True)
# Time-series split: training windows only ever look backwards in time
from sklearn.model_selection import TimeSeriesSplit
tscv = TimeSeriesSplit(max_train_size=None, n_splits=10)
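
# For intuition, the expanding-window folds tscv will produce can be printed
# directly; X_demo below is a hypothetical toy array, used only for illustration:
X_demo = np.arange(24).reshape(-1, 1)
for i, (tr, te) in enumerate(tscv.split(X_demo)):
    print('fold {0}: train=[{1}..{2}]  test=[{3}..{4}]'.format(i, tr.min(), tr.max(), te.min(), te.max()))
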
def cv_mae(model, train_X, train_y):
    # Mean cross-validated MAE over the time-series folds;
    # cross_val_score returns the negated MAE, so flip the sign back
    score = np.mean(-cross_val_score(model, train_X, train_y, scoring="neg_mean_absolute_error", cv=tscv))
    return score
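
# A quick sanity check of cv_mae on synthetic multi-output data; the arrays are
# hypothetical, and LinearRegression merely stands in for the XGBoost model tuned below:
from sklearn.linear_model import LinearRegression
rng = np.random.RandomState(0)
X_check = rng.rand(60, 3)
y_check = rng.rand(60, 2)  # two target columns, i.e. multiple y at once
print('demo CV MAE: {0}'.format(cv_mae(LinearRegression(), X_check, y_check)))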

# Plot: parameter value vs. cross-validated MAE
def parameter_plot(x_list, y_list, x_title, y_title, plot_name):
    # Line plot for the parameter sweep
    plt.plot(x_list, y_list, marker='o')
    plt.xlabel(x_title)
    plt.ylabel(y_title)
    plt.title(plot_name)
    plt.show()


# Parameter optimization: XGB
def parameter_optimize_xgb(train_X, train_y):
    import xgboost as xgb
    # Number of regression trees
    cv_mae_list = []
    n_estimator_list = []
    # Brute-force search; best value found: n_estimators = 70
    n_estimators = list(range(20, 210, 10))
    # for n_estimator in n_estimators:
    #     initialize_params = {'learning_rate': 0.1, 'n_estimators': n_estimator, 'max_depth': 3, 'min_child_weight': 1, 'seed': 0,
    #                     'subsample': 1.0, 'colsample_bytree': 1.0, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}
    #     xgb_rg = xgb.XGBRegressor(**initialize_params)
    #     mult_xgb = MultiOutputRegressor(xgb_rg)
    #     cv_mae_xgb = cv_mae(mult_xgb, train_X, train_y)
    #     cv_mae_list.append(cv_mae_xgb)
    #     n_estimator_list.append(n_estimator)
    #     print('n_estimators: {0}  cross-validated MAE: {1}'.format(n_estimator, cv_mae_xgb))
    # parameter_plot(n_estimator_list, cv_mae_list, 'n_estimators', 'CV_MAE', 'n_estimators parameter optimization')

    # Brute-force search; best value found: learning_rate = 0.21
    #n_learning_rate = list(np.linspace(0.01, 2, 20))
    # n_learning_rate = list(np.arange(0.01, 2, 0.1))
    # for n_estimator in n_learning_rate:
    #     initialize_params = {'learning_rate': n_estimator, 'n_estimators': 70, 'max_depth': 3, 'min_child_weight': 1, 'seed': 0,
    #                     'subsample': 1.0, 'colsample_bytree': 1.0, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}
    #     xgb_rg = xgb.XGBRegressor(**initialize_params)
    #     mult_xgb = MultiOutputRegressor(xgb_rg)
    #     cv_mae_xgb = cv_mae(mult_xgb, train_X, train_y)
    #     cv_mae_list.append(cv_mae_xgb)
    #     n_estimator_list.append(n_estimator)
    #     print('learning_rate: {0}  cross-validated MAE: {1}'.format(n_estimator, cv_mae_xgb))
    # parameter_plot(n_estimator_list, cv_mae_list, 'learning_rate', 'CV_MAE', 'learning_rate parameter optimization')

    # Brute-force search; best value found: max_depth = 4
    # parameter_name = 'max_depth'
    # max_depth = list(np.arange(3, 20, 1))
    # for n_estimator in max_depth:
    #     initialize_params = {'learning_rate': 0.21, 'n_estimators': 70, 'max_depth': n_estimator, 'min_child_weight': 1, 'seed': 0,
    #                     'subsample': 1.0, 'colsample_bytree': 1.0, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}
    #     xgb_rg = xgb.XGBRegressor(**initialize_params)
    #     mult_xgb = MultiOutputRegressor(xgb_rg)
    #     cv_mae_xgb = cv_mae(mult_xgb, train_X, train_y)
    #     cv_mae_list.append(cv_mae_xgb)
    #     n_estimator_list.append(n_estimator)
    #     print(parameter_name + ': {0}  cross-validated MAE: {1}'.format(n_estimator, cv_mae_xgb))
    # parameter_plot(n_estimator_list, cv_mae_list, parameter_name, 'CV_MAE', parameter_name + ' parameter optimization')

    # Brute-force search; best value found: min_child_weight = 0.5
    # parameter_name = 'min_child_weight'
    # min_child_weight = list(np.arange(0, 2, 0.1))
    # for n_estimator in min_child_weight:
    #     initialize_params = {'learning_rate': 0.21, 'n_estimators': 70, 'max_depth': 4, 'min_child_weight': n_estimator, 'seed': 0,
    #                     'subsample': 1.0, 'colsample_bytree': 1.0, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}
    #     xgb_rg = xgb.XGBRegressor(**initialize_params)
    #     mult_xgb = MultiOutputRegressor(xgb_rg)
    #     cv_mae_xgb = cv_mae(mult_xgb, train_X, train_y)
    #     cv_mae_list.append(cv_mae_xgb)
    #     n_estimator_list.append(n_estimator)
    #     print(parameter_name + ': {0}  cross-validated MAE: {1}'.format(n_estimator, cv_mae_xgb))
    # parameter_plot(n_estimator_list, cv_mae_list, parameter_name, 'CV_MAE', parameter_name + ' parameter optimization')

    # Brute-force search; best value found: subsample = 1
    # parameter_name = 'subsample'
    # subsample = list(np.arange(0.1, 1.1, 0.1))
    # for n_estimator in subsample:
    #     n_estimator = round(n_estimator, 1)
    #     initialize_params = {'learning_rate': 0.21, 'n_estimators': 70, 'max_depth': 4, 'min_child_weight': 0.5, 'seed': 0,
    #                     'subsample': n_estimator, 'colsample_bytree': 1.0, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}
    #     xgb_rg = xgb.XGBRegressor(**initialize_params)
    #     mult_xgb = MultiOutputRegressor(xgb_rg)
    #     cv_mae_xgb = cv_mae(mult_xgb, train_X, train_y)
    #     cv_mae_list.append(cv_mae_xgb)
    #     n_estimator_list.append(n_estimator)
    #     print(parameter_name + ': {0}  cross-validated MAE: {1}'.format(n_estimator, cv_mae_xgb))
    # parameter_plot(n_estimator_list, cv_mae_list, parameter_name, 'CV_MAE', parameter_name + ' parameter optimization')

    # Brute-force search; best value found: colsample_bytree = 1
    # parameter_name = 'colsample_bytree'
    # colsample_bytree = list(np.arange(0.1, 1.1, 0.1))
    # for n_estimator in colsample_bytree:
    #     n_estimator = round(n_estimator, 1)
    #     initialize_params = {'learning_rate': 0.21, 'n_estimators': 70, 'max_depth': 4, 'min_child_weight': 0.5, 'seed': 0,
    #                     'subsample': 1, 'colsample_bytree': n_estimator, 'gamma': 0, 'reg_alpha': 0, 'reg_lambda': 1}
    #     xgb_rg = xgb.XGBRegressor(**initialize_params)
    #     mult_xgb = MultiOutputRegressor(xgb_rg)
    #     cv_mae_xgb = cv_mae(mult_xgb, train_X, train_y)
    #     cv_mae_list.append(cv_mae_xgb)
    #     n_estimator_list.append(n_estimator)
    #     print(parameter_name + ': {0}  cross-validated MAE: {1}'.format(n_estimator, cv_mae_xgb))
    # parameter_plot(n_estimator_list, cv_mae_list, parameter_name, 'CV_MAE', parameter_name + ' parameter optimization')

    # Brute-force search; best value found: gamma = 0.2
    # parameter_name = 'gamma'
    # gamma = list(np.arange(0.1, 2, 0.1))
    # for n_estimator in gamma:
    #     n_estimator = round(n_estimator, 1)
    #     initialize_params = {'learning_rate': 0.21, 'n_estimators': 70, 'max_depth': 4, 'min_child_weight': 0.5,
    #                          'seed': 0,
    #                          'subsample': 1, 'colsample_bytree': 1, 'gamma': n_estimator, 'reg_alpha': 0,
    #                          'reg_lambda': 1}
    #     xgb_rg = xgb.XGBRegressor(**initialize_params)
    #     mult_xgb = MultiOutputRegressor(xgb_rg)
    #     cv_mae_xgb = cv_mae(mult_xgb, train_X, train_y)
    #     cv_mae_list.append(cv_mae_xgb)
    #     n_estimator_list.append(n_estimator)
    #     print(parameter_name + ': {0}  cross-validated MAE: {1}'.format(n_estimator, cv_mae_xgb))
    # parameter_plot(n_estimator_list, cv_mae_list, parameter_name, 'CV_MAE', parameter_name + ' parameter optimization')

    # Brute-force search; best value found: reg_alpha = 1.8
    # parameter_name = 'reg_alpha'
    # reg_alpha = list(np.arange(0.01, 3, 0.2))
    # for n_estimator in reg_alpha:
    #     n_estimator = round(n_estimator, 1)
    #     initialize_params = {'learning_rate': 0.21, 'n_estimators': 70, 'max_depth': 4, 'min_child_weight': 0.5,
    #                          'seed': 0, 'subsample': 1, 'colsample_bytree': 1, 'gamma': 0.2, 'reg_alpha': n_estimator, 'reg_lambda': 1}
    #     xgb_rg = xgb.XGBRegressor(**initialize_params)
    #     mult_xgb = MultiOutputRegressor(xgb_rg)
    #     cv_mae_xgb = cv_mae(mult_xgb, train_X, train_y)
    #     cv_mae_list.append(cv_mae_xgb)
    #     n_estimator_list.append(n_estimator)
    #     print(parameter_name + ': {0}  cross-validated MAE: {1}'.format(n_estimator, cv_mae_xgb))
    # parameter_plot(n_estimator_list, cv_mae_list, parameter_name, 'CV_MAE', parameter_name + ' parameter optimization')

    # Brute-force search; best value found: reg_lambda = 9.5
    parameter_name = 'reg_lambda'
    reg_lambda = list(np.arange(0.01, 20, 0.5))
    for param_value in reg_lambda:
        param_value = round(param_value, 1)
        initialize_params = {'learning_rate': 0.21, 'n_estimators': 70, 'max_depth': 4, 'min_child_weight': 0.5,
                             'seed': 0, 'subsample': 1, 'colsample_bytree': 1, 'gamma': 0.2, 'reg_alpha': 1.8,
                             'reg_lambda': param_value}
        xgb_rg = xgb.XGBRegressor(**initialize_params)
        mult_xgb = MultiOutputRegressor(xgb_rg)
        cv_mae_xgb = cv_mae(mult_xgb, train_X, train_y)
        cv_mae_list.append(cv_mae_xgb)
        n_estimator_list.append(param_value)
        print(parameter_name + ': {0}  cross-validated MAE: {1}'.format(param_value, cv_mae_xgb))
    parameter_plot(n_estimator_list, cv_mae_list, parameter_name, 'CV_MAE', parameter_name + ' parameter optimization')

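# The one-parameter-at-a-time loops above could also be replaced by sklearn's
# GridSearchCV; a sketch, not from the original post (the grid values are
# examples, and train_X / train_y are assumed to be prepared elsewhere). The
# estimator__ prefix routes each parameter through MultiOutputRegressor to the
# wrapped XGBRegressor.
def grid_search_xgb(train_X, train_y):
    import xgboost as xgb
    from sklearn.model_selection import GridSearchCV
    mult_xgb = MultiOutputRegressor(xgb.XGBRegressor(seed=0))
    param_grid = {'estimator__n_estimators': [50, 70, 90],
                  'estimator__learning_rate': [0.1, 0.21, 0.3],
                  'estimator__max_depth': [3, 4, 5]}
    gs = GridSearchCV(mult_xgb, param_grid, scoring='neg_mean_absolute_error', cv=tscv)
    gs.fit(train_X, train_y)
    print('best params: {0}  best CV MAE: {1}'.format(gs.best_params_, -gs.best_score_))
    return gs.best_estimator_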

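# With all parameters tuned, the final multi-output model can be assembled as
# below; a sketch (fit_final_xgb is a new helper, not from the original post;
# the parameter values are the optima found in the sweeps above).
def fit_final_xgb(train_X, train_y):
    import xgboost as xgb
    best_params = {'learning_rate': 0.21, 'n_estimators': 70, 'max_depth': 4,
                   'min_child_weight': 0.5, 'seed': 0, 'subsample': 1,
                   'colsample_bytree': 1, 'gamma': 0.2, 'reg_alpha': 1.8,
                   'reg_lambda': 9.5}
    model = MultiOutputRegressor(xgb.XGBRegressor(**best_params))
    model.fit(train_X, train_y)
    return model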