【读书向】阿里云天池大赛赛题解析——参数优化

【读书向】阿里云天池大赛赛题解析——可视化

目录

  • 【读书向】阿里云天池大赛赛题解析——可视化
  • 网格搜索
    • lgb调参
  • 分类
    • 穷举网格搜索
    • 随机参数优化
  • LightGBM 网格调参


Github 完整代码链接

网格搜索

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier


# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.5, random_state=0)

# model 
clf = RandomForestClassifier(n_jobs=-1)

# Set the parameters by cross-validation

tuned_parameters = {
     
                    'n_estimators': [50, 100, 200]
#                     ,'criterion': ['gini', 'entropy']
#                     ,'max_depth': [2, 5]
#                     ,'max_features': ['log2', 'sqrt', 'int']
#                     ,'bootstrap': [True, False]
#                     ,'warm_start': [True, False]
                    }

scores = ['precision']

for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()

    clf = GridSearchCV(clf, tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)

    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()

    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                    iris.target,
                                                    random_state=0)
print("Size of training set:{} size of testing set:{}".format(
    X_train.shape[0], X_test.shape[0]))

####   grid search start
best_score = 0
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        svm = SVC(gamma=gamma, C=C)  #对于每种参数可能的组合,进行一次训练;
        svm.fit(X_train, y_train)
        score = svm.score(X_test, y_test)
        if score > best_score:  #找到表现最好的参数
            best_score = score
            best_parameters = {
     'gamma': gamma, 'C': C}
####   grid search end
print("Best score:{:.2f}".format(best_score))
print("Best parameters:{}".format(best_parameters))
X_trainval,X_test,y_trainval,y_test = train_test_split(iris.data,iris.target,random_state=0)
X_train,X_val,y_train,y_val = train_test_split(X_trainval,y_trainval,random_state=1)
print("Size of training set:{} size of validation set:{} size of testing set:{}".format(X_train.shape[0],X_val.shape[0],X_test.shape[0]))

best_score = 0.0
for gamma in [0.001,0.01,0.1,1,10,100]:
    for C in [0.001,0.01,0.1,1,10,100]:
        svm = SVC(gamma=gamma,C=C)
        svm.fit(X_train,y_train)
        score = svm.score(X_val,y_val)
        if score > best_score:
            best_score = score
            best_parameters = {
     'gamma':gamma,'C':C}
svm = SVC(**best_parameters) #使用最佳参数,构建新的模型
svm.fit(X_trainval,y_trainval) #使用训练集和验证集进行训练,more data always results in good performance.
test_score = svm.score(X_test,y_test) # evaluation模型评估
print("Best score on validation set:{:.2f}".format(best_score))
print("Best parameters:{}".format(best_parameters))
print("Best score on test set:{:.2f}".format(test_score))
from sklearn.model_selection import cross_val_score

best_score = 0.0
for gamma in [0.001,0.01,0.1,1,10,100]:
    for C in [0.001,0.01,0.1,1,10,100]:
        svm = SVC(gamma=gamma,C=C)
        scores = cross_val_score(svm,X_trainval,y_trainval,cv=5) #5折交叉验证
        score = scores.mean() #取平均数
        if score > best_score:
            best_score = score
            best_parameters = {
     "gamma":gamma,"C":C}
svm = SVC(**best_parameters)
svm.fit(X_trainval,y_trainval)
test_score = svm.score(X_test,y_test)
print("Best score on validation set:{:.2f}".format(best_score))
print("Best parameters:{}".format(best_parameters))
print("Score on testing set:{:.2f}".format(test_score))

交叉验证经常与网格搜索进行结合,作为参数评价的一种方法,这种方法叫做grid search with cross validation。sklearn因此设计了一个这样的类GridSearchCV,这个类实现了fit,predict,score等方法,被当做了一个estimator,使用fit方法,该过程中:(1)搜索到最佳参数;(2)实例化了一个最佳参数的estimator;

from sklearn.model_selection import GridSearchCV

#把要调整的参数以及其候选值 列出来;
param_grid = {
     "gamma":[0.001,0.01,0.1,1,10,100],
             "C":[0.001,0.01,0.1,1,10,100]}
print("Parameters:{}".format(param_grid))

grid_search = GridSearchCV(SVC(),param_grid,cv=5) #实例化一个GridSearchCV类
X_train,X_test,y_train,y_test = train_test_split(iris.data,iris.target,random_state=10)
grid_search.fit(X_train,y_train) #训练,找到最优的参数,同时使用最优的参数实例化一个新的SVC estimator。
print("Test set score:{:.2f}".format(grid_search.score(X_test,y_test)))
print("Best parameters:{}".format(grid_search.best_params_))
print("Best score on train set:{:.2f}".format(grid_search.best_score_))

lgb调参

clf = lgb.LGBMRegressor(num_leaves=31)

parameters = {
     'learning_rate': [0.01, 0.1, 1], 'n_estimators': [20, 40]}
clf = GridSearchCV(clf, parameters, cv=5)
clf.fit(train_data, train_target)

print('Best parameters found by grid search are:', clf.best_params_)
score_test = mean_squared_error(test_target, clf.predict(test_data))
print("LGBMRegressor GridSearchCV test MSE:   ", score_test)
# 使用lr_reg和lgb_reg进行融合预测
clf_list = [lr_reg, lgb_reg]

##很容易过拟合
pred = stacking_pred(x_train, y_train, x_valid, kf, clf_list, label_split=None, clf_fin="lgb", if_concat_origin=True)

分类

穷举网格搜索

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split  # 切分数据
# 切分数据 训练数据80% 验证数据20%
train_data, test_data, train_target, test_target = train_test_split(
    train, target, test_size=0.2, random_state=0)

model = RandomForestClassifier()
parameters = {
     'n_estimators': [20, 50, 100], 'max_depth': [1, 2, 3]}

clf = GridSearchCV(model, parameters, cv=3, verbose=2)
clf.fit(train_data, train_target)

score_test = roc_auc_score(test_target, clf.predict(test_data))

print("RandomForestClassifier GridSearchCV test AUC:   ", score_test)
print("最优参数:")
print(clf.best_params_)
sorted(clf.cv_results_.keys())

随机参数优化

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split  # 切分数据
# 切分数据 训练数据80% 验证数据20%
train_data, test_data, train_target, test_target = train_test_split(
    train, target, test_size=0.2, random_state=0)

model = RandomForestClassifier()
parameters = {
     'n_estimators': [10, 20, 30, 50], 'max_depth': [1, 2, 3]}

clf = RandomizedSearchCV(model, parameters, cv=3, verbose=2)
clf.fit(train_data, train_target)

score_test = roc_auc_score(test_target, clf.predict(test_data))

print("RandomForestClassifier RandomizedSearchCV test AUC:   ", score_test)
print("最优参数:")
print(clf.best_params_)
sorted(clf.cv_results_.keys())

LightGBM 网格调参

LightGBM 调参次序:
第一步:学习率和迭代次数
第二步:确定max_depth和num_leaves
第三步:确定min_data_in_leaf和max_bin in
第四步:确定feature_fraction、bagging_fraction、bagging_freq
第五步:确定lambda_l1和lambda_l2
第六步:确定 min_split_gain
第七步:降低学习率,增加迭代次数,验证模型

#第一步:学习率和迭代次数
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
# 切分数据 训练数据80% 验证数据20%
# 为了加快速度CV选的3,其实一般用5
# 因为每训练一次耗时很多,所以每个参数的选项不多,间隔比较大,正式的时候应该是比较多,间隔比较细的
# 本次只是演示,所以如果最好参数位于区间的边缘也就直接用了,其实如果最好参数在边缘,需要重新再搜索。
model = LGBMClassifier(boosting_type='gbdt',
                       objective='binary',
                       metrics='auc',
                       learning_rate=0.1,
                       max_depth=5,
                       bagging_fraction=0.8,
                       feature_fraction=0.8)
parameters = {
     'n_estimators': [100, 150, 175, 200, 225, 250]}

clf = GridSearchCV(model, parameters, cv=3, verbose=2)
clf.fit(train_data, train_target)
score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])

print("LightGBM GridSearchCV AUC Score:   ", score_test)
print("最优参数:")
print(clf.best_params_)
#第二步:确定max_depth和num_leaves
#n_estimators=200
model = LGBMClassifier(boosting_type='gbdt',
                       objective='binary',
                       metrics='auc',
                       n_estimators=200,
                       learning_rate=0.1,
                       bagging_fraction=0.8,
                       feature_fraction=0.8)
parameters = {
     'max_depth': range(4, 8, 1), 'num_leaves': range(10, 150, 10)}
#parameters={'max_depth': range(4,8,2), 'num_leaves':range(10, 100, 20)}
clf = GridSearchCV(model, parameters, cv=3, verbose=2)
clf.fit(train_data, train_target)

score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])

print("LightGBM GridSearchCV AUC Score:   ", score_test)
print("最优参数:")
print(clf.best_params_)

根据结果取 max_depth=6, num_leaves=40 也都这么做。

#第三步:确定min_data_in_leaf和max_bin in
#已经确认内容:
#n_estimators=200
#{'max_depth': 6, 'num_leaves': 40}
model = LGBMClassifier(boosting_type='gbdt',
                       objective='binary',
                       metrics='auc',
                       n_estimators=200,
                       max_depth=6,
                       num_leaves=40,
                       learning_rate=0.1,
                       bagging_fraction=0.8,
                       feature_fraction=0.8)

#parameters={'max_bin': range(100,500,50),'min_data_in_leaf':range(100,150,10)}
parameters = {
     
    'max_bin': range(100, 500, 100),
    'min_data_in_leaf': range(100, 150, 50)
}
#调高 verbose可以看到更多信息
clf = GridSearchCV(model, parameters, cv=3, verbose=3)
clf.fit(train_data, train_target)

score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])

print("LightGBM GridSearchCV AUC Score:   ", score_test)
print("最优参数:")
print(clf.best_params_)
#第四步:确定feature_fraction、bagging_fraction、bagging_freq
#已经确认内容:
#n_estimators=200
#{'max_depth': 6, 'num_leaves': 40}
#{'max_bin': 400, 'min_data_in_leaf': 120}
model = LGBMClassifier(boosting_type='gbdt',
                       objective='binary',
                       metrics='auc',
                       n_estimators=200,
                       max_depth=6,
                       num_leaves=40,
                       max_bin=400,
                       min_data_in_leaf=120,
                       learning_rate=0.1,
                       bagging_fraction=0.8,
                       feature_fraction=0.8)

parameters = {
     
    'feature_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
    'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
    'bagging_freq': range(0, 10, 2)
}

#调高 verbose可以看到更多信息
clf = GridSearchCV(model, parameters, cv=3, verbose=3)
clf.fit(train_data, train_target)

score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])

print("LightGBM GridSearchCV AUC Score:   ", score_test)
print("最优参数:")
print(clf.best_params_)
#第五步:确定lambda_l1和lambda_l2
#已经确认内容:
#n_estimators=200
#{'max_depth': 6, 'num_leaves': 40}
#{'max_bin': 400, 'min_data_in_leaf': 120}
#{'bagging_fraction': 0.9, 'bagging_freq': 4, 'feature_fraction': 0.6}
model = LGBMClassifier(boosting_type='gbdt',
                       objective='binary',
                       metrics='auc',
                       n_estimators=200,
                       max_depth=6,
                       num_leaves=40,
                       max_bin=400,
                       min_data_in_leaf=120,
                       learning_rate=0.1,
                       bagging_freq=4,
                       bagging_fraction=0.9,
                       feature_fraction=0.6)

parameters = {
     
    'lambda_l1': [1e-5, 1e-3, 1e-1, 0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
    'lambda_l2': [1e-5, 1e-3, 1e-1, 0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
}

#调高 verbose可以看到更多信息
clf = GridSearchCV(model, parameters, cv=3, verbose=3)
clf.fit(train_data, train_target)

score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])

print("LightGBM GridSearchCV AUC Score:   ", score_test)
print("最优参数:")
print(clf.best_params_)
#第六步:确定 min_split_gain
#已经确认内容:
#n_estimators=200
#{'max_depth': 6, 'num_leaves': 40}
#{'max_bin': 400, 'min_data_in_leaf': 120}
#{'bagging_fraction': 0.9, 'bagging_freq': 4, 'feature_fraction': 0.6}
#{'lambda_l1': 1e-05, 'lambda_l2': 1e-05}
model = LGBMClassifier(boosting_type='gbdt',
                       objective='binary',
                       metrics='auc',
                       n_estimators=200,
                       max_depth=6,
                       num_leaves=40,
                       max_bin=400,
                       min_data_in_leaf=120,
                       learning_rate=0.1,
                       lambda_l1=1e-05,
                       lambda_l2=1e-05,
                       bagging_freq=4,
                       bagging_fraction=0.9,
                       feature_fraction=0.6)

parameters = {
     
    'min_split_gain': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
}

#调高 verbose可以看到更多信息
clf = GridSearchCV(model, parameters, cv=3, verbose=3)
clf.fit(train_data, train_target)

score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])

print("LightGBM GridSearchCV AUC Score:   ", score_test)
print("最优参数:")
print(clf.best_params_)
# 第七步:降低学习率,增加迭代次数,验证模型
#已经确认内容:
#n_estimators=200
#{'max_depth': 6, 'num_leaves': 40}
#{'max_bin': 400, 'min_data_in_leaf': 120}
#{'bagging_fraction': 0.9, 'bagging_freq': 4, 'feature_fraction': 0.6}
#{'lambda_l1': 1e-05, 'lambda_l2': 1e-05}
#{'min_split_gain': 0.0}
model = LGBMClassifier(boosting_type='gbdt',
                       objective='binary',
                       metrics='auc',
                       n_estimators=200,
                       max_depth=6,
                       num_leaves=40,
                       max_bin=400,
                       min_data_in_leaf=120,
                       learning_rate=0.1,
                       lambda_l1=1e-05,
                       lambda_l2=1e-05,
                       min_split_gain=0.0,
                       bagging_freq=4,
                       bagging_fraction=0.9,
                       feature_fraction=0.6)

model.fit(train_data, train_target)
score_test = roc_auc_score(test_target, model.predict_proba(test_data)[:, 1])

print("LightGBM learning rate 0.1 AUC Score:   ", score_test)

model = LGBMClassifier(boosting_type='gbdt',
                       objective='binary',
                       metrics='auc',
                       n_estimators=200,
                       max_depth=6,
                       num_leaves=40,
                       max_bin=400,
                       min_data_in_leaf=120,
                       learning_rate=0.05,
                       lambda_l1=1e-05,
                       lambda_l2=1e-05,
                       min_split_gain=0.0,
                       bagging_freq=4,
                       bagging_fraction=0.9,
                       feature_fraction=0.6)

model.fit(train_data, train_target)
score_test = roc_auc_score(test_target, model.predict_proba(test_data)[:, 1])

print("LightGBM learning rate 0.05 AUC Score:   ", score_test)

model = LGBMClassifier(boosting_type='gbdt',
                       objective='binary',
                       metrics='auc',
                       n_estimators=200,
                       max_depth=6,
                       num_leaves=40,
                       max_bin=400,
                       min_data_in_leaf=120,
                       learning_rate=0.01,
                       lambda_l1=1e-05,
                       lambda_l2=1e-05,
                       min_split_gain=0.0,
                       bagging_freq=4,
                       bagging_fraction=0.9,
                       feature_fraction=0.6)

model.fit(train_data, train_target)
score_test = roc_auc_score(test_target, model.predict_proba(test_data)[:, 1])

print("LightGBM learning rate 0.01 AUC Score:   ", score_test)

model = LGBMClassifier(boosting_type='gbdt',
                       objective='binary',
                       metrics='auc',
                       n_estimators=200,
                       max_depth=6,
                       num_leaves=40,
                       max_bin=400,
                       min_data_in_leaf=120,
                       learning_rate=0.005,
                       lambda_l1=1e-05,
                       lambda_l2=1e-05,
                       min_split_gain=0.0,
                       bagging_freq=4,
                       bagging_fraction=0.9,
                       feature_fraction=0.6)

model.fit(train_data, train_target)
score_test = roc_auc_score(test_target, model.predict_proba(test_data)[:, 1])

print("LightGBM learning rate 0.005 AUC Score:   ", score_test)
model = LGBMClassifier()

model.fit(train_data, train_target)
score_test = roc_auc_score(test_target, model.predict_proba(test_data)[:, 1])

print("默认参数 AUC Score:   ", score_test)

最优参数 model = LGBMClassifier(boosting_type=‘gbdt’,objective=‘binary’,metrics=‘auc’,n_estimators=200,max_depth=6,num_leaves=40,
max_bin=400,min_data_in_leaf=120,
learning_rate=0.05,
lambda_l1=1e-05,lambda_l2=1e-05,min_split_gain=0.0,
bagging_freq=4, bagging_fraction = 0.9,feature_fraction = 0.6)

#f3特征
train = train_f3.copy()
train.head()
print('默认参数')
classifier_df_score(train, 'LGB', 5)
params = {
     
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'eval_metric': 'auc',
    'n_estimators': 200,
    'max_depth': 5,
    'num_leaves': 40,
    'max_bin': 400,
    'min_data_in_leaf': 120,
    'learning_rate': 0.1,
    'lambda_l1': 1e-05,
    'lambda_l2': 1e-05,
    'min_split_gain': 0.0,
    'bagging_freq': 4,
    'bagging_fraction': 0.9,
    'feature_fraction': 0.6,
    'seed': 1024,
    'n_thread': 12
}
print('调参后')
classifier_df_score(train, 'LGB', 5, params)

对比发现调参后的结果比默认参数有所提高,不过不是高很多,比不上特征对结果的影响。而且因为调参只能再测试集上作,有的时候调参造成过拟,调参后线上成绩可能反而会下降。所以调参一般都是在比赛后期再做,前期主要是特征和模型的选择。

你可能感兴趣的:(读书向,机器学习,python,数据挖掘)