Full code link on GitHub
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier
# Split the dataset in two equal parts
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.5, random_state=0)
# base model to tune
model = RandomForestClassifier(n_jobs=-1)
# Set the parameters by cross-validation
tuned_parameters = {
    'n_estimators': [50, 100, 200]
    # ,'criterion': ['gini', 'entropy']
    # ,'max_depth': [2, 5]
    # ,'max_features': ['log2', 'sqrt', None]
    # ,'bootstrap': [True, False]
    # ,'warm_start': [True, False]
}
scores = ['precision']
for score in scores:
    print("# Tuning hyper-parameters for %s" % score)
    print()
    clf = GridSearchCV(model, tuned_parameters, cv=5,
                       scoring='%s_macro' % score)
    clf.fit(X_train, y_train)
    print("Best parameters set found on development set:")
    print()
    print(clf.best_params_)
    print()
    print("Grid scores on development set:")
    print()
    means = clf.cv_results_['mean_test_score']
    stds = clf.cv_results_['std_test_score']
    for mean, std, params in zip(means, stds, clf.cv_results_['params']):
        print("%0.3f (+/-%0.03f) for %r"
              % (mean, std * 2, params))
    print()
    print("Detailed classification report:")
    print()
    print("The model is trained on the full development set.")
    print("The scores are computed on the full evaluation set.")
    print()
    y_true, y_pred = y_test, clf.predict(X_test)
    print(classification_report(y_true, y_pred))
    print()
from sklearn.datasets import load_iris
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data,
                                                    iris.target,
                                                    random_state=0)
print("Size of training set: {}, size of testing set: {}".format(
    X_train.shape[0], X_test.shape[0]))
#### grid search start
best_score = 0
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        svm = SVC(gamma=gamma, C=C)  # train once for each parameter combination
        svm.fit(X_train, y_train)
        score = svm.score(X_test, y_test)
        if score > best_score:  # keep the best-performing parameters
            best_score = score
            best_parameters = {'gamma': gamma, 'C': C}
#### grid search end
print("Best score: {:.2f}".format(best_score))
print("Best parameters: {}".format(best_parameters))
The loop above selects parameters on the test set itself, which leaks information into the final score. A cleaner setup holds out a separate validation set for tuning and keeps the test set for the final evaluation only:
X_trainval, X_test, y_trainval, y_test = train_test_split(iris.data, iris.target, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X_trainval, y_trainval, random_state=1)
print("Size of training set: {}, size of validation set: {}, size of testing set: {}".format(
    X_train.shape[0], X_val.shape[0], X_test.shape[0]))
best_score = 0.0
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        svm = SVC(gamma=gamma, C=C)
        svm.fit(X_train, y_train)
        score = svm.score(X_val, y_val)
        if score > best_score:
            best_score = score
            best_parameters = {'gamma': gamma, 'C': C}
svm = SVC(**best_parameters)  # build a new model with the best parameters
svm.fit(X_trainval, y_trainval)  # retrain on training + validation data; more data usually helps
test_score = svm.score(X_test, y_test)  # final model evaluation
print("Best score on validation set: {:.2f}".format(best_score))
print("Best parameters: {}".format(best_parameters))
print("Best score on test set: {:.2f}".format(test_score))
A single validation split can be noisy; replacing it with cross-validation averages the score over several folds:
from sklearn.model_selection import cross_val_score

best_score = 0.0
for gamma in [0.001, 0.01, 0.1, 1, 10, 100]:
    for C in [0.001, 0.01, 0.1, 1, 10, 100]:
        svm = SVC(gamma=gamma, C=C)
        scores = cross_val_score(svm, X_trainval, y_trainval, cv=5)  # 5-fold cross-validation
        score = scores.mean()  # average score across the folds
        if score > best_score:
            best_score = score
            best_parameters = {'gamma': gamma, 'C': C}
svm = SVC(**best_parameters)
svm.fit(X_trainval, y_trainval)
test_score = svm.score(X_test, y_test)
print("Best score on validation set: {:.2f}".format(best_score))
print("Best parameters: {}".format(best_parameters))
print("Score on testing set: {:.2f}".format(test_score))
Cross-validation is often combined with grid search as a way to evaluate parameters; this combination is called grid search with cross-validation. scikit-learn provides the GridSearchCV class for exactly this purpose. It implements fit, predict, score and so on, so it can be used like any other estimator. Calling its fit method (1) searches for the best parameters and (2) refits an estimator with those best parameters.
from sklearn.model_selection import GridSearchCV

# List the parameters to tune together with their candidate values
param_grid = {
    'gamma': [0.001, 0.01, 0.1, 1, 10, 100],
    'C': [0.001, 0.01, 0.1, 1, 10, 100]}
print("Parameters: {}".format(param_grid))
grid_search = GridSearchCV(SVC(), param_grid, cv=5)  # instantiate a GridSearchCV object
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=10)
grid_search.fit(X_train, y_train)  # search for the best parameters, then refit an SVC with them
print("Test set score: {:.2f}".format(grid_search.score(X_test, y_test)))
print("Best parameters: {}".format(grid_search.best_params_))
print("Best score on train set: {:.2f}".format(grid_search.best_score_))
import lightgbm as lgb
from sklearn.metrics import mean_squared_error

clf = lgb.LGBMRegressor(num_leaves=31)
parameters = {
    'learning_rate': [0.01, 0.1, 1], 'n_estimators': [20, 40]}
clf = GridSearchCV(clf, parameters, cv=5)
clf.fit(train_data, train_target)
print('Best parameters found by grid search are:', clf.best_params_)
score_test = mean_squared_error(test_target, clf.predict(test_data))
print("LGBMRegressor GridSearchCV test MSE: ", score_test)
# Ensemble (stacking) prediction with lr_reg and lgb_reg
clf_list = [lr_reg, lgb_reg]

## stacking overfits easily
pred = stacking_pred(x_train, y_train, x_valid, kf, clf_list, label_split=None, clf_fin="lgb", if_concat_origin=True)
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split  # data splitting

# Split the data: 80% for training, 20% for validation
train_data, test_data, train_target, test_target = train_test_split(
    train, target, test_size=0.2, random_state=0)
model = RandomForestClassifier()
parameters = {
    'n_estimators': [20, 50, 100], 'max_depth': [1, 2, 3]}
clf = GridSearchCV(model, parameters, cv=3, verbose=2)
clf.fit(train_data, train_target)
# Use predicted probabilities of the positive class for AUC
score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])
print("RandomForestClassifier GridSearchCV test AUC: ", score_test)
print("Best parameters:")
print(clf.best_params_)
sorted(clf.cv_results_.keys())
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split  # data splitting

# Split the data: 80% for training, 20% for validation
train_data, test_data, train_target, test_target = train_test_split(
    train, target, test_size=0.2, random_state=0)
model = RandomForestClassifier()
parameters = {
    'n_estimators': [10, 20, 30, 50], 'max_depth': [1, 2, 3]}
clf = RandomizedSearchCV(model, parameters, cv=3, verbose=2)
clf.fit(train_data, train_target)
# Use predicted probabilities of the positive class for AUC
score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])
print("RandomForestClassifier RandomizedSearchCV test AUC: ", score_test)
print("Best parameters:")
print(clf.best_params_)
sorted(clf.cv_results_.keys())
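The real strength of RandomizedSearchCV is sampling candidates from distributions instead of fixed lists; a small sketch of the same search using scipy.stats distributions, where n_iter sets the sampling budget:
from scipy.stats import randint

# Sample integer values from ranges instead of enumerating a fixed grid
param_dist = {'n_estimators': randint(10, 200), 'max_depth': randint(1, 8)}
clf = RandomizedSearchCV(RandomForestClassifier(), param_dist,
                         n_iter=20, cv=3, random_state=0, verbose=2)
clf.fit(train_data, train_target)
print(clf.best_params_)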
LightGBM tuning order (a sketch of how the stages chain together follows this list):
Step 1: learning rate and number of iterations
Step 2: determine max_depth and num_leaves
Step 3: determine min_data_in_leaf and max_bin
Step 4: determine feature_fraction, bagging_fraction and bagging_freq
Step 5: determine lambda_l1 and lambda_l2
Step 6: determine min_split_gain
Step 7: lower the learning rate, increase the number of iterations, and validate the model
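A minimal sketch of the staged search as a loop, assuming the train_data / train_target split from earlier; each stage's best parameters are fixed before the next stage runs (the grids here are illustrative, not the full ones used below):
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV

# One grid per tuning stage
stages = [
    {'n_estimators': [100, 150, 200, 250]},
    {'max_depth': range(4, 8), 'num_leaves': range(10, 150, 10)},
    {'min_data_in_leaf': range(100, 150, 10), 'max_bin': range(100, 500, 50)},
    # ... remaining stages follow the same pattern
]
fixed = {'boosting_type': 'gbdt', 'objective': 'binary', 'learning_rate': 0.1}
for grid in stages:
    clf = GridSearchCV(LGBMClassifier(**fixed), grid, cv=3, scoring='roc_auc')
    clf.fit(train_data, train_target)
    fixed.update(clf.best_params_)  # lock in this stage's winners
print("Accumulated best parameters:", fixed)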
# Step 1: learning rate and number of iterations
from lightgbm import LGBMClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

# Split the data: 80% for training, 20% for validation
# cv=3 to speed things up; 5 folds are more common in practice
# Each fit takes a long time, so the candidate values are few and widely spaced;
# in a real run they should be denser and more finely spaced.
# This is only a demo, so if the best value lands on the edge of a range we just use it;
# normally an edge value means the range should be extended and searched again.
model = LGBMClassifier(boosting_type='gbdt',
                       objective='binary',
                       metrics='auc',
                       learning_rate=0.1,
                       max_depth=5,
                       bagging_fraction=0.8,
                       feature_fraction=0.8)
parameters = {
    'n_estimators': [100, 150, 175, 200, 225, 250]}
clf = GridSearchCV(model, parameters, cv=3, verbose=2)
clf.fit(train_data, train_target)
score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])
print("LightGBM GridSearchCV AUC Score: ", score_test)
print("Best parameters:")
print(clf.best_params_)
# Step 2: determine max_depth and num_leaves
# Fixed so far: n_estimators=200
model = LGBMClassifier(boosting_type='gbdt',
                       objective='binary',
                       metrics='auc',
                       n_estimators=200,
                       learning_rate=0.1,
                       bagging_fraction=0.8,
                       feature_fraction=0.8)
parameters = {
    'max_depth': range(4, 8, 1), 'num_leaves': range(10, 150, 10)}
# parameters = {'max_depth': range(4, 8, 2), 'num_leaves': range(10, 100, 20)}
clf = GridSearchCV(model, parameters, cv=3, verbose=2)
clf.fit(train_data, train_target)
score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])
print("LightGBM GridSearchCV AUC Score: ", score_test)
print("Best parameters:")
print(clf.best_params_)
Based on the results we fix max_depth=6 and num_leaves=40, and the remaining steps proceed the same way. (As a rule of thumb, num_leaves should stay below 2**max_depth, which 40 < 64 satisfies here.)
# Step 3: determine min_data_in_leaf and max_bin
# Fixed so far:
# n_estimators=200
# {'max_depth': 6, 'num_leaves': 40}
model = LGBMClassifier(boosting_type='gbdt',
                       objective='binary',
                       metrics='auc',
                       n_estimators=200,
                       max_depth=6,
                       num_leaves=40,
                       learning_rate=0.1,
                       bagging_fraction=0.8,
                       feature_fraction=0.8)
# parameters = {'max_bin': range(100, 500, 50), 'min_data_in_leaf': range(100, 150, 10)}
parameters = {
    'max_bin': range(100, 500, 100),
    'min_data_in_leaf': range(100, 150, 50)
}
# Raise verbose for more detailed output
clf = GridSearchCV(model, parameters, cv=3, verbose=3)
clf.fit(train_data, train_target)
score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])
print("LightGBM GridSearchCV AUC Score: ", score_test)
print("Best parameters:")
print(clf.best_params_)
# Step 4: determine feature_fraction, bagging_fraction and bagging_freq
# Fixed so far:
# n_estimators=200
# {'max_depth': 6, 'num_leaves': 40}
# {'max_bin': 400, 'min_data_in_leaf': 120}
model = LGBMClassifier(boosting_type='gbdt',
                       objective='binary',
                       metrics='auc',
                       n_estimators=200,
                       max_depth=6,
                       num_leaves=40,
                       max_bin=400,
                       min_data_in_leaf=120,
                       learning_rate=0.1,
                       bagging_fraction=0.8,
                       feature_fraction=0.8)
parameters = {
    'feature_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
    'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
    'bagging_freq': range(0, 10, 2)
}
# Raise verbose for more detailed output
clf = GridSearchCV(model, parameters, cv=3, verbose=3)
clf.fit(train_data, train_target)
score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])
print("LightGBM GridSearchCV AUC Score: ", score_test)
print("Best parameters:")
print(clf.best_params_)
# Step 5: determine lambda_l1 and lambda_l2
# Fixed so far:
# n_estimators=200
# {'max_depth': 6, 'num_leaves': 40}
# {'max_bin': 400, 'min_data_in_leaf': 120}
# {'bagging_fraction': 0.9, 'bagging_freq': 4, 'feature_fraction': 0.6}
model = LGBMClassifier(boosting_type='gbdt',
                       objective='binary',
                       metrics='auc',
                       n_estimators=200,
                       max_depth=6,
                       num_leaves=40,
                       max_bin=400,
                       min_data_in_leaf=120,
                       learning_rate=0.1,
                       bagging_freq=4,
                       bagging_fraction=0.9,
                       feature_fraction=0.6)
parameters = {
    'lambda_l1': [0.0, 1e-5, 1e-3, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
    'lambda_l2': [0.0, 1e-5, 1e-3, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
}
# Raise verbose for more detailed output
clf = GridSearchCV(model, parameters, cv=3, verbose=3)
clf.fit(train_data, train_target)
score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])
print("LightGBM GridSearchCV AUC Score: ", score_test)
print("Best parameters:")
print(clf.best_params_)
# Step 6: determine min_split_gain
# Fixed so far:
# n_estimators=200
# {'max_depth': 6, 'num_leaves': 40}
# {'max_bin': 400, 'min_data_in_leaf': 120}
# {'bagging_fraction': 0.9, 'bagging_freq': 4, 'feature_fraction': 0.6}
# {'lambda_l1': 1e-05, 'lambda_l2': 1e-05}
model = LGBMClassifier(boosting_type='gbdt',
                       objective='binary',
                       metrics='auc',
                       n_estimators=200,
                       max_depth=6,
                       num_leaves=40,
                       max_bin=400,
                       min_data_in_leaf=120,
                       learning_rate=0.1,
                       lambda_l1=1e-05,
                       lambda_l2=1e-05,
                       bagging_freq=4,
                       bagging_fraction=0.9,
                       feature_fraction=0.6)
parameters = {
    'min_split_gain': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
}
# Raise verbose for more detailed output
clf = GridSearchCV(model, parameters, cv=3, verbose=3)
clf.fit(train_data, train_target)
score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])
print("LightGBM GridSearchCV AUC Score: ", score_test)
print("Best parameters:")
print(clf.best_params_)
# Step 7: lower the learning rate, increase the number of iterations, and validate the model
# Fixed so far:
# n_estimators=200
# {'max_depth': 6, 'num_leaves': 40}
# {'max_bin': 400, 'min_data_in_leaf': 120}
# {'bagging_fraction': 0.9, 'bagging_freq': 4, 'feature_fraction': 0.6}
# {'lambda_l1': 1e-05, 'lambda_l2': 1e-05}
# {'min_split_gain': 0.0}
# n_estimators stays at 200 in all runs here; with lower learning rates it usually
# pays to raise it as well (see the early-stopping sketch below)
for lr in [0.1, 0.05, 0.01, 0.005]:
    model = LGBMClassifier(boosting_type='gbdt',
                           objective='binary',
                           metrics='auc',
                           n_estimators=200,
                           max_depth=6,
                           num_leaves=40,
                           max_bin=400,
                           min_data_in_leaf=120,
                           learning_rate=lr,
                           lambda_l1=1e-05,
                           lambda_l2=1e-05,
                           min_split_gain=0.0,
                           bagging_freq=4,
                           bagging_fraction=0.9,
                           feature_fraction=0.6)
    model.fit(train_data, train_target)
    score_test = roc_auc_score(test_target, model.predict_proba(test_data)[:, 1])
    print("LightGBM learning rate {} AUC Score: ".format(lr), score_test)
# Baseline: default parameters
model = LGBMClassifier()
model.fit(train_data, train_target)
score_test = roc_auc_score(test_target, model.predict_proba(test_data)[:, 1])
print("Default parameters AUC Score: ", score_test)
The final best parameters:
model = LGBMClassifier(boosting_type='gbdt',
                       objective='binary',
                       metrics='auc',
                       n_estimators=200,
                       max_depth=6,
                       num_leaves=40,
                       max_bin=400,
                       min_data_in_leaf=120,
                       learning_rate=0.05,
                       lambda_l1=1e-05,
                       lambda_l2=1e-05,
                       min_split_gain=0.0,
                       bagging_freq=4,
                       bagging_fraction=0.9,
                       feature_fraction=0.6)
# f3 features
train = train_f3.copy()
train.head()
print('Default parameters')
classifier_df_score(train, 'LGB', 5)
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'eval_metric': 'auc',
    'n_estimators': 200,
    'max_depth': 5,
    'num_leaves': 40,
    'max_bin': 400,
    'min_data_in_leaf': 120,
    'learning_rate': 0.1,
    'lambda_l1': 1e-05,
    'lambda_l2': 1e-05,
    'min_split_gain': 0.0,
    'bagging_freq': 4,
    'bagging_fraction': 0.9,
    'feature_fraction': 0.6,
    'seed': 1024,
    'n_thread': 12
}
print('After tuning')
classifier_df_score(train, 'LGB', 5, params)
Comparing the two runs, the tuned parameters do improve on the defaults, but not by much; the gain is smaller than what good features contribute. Moreover, since tuning can only be evaluated on a held-out set, it sometimes overfits to that set, and the online (leaderboard) score may actually drop after tuning. Tuning is therefore usually left to the late stage of a competition; early on, the focus should be on feature engineering and model selection.