##一. scikit-learn GBDT类库概述
在scikit-learn中,GradientBoostingClassifier为GBDT的分类类, 而GradientBoostingRegressor为GBDT的回归类。两者的参数类型完全相同,当然有些参数比如损失函数loss的可选择项并不相同。这些参数中,类似于Adaboost,我们把重要参数分为两类,第一类是Boosting框架的重要参数,第二类是弱学习器即CART回归树的重要参数。
##二. GBDT类库boosting框架参数
**6) alpha:**这个参数只有GradientBoostingRegressor有,当我们使用Huber损失"huber"和分位数损失"quantile"时,需要指定分位数的值。默认是0.9,如果噪音点较多,可以适当降低这个分位数的值。
##三. GBDT类库弱学习器参数
# Load the three traffic CSVs (Adware / GM / benign "begin") and build
# shuffled, de-duplicated train/test NumPy arrays.
# NOTE(review): statement order matters here — rows are shuffled *before*
# drop_duplicates(keep='first'), so which duplicate survives is random,
# and no random_state is set, so the split is not reproducible.
import os
import pandas as pd
import numpy as np
# Silence TensorFlow C++ logging (only relevant if TF is imported later).
os.environ["TF_CPP_MIN_LOG_LEVEL"] = '3'
# Relative chdir: the script must be launched from the repository root
# for these CSV paths to resolve.
os.chdir("./DL/transfer_learning/classification_2/data_9_feature")
Adware = pd.read_csv('adware.csv')
GM =pd.read_csv('GM.csv')
Begin = pd.read_csv('begin.csv')
# frac=1.0 returns all rows in random order (i.e. a full shuffle).
Adware = Adware.sample(frac=1.0)
GM = GM.sample(frac=1.0)
Begin = Begin.sample(frac=1.0)
Adware=Adware.drop_duplicates(keep='first')
GM=GM.drop_duplicates(keep='first')
Begin=Begin.drop_duplicates(keep='first')
print(len(Adware),len(GM),len(Begin))
# Binary train/test sets are built from Adware + Begin only; GM is loaded
# and counted above but never used below — presumably kept for a separate
# experiment (TODO confirm).
train = pd.merge(Adware[:62000],Begin[:82000], how='outer')
test = pd.merge(Adware[62000:],Begin[82000:], how='outer')
train=train.sample(frac=1.0)
test=test.sample(frac=1.0)
# Plain NumPy arrays: the modelling code below slices columns 0-8 as the
# 9 features and column 9 as the label.
train = train.values
test = test.values
2. 分类
# --- Binary classification with GBDT on the prepared train/test arrays ---
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import metrics

# Columns 0..N_FEATURES-1 are features; column N_FEATURES is the label.
N_FEATURES = 9

gbc = GradientBoostingClassifier(
    # boosting (framework) parameters
    init=None,
    n_estimators=100,
    learning_rate=0.1,
    subsample=0.8,          # <1.0 enables stochastic gradient boosting
    loss='deviance',        # NOTE(review): renamed to 'log_loss' in sklearn >=1.1
    # split parameters
    max_features='sqrt',
    criterion='friedman_mse',
    # split-stopping parameters
    # (min_impurity_split was deprecated in sklearn 0.19 and removed in
    #  0.25; even passing None crashes on modern versions, so it is no
    #  longer passed. min_impurity_decrease=0.0 is the default anyway.)
    min_samples_split=1200,
    min_impurity_decrease=0.0,
    max_depth=7,
    max_leaf_nodes=None,
    # pruning parameters
    min_samples_leaf=60,
    warm_start=False,
    random_state=10
)
# Fit on the feature columns; ravel() flattens the label slice to 1-D.
gbc.fit(train[:, :N_FEATURES], train[:, N_FEATURES].ravel())
y_pred = gbc.predict(test[:, :N_FEATURES])
# Probability of the positive class (column 1) for the AUC computation.
y_predprob = gbc.predict_proba(test[:, :N_FEATURES])[:, 1]
print("precision_recall_f1-score_accuracy:\n", metrics.classification_report(test[:, N_FEATURES].ravel(), y_pred))
print("confusion_matrix:\n", metrics.confusion_matrix(test[:, N_FEATURES].ravel(), y_pred))
print("Accuracy : %.4g" % metrics.accuracy_score(test[:, N_FEATURES].ravel(), y_pred))
# Fixed label: this AUC is computed on the *test* set, not the train set.
print("AUC Score (Test): %f" % metrics.roc_auc_score(test[:, N_FEATURES].ravel(), y_predprob))
3. 分类调参(网格搜索)
# --- Grid search over n_estimators for the GBDT classifier ---
from sklearn.ensemble import GradientBoostingClassifier
# Bug fix: GridSearchCV was used below but never imported (NameError).
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

gbc = GradientBoostingClassifier(
    # boosting (framework) parameters
    init=None,
    # n_estimators is supplied by the grid search below
    learning_rate=0.1,
    subsample=0.8,
    loss='deviance',        # NOTE(review): renamed to 'log_loss' in sklearn >=1.1
    # split parameters
    max_features='sqrt',
    criterion='friedman_mse',
    # split-stopping parameters
    # (min_impurity_split was deprecated in sklearn 0.19 and removed in
    #  0.25; it is no longer passed.)
    min_samples_split=1200,
    min_impurity_decrease=0.0,
    max_depth=7,
    max_leaf_nodes=None,
    # pruning parameters
    min_samples_leaf=60,
    warm_start=False,
    random_state=10
)
# The deprecated iid= argument was dropped (removed in sklearn 0.24).
gsearch1 = GridSearchCV(
    estimator=gbc,
    param_grid={'n_estimators': [10, 20, 30]},
    scoring='roc_auc',
    cv=5,
)
gsearch1.fit(train[:, :9], train[:, 9].ravel())
# grid_scores_ was removed in sklearn 0.20; cv_results_ is the replacement.
print(gsearch1.cv_results_['mean_test_score'], gsearch1.best_params_, gsearch1.best_score_)
4. 回归
# --- Regression with GBDT on the same binary-labelled data ---
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn import metrics

clf = GradientBoostingRegressor(
    # boosting (framework) parameters
    init=None,
    # n_estimators=100,
    learning_rate=0.1,
    subsample=0.8,
    loss='ls',              # NOTE(review): renamed to 'squared_error' in sklearn >=1.0
    # split parameters
    max_features='sqrt',
    #criterion='friedman_mse',
    # split-stopping parameters
    # (min_impurity_split was deprecated in sklearn 0.19 and removed in
    #  0.25; it is no longer passed. min_impurity_decrease=0.0 is the default.)
    min_samples_split=1200,
    min_impurity_decrease=0.0,
    max_depth=7,
    max_leaf_nodes=None,
    # pruning parameters
    min_samples_leaf=60,
    warm_start=False,
    random_state=10
)
print(clf)
model = clf.fit(train[:, :9], train[:, 9].ravel())
output = model.predict(test[:, :9])
# Peek at the first predictions next to the true labels.
print(output[:10])
print(test[:, 9][:10])
print(mean_squared_error(test[:, 9].ravel(), output))
# The continuous regression output is used as a ranking score for AUC.
# NOTE(review): this is only meaningful because the labels are binary 0/1.
print(metrics.roc_auc_score(test[:, 9].ravel(), output))
5. 回归调参
# --- Grid search over n_estimators for the GBDT regressor ---
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
# Bug fix: GridSearchCV was used below but never imported (NameError).
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

clf = GradientBoostingRegressor(
    # boosting (framework) parameters
    init=None,
    # n_estimators is supplied by the grid search below
    learning_rate=0.1,
    subsample=0.8,
    loss='ls',              # NOTE(review): renamed to 'squared_error' in sklearn >=1.0
    # split parameters
    max_features='sqrt',
    #criterion='friedman_mse',
    # split-stopping parameters
    # (min_impurity_split was deprecated in sklearn 0.19 and removed in
    #  0.25; it is no longer passed.)
    min_samples_split=1200,
    min_impurity_decrease=0.0,
    max_depth=7,
    max_leaf_nodes=None,
    # pruning parameters
    min_samples_leaf=60,
    warm_start=False,
    random_state=10
)
# The deprecated iid= argument was dropped (removed in sklearn 0.24).
# scoring='roc_auc' works here only because the labels are binary 0/1.
gsearch1 = GridSearchCV(
    estimator=clf,
    param_grid={'n_estimators': [100, 200, 300]},
    scoring='roc_auc',
    cv=5,
)
gsearch1.fit(train[:, :9], train[:, 9].ravel())
# grid_scores_ was removed in sklearn 0.20; cv_results_ is the replacement.
# Bug fix: the original last line evaluated a tuple without printing it
# (a no-op in a script), so the results are now printed explicitly.
print(gsearch1.cv_results_['mean_test_score'], gsearch1.best_params_, gsearch1.best_score_)