本文仅做摘抄记录,展示一些lgbm用作分类与回归的代码,以供学习记忆与备用。
lgbm的github:
https://github.com/Microsoft/LightGBM/blob/master/docs/Parameters.rst
参数解释:
https://blog.csdn.net/ssswill/article/details/85235074
示例代码:https://github.com/Microsoft/LightGBM/tree/master/examples/python-guide
官方文档:
https://lightgbm.readthedocs.io/en/latest/Python-Intro.html
代码:https://github.com/Microsoft/LightGBM/blob/master/examples/python-guide/simple_example.py
# coding: utf-8
# pylint: disable = invalid-name, C0111
"""Minimal LightGBM regression example: train with early stopping on a
validation set, save the model, then report RMSE on the test set."""
import lightgbm as lgb
import pandas as pd
from sklearn.metrics import mean_squared_error

print('Loading data...')
# load or create your dataset (tab-separated, label in column 0)
df_train = pd.read_csv('../regression/regression.train', header=None, sep='\t')
df_test = pd.read_csv('../regression/regression.test', header=None, sep='\t')
y_train = df_train[0]
y_test = df_test[0]
X_train = df_train.drop(0, axis=1)
X_test = df_test.drop(0, axis=1)

# create dataset for lightgbm; `reference` lets the eval set reuse the
# training set's feature bin mappings
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test, reference=lgb_train)

# specify your configurations as a dict
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': {'l2', 'l1'},
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'verbose': 0
}

print('Starting training...')
# FIX: the `early_stopping_rounds` keyword of lgb.train() was removed in
# LightGBM 4.0; the callback form below works on both 3.x and 4.x.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=20,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(stopping_rounds=5)])

print('Saving model...')
# save model to file
gbm.save_model('model.txt')

print('Starting predicting...')
# predict with the best iteration found by early stopping
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# eval: RMSE on the held-out test set
print('The rmse of prediction is:', mean_squared_error(y_test, y_pred) ** 0.5)
代码来源:https://www.kaggle.com/chauhuynh/my-first-kernel-3-699
# NOTE(review): this excerpt omitted all of its imports; they are made
# explicit here so the snippet can run as a standalone script. It still
# assumes `df_train` / `df_test` (the Elo card data) are already loaded.
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold

# feature columns: everything except the ids, the target, and the helper flag
df_train_columns = [c for c in df_train.columns
                    if c not in ['card_id', 'first_active_month', 'target', 'outliers']]
target = df_train['target']
del df_train['target']

param = {'num_leaves': 31,
         'min_data_in_leaf': 30,
         'objective': 'regression',
         'max_depth': -1,
         'learning_rate': 0.01,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 4,
         "random_state": 4590}

# stratify on the `outliers` flag so every fold sees the same outlier ratio
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4590)
oof = np.zeros(len(df_train))          # out-of-fold predictions on train
predictions = np.zeros(len(df_test))   # fold-averaged test predictions
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train, df_train['outliers'].values)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(df_train.iloc[trn_idx][df_train_columns], label=target.iloc[trn_idx])
    val_data = lgb.Dataset(df_train.iloc[val_idx][df_train_columns], label=target.iloc[val_idx])

    num_round = 10000
    # FIX: the `verbose_eval` / `early_stopping_rounds` keywords of
    # lgb.train() were removed in LightGBM 4.0 -- use callbacks instead.
    clf = lgb.train(param, trn_data, num_round,
                    valid_sets=[trn_data, val_data],
                    callbacks=[lgb.log_evaluation(period=100),
                               lgb.early_stopping(stopping_rounds=100)])
    oof[val_idx] = clf.predict(df_train.iloc[val_idx][df_train_columns],
                               num_iteration=clf.best_iteration)

    # collect per-fold feature importances for the plot below
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = df_train_columns
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    # average the 5 fold models' test predictions
    predictions += clf.predict(df_test[df_train_columns],
                               num_iteration=clf.best_iteration) / folds.n_splits

# FIX: the original computed the CV RMSE but discarded the result -- print it
print("CV RMSE: {}".format(np.sqrt(mean_squared_error(oof, target))))

# plot importance averaged over folds, strongest features first
cols = (feature_importance_df[["Feature", "importance"]]
        .groupby("Feature")
        .mean()
        .sort_values(by="importance", ascending=False)[:1000].index)
best_features = feature_importance_df.loc[feature_importance_df.Feature.isin(cols)]
plt.figure(figsize=(14, 25))
sns.barplot(x="importance",
            y="Feature",
            data=best_features.sort_values(by="importance", ascending=False))
plt.title('LightGBM Features (avg over folds)')
plt.tight_layout()
plt.savefig('lgbm_importances.png')
分析:
1.上面代码的超参数是提前选好了的,所以如果你也选好了,那么也可以按上面那样来。
2.上面代码做的是5折交叉验证:5折意味着训练出5个模型,每个模型都会对test做预测;每个模型的预测值除以5后再累加,等价于对5个模型的预测结果取平均,从而增强泛化能力。
最终结果:
y_pred = (y_1+y_2+y_3+y_4+y_5)/5
与回归1高度相似。也可不用看。
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import numpy as np   # FIX: `np` was used below but never imported
import pandas as pd  # FIX: `pd` was used below but never imported

# plain (unstratified) 5-fold CV; assumes train_X / train_y / test_X exist
lgb_params = {"objective": "regression", "metric": "rmse",
              "max_depth": 7, "min_child_samples": 20,
              "reg_alpha": 1, "reg_lambda": 1,
              "num_leaves": 64, "learning_rate": 0.01,
              "subsample": 0.8, "colsample_bytree": 0.8,
              "verbosity": -1}
FOLDs = KFold(n_splits=5, shuffle=True, random_state=42)
oof_lgb = np.zeros(len(train_X))          # out-of-fold train predictions
predictions_lgb = np.zeros(len(test_X))   # fold-averaged test predictions
features_lgb = list(train_X.columns)
feature_importance_df_lgb = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(FOLDs.split(train_X)):
    trn_data = lgb.Dataset(train_X.iloc[trn_idx], label=train_y.iloc[trn_idx])
    val_data = lgb.Dataset(train_X.iloc[val_idx], label=train_y.iloc[val_idx])
    print("-" * 20 + "LGB Fold:" + str(fold_) + "-" * 20)

    num_round = 10000
    # FIX: `verbose_eval` / `early_stopping_rounds` keywords were removed
    # from lgb.train() in LightGBM 4.0 -- use callbacks instead.
    clf = lgb.train(lgb_params, trn_data, num_round,
                    valid_sets=[trn_data, val_data],
                    callbacks=[lgb.log_evaluation(period=1000),
                               lgb.early_stopping(stopping_rounds=50)])
    oof_lgb[val_idx] = clf.predict(train_X.iloc[val_idx], num_iteration=clf.best_iteration)

    # collect per-fold feature importances
    fold_importance_df_lgb = pd.DataFrame()
    fold_importance_df_lgb["feature"] = features_lgb
    fold_importance_df_lgb["importance"] = clf.feature_importance()
    fold_importance_df_lgb["fold"] = fold_ + 1
    feature_importance_df_lgb = pd.concat([feature_importance_df_lgb, fold_importance_df_lgb], axis=0)

    # average the fold models' test predictions
    predictions_lgb += clf.predict(test_X, num_iteration=clf.best_iteration) / FOLDs.n_splits

print("Best RMSE: ", np.sqrt(mean_squared_error(oof_lgb, train_y)))
其实,lgbm的关键语句就是:
# The essential LightGBM calls: train with periodic eval logging and early
# stopping on the validation set, then predict the held-out fold using the
# best iteration found by early stopping.
clf = lgb.train(param, trn_data, num_round,
valid_sets = [trn_data, val_data], verbose_eval=100,
early_stopping_rounds = 100)
oof[val_idx] = clf.predict(df_train.iloc[val_idx][df_train_columns], num_iteration=clf.best_iteration)
而我们常见的sklearn系列是fit,predict。
code from:https://www.kaggle.com/waitingli/combining-your-model-with-a-model-without-outlier
# NOTE(review): this excerpt omitted its imports; they are made explicit
# here. It still assumes `df_train` / `df_test`, `features`,
# `categorical_feats` and `target` (binary outlier flag) already exist.
import time

import lightgbm as lgb
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss
from sklearn.model_selection import KFold

# binary classifier (random-forest boosting mode) for outlier probability
param = {'num_leaves': 31,
         'min_data_in_leaf': 30,
         'objective': 'binary',
         'max_depth': 6,
         'learning_rate': 0.01,
         "boosting": "rf",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9,
         "bagging_seed": 11,
         "metric": 'binary_logloss',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "random_state": 2333}

# FIX: `%%time` is an IPython cell magic and a syntax error in a plain .py
# file; the loop is timed with time.time() instead (see `start` below).
folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(len(df_train))          # out-of-fold train predictions
predictions = np.zeros(len(df_test))   # fold-averaged test predictions
feature_importance_df = pd.DataFrame()
start = time.time()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train.values, target.values)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(df_train.iloc[trn_idx][features],
                           label=target.iloc[trn_idx],
                           categorical_feature=categorical_feats)
    val_data = lgb.Dataset(df_train.iloc[val_idx][features],
                           label=target.iloc[val_idx],
                           categorical_feature=categorical_feats)

    num_round = 10000
    # FIX: `verbose_eval` / `early_stopping_rounds` keywords were removed
    # from lgb.train() in LightGBM 4.0 -- use callbacks instead.
    clf = lgb.train(param, trn_data, num_round,
                    valid_sets=[trn_data, val_data],
                    callbacks=[lgb.log_evaluation(period=100),
                               lgb.early_stopping(stopping_rounds=200)])
    oof[val_idx] = clf.predict(df_train.iloc[val_idx][features], num_iteration=clf.best_iteration)

    # collect per-fold feature importances
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

    predictions += clf.predict(df_test[features], num_iteration=clf.best_iteration) / folds.n_splits

# FIX: `start` was recorded but never used -- report the elapsed time
print("elapsed: {:.1f}s".format(time.time() - start))
print("CV score: {:<8.5f}".format(log_loss(target, oof)))

### 'target' is the probability of whether an observation is an outlier
df_outlier_prob = pd.DataFrame({"card_id": df_test["card_id"].values})
df_outlier_prob["target"] = predictions
df_outlier_prob.head()