Model validation steps:
Evaluation metric and prediction method; performance evaluation. The competition goal is to predict whether a delivered coupon will be redeemed. Given this task and the relevant background, the average per-coupon AUC (area under the ROC curve) is used as the evaluation metric: the redemption-prediction AUC is computed separately for each coupon_id, and the mean over all coupons gives the final score.
The best model is found by working through the following checklist:
1. Plot learning curves to visually assess how well the model fits.
2. Tune hyperparameters in different ways: grid search and randomized search.
3. Plot validation curves to visualize the tuning process.
4. Choose an appropriate cross-validation scheme.
5. Use the average per-coupon AUC as the final evaluation metric.
6. Compare different models and keep the best one.
The Python code, with detailed comments, follows:
# Adjust pandas' default display settings
import pandas as pd
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 20)
"""按照: 评价指标 验证方式 学习曲线 结果分析,模型选择 模型调参 的步骤对模型进行选择,调优"""
from sklearn import metrics
import numpy as np
import datetime
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
######### Some of the classifiers integrated in scikit-learn ###############
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import MultinomialNB
######### scikit-learn classifier imports (end) ###############
import warnings
warnings.filterwarnings("ignore")
############ Global parameters #################################
id_col_names = ['user_id', 'coupon_id', 'date_received']
target_col_name = 'label'
id_target_cols = ['user_id', 'coupon_id', 'date_received', 'label']
myeval = 'roc_auc'
############ Directory definitions #################################
datapath = '../data/'
featurepath = '../feature/'
resultpath = '../result/'
tmppath = '../tmp/'
scorepath = '../score/'
########### Utility functions #############################################
# Return the ID columns
def get_id_df(df):
return df[id_col_names]
# Return the target column
def get_target_df(df):
return df[target_col_name]
# Return the feature columns
def get_predictors_df(df):
predictors = [f for f in df.columns if f not in id_target_cols]
return df[predictors]
# Read the training feature file
def read_featurefile_train():
df = pd.read_csv('C:\\Users\\Administrator\\Desktop\\数据挖掘项目\\O2O_data\\train_sf2.csv',
sep=',',
encoding="utf-8")
df.fillna(0, inplace=True)
return df
# Read the test feature file
def read_featurefile_test():
df = pd.read_csv('C:\\Users\\Administrator\\Desktop\\数据挖掘项目\\O2O_data\\test_sf2.csv',
sep=',',
encoding="utf-8")
df.fillna(0, inplace=True)
return df
# Min-max normalize the features; the scaler is fit on the training set only, so no test-set information leaks into it
def standize_df(train_data, test_data):
from sklearn import preprocessing
features_columns = [
f for f in test_data.columns if f not in id_target_cols
]
min_max_scaler = preprocessing.MinMaxScaler()
min_max_scaler = min_max_scaler.fit(train_data[features_columns])
train_data_scaler = min_max_scaler.transform(train_data[features_columns])
test_data_scaler = min_max_scaler.transform(test_data[features_columns])
train_data_scaler = pd.DataFrame(train_data_scaler)
train_data_scaler.columns = features_columns
test_data_scaler = pd.DataFrame(test_data_scaler)
test_data_scaler.columns = features_columns
train_data_scaler['label'] = train_data['label']
train_data_scaler[id_col_names] = train_data[id_col_names]
test_data_scaler[id_col_names] = test_data[id_col_names]
return train_data_scaler, test_data_scaler
# Read the feature files
def read_data():
traindf = read_featurefile_train()
testdf = read_featurefile_test()
    # return traindf, testdf  # (return the raw, un-normalized frames instead)
return standize_df(traindf, testdf)
# Load the data
# All features were generated in the previous section
train_f2, test_f2 = read_data()  # returns the normalized training and test data
# Evaluation metric and prediction method
# Performance evaluation
# The goal is to predict whether a delivered coupon will be redeemed.
# Given the task and its background, the average per-coupon AUC (area under the
# ROC curve) is the evaluation standard: compute the redemption-prediction AUC
# for each coupon_id separately, then average over all coupons.
# Average per-coupon AUC
from sklearn.metrics import roc_auc_score
def myauc(test):
    testgroup = test.groupby(['coupon_id'])
    aucs = []
    for _, coupon_df in testgroup:
        # AUC is only defined when both classes are present
        if len(coupon_df['label'].unique()) < 2:
            continue
        auc = metrics.roc_auc_score(coupon_df['label'], coupon_df['pred'])
        aucs.append(auc)
    return np.average(aucs)
"""
虽然赛题是按照coupon的AUC来计算。不过因为整体AUC(也就是用roc_auc_score 求出的结果)与Coupon AUC同增同减,
所以在进行评估的时候可以直接使用整体AUC。
预测方式,因为要的结果是购买的几率,所以不能直接用Predict因为这样会直接返回0,1,
而要用predict_proba,它会返回每个类别的可能行,取其中为1的列即可
"""
# Validation schemes
# Simple hold-out validation
from sklearn.model_selection import train_test_split  # data splitting
target = get_target_df(train_f2).copy()
traindf = train_f2.copy()
# 80% train / 20% validation split
train_all, test_all, train_target, test_target = train_test_split(
traindf, target, test_size=0.2, random_state=0)
train_data = get_predictors_df(train_all).copy()
test_data = get_predictors_df(test_all).copy()
clf = LogisticRegression()
clf.fit(train_data, train_target)
train_pred = clf.predict_proba(train_data)[:, 1]
test_pred = clf.predict_proba(test_data)[:, 1]
score_train = roc_auc_score(train_target, train_pred)
score_test = roc_auc_score(test_target, test_pred)
print("LogisticRegression train 总体AUC: ", score_train)
print("LogisticRegression test 总体AUC: ", score_test)
train_all['pred'] = train_pred
test_all['pred'] = test_pred
print("LogisticRegression train Coupon AUC: ", myauc(train_all))
print("LogisticRegression test Coupon AUC: ", myauc(test_all))
# K-fold cross-validation (K-fold CV)
# 5-fold cross-validation
train = train_f2.copy()
target = get_target_df(train_f2).copy()
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
for k, (train_index, test_index) in enumerate(kf.split(train)):  # train_index/test_index are arrays of row positions for each fold
train_data, test_data, train_target, test_target = train.iloc[
train_index], train.iloc[test_index], target[train_index], target[
test_index]
clf = LogisticRegression()
clf.fit(get_predictors_df(train_data), train_target)
train_pred = clf.predict_proba(get_predictors_df(train_data))[:, 1]
test_pred = clf.predict_proba(get_predictors_df(test_data))[:, 1]
score_train = roc_auc_score(train_target, train_pred)
score_test = roc_auc_score(test_target, test_pred)
train_data['pred'] = train_pred
test_data['pred'] = test_pred
    print("Fold", k + 1, "LogisticRegression train overall AUC: ", score_train)
    print("Fold", k + 1, "LogisticRegression test overall AUC: ", score_test)
    print("Fold", k + 1, "LogisticRegression train Coupon AUC: ", myauc(train_data))
    print("Fold", k + 1, "LogisticRegression test Coupon AUC: ", myauc(test_data), '\n')
# Leave-P-Out cross-validation (LPO CV)
train = train_f2.copy()
target = get_target_df(train_f2).copy()
from sklearn.model_selection import LeavePOut
lpo = LeavePOut(p=200)
for k, (train_index, test_index) in enumerate(lpo.split(train)):
train_data, test_data, train_target, test_target = train.iloc[
train_index], train.iloc[test_index], target[train_index], target[
test_index]
clf = LogisticRegression()
clf.fit(get_predictors_df(train_data), train_target)
train_pred = clf.predict_proba(get_predictors_df(train_data))[:, 1]
test_pred = clf.predict_proba(get_predictors_df(test_data))[:, 1]
score_train = roc_auc_score(train_target, train_pred)
score_test = roc_auc_score(test_target, test_pred)
train_data['pred'] = train_pred
test_data['pred'] = test_pred
    print("Split", k + 1, "LogisticRegression train overall AUC: ", score_train)
    print("Split", k + 1, "LogisticRegression test overall AUC: ", score_test)
    print("Split", k + 1, "LogisticRegression train Coupon AUC: ", myauc(train_data))
    print("Split", k + 1, "LogisticRegression test Coupon AUC: ", myauc(test_data), '\n')
if k >= 5:
break
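# --- Illustrative aside (an addition, not part of the original pipeline): why
# the loop above has to break early. LeavePOut enumerates every C(n, p) way of
# holding out p samples, which explodes combinatorially even for tiny datasets:
from math import comb
print(comb(50, 10))  # leaving just 10 of 50 samples out already gives 10272278170 splits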
# StratifiedKFold
# Comparing the schemes, StratifiedKFold suits this competition best, because
# its positive and negative samples are imbalanced. StratifiedKFold performs
# stratified sampling, keeping the class proportions in each training and test
# fold the same as in the full dataset.
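# --- Illustrative sketch (an addition, not the original code): compare the
# positive-class ratio per fold under plain KFold and StratifiedKFold, using
# the train_f2 data loaded above. StratifiedKFold keeps every fold's ratio
# close to the global one, which is what matters with imbalanced labels.
from sklearn.model_selection import KFold, StratifiedKFold
_X = get_predictors_df(train_f2)
_y = get_target_df(train_f2)
for _name, _cv in [('KFold', KFold(n_splits=5)),
                   ('StratifiedKFold', StratifiedKFold(n_splits=5))]:
    _ratios = ['%.4f' % _y.iloc[_idx].mean() for _, _idx in _cv.split(_X, _y)]
    print(_name, 'fold positive ratios:', _ratios, '| overall: %.4f' % _y.mean())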
# StratifiedKFold 5-fold cross-validation
train = train_f2.copy()
target = get_target_df(train_f2).copy()
from sklearn.model_selection import StratifiedKFold
kf = StratifiedKFold(n_splits=5)
for k, (train_index, test_index) in enumerate(kf.split(train, target)):
train_data, test_data, train_target, test_target = train.iloc[
train_index], train.iloc[test_index], target[train_index], target[
test_index]
clf = LogisticRegression()
clf.fit(get_predictors_df(train_data), train_target)
train_pred = clf.predict_proba(get_predictors_df(train_data))[:, 1]
test_pred = clf.predict_proba(get_predictors_df(test_data))[:, 1]
score_train = roc_auc_score(train_target, train_pred)
score_test = roc_auc_score(test_target, test_pred)
train_data['pred'] = train_pred
test_data['pred'] = test_pred
    print("Fold", k + 1, "LogisticRegression train overall AUC: ", score_train)
    print("Fold", k + 1, "LogisticRegression test overall AUC: ", score_test)
    print("Fold", k + 1, "LogisticRegression train Coupon AUC: ", myauc(train_data))
    print("Fold", k + 1, "LogisticRegression test Coupon AUC: ", myauc(test_data), '\n')
#################### A unified code framework built on sklearn ##########################
# Functions provided below:
#   get_sklearn_model(model_name, param): build a classifier by name
#   classifier_df_score(train_feat, classifier, cvnum, param): evaluate it with stratified CV
#################### Classifiers integrated behind the sklearn interface ###############
def get_sklearn_model(model_name, param=None):
    # Naive Bayes
    if model_name == 'NB':
        model = MultinomialNB(alpha=0.01)
    # Logistic regression
    elif model_name == 'LR':
        model = LogisticRegression(penalty='l2')
    # K nearest neighbours
    elif model_name == 'KNN':
        model = KNeighborsClassifier()
    # Random forest
    elif model_name == 'RF':
        model = RandomForestClassifier()
    # Decision tree
    elif model_name == 'DT':
        model = tree.DecisionTreeClassifier()
    # Support vector machine (probability=True so predict_proba works below)
    elif model_name == 'SVC':
        model = SVC(kernel='rbf', probability=True)
    # GBDT
    elif model_name == 'GBDT':
        model = GradientBoostingClassifier()
    # XGBoost
    elif model_name == 'XGB':
        model = XGBClassifier()
    # LightGBM
    elif model_name == 'LGB':
        model = LGBMClassifier()
    else:
        print("wrong model name!")
        return
    if param is not None:
        model.set_params(**param)
    return model
# Evaluate an algorithm with stratified K-fold cross-validation
def classifier_df_score(train_feat, classifier, cvnum, param=None):
clf = get_sklearn_model(classifier, param)
train = train_feat.copy()
target = get_target_df(train_feat).copy()
kf = StratifiedKFold(n_splits=cvnum)
scores = []
score_coupons = []
for k, (train_index, test_index) in enumerate(kf.split(train, target)):
train_data, test_data, train_target, test_target = train.iloc[
train_index], train.iloc[test_index], target[train_index], target[
test_index]
clf.fit(get_predictors_df(train_data), train_target)
train_pred = clf.predict_proba(get_predictors_df(train_data))[:, 1]
test_pred = clf.predict_proba(get_predictors_df(test_data))[:, 1]
score_test = roc_auc_score(test_target, test_pred)
test_data['pred'] = test_pred
score_coupon_test = myauc(test_data)
scores.append(score_test)
score_coupons.append(score_coupon_test)
    print(classifier + " overall AUC:", scores)
    print(classifier + " Coupon AUC:", score_coupons)
# f2 features
train = train_f2.copy()
print(train.head())
print('Feature set f2, 5-fold CV scores of different models:')
classifier_df_score(train, 'NB', 5)
classifier_df_score(train, 'LR', 5)
classifier_df_score(train, 'RF', 5)
classifier_df_score(train, 'LGB', 5)
"""通过对比训练集上不同算法的运算结果可以发现,F1特征集因为特征比较少,有严重的欠拟合,
所以所有算法的分数都比较低。 F2特征集通过滑窗增加统计特征,它的分数比f1有了飞跃性的提高,
其实在现实的业务场景F2+LR已经是一个很常用的解决方案了。之所以在实际作业中更倾向逻辑回归而不是类似LightGBM的算法,
是为了减少计算量。当然如果计算资源不是问题的话,LightGBM也是一个好选择
"""
# Visual analysis with learning curves
# Plotting learning curves gives a deeper view of the training process.
# Plot a learning curve
def plot_learning_curve(estimator,
title,
X,
y,
ylim=None,
cv=None,
n_jobs=1,
train_sizes=[0.01, 0.02, 0.05, 0.1, 0.2, 0.3]):
plt.figure()
plt.title(title)
if ylim is not None:
plt.ylim(*ylim)
plt.xlabel("Training examples")
plt.ylabel("Score")
train_sizes, train_scores, test_scores = learning_curve(
estimator,
X,
y,
cv=cv,
scoring=myeval,
n_jobs=n_jobs,
train_sizes=train_sizes)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.grid()
plt.fill_between(train_sizes,
train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std,
alpha=0.1,
color="r")
plt.fill_between(train_sizes,
test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std,
alpha=0.1,
color="g")
plt.plot(train_sizes,
train_scores_mean,
'o-',
color="r",
label="Training score")
plt.plot(train_sizes,
test_scores_mean,
'o-',
color="g",
label="Cross-validation score")
plt.legend(loc="best")
plt.show()
return plt
# Plot an algorithm's learning curve; to keep plotting fast, only small
# fractions of the data are used as training sizes
def plot_curve_single(traindf,
classifier,
cvnum,
train_sizes=[0.01, 0.02, 0.05, 0.1, 0.2, 0.3]):
X = get_predictors_df(traindf)
y = get_target_df(traindf)
title = "learning curve of " + classifier + ", cv:" + str(cvnum)
    estimator = get_sklearn_model(classifier)  # build the model
plot_learning_curve(estimator,
title,
X,
y,
ylim=(0, 1.01),
cv=cvnum,
train_sizes=train_sizes)
# F2 feature set
plot_curve_single(train_f2, 'DT', 5, [0.1, 0.2, 0.3, 0.5])
plot_curve_single(train_f2, 'NB', 5, [0.1, 0.2, 0.3, 0.5])
plot_curve_single(train_f2, 'LR', 5, [0.1, 0.2, 0.3, 0.5])
plot_curve_single(train_f2, 'RF', 5, [0.1, 0.2, 0.3, 0.5])
plot_curve_single(train_f2, 'LGB', 5, [0.1, 0.2, 0.3, 0.5])
# Model hyperparameter space and tuning
# f2 features
train = get_predictors_df(train_f2)
target = get_target_df(train_f2)
print(train.head())
# Random forest illustrates the basic tuning API
# Exhaustive grid search
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split  # data splitting
# 80% train / 20% validation split
train_data, test_data, train_target, test_target = train_test_split(
train, target, test_size=0.2, random_state=0)
model = RandomForestClassifier()
parameters = {'n_estimators': [20, 50, 100], 'max_depth': [1, 2, 3]}
# tune directly for AUC, matching the evaluation metric
clf = GridSearchCV(model, parameters, cv=3, verbose=2, scoring=myeval)
clf.fit(train_data, train_target)
# AUC needs predicted probabilities, not the hard labels predict returns
score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])
print("RandomForestClassifier GridSearchCV test AUC: ", score_test)
print("Best parameters:")
print(clf.best_params_)
print(sorted(clf.cv_results_.keys()))
# Randomized parameter search
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split  # data splitting
# 80% train / 20% validation split
train_data, test_data, train_target, test_target = train_test_split(
train, target, test_size=0.2, random_state=0)
model = RandomForestClassifier()
parameters = {'n_estimators': [10, 20, 30, 50], 'max_depth': [1, 2, 3]}
clf = RandomizedSearchCV(model, parameters, cv=3, verbose=2, scoring=myeval)
clf.fit(train_data, train_target)
score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])
print("RandomForestClassifier RandomizedSearchCV test AUC: ", score_test)
print("Best parameters:")
print(clf.best_params_)
print(sorted(clf.cv_results_.keys()))
"""F3特征集和LightGBM 网格调参
LightGBM 调参次序:
第一步:学习率和迭代次数
第二步:确定max_depth和num_leaves
第三步:确定min_data_in_leaf和max_bin in
第四步:确定feature_fraction、bagging_fraction、bagging_freq
第五步:确定lambda_l1和lambda_l2
第六步:确定 min_split_gain
第七步:降低学习率,增加迭代次数,验证模型
"""
# f2 features
traindf = train_f2.copy()
# Split by date_received. To speed up the grid search, only about half the data
# is used here; a real run should use all of it.
train = traindf[traindf.date_received < 20160515]
test = traindf[traindf.date_received >= 20160515]
train_data = get_predictors_df(train).copy()
train_target = get_target_df(train).copy()
test_data = get_predictors_df(test).copy()
test_target = get_target_df(test).copy()
print(traindf.head())
# Step 1: learning rate and number of iterations
from sklearn.model_selection import GridSearchCV
# cv=3 to save time; 5 is the usual choice.
# Each fit is expensive, so every parameter gets only a few, widely spaced
# candidates; a proper search would use more values at a finer spacing.
# As this is a demonstration, a best value on the edge of its range is accepted
# as is; normally an edge optimum means the range must be widened and searched
# again (a small programmatic check for this follows the first search).
model = LGBMClassifier(boosting_type='gbdt',
objective='binary',
metrics='auc',
learning_rate=0.1,
max_depth=5,
bagging_fraction=0.8,
feature_fraction=0.8)
parameters = {'n_estimators': [100, 150, 175, 200, 225, 250]}
clf = GridSearchCV(model, parameters, cv=3, verbose=2)
clf.fit(train_data, train_target)
score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])
print("LightGBM GridSearchCV AUC Score: ", score_test)
print("最优参数:")
print(clf.best_params_)
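# --- Optional guard (an addition): as noted above, an optimum on the edge of
# the searched range means the range should be widened and the search rerun.
# A small check makes that explicit:
def warn_if_on_edge(best_params, grid):
    for name, values in grid.items():
        values = sorted(values)
        if best_params.get(name) in (values[0], values[-1]):
            print('warning: %s=%s lies on the edge of %s; widen the range'
                  % (name, best_params[name], list(values)))
warn_if_on_edge(clf.best_params_, parameters)  # here: the n_estimators grid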
# Step 2: max_depth and num_leaves
# Fixed so far: n_estimators=200
model = LGBMClassifier(boosting_type='gbdt',
objective='binary',
metrics='auc',
n_estimators=200,
learning_rate=0.1,
bagging_fraction=0.8,
feature_fraction=0.8)
parameters = {'max_depth': range(4, 8, 1), 'num_leaves': range(10, 150, 10)}
# parameters={'max_depth': range(4,8,2), 'num_leaves':range(10, 100, 20)}
clf = GridSearchCV(model, parameters, cv=3, verbose=2)
clf.fit(train_data, train_target)
score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])
print("LightGBM GridSearchCV AUC Score: ", score_test)
print("最优参数:")
print(clf.best_params_)
# Based on the results, fix max_depth=6 and num_leaves=40 from here on.
# Step 3: min_data_in_leaf and max_bin
# Fixed so far:
#n_estimators=200
#{'max_depth': 6, 'num_leaves': 40}
model = LGBMClassifier(boosting_type='gbdt',
objective='binary',
metrics='auc',
n_estimators=200,
max_depth=6,
num_leaves=40,
learning_rate=0.1,
bagging_fraction=0.8,
feature_fraction=0.8)
#parameters={'max_bin': range(100,500,50),'min_data_in_leaf':range(100,150,10)}
parameters = {
'max_bin': range(100, 500, 100),
'min_data_in_leaf': range(100, 150, 50)
}
# Raise verbose to see more detail
clf = GridSearchCV(model, parameters, cv=3, verbose=3)
clf.fit(train_data, train_target)
score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])
print("LightGBM GridSearchCV AUC Score: ", score_test)
print("最优参数:")
print(clf.best_params_)
# LightGBM GridSearchCV AUC Score: 0.8158486756976501
# Best parameters:
# {'max_bin': 400, 'min_data_in_leaf': 100}
# Step 4: feature_fraction, bagging_fraction and bagging_freq
# Fixed so far:
#n_estimators=200
#{'max_depth': 6, 'num_leaves': 40}
#{'max_bin': 400, 'min_data_in_leaf': 120}
model = LGBMClassifier(boosting_type='gbdt',
objective='binary',
metrics='auc',
n_estimators=200,
max_depth=6,
num_leaves=40,
max_bin=400,
min_data_in_leaf=120,
learning_rate=0.1,
bagging_fraction=0.8,
feature_fraction=0.8)
parameters = {
'feature_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
'bagging_freq': range(0, 10, 2)
}
# Raise verbose to see more detail
clf = GridSearchCV(model, parameters, cv=3, verbose=3)
clf.fit(train_data, train_target)
score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])
print("LightGBM GridSearchCV AUC Score: ", score_test)
print("最优参数:")
print(clf.best_params_)
# Step 5: lambda_l1 and lambda_l2
# Fixed so far:
#n_estimators=200
#{'max_depth': 6, 'num_leaves': 40}
#{'max_bin': 400, 'min_data_in_leaf': 120}
#{'bagging_fraction': 0.9, 'bagging_freq': 4, 'feature_fraction': 0.6}
model = LGBMClassifier(boosting_type='gbdt',
objective='binary',
metrics='auc',
n_estimators=200,
max_depth=6,
num_leaves=40,
max_bin=400,
min_data_in_leaf=120,
learning_rate=0.1,
bagging_freq=4,
bagging_fraction=0.9,
feature_fraction=0.6)
parameters = {
'lambda_l1': [1e-5, 1e-3, 1e-1, 0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
'lambda_l2': [1e-5, 1e-3, 1e-1, 0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
}
# Raise verbose to see more detail
clf = GridSearchCV(model, parameters, cv=3, verbose=3)
clf.fit(train_data, train_target)
score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])
print("LightGBM GridSearchCV AUC Score: ", score_test)
print("最优参数:")
print(clf.best_params_)
# Step 6: min_split_gain
# Fixed so far:
#n_estimators=200
#{'max_depth': 6, 'num_leaves': 40}
#{'max_bin': 400, 'min_data_in_leaf': 120}
#{'bagging_fraction': 0.9, 'bagging_freq': 4, 'feature_fraction': 0.6}
#{'lambda_l1': 1e-05, 'lambda_l2': 1e-05}
model = LGBMClassifier(boosting_type='gbdt',
objective='binary',
metrics='auc',
n_estimators=200,
max_depth=6,
num_leaves=40,
max_bin=400,
min_data_in_leaf=120,
learning_rate=0.1,
lambda_l1=1e-05,
lambda_l2=1e-05,
bagging_freq=4,
bagging_fraction=0.9,
feature_fraction=0.6)
parameters = {
'min_split_gain': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
}
# Raise verbose to see more detail
clf = GridSearchCV(model, parameters, cv=3, verbose=3)
clf.fit(train_data, train_target)
score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])
print("LightGBM GridSearchCV AUC Score: ", score_test)
print("最优参数:")
print(clf.best_params_)
# Step 7: lower the learning rate and increase the iterations, then validate
# the model. (The runs below vary only the learning rate, keeping
# n_estimators=200; a joint-scaling sketch follows the comparison.)
# Fixed so far:
#n_estimators=200
#{'max_depth': 6, 'num_leaves': 40}
#{'max_bin': 400, 'min_data_in_leaf': 120}
#{'bagging_fraction': 0.9, 'bagging_freq': 4, 'feature_fraction': 0.6}
#{'lambda_l1': 1e-05, 'lambda_l2': 1e-05}
#{'min_split_gain': 0.0}
model = LGBMClassifier(boosting_type='gbdt',
objective='binary',
metrics='auc',
n_estimators=200,
max_depth=6,
num_leaves=40,
max_bin=400,
min_data_in_leaf=120,
learning_rate=0.1,
lambda_l1=1e-05,
lambda_l2=1e-05,
min_split_gain=0.0,
bagging_freq=4,
bagging_fraction=0.9,
feature_fraction=0.6)
model.fit(train_data, train_target)
score_test = roc_auc_score(test_target, model.predict_proba(test_data)[:, 1])
print("LightGBM learning rate 0.1 AUC Score: ", score_test)
model = LGBMClassifier(boosting_type='gbdt',
objective='binary',
metrics='auc',
n_estimators=200,
max_depth=6,
num_leaves=40,
max_bin=400,
min_data_in_leaf=120,
learning_rate=0.05,
lambda_l1=1e-05,
lambda_l2=1e-05,
min_split_gain=0.0,
bagging_freq=4,
bagging_fraction=0.9,
feature_fraction=0.6)
model.fit(train_data, train_target)
score_test = roc_auc_score(test_target, model.predict_proba(test_data)[:, 1])
print("LightGBM learning rate 0.05 AUC Score: ", score_test)
model = LGBMClassifier(boosting_type='gbdt',
objective='binary',
metrics='auc',
n_estimators=200,
max_depth=6,
num_leaves=40,
max_bin=400,
min_data_in_leaf=120,
learning_rate=0.01,
lambda_l1=1e-05,
lambda_l2=1e-05,
min_split_gain=0.0,
bagging_freq=4,
bagging_fraction=0.9,
feature_fraction=0.6)
model.fit(train_data, train_target)
score_test = roc_auc_score(test_target, model.predict_proba(test_data)[:, 1])
print("LightGBM learning rate 0.01 AUC Score: ", score_test)
model = LGBMClassifier(boosting_type='gbdt',
objective='binary',
metrics='auc',
n_estimators=200,
max_depth=6,
num_leaves=40,
max_bin=400,
min_data_in_leaf=120,
learning_rate=0.005,
lambda_l1=1e-05,
lambda_l2=1e-05,
min_split_gain=0.0,
bagging_freq=4,
bagging_fraction=0.9,
feature_fraction=0.6)
model.fit(train_data, train_target)
score_test = roc_auc_score(test_target, model.predict_proba(test_data)[:, 1])
print("LightGBM learning rate 0.005 AUC Score: ", score_test)
model = LGBMClassifier()
model.fit(train_data, train_target)
score_test = roc_auc_score(test_target, model.predict_proba(test_data)[:, 1])
print("默认参数 AUC Score: ", score_test)
"""最优参数 model = LGBMClassifier(boosting_type='gbdt',objective='binary',metrics='auc',n_estimators=200,max_depth=6,num_leaves=40,
max_bin=400,min_data_in_leaf=120,
learning_rate=0.05,
lambda_l1=1e-05,lambda_l2=1e-05,min_split_gain=0.0,
bagging_freq=4, bagging_fraction = 0.9,feature_fraction = 0.6)
"""
# f2 features
train = train_f2.copy()
print(train.head())
print('Default parameters')
classifier_df_score(train, 'LGB', 5)
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',  # 'eval_metric' belongs to the sklearn fit API, not here
    'n_estimators': 200,
    'max_depth': 5,  # note: the grid search above settled on max_depth=6
    'num_leaves': 40,
    'max_bin': 400,
    'min_data_in_leaf': 120,
    'learning_rate': 0.1,
    'lambda_l1': 1e-05,
    'lambda_l2': 1e-05,
    'min_split_gain': 0.0,
    'bagging_freq': 4,
    'bagging_fraction': 0.9,
    'feature_fraction': 0.6,
    'seed': 1024,
    'num_threads': 12
}
print('After tuning')
classifier_df_score(train, 'LGB', 5, params)
"""默认参数
LGB总体AUC: [0.9017063876540394, 0.9028474497847858, 0.8860489799298887, 0.8810605942191141, 0.8825448947499648]
LGBCoupon AUC: [0.7440798692234487, 0.741981950122888, 0.7474652592149504, 0.7642682296497904, 0.7628559092978145]
调参后
LGB总体AUC: [0.9025855554052864, 0.9043496920648407, 0.8861875013967577, 0.8820594188479879, 0.883635213451176]
LGBCoupon AUC: [0.7441512385391114, 0.7432384282827527, 0.7463287384907189, 0.7663217891405024, 0.7620611110529001]
"""
# The tuned model beats the defaults, but only slightly; the gain is much
# smaller than what feature work contributes. Tuning can also only be done
# against an offline validation set, and sometimes it overfits that set, so the
# online score may even drop afterwards. Tuning is therefore usually left to
# the late stage of a competition, with the early effort going into features
# and model selection.
# Plot validation curves
# Validation curves make the tuning process visible
# Grid-search one hyperparameter and plot training and CV scores against it
def grid_plot(train_feat,
classifier,
cvnum,
param_range,
param_name,
param=None):
from sklearn.model_selection import validation_curve
train_scores, test_scores = validation_curve(get_sklearn_model(
classifier, param),
get_predictors_df(train_feat),
get_target_df(train_feat),
param_name=param_name,
param_range=param_range,
cv=cvnum,
scoring='roc_auc',
n_jobs=1)
train_scores_mean = np.mean(train_scores, axis=1)
train_scores_std = np.std(train_scores, axis=1)
test_scores_mean = np.mean(test_scores, axis=1)
test_scores_std = np.std(test_scores, axis=1)
plt.title("Validation Curve with " + param_name)
plt.xlabel(param_name)
plt.ylabel("Score")
plt.ylim(0.0, 1.1)
plt.semilogx(param_range,
train_scores_mean,
label="Training score",
color="r")
plt.fill_between(param_range,
train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std,
alpha=0.2,
color="r")
plt.semilogx(param_range,
test_scores_mean,
label="Cross-validation score",
color="g")
plt.fill_between(param_range,
test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std,
alpha=0.2,
color="g")
plt.legend(loc="best")
plt.show()
# Examine how LogisticRegression behaves as max_iter grows
train_feat = train_f2.copy()
# grid_plot(train_feat,classifier,3,[10,20,40,80,200,400,800],'n_estimators',param=params)
grid_plot(train_feat,
'LR',
3, [1, 2, 5, 10, 20, 40, 50],
'max_iter',
param=None)
params = {
    'learning_rate': 0.1,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 50,
    'bagging_fraction': 0.8,
    'feature_fraction': 0.8  # note: 'sub_feature' is an alias of this parameter, so only one may be set
}
train_feat = train_f2.copy()
# grid_plot(train_feat,classifier,3,[10,20,40,80,200,400,800],'n_estimators',param=params)
grid_plot(train_feat, 'LGB', 3, [10, 20, 40], 'n_estimators', param=params)
params = {
'learning_rate': 0.1,
'boosting_type': 'gbdt',
'objective': 'binary',
'metric': 'auc',
'num_leaves': 10
}
train_feat = train_f2.copy()
# grid_plot(train_feat,classifier,3,[10,20,40,80,200,400,800],'n_estimators',param=params)
grid_plot(train_feat,
'LGB',
3, [0.1, 0.2, 0.5, 0.7, 0.8],
'colsample_bytree',
param=params)