Tianchi O2O Coupon Prediction: A Walkthrough of the Model Validation Code

Steps of the model validation part:

Model selection and tuning follow this order: evaluation metric, validation scheme, learning curves, result analysis and model selection, and hyperparameter tuning.

Evaluation metric and prediction method (the performance evaluation function). The goal of the competition is to predict whether a distributed coupon will be redeemed. For this task, the average AUC (area under the ROC curve) of the redemption prediction is used as the evaluation standard: an AUC is computed separately for each coupon_id, and the mean over all coupons is the final score.
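Written as a formula (coupons whose labels are all one class are skipped, as in the myauc function below): score = (1/|C|) · Σ_{c∈C} AUC_c, where C is the set of coupon_ids whose labels contain both classes.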

The best model is then obtained by following these steps, each demonstrated in the code below:

1. Plot learning curves to visually analyze how well the model fits

2. Apply different tuning strategies: grid search and randomized search

3. Plot validation curves to visualize the tuning process

4. Compare and choose a cross-validation scheme

5. Use the per-coupon average AUC as the final evaluation metric

6. Compare different models and select the best one

The Python code, with detailed comments, follows:

# Adjust pandas' default display settings
import pandas as pd
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 20)

"""按照: 评价指标 验证方式 学习曲线 结果分析,模型选择 模型调参 的步骤对模型进行选择,调优"""

from sklearn import metrics
import numpy as np
import pandas as pd
import datetime
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
######### Some classifiers bundled with scikit-learn ###############
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import MultinomialNB
######### End of scikit-learn classifier imports ###############
import warnings
warnings.filterwarnings("ignore")

############ Global parameters #################################
id_col_names = ['user_id', 'coupon_id', 'date_received']
target_col_name = 'label'
id_target_cols = ['user_id', 'coupon_id', 'date_received', 'label']
myeval = 'roc_auc'
# cvscore=0
############ Directory definitions #################################
datapath = '../data/'
featurepath = '../feature/'
resultpath = '../result/'
tmppath = '../tmp/'
scorepath = '../score/'

########### Utility functions #############################################

# Return the ID columns
def get_id_df(df):
    return df[id_col_names]


# Return the target column
def get_target_df(df):
    return df[target_col_name]


# Return the feature columns
def get_predictors_df(df):
    predictors = [f for f in df.columns if f not in id_target_cols]
    return df[predictors]


# Read the training-set feature file
def read_featurefile_train():
    df = pd.read_csv('C:\\Users\\Administrator\\Desktop\\数据挖掘项目\\O2O_data\\train_sf2.csv',
                     sep=',',
                     encoding="utf-8")
    df.fillna(0, inplace=True)
    return df


# Read the test-set feature file
def read_featurefile_test():
    df = pd.read_csv('C:\\Users\\Administrator\\Desktop\\数据挖掘项目\\O2O_data\\test_sf2.csv',
                     sep=',',
                     encoding="utf-8")
    df.fillna(0, inplace=True)
    return df


# Min-max normalize the features (the scaler is fit on the training set only)
def standize_df(train_data, test_data):
    from sklearn import preprocessing

    features_columns = [
        f for f in test_data.columns if f not in id_target_cols
    ]
    min_max_scaler = preprocessing.MinMaxScaler()
    min_max_scaler = min_max_scaler.fit(train_data[features_columns])

    train_data_scaler = min_max_scaler.transform(train_data[features_columns])
    test_data_scaler = min_max_scaler.transform(test_data[features_columns])

    train_data_scaler = pd.DataFrame(train_data_scaler)
    train_data_scaler.columns = features_columns

    test_data_scaler = pd.DataFrame(test_data_scaler)
    test_data_scaler.columns = features_columns

    train_data_scaler['label'] = train_data['label']
    train_data_scaler[id_col_names] = train_data[id_col_names]
    test_data_scaler[id_col_names] = test_data[id_col_names]
    return train_data_scaler, test_data_scaler


# Read both feature files and normalize them
def read_data():
    traindf = read_featurefile_train()
    testdf = read_featurefile_test()
    #return traindf,testdf
    return standize_df(traindf, testdf)


# Data loading

# All features were generated in the previous section
train_f2, test_f2 = read_data()  # returns the normalized training and test data


# Evaluation metric and prediction method

# Performance evaluation function
# The goal of the competition is to predict whether a distributed coupon will be redeemed.
# The average AUC (area under the ROC curve) of the redemption prediction is used as the evaluation standard:
# an AUC is computed separately for each coupon_id, and the mean over all coupons is the final score.
# Per-coupon average AUC
from sklearn.metrics import roc_auc_score


def myauc(test):
    testgroup = test.groupby(['coupon_id'])
    aucs = []
    for _, coupon_df in testgroup:
        # AUC is only defined when both classes are present
        if len(coupon_df['label'].unique()) < 2:
            continue
        auc = metrics.roc_auc_score(coupon_df['label'], coupon_df['pred'])
        aucs.append(auc)
    return np.average(aucs)
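
# A quick sanity check of myauc on hand-made data (illustrative, not part of the
# original pipeline): coupon 1 is ranked perfectly (AUC 1.0), coupon 2 is ranked
# perfectly wrong (AUC 0.0), and coupon 3 has only one class so it is skipped;
# the expected result is (1.0 + 0.0) / 2 = 0.5.
_toy = pd.DataFrame({
    'coupon_id': [1, 1, 2, 2, 3, 3],
    'label':     [0, 1, 0, 1, 1, 1],
    'pred':      [0.1, 0.9, 0.9, 0.1, 0.5, 0.6],
})
assert myauc(_toy) == 0.5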


"""
虽然赛题是按照coupon的AUC来计算。不过因为整体AUC(也就是用roc_auc_score 求出的结果)与Coupon AUC同增同减,
所以在进行评估的时候可以直接使用整体AUC。
预测方式,因为要的结果是购买的几率,所以不能直接用Predict因为这样会直接返回0,1,
而要用predict_proba,它会返回每个类别的可能行,取其中为1的列即可
"""

# Validation schemes

# Simple hold-out validation
from sklearn.model_selection import train_test_split  # data splitting

target = get_target_df(train_f2).copy()
traindf = train_f2.copy()

# Split the data: 80% training, 20% validation
train_all, test_all, train_target, test_target = train_test_split(
    traindf, target, test_size=0.2, random_state=0)

train_data = get_predictors_df(train_all).copy()
test_data = get_predictors_df(test_all).copy()

clf = LogisticRegression()
clf.fit(train_data, train_target)
train_pred = clf.predict_proba(train_data)[:, 1]
test_pred = clf.predict_proba(test_data)[:, 1]

score_train = roc_auc_score(train_target, train_pred)
score_test = roc_auc_score(test_target, test_pred)
print("LogisticRegression train 总体AUC:   ", score_train)
print("LogisticRegression test 总体AUC:   ", score_test)

train_all['pred'] = train_pred
test_all['pred'] = test_pred
print("LogisticRegression train Coupon AUC:   ", myauc(train_all))
print("LogisticRegression test Coupon AUC:   ", myauc(test_all))

# K-fold cross-validation (K-fold CV)

# 5-fold CV
train = train_f2.copy()
target = get_target_df(train_f2).copy()

from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
for k, (train_index, test_index) in enumerate(kf.split(train)):  # train_index and test_index are arrays of row positions
    train_data, test_data, train_target, test_target = train.iloc[
        train_index], train.iloc[test_index], target[train_index], target[
            test_index]
    clf = LogisticRegression()
    clf.fit(get_predictors_df(train_data), train_target)

    train_pred = clf.predict_proba(get_predictors_df(train_data))[:, 1]
    test_pred = clf.predict_proba(get_predictors_df(test_data))[:, 1]
    score_train = roc_auc_score(train_target, train_pred)
    score_test = roc_auc_score(test_target, test_pred)
    train_data['pred'] = train_pred
    test_data['pred'] = test_pred
    print(k + 1, " 折", "LogisticRegression train 总体AUC:   ", score_train)
    print(k + 1, " 折", "LogisticRegression test 总体AUC:   ", score_test)
    print(k + 1, " 折", "LogisticRegression train Coupon AUC:   ",
          myauc(train_data))
    print(k + 1, " 折", "LogisticRegression test Coupon AUC:   ",
          myauc(test_data), '\n')


# Leave-P-Out (LPO CV)
train = train_f2.copy()
target = get_target_df(train_f2).copy()

from sklearn.model_selection import LeavePOut
lpo = LeavePOut(p=200)
# LeavePOut enumerates every possible size-p hold-out set, which is combinatorially
# explosive, so the loop below is cut off after the first few splits.
for k, (train_index, test_index) in enumerate(lpo.split(train)):
    train_data, test_data, train_target, test_target = train.iloc[
        train_index], train.iloc[test_index], target[train_index], target[
            test_index]
    clf = LogisticRegression()
    clf.fit(get_predictors_df(train_data), train_target)

    train_pred = clf.predict_proba(get_predictors_df(train_data))[:, 1]
    test_pred = clf.predict_proba(get_predictors_df(test_data))[:, 1]
    score_train = roc_auc_score(train_target, train_pred)
    score_test = roc_auc_score(test_target, test_pred)
    train_data['pred'] = train_pred
    test_data['pred'] = test_pred
    print(k + 1, " 折", "LogisticRegression train 总体AUC:   ", score_train)
    print(k + 1, " 折", "LogisticRegression test 总体AUC:   ", score_test)
    print(k + 1, " 折", "LogisticRegression train Coupon AUC:   ",
          myauc(train_data))
    print(k + 1, " 折", "LogisticRegression test Coupon AUC:   ",
          myauc(test_data), '\n')
    if k >= 5:
        break

# StratifiedKFold
# Comparing the schemes, StratifiedKFold suits this competition best, because the positive
# and negative samples are imbalanced. StratifiedKFold performs stratified splitting:
# each fold keeps the same class proportions as the full dataset.
# StratifiedKFold 5-fold cross-validation
train = train_f2.copy()
target = get_target_df(train_f2).copy()

from sklearn.model_selection import StratifiedKFold

kf = StratifiedKFold(n_splits=5)
for k, (train_index, test_index) in enumerate(kf.split(train, target)):
    train_data, test_data, train_target, test_target = train.iloc[
        train_index], train.iloc[test_index], target[train_index], target[
            test_index]
    clf = LogisticRegression()
    clf.fit(get_predictors_df(train_data), train_target)

    train_pred = clf.predict_proba(get_predictors_df(train_data))[:, 1]
    test_pred = clf.predict_proba(get_predictors_df(test_data))[:, 1]
    score_train = roc_auc_score(train_target, train_pred)
    score_test = roc_auc_score(test_target, test_pred)
    train_data['pred'] = train_pred
    test_data['pred'] = test_pred
    print(k + 1, " 折", "LogisticRegression train 总体AUC:   ", score_train)
    print(k + 1, " 折", "LogisticRegression test 总体AUC:   ", score_test)
    print(k + 1, " 折", "LogisticRegression train Coupon AUC:   ",
          myauc(train_data))
    print(k + 1, " 折", "LogisticRegression test Coupon AUC:   ",
          myauc(test_data), '\n')
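
# A small check (not in the original code) that stratification preserves the class
# ratio: the positive rate of every validation fold should closely match the overall
# rate, which is exactly what plain KFold does not guarantee.
print("overall positive rate:", target.mean())
for k, (_, test_index) in enumerate(StratifiedKFold(n_splits=5).split(train, target)):
    print("fold", k + 1, "positive rate:", target.iloc[test_index].mean())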



#################### A unified code framework on top of sklearn ##########################
# Functions provided:
# classifier_single(featurename, classifier, cvnum)
# Predict separately by discount (full-reduction) type:
# classifier_single_sep_fd(featurename, classifier, cvnum)
#################### Classification algorithms wrapped behind one interface ###############
def get_sklearn_model(model_name, param=None):
    # Naive Bayes
    if model_name == 'NB':
        model = MultinomialNB(alpha=0.01)
    # Logistic regression
    elif model_name == 'LR':
        model = LogisticRegression(penalty='l2')
    # KNN
    elif model_name == 'KNN':
        model = KNeighborsClassifier()
    # Random forest
    elif model_name == 'RF':
        model = RandomForestClassifier()
    # Decision tree
    elif model_name == 'DT':
        model = tree.DecisionTreeClassifier()
    # Support vector machine
    elif model_name == 'SVC':
        model = SVC(kernel='rbf')
    #GBDT
    elif model_name == 'GBDT':
        model = GradientBoostingClassifier()
    #XGBoost
    elif model_name == 'XGB':
        model = XGBClassifier()
    # LightGBM
    elif model_name == 'LGB':
        model = LGBMClassifier()
    else:
        print("wrong model name!")
        return
    if param is not None:
        model.set_params(**param)
    return model
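
# Example usage (illustrative): the optional `param` dict is forwarded to
# set_params, so any valid keyword of the chosen estimator can be overridden, e.g.
#   get_sklearn_model('LGB', {'n_estimators': 100, 'learning_rate': 0.05})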


# Evaluate an algorithm with stratified K-fold cross-validation
def classifier_df_score(train_feat, classifier, cvnum, param=None):
    clf = get_sklearn_model(classifier, param)
    train = train_feat.copy()
    target = get_target_df(train_feat).copy()
    kf = StratifiedKFold(n_splits=cvnum)

    scores = []
    score_coupons = []
    for k, (train_index, test_index) in enumerate(kf.split(train, target)):
        train_data, test_data, train_target, test_target = train.iloc[
            train_index], train.iloc[test_index], target[train_index], target[
                test_index]
        clf.fit(get_predictors_df(train_data), train_target)
        train_pred = clf.predict_proba(get_predictors_df(train_data))[:, 1]
        test_pred = clf.predict_proba(get_predictors_df(test_data))[:, 1]

        score_test = roc_auc_score(test_target, test_pred)
        test_data['pred'] = test_pred
        score_coupon_test = myauc(test_data)

        scores.append(score_test)
        score_coupons.append(score_coupon_test)

    print(classifier + "总体AUC:", scores)
    print(classifier + "Coupon AUC:", score_coupons)



# f2 features
train = train_f2.copy()
print(train.head())

print('Feature set f2: 5-fold CV scores for different models:')
classifier_df_score(train, 'NB', 5)
classifier_df_score(train, 'LR', 5)
classifier_df_score(train, 'RF', 5)
classifier_df_score(train, 'LGB', 5)


"""通过对比训练集上不同算法的运算结果可以发现,F1特征集因为特征比较少,有严重的欠拟合,
所以所有算法的分数都比较低。 F2特征集通过滑窗增加统计特征,它的分数比f1有了飞跃性的提高,
其实在现实的业务场景F2+LR已经是一个很常用的解决方案了。之所以在实际作业中更倾向逻辑回归而不是类似LightGBM的算法,
是为了减少计算量。当然如果计算资源不是问题的话,LightGBM也是一个好选择
"""

# Plot learning curves for visual analysis
# Learning curves give a deeper view of how training behaves as the amount of data grows.


# Plot a learning curve
def plot_learning_curve(estimator,
                        title,
                        X,
                        y,
                        ylim=None,
                        cv=None,
                        n_jobs=1,
                        train_sizes=[0.01, 0.02, 0.05, 0.1, 0.2, 0.3]):
    plt.figure()
    plt.title(title)
    if ylim is not None:
        plt.ylim(*ylim)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    train_sizes, train_scores, test_scores = learning_curve(
        estimator,
        X,
        y,
        cv=cv,
        scoring=myeval,
        n_jobs=n_jobs,
        train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes,
                     test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.1,
                     color="g")
    plt.plot(train_sizes,
             train_scores_mean,
             'o-',
             color="r",
             label="Training score")
    plt.plot(train_sizes,
             test_scores_mean,
             'o-',
             color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    plt.show()
    return plt


# Plot an algorithm's learning curve; to speed up plotting, only a fraction of the data (at most half) is used
def plot_curve_single(traindf,
                      classifier,
                      cvnum,
                      train_sizes=[0.01, 0.02, 0.05, 0.1, 0.2, 0.3]):
    X = get_predictors_df(traindf)
    y = get_target_df(traindf)
    title = "learning curve of " + classifier + ", cv:" + str(cvnum)
    estimator = get_sklearn_model(classifier)  # 建模
    plot_learning_curve(estimator,
                        title,
                        X,
                        y,
                        ylim=(0, 1.01),
                        cv=cvnum,
                        train_sizes=train_sizes)


# f2 feature set
plot_curve_single(train_f2, 'DT', 5, [0.1, 0.2, 0.3, 0.5])

plot_curve_single(train_f2, 'NB', 5, [0.1, 0.2, 0.3, 0.5])

plot_curve_single(train_f2, 'LR', 5, [0.1, 0.2, 0.3, 0.5])

plot_curve_single(train_f2, 'RF', 5, [0.1, 0.2, 0.3, 0.5])

plot_curve_single(train_f2, 'LGB', 5, [0.1, 0.2, 0.3, 0.5])


# Model hyperparameter space and tuning

# f2
train = get_predictors_df(train_f2)
target = get_target_df(train_f2)
print(train.head())


# Random forest as an example of the basic tuning API

# Exhaustive grid search

from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split  # data splitting
# Split the data: 80% training, 20% validation
train_data, test_data, train_target, test_target = train_test_split(
    train, target, test_size=0.2, random_state=0)

model = RandomForestClassifier()
parameters = {'n_estimators': [20, 50, 100], 'max_depth': [1, 2, 3]}

clf = GridSearchCV(model, parameters, cv=3, scoring='roc_auc', verbose=2)  # select by AUC, matching the competition metric
clf.fit(train_data, train_target)

# AUC needs probabilities, not hard labels, so use predict_proba (see the note above)
score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])

print("RandomForestClassifier GridSearchCV test AUC:   ", score_test)
print("最优参数:")
print(clf.best_params_)
sorted(clf.cv_results_.keys())

# Randomized parameter search

from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split  # data splitting
# Split the data: 80% training, 20% validation
train_data, test_data, train_target, test_target = train_test_split(
    train, target, test_size=0.2, random_state=0)

model = RandomForestClassifier()
parameters = {'n_estimators': [10, 20, 30, 50], 'max_depth': [1, 2, 3]}

clf = RandomizedSearchCV(model, parameters, cv=3, scoring='roc_auc', verbose=2)  # select by AUC, matching the competition metric
clf.fit(train_data, train_target)

# Again, score with probabilities rather than hard labels
score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])

print("RandomForestClassifier RandomizedSearchCV test AUC:   ", score_test)
print("最优参数:")
print(clf.best_params_)
sorted(clf.cv_results_.keys())

"""F3特征集和LightGBM 网格调参
LightGBM 调参次序:
第一步:学习率和迭代次数
第二步:确定max_depth和num_leaves
第三步:确定min_data_in_leaf和max_bin in
第四步:确定feature_fraction、bagging_fraction、bagging_freq
第五步:确定lambda_l1和lambda_l2
第六步:确定 min_split_gain
第七步:降低学习率,增加迭代次数,验证模型
"""

# f2 features
traindf = train_f2.copy()

# Split by date; to speed things up, only half of the data is used for grid tuning here. A real run should use all of it.
train = traindf[traindf.date_received < 20160515]
test = traindf[traindf.date_received >= 20160515]

train_data = get_predictors_df(train).copy()
train_target = get_target_df(train).copy()
test_data = get_predictors_df(test).copy()
test_target = get_target_df(test).copy()

print(traindf.head())

# Step 1: learning rate and number of iterations
from sklearn.model_selection import GridSearchCV
# (The data was already split by date above.)
# To speed things up cv=3 is used here; 5 is more typical.
# Each training run is slow, so only a few, coarsely spaced parameter values are tried here;
# a real run should use more values with finer spacing.
# This is only a demo, so when the best value lands on the edge of the search range we simply
# take it; normally a best value on the edge means the range should be extended and re-searched.
model = LGBMClassifier(boosting_type='gbdt',
                       objective='binary',
                       metrics='auc',
                       learning_rate=0.1,
                       max_depth=5,
                       bagging_fraction=0.8,
                       feature_fraction=0.8)
parameters = {'n_estimators': [100, 150, 175, 200, 225, 250]}

clf = GridSearchCV(model, parameters, cv=3, verbose=2)
clf.fit(train_data, train_target)
score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])

print("LightGBM GridSearchCV AUC Score:   ", score_test)
print("最优参数:")
print(clf.best_params_)

# Step 2: max_depth and num_leaves
# n_estimators=200
model = LGBMClassifier(boosting_type='gbdt',
                       objective='binary',
                       metrics='auc',
                       n_estimators=200,
                       learning_rate=0.1,
                       bagging_fraction=0.8,
                       feature_fraction=0.8)
parameters = {'max_depth': range(4, 8, 1), 'num_leaves': range(10, 150, 10)}
# parameters={'max_depth': range(4,8,2), 'num_leaves':range(10, 100, 20)}
clf = GridSearchCV(model, parameters, cv=3, verbose=2)
clf.fit(train_data, train_target)

score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])

print("LightGBM GridSearchCV AUC Score:   ", score_test)
print("最优参数:")
print(clf.best_params_)

# Based on the results, take max_depth=6 and num_leaves=40; the later steps proceed the same way.

# Step 3: min_data_in_leaf and max_bin
# Settled so far:
# n_estimators=200
# {'max_depth': 6, 'num_leaves': 40}
model = LGBMClassifier(boosting_type='gbdt',
                       objective='binary',
                       metrics='auc',
                       n_estimators=200,
                       max_depth=6,
                       num_leaves=40,
                       learning_rate=0.1,
                       bagging_fraction=0.8,
                       feature_fraction=0.8)

#parameters={'max_bin': range(100,500,50),'min_data_in_leaf':range(100,150,10)}
parameters = {
    'max_bin': range(100, 500, 100),
    'min_data_in_leaf': range(100, 150, 50)
}
# Raise verbose to see more information
clf = GridSearchCV(model, parameters, cv=3, verbose=3)
clf.fit(train_data, train_target)

score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])

print("LightGBM GridSearchCV AUC Score:   ", score_test)
print("最优参数:")
print(clf.best_params_)

# LightGBM GridSearchCV AUC Score:    0.8158486756976501
# Best parameters:
# {'max_bin': 400, 'min_data_in_leaf': 100}

# Step 4: feature_fraction, bagging_fraction, bagging_freq
# Settled so far:
# n_estimators=200
# {'max_depth': 6, 'num_leaves': 40}
# {'max_bin': 400, 'min_data_in_leaf': 120}
model = LGBMClassifier(boosting_type='gbdt',
                       objective='binary',
                       metrics='auc',
                       n_estimators=200,
                       max_depth=6,
                       num_leaves=40,
                       max_bin=400,
                       min_data_in_leaf=120,
                       learning_rate=0.1,
                       bagging_fraction=0.8,
                       feature_fraction=0.8)

parameters = {
    'feature_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
    'bagging_fraction': [0.6, 0.7, 0.8, 0.9, 1.0],
    'bagging_freq': range(0, 10, 2)
}

# Raise verbose to see more information
clf = GridSearchCV(model, parameters, cv=3, verbose=3)
clf.fit(train_data, train_target)

score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])

print("LightGBM GridSearchCV AUC Score:   ", score_test)
print("最优参数:")
print(clf.best_params_)

# Step 5: lambda_l1 and lambda_l2
# Settled so far:
# n_estimators=200
# {'max_depth': 6, 'num_leaves': 40}
# {'max_bin': 400, 'min_data_in_leaf': 120}
# {'bagging_fraction': 0.9, 'bagging_freq': 4, 'feature_fraction': 0.6}
model = LGBMClassifier(boosting_type='gbdt',
                       objective='binary',
                       metrics='auc',
                       n_estimators=200,
                       max_depth=6,
                       num_leaves=40,
                       max_bin=400,
                       min_data_in_leaf=120,
                       learning_rate=0.1,
                       bagging_freq=4,
                       bagging_fraction=0.9,
                       feature_fraction=0.6)

parameters = {
    'lambda_l1': [1e-5, 1e-3, 1e-1, 0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0],
    'lambda_l2': [1e-5, 1e-3, 1e-1, 0.0, 0.1, 0.3, 0.5, 0.7, 0.9, 1.0]
}

# Raise verbose to see more information
clf = GridSearchCV(model, parameters, cv=3, verbose=3)
clf.fit(train_data, train_target)

score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])

print("LightGBM GridSearchCV AUC Score:   ", score_test)
print("最优参数:")
print(clf.best_params_)

# Step 6: min_split_gain
# Settled so far:
# n_estimators=200
# {'max_depth': 6, 'num_leaves': 40}
# {'max_bin': 400, 'min_data_in_leaf': 120}
# {'bagging_fraction': 0.9, 'bagging_freq': 4, 'feature_fraction': 0.6}
# {'lambda_l1': 1e-05, 'lambda_l2': 1e-05}
model = LGBMClassifier(boosting_type='gbdt',
                       objective='binary',
                       metrics='auc',
                       n_estimators=200,
                       max_depth=6,
                       num_leaves=40,
                       max_bin=400,
                       min_data_in_leaf=120,
                       learning_rate=0.1,
                       lambda_l1=1e-05,
                       lambda_l2=1e-05,
                       bagging_freq=4,
                       bagging_fraction=0.9,
                       feature_fraction=0.6)

parameters = {
    'min_split_gain': [0.0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
}

# Raise verbose to see more information
clf = GridSearchCV(model, parameters, cv=3, verbose=3)
clf.fit(train_data, train_target)

score_test = roc_auc_score(test_target, clf.predict_proba(test_data)[:, 1])

print("LightGBM GridSearchCV AUC Score:   ", score_test)
print("最优参数:")
print(clf.best_params_)

# Step 7: lower the learning rate, increase the iterations, and validate the model
# Settled so far:
# n_estimators=200
# {'max_depth': 6, 'num_leaves': 40}
# {'max_bin': 400, 'min_data_in_leaf': 120}
# {'bagging_fraction': 0.9, 'bagging_freq': 4, 'feature_fraction': 0.6}
# {'lambda_l1': 1e-05, 'lambda_l2': 1e-05}
# {'min_split_gain': 0.0}
model = LGBMClassifier(boosting_type='gbdt',
                       objective='binary',
                       metrics='auc',
                       n_estimators=200,
                       max_depth=6,
                       num_leaves=40,
                       max_bin=400,
                       min_data_in_leaf=120,
                       learning_rate=0.1,
                       lambda_l1=1e-05,
                       lambda_l2=1e-05,
                       min_split_gain=0.0,
                       bagging_freq=4,
                       bagging_fraction=0.9,
                       feature_fraction=0.6)

model.fit(train_data, train_target)
score_test = roc_auc_score(test_target, model.predict_proba(test_data)[:, 1])

print("LightGBM learning rate 0.1 AUC Score:   ", score_test)

model = LGBMClassifier(boosting_type='gbdt',
                       objective='binary',
                       metrics='auc',
                       n_estimators=200,
                       max_depth=6,
                       num_leaves=40,
                       max_bin=400,
                       min_data_in_leaf=120,
                       learning_rate=0.05,
                       lambda_l1=1e-05,
                       lambda_l2=1e-05,
                       min_split_gain=0.0,
                       bagging_freq=4,
                       bagging_fraction=0.9,
                       feature_fraction=0.6)

model.fit(train_data, train_target)
score_test = roc_auc_score(test_target, model.predict_proba(test_data)[:, 1])

print("LightGBM learning rate 0.05 AUC Score:   ", score_test)

model = LGBMClassifier(boosting_type='gbdt',
                       objective='binary',
                       metrics='auc',
                       n_estimators=200,
                       max_depth=6,
                       num_leaves=40,
                       max_bin=400,
                       min_data_in_leaf=120,
                       learning_rate=0.01,
                       lambda_l1=1e-05,
                       lambda_l2=1e-05,
                       min_split_gain=0.0,
                       bagging_freq=4,
                       bagging_fraction=0.9,
                       feature_fraction=0.6)

model.fit(train_data, train_target)
score_test = roc_auc_score(test_target, model.predict_proba(test_data)[:, 1])

print("LightGBM learning rate 0.01 AUC Score:   ", score_test)

model = LGBMClassifier(boosting_type='gbdt',
                       objective='binary',
                       metrics='auc',
                       n_estimators=200,
                       max_depth=6,
                       num_leaves=40,
                       max_bin=400,
                       min_data_in_leaf=120,
                       learning_rate=0.005,
                       lambda_l1=1e-05,
                       lambda_l2=1e-05,
                       min_split_gain=0.0,
                       bagging_freq=4,
                       bagging_fraction=0.9,
                       feature_fraction=0.6)

model.fit(train_data, train_target)
score_test = roc_auc_score(test_target, model.predict_proba(test_data)[:, 1])

print("LightGBM learning rate 0.005 AUC Score:   ", score_test)

model = LGBMClassifier()

model.fit(train_data, train_target)
score_test = roc_auc_score(test_target, model.predict_proba(test_data)[:, 1])

print("默认参数 AUC Score:   ", score_test)

"""最优参数 model = LGBMClassifier(boosting_type='gbdt',objective='binary',metrics='auc',n_estimators=200,max_depth=6,num_leaves=40,
max_bin=400,min_data_in_leaf=120,
learning_rate=0.05,
lambda_l1=1e-05,lambda_l2=1e-05,min_split_gain=0.0,
bagging_freq=4, bagging_fraction = 0.9,feature_fraction = 0.6)
"""

# f2 features
train = train_f2.copy()
print(train.head())
print('Default parameters')
classifier_df_score(train, 'LGB', 5)
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'eval_metric': 'auc',
    'n_estimators': 200,
    'max_depth': 5,
    'num_leaves': 40,
    'max_bin': 400,
    'min_data_in_leaf': 120,
    'learning_rate': 0.1,
    'lambda_l1': 1e-05,
    'lambda_l2': 1e-05,
    'min_split_gain': 0.0,
    'bagging_freq': 4,
    'bagging_fraction': 0.9,
    'feature_fraction': 0.6,
    'seed': 1024,
    'n_thread': 12
}
print('After tuning')
classifier_df_score(train, 'LGB', 5, params)

"""默认参数
LGB总体AUC: [0.9017063876540394, 0.9028474497847858, 0.8860489799298887, 0.8810605942191141, 0.8825448947499648]
LGBCoupon AUC: [0.7440798692234487, 0.741981950122888, 0.7474652592149504, 0.7642682296497904, 0.7628559092978145]
调参后
LGB总体AUC: [0.9025855554052864, 0.9043496920648407, 0.8861875013967577, 0.8820594188479879, 0.883635213451176]
LGBCoupon AUC: [0.7441512385391114, 0.7432384282827527, 0.7463287384907189, 0.7663217891405024, 0.7620611110529001]
"""

# The tuned model beats the defaults, but only slightly (the mean coupon AUC rises from about
# 0.7521 to 0.7524); the gain is far smaller than what better features bring. Also, because
# tuning can only be done against the local validation data, it sometimes overfits, and the
# online score may even drop after tuning. So tuning is usually left to the late stage of a
# competition; early on, focus on features and model selection.

# Plot validation curves
# Validation curves make the tuning process easy to inspect visually.


# Plot a validation curve for one hyperparameter
def grid_plot(train_feat,
              classifier,
              cvnum,
              param_range,
              param_name,
              param=None):
    from sklearn.model_selection import validation_curve
    train_scores, test_scores = validation_curve(get_sklearn_model(
        classifier, param),
                                                 get_predictors_df(train_feat),
                                                 get_target_df(train_feat),
                                                 param_name=param_name,
                                                 param_range=param_range,
                                                 cv=cvnum,
                                                 scoring='roc_auc',
                                                 n_jobs=1)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)

    plt.title("Validation Curve with " + param_name)
    plt.xlabel(param_name)
    plt.ylabel("Score")
    plt.ylim(0.0, 1.1)
    plt.semilogx(param_range,
                 train_scores_mean,
                 label="Training score",
                 color="r")
    plt.fill_between(param_range,
                     train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std,
                     alpha=0.2,
                     color="r")
    plt.semilogx(param_range,
                 test_scores_mean,
                 label="Cross-validation score",
                 color="g")
    plt.fill_between(param_range,
                     test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std,
                     alpha=0.2,
                     color="g")
    plt.legend(loc="best")
    plt.show()


# Examine how LogisticRegression behaves as max_iter varies
train_feat = train_f2.copy()
# grid_plot(train_feat,classifier,3,[10,20,40,80,200,400,800],'n_estimators',param=params)
grid_plot(train_feat,
          'LR',
          3, [1, 2, 5, 10, 20, 40, 50],
          'max_iter',
          param=None)


params = {
    'learning_rate': 0.1,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'sub_feature': 0.6,
    'num_leaves': 50,
    'bagging_fraction': 0.8,
    'feature_fraction': 0.8
}
train_feat = train_f2.copy()
# grid_plot(train_feat,classifier,3,[10,20,40,80,200,400,800],'n_estimators',param=params)
grid_plot(train_feat, 'LGB', 3, [10, 20, 40], 'n_estimators', param=params)

params = {
    'learning_rate': 0.1,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'auc',
    'num_leaves': 10
}
train_feat = train_f2.copy()
# grid_plot(train_feat,classifier,3,[10,20,40,80,200,400,800],'n_estimators',param=params)
grid_plot(train_feat,
          'LGB',
          3, [0.1, 0.2, 0.5, 0.7, 0.8],
          'colsample_bytree',
          param=params)
