阿里云天池O2O优惠券预测之模型交叉验证部分Python代码

 代码比较了普通交叉验证,k折交叉验证,留p法验证,以及分层k折交叉验证法。由于二分类正负样本不均衡,最后决定采用分层k折交叉验证法来进行下一步的算法模型比较

# Adjust pandas' default display settings: show at most 10 columns and 20 rows.
import pandas as pd
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 20)

"""按照: 评价指标 验证方式 学习曲线 结果分析,模型选择 模型调参 的步骤对模型进行选择,调优"""

from sklearn import metrics
import numpy as np
import pandas as pd
import datetime
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
#########部分SKLearn 集成的算法###############
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import MultinomialNB
#########SKLearn 集成的算法###############
import warnings
warnings.filterwarnings("ignore")

############ Global parameters #################################
# Columns that identify a sample.
id_col_names = ['user_id', 'coupon_id', 'date_received']
# Name of the column holding the prediction target.
target_col_name = 'label'
# Identifier columns plus the target; columns outside this list are treated as features.
id_target_cols = ['user_id', 'coupon_id', 'date_received', 'label']
# NOTE(review): looks like a scoring name; it is not referenced in the visible part of the file.
myeval = 'roc_auc'
# cvscore=0
############ Directory definitions #################################
# NOTE(review): none of these directories is referenced in the visible part of the file.
datapath = '../data/'
featurepath = '../feature/'
resultpath = '../result/'
tmppath = '../tmp/'
scorepath = '../score/'

###########工具函数#############################################

def get_id_df(df):
    """Return the identifier columns (``id_col_names``) of ``df``."""
    id_columns = df[id_col_names]
    return id_columns


def get_target_df(df):
    """Return the target column (``target_col_name``) of ``df``."""
    target_column = df[target_col_name]
    return target_column


def get_predictors_df(df):
    """Return the feature columns of ``df``.

    A column counts as a feature when its name is not listed in
    ``id_target_cols``; the original column order is kept.
    """
    feature_names = [
        column for column in df.columns
        if column not in id_target_cols
    ]
    return df[feature_names]


def read_featurefile_train(
        path='C:\\Users\\Administrator\\Desktop\\数据挖掘项目\\O2O_data\\train_sf2.csv'):
    """Load the training feature table from a CSV file.

    Args:
        path: Location of the comma-separated, UTF-8 encoded feature file.
            Defaults to the location that used to be hard-coded here, so
            existing zero-argument calls keep reading the same file.

    Returns:
        A DataFrame with every missing value replaced by 0.
    """
    df = pd.read_csv(path, sep=',', encoding="utf-8")
    # Return the filled frame instead of mutating in place.
    return df.fillna(0)


def read_featurefile_test(
        path='C:\\Users\\Administrator\\Desktop\\数据挖掘项目\\O2O_data\\test_sf2.csv'):
    """Load the test feature table from a CSV file.

    Args:
        path: Location of the comma-separated, UTF-8 encoded feature file.
            Defaults to the location that used to be hard-coded here, so
            existing zero-argument calls keep reading the same file.

    Returns:
        A DataFrame with every missing value replaced by 0.
    """
    df = pd.read_csv(path, sep=',', encoding="utf-8")
    # Return the filled frame instead of mutating in place.
    return df.fillna(0)


def standize_df(train_data, test_data):
    """Min-max scale the feature columns of the train and test frames.

    The feature columns are the columns of ``test_data`` whose names are not
    in ``id_target_cols``. A ``MinMaxScaler`` is fitted on the training
    features only and then applied to both frames.

    Args:
        train_data: Training frame; must contain the feature columns, the
            ``label`` column and the columns in ``id_col_names``.
        test_data: Test frame; must contain the feature columns and the
            columns in ``id_col_names``.

    Returns:
        A ``(train, test)`` tuple of new DataFrames holding the scaled
        features followed by the unscaled ``label`` (train only) and
        identifier columns. Both frames keep the row index of their input.
    """
    from sklearn import preprocessing

    features_columns = [
        f for f in test_data.columns if f not in id_target_cols
    ]
    min_max_scaler = preprocessing.MinMaxScaler()
    min_max_scaler = min_max_scaler.fit(train_data[features_columns])

    # transform() returns plain arrays. Rebuild the frames on the ORIGINAL
    # index so that the column assignments below, which align on the index,
    # attach each label / id to the right row even when the input index is
    # not 0..n-1.
    train_data_scaler = pd.DataFrame(
        min_max_scaler.transform(train_data[features_columns]),
        columns=features_columns,
        index=train_data.index)
    test_data_scaler = pd.DataFrame(
        min_max_scaler.transform(test_data[features_columns]),
        columns=features_columns,
        index=test_data.index)

    train_data_scaler['label'] = train_data['label']
    train_data_scaler[id_col_names] = train_data[id_col_names]
    test_data_scaler[id_col_names] = test_data[id_col_names]
    return train_data_scaler, test_data_scaler


def read_data():
    """Load the train and test feature tables and return them scaled.

    Both tables are read with their default file locations and passed
    through ``standize_df``; its ``(train, test)`` result is returned as is.
    """
    train_raw = read_featurefile_train()
    test_raw = read_featurefile_test()
    # The unscaled frames could be returned here instead of the scaled pair.
    scaled_pair = standize_df(train_raw, test_raw)
    return scaled_pair


# Load the data

# All features were generated in the previous section.
train_f2, test_f2 = read_data()


# 评价指标及预测方式等

# 性能评价函数
# 本赛题目标是预测投放的优惠券是否核销。
# 针对此任务及一些相关背景知识,使用优惠券核销预测的平均AUC(ROC曲线下面积)作为评价标准。
# 即对每个优惠券coupon_id单独计算核销预测的AUC值,再对所有优惠券的AUC值求平均作为最终的评价标准。
# coupon平均auc计算
from sklearn.metrics import roc_auc_score


def myauc(test):
    """Average the per-coupon ROC AUC over the coupons in ``test``.

    ``test`` is grouped by ``coupon_id``. For each group the AUC of the
    ``pred`` column against the ``label`` column is computed. Groups whose
    ``label`` column holds fewer than two distinct values are skipped,
    because AUC needs more than one class.

    Returns:
        ``np.average`` of the per-coupon AUC values.
    """
    per_coupon_scores = [
        metrics.roc_auc_score(group['label'], group['pred'])
        for _, group in test.groupby(['coupon_id'])
        if len(group['label'].unique()) >= 2
    ]
    return np.average(per_coupon_scores)


"""
虽然赛题是按照coupon的AUC来计算。不过因为整体AUC(也就是用roc_auc_score 求出的结果)与Coupon AUC同增同减,
所以在进行评估的时候可以直接使用整体AUC。
预测方式:因为需要的结果是购买的概率,所以不能直接用predict(它只会返回0或1的类别标签),
而要用predict_proba,它会返回每个类别的概率,取其中类别为1的那一列即可
"""

# Validation strategies
# Simple hold-out validation

from sklearn.model_selection import train_test_split  # splits the data

target = get_target_df(train_f2).copy()
traindf = train_f2.copy()

# Split the data: 80% for training, 20% for validation
train_all, test_all, train_target, test_target = train_test_split(
    traindf, target, test_size=0.2, random_state=0)
# Take explicit copies: a 'pred' column is added to both frames further down,
# and writing into frames derived from another frame is the classic
# SettingWithCopy situation.
train_all = train_all.copy()
test_all = test_all.copy()

train_data = get_predictors_df(train_all).copy()
test_data = get_predictors_df(test_all).copy()

clf = LogisticRegression()
clf.fit(train_data, train_target)
# predict_proba returns one column per class; column 1 is kept as the score.
train_pred = clf.predict_proba(train_data)[:, 1]
test_pred = clf.predict_proba(test_data)[:, 1]

# Overall AUC across all rows
score_train = roc_auc_score(train_target, train_pred)
score_test = roc_auc_score(test_target, test_pred)
print("LogisticRegression train 总体AUC:   ", score_train)
print("LogisticRegression test 总体AUC:   ", score_test)

# Per-coupon average AUC; myauc reads the 'label' and 'pred' columns
train_all['pred'] = train_pred
test_all['pred'] = test_pred
print("LogisticRegression train Coupon AUC:   ", myauc(train_all))
print("LogisticRegression test Coupon AUC:   ", myauc(test_all))

# K-fold cross-validation (K-fold CV)

# 5-fold cross-validation
train = train_f2.copy()
target = get_target_df(train_f2).copy()

from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
# split() yields, per fold, the positional row indices of the training part
# and of the test part.
for k, (train_index, test_index) in enumerate(kf.split(train)):
    # The indices are positions, so select with .iloc on the frame AND on the
    # target; plain [] on a Series looks the values up as index labels.
    # The frames are copied because a 'pred' column is added to them below.
    train_data = train.iloc[train_index].copy()
    test_data = train.iloc[test_index].copy()
    train_target = target.iloc[train_index]
    test_target = target.iloc[test_index]

    clf = LogisticRegression()
    clf.fit(get_predictors_df(train_data), train_target)

    # Column 1 of predict_proba is kept as the score.
    train_pred = clf.predict_proba(get_predictors_df(train_data))[:, 1]
    test_pred = clf.predict_proba(get_predictors_df(test_data))[:, 1]
    score_train = roc_auc_score(train_target, train_pred)
    score_test = roc_auc_score(test_target, test_pred)
    # myauc reads the 'label' and 'pred' columns of the frame it is given.
    train_data['pred'] = train_pred
    test_data['pred'] = test_pred
    print(k + 1, " 折", "LogisticRegression train 总体AUC:   ", score_train)
    print(k + 1, " 折", "LogisticRegression test 总体AUC:   ", score_test)
    print(k + 1, " 折", "LogisticRegression train Coupon AUC:   ",
          myauc(train_data))
    print(k + 1, " 折", "LogisticRegression test Coupon AUC:   ",
          myauc(test_data), '\n')


# Leave-P-Out cross-validation (LPO CV)
train = train_f2.copy()
target = get_target_df(train_f2).copy()

from sklearn.model_selection import LeavePOut
lpo = LeavePOut(p=200)
# NOTE(review): `num` is not read anywhere in the visible code.
num = 100
# Only the first six splits are evaluated; the loop is cut off at k == 5.
# NOTE(review): consecutive LeavePOut splits presumably differ by very few
# rows, so these six test sets overlap heavily - confirm this is intended.
for k, (train_index, test_index) in enumerate(lpo.split(train)):
    # The indices are positions, so select with .iloc on the frame AND on the
    # target; plain [] on a Series looks the values up as index labels.
    # The frames are copied because a 'pred' column is added to them below.
    train_data = train.iloc[train_index].copy()
    test_data = train.iloc[test_index].copy()
    train_target = target.iloc[train_index]
    test_target = target.iloc[test_index]

    clf = LogisticRegression()
    clf.fit(get_predictors_df(train_data), train_target)

    # Column 1 of predict_proba is kept as the score.
    train_pred = clf.predict_proba(get_predictors_df(train_data))[:, 1]
    test_pred = clf.predict_proba(get_predictors_df(test_data))[:, 1]
    score_train = roc_auc_score(train_target, train_pred)
    score_test = roc_auc_score(test_target, test_pred)
    # myauc reads the 'label' and 'pred' columns of the frame it is given.
    train_data['pred'] = train_pred
    test_data['pred'] = test_pred
    print(k + 1, " 折", "LogisticRegression train 总体AUC:   ", score_train)
    print(k + 1, " 折", "LogisticRegression test 总体AUC:   ", score_test)
    print(k + 1, " 折", "LogisticRegression train Coupon AUC:   ",
          myauc(train_data))
    print(k + 1, " 折", "LogisticRegression test Coupon AUC:   ",
          myauc(test_data), '\n')
    if k >= 5:
        break

# StratifiedKFold
# After comparing the strategies, StratifiedKFold was judged the best fit for
# this task because the positive and negative samples are unevenly distributed.
# StratifiedKFold splits with stratified sampling, so every training and test
# part keeps the class proportions of the original data set.
# StratifiedKFold, 5-fold cross-validation
train = train_f2.copy()
target = get_target_df(train_f2).copy()

from sklearn.model_selection import StratifiedKFold

kf = StratifiedKFold(n_splits=5)
# The target is passed to split() so the folds can be stratified on it.
for k, (train_index, test_index) in enumerate(kf.split(train, target)):
    # The indices are positions, so select with .iloc on the frame AND on the
    # target; plain [] on a Series looks the values up as index labels.
    # The frames are copied because a 'pred' column is added to them below.
    train_data = train.iloc[train_index].copy()
    test_data = train.iloc[test_index].copy()
    train_target = target.iloc[train_index]
    test_target = target.iloc[test_index]

    clf = LogisticRegression()
    clf.fit(get_predictors_df(train_data), train_target)

    # Column 1 of predict_proba is kept as the score.
    train_pred = clf.predict_proba(get_predictors_df(train_data))[:, 1]
    test_pred = clf.predict_proba(get_predictors_df(test_data))[:, 1]
    score_train = roc_auc_score(train_target, train_pred)
    score_test = roc_auc_score(test_target, test_pred)
    # myauc reads the 'label' and 'pred' columns of the frame it is given.
    train_data['pred'] = train_pred
    test_data['pred'] = test_pred
    print(k + 1, " 折", "LogisticRegression train 总体AUC:   ", score_train)
    print(k + 1, " 折", "LogisticRegression test 总体AUC:   ", score_test)
    print(k + 1, " 折", "LogisticRegression train Coupon AUC:   ",
          myauc(train_data))
    print(k + 1, " 折", "LogisticRegression test Coupon AUC:   ",
          myauc(test_data), '\n')

 运行结果:

LogisticRegression train 总体AUC:    0.8033601207805912
LogisticRegression test 总体AUC:    0.8040890454196647
LogisticRegression train Coupon AUC:    0.6054489446463452
LogisticRegression test Coupon AUC:    0.6243258741237175
1  折 LogisticRegression train 总体AUC:    0.7940413176992859
1  折 LogisticRegression test 总体AUC:    0.8389913232130671
1  折 LogisticRegression train Coupon AUC:    0.6036368871858807
1  折 LogisticRegression test Coupon AUC:    0.6314837405607805 

2  折 LogisticRegression train 总体AUC:    0.7944813183500465
2  折 LogisticRegression test 总体AUC:    0.8375490223211755
2  折 LogisticRegression train Coupon AUC:    0.6086089959636457
2  折 LogisticRegression test Coupon AUC:    0.6236689511644279 

3  折 LogisticRegression train 总体AUC:    0.8091990146439405
3  折 LogisticRegression test 总体AUC:    0.7777939473876077
3  折 LogisticRegression train Coupon AUC:    0.6108069451603675
3  折 LogisticRegression test Coupon AUC:    0.6024292968721827 

4  折 LogisticRegression train 总体AUC:    0.810610046684465
4  折 LogisticRegression test 总体AUC:    0.7760278960446174
4  折 LogisticRegression train Coupon AUC:    0.6073179717574466
4  折 LogisticRegression test Coupon AUC:    0.6098148295374517 

5  折 LogisticRegression train 总体AUC:    0.8106999656751949
5  折 LogisticRegression test 总体AUC:    0.77425015657914
5  折 LogisticRegression train Coupon AUC:    0.6098650487880474
5  折 LogisticRegression test Coupon AUC:    0.6010086298836634 

1  折 LogisticRegression train 总体AUC:    0.8035760861033734
1  折 LogisticRegression test 总体AUC:    0.8100620567375886
1  折 LogisticRegression train Coupon AUC:    0.6076882934738217
1  折 LogisticRegression test Coupon AUC:    0.35526315789473684 

2  折 LogisticRegression train 总体AUC:    0.8036019920049553
2  折 LogisticRegression test 总体AUC:    0.8105053191489362
2  折 LogisticRegression train Coupon AUC:    0.6077159798772628
2  折 LogisticRegression test Coupon AUC:    0.375 

3  折 LogisticRegression train 总体AUC:    0.8036263208518077
3  折 LogisticRegression test 总体AUC:    0.8091755319148936
3  折 LogisticRegression train Coupon AUC:    0.6076056423299218
3  折 LogisticRegression test Coupon AUC:    0.35526315789473684 

4  折 LogisticRegression train 总体AUC:    0.8036142677005703
4  折 LogisticRegression test 总体AUC:    0.8096187943262411
4  折 LogisticRegression train Coupon AUC:    0.6076862125036082
4  折 LogisticRegression test Coupon AUC:    0.35526315789473684 

5  折 LogisticRegression train 总体AUC:    0.8034619718104917
5  折 LogisticRegression test 总体AUC:    0.8038563829787234
5  折 LogisticRegression train Coupon AUC:    0.6076497424745886
5  折 LogisticRegression test Coupon AUC:    0.35526315789473684 

6  折 LogisticRegression train 总体AUC:    0.8036902043854324
6  折 LogisticRegression test 总体AUC:    0.8065159574468085
6  折 LogisticRegression train Coupon AUC:    0.6077067661520358
6  折 LogisticRegression test Coupon AUC:    0.35526315789473684 

1  折 LogisticRegression train 总体AUC:    0.7939091085373198
1  折 LogisticRegression test 总体AUC:    0.8380467153228752
1  折 LogisticRegression train Coupon AUC:    0.6049825424583766
1  折 LogisticRegression test Coupon AUC:    0.6297486562799388 

2  折 LogisticRegression train 总体AUC:    0.7959451982525808
2  折 LogisticRegression test 总体AUC:    0.8331144754932879
2  折 LogisticRegression train Coupon AUC:    0.6079759849800991
2  折 LogisticRegression test Coupon AUC:    0.6221991493568617 

3  折 LogisticRegression train 总体AUC:    0.8083839584946638
3  折 LogisticRegression test 总体AUC:    0.7836523160639042
3  折 LogisticRegression train Coupon AUC:    0.609931385224286
3  折 LogisticRegression test Coupon AUC:    0.6094752655327303 

4  折 LogisticRegression train 总体AUC:    0.8106970221066883
4  折 LogisticRegression test 总体AUC:    0.7729860256306605
4  折 LogisticRegression train Coupon AUC:    0.6084553566988748
4  折 LogisticRegression test Coupon AUC:    0.6104010310924847 

5  折 LogisticRegression train 总体AUC:    0.810268034538234
5  折 LogisticRegression test 总体AUC:    0.7756876277239039
5  折 LogisticRegression train Coupon AUC:    0.6078281476888198
5  折 LogisticRegression test Coupon AUC:    0.6069312652012027 


进程已结束,退出代码0

你可能感兴趣的:(天池,python,阿里云,机器学习)