代码比较了简单交叉验证、K折交叉验证、留P法交叉验证以及分层K折交叉验证。由于二分类的正负样本不均衡,最后决定采用分层K折交叉验证来进行下一步的算法模型比较。
# Adjust pandas' default display settings so printed DataFrames stay compact
# (at most 10 columns and 20 rows are shown).
import pandas as pd
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 20)
"""按照: 评价指标 验证方式 学习曲线 结果分析,模型选择 模型调参 的步骤对模型进行选择,调优"""
from sklearn import metrics
import numpy as np
import pandas as pd
import datetime
from sklearn.model_selection import StratifiedKFold
import lightgbm as lgb
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import GridSearchCV
#########部分SKLearn 集成的算法###############
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.naive_bayes import MultinomialNB
#########SKLearn 集成的算法###############
import warnings
warnings.filterwarnings("ignore")
############ Global parameters ############
# Columns that identify a sample, and the column holding the label.
id_col_names = ['user_id', 'coupon_id', 'date_received']
target_col_name = 'label'
# Everything that is NOT a feature: the identifier columns plus the label.
id_target_cols = [*id_col_names, target_col_name]
# Name of the scoring metric.
myeval = 'roc_auc'
############ Directory layout ############
datapath = '../data/'
featurepath = '../feature/'
resultpath = '../result/'
tmppath = '../tmp/'
scorepath = '../score/'
###########工具函数#############################################
# Select the identifier columns.
def get_id_df(df):
    """Return the columns of ``df`` named in the module-level ``id_col_names``."""
    id_frame = df[id_col_names]
    return id_frame
# Select the target column.
def get_target_df(df):
    """Return the column of ``df`` named by the module-level ``target_col_name``."""
    target_column = df[target_col_name]
    return target_column
# Select the feature columns.
def get_predictors_df(df):
    """Return every column of ``df`` that is not listed in ``id_target_cols``.

    The original column order of ``df`` is kept.
    """
    excluded = set(id_target_cols)
    feature_names = [column for column in df.columns if column not in excluded]
    return df[feature_names]
# Load the training feature table.
def read_featurefile_train(
        path='C:\\Users\\Administrator\\Desktop\\数据挖掘项目\\O2O_data\\train_sf2.csv'):
    """Read the training feature CSV and replace missing values with 0.

    Args:
        path: Location of the comma-separated, UTF-8 encoded feature file.
            Defaults to the absolute path this script was written against,
            so existing zero-argument calls behave as before; pass another
            path to run on a different machine or file.

    Returns:
        A pandas DataFrame with every missing value replaced by 0.
    """
    df = pd.read_csv(path, sep=',', encoding="utf-8")
    # fillna returns a new frame; no in-place mutation is needed here.
    return df.fillna(0)
# Load the test feature table.
def read_featurefile_test(
        path='C:\\Users\\Administrator\\Desktop\\数据挖掘项目\\O2O_data\\test_sf2.csv'):
    """Read the test feature CSV and replace missing values with 0.

    Args:
        path: Location of the comma-separated, UTF-8 encoded feature file.
            Defaults to the absolute path this script was written against,
            so existing zero-argument calls behave as before; pass another
            path to run on a different machine or file.

    Returns:
        A pandas DataFrame with every missing value replaced by 0.
    """
    df = pd.read_csv(path, sep=',', encoding="utf-8")
    # fillna returns a new frame; no in-place mutation is needed here.
    return df.fillna(0)
# Min-max scale the feature columns.
def standize_df(train_data, test_data):
    """Scale the feature columns of both frames with a scaler fitted on train.

    The feature columns are the columns of ``test_data`` that are not listed
    in ``id_target_cols``. A ``MinMaxScaler`` is fitted on those columns of
    ``train_data`` only and then applied to both frames, so the test frame is
    scaled with the training statistics.

    Args:
        train_data: DataFrame holding the feature columns, the ``'label'``
            column and the columns in ``id_col_names``.
        test_data: DataFrame holding the feature columns and the columns in
            ``id_col_names``.

    Returns:
        ``(train_scaled, test_scaled)``: the scaled feature columns followed
        by ``'label'`` (train only) and the identifier columns. Each frame
        keeps the row index of its input.
    """
    from sklearn import preprocessing

    features_columns = [
        f for f in test_data.columns if f not in id_target_cols
    ]
    min_max_scaler = preprocessing.MinMaxScaler()
    min_max_scaler.fit(train_data[features_columns])

    # Rebuild the frames on the ORIGINAL row index. Column assignment below
    # aligns on the index, so a fresh 0..n-1 index would turn the label and
    # identifier columns into NaN whenever the input index is not 0..n-1.
    train_data_scaler = pd.DataFrame(
        min_max_scaler.transform(train_data[features_columns]),
        columns=features_columns,
        index=train_data.index,
    )
    test_data_scaler = pd.DataFrame(
        min_max_scaler.transform(test_data[features_columns]),
        columns=features_columns,
        index=test_data.index,
    )

    # Re-attach the unscaled label and identifier columns.
    train_data_scaler['label'] = train_data['label']
    train_data_scaler[id_col_names] = train_data[id_col_names]
    test_data_scaler[id_col_names] = test_data[id_col_names]
    return train_data_scaler, test_data_scaler
# Load both feature tables and scale them.
def read_data():
    """Read the train and test feature files and return them min-max scaled.

    Returns:
        The ``(train, test)`` pair produced by ``standize_df``.
    """
    return standize_df(read_featurefile_train(), read_featurefile_test())
# Load the data.
# All features were generated in the previous section; read_data() returns
# the (train, test) frames used by every validation experiment below.
train_f2, test_f2 = read_data()
# 评价指标及预测方式等
# 性能评价函数
# 本赛题目标是预测投放的优惠券是否核销。
# 针对此任务及一些相关背景知识,使用优惠券核销预测的平均AUC(ROC曲线下面积)作为评价标准。
# 即对每个优惠券coupon_id单独计算核销预测的AUC值,再对所有优惠券的AUC值求平均作为最终的评价标准。
# coupon平均auc计算
from sklearn.metrics import roc_auc_score
def myauc(test):
    """Average the ROC AUC computed separately for each coupon.

    Args:
        test: DataFrame with the columns ``'coupon_id'``, ``'label'`` and
            ``'pred'`` (the predicted score).

    Returns:
        ``np.average`` of the per-coupon AUC values. Coupons whose rows
        contain fewer than two distinct labels are left out, because AUC
        needs more than one class.
    """
    per_coupon_auc = [
        metrics.roc_auc_score(coupon_df['label'], coupon_df['pred'])
        for _, coupon_df in test.groupby(['coupon_id'])
        if len(coupon_df['label'].unique()) >= 2
    ]
    return np.average(per_coupon_auc)
"""
虽然赛题是按照coupon的AUC来计算。不过因为整体AUC(也就是用roc_auc_score 求出的结果)与Coupon AUC同增同减,
所以在进行评估的时候可以直接使用整体AUC。
预测方式:因为要的结果是购买的几率,所以不能直接用predict(它会直接返回0或1),
而要用predict_proba,它会返回每个类别的可能性,取其中类别为1的那一列即可
"""
# Validation schemes
# Simple hold-out validation
from sklearn.model_selection import train_test_split  # data splitting

traindf = train_f2.copy()
target = get_target_df(train_f2).copy()
# Fit on 80% of the rows and hold out the remaining 20% for validation.
train_all, test_all, train_target, test_target = train_test_split(
    traindf, target, random_state=0, test_size=0.2)
train_data = get_predictors_df(train_all).copy()
test_data = get_predictors_df(test_all).copy()

clf = LogisticRegression().fit(train_data, train_target)

# Score with column 1 of predict_proba (the second class, presumably label 1).
train_pred = clf.predict_proba(train_data)[:, 1]
test_pred = clf.predict_proba(test_data)[:, 1]

# AUC over all rows at once.
score_train = roc_auc_score(train_target, train_pred)
score_test = roc_auc_score(test_target, test_pred)
print("LogisticRegression train 总体AUC:   ", score_train)
print("LogisticRegression test 总体AUC:   ", score_test)

# Per-coupon average AUC; myauc reads the scores from a 'pred' column.
train_all['pred'] = train_pred
test_all['pred'] = test_pred
print("LogisticRegression train Coupon AUC:   ", myauc(train_all))
print("LogisticRegression test Coupon AUC:   ", myauc(test_all))
# K-fold cross-validation (K-fold CV), 5 folds
from sklearn.model_selection import KFold

train = train_f2.copy()
target = get_target_df(train_f2).copy()
kf = KFold(n_splits=5)
for k, (train_index, test_index) in enumerate(kf.split(train)):
    # The splitter yields POSITIONAL integer index arrays, so the target is
    # sliced with .iloc exactly like the frame. Plain target[...] would look
    # the integers up as index labels and only matches on a 0..n-1 index.
    train_data = train.iloc[train_index]
    test_data = train.iloc[test_index]
    train_target = target.iloc[train_index]
    test_target = target.iloc[test_index]

    clf = LogisticRegression()
    clf.fit(get_predictors_df(train_data), train_target)
    train_pred = clf.predict_proba(get_predictors_df(train_data))[:, 1]
    test_pred = clf.predict_proba(get_predictors_df(test_data))[:, 1]
    score_train = roc_auc_score(train_target, train_pred)
    score_test = roc_auc_score(test_target, test_pred)
    # myauc reads the scores from a 'pred' column.
    train_data['pred'] = train_pred
    test_data['pred'] = test_pred
    print(k + 1, " 折", "LogisticRegression train 总体AUC:   ", score_train)
    print(k + 1, " 折", "LogisticRegression test 总体AUC:   ", score_test)
    print(k + 1, " 折", "LogisticRegression train Coupon AUC:   ",
          myauc(train_data))
    print(k + 1, " 折", "LogisticRegression test Coupon AUC:   ",
          myauc(test_data), '\n')
# Leave-P-Out cross-validation (LPO CV)
from sklearn.model_selection import LeavePOut

train = train_f2.copy()
target = get_target_df(train_f2).copy()
lpo = LeavePOut(p=200)
num = 100  # not referenced by the visible code; kept in case other code reads it
for k, (train_index, test_index) in enumerate(lpo.split(train)):
    # The splitter yields POSITIONAL integer index arrays, so the target is
    # sliced with .iloc exactly like the frame. Plain target[...] would look
    # the integers up as index labels and only matches on a 0..n-1 index.
    train_data = train.iloc[train_index]
    test_data = train.iloc[test_index]
    train_target = target.iloc[train_index]
    test_target = target.iloc[test_index]

    clf = LogisticRegression()
    clf.fit(get_predictors_df(train_data), train_target)
    train_pred = clf.predict_proba(get_predictors_df(train_data))[:, 1]
    test_pred = clf.predict_proba(get_predictors_df(test_data))[:, 1]
    score_train = roc_auc_score(train_target, train_pred)
    score_test = roc_auc_score(test_target, test_pred)
    # myauc reads the scores from a 'pred' column.
    train_data['pred'] = train_pred
    test_data['pred'] = test_pred
    print(k + 1, " 折", "LogisticRegression train 总体AUC:   ", score_train)
    print(k + 1, " 折", "LogisticRegression test 总体AUC:   ", score_test)
    print(k + 1, " 折", "LogisticRegression train Coupon AUC:   ",
          myauc(train_data))
    print(k + 1, " 折", "LogisticRegression test Coupon AUC:   ",
          myauc(test_data), '\n')
    # Stop after the first six splits (k = 0..5) instead of running them all.
    # NOTE(review): the first splits LeavePOut yields presumably share almost
    # all of their 200 held-out rows, so they are not independent estimates;
    # confirm against the scikit-learn documentation.
    if k >= 5:
        break
# StratifiedKFold
# Of the schemes compared, StratifiedKFold suits this task best because the
# positive and negative samples are imbalanced: stratified splitting keeps the
# class proportions of every train/test fold equal to those of the full data
# set (each fold is sampled per class).
# StratifiedKFold, 5 folds
from sklearn.model_selection import StratifiedKFold

train = train_f2.copy()
target = get_target_df(train_f2).copy()
kf = StratifiedKFold(n_splits=5)
for k, (train_index, test_index) in enumerate(kf.split(train, target)):
    # The splitter yields POSITIONAL integer index arrays, so the target is
    # sliced with .iloc exactly like the frame. Plain target[...] would look
    # the integers up as index labels and only matches on a 0..n-1 index.
    train_data = train.iloc[train_index]
    test_data = train.iloc[test_index]
    train_target = target.iloc[train_index]
    test_target = target.iloc[test_index]

    clf = LogisticRegression()
    clf.fit(get_predictors_df(train_data), train_target)
    train_pred = clf.predict_proba(get_predictors_df(train_data))[:, 1]
    test_pred = clf.predict_proba(get_predictors_df(test_data))[:, 1]
    score_train = roc_auc_score(train_target, train_pred)
    score_test = roc_auc_score(test_target, test_pred)
    # myauc reads the scores from a 'pred' column.
    train_data['pred'] = train_pred
    test_data['pred'] = test_pred
    print(k + 1, " 折", "LogisticRegression train 总体AUC:   ", score_train)
    print(k + 1, " 折", "LogisticRegression test 总体AUC:   ", score_test)
    print(k + 1, " 折", "LogisticRegression train Coupon AUC:   ",
          myauc(train_data))
    print(k + 1, " 折", "LogisticRegression test Coupon AUC:   ",
          myauc(test_data), '\n')
运行结果:
LogisticRegression train 总体AUC: 0.8033601207805912
LogisticRegression test 总体AUC: 0.8040890454196647
LogisticRegression train Coupon AUC: 0.6054489446463452
LogisticRegression test Coupon AUC: 0.6243258741237175
1 折 LogisticRegression train 总体AUC: 0.7940413176992859
1 折 LogisticRegression test 总体AUC: 0.8389913232130671
1 折 LogisticRegression train Coupon AUC: 0.6036368871858807
1 折 LogisticRegression test Coupon AUC: 0.6314837405607805
2 折 LogisticRegression train 总体AUC: 0.7944813183500465
2 折 LogisticRegression test 总体AUC: 0.8375490223211755
2 折 LogisticRegression train Coupon AUC: 0.6086089959636457
2 折 LogisticRegression test Coupon AUC: 0.6236689511644279
3 折 LogisticRegression train 总体AUC: 0.8091990146439405
3 折 LogisticRegression test 总体AUC: 0.7777939473876077
3 折 LogisticRegression train Coupon AUC: 0.6108069451603675
3 折 LogisticRegression test Coupon AUC: 0.6024292968721827
4 折 LogisticRegression train 总体AUC: 0.810610046684465
4 折 LogisticRegression test 总体AUC: 0.7760278960446174
4 折 LogisticRegression train Coupon AUC: 0.6073179717574466
4 折 LogisticRegression test Coupon AUC: 0.6098148295374517
5 折 LogisticRegression train 总体AUC: 0.8106999656751949
5 折 LogisticRegression test 总体AUC: 0.77425015657914
5 折 LogisticRegression train Coupon AUC: 0.6098650487880474
5 折 LogisticRegression test Coupon AUC: 0.6010086298836634
1 折 LogisticRegression train 总体AUC: 0.8035760861033734
1 折 LogisticRegression test 总体AUC: 0.8100620567375886
1 折 LogisticRegression train Coupon AUC: 0.6076882934738217
1 折 LogisticRegression test Coupon AUC: 0.35526315789473684
2 折 LogisticRegression train 总体AUC: 0.8036019920049553
2 折 LogisticRegression test 总体AUC: 0.8105053191489362
2 折 LogisticRegression train Coupon AUC: 0.6077159798772628
2 折 LogisticRegression test Coupon AUC: 0.375
3 折 LogisticRegression train 总体AUC: 0.8036263208518077
3 折 LogisticRegression test 总体AUC: 0.8091755319148936
3 折 LogisticRegression train Coupon AUC: 0.6076056423299218
3 折 LogisticRegression test Coupon AUC: 0.35526315789473684
4 折 LogisticRegression train 总体AUC: 0.8036142677005703
4 折 LogisticRegression test 总体AUC: 0.8096187943262411
4 折 LogisticRegression train Coupon AUC: 0.6076862125036082
4 折 LogisticRegression test Coupon AUC: 0.35526315789473684
5 折 LogisticRegression train 总体AUC: 0.8034619718104917
5 折 LogisticRegression test 总体AUC: 0.8038563829787234
5 折 LogisticRegression train Coupon AUC: 0.6076497424745886
5 折 LogisticRegression test Coupon AUC: 0.35526315789473684
6 折 LogisticRegression train 总体AUC: 0.8036902043854324
6 折 LogisticRegression test 总体AUC: 0.8065159574468085
6 折 LogisticRegression train Coupon AUC: 0.6077067661520358
6 折 LogisticRegression test Coupon AUC: 0.35526315789473684
1 折 LogisticRegression train 总体AUC: 0.7939091085373198
1 折 LogisticRegression test 总体AUC: 0.8380467153228752
1 折 LogisticRegression train Coupon AUC: 0.6049825424583766
1 折 LogisticRegression test Coupon AUC: 0.6297486562799388
2 折 LogisticRegression train 总体AUC: 0.7959451982525808
2 折 LogisticRegression test 总体AUC: 0.8331144754932879
2 折 LogisticRegression train Coupon AUC: 0.6079759849800991
2 折 LogisticRegression test Coupon AUC: 0.6221991493568617
3 折 LogisticRegression train 总体AUC: 0.8083839584946638
3 折 LogisticRegression test 总体AUC: 0.7836523160639042
3 折 LogisticRegression train Coupon AUC: 0.609931385224286
3 折 LogisticRegression test Coupon AUC: 0.6094752655327303
4 折 LogisticRegression train 总体AUC: 0.8106970221066883
4 折 LogisticRegression test 总体AUC: 0.7729860256306605
4 折 LogisticRegression train Coupon AUC: 0.6084553566988748
4 折 LogisticRegression test Coupon AUC: 0.6104010310924847
5 折 LogisticRegression train 总体AUC: 0.810268034538234
5 折 LogisticRegression test 总体AUC: 0.7756876277239039
5 折 LogisticRegression train Coupon AUC: 0.6078281476888198
5 折 LogisticRegression test Coupon AUC: 0.6069312652012027
进程已结束,退出代码0