天池o2o优惠券使用预测比赛解析(初级)
赛题链接:
天池o2o优惠券使用预测
import os, sys, pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import date
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold,train_test_split,StratifiedKFold,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import auc, roc_curve
加载数据
# Read the offline training log and the revised test set (paths are relative
# to the working directory), then preview the first training rows.
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/ccf_offline_stage1_train.csv',
        'data/ccf_offline_stage1_test_revised.csv',
    )
)
print(df_train.head())
User_id Merchant_id Coupon_id Discount_rate Distance Date_received \
0 1439408 2632 NaN NaN 0.0 NaN
1 1439408 4663 11002.0 150:20 1.0 20160528.0
2 1439408 2632 8591.0 20:1 0.0 20160217.0
3 1439408 2632 1078.0 20:1 0.0 20160319.0
4 1439408 2632 8591.0 20:1 0.0 20160613.0
Date
0 20160217.0
1 NaN
2 NaN
3 NaN
4 NaN
缺失值处理
# Replace every missing value with the string sentinel 'null' so later cells
# can test for it with plain equality.
df_train = df_train.fillna(value='null')
df_test = df_test.fillna(value='null')
统计
# Print column dtypes and memory usage of the training frame after the 'null' fill.
df_train.info()
RangeIndex: 1754884 entries, 0 to 1754883
Data columns (total 7 columns):
User_id int64
Merchant_id int64
Coupon_id object
Discount_rate object
Distance object
Date_received object
Date object
dtypes: int64(2), object(5)
memory usage: 93.7+ MB
# Count rows in each of the four (coupon received?, purchase made?) combinations.
has_coupon = df_train['Date_received'] != 'null'
has_purchase = df_train['Date'] != 'null'
no_coupon = df_train['Date_received'] == 'null'
no_purchase = df_train['Date'] == 'null'
for caption, mask in (
    ('有优惠券,有购买商品: %s 人', has_coupon & has_purchase),
    ('有优惠券,没购买商品: %s 人', has_coupon & no_purchase),
    ('没优惠券,有购买商品: %s 人', no_coupon & has_purchase),
    ('没优惠券,没购买商品: %s 人', no_coupon & no_purchase),
):
    print(caption % df_train[mask].shape[0])
有优惠券,有购买商品: 75382 人
有优惠券,没购买商品: 977900 人
没优惠券,有购买商品: 701602 人
没优惠券,没购买商品: 0 人
- 比赛的意义是把优惠券发给那些要购买商品却没有优惠券的人(701602 人),也就是真正有需要的人。
特征提取
1. 打折率
#打折率的元素有三种类型,需要拆分开成为新的特征
# List the distinct raw Discount_rate values to see which encodings occur.
df_train.Discount_rate.unique()
array(['null', '150:20', '20:1', '200:20', '30:5', '50:10', '10:5',
'100:10', '200:30', '20:5', '30:10', '50:5', '150:10', '100:30',
'200:50', '100:50', '300:30', '50:20', '0.9', '10:1', '30:1',
'0.95', '100:5', '5:1', '100:20', '0.8', '50:1', '200:10',
'300:20', '100:1', '150:30', '300:50', '20:10', '0.85', '0.6',
'150:50', '0.75', '0.5', '200:5', '0.7', '30:20', '300:10', '0.2',
'50:30', '200:100', '150:5'], dtype=object)
#定义函数拆分discount_rate列为新的四个特征
def getDiscountType(row):
    """Return 0 when the discount value is missing, otherwise 1.

    Called once per element by ``Series.apply``.

    Args:
        row: a single ``Discount_rate`` value, e.g. ``'null'``, ``'150:20'``
            or ``'0.9'``. ``None`` and NaN are also treated as missing.

    Returns:
        int: 0 for a missing discount, 1 for any real discount.
    """
    # NaN is the only value that differs from itself, so `row != row` catches
    # float('nan') / np.nan without needing numpy here.
    if row is None or row != row or 'null' in str(row):
        return 0
    return 1
def convertRate(row):
    """Convert a raw discount value to a multiplicative rate.

    Called once per element by ``Series.apply``.

    Args:
        row: a single ``Discount_rate`` value. ``'null'`` (or ``None``/NaN)
            means no discount, ``'X:Y'`` means "spend X, save Y", and any
            other value is already a rate such as ``'0.9'``.

    Returns:
        float: 1.0 for a missing discount, ``1 - Y/X`` for ``'X:Y'``,
        otherwise the value parsed as a float.
    """
    # NaN is the only value that differs from itself.
    if row is None or row != row:
        return 1.0
    text = str(row)
    if 'null' in text:
        return 1.0
    if ':' in text:
        money = text.split(':')
        return 1.0 - float(money[1]) / float(money[0])
    return float(text)
def getDiscountMan(row):
    """Return the spend threshold X of an ``'X:Y'`` discount, else 0.

    Called once per element by ``Series.apply``.

    Args:
        row: a single ``Discount_rate`` value.

    Returns:
        int: the part before the colon for ``'X:Y'`` values; 0 for every
        other value (missing or a plain rate).
    """
    # str() lets non-string cells (NaN, None, numbers) fall through to 0
    # instead of raising TypeError on the `in` test.
    text = str(row)
    if ':' not in text:
        return 0
    return int(text.split(':')[0])
def getDiscountJian(row):
    """Return the saved amount Y of an ``'X:Y'`` discount, else 0.

    Called once per element by ``Series.apply``.

    Args:
        row: a single ``Discount_rate`` value.

    Returns:
        int: the part after the colon for ``'X:Y'`` values; 0 for every
        other value (missing or a plain rate).
    """
    # str() lets non-string cells (NaN, None, numbers) fall through to 0
    # instead of raising TypeError on the `in` test.
    text = str(row)
    if ':' not in text:
        return 0
    return int(text.split(':')[1])
# NOTE: Series.apply(func) calls func once per element, not once with the
# whole Series, so the per-value helpers must not loop over their argument.
# An earlier draft of getDiscountType wrongly iterated (`for i in row:`),
# which is only appropriate when calling func(series) directly.
def processData(df):
    """Add the four discount-derived feature columns to ``df``.

    Args:
        df: DataFrame with a ``Discount_rate`` column. It is modified in
            place.

    Returns:
        The same DataFrame, now with ``discount_type``, ``discount_rate``,
        ``discount_man`` and ``discount_jian`` columns.
    """
    discount = df['Discount_rate']
    df['discount_type'] = discount.apply(getDiscountType)
    # Every feature is derived from the frame passed in. Reading the global
    # training frame here would give any other frame the training values,
    # aligned by row index.
    df['discount_rate'] = discount.apply(convertRate)
    df['discount_man'] = discount.apply(getDiscountMan)
    df['discount_jian'] = discount.apply(getDiscountJian)
    print('打折率 %s' %df['discount_rate'].unique())
    return df
- 之前犯错在于误以为series.apply(func)输入的是series,所以函数都有个for循环:
def getDiscountType(row):
for i in row:
if 'null' in i:
return 0
else:
return 1
其实是迭代输入series的每一个元素,这一点和直接func(series)区分开来
# Add the discount features to both frames; each call also prints the
# distinct discount rates it produced.
df_train = processData(df_train)
df_test = processData(df_test)
print(df_train.head())
打折率 [1. 0.86666667 0.95 0.9 0.83333333 0.8
0.5 0.85 0.75 0.66666667 0.93333333 0.7
0.6 0.96666667 0.98 0.99 0.975 0.33333333
0.2 0.4 ]
打折率 [1. 0.86666667 0.95 0.9 0.83333333 0.8
0.5 0.85 0.75 0.66666667 0.93333333 0.7
0.6 0.96666667 0.98 0.99 0.975 0.33333333
0.2 ]
User_id Merchant_id Coupon_id Discount_rate Distance Date_received \
0 1439408 2632 null null 0 null
1 1439408 4663 11002 150:20 1 2.01605e+07
2 1439408 2632 8591 20:1 0 2.01602e+07
3 1439408 2632 1078 20:1 0 2.01603e+07
4 1439408 2632 8591 20:1 0 2.01606e+07
Date discount_type discount_rate discount_man discount_jian
0 2.01602e+07 0 1.000000 0 0
1 null 1 0.866667 150 20
2 null 1 0.950000 20 1
3 null 1 0.950000 20 1
4 null 1 0.950000 20 1
2. 距离
# List the distinct raw Distance values (includes the 'null' sentinel).
df_train['Distance'].unique()
array([0.0, 1.0, 'null', 2.0, 10.0, 4.0, 7.0, 9.0, 3.0, 5.0, 6.0, 8.0],
dtype=object)
- 处理一下'null',转换成int
# Encode distance as an integer column; the 'null' sentinel becomes -1.
for frame in (df_train, df_test):
    frame['distance'] = frame['Distance'].replace('null', -1).astype(int)
df_train['distance'].unique()
array([ 0, 1, -1, 2, 10, 4, 7, 9, 3, 5, 6, 8])
# Re-check dtypes now that the discount and distance columns have been added.
df_train.info()
RangeIndex: 1754884 entries, 0 to 1754883
Data columns (total 12 columns):
User_id int64
Merchant_id int64
Coupon_id object
Discount_rate object
Distance object
Date_received object
Date object
discount_type int64
discount_rate float64
discount_man int64
discount_jian int64
distance int64
dtypes: float64(1), int64(6), object(5)
memory usage: 160.7+ MB
3. 领券日期
#领券日期
# Coupon-receipt dates: distinct non-'null' values, sorted ascending.
date_receive = df_train['Date_received'].unique()
date_receive = sorted(date_receive[date_receive != 'null'])
print('领券日期:%d - %d'%(date_receive[0],date_receive[-1]))
# Purchase dates: distinct non-'null' values, sorted ascending.
date_buy = df_train['Date'].unique()
date_buy = sorted(date_buy[date_buy != 'null'])
# This line reports the purchase-date range, so it is labelled as such
# (it previously repeated the coupon-receipt label).
print('消费日期:%d - %d'%(date_buy[0],date_buy[-1]))
领券日期:20160101 - 20160615
领券日期:20160101 - 20160630
换算成weekday
weekday : {null, 1, 2, 3, 4, 5, 6, 7}
weekday_type : {1, 0}(周六和周日为1,其他为0)
Weekday_1 : {1, 0, 0, 0, 0, 0, 0}
Weekday_2 : {0, 1, 0, 0, 0, 0, 0}
Weekday_3 : {0, 0, 1, 0, 0, 0, 0}
Weekday_4 : {0, 0, 0, 1, 0, 0, 0}
Weekday_5 : {0, 0, 0, 0, 1, 0, 0}
Weekday_6 : {0, 0, 0, 0, 0, 1, 0}
Weekday_7 : {0, 0, 0, 0, 0, 0, 1}
构造weekday特征
def getWeekday(row):
    """Map a YYYYMMDD receipt date to its ISO-style weekday number.

    Called once per element by ``Series.apply``.

    Args:
        row: a single ``Date_received`` value. ``'null'`` marks a missing
            date. Otherwise the first eight characters of ``str(row)`` are
            read as YYYYMMDD, so ``'20160528'``, ``'20160528.0'`` and the
            float ``20160528.0`` are all accepted.

    Returns:
        ``row`` unchanged when it equals ``'null'``; otherwise an int from
        1 (Monday) to 7 (Sunday).
    """
    if row == 'null':
        return row
    # str() so numeric cells can be sliced the same way as string cells.
    text = str(row)
    # date.weekday() is 0-based from Monday; shift to 1..7.
    return date(int(text[0:4]), int(text[4:6]), int(text[6:8])).weekday() + 1
# Derive the weekday (1-7, or 'null') of each coupon receipt. The dates are
# cast to str first because getWeekday slices the value as text.
df_train['weekday'] = df_train['Date_received'].astype(str).apply(getWeekday)
df_test['weekday'] = df_test['Date_received'].astype(str).apply(getWeekday)
df_train['weekday'].unique()
array(['null', 6, 3, 1, 5, 4, 7, 2], dtype=object)
构造weekday_type特征
# Weekend flag: 1 when the weekday is 6 or 7, otherwise 0 (including 'null').
for frame in (df_train, df_test):
    frame['weekday_type'] = frame['weekday'].apply(
        lambda day: 1 if day in (6, 7) else 0
    )
df_train['weekday_type'].unique()
array([0, 1])
构造weekday_number特征
#训练数据
#one-hot-encoding
# One-hot encode the training weekdays; 'null' is turned into NaN first.
data = df_train['weekday'].replace('null',np.nan)
tmpdf = pd.get_dummies(data,prefix='weekday')
# Append the indicator columns to the training frame.
df_train = pd.concat([df_train,tmpdf],axis=1)
# Test data
# One-hot encode the test weekdays the same way.
data = df_test['weekday'].replace('null',np.nan)
tmpdf = pd.get_dummies(data,prefix='weekday')
# Append the indicator columns to the test frame.
df_test = pd.concat([df_test,tmpdf],axis=1)
print(df_train.head())
User_id Merchant_id Coupon_id Discount_rate Distance Date_received \
0 1439408 2632 null null 0 null
1 1439408 4663 11002 150:20 1 20160528.0
2 1439408 2632 8591 20:1 0 20160217.0
3 1439408 2632 1078 20:1 0 20160319.0
4 1439408 2632 8591 20:1 0 20160613.0
Date discount_type discount_rate discount_man ... weekday \
0 2.01602e+07 0 1.000000 0 ... null
1 null 1 0.866667 150 ... 6
2 null 1 0.950000 20 ... 3
3 null 1 0.950000 20 ... 6
4 null 1 0.950000 20 ... 1
weekday_type weekday_1.0 weekday_2.0 weekday_3.0 weekday_4.0 \
0 0 0 0 0 0
1 1 0 0 0 0
2 0 0 0 1 0
3 1 0 0 0 0
4 0 1 0 0 0
weekday_5.0 weekday_6.0 weekday_7.0 label
0 0 0 0 -1
1 0 1 0 0
2 0 0 0 0
3 0 1 0 0
4 0 0 0 0
[5 rows x 22 columns]
# Print every column name of the training frame, one per line.
print('所有特征:')
for column_name in df_train.columns:
    print('\t', column_name)
所有特征:
User_id
Merchant_id
Coupon_id
Discount_rate
Distance
Date_received
Date
discount_type
discount_rate
discount_man
discount_jian
distance
weekday
weekday_type
weekday_1.0
weekday_2.0
weekday_3.0
weekday_4.0
weekday_5.0
weekday_6.0
weekday_7.0
标注label
三种情况:
Date_received == 'null':表示没有领到优惠券,无需考虑,y = -1
(Date_received != 'null') & (Date != 'null') & (Date - Date_received <= 15):表示领取优惠券且在15天内使用,即正样本,y = 1
(Date_received != 'null') & ((Date == 'null') | (Date - Date_received > 15)):表示领取优惠券但未在15天内使用,即负样本,y = 0
def _parse_yyyymmdd(value):
    """Build a ``date`` from the first eight characters of ``str(value)``."""
    text = str(value)
    return date(int(text[0:4]), int(text[4:6]), int(text[6:8]))


def label(row):
    """Label one record for the 15-day coupon-usage task.

    Intended for ``DataFrame.apply(label, axis=1)``.

    Args:
        row: mapping with ``'Date_received'`` and ``'Date'`` entries, each
            either the sentinel ``'null'`` or a YYYYMMDD value (str, int or
            float such as ``20160217.0``).

    Returns:
        int: -1 when no coupon was received; 1 when a purchase happened at
        most 15 days after receipt; 0 otherwise.
    """
    if row['Date_received'] == 'null':
        return -1
    if row['Date'] != 'null':
        # Parse with datetime.date instead of a per-row pd.to_datetime call:
        # it is much cheaper across the whole frame and does not depend on
        # how pandas handles float-typed YYYYMMDD values.
        elapsed = _parse_yyyymmdd(row['Date']) - _parse_yyyymmdd(row['Date_received'])
        if elapsed.days <= 15:
            return 1
    return 0
# Label every training row (-1 / 0 / 1) and show the class counts.
df_train['label'] = df_train.apply(label,axis=1)
df_train['label'].value_counts()
0 988887
-1 701602
1 64395
Name: label, dtype: int64
建立线性模型 SGDClassifier
- 使用下面列出的14个已提取特征:
- discount_rate
discount_type
discount_man
discount_jian
distance
weekday
weekday_type
weekday_1
weekday_2
weekday_3
weekday_4
weekday_5
weekday_6
weekday_7
训练集:20160101-20160515;验证集:20160516-20160615。
用线性模型 SGDClassifier
划分训练集/验证集
# Cast receipt dates to strings so the train/validation split below can
# compare them against string date bounds.
df_train['Date_received'] = df_train['Date_received'].astype(str)
df_train['Date_received'].unique()
array(['null', '20160528.0', '20160217.0', '20160319.0', '20160613.0',
'20160516.0', '20160429.0', '20160129.0', '20160530.0',
'20160519.0', '20160606.0', '20160207.0', '20160421.0',
'20160130.0', '20160412.0', '20160518.0', '20160327.0',
'20160127.0', '20160215.0', '20160524.0', '20160523.0',
'20160515.0', '20160521.0', '20160114.0', '20160321.0',
'20160426.0', '20160409.0', '20160326.0', '20160322.0',
'20160131.0', '20160125.0', '20160602.0', '20160128.0',
'20160605.0', '20160607.0', '20160324.0', '20160601.0',
'20160126.0', '20160124.0', '20160123.0', '20160201.0',
'20160522.0', '20160203.0', '20160417.0', '20160415.0',
'20160202.0', '20160206.0', '20160218.0', '20160611.0',
'20160329.0', '20160510.0', '20160302.0', '20160526.0',
'20160318.0', '20160205.0', '20160411.0', '20160520.0',
'20160527.0', '20160317.0', '20160213.0', '20160505.0',
'20160402.0', '20160211.0', '20160405.0', '20160408.0',
'20160323.0', '20160204.0', '20160112.0', '20160430.0',
'20160525.0', '20160609.0', '20160403.0', '20160325.0',
'20160413.0', '20160210.0', '20160610.0', '20160414.0',
'20160401.0', '20160109.0', '20160328.0', '20160420.0',
'20160422.0', '20160615.0', '20160120.0', '20160614.0',
'20160107.0', '20160508.0', '20160608.0', '20160603.0',
'20160425.0', '20160424.0', '20160305.0', '20160330.0',
'20160511.0', '20160504.0', '20160223.0', '20160404.0',
'20160416.0', '20160118.0', '20160303.0', '20160212.0',
'20160423.0', '20160308.0', '20160228.0', '20160418.0',
'20160509.0', '20160501.0', '20160428.0', '20160427.0',
'20160229.0', '20160512.0', '20160506.0', '20160117.0',
'20160514.0', '20160407.0', '20160410.0', '20160314.0',
'20160116.0', '20160503.0', '20160502.0', '20160531.0',
'20160316.0', '20160331.0', '20160517.0', '20160222.0',
'20160101.0', '20160306.0', '20160604.0', '20160214.0',
'20160406.0', '20160121.0', '20160313.0', '20160225.0',
'20160220.0', '20160110.0', '20160301.0', '20160105.0',
'20160122.0', '20160104.0', '20160113.0', '20160108.0',
'20160115.0', '20160513.0', '20160208.0', '20160612.0',
'20160419.0', '20160103.0', '20160312.0', '20160209.0',
'20160529.0', '20160119.0', '20160227.0', '20160315.0',
'20160304.0', '20160216.0', '20160507.0', '20160311.0',
'20160320.0', '20160102.0', '20160106.0', '20160224.0',
'20160219.0', '20160111.0', '20160310.0', '20160307.0',
'20160221.0', '20160226.0', '20160309.0'], dtype=object)
# Keep only rows where a coupon was received (label -1 marks "no coupon").
df = df_train[df_train['label']!=-1].copy()
# Date_received holds same-width strings of the form 'YYYYMMDD.0', so plain
# string comparison orders them chronologically.
# Split the filtered frame `df`, so the 'null' rows are excluded explicitly
# rather than by how 'null' happens to sort against digit strings.
train = df[df['Date_received'] <= '20160515.0'].copy()
# The validation window is 20160516-20160615 inclusive, so the upper bound
# uses <= to keep coupons received on the last day.
valid = df[(df['Date_received'] >= '20160516.0') & (df['Date_received'] <= '20160615.0')].copy()
print('Train Set:\n',train['label'].value_counts())
print('Valid Set:\n',valid['label'].value_counts())
Train Set:
0 759172
1 41524
Name: label, dtype: int64
Valid Set:
0 226595
1 22516
Name: label, dtype: int64
特征
# Model inputs: the discount features, distance, raw weekday, weekend flag,
# then the seven one-hot weekday indicators 'weekday_1.0' .. 'weekday_7.0'.
feature = [
    'discount_type', 'discount_rate', 'discount_man', 'discount_jian',
    'distance', 'weekday', 'weekday_type',
] + ['weekday_%d.0' % day for day in range(1, 8)]
print(feature)
['discount_type', 'discount_rate', 'discount_man', 'discount_jian', 'distance', 'weekday', 'weekday_type', 'weekday_1.0', 'weekday_2.0', 'weekday_3.0', 'weekday_4.0', 'weekday_5.0', 'weekday_6.0', 'weekday_7.0']
建立模型
def check_model(data, feature, scoring=None, random_state=None):
    """Grid-search a standardised, elastic-net logistic-regression SGD model.

    Args:
        data: DataFrame holding the feature columns and a ``'label'`` column.
        feature: list of column names used as model inputs.
        scoring: optional scorer forwarded to ``GridSearchCV`` (for example
            ``'roc_auc'``). ``None`` keeps the original behaviour of using
            the estimator's own ``score`` method.
        random_state: optional seed for the fold shuffling and the SGD
            solver. ``None`` keeps the original unseeded behaviour.

    Returns:
        The fitted ``GridSearchCV`` object.
    """
    # scikit-learn renamed the logistic loss from 'log' to 'log_loss' and
    # later dropped the old spelling. Pick whichever name the installed
    # version knows; if the lookup table is absent, assume the modern name.
    known_losses = getattr(SGDClassifier, 'loss_functions', None)
    if known_losses is None or 'log_loss' in known_losses:
        logistic_loss = 'log_loss'
    else:
        logistic_loss = 'log'

    classifier = SGDClassifier(
        loss=logistic_loss,  # logistic loss => logistic regression
        penalty='elasticnet',
        fit_intercept=True,  # fit an intercept term
        max_iter=100,
        shuffle=True,
        n_jobs=1,
        class_weight=None,
        random_state=random_state)

    # The pipeline chains scaling and the classifier so both are refit
    # inside every cross-validation fold.
    model = Pipeline(steps=[
        ('ss', StandardScaler()),
        ('clf', classifier)
    ])

    parameters = {
        'clf__alpha': [0.001, 0.01, 0.1],
        'clf__l1_ratio': [0.001, 0.01, 0.1]
    }

    # Stratified folds keep the class ratio of each fold close to that of
    # the full data set.
    folder = StratifiedKFold(n_splits=3, shuffle=True, random_state=random_state)

    # Exhaustive search over the parameter grid.
    grid_search = GridSearchCV(
        model,
        parameters,
        scoring=scoring,
        cv=folder,
        n_jobs=-1,
        verbose=1)
    grid_search = grid_search.fit(data[feature], data['label'])
    return grid_search
训练
# Run the grid search on the training split; `model` is the fitted search object.
model = check_model(train,feature)
Fitting 3 folds for each of 9 candidates, totalling 27 fits
[Parallel(n_jobs=-1)]: Done 27 out of 27 | elapsed: 10.3min finished
验证
对验证集中每个优惠券预测的结果计算 AUC,再对所有优惠券的 AUC 求平均。计算 AUC 的时候,如果 label 只有一类,就直接跳过,因为 AUC 无法计算。
# Score the validation rows and store the second predict_proba column
# alongside a copy of them as 'pred_prob'.
y_valid_pred = model.predict_proba(valid[feature])
valid1 = valid.assign(pred_prob=y_valid_pred[:, 1])
print(valid1.head())
User_id Merchant_id Coupon_id Discount_rate Distance Date_received \
1 1439408 4663 11002 150:20 1 20160528.0
4 1439408 2632 8591 20:1 0 20160613.0
6 1439408 2632 8591 20:1 0 20160516.0
9 2029232 450 1532 30:5 0 20160530.0
10 2029232 6459 12737 20:1 0 20160519.0
Date discount_type discount_rate discount_man ... \
1 null 1 0.866667 150 ...
4 null 1 0.950000 20 ...
6 2.01606e+07 1 0.950000 20 ...
9 null 1 0.833333 30 ...
10 null 1 0.950000 20 ...
weekday_type weekday_1.0 weekday_2.0 weekday_3.0 weekday_4.0 \
1 1 0 0 0 0
4 0 1 0 0 0
6 0 1 0 0 0
9 0 1 0 0 0
10 0 0 0 0 1
weekday_5.0 weekday_6.0 weekday_7.0 label pred_prob
1 0 1 0 0 0.019839
4 0 0 0 0 0.098629
6 0 0 0 0 0.098629
9 0 0 0 0 0.095701
10 0 0 0 0 0.129752
[5 rows x 23 columns]
- groupby之后是元组的形式
# Iterating a GroupBy yields (group key, sub-DataFrame) tuples; print each.
valid_groupby = valid1.groupby(['Coupon_id'])
for coupon_group in valid_groupby:
    print(coupon_group)
(1.0, User_id Merchant_id Coupon_id Discount_rate Distance Date_received \
768069 472146 6889 1 20:1 9 20160522.0
962551 2266597 6889 1 20:1 0 20160603.0
964821 3057133 6889 1 20:1 0 20160606.0
1665538 5555255 6889 1 20:1 3 20160530.0
Date discount_type discount_rate discount_man ... \
768069 2.01606e+07 1 0.95 20 ...
962551 null 1 0.95 20 ...
964821 null 1 0.95 20 ...
1665538 null 1 0.95 20 ...
weekday_type weekday_1.0 weekday_2.0 weekday_3.0 weekday_4.0 \
768069 1 0 0 0 0
962551 0 0 0 0 0
964821 0 1 0 0 0
1665538 0 1 0 0 0
weekday_5.0 weekday_6.0 weekday_7.0 label pred_prob
768069 0 0 1 1 0.013089
962551 1 0 0 0 0.103987
964821 0 0 0 0 0.099109
1665538 0 0 0 0 0.052904
[4 rows x 23 columns])
#计算AUC
# Average the per-coupon AUC over the validation predictions.
valid_groupby = valid1.groupby(['Coupon_id'])
mean_tpr = 0.0
aucs = []
for _, coupon_rows in valid_groupby:
    coupon_labels = coupon_rows['label']
    # AUC is undefined when a coupon has only one class, so skip it.
    if coupon_labels.unique().size == 1:
        continue
    fpr, tpr, _thresholds = roc_curve(
        coupon_labels, coupon_rows['pred_prob'], pos_label=1
    )
    aucs.append(auc(fpr, tpr))
print(np.mean(aucs))
0.5334629648536017
# Show (rows, columns) of the validation feature matrix.
valid[feature].shape
(249111, 14)
测试
# Feature columns for the test frame. The one-hot names here have no '.0'
# suffix, unlike the training `feature` list.
# NOTE(review): presumably the test Date_received column has no missing
# values, so its weekday stays integer-typed - confirm against df_test.columns.
feature_test = [ 'discount_type', 'discount_rate',
'discount_man', 'discount_jian', 'distance', 'weekday', 'weekday_type','weekday_1',
'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6',
'weekday_7']
# Predict class probabilities for every test row.
y_test_pred = model.predict_proba(df_test[feature_test])
# The submission keeps the three id columns plus the second probability column.
df_test_1 = df_test[['User_id','Coupon_id','Date_received']].copy()
df_test_1['Probability'] = y_test_pred[:,1]
# Written without an index column and without a header row.
df_test_1.to_csv('submit2.csv',index=False,header=False)
print(df_test_1.head())
User_id Coupon_id Date_received Probability
0 4129537 9983 20160712 0.118748
1 6949378 3429 20160706 0.034147
2 2166529 6928 20160727 0.045592
3 2166529 1808 20160727 0.045592
4 6172162 6500 20160708 0.068717
保存模型 & 导入模型
# Persist the fitted model on the first run; on later runs load the saved one.
# NOTE(review): when model.pkl already exists, the `model` trained earlier in
# this script is replaced by the pickled one. Delete the file to refresh it.
# SECURITY: pickle.load can execute arbitrary code; only load a model.pkl
# that this script itself produced.
if not os.path.isfile('model.pkl'):
    with open('model.pkl', 'wb') as f:
        pickle.dump(model, f)
else:
    with open('model.pkl', 'rb') as f:
        model = pickle.load(f)
比赛第一名代码与解析