1.学习在anaconda中使用jupyter notebook调试python代码
终端开启jupyter notebook,点击链接进入:http://localhost:8888/tree#
新建python 3开始逐行编写和调试代码,记住及时保存。
2.学会在阿里云天池
https://tianchi.aliyun.com/learn/liveDetail.html?spm=5176.11510288.0.0.1b23b7bdwkRz1f&classroomId=263
中学习机器学习相关知识,参与各类论坛和挑战赛,通过调试别人的代码学习做数据挖掘比赛
技巧:论坛找资源、看视频学习、多动手尝试、多看多想多请教、学会搜索很重要,不断尝试、不轻言放弃,敢于迎接挑战。
3.实战-<第1章 O2O优惠券使用新人赛-初试>
跑通了自己的第一个数据挖掘小项目,大致了解了做数据挖掘比赛的流程、方法和思想,了解了一些常用的包。
按照《生活大实惠:O2O优惠券使用预测》
https://tianchi.aliyun.com/learn/live.html?spm=5176.11510297.4851204.3.7f073c60dp66Zs&classroomId=263#classroomId=263&postsId=23464
调试成功。
代码如下:
# coding: utf-8
# In[1]:
# import libraries necessary for this project
import os, sys, pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import seaborn as sns
from datetime import date
from sklearn.model_selection import KFold, train_test_split, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import log_loss, roc_auc_score, auc, roc_curve
from sklearn.preprocessing import MinMaxScaler
import xgboost as xgb
import lightgbm as lgb
# display for this notebook
# IPython cell magics in their exported (script) form: inline plots, retina DPI.
get_ipython().magic('matplotlib inline')
get_ipython().magic("config InlineBackend.figure_format = 'retina'")
# In[2]:
# Load the offline train/test tables and the online train table.
# Raw strings avoid backslash-escape surprises in Windows paths; the original
# online-train path was missing the directory separator before the file name.
dfoff = pd.read_csv(r'G:\code\o2o\ccf_offline_stage1_train.csv')
dftest = pd.read_csv(r'G:\code\o2o\ccf_offline_stage1_test_revised.csv')
dfon = pd.read_csv(r'G:\code\o2o\ccf_online_stage1_train.csv')
dfoff.head(5)
# In[3]:
dfoff.info()
# In[4]:
# Cross-tabulate coupon reception vs. purchase.
# NOTE(review): this assumes missing values were read as the literal string
# 'null' — confirm for the local CSV copy (pandas normally reads blanks as NaN).
print('有优惠券,购买商品条数', dfoff[(dfoff['Date_received'] != 'null') & (dfoff['Date'] != 'null')].shape[0])
print('无优惠券,购买商品条数', dfoff[(dfoff['Date_received'] == 'null') & (dfoff['Date'] != 'null')].shape[0])
print('有优惠券,不购买商品条数', dfoff[(dfoff['Date_received'] != 'null') & (dfoff['Date'] == 'null')].shape[0])
print('无优惠券,不购买商品条数', dfoff[(dfoff['Date_received'] == 'null') & (dfoff['Date'] == 'null')].shape[0])
# In[5]:
# Users/merchants that appear in the test set but not in the training set.
# (The original cell printed the inverted direction — the expression computes
#  test minus train — and the whole pair of prints was accidentally pasted twice.)
print('1. User_id in test set but not in training set', set(dftest['User_id']) - set(dfoff['User_id']))
print('2. Merchant_id in test set but not in training set', set(dftest['Merchant_id']) - set(dfoff['Merchant_id']))
# In[6]:
print('Discount_rate 类型:',dfoff['Discount_rate'].unique())
print('Distance 类型:', dfoff['Distance'].unique())
# In[7]:
# Feature converters for the raw Discount_rate / Distance strings.
def getDiscountType(row):
    """Classify a raw Discount_rate string: 'null' for no coupon,
    1 for a full-reduction ("man:jian") coupon, 0 for a plain rate."""
    if row == 'null':
        return 'null'
    return 1 if ':' in row else 0
def convertRate(row):
    """Convert discount to rate"""
    if row == 'null':
        return 1.0          # no coupon -> pay full price
    if ':' not in row:
        return float(row)   # already expressed as a rate
    man, jian = row.split(':')
    # "man:jian" means: spend `man`, get `jian` off.
    return 1.0 - float(jian) / float(man)
def getDiscountMan(row):
    """Spend threshold ("man") of a full-reduction coupon, else 0."""
    if ':' not in row:
        return 0
    return int(row.split(':')[0])
def getDiscountJian(row):
    """Reduction amount ("jian") of a full-reduction coupon, else 0."""
    if ':' not in row:
        return 0
    return int(row.split(':')[1])
def processData(df):
    """Add the derived discount_* and distance columns in place; return df."""
    raw = df['Discount_rate']
    df['discount_rate'] = raw.apply(convertRate)
    df['discount_man'] = raw.apply(getDiscountMan)
    df['discount_jian'] = raw.apply(getDiscountJian)
    df['discount_type'] = raw.apply(getDiscountType)
    print(df['discount_rate'].unique())
    # 'null' distance -> -1 sentinel, then cast to int
    df['distance'] = df['Distance'].replace('null', -1).astype(int)
    print(df['distance'].unique())
    return df
# Derive discount/distance features for both the train and test frames.
dfoff = processData(dfoff)
dftest = processData(dftest)
# In[8]:
dfoff.head(2)
# In[9]:
dftest.head(2)
# In[10]:
# Date ranges of coupon reception and of consumption.
# (The original recomputed date_buy from the full, non-unique column right
#  after the unique version, discarding it; min/max are identical either way.)
date_received = dfoff['Date_received'].unique()
date_received = sorted(date_received[date_received != 'null'])
date_buy = dfoff['Date'].unique()
date_buy = sorted(date_buy[date_buy != 'null'])
print('优惠券收到日期从',date_received[0],'到', date_received[-1])
print('消费日期从', date_buy[0], '到', date_buy[-1])
# In[11]:
# Per-day counts: coupons received, and coupons actually used.
received_rows = dfoff.loc[dfoff['Date_received'] != 'null', ['Date_received', 'Date']]
couponbydate = received_rows.groupby('Date_received', as_index=False).count()
couponbydate.columns = ['Date_received', 'count']
used_rows = dfoff.loc[(dfoff['Date'] != 'null') & (dfoff['Date_received'] != 'null'),
                      ['Date_received', 'Date']]
buybydate = used_rows.groupby('Date_received', as_index=False).count()
buybydate.columns = ['Date_received', 'count']
# In[13]:
# Plot daily coupon reception vs. usage, plus the daily usage ratio.
sns.set_style('ticks')
sns.set_context("notebook", font_scale= 1.4)
plt.figure(figsize = (12,8))
date_received_dt = pd.to_datetime(date_received, format='%Y%m%d')
plt.subplot(211)
plt.bar(date_received_dt, couponbydate['count'], label = 'number of coupon received' )
plt.bar(date_received_dt, buybydate['count'], label = 'number of coupon used')
plt.yscale('log')  # counts span orders of magnitude
plt.ylabel('Count')
plt.legend()
plt.subplot(212)
# NOTE(review): elementwise division assumes every reception date also has at
# least one usage row, i.e. both frames have identical length/order — verify.
plt.bar(date_received_dt, buybydate['count']/couponbydate['count'])
plt.ylabel('Ratio(coupon used/coupon received)')
plt.tight_layout()
# In[14]:
def getWeekday(row):
    """Map a 'yyyymmdd' string to weekday 1..7 (Mon=1); pass 'null' through."""
    if row == 'null':
        return row
    year, month, day = int(row[:4]), int(row[4:6]), int(row[6:8])
    return date(year, month, day).weekday() + 1
# Weekday of coupon reception for train and test ('null' rows keep 'null').
dfoff['weekday'] = dfoff['Date_received'].astype(str).apply(getWeekday)
dftest['weekday'] = dftest['Date_received'].astype(str).apply(getWeekday)
# weekday_type : 1 for weekend (Sat=6 / Sun=7), 0 otherwise
dfoff['weekday_type'] = dfoff['weekday'].apply(lambda x : 1 if x in [6,7] else 0 )
dftest['weekday_type'] = dftest['weekday'].apply(lambda x : 1 if x in [6,7] else 0 )
# In[15]:
# change weekday to one-hot encoding
weekdaycols = ['weekday_' + str(i) for i in range(1,8)]
print(weekdaycols)
# NOTE(review): renaming to weekdaycols assumes all 7 weekdays occur in the
# data (get_dummies emits one column per observed value) — verify.
tmpdf = pd.get_dummies(dfoff['weekday'].replace('null', np.nan))
tmpdf.columns = weekdaycols
dfoff[weekdaycols] = tmpdf
tmpdf = pd.get_dummies(dftest['weekday'].replace('null', np.nan))
tmpdf.columns = weekdaycols
dftest[weekdaycols] = tmpdf
# In[16]:
def label(row):
    """Target: -1 = no coupon received, 1 = coupon used within 15 days,
    0 = coupon received but not used in time."""
    if row['Date_received'] == 'null':
        return -1
    if row['Date'] == 'null':
        return 0
    gap = (pd.to_datetime(row['Date'], format='%Y%m%d')
           - pd.to_datetime(row['Date_received'], format='%Y%m%d'))
    return 1 if gap <= pd.Timedelta(15, 'D') else 0
# Build the target column on the offline training frame.
dfoff['label'] = dfoff.apply(label, axis = 1)
# In[17]:
print(dfoff['label'].value_counts())
# In[18]:
print('已有columns:',dfoff.columns.tolist())
# In[19]:
dfoff.head(2)
# In[20]:
# data split
# Time-based split: receptions before 2016-05-16 train, the next month validates.
# yyyymmdd strings compare correctly as plain strings.
df = dfoff[dfoff['label'] != -1].copy()
train = df[(df['Date_received'] < '20160516')].copy()
valid = df[(df['Date_received'] >= '20160516') & (df['Date_received'] <= '20160615')].copy()
print(train['label'].value_counts())
print(valid['label'].value_counts())
# In[21]:
# feature
# Base feature list shared by every model below.
original_feature = ['discount_rate','discount_type','discount_man', 'discount_jian','distance', 'weekday', 'weekday_type'] + weekdaycols
print(len(original_feature),original_feature)
# In[22]:
# model1
predictors = original_feature
print(predictors)
def check_model(data, predictors):
    """Grid-search an elastic-net SGD logistic model over alpha/l1_ratio.

    Returns the fitted GridSearchCV (use .best_score_/.best_params_ and
    predict_proba on the whole object).
    """
    def make_classifier():
        return SGDClassifier(
            loss='log',
            penalty='elasticnet',
            fit_intercept=True,
            max_iter=100,
            shuffle=True,
            n_jobs=1,
            class_weight=None)

    # Standardize first: features are on very different scales.
    pipeline = Pipeline(steps=[
        ('ss', StandardScaler()),
        ('en', make_classifier()),
    ])
    param_grid = {
        'en__alpha': [0.001, 0.01, 0.1],
        'en__l1_ratio': [0.001, 0.01, 0.1],
    }
    cv = StratifiedKFold(n_splits=3, shuffle=True)
    search = GridSearchCV(pipeline, param_grid, cv=cv, n_jobs=-1, verbose=1)
    return search.fit(data[predictors], data['label'])
# Fit model 1, or reload it; the pickle file acts as a simple training cache.
# NOTE: pickle.load executes arbitrary code — only load files you created.
if not os.path.isfile('1_model.pkl'):
    model = check_model(train, predictors)
    print(model.best_score_)
    print(model.best_params_)
    with open('1_model.pkl', 'wb') as f:
        pickle.dump(model, f)
else:
    with open('1_model.pkl', 'rb') as f:
        model = pickle.load(f)
# In[24]:
# Predict on the validation split.
y_valid_pred = model.predict_proba(valid[predictors])
valid1 = valid.copy()
valid1['pred_prob'] = y_valid_pred[:, 1]
valid1.head(2)
# In[25]:
# avgAUC: AUC per Coupon_id, averaged over coupons with both classes present.
aucs = []
for coupon_id, group in valid1.groupby(['Coupon_id']):
    if len(group['label'].unique()) != 2:
        continue  # AUC undefined with a single class
    fpr, tpr, thresholds = roc_curve(group['label'], group['pred_prob'], pos_label=1)
    aucs.append(auc(fpr, tpr))
print(np.average(aucs))
# In[26]:
# test prediction for submission
# Probability of the positive class is the submission score.
y_test_pred = model.predict_proba(dftest[predictors])
dftest1 = dftest[['User_id','Coupon_id','Date_received']].copy()
dftest1['label'] = y_test_pred[:,1]
dftest1.to_csv('submit1.csv', index=False, header=False)
dftest1.head()
# In[27]:
# Leakage-safe windows: behaviour before 2016-05-16 builds features ("feature");
# receptions in 2016-05-16..2016-06-15 form the modelling data ("data").
feature = dfoff[(dfoff['Date'] < '20160516') | ((dfoff['Date'] == 'null') & (dfoff['Date_received'] < '20160516'))].copy()
data = dfoff[(dfoff['Date_received'] >= '20160516') & (dfoff['Date_received'] <= '20160615')].copy()
print(data['label'].value_counts())
# In[28]:
fdf = feature.copy()
# In[29]:
# key of user
u = fdf[['User_id']].copy().drop_duplicates()
# In[30]:
# u_coupon_count : num of coupon received by user
u1 = fdf[fdf['Date_received'] != 'null'][['User_id']].copy()
u1['u_coupon_count'] = 1
u1 = u1.groupby(['User_id'], as_index = False).count()
u1.head(2)
# In[31]:
# u_buy_count : times of user buy offline (with or without coupon)
u2 = fdf[fdf['Date'] != 'null'][['User_id']].copy()
u2['u_buy_count'] = 1
u2 = u2.groupby(['User_id'], as_index = False).count()
u2.head(2)
# In[32]:
# u_buy_with_coupon : times of user buy offline (with coupon)
u3 = fdf[((fdf['Date'] != 'null') & (fdf['Date_received'] != 'null'))][['User_id']].copy()
u3['u_buy_with_coupon'] = 1
u3 = u3.groupby(['User_id'], as_index = False).count()
u3.head(2)
# In[33]:
# u_merchant_count : num of merchant user bought from
u4 = fdf[fdf['Date'] != 'null'][['User_id', 'Merchant_id']].copy()
u4.drop_duplicates(inplace = True)
u4 = u4.groupby(['User_id'], as_index = False).count()
u4.rename(columns = {'Merchant_id':'u_merchant_count'}, inplace = True)
u4.head(2)
# In[34]:
# u_min/max/mean/median_distance over coupon purchases; -1 (unknown) -> NaN
utmp = fdf[(fdf['Date'] != 'null') & (fdf['Date_received'] != 'null')][['User_id', 'distance']].copy()
utmp.replace(-1, np.nan, inplace = True)
u5 = utmp.groupby(['User_id'], as_index = False).min()
u5.rename(columns = {'distance':'u_min_distance'}, inplace = True)
u6 = utmp.groupby(['User_id'], as_index = False).max()
u6.rename(columns = {'distance':'u_max_distance'}, inplace = True)
u7 = utmp.groupby(['User_id'], as_index = False).mean()
u7.rename(columns = {'distance':'u_mean_distance'}, inplace = True)
u8 = utmp.groupby(['User_id'], as_index = False).median()
u8.rename(columns = {'distance':'u_median_distance'}, inplace = True)
u8.head(2)
# In[35]:
# sanity check on the piece sizes before merging
u.shape, u1.shape, u2.shape, u3.shape, u4.shape, u5.shape, u6.shape, u7.shape, u8.shape
# In[36]:
# merge all the features on key User_id
user_feature = pd.merge(u, u1, on = 'User_id', how = 'left')
user_feature = pd.merge(user_feature, u2, on = 'User_id', how = 'left')
user_feature = pd.merge(user_feature, u3, on = 'User_id', how = 'left')
user_feature = pd.merge(user_feature, u4, on = 'User_id', how = 'left')
user_feature = pd.merge(user_feature, u5, on = 'User_id', how = 'left')
user_feature = pd.merge(user_feature, u6, on = 'User_id', how = 'left')
user_feature = pd.merge(user_feature, u7, on = 'User_id', how = 'left')
user_feature = pd.merge(user_feature, u8, on = 'User_id', how = 'left')
# In[37]:
# calculate rate; NaN counts propagate to NaN rates, zeroed by fillna below
user_feature['u_use_coupon_rate'] = user_feature['u_buy_with_coupon'].astype('float')/user_feature['u_coupon_count'].astype('float')
user_feature['u_buy_with_coupon_rate'] = user_feature['u_buy_with_coupon'].astype('float')/user_feature['u_buy_count'].astype('float')
user_feature = user_feature.fillna(0)
user_feature.head(2)
# In[38]:
# add user feature to data on key User_id
data2 = pd.merge(data, user_feature, on = 'User_id', how = 'left').fillna(0)
# split data2 into valid and train
train, valid = train_test_split(data2, test_size = 0.2, stratify = data2['label'], random_state=100)
# In[39]:
# model2
# Model 2 = base features + user features; pickle-cached like model 1.
predictors = original_feature + user_feature.columns.tolist()[1:]
print(len(predictors), predictors)
if not os.path.isfile('2_model.pkl'):
    model = check_model(train, predictors)
    print(model.best_score_)
    print(model.best_params_)
    with open('2_model.pkl', 'wb') as f:
        pickle.dump(model, f)
else:
    with open('2_model.pkl', 'rb') as f:
        model = pickle.load(f)
# In[40]:
# Validation performance of model 2: mean per-coupon AUC.
valid['pred_prob'] = model.predict_proba(valid[predictors])[:,1]
aucs = []
for coupon_id, group in valid.groupby(['Coupon_id']):
    if len(group['label'].unique()) != 2:
        continue  # AUC needs both classes present
    fpr, tpr, thresholds = roc_curve(group['label'], group['pred_prob'], pos_label=1)
    aucs.append(auc(fpr, tpr))
print(np.average(aucs))
# In[41]:
# key of merchant
m = fdf[['Merchant_id']].copy().drop_duplicates()
# In[42]:
# m_coupon_count : num of coupon from merchant
m1 = fdf[fdf['Date_received'] != 'null'][['Merchant_id']].copy()
m1['m_coupon_count'] = 1
m1 = m1.groupby(['Merchant_id'], as_index = False).count()
m1.head(2)
# In[43]:
# m_sale_count : num of sale from merchant (with or without coupon)
m2 = fdf[fdf['Date'] != 'null'][['Merchant_id']].copy()
m2['m_sale_count'] = 1
m2 = m2.groupby(['Merchant_id'], as_index = False).count()
m2.head(2)
# In[44]:
# m_sale_with_coupon : num of sale from merchant with coupon usage
m3 = fdf[(fdf['Date'] != 'null') & (fdf['Date_received'] != 'null')][['Merchant_id']].copy()
m3['m_sale_with_coupon'] = 1
m3 = m3.groupby(['Merchant_id'], as_index = False).count()
m3.head(2)
# In[45]:
# m_min/max/mean/median_distance over coupon sales; -1 (unknown) -> NaN
mtmp = fdf[(fdf['Date'] != 'null') & (fdf['Date_received'] != 'null')][['Merchant_id', 'distance']].copy()
mtmp.replace(-1, np.nan, inplace = True)
m4 = mtmp.groupby(['Merchant_id'], as_index = False).min()
m4.rename(columns = {'distance':'m_min_distance'}, inplace = True)
m5 = mtmp.groupby(['Merchant_id'], as_index = False).max()
m5.rename(columns = {'distance':'m_max_distance'}, inplace = True)
m6 = mtmp.groupby(['Merchant_id'], as_index = False).mean()
m6.rename(columns = {'distance':'m_mean_distance'}, inplace = True)
m7 = mtmp.groupby(['Merchant_id'], as_index = False).median()
m7.rename(columns = {'distance':'m_median_distance'}, inplace = True)
m7.head(2)
# In[46]:
# sanity check on the piece sizes before merging
m.shape, m1.shape, m2.shape, m3.shape, m4.shape, m5.shape, m6.shape, m7.shape
# In[47]:
# merge merchant feature pieces on key Merchant_id
merchant_feature = pd.merge(m, m1, on = 'Merchant_id', how = 'left')
merchant_feature = pd.merge(merchant_feature, m2, on = 'Merchant_id', how = 'left')
merchant_feature = pd.merge(merchant_feature, m3, on = 'Merchant_id', how = 'left')
merchant_feature = pd.merge(merchant_feature, m4, on = 'Merchant_id', how = 'left')
merchant_feature = pd.merge(merchant_feature, m5, on = 'Merchant_id', how = 'left')
merchant_feature = pd.merge(merchant_feature, m6, on = 'Merchant_id', how = 'left')
merchant_feature = pd.merge(merchant_feature, m7, on = 'Merchant_id', how = 'left')
merchant_feature = merchant_feature.fillna(0)
merchant_feature.head(5)
# In[48]:
# NOTE(review): counts were zero-filled above, so a merchant with sales but no
# coupons divides n/0 -> inf here; fillna(0) only replaces NaN (0/0), not inf.
merchant_feature['m_coupon_use_rate'] = merchant_feature['m_sale_with_coupon'].astype('float')/merchant_feature['m_coupon_count'].astype('float')
merchant_feature['m_sale_with_coupon_rate'] = merchant_feature['m_sale_with_coupon'].astype('float')/merchant_feature['m_sale_count'].astype('float')
merchant_feature = merchant_feature.fillna(0)
merchant_feature.head(2)
# In[49]:
# add merchant feature to data2
data3 = pd.merge(data2, merchant_feature, on = 'Merchant_id', how = 'left').fillna(0)
# split data3 into train/valid
train, valid = train_test_split(data3, test_size = 0.2, stratify = data3['label'], random_state=100)
# In[50]:
# model 3
# Model 3 = base + user + merchant features; pickle-cached.
predictors = original_feature + user_feature.columns.tolist()[1:] + merchant_feature.columns.tolist()[1:]
print(predictors)
if not os.path.isfile('3_model.pkl'):
    model = check_model(train, predictors)
    print(model.best_score_)
    print(model.best_params_)
    with open('3_model.pkl', 'wb') as f:
        pickle.dump(model, f)
else:
    with open('3_model.pkl', 'rb') as f:
        model = pickle.load(f)
# In[56]:
# Validation performance of model 3: mean per-coupon AUC.
valid['pred_prob'] = model.predict_proba(valid[predictors])[:,1]
aucs = []
for coupon_id, group in valid.groupby(['Coupon_id']):
    if len(group['label'].unique()) != 2:
        continue  # AUC needs both classes present
    fpr, tpr, thresholds = roc_curve(group['label'], group['pred_prob'], pos_label=1)
    aucs.append(auc(fpr, tpr))
print(np.average(aucs))
# In[57]:
# key of user and merchant
um = fdf[['User_id', 'Merchant_id']].copy().drop_duplicates()
# In[58]:
# um_count : total interactions per (user, merchant) pair.
# (This cell was garbled in the paste — the um2 cell's text was interleaved
#  into the um1 definition; reconstructed to match usermerchantFeature below.)
um1 = fdf[['User_id', 'Merchant_id']].copy()
um1['um_count'] = 1
um1 = um1.groupby(['User_id', 'Merchant_id'], as_index = False).count()
um1.head(2)
# In[59]:
# um_buy_count : purchases per (user, merchant) pair
um2 = fdf[fdf['Date'] != 'null'][['User_id', 'Merchant_id']].copy()
um2['um_buy_count'] = 1
um2 = um2.groupby(['User_id', 'Merchant_id'], as_index = False).count()
um2.head(2)
# In[60]:
# um_coupon_count : coupons received per (user, merchant) pair
um3 = fdf[fdf['Date_received'] != 'null'][['User_id', 'Merchant_id']].copy()
um3['um_coupon_count'] = 1
um3 = um3.groupby(['User_id', 'Merchant_id'], as_index = False).count()
um3.head(2)
# In[61]:
# um_buy_with_coupon : coupon purchases per (user, merchant) pair
um4 = fdf[(fdf['Date_received'] != 'null') & (fdf['Date'] != 'null')][['User_id', 'Merchant_id']].copy()
um4['um_buy_with_coupon'] = 1
um4 = um4.groupby(['User_id', 'Merchant_id'], as_index = False).count()
um4.head(2)
# In[62]:
# merge all user merchant
user_merchant_feature = pd.merge(um, um1, on = ['User_id','Merchant_id'], how = 'left')
user_merchant_feature = pd.merge(user_merchant_feature, um2, on = ['User_id','Merchant_id'], how = 'left')
user_merchant_feature = pd.merge(user_merchant_feature, um3, on = ['User_id','Merchant_id'], how = 'left')
user_merchant_feature = pd.merge(user_merchant_feature, um4, on = ['User_id','Merchant_id'], how = 'left')
user_merchant_feature = user_merchant_feature.fillna(0)
# NOTE(review): after the zero-fill, 0/0 yields NaN (zeroed below) but n/0
# would yield inf, which fillna does not touch.
user_merchant_feature['um_buy_rate'] = user_merchant_feature['um_buy_count'].astype('float')/user_merchant_feature['um_count'].astype('float')
user_merchant_feature['um_coupon_use_rate'] = user_merchant_feature['um_buy_with_coupon'].astype('float')/user_merchant_feature['um_coupon_count'].astype('float')
user_merchant_feature['um_buy_with_coupon_rate'] = user_merchant_feature['um_buy_with_coupon'].astype('float')/user_merchant_feature['um_buy_count'].astype('float')
user_merchant_feature = user_merchant_feature.fillna(0)
user_merchant_feature.head(2)
# In[63]:
# add user_merchant_feature to data3
data4 = pd.merge(data3, user_merchant_feature, on = ['User_id','Merchant_id'], how = 'left').fillna(0)
# split data4 into train/valid
train, valid = train_test_split(data4, test_size = 0.2, stratify = data4['label'], random_state=100)
# In[64]:
# Model 4 = base + user + merchant + user-merchant features; pickle-cached.
predictors = original_feature + user_feature.columns.tolist()[1:] + merchant_feature.columns.tolist()[1:] + user_merchant_feature.columns.tolist()[2:]
print(len(predictors),predictors)
if not os.path.isfile('4_model.pkl'):
    model = check_model(train, predictors)
    print(model.best_score_)
    print(model.best_params_)
    with open('4_model.pkl', 'wb') as f:
        pickle.dump(model, f)
else:
    with open('4_model.pkl', 'rb') as f:
        model = pickle.load(f)
# In[65]:
# Validation performance of model 4: mean per-coupon AUC.
valid['pred_prob'] = model.predict_proba(valid[predictors])[:,1]
aucs = []
for coupon_id, group in valid.groupby(['Coupon_id']):
    if len(group['label'].unique()) != 2:
        continue  # AUC needs both classes present
    fpr, tpr, thresholds = roc_curve(group['label'], group['pred_prob'], pos_label=1)
    aucs.append(auc(fpr, tpr))
print(np.average(aucs))
# In[66]:
def userFeature(df):
    """Aggregate per-user behaviour features from the feature window.

    Expects columns User_id, Merchant_id, Date_received, Date (the string
    'null' marks missing) and the numeric `distance` column (-1 = unknown).
    Returns one row per User_id with u_* count/distance/rate columns;
    missing aggregates are zero-filled.
    """
    received = df['Date_received'] != 'null'
    bought = df['Date'] != 'null'

    pieces = []
    # u_coupon_count : coupons received by the user
    piece = df.loc[received, ['User_id']].copy()
    piece['u_coupon_count'] = 1
    pieces.append(piece.groupby('User_id', as_index=False).count())
    # u_buy_count : offline purchases (with or without coupon)
    piece = df.loc[bought, ['User_id']].copy()
    piece['u_buy_count'] = 1
    pieces.append(piece.groupby('User_id', as_index=False).count())
    # u_buy_with_coupon : purchases made with a coupon
    piece = df.loc[received & bought, ['User_id']].copy()
    piece['u_buy_with_coupon'] = 1
    pieces.append(piece.groupby('User_id', as_index=False).count())
    # u_merchant_count : distinct merchants the user bought from
    piece = df.loc[bought, ['User_id', 'Merchant_id']].drop_duplicates()
    piece = piece.groupby('User_id', as_index=False).count()
    pieces.append(piece.rename(columns={'Merchant_id': 'u_merchant_count'}))
    # distance statistics over coupon purchases (-1 treated as missing)
    dist = df.loc[received & bought, ['User_id', 'distance']].replace(-1, np.nan)
    for how, name in (('min', 'u_min_distance'), ('max', 'u_max_distance'),
                      ('mean', 'u_mean_distance'), ('median', 'u_median_distance')):
        agg = dist.groupby('User_id', as_index=False).agg(how)
        pieces.append(agg.rename(columns={'distance': name}))

    user_feature = df[['User_id']].drop_duplicates().copy()
    for piece in pieces:
        user_feature = pd.merge(user_feature, piece, on='User_id', how='left')
    # Rates computed before the zero-fill: NaN counts give NaN rates -> 0.
    user_feature['u_use_coupon_rate'] = user_feature['u_buy_with_coupon'].astype('float')/user_feature['u_coupon_count'].astype('float')
    user_feature['u_buy_with_coupon_rate'] = user_feature['u_buy_with_coupon'].astype('float')/user_feature['u_buy_count'].astype('float')
    user_feature = user_feature.fillna(0)
    print(user_feature.columns.tolist())
    return user_feature
# In[67]:
def merchantFeature(df):
    """Aggregate per-merchant features from the feature window.

    Returns one row per Merchant_id with coupon/sale counts, distance
    statistics over coupon sales, and coupon-usage rates; zero-filled.
    """
    received = df['Date_received'] != 'null'
    sold = df['Date'] != 'null'

    pieces = []
    # m_coupon_count : coupons issued by the merchant
    piece = df.loc[received, ['Merchant_id']].copy()
    piece['m_coupon_count'] = 1
    pieces.append(piece.groupby('Merchant_id', as_index=False).count())
    # m_sale_count : all sales (with or without coupon)
    piece = df.loc[sold, ['Merchant_id']].copy()
    piece['m_sale_count'] = 1
    pieces.append(piece.groupby('Merchant_id', as_index=False).count())
    # m_sale_with_coupon : sales where a coupon was used
    piece = df.loc[received & sold, ['Merchant_id']].copy()
    piece['m_sale_with_coupon'] = 1
    pieces.append(piece.groupby('Merchant_id', as_index=False).count())
    # distance statistics over coupon sales (-1 treated as missing)
    dist = df.loc[received & sold, ['Merchant_id', 'distance']].replace(-1, np.nan)
    for how, name in (('min', 'm_min_distance'), ('max', 'm_max_distance'),
                      ('mean', 'm_mean_distance'), ('median', 'm_median_distance')):
        agg = dist.groupby('Merchant_id', as_index=False).agg(how)
        pieces.append(agg.rename(columns={'distance': name}))

    merchant_feature = df[['Merchant_id']].drop_duplicates().copy()
    for piece in pieces:
        merchant_feature = pd.merge(merchant_feature, piece, on='Merchant_id', how='left')
    # Rates computed before the zero-fill: NaN counts give NaN rates -> 0.
    merchant_feature['m_coupon_use_rate'] = merchant_feature['m_sale_with_coupon'].astype('float')/merchant_feature['m_coupon_count'].astype('float')
    merchant_feature['m_sale_with_coupon_rate'] = merchant_feature['m_sale_with_coupon'].astype('float')/merchant_feature['m_sale_count'].astype('float')
    merchant_feature = merchant_feature.fillna(0)
    print(merchant_feature.columns.tolist())
    return merchant_feature
# In[68]:
def usermerchantFeature(df):
    """Per (User_id, Merchant_id) pair: interaction/purchase/coupon counts
    and the derived rates. One row per pair; zero-filled."""
    received = df['Date_received'] != 'null'
    bought = df['Date'] != 'null'
    keys = ['User_id', 'Merchant_id']

    pieces = []
    for mask, name in ((None, 'um_count'),                     # every record
                       (bought, 'um_buy_count'),               # purchases
                       (received, 'um_coupon_count'),          # coupons received
                       (received & bought, 'um_buy_with_coupon')):
        piece = df[keys].copy() if mask is None else df.loc[mask, keys].copy()
        piece[name] = 1
        pieces.append(piece.groupby(keys, as_index=False).count())

    user_merchant_feature = df[keys].drop_duplicates().copy()
    for piece in pieces:
        user_merchant_feature = pd.merge(user_merchant_feature, piece, on=keys, how='left')
    # Zero-fill first (as the original cell did), then compute rates; any
    # remaining 0/0 NaNs are zeroed by the final fill.
    user_merchant_feature = user_merchant_feature.fillna(0)
    user_merchant_feature['um_buy_rate'] = user_merchant_feature['um_buy_count'].astype('float')/user_merchant_feature['um_count'].astype('float')
    user_merchant_feature['um_coupon_use_rate'] = user_merchant_feature['um_buy_with_coupon'].astype('float')/user_merchant_feature['um_coupon_count'].astype('float')
    user_merchant_feature['um_buy_with_coupon_rate'] = user_merchant_feature['um_buy_with_coupon'].astype('float')/user_merchant_feature['um_buy_count'].astype('float')
    user_merchant_feature = user_merchant_feature.fillna(0)
    print(user_merchant_feature.columns.tolist())
    return user_merchant_feature
# In[69]:
def featureProcess(feature, train, test):
    """Build user / merchant / user-merchant features from the feature
    window and left-join them onto both train and test (NaN -> 0)."""
    user_feature = userFeature(feature)
    merchant_feature = merchantFeature(feature)
    user_merchant_feature = usermerchantFeature(feature)

    def attach(frame):
        # Left joins keep every train/test row; keys unseen in the feature
        # window end up as zeros.
        frame = pd.merge(frame, user_feature, on='User_id', how='left')
        frame = pd.merge(frame, merchant_feature, on='Merchant_id', how='left')
        frame = pd.merge(frame, user_merchant_feature,
                         on=['User_id', 'Merchant_id'], how='left')
        return frame.fillna(0)

    return attach(train), attach(test)
# In[70]:
# repeat result above and process dftest data
feature = dfoff[(dfoff['Date'] < '20160516') | ((dfoff['Date'] == 'null') & (dfoff['Date_received'] < '20160516'))].copy()
data = dfoff[(dfoff['Date_received'] >= '20160516') & (dfoff['Date_received'] <= '20160615')].copy()
print(data['label'].value_counts())
# In[71]:
# features
# Full, explicit predictor list for the final models.
predictors = ['discount_rate', 'discount_man', 'discount_jian', 'discount_type', 'distance',
'weekday', 'weekday_1', 'weekday_2', 'weekday_3', 'weekday_4', 'weekday_5', 'weekday_6',
'weekday_7', 'weekday_type',
'u_coupon_count', 'u_buy_count', 'u_buy_with_coupon', 'u_merchant_count', 'u_min_distance',
'u_max_distance', 'u_mean_distance', 'u_median_distance', 'u_use_coupon_rate', 'u_buy_with_coupon_rate',
'm_coupon_count', 'm_sale_count', 'm_sale_with_coupon', 'm_min_distance', 'm_max_distance',
'm_mean_distance', 'm_median_distance', 'm_coupon_use_rate', 'm_sale_with_coupon_rate', 'um_count', 'um_buy_count',
'um_coupon_count', 'um_buy_with_coupon', 'um_buy_rate', 'um_coupon_use_rate', 'um_buy_with_coupon_rate']
print(len(predictors), predictors)
# In[72]:
# 80/20 stratified split of the feature-joined training frame.
# NOTE(review): `train` here is still data4 from the earlier split, not a
# frame rebuilt via featureProcess — confirm that is intended.
trainSub, validSub = train_test_split(train, test_size = 0.2, stratify = train['label'], random_state=100)
# In[73]:
# linear model
model = check_model(trainSub, predictors)
# In[74]:
# Mean per-coupon AUC of the linear model on the hold-out split.
validSub['pred_prob'] = model.predict_proba(validSub[predictors])[:,1]
aucs = []
for coupon_id, group in validSub.groupby(['Coupon_id']):
    if len(group['label'].unique()) != 2:
        continue  # AUC needs both classes present
    fpr, tpr, thresholds = roc_curve(group['label'], group['pred_prob'], pos_label=1)
    aucs.append(auc(fpr, tpr))
print(np.average(aucs))
# In[75]:
# LightGBM classifier. The original passed `sub_feature` (an alias of
# colsample_bytree, conflicting with the explicit value also given) and
# `early_stop` (not a valid parameter — early stopping needs an eval_set
# in fit()); 'logloss' is also not the canonical binary metric name.
model = lgb.LGBMClassifier(
    learning_rate = 0.01,
    boosting_type = 'gbdt',
    objective = 'binary',
    metric = 'binary_logloss',
    max_depth = 5,
    num_leaves = 3,
    colsample_bytree = 0.7,
    n_estimators = 5000,
    verbose = -1)
model.fit(trainSub[predictors], trainSub['label'])
# In[76]:
# Mean per-coupon AUC of the LightGBM model on the hold-out split.
validSub['pred_prob'] = model.predict_proba(validSub[predictors])[:,1]
aucs = []
for coupon_id, group in validSub.groupby(['Coupon_id']):
    if len(group['label'].unique()) != 2:
        continue  # AUC needs both classes present
    fpr, tpr, thresholds = roc_curve(group['label'], group['pred_prob'], pos_label=1)
    aucs.append(auc(fpr, tpr))
print(np.average(aucs))
# In[77]:
# test prediction for submission
# NOTE(review): `test` must be the feature-joined test frame (featureProcess
# output); dftest itself lacks the u_*/m_*/um_* columns. No visible cell
# assigns `test` — confirm it was produced before this point.
y_test_pred = model.predict_proba(test[predictors])
submit = test[['User_id','Coupon_id','Date_received']].copy()
submit['label'] = y_test_pred[:,1]
submit.to_csv('submit2.csv', index=False, header=False)
submit.head()
4.实战-《第1章 O2O优惠券使用新人赛数据发掘工程》
https://tianchi.aliyun.com/learn/live.html?spm=5176.11510297.4851204.5.54443c60L537Wv&classroomId=263#classroomId=263&postsId=23465
学会了通过可视化方法分析数据中的特征,借助图表等直观方法分析提取重要特征以及复合特征,发现不同特征之间的关系,为建模做准备。
注:原来的path ='datalab/59/'应对应改成path ='G:/code/o2o/',!!!注意斜杠方向
# NOTE: this second notebook was pasted with its newlines stripped, which is
# not valid Python; the line structure is reconstructed below.
# coding: utf-8
# In[1]:
import pandas as pd
import seaborn as sns
import numpy as np
from matplotlib import pyplot as plt
import matplotlib
from datetime import datetime
import warnings
warnings.filterwarnings("ignore")
get_ipython().magic('matplotlib inline')
# In[2]:
print(plt.style.available)
print('pandas version:',pd.__version__)
print('seaborn version:',sns.__version__)
print('matplotlib version:',matplotlib.__version__)
# In[7]:
path ='G:/code/o2o/'
# In[8]:
train = pd.read_csv(path+'ccf_offline_stage1_train.csv')
# In[9]:
train.head()
# In[10]:
train.info()
# In[11]:
train.isnull().sum()/len(train)
# In[12]:
len(set(train['User_id']))
# In[13]:
# Per-user record counts (the constant 'sum' column is a counting helper).
train['sum'] = 1
user_id_count = train.groupby(['User_id'], as_index = False)['sum'].agg({'count':np.sum})
# In[14]:
user_id_count.sort_values(['count'],ascending=0)[:20]
# In[15]:
def user_count(data):
    """Bucket a user's record count into activity bands 0..3."""
    if data > 10:
        return 3
    elif data > 5:
        return 2
    elif data > 1:
        return 1
    else:
        return 0
# In[16]:
user_id_count['user_range'] = user_id_count['count'].map(user_count)
# In[17]:
sns.set(font_scale=1.2,style="white")
f,ax=plt.subplots(1,2,figsize=(17.5,8))
user_id_count['user_range'].value_counts().plot.pie(explode=[0,0.1,0.1,0.1],autopct='%1.1f%%',ax=ax[0])
plt.ylim([0, 300000])
ax[0].set_title('user_range_ratio')
ax[0].set_ylabel('')
sns.despine()
sns.countplot('user_range',data=user_id_count,ax=ax[1])
plt.show()
# In[18]:
Mer_id_count = train.groupby(['Merchant_id'], as_index = False)['sum'].agg({'count':np.sum})
# In[19]:
Mer_id_count.sort_values(['count'],ascending=0)[:5]
# In[20]:
def Mer_count(data):
    """Bucket a merchant's record count into volume bands 0..3."""
    if data > 1000:
        return 3
    elif data > 100:
        return 2
    elif data > 20:
        return 1
    else:
        return 0
# In[21]:
Mer_id_count['mer_range'] = Mer_id_count['count'].map(Mer_count)
# In[22]:
sns.set(font_scale=1.2,style="white")
f,ax=plt.subplots(1,2,figsize=(17.5,8))
Mer_id_count['mer_range'].value_counts().plot.pie(explode=[0,0.1,0.1,0.1],autopct='%1.1f%%',ax=ax[0])
plt.ylim([0, 6000])
ax[0].set_title('Mer_range_ratio')
ax[0].set_ylabel('')
sns.despine()
sns.countplot('mer_range',data=Mer_id_count,ax=ax[1])
plt.show()
# In[23]:
print('有优惠券的数据为', train[(train['Coupon_id'] != 'null')].shape[0])
print('无优惠券的数据为', train[(train['Coupon_id'] == 'null')].shape[0])
# In[24]:
train1 = train[(train['Coupon_id'] != 'null')]
Cou_id_count = train1.groupby(['Coupon_id'], as_index = False)['sum'].agg({'count':np.sum})
# In[25]:
Cou_id_count.sort_values(['count'],ascending=0)[:5]
# In[26]:
def Cou_count(data):
    """Bucket a coupon's record count into popularity bands 0..3."""
    if data > 1000:
        return 3
    elif data > 100:
        return 2
    elif data > 10:
        return 1
    else:
        return 0
# In[27]:
Cou_id_count['Cou_range'] = Cou_id_count['count'].map(Cou_count)
# In[28]:
sns.set(font_scale=1.2,style="white")
f,ax=plt.subplots(1,2,figsize=(17.5,8))
Cou_id_count['Cou_range'].value_counts().plot.pie(explode=[0,0.1,0.1,0.1],autopct='%1.1f%%',ax=ax[0])
plt.ylim([0, 7000])
ax[0].set_title('Cou_range_ratio')
ax[0].set_ylabel('')
sns.despine()
sns.countplot('Cou_range',data=Cou_id_count,ax=ax[1])
plt.show()
# In[29]:
def convertRate(row):
    """Convert discount to rate"""
    if row == 'null':
        return 1.0
    elif ':' in row:
        rows = row.split(':')
        return np.round(1.0 - float(rows[1])/float(rows[0]),2)
    else:
        return float(row)
# In[30]:
train['discount_rate'] = train['Discount_rate'].apply(convertRate)
# In[31]:
print('Discount_rate 类型:',train['discount_rate'].unique())
# In[32]:
sns.set(style="white")
sns.factorplot(x="discount_rate", data=train, kind="count", size=6, aspect=2,color='#ad9ee8')
# In[33]:
sns.set(style="white")
sns.factorplot(x="Distance", data=train, kind="count", size=6, aspect=2)
# In[34]:
couponbydate = train[train['Date_received'] != 'null'][['Date_received', 'Date']].groupby(['Date_received'], as_index=False).count()
couponbydate.columns = ['Date_received','count']
buybydate = train[(train['Date'] != 'null') & (train['Date_received'] != 'null')][['Date_received', 'Date']].groupby(['Date_received'], as_index=False).count()
buybydate.columns = ['Date_received','count']
# In[35]:
date_buy = train['Date'].unique()
date_buy = sorted(date_buy[date_buy != 'null'])
date_received = train['Date_received'].unique()
date_received = sorted(date_received[date_received != 'null'])
sns.set_style('ticks')
sns.set_context("notebook", font_scale= 1.4)
plt.figure(figsize = (12,8))
date_received_dt = pd.to_datetime(date_received, format='%Y%m%d')
plt.subplot(211)
plt.bar(date_received_dt, couponbydate['count'], label = 'number of coupon received',color='#a675a1')
plt.bar(date_received_dt, buybydate['count'], label = 'number of coupon used',color='#75a1a6')
plt.yscale('log')
plt.ylabel('Count')
plt.legend()
plt.subplot(212)
plt.bar(date_received_dt, buybydate['count']/couponbydate['count'],color='#62a5de')
plt.ylabel('Ratio(coupon used/coupon received)')
plt.tight_layout()
# In[36]:
um_count = train.groupby(['User_id','Merchant_id'], as_index = False)['sum'].agg({'count':np.sum})
# In[37]:
um_count.head(5)
# In[38]:
train1 = train[train['Coupon_id'] != 'null']
# In[39]:
uc_count = train1.groupby(['User_id','Coupon_id'], as_index = False)['sum'].agg({'count':np.sum})
# In[40]:
uc_count.head(5)
# In[41]:
train['discount_rate'] = train['Discount_rate'].apply(convertRate)
# In[42]:
ur_count = train.groupby(['User_id','discount_rate'], as_index = False)['sum'].agg({'count':np.sum})
# In[43]:
ur_count.head(5)
# In[44]:
ud_count = train.groupby(['User_id','Distance'], as_index = False)['sum'].agg({'count':np.sum})
# In[45]:
ud_count = train.groupby(['User_id','Distance'], as_index = False)['sum'].agg({'count':np.sum})
# In[46]:
from datetime import date
def getWeekday(row):
    """Map a 'yyyymmdd' string to weekday 1..7; pass 'null' through."""
    if row == 'null':
        return row
    else:
        return date(int(row[0:4]), int(row[4:6]), int(row[6:8])).weekday() + 1
# In[47]:
train['received_weekday'] = train['Date_received'].astype(str).apply(getWeekday)
train['consume_weekday'] = train['Date'].astype(str).apply(getWeekday)
# weekday_type : 1 for weekend (Sat/Sun), 0 otherwise
train['received_weekday_type'] = train['received_weekday'].apply(lambda x : 1 if x in [6,7] else 0 )
train['consume_weekday_type'] = train['consume_weekday'].apply(lambda x : 1 if x in [6,7] else 0 ) # In[48]: u_week_count = train.groupby(['User_id','received_weekday'], as_index = False)['sum'].agg({'count':np.sum}) u_week_type_count = train.groupby(['User_id','received_weekday_type'], as_index = False)['sum'].agg({'count':np.sum}) # In[49]: u_week_count.head(5) # In[50]: train1 = train[train['Date'] != 'null'] u_con_week_count = train1.groupby(['User_id','consume_weekday'], as_index = False)['sum'].agg({'count':np.sum}) u_con_week_type_count = train1.groupby(['User_id','consume_weekday_type'], as_index = False)['sum'].agg({'count':np.sum}) # In[51]: u_con_week_count.head(10)
5.学习《天池o2o参考代码(0.81分数)》、《天池O2O优惠券代码分享》和《第1章 新手代码【视频直播】:O2O优惠券使用预测赛实战演练》的代码,理解了比赛的数据评价标准,并学会了利用滑窗法、pandas 内置函数提取特征等方法来优化预测准确率。
https://tianchi.aliyun.com/learn/live.html?spm=5176.11510297.4851204.9.54443c60L537Wv&classroomId=263#classroomId=263&postsId=23468
https://tianchi.aliyun.com/learn/live.html?spm=5176.11510297.4851204.7.54443c60L537Wv&classroomId=263#classroomId=263&postsId=23467
https://tianchi.aliyun.com/learn/live.html?spm=5176.11510297.4851204.11.54443c60L537Wv&classroomId=263#classroomId=263&postsId=23521
《天池o2o参考代码(0.81分数)》代码参考如下:
def get_offline_features(X: pd.DataFrame, offline: pd.DataFrame) -> pd.DataFrame:
    """Build offline-history features for the O2O coupon-usage model.

    Parameters
    ----------
    X : pd.DataFrame
        Samples to enrich. Must contain User_id, Merchant_id, Coupon_id,
        Discount_rate, Distance and a datetime64 Date_received column
        (``.dt`` accessors are used on it).
    offline : pd.DataFrame
        Offline log used for the aggregate statistics. Date and
        Date_received are expected to be datetime64 (``.notnull()`` and
        ``.dt`` are used), and a numeric ``discount`` column must exist.

    Returns
    -------
    pd.DataFrame
        ``X`` with user (u*), merchant (m*), coupon (c*), user-merchant
        (um*) and within-window (o*) feature columns merged in.
    """
    print(X.shape)
    # Partition the log: rows with a coupon, further split into
    # redeemed (Date present) vs. received-but-unused (Date missing).
    coupon_id_notnull = offline[offline.Coupon_id.notnull()]
    temp = coupon_id_notnull
    coupon_consume = temp[temp.Date.notnull()]
    coupon_no_consume = temp[temp.Date.isnull()]
    user_coupon_consume = coupon_consume.groupby('User_id')

    # Calendar features of the receive date.
    X['weekday'] = X.Date_received.dt.weekday
    X['day'] = X.Date_received.dt.day

    # distance_0: coupon redemptions at this user-merchant distance
    temp = coupon_consume.groupby('Distance').size().to_frame('distance_0')
    X = pd.merge(X, temp, how='left', on='Distance')
    # distance_1: coupons received but unused at this distance
    temp = coupon_no_consume.groupby('Distance').size().to_frame('distance_1')
    X = pd.merge(X, temp, how='left', on='Distance')
    # distance_2: total coupons received at this distance
    X['distance_2'] = X.distance_0 + X.distance_1
    # distance_3: redemption rate at this distance
    X['distance_3'] = X.distance_0 / X.distance_2

    '''user features'''
    # u2: number of coupons the user redeemed
    temp = user_coupon_consume.size().to_frame('u2')
    X = pd.merge(X, temp, how='left', on='User_id')
    # u3: coupons the user received but never used
    temp = coupon_no_consume.groupby('User_id').size().to_frame('u3')
    X = pd.merge(X, temp, how='left', on='User_id')
    # u19: redeemed-to-unredeemed ratio
    X['u19'] = X.u2 / X.u3
    # u1: total coupons received
    X['u1'] = X.u2.fillna(0) + X.u3.fillna(0)
    # u4: user's coupon redemption rate
    X['u4'] = X.u2 / X.u1
    # u5: purchases made without a coupon
    date_received_isnull_date_notnull = offline[offline.Date_received.isnull() & offline.Date.notnull()]
    temp = date_received_isnull_date_notnull.groupby('User_id').size().to_frame('u5')
    X = pd.merge(X, temp, how='left', on='User_id')
    # u25: total purchases (with + without coupon)
    X['u25'] = X.u2.fillna(0) + X.u5.fillna(0)
    # u20: share of the user's purchases made with a coupon
    X['u20'] = X.u2 / X.u25
    # u26: average days between any two consecutive purchases
    # ((max - min) span divided by purchase count - 1)
    temp = offline[offline.Date.notnull()].copy()
    temp['_max'] = temp.groupby('User_id').Date.transform('max')
    temp['_min'] = temp.groupby('User_id').Date.transform('min')
    temp['_len'] = temp.groupby('User_id').Date.transform('count')
    temp['u26'] = (temp._max - temp._min).dt.days / (temp._len - 1)
    temp.drop_duplicates('User_id', inplace=True)
    X = pd.merge(X, temp[['User_id', 'u26']], how='left', on='User_id')
    # u6: average days between coupon-less purchases
    temp = date_received_isnull_date_notnull.copy()
    tmp = temp.groupby('User_id').Date
    temp['_max'] = tmp.transform('max')
    temp['_min'] = tmp.transform('min')
    temp['_count'] = tmp.transform('count')
    temp['u6'] = (temp._max - temp._min).dt.days / (temp._count - 1)
    temp.drop_duplicates('User_id', inplace=True)
    X = pd.merge(X, temp[['User_id', 'u6']], how='left', on='User_id')
    # u7: average days between coupon purchases
    temp = coupon_consume.copy()
    temp1 = temp.groupby('User_id').Date
    temp['_max'] = temp1.transform('max')
    temp['_min'] = temp1.transform('min')
    temp['_count'] = temp1.transform('count')
    temp['u7'] = (temp._max - temp._min).dt.days / (temp._count - 1)
    temp.drop_duplicates('User_id', inplace=True)
    X = pd.merge(X, temp[['User_id', 'u7']], how='left', on='User_id')
    # u8: NOTE(review) — the original comment says "average coupon-less
    # purchases per 15 days", but the code divides the average *interval*
    # u6 by 15 (the inverse quantity). Kept as-is; confirm intent.
    X['u8'] = X.u6 / 15
    # u9: same caveat as u8, for coupon purchases (u7 / 15).
    X['u9'] = X.u7 / 15
    # u10: average days from receiving a coupon to redeeming it
    temp = coupon_consume.copy()
    temp['days'] = (temp.Date - temp.Date_received).dt.days
    temp = (temp.groupby('User_id').days.sum() / temp.groupby('User_id').size()).to_frame('u10')
    X = pd.merge(X, temp, how='left', on='User_id')
    # u11: u10 scaled by the 15-day redemption window
    X['u11'] = X.u10 / 15
    # u21: redemptions that happened within 15 days of receipt
    temp = coupon_consume.copy()
    temp['days'] = (temp.Date - temp.Date_received).dt.days
    temp = temp[temp.days <= 15]
    temp = temp.groupby('User_id').size().to_frame('u21')
    X = pd.merge(X, temp, how='left', on='User_id')
    # u22: 15-day redemptions over all redemptions
    X['u22'] = X.u21 / X.u2
    # u23: 15-day redemptions over unredeemed receipts
    X['u23'] = X.u21 / X.u3
    # u24: 15-day redemptions over all receipts
    X['u24'] = X.u21 / X.u1
    # u45: mean discount rate among the user's redeemed coupons
    temp = user_coupon_consume.discount.mean().to_frame('u45')
    X = pd.merge(X, temp, how='left', on='User_id')
    # u27: minimum discount rate among redeemed coupons
    temp = user_coupon_consume.discount.min().to_frame('u27')
    X = pd.merge(X, temp, how='left', on='User_id')
    # u28: maximum discount rate among redeemed coupons
    temp = user_coupon_consume.discount.max().to_frame('u28')
    X = pd.merge(X, temp, how='left', on='User_id')
    # u32: number of distinct coupons the user has redeemed
    temp = coupon_consume.groupby(['User_id', 'Coupon_id']).size()
    temp = temp.groupby('User_id').size().to_frame('u32')
    X = pd.merge(X, temp, how='left', on='User_id')
    # u47: NOTE(review) — the original comment says "number of distinct
    # coupons the user received", but the merge keys on
    # (User_id, Coupon_id), so u47 is the receipt count of *this* pair.
    date_received_notnull = offline[offline.Date_received.notnull()]
    temp = date_received_notnull.groupby(['User_id', 'Coupon_id']).size().to_frame('u47')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Coupon_id'])
    # u33: distinct redeemed coupons over u47 (see u47 caveat)
    X['u33'] = X.u32 / X.u47
    # u34: redemptions over u47 (see u47 caveat)
    X['u34'] = X.u2 / X.u47
    # u35: average user-merchant distance over the user's redemptions
    temp = offline[offline.Date_received.notnull() & offline.Date.notnull()].copy()
    temp1 = temp.groupby('User_id').Distance
    temp['_count'] = temp1.transform('count')
    temp['_sum'] = temp1.transform('sum')
    temp['u35'] = temp._sum / temp._count
    temp = temp[['User_id', 'u35']].copy()
    temp.drop_duplicates(inplace=True)
    X = pd.merge(X, temp, how='left', on='User_id')
    # u36: minimum user-merchant distance among redemptions
    temp = coupon_consume[coupon_consume.Distance.notnull()]
    temp = temp.groupby('User_id').Distance.min().to_frame('u36')
    X = pd.merge(X, temp, how='left', on='User_id')
    # u37: maximum user-merchant distance among redemptions
    temp = coupon_consume[coupon_consume.Distance.notnull()]
    temp = temp.groupby('User_id').Distance.max().to_frame('u37')
    X = pd.merge(X, temp, how='left', on='User_id')
    # Encode each known Discount_rate string as an integer category;
    # unknown values stay -1.
    discount_types = [
        '0.2', '0.5', '0.6', '0.7', '0.75', '0.8', '0.85', '0.9', '0.95', '30:20', '50:30', '10:5',
        '20:10', '100:50', '200:100', '50:20', '30:10', '150:50', '100:30', '20:5', '200:50', '5:1',
        '50:10', '100:20', '150:30', '30:5', '300:50', '200:30', '150:20', '10:1', '50:5', '100:10',
        '200:20', '300:30', '150:10', '300:20', '500:30', '20:1', '100:5', '200:10', '30:1', '150:5',
        '300:10', '200:5', '50:1', '100:1',
    ]
    X['discount_type'] = -1
    for k, v in enumerate(discount_types):
        X.loc[X.Discount_rate == v, 'discount_type'] = k
    # u41: receipts per (user, discount type)
    temp = offline.groupby(['User_id', 'Discount_rate']).size().to_frame('u41')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Discount_rate'])
    # u42: redemptions per (user, discount type)
    temp = coupon_consume.groupby(['User_id', 'Discount_rate']).size().to_frame('u42')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Discount_rate'])
    # u43: non-redemptions per (user, discount type)
    temp = coupon_no_consume.groupby(['User_id', 'Discount_rate']).size().to_frame('u43')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Discount_rate'])
    # u44: redemption rate per (user, discount type)
    X['u44'] = X.u42 / X.u41
    # u48: full-reduction ("x:y") coupons the user received
    temp = offline[offline.Discount_rate.str.contains(':') == True]
    temp = temp.groupby('User_id').size().to_frame('u48')
    X = pd.merge(X, temp, how='left', on='User_id')
    # u49: direct-discount ("0.x") coupons the user received
    temp = offline[offline.Discount_rate.str.contains('\.') == True]
    temp = temp.groupby('User_id').size().to_frame('u49')
    X = pd.merge(X, temp, how='left', on='User_id')

    '''offline merchant features'''
    # m0: merchant's total sales
    temp = offline[offline.Date.notnull()].groupby('Merchant_id').size().to_frame('m0')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # m1: merchant's coupon redemptions
    temp = coupon_consume.groupby('Merchant_id').size().to_frame('m1')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # m2: merchant's coupon-less sales
    X['m2'] = X.m0.fillna(0) - X.m1.fillna(0)
    # m3: merchant's coupons that were received
    temp = date_received_notnull.groupby('Merchant_id').size().to_frame('m3')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # m4: merchant's coupon redemption rate
    X['m4'] = X.m1 / X.m3
    # m7: merchant's received-but-unused coupons
    temp = coupon_no_consume.groupby('Merchant_id').size().to_frame('m7')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # m16: merchant's total row count in the log
    temp = offline.groupby('Merchant_id').size().to_frame('m16')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # m5: receipts of this merchant's coupons on this day (computed on X)
    temp = X[X.Date_received.notnull()]
    temp = temp.groupby(['Merchant_id', 'Date_received']).size().to_frame('m5')
    X = pd.merge(X, temp, how='left', on=['Merchant_id', 'Date_received'])
    # m6: distinct users receiving this merchant's coupons that day
    temp = X[X.Date_received.notnull()]
    temp = temp.groupby(['User_id', 'Merchant_id', 'Date_received']).size()
    temp = temp.groupby(['Merchant_id', 'Date_received']).size().to_frame('m6')
    X = pd.merge(X, temp, how='left', on=['Merchant_id', 'Date_received'])
    # m8: mean discount rate among the merchant's redeemed coupons
    temp = coupon_consume.groupby('Merchant_id').discount.mean().to_frame('m8')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # m9: maximum redeemed discount rate. NOTE(review) — the original
    # comments label m9 as "min" and m10 as "max", but the code computes
    # max for m9 and min for m10; comments here follow the code.
    temp = coupon_consume.groupby('Merchant_id').discount.max().to_frame('m9')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # m10: minimum redeemed discount rate (see m9 note)
    temp = coupon_consume.groupby('Merchant_id').discount.min().to_frame('m10')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # m11: distinct users who redeemed this merchant's coupons
    temp = coupon_consume.groupby(['Merchant_id', 'User_id']).size()
    temp = temp.groupby('Merchant_id').size().to_frame('m11')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # m12: distinct users who received this merchant's coupons
    temp = date_received_notnull.groupby(['Merchant_id', 'User_id']).size()
    temp = temp.groupby('Merchant_id').size().to_frame('m12')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # m13: redeeming users over receiving users
    X['m13'] = X.m11 / X.m12
    # m14: average redemptions per receiving user
    X['m14'] = X.m1 / X.m12
    # m15: distinct coupons of the merchant that were redeemed
    temp = coupon_consume.groupby(['Merchant_id', 'Coupon_id']).size()
    temp = temp.groupby('Merchant_id').size().to_frame('m15')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # m18: distinct coupons of the merchant that were received
    temp = date_received_notnull.groupby(['Merchant_id', 'Coupon_id']).size()
    temp = temp.groupby('Merchant_id').count().to_frame('m18')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # m19: redeemed distinct coupons over received distinct coupons
    X['m19'] = X.m15 / X.m18
    # m20: average days between the merchant's redemptions
    temp = coupon_consume.copy()
    temp1 = temp.groupby('Merchant_id').Date
    temp['_max'] = temp1.transform('max')
    temp['_min'] = temp1.transform('min')
    temp['_count'] = temp1.transform('count')
    temp['m20'] = (temp._max - temp._min).dt.days / (temp._count - 1)
    temp.drop_duplicates('Merchant_id', inplace=True)
    X = pd.merge(X, temp[['Merchant_id', 'm20']], how='left', on='Merchant_id')
    # m21: average user distance among the merchant's redemptions
    temp = coupon_consume[coupon_consume.Distance.notnull()].copy()
    temp1 = temp.groupby('Merchant_id').Distance
    temp['_count'] = temp1.transform('count')
    temp['_sum'] = temp1.transform('sum')
    temp['m21'] = temp._sum / temp._count
    temp = temp[['Merchant_id', 'm21']].copy()
    temp.drop_duplicates(inplace=True)
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # m22: minimum user distance among the merchant's redemptions
    temp = coupon_consume[coupon_consume.Distance.notnull()]
    temp = temp.groupby('Merchant_id').Distance.min().to_frame('m22')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # m23: maximum user distance among the merchant's redemptions
    temp = coupon_consume[coupon_consume.Distance.notnull()]
    temp = temp.groupby('Merchant_id').Distance.max().to_frame('m23')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # m24: average user distance for the merchant's coupon-less sales
    temp = offline[offline.Date_received.isnull() & offline.Date.notnull()].copy()
    temp1 = temp.groupby('Merchant_id').Distance
    temp['_count'] = temp1.transform('count')
    temp['_sum'] = temp1.transform('sum')
    temp['m24'] = temp._sum / temp._count
    temp = temp[['Merchant_id', 'm24']].copy()
    temp.drop_duplicates(inplace=True)
    X = pd.merge(X, temp, how='left', on='Merchant_id')

    """offline coupon features"""
    # c1: receipts per coupon
    temp = coupon_id_notnull.groupby('Coupon_id').size().to_frame('c1')
    X = pd.merge(X, temp, how='left', on='Coupon_id')
    # c2: redemptions per coupon
    temp = coupon_consume.groupby('Coupon_id').size().to_frame('c2')
    X = pd.merge(X, temp, how='left', on='Coupon_id')
    # c3: per-coupon redemption rate
    X['c3'] = X.c2 / X.c1
    # c4: receipts never redeemed
    X['c4'] = X.c1 - X.c2
    # c5: receipts of this coupon on this day (computed on X)
    temp = X.groupby(['Coupon_id', 'Date_received']).size().to_frame('c5')
    X = pd.merge(X, temp, how='left', on=['Coupon_id', 'Date_received'])
    # c6: coupon kind — 0 direct discount, 1 full-reduction ("x:y")
    X['c6'] = 0
    X.loc[X.Discount_rate.str.contains(':') == True, 'c6'] = 1
    # c8: receipts per discount type
    temp = offline.groupby('Discount_rate').size().to_frame('c8')
    X = pd.merge(X, temp, how='left', on='Discount_rate')
    # c9: redemptions per discount type
    temp = coupon_consume.groupby('Discount_rate').size().to_frame('c9')
    X = pd.merge(X, temp, how='left', on='Discount_rate')
    # c10: non-redemptions per discount type
    temp = coupon_no_consume.groupby('Discount_rate').size().to_frame('c10')
    X = pd.merge(X, temp, how='left', on='Discount_rate')
    # c11: redemption rate per discount type
    X['c11'] = X.c9 / X.c8
    # c12: average days between redemptions of each coupon
    temp = coupon_consume.copy()
    temp1 = temp.groupby('Coupon_id').Date
    temp['_max'] = temp1.transform('max')
    temp['_min'] = temp1.transform('min')
    temp['_count'] = temp1.transform('count')
    temp['c12'] = (temp._max - temp._min).dt.days / (temp._count - 1)
    temp.drop_duplicates('Coupon_id', inplace=True)
    X = pd.merge(X, temp[['Coupon_id', 'c12']], how='left', on='Coupon_id')

    '''user merchant feature'''
    # um1: coupons the user received from this merchant
    temp = coupon_id_notnull
    temp = temp.groupby(['User_id', 'Merchant_id']).size().to_frame('um1')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
    # um2: received but not redeemed at this merchant
    temp = coupon_no_consume.groupby(['User_id', 'Merchant_id']).size().to_frame('um2')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
    # um3: redeemed at this merchant
    temp = coupon_consume.groupby(['User_id', 'Merchant_id']).size().to_frame('um3')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
    # um4: per-merchant redemption rate for the user
    X['um4'] = X.um3 / X.um1
    # um5: non-redemptions at this merchant over the user's total
    # non-redemptions (the helper 'temp' column is dropped afterwards)
    temp = coupon_no_consume.groupby('User_id').size().to_frame('temp')
    X = pd.merge(X, temp, how='left', on='User_id')
    X['um5'] = X.um2 / X.temp
    X.drop(columns='temp', inplace=True)
    # um6: total purchases at this merchant
    temp = offline[offline.Date.notnull()].groupby(['User_id', 'Merchant_id']).size().to_frame('um6')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
    # um7: coupon-less purchases at this merchant
    temp = date_received_isnull_date_notnull
    temp = temp.groupby(['User_id', 'Merchant_id']).size().to_frame('um7')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
    # um8: coupons received from this merchant on this day
    temp = date_received_notnull
    temp = temp.groupby(['User_id', 'Merchant_id', 'Date_received']).size().to_frame('um8')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id', 'Date_received'])
    # um9: distinct merchants the user received coupons from
    temp = coupon_id_notnull
    temp = temp.groupby(['User_id', 'Merchant_id']).size().reset_index()
    temp = temp.groupby('User_id').size().to_frame('um9')
    X = pd.merge(X, temp, how='left', on='User_id')
    # um10: distinct merchants the user redeemed coupons at
    temp = coupon_consume.groupby(['User_id', 'Merchant_id']).size()
    temp = temp.groupby('User_id').size().to_frame('um10')
    X = pd.merge(X, temp, how='left', on='User_id')
    # um11: redeemed-merchant count over received-merchant count
    X['um11'] = X.um10 / X.um9
    # um12: average redemptions per merchant
    X['um12'] = X.u2 / X.um9

    '''other feature'''
    # The o* features are computed on the prediction window X itself,
    # not on the offline history.
    # o1: user's receipts within X
    temp = X.groupby('User_id').size().to_frame('o1')
    X = pd.merge(X, temp, how='left', on='User_id')
    # o2: user's receipts of each coupon within X
    temp = X.groupby(['User_id', 'Coupon_id']).size().to_frame('o2')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Coupon_id'])
    # o3/o4: coupons the user received before / after this row
    # (cumulative count over rows sorted by receive date)
    X['o3'] = 1
    X['o3'] = X.sort_values(by=['User_id', 'Date_received']).groupby('User_id').o3.cumsum() - 1
    X['o4'] = 1
    X['o4'] = X.sort_values(by=['User_id', 'Date_received'], ascending=False).groupby('User_id').o4.cumsum() - 1
    # o5/o6: same before/after counts, ordered per individual coupon
    X['o5'] = 1
    temp = X.sort_values(by=['User_id', 'Coupon_id', 'Date_received'])
    X['o5'] = temp.groupby('User_id').o5.cumsum() - 1
    X['o6'] = 1
    temp = X.sort_values(by=['User_id', 'Coupon_id', 'Date_received'], ascending=False)
    X['o6'] = temp.groupby('User_id').o6.cumsum() - 1
    # o7: average days between the user's receipts within X
    temp = X.copy()
    temp1 = temp.groupby('User_id').Date_received
    temp['_max'] = temp1.transform('max')
    temp['_min'] = temp1.transform('min')
    temp['_count'] = temp1.transform('count')
    temp['o7'] = (temp._max - temp._min).dt.days / (temp._count - 1)
    temp.drop_duplicates('User_id', inplace=True)
    X = pd.merge(X, temp[['User_id', 'o7']], how='left', on='User_id')
    # o8: receipts per (user, merchant) within X
    temp = X.groupby(['User_id', 'Merchant_id']).size().to_frame('o8')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Merchant_id'])
    # o9: distinct merchants per user within X
    temp = X.groupby(['User_id', 'Merchant_id']).size()
    temp = temp.groupby('User_id').size().to_frame('o9')
    X = pd.merge(X, temp, how='left', on='User_id')
    # o10: user's receipts on the same day
    temp = X.groupby(['User_id', 'Date_received']).size().to_frame('o10')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Date_received'])
    # o11: user's receipts of this coupon on the same day
    temp = X.groupby(['User_id', 'Coupon_id', 'Date_received']).size().to_frame('o11')
    X = pd.merge(X, temp, how='left', on=['User_id', 'Coupon_id', 'Date_received'])
    # o12: distinct coupons per user within X
    temp = X.groupby(['User_id', 'Coupon_id']).size()
    temp = temp.groupby('User_id').size().to_frame('o12')
    X = pd.merge(X, temp, how='left', on='User_id')
    # o13: receipts per merchant within X
    temp = X.groupby('Merchant_id').size().to_frame('o13')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # o14: receipts per (merchant, coupon) within X
    temp = X.groupby(['Merchant_id', 'Coupon_id']).size().to_frame('o14')
    X = pd.merge(X, temp, how='left', on=['Merchant_id', 'Coupon_id'])
    # o15: distinct users per merchant within X
    temp = X.groupby(['Merchant_id', 'User_id']).size()
    temp = temp.groupby('Merchant_id').size().to_frame('o15')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # o16: distinct coupons per merchant within X
    temp = X.groupby(['Merchant_id', 'Coupon_id']).size()
    temp = temp.groupby('Merchant_id').size().to_frame('o16')
    X = pd.merge(X, temp, how='left', on='Merchant_id')
    # o17/o18: days since the previous / until the next receipt by the user
    temp = X.sort_values(by=['User_id', 'Date_received']).groupby('User_id')
    X['o17'] = temp.Date_received.diff().dt.days
    X['o18'] = temp.Date_received.diff(-1).dt.days.abs()

    print(X.shape)
    return X