电商推荐系统架构 (E-commerce recommendation system architecture)
import numpy as np
import pandas as pd
from tqdm import tqdm
import string
import re
import gensim
from collections import Counter
import pickle
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import KFold
from keras.preprocessing import text, sequence
import warnings
warnings.filterwarnings('ignore')
import lightgbm as lgb
from functools import partial
import os
import gc
from scipy.sparse import vstack
import time
import datetime
import joblib
import multiprocessing as mp
import seaborn as sns
%matplotlib inline
action_data = pd.read_csv('./data.csv')
action_data.head()
|
user_log_acct |
item_sku_id |
action_time |
action_type |
brand_code |
shop_id |
item_third_cate_cd |
vender_id |
shop_score |
age |
sex |
user_level |
province |
city |
county |
0 |
937922 |
357022 |
2020-02-04 08:28:15 |
1 |
1791.0 |
8703.0 |
10.0 |
5227.0 |
-1.000000 |
5.0 |
1.0 |
5 |
11.0 |
348.0 |
1782.0 |
1 |
937922 |
73 |
2020-02-04 08:27:07 |
1 |
1791.0 |
8703.0 |
10.0 |
5227.0 |
-1.000000 |
5.0 |
1.0 |
5 |
11.0 |
348.0 |
1782.0 |
2 |
937922 |
29583 |
2020-02-04 08:26:31 |
1 |
1791.0 |
2738.0 |
10.0 |
3436.0 |
9.206167 |
5.0 |
1.0 |
5 |
11.0 |
348.0 |
1782.0 |
3 |
937922 |
108763 |
2020-02-04 08:26:10 |
1 |
1791.0 |
2738.0 |
10.0 |
3436.0 |
9.206167 |
5.0 |
1.0 |
5 |
11.0 |
348.0 |
1782.0 |
4 |
1369473 |
331139 |
2020-02-03 21:55:49 |
1 |
9985.0 |
6367.0 |
73.0 |
3666.0 |
0.000000 |
5.0 |
1.0 |
5 |
1.0 |
41.0 |
2058.0 |
action_data.shape
(37214269, 15)
数据分析
action_data.info()
RangeIndex: 37214269 entries, 0 to 37214268
Data columns (total 15 columns):
# Column Dtype
--- ------ -----
0 user_log_acct int64
1 item_sku_id int64
2 action_time object
3 action_type int64
4 brand_code float64
5 shop_id float64
6 item_third_cate_cd float64
7 vender_id float64
8 shop_score float64
9 age float64
10 sex float64
11 user_level int64
12 province float64
13 city float64
14 county float64
dtypes: float64(10), int64(4), object(1)
memory usage: 4.2+ GB
action_data['action_time'].apply(lambda x:x[:7]).value_counts()
2020-02 15109221
2020-03 15004111
2020-04 7100937
Name: action_time, dtype: int64
action_data['action_type'].value_counts()
1 33151074
2 2193489
4 826761
5 600979
3 441966
Name: action_type, dtype: int64
action_data['sex'].value_counts()
0.0 24293534
1.0 12824478
-1.0 79207
Name: sex, dtype: int64
action_data['user_level'].value_counts()
7 14440505
5 9062005
1 8763570
6 4910432
4 24432
3 13022
2 303
Name: user_level, dtype: int64
数据预处理
action_data['action_time'].apply(lambda x: len(str(x))).value_counts()
19 34578820
21 2635449
Name: action_time, dtype: int64
# --- Normalise timestamps and derive calendar keys ---
# Timestamps come in two lengths (19 and 21 chars, see the value_counts above);
# keep the first 19 chars ('YYYY-MM-DD HH:MM:SS') so everything parses uniformly.
# NOTE(review): dd_len is computed and immediately deleted without being used —
# leftover from the length inspection above.
action_data['dd_len'] = action_data['action_time'].apply(lambda x: len(str(x)))
action_data['action_time'] = action_data['action_time'].apply(lambda x: x[:19])
del action_data['dd_len']
action_data['action_time'] = pd.to_datetime(action_data['action_time'])
action_data = action_data.sort_values('action_time')
# month*100 + day yields a sortable integer date key (e.g. Feb 4 -> 204),
# used below to slice label / candidate / feature windows.
action_data['month'] = action_data['action_time'].dt.month
action_data['day'] = action_data['action_time'].dt.day
action_data['month_day'] = action_data['month'].values * 100 + action_data['day'].values
action_data['month_day'].value_counts()
204 890713
206 861885
205 799023
201 795076
203 783515
...
217 300989
216 252465
215 218483
328 65396
327 52573
Name: month_day, Length: 74, dtype: int64
训练集切分 (train / validation window split)
def _label_trans(x, dic_):
try:
return dic_[x]
except:
return 0
def get_label(df, label_st = (4,11), label_en = (4,15),candidate_st = (4,6), candidate_en = (4,10), fea_en = (4,10)):
    """Slice the action log into candidates, labels and a feature window.

    Parameters
    ----------
    df : DataFrame with columns user_log_acct, item_sku_id, item_third_cate_cd,
        shop_id, action_type, month, day, month_day.
    label_st, label_en : (month, day) bounds of the label window; a purchase
        (action_type == 2) inside it makes a positive label.
    candidate_st, candidate_en : (month, day) bounds of the candidate window;
        every (user, item) pair seen there becomes a candidate row.
    fea_en : (month, day) cut-off; all actions up to it form the feature data.

    Returns
    -------
    (df_candidates, data_fea) : candidate pairs with label_cnt/label_days and
        the user_item / user_cate / user_cate_shop_id merge keys, plus the
        feature-window slice of ``df``.
    """
    # Resolve (month, day) tuples into month_day integer keys via the data itself.
    lb_st = df.loc[(df['month'] == label_st[0]) & (df['day'] == label_st[1]), 'month_day'].values[0]
    lb_en = df.loc[(df['month'] == label_en[0]) & (df['day'] == label_en[1]), 'month_day'].values[0]
    cand_st = df.loc[(df['month'] == candidate_st[0]) & (df['day'] == candidate_st[1]), 'month_day'].values[0]
    cand_en = df.loc[(df['month'] == candidate_en[0]) & (df['day'] == candidate_en[1]), 'month_day'].values[0]
    fea_position = df.loc[(df['month'] == fea_en[0]) & (df['day'] == fea_en[1]), 'month_day'].values[0]
    ind_label = (df['month_day'] >= lb_st) & (df['month_day'] <= lb_en) & (df['action_type'] == 2)
    ind_candidate = (df['month_day'] >= cand_st) & (df['month_day'] <= cand_en)
    ind_fea = (df['month_day'] <= fea_position)
    data_label = df.loc[ind_label].copy()
    data_fea = df.loc[ind_fea].copy()
    data_candidates = df.loc[ind_candidate].copy()
    # BUG FIX: item_third_cate_cd and shop_id must be carried along here —
    # the original selected only [user, item] and then raised KeyError when
    # building the user_cate / user_cate_shop_id keys below.
    df_candidates = data_candidates[['user_log_acct', 'item_sku_id', 'item_third_cate_cd', 'shop_id']].copy()
    df_candidates = df_candidates.drop_duplicates(subset = ['user_log_acct', 'item_sku_id'])
    df_candidates = df_candidates.loc[df_candidates.item_sku_id.notnull()]
    label = data_label[['user_log_acct', 'item_sku_id', 'day']].copy()
    print('get label')
    # Merge keys used downstream when joining the feature tables.
    df_candidates['user_item'] = df_candidates['user_log_acct'].astype(str) + '_' + df_candidates['item_sku_id'].astype(str)
    df_candidates['user_cate'] = df_candidates['user_log_acct'].astype(str) + '_' + df_candidates['item_third_cate_cd'].astype(str)
    df_candidates['user_cate_shop_id'] = df_candidates['user_log_acct'].astype(str) + '_' + df_candidates['item_third_cate_cd'].astype(str) + '_' + df_candidates['shop_id'].astype(str)
    # Drop the helper columns so the output schema matches the original
    # (my_item is merged on later and would otherwise collide on these names).
    df_candidates = df_candidates.drop(columns=['item_third_cate_cd', 'shop_id'])
    label['user_item'] = label['user_log_acct'].astype(str) + '_' + label['item_sku_id'].astype(str)
    dic_cnt = label['user_item'].value_counts().to_dict()
    dic_days = label.groupby('user_item')['day'].nunique().to_dict()
    # map + fillna(0) replaces the try/except helper: unseen pairs get 0.
    df_candidates['label_cnt'] = df_candidates['user_item'].map(dic_cnt).fillna(0).astype(int)
    df_candidates['label_days'] = df_candidates['user_item'].map(dic_days).fillna(0).astype(int)
    return df_candidates, data_fea
%%time
df_valid_label,data_valid_fea = get_label(action_data, label_st = (4,11), label_en = (4,15), candidate_st = (4,6), candidate_en = (4,10), fea_en = (4,10))
get label
CPU times: user 5.43 s, sys: 871 ms, total: 6.3 s
Wall time: 6.3 s
%%time
df_train_label1,data_train_fea1 = get_label(action_data, label_st = (4,6), label_en = (4,10), candidate_st = (4,1), candidate_en = (4,5), fea_en = (4,5))
get label
CPU times: user 4.69 s, sys: 640 ms, total: 5.33 s
Wall time: 5.32 s
特征构建
原始特征
# Static attribute tables: one row per user / per item, first occurrence wins
# (attributes are assumed constant per id — TODO confirm against the raw data).
my_user = action_data[['user_log_acct','age','sex','user_level','province','city','county']].drop_duplicates(['user_log_acct'], keep='first')
my_item = action_data[['item_sku_id','brand_code','shop_id','item_third_cate_cd','vender_id','shop_score']].drop_duplicates(['item_sku_id'], keep='first')
user特征
def gen_action_freq_feats(df, start_date):
    """Per-user action-frequency features over trailing windows.

    For each window w in {1, 3, 5, 7, 15, 30} days before ``start_date``,
    counts each action_type per user and derives ratio features
    action_2 / (1 + action_t) for t in {1, 3, 4}.
    NOTE(review): action_type 2 is treated as the "conversion" event here
    (it is also the label event upstream) — confirm the type encoding.

    Returns one row per user with NaN for windows the user was inactive in.
    """
    key = ['user_log_acct']
    prefix = '_'.join(key)
    action = df[key + ['action_type', 'action_time']].copy()
    feats = pd.DataFrame(action[key].drop_duplicates())
    # (tqdm progress wrapper removed: cosmetic-only third-party dependency)
    for w in [1, 3, 5, 7, 15, 30]:
        bef_start_date = start_date - datetime.timedelta(days=w)
        # Keep only the window's rows; exclude action_time so the groupby-sum
        # below never aggregates a datetime column (error on modern pandas).
        window = action.loc[action['action_time'] >= bef_start_date, key + ['action_type']].copy()
        # BUG FIX: the original rebound its own `df` parameter to the dummies
        # frame and raised KeyError when an action type was absent from a
        # short window; guarantee columns 1-4 exist instead.
        dummies = pd.get_dummies(window['action_type'], prefix='{}_last{}_days_action'.format(prefix, w))
        for t in (1, 2, 3, 4):
            col = '{}_last{}_days_action_{}'.format(prefix, w, t)
            if col not in dummies.columns:
                dummies[col] = 0
        window = pd.concat([window.drop(columns=['action_type']), dummies], axis=1)
        window = window.groupby(key, as_index=False).sum()
        # Ratio of conversions (type 2) to each other action type.
        buy = window['{}_last{}_days_action_2'.format(prefix, w)]
        for t in (1, 3, 4):
            window['{}_last{}_days_action_{}_rt'.format(prefix, w, t)] = buy / (1 + window['{}_last{}_days_action_{}'.format(prefix, w, t)])
        feats = feats.merge(window, on=key, how='left')
    return feats
u_fea_train1 = gen_action_freq_feats(data_train_fea1, datetime.datetime(2020, 4, 5))
u_fea_val1 = gen_action_freq_feats(data_valid_fea, datetime.datetime(2020, 4, 10))
100%|██████████| 6/6 [00:04<00:00, 1.21it/s]
100%|██████████| 6/6 [00:05<00:00, 1.08it/s]
u_fea_train1.head()
|
user_log_acct |
user_log_acct_last1_days_action_1 |
user_log_acct_last1_days_action_2 |
user_log_acct_last1_days_action_3 |
user_log_acct_last1_days_action_4 |
user_log_acct_last1_days_action_1_rt |
user_log_acct_last1_days_action_3_rt |
user_log_acct_last1_days_action_4_rt |
user_log_acct_last3_days_action_1 |
user_log_acct_last3_days_action_2 |
... |
user_log_acct_last15_days_action_1_rt |
user_log_acct_last15_days_action_3_rt |
user_log_acct_last15_days_action_4_rt |
user_log_acct_last30_days_action_1 |
user_log_acct_last30_days_action_2 |
user_log_acct_last30_days_action_3 |
user_log_acct_last30_days_action_4 |
user_log_acct_last30_days_action_1_rt |
user_log_acct_last30_days_action_3_rt |
user_log_acct_last30_days_action_4_rt |
0 |
270769 |
1.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
1.0 |
0.0 |
... |
0.000000 |
0.000000 |
0.000000 |
22.0 |
1.0 |
0.0 |
0.0 |
0.043478 |
1.000000 |
1.000000 |
1 |
492809 |
11.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
17.0 |
0.0 |
... |
0.000000 |
0.000000 |
0.000000 |
30.0 |
0.0 |
1.0 |
0.0 |
0.000000 |
0.000000 |
0.000000 |
2 |
438196 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
... |
0.000000 |
0.000000 |
0.000000 |
4.0 |
0.0 |
0.0 |
0.0 |
0.000000 |
0.000000 |
0.000000 |
3 |
49368 |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
NaN |
7.0 |
0.0 |
... |
0.015873 |
0.166667 |
1.000000 |
275.0 |
1.0 |
23.0 |
0.0 |
0.003623 |
0.041667 |
1.000000 |
4 |
715642 |
36.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
0.0 |
54.0 |
0.0 |
... |
0.003125 |
0.500000 |
0.333333 |
834.0 |
2.0 |
1.0 |
6.0 |
0.002395 |
1.000000 |
0.285714 |
5 rows × 43 columns
item_sku_id 特征
def gen_item_feats(df_item):
    """Item-side statistical features.

    Adds per-category / per-vendor popularity counts, the category-level mean
    shop_score, and item/brand cardinalities per shop and per category, then
    drops the raw item_third_cate_cd column from the returned copy.
    """
    feats = df_item.copy()
    # How common each category / vendor is across the item catalogue.
    for col in ('item_third_cate_cd', 'vender_id'):
        counts = df_item[col].value_counts().to_dict()
        feats['{}_cnt'.format(col)] = feats[col].map(counts).values
    # Category-level average shop score.
    for col in ('shop_score',):
        cate_mean = df_item.groupby('item_third_cate_cd')[col].mean().to_dict()
        feats['cate_{}_mean'.format(col)] = feats['item_third_cate_cd'].map(cate_mean).values
    # Assortment breadth of the item's shop.
    for col in ('item_sku_id', 'brand_code'):
        per_shop = df_item.groupby('shop_id')[col].nunique()
        feats['shop_id_{}_nunique'.format(col)] = feats['shop_id'].map(per_shop).values
    # Assortment breadth of the item's category.
    for col in ('item_sku_id', 'brand_code'):
        per_cate = df_item.groupby('item_third_cate_cd')[col].nunique()
        feats['item_third_cate_cd_{}_nunique'.format(col)] = feats['item_third_cate_cd'].map(per_cate).values
    feats = feats.drop(columns=['item_third_cate_cd'])
    return feats
item_feats = gen_item_feats(my_item)
user + cate 特征
def get_uc_feats(df, start_date):
    """Build user x third-level-category behaviour features, keyed by user_cate.

    Per (user, category) pair: recency of last action, then for trailing
    windows of 1/3/7/14 days the activity volume, active-day spread and
    inter-action-gap statistics, plus per-action-type counts.

    WARNING: mutates ``df`` in place (adds user_cate, action_time_diff,
    action_time_to_now). Sibling builders (get_ucs_feats / get_ui_feats)
    overwrite action_time_diff with their own grouping, so call order matters.
    """
    df['user_cate'] = df['user_log_acct'].astype(str)+'_' + df['item_third_cate_cd'].astype(str)
    df_fea = df[['user_cate']].copy()
    df_fea = df_fea.drop_duplicates(subset = ['user_cate'])
    # Gap (minutes) between consecutive actions of the same pair.
    df['action_time_diff'] = df.groupby('user_cate')['action_time'].shift().values
    df['action_time_diff'] = df['action_time'] - df['action_time_diff']
    # NOTE(review): .dt.seconds keeps only the intra-day remainder (0-86399 s)
    # and drops whole days, so gaps > 24h wrap around; .dt.total_seconds() was
    # probably intended — confirm before changing (it alters trained features).
    df['action_time_diff'] = df['action_time_diff'].dt.seconds // 60
    # Minutes from each action back to the feature cut-off (same caveat).
    df['action_time_to_now'] = start_date - df['action_time']
    df['action_time_to_now'] = df['action_time_to_now'].dt.seconds // 60
    # Recency: minutes since the pair's most recent action.
    dic_ = df.groupby('user_cate')['action_time_to_now'].min().to_dict()
    df_fea['user_cate_action_time_to_now_last'] = df_fea['user_cate'].map(dic_).values
    for days in tqdm([1,3,7,14]):
        # Trailing `days`-day window before start_date.
        tmp_ind = (df['action_time'] >= start_date + datetime.timedelta(days=-1 * days))
        df_tmp = df[tmp_ind].copy()
        # Volume, active-day coverage and gap statistics inside the window.
        dic_ = df_tmp.groupby('user_cate')['day'].count().to_dict()
        df_fea['user_cate_{}_day_cnt'.format(days)] = df_fea['user_cate'].map(dic_).values
        dic_ = df_tmp.groupby('user_cate')['day'].nunique().to_dict()
        df_fea['user_cate_{}_day_nunique_pct'.format(days)] = df_fea['user_cate'].map(dic_).values * 1.0 / days
        dic_ = df_tmp.groupby('user_cate')['action_time_diff'].mean().to_dict()
        df_fea['user_cate_{}_timediff_mean'.format(days)] = df_fea['user_cate'].map(dic_).values
        dic_ = df_tmp.groupby('user_cate')['action_time_diff'].std().to_dict()
        df_fea['user_cate_{}_timediff_std'.format(days)] = df_fea['user_cate'].map(dic_).values
        dic_ = df_tmp.groupby('user_cate')['action_time_diff'].median().to_dict()
        df_fea['user_cate_{}_timediff_median'.format(days)] = df_fea['user_cate'].map(dic_).values
        # Same counts split by action type 1-4 (type 5 is ignored).
        for type_ in [1,2,3,4]:
            ind_type = df['action_type'] == type_
            ind = tmp_ind & ind_type
            df_tmp = df[ind].copy()
            dic_ = df_tmp.groupby('user_cate')['day'].count().to_dict()
            df_fea['type_{}_user_cate_{}_day_cnt'.format(type_,days)] = df_fea['user_cate'].map(dic_).values
            # Extra rhythm stats only for purchases (type 2) in windows > 1 day.
            if days > 1 and type_ == 2:
                # Recompute gaps within the purchase-only subset.
                df_tmp['action_time_diff'] = df_tmp.groupby('user_cate')['action_time'].shift().values
                df_tmp['action_time_diff'] = df_tmp['action_time'] - df_tmp['action_time_diff']
                df_tmp['action_time_diff'] = df_tmp['action_time_diff'].dt.seconds // 60
                dic_ = df_tmp.groupby('user_cate')['day'].nunique().to_dict()
                df_fea['type_{}_user_cate_{}_day_nunique_pct'.format(type_,days)] = df_fea['user_cate'].map(dic_).values * 1.0 / days
                dic_ = df_tmp.groupby('user_cate')['action_time_diff'].mean().to_dict()
                df_fea['type_{}_user_cate_{}_timediff_mean'.format(type_,days)] = df_fea['user_cate'].map(dic_).values
                dic_ = df_tmp.groupby('user_cate')['action_time_diff'].std().to_dict()
                df_fea['type_{}_user_cate_{}_timediff_std'.format(type_,days)] = df_fea['user_cate'].map(dic_).values
                dic_ = df_tmp.groupby('user_cate')['action_time_diff'].median().to_dict()
                df_fea['type_{}_user_cate_{}_timediff_median'.format(type_,days)] = df_fea['user_cate'].map(dic_).values
    return df_fea
uc_fea_train = get_uc_feats(data_train_fea1, datetime.datetime(2020, 4, 5))
uc_fea_val = get_uc_feats(data_valid_fea, datetime.datetime(2020, 4, 10))
100%|██████████| 4/4 [01:42<00:00, 25.70s/it]
100%|██████████| 4/4 [01:40<00:00, 25.25s/it]
user + cate + shop 特征
def get_ucs_feats(df, start_date):
    """Build user x category x shop behaviour features, keyed by user_cate_shop_id.

    Structurally identical to get_uc_feats, but grouped by the finer
    (user, category, shop) key and with windows [0, 3, 7].

    WARNING: mutates ``df`` in place (adds user_cate_shop_id,
    action_time_diff, action_time_to_now); overwrites the action_time_diff
    column written by get_uc_feats, so call order across builders matters.
    """
    df['user_cate_shop_id'] = df['user_log_acct'].astype(str)+'_' + df['item_third_cate_cd'].astype(str)+ '_' + df['shop_id'].astype(str)
    df_fea = df[['user_cate_shop_id']].copy()
    df_fea = df_fea.drop_duplicates(subset = ['user_cate_shop_id'])
    # Gap (minutes) between consecutive actions of the same triple.
    df['action_time_diff'] = df.groupby('user_cate_shop_id')['action_time'].shift().values
    df['action_time_diff'] = df['action_time'] - df['action_time_diff']
    # NOTE(review): .dt.seconds ignores whole days (wraps for gaps > 24h);
    # .dt.total_seconds() was probably intended — confirm before changing.
    df['action_time_diff'] = df['action_time_diff'].dt.seconds // 60
    df['action_time_to_now'] = start_date - df['action_time']
    df['action_time_to_now'] = df['action_time_to_now'].dt.seconds // 60
    # Recency: minutes since the triple's most recent action.
    dic_ = df.groupby('user_cate_shop_id')['action_time_to_now'].min().to_dict()
    df_fea['user_cate_shop_id_action_time_to_now_last'] = df_fea['user_cate_shop_id'].map(dic_).values
    # NOTE(review): days == 0 makes an (almost always empty) window and the
    # `/ days` below divides by zero (numpy -> inf in the nunique_pct column);
    # the sibling builders use [1,3,7] — confirm whether 0 is intentional.
    for days in tqdm([0,3,7]):
        tmp_ind = (df['action_time'] >= start_date + datetime.timedelta(days=-1 * days))
        df_tmp = df[tmp_ind].copy()
        # Volume, active-day coverage and gap statistics inside the window.
        dic_ = df_tmp.groupby('user_cate_shop_id')['day'].count().to_dict()
        df_fea['user_cate_shop_id_{}_day_cnt'.format(days)] = df_fea['user_cate_shop_id'].map(dic_).values
        dic_ = df_tmp.groupby('user_cate_shop_id')['day'].nunique().to_dict()
        df_fea['user_cate_shop_id_{}_day_nunique_pct'.format(days)] = df_fea['user_cate_shop_id'].map(dic_).values * 1.0 / days
        dic_ = df_tmp.groupby('user_cate_shop_id')['action_time_diff'].mean().to_dict()
        df_fea['user_cate_shop_id_{}_timediff_mean'.format(days)] = df_fea['user_cate_shop_id'].map(dic_).values
        dic_ = df_tmp.groupby('user_cate_shop_id')['action_time_diff'].std().to_dict()
        df_fea['user_cate_shop_id_{}_timediff_std'.format(days)] = df_fea['user_cate_shop_id'].map(dic_).values
        dic_ = df_tmp.groupby('user_cate_shop_id')['action_time_diff'].median().to_dict()
        df_fea['user_cate_shop_id_{}_timediff_median'.format(days)] = df_fea['user_cate_shop_id'].map(dic_).values
        # Same counts split by action type 1-4 (type 5 is ignored).
        for type_ in [1,2,3,4]:
            ind_type = df['action_type'] == type_
            ind = tmp_ind & ind_type
            df_tmp = df[ind].copy()
            dic_ = df_tmp.groupby('user_cate_shop_id')['day'].count().to_dict()
            df_fea['type_{}_user_cate_shop_id_{}_day_cnt'.format(type_,days)] = df_fea['user_cate_shop_id'].map(dic_).values
            # Extra rhythm stats only for purchases (type 2) in windows > 1 day.
            if days > 1 and type_ == 2:
                # Recompute gaps within the purchase-only subset.
                df_tmp['action_time_diff'] = df_tmp.groupby('user_cate_shop_id')['action_time'].shift().values
                df_tmp['action_time_diff'] = df_tmp['action_time'] - df_tmp['action_time_diff']
                df_tmp['action_time_diff'] = df_tmp['action_time_diff'].dt.seconds // 60
                dic_ = df_tmp.groupby('user_cate_shop_id')['day'].nunique().to_dict()
                df_fea['type_{}_user_cate_shop_id_{}_day_nunique_pct'.format(type_,days)] = df_fea['user_cate_shop_id'].map(dic_).values * 1.0 / days
                dic_ = df_tmp.groupby('user_cate_shop_id')['action_time_diff'].mean().to_dict()
                df_fea['type_{}_user_cate_shop_id_{}_timediff_mean'.format(type_,days)] = df_fea['user_cate_shop_id'].map(dic_).values
                dic_ = df_tmp.groupby('user_cate_shop_id')['action_time_diff'].std().to_dict()
                df_fea['type_{}_user_cate_shop_id_{}_timediff_std'.format(type_,days)] = df_fea['user_cate_shop_id'].map(dic_).values
                dic_ = df_tmp.groupby('user_cate_shop_id')['action_time_diff'].median().to_dict()
                df_fea['type_{}_user_cate_shop_id_{}_timediff_median'.format(type_,days)] = df_fea['user_cate_shop_id'].map(dic_).values
    return df_fea
ucs_fea_train = get_ucs_feats(data_train_fea1, datetime.datetime(2020, 4, 5))
ucs_fea_val = get_ucs_feats(data_valid_fea, datetime.datetime(2020, 4, 10))
user + item 特征
def get_ui_feats(df, start_date):
    """Build user x item behaviour features, keyed by user_item.

    Structurally identical to get_uc_feats, but grouped by the exact
    (user, item) pair — the same key as the candidate/label table.

    WARNING: mutates ``df`` in place (adds user_item, action_time_diff,
    action_time_to_now); overwrites the action_time_diff column written by
    earlier builders, so call order across builders matters.
    """
    df['user_item'] = df['user_log_acct'].astype(str)+'_' + df['item_sku_id'].astype(str)
    df_fea = df[['user_item']].copy()
    df_fea = df_fea.drop_duplicates(subset = ['user_item'])
    # Gap (minutes) between consecutive actions on the same pair.
    df['action_time_diff'] = df.groupby('user_item')['action_time'].shift().values
    df['action_time_diff'] = df['action_time'] - df['action_time_diff']
    # NOTE(review): .dt.seconds ignores whole days (wraps for gaps > 24h);
    # .dt.total_seconds() was probably intended — confirm before changing.
    df['action_time_diff'] = df['action_time_diff'].dt.seconds // 60
    df['action_time_to_now'] = start_date - df['action_time']
    df['action_time_to_now'] = df['action_time_to_now'].dt.seconds // 60
    # Recency: minutes since the pair's most recent action.
    dic_ = df.groupby('user_item')['action_time_to_now'].min().to_dict()
    df_fea['user_item_action_time_to_now_last'] = df_fea['user_item'].map(dic_).values
    for days in tqdm([1,3,7]):
        # Trailing `days`-day window before start_date.
        tmp_ind = (df['action_time'] >= start_date + datetime.timedelta(days=-1 * days))
        df_tmp = df[tmp_ind].copy()
        # Volume, active-day coverage and gap statistics inside the window.
        dic_ = df_tmp.groupby('user_item')['day'].count().to_dict()
        df_fea['user_item_{}_day_cnt'.format(days)] = df_fea['user_item'].map(dic_).values
        dic_ = df_tmp.groupby('user_item')['day'].nunique().to_dict()
        df_fea['user_item_{}_day_nunique_pct'.format(days)] = df_fea['user_item'].map(dic_).values * 1.0 / days
        dic_ = df_tmp.groupby('user_item')['action_time_diff'].mean().to_dict()
        df_fea['user_item_{}_timediff_mean'.format(days)] = df_fea['user_item'].map(dic_).values
        dic_ = df_tmp.groupby('user_item')['action_time_diff'].std().to_dict()
        df_fea['user_item_{}_timediff_std'.format(days)] = df_fea['user_item'].map(dic_).values
        dic_ = df_tmp.groupby('user_item')['action_time_diff'].median().to_dict()
        df_fea['user_item_{}_timediff_median'.format(days)] = df_fea['user_item'].map(dic_).values
        # Same counts split by action type 1-4 (type 5 is ignored).
        for type_ in [1,2,3,4]:
            ind_type = df['action_type'] == type_
            ind = tmp_ind & ind_type
            df_tmp = df[ind].copy()
            dic_ = df_tmp.groupby('user_item')['day'].count().to_dict()
            df_fea['type_{}_user_item_{}_day_cnt'.format(type_,days)] = df_fea['user_item'].map(dic_).values
            # Extra rhythm stats only for purchases (type 2) in windows > 1 day.
            if days > 1 and type_ == 2:
                # Recompute gaps within the purchase-only subset.
                df_tmp['action_time_diff'] = df_tmp.groupby('user_item')['action_time'].shift().values
                df_tmp['action_time_diff'] = df_tmp['action_time'] - df_tmp['action_time_diff']
                df_tmp['action_time_diff'] = df_tmp['action_time_diff'].dt.seconds // 60
                dic_ = df_tmp.groupby('user_item')['day'].nunique().to_dict()
                df_fea['type_{}_user_item_{}_day_nunique_pct'.format(type_,days)] = df_fea['user_item'].map(dic_).values * 1.0 / days
                dic_ = df_tmp.groupby('user_item')['action_time_diff'].mean().to_dict()
                df_fea['type_{}_user_item_{}_timediff_mean'.format(type_,days)] = df_fea['user_item'].map(dic_).values
                dic_ = df_tmp.groupby('user_item')['action_time_diff'].std().to_dict()
                df_fea['type_{}_user_item_{}_timediff_std'.format(type_,days)] = df_fea['user_item'].map(dic_).values
                dic_ = df_tmp.groupby('user_item')['action_time_diff'].median().to_dict()
                df_fea['type_{}_user_item_{}_timediff_median'.format(type_,days)] = df_fea['user_item'].map(dic_).values
    return df_fea
ui_fea_train = get_ui_feats(data_train_fea1, datetime.datetime(2020, 4, 5))
ui_fea_val = get_ui_feats(data_valid_fea, datetime.datetime(2020, 4, 10))
合并特征集 (assemble the feature column lists)
# Feature column lists per merged source, with each table's join key excluded.
u_fea_cols1 = [col for col in u_fea_train1.columns if col not in ['user_log_acct']]
u_fea_cols2 = [col for col in my_user.columns if col not in ['user_log_acct']]
i_fea_cols = [col for col in my_item.columns if col not in ['item_sku_id']]
ui_fea_cols = [col for col in ui_fea_train.columns if col not in ['user_item']]
uc_fea_cols = [col for col in uc_fea_train.columns if col not in ['user_cate']]
ucs_fea_cols = [col for col in ucs_fea_train.columns if col not in ['user_cate_shop_id']]
# BUG FIX: uc_fea_cols and ucs_fea_cols were concatenated twice, so
# df[train_cols] duplicated those feature columns; each list now appears once.
train_cols = ['user_log_acct', 'item_sku_id'] + u_fea_cols1 + u_fea_cols2 + i_fea_cols + uc_fea_cols + ucs_fea_cols + ui_fea_cols
训练集&验证集
# --- Assemble the training and validation matrices ---
# Left-join every feature table onto the candidate pairs via the keys built
# in get_label (user_log_acct / item_sku_id / user_item / user_cate /
# user_cate_shop_id).
df_train = df_train_label1.merge(u_fea_train1, on='user_log_acct', how='left')
df_train = df_train.merge(my_user, on='user_log_acct', how='left')
df_train = df_train.merge(my_item, on='item_sku_id', how='left')
df_train = df_train.merge(ui_fea_train, on='user_item', how='left')
df_train = df_train.merge(uc_fea_train, on='user_cate', how='left')
df_train = df_train.merge(ucs_fea_train, on='user_cate_shop_id', how='left')
# Binary target: the pair was purchased at least once in the label window.
df_train['label'] = (df_train['label_cnt'] > 0).astype(int)
df_val = df_valid_label.merge(u_fea_val1, on='user_log_acct', how='left')
df_val = df_val.merge(my_user, on='user_log_acct', how='left')
df_val = df_val.merge(my_item, on='item_sku_id', how='left')
df_val = df_val.merge(ui_fea_val, on='user_item', how='left')
# BUG FIX: the original merged uc/ucs validation features onto df_train,
# silently replacing the validation frame with training rows (so the model
# "validated" on its own training data).
df_val = df_val.merge(uc_fea_val, on='user_cate', how='left')
df_val = df_val.merge(ucs_fea_val, on='user_cate_shop_id', how='left')
df_val['label'] = (df_val['label_cnt'] > 0).astype(int)
模型训练 (model training and submission)
# --- Train LightGBM and write the submission file ---
# Early stopping is driven by AUC on the validation fold (second tuple).
eval_set = [(df_train[train_cols], df_train['label']), (df_val[train_cols], df_val['label'])]
lgb_model = lgb.LGBMClassifier(boosting_type="gbdt", num_leaves=2**7-1, reg_alpha=0, reg_lambda=0.01,
                               max_depth=-1, n_estimators=2000, objective='binary', subsample=0.9,
                               colsample_bytree=0.85, subsample_freq=1, min_child_samples=25,
                               learning_rate=0.01, random_state=2021, metric="None", n_jobs=20)
lgb_model.fit(df_train[train_cols], df_train['label'], eval_set=eval_set, eval_metric='auc', verbose=100, early_stopping_rounds=100)
# Score the validation candidates; P(label == 1) is column 1.
df_submit = pd.DataFrame()
pred_proba = lgb_model.predict_proba(df_val[train_cols])[:, 1]
df_submit['user_log_acct'] = df_val['user_log_acct'].values
df_submit['item_sku_id'] = df_val['item_sku_id'].values
df_submit['prob'] = pred_proba
df_submit = df_submit.drop_duplicates(subset=['user_log_acct', 'item_sku_id'])
# BUG FIX: both null checks tested item_sku_id; one leg presumably was meant
# to guard user_log_acct — both ids are now required to be non-null.
df_submit = df_submit.loc[df_submit.user_log_acct.notnull() & df_submit.item_sku_id.notnull()]
# NOTE(review): the 0.06 probability cut-off looks hand-tuned — confirm.
df_submit_ = df_submit.loc[df_submit.prob >= 0.06].copy()
df_submit_['item_sku_id'] = df_submit_['item_sku_id'].astype(int)
df_submit_.to_csv('df_submit_all.csv', index=False)