import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.metrics import accuracy_score
import time
import datetime
from scipy.sparse import hstack
from sklearn.model_selection import StratifiedKFold
import re
from keras.layers import *
from tensorflow.keras.models import *
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing import text, sequence
from tensorflow.keras.callbacks import *
from keras.layers.advanced_activations import LeakyReLU, PReLU
import tensorflow.keras.backend as K
from keras.optimizers import *
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.backend import cast
import tensorflow as tf
import random as rn
import gc
import logging
import gensim
np.random.seed(1024)
rn.seed(1024)
import warnings
warnings.filterwarnings('ignore')
/home/frank/miniconda3/envs/reco2/lib/python3.7/site-packages/gensim/similarities/__init__.py:15: UserWarning: The gensim.similarities.levenshtein submodule is disabled, because the optional Levenshtein package is unavailable. Install Levenhstein (e.g. `pip install python-Levenshtein`) to suppress this warning.
warnings.warn(msg)
# Load the raw action log from CSV into a single DataFrame.
action_data = pd.read_csv('./data/my_data.csv')
# Preview the first rows (last expression of the cell, so it is displayed).
action_data.head()
|
user_log_acct |
item_sku_id |
action_time |
action_type |
brand_code |
shop_id |
item_third_cate_cd |
vender_id |
shop_score |
age |
sex |
user_level |
province |
city |
county |
0 |
937922 |
357022 |
2020-02-04 08:28:15 |
1 |
1791.0 |
8703.0 |
10.0 |
5227.0 |
-1.000000 |
5.0 |
1.0 |
5 |
11.0 |
348.0 |
1782.0 |
1 |
937922 |
73 |
2020-02-04 08:27:07 |
1 |
1791.0 |
8703.0 |
10.0 |
5227.0 |
-1.000000 |
5.0 |
1.0 |
5 |
11.0 |
348.0 |
1782.0 |
2 |
937922 |
29583 |
2020-02-04 08:26:31 |
1 |
1791.0 |
2738.0 |
10.0 |
3436.0 |
9.206167 |
5.0 |
1.0 |
5 |
11.0 |
348.0 |
1782.0 |
3 |
937922 |
108763 |
2020-02-04 08:26:10 |
1 |
1791.0 |
2738.0 |
10.0 |
3436.0 |
9.206167 |
5.0 |
1.0 |
5 |
11.0 |
348.0 |
1782.0 |
4 |
1369473 |
331139 |
2020-02-03 21:55:49 |
1 |
9985.0 |
6367.0 |
73.0 |
3666.0 |
0.000000 |
5.0 |
1.0 |
5 |
1.0 |
41.0 |
2058.0 |
# (rows, columns) of the loaded action log.
action_data.shape
(37214269, 15)
数据预处理
# Keep only the first 19 characters of each timestamp (the
# 'YYYY-MM-DD HH:MM:SS' part in the rows shown above) so every value parses
# with the same layout. The vectorized .str slice replaces a per-row lambda;
# the former temporary 'dd_len' column was created and deleted without ever
# being read, so it is no longer built.
action_data['action_time'] = pd.to_datetime(action_data['action_time'].str[:19])

# Chronological order; later per-user item sequences rely on row order.
action_data = action_data.sort_values('action_time')

# Calendar parts plus a combined integer key month * 100 + day
# (e.g. April 10th -> 410) used for the date-window filters.
action_data['month'] = action_data['action_time'].dt.month
action_data['day'] = action_data['action_time'].dt.day
action_data['month_day'] = action_data['month'].values * 100 + action_data['day'].values
训练集切分
def _label_trans(x, dic_):
    """Return ``dic_[x]``, or 0 when ``x`` is not a key of ``dic_``.

    Only a missing key is treated as "no label"; any other failure (for
    example ``dic_`` not being a mapping) is allowed to propagate instead of
    being silently turned into 0.
    """
    try:
        return dic_[x]
    except KeyError:
        return 0
def get_label(df, label_st=(4, 11), label_en=(4, 15), candidate_st=(4, 6),
              candidate_en=(4, 10), fea_en=(4, 10)):
    """Split an action log into labelled candidate pairs and a feature frame.

    Every window boundary is a ``(month, day)`` tuple and all windows are
    inclusive. Boundaries are converted to the ``month * 100 + day`` integer
    stored in ``df['month_day']``; computing the key directly avoids scanning
    the whole frame per boundary and no longer fails when a boundary day has
    no rows in ``df``.

    Parameters
    ----------
    df : DataFrame with at least the columns ``user_log_acct``,
        ``item_sku_id``, ``action_type``, ``day`` and ``month_day``.
    label_st, label_en : label window; only rows with ``action_type == 2``
        inside it count as labels.
    candidate_st, candidate_en : window whose distinct (user, item) pairs
        become the candidates.
    fea_en : last day (inclusive) of rows returned for feature building.

    Returns
    -------
    (df_candidates, data_fea)
        ``df_candidates`` has one row per distinct candidate pair with the
        columns ``user_log_acct``, ``item_sku_id``, ``label_cnt`` (number of
        label rows for the pair), ``label_days`` (number of distinct ``day``
        values among those rows) and ``user_item`` ("<user>_<item>").
        ``data_fea`` is a copy of all rows with ``month_day <= fea_en``.
    """
    def _month_day_key(month_and_day):
        month, day = month_and_day
        return month * 100 + day

    lb_st, lb_en = _month_day_key(label_st), _month_day_key(label_en)
    cand_st, cand_en = _month_day_key(candidate_st), _month_day_key(candidate_en)
    fea_position = _month_day_key(fea_en)

    month_day = df['month_day']
    ind_label = month_day.between(lb_st, lb_en) & (df['action_type'] == 2)
    ind_candidate = month_day.between(cand_st, cand_en)
    ind_fea = month_day <= fea_position

    data_fea = df.loc[ind_fea].copy()

    # Distinct (user, item) pairs seen in the candidate window, without
    # rows that have no item id.
    df_candidates = df.loc[ind_candidate, ['user_log_acct', 'item_sku_id']]
    df_candidates = df_candidates.drop_duplicates(subset=['user_log_acct', 'item_sku_id'])
    df_candidates = df_candidates.loc[df_candidates['item_sku_id'].notna()].copy()

    label = df.loc[ind_label, ['user_log_acct', 'item_sku_id', 'day']].copy()
    print('get label')

    pair_key = (df_candidates['user_log_acct'].astype(str) + '_'
                + df_candidates['item_sku_id'].astype(str))
    label['user_item'] = (label['user_log_acct'].astype(str) + '_'
                          + label['item_sku_id'].astype(str))

    dic_cnt = label['user_item'].value_counts().to_dict()
    dic_days = label.groupby('user_item')['day'].nunique().to_dict()

    # Vectorized lookup; pairs without any label row get 0. Columns are
    # assigned in the order label_cnt, label_days, user_item so the returned
    # frame keeps its original column layout.
    df_candidates['label_cnt'] = pair_key.map(dic_cnt).fillna(0).astype('int64')
    df_candidates['label_days'] = pair_key.map(dic_days).fillna(0).astype('int64')
    df_candidates['user_item'] = pair_key
    return df_candidates, data_fea
%%time
# Validation split: labels from Apr 11-15, candidates from Apr 6-10,
# feature rows up to and including Apr 10.
df_valid_label, data_valid_fea = get_label(
    action_data,
    label_st=(4, 11),
    label_en=(4, 15),
    candidate_st=(4, 6),
    candidate_en=(4, 10),
    fea_en=(4, 10),
)
get label
CPU times: user 5.49 s, sys: 933 ms, total: 6.43 s
Wall time: 6.43 s
%%time
# Training split: labels from Apr 6-10, candidates from Apr 1-5,
# feature rows up to and including Apr 5.
df_train_label1, data_train_fea1 = get_label(
    action_data,
    label_st=(4, 6),
    label_en=(4, 10),
    candidate_st=(4, 1),
    candidate_en=(4, 5),
    fea_en=(4, 5),
)
get label
CPU times: user 4.73 s, sys: 784 ms, total: 5.51 s
Wall time: 5.51 s
# Preview the labelled training candidates.
df_train_label1.head()
|
user_log_acct |
item_sku_id |
label_cnt |
label_days |
user_item |
34296301 |
1144603 |
153700 |
0 |
0 |
1144603_153700 |
1415203 |
1129253 |
327893 |
0 |
0 |
1129253_327893 |
3960663 |
736788 |
201003 |
0 |
0 |
736788_201003 |
5158969 |
109461 |
256490 |
0 |
0 |
109461_256490 |
7377193 |
470525 |
142823 |
0 |
0 |
470525_142823 |
特征构建
原始特征
# Static per-user attributes: the first row seen for each user wins.
user_attr_cols = ['user_log_acct', 'age', 'sex', 'user_level', 'province', 'city', 'county']
my_user = action_data[user_attr_cols].drop_duplicates(subset=['user_log_acct'], keep='first')

# Static per-item attributes: the first row seen for each item wins.
item_attr_cols = ['item_sku_id', 'brand_code', 'shop_id', 'item_third_cate_cd', 'vender_id', 'shop_score']
my_item = action_data[item_attr_cols].drop_duplicates(subset=['item_sku_id'], keep='first')
user特征
def gen_action_freq_feats(df, start_date):
    """Per-user action-type counts and ratios over several look-back windows.

    For each window ``w`` in (1, 3, 5, 7, 15, 30) days, rows with
    ``action_time >= start_date - w days`` are one-hot encoded on
    ``action_type`` and summed per user. Three ratio columns are added per
    window: the type-2 count divided by ``1 +`` the count of type 1, 3 and 4
    respectively.

    Parameters
    ----------
    df : DataFrame with the columns ``user_log_acct``, ``action_type`` and
        ``action_time``.
    start_date : ``datetime.datetime`` the windows are measured back from.

    Returns
    -------
    DataFrame with one row per distinct user in ``df``; users with no rows
    inside a window get NaN in that window's columns (left merge).
    """
    key = ['user_log_acct']
    action = df[key + ['action_type', 'action_time']].copy()
    feats = pd.DataFrame(action[key].drop_duplicates())
    for w in tqdm([1, 3, 5, 7, 15, 30]):
        prefix = '_'.join(key) + '_last{}_days_action'.format(w)
        bef_start_date = start_date - datetime.timedelta(days=w)

        # Only the key and the action type are needed from here on. Leaving
        # action_time out keeps it away from the groupby-sum below, which
        # cannot add up datetimes.
        recent = action.loc[action['action_time'] >= bef_start_date, key + ['action_type']]
        dummies = pd.get_dummies(recent['action_type'], prefix=prefix)

        # The ratios below read the columns for types 1-4; make sure they
        # exist even when a window contains no rows of one of those types.
        for action_code in (1, 2, 3, 4):
            col = '{}_{}'.format(prefix, action_code)
            if col not in dummies.columns:
                dummies[col] = 0

        action_cl = pd.concat([recent[key], dummies], axis=1)
        action_cl = action_cl.groupby(key, as_index=False).sum()

        type2_cnt = action_cl[prefix + '_2']
        for action_code in (1, 3, 4):
            denominator = 1 + action_cl['{}_{}'.format(prefix, action_code)]
            action_cl['{}_{}_rt'.format(prefix, action_code)] = type2_cnt / denominator

        feats = feats.merge(action_cl, on=key, how='left')
    return feats
# User frequency features, measured back from the last feature day of each split.
u_fea_train1 = gen_action_freq_feats(
    df=data_train_fea1,
    start_date=datetime.datetime(year=2020, month=4, day=5),
)
u_fea_val1 = gen_action_freq_feats(
    df=data_valid_fea,
    start_date=datetime.datetime(year=2020, month=4, day=10),
)
100%|██████████| 6/6 [00:05<00:00, 1.16it/s]
100%|██████████| 6/6 [00:05<00:00, 1.08it/s]
合并特征集
# Feature column names per source table, with the join keys left out.
u_fea_cols1 = [name for name in u_fea_train1.columns if name != 'user_log_acct']
u_fea_cols2 = [name for name in my_user.columns if name != 'user_log_acct']
i_fea_cols = [name for name in my_item.columns if name != 'item_sku_id']

# Both ids followed by every feature column.
train_cols = ['user_log_acct', 'item_sku_id', *u_fea_cols1, *u_fea_cols2, *i_fea_cols]
训练集&验证集
# Training frame: candidates + user frequency features + static user/item attributes.
df_train = (
    df_train_label1
    .merge(u_fea_train1, on='user_log_acct', how='left')
    .merge(my_user, on='user_log_acct', how='left')
    .merge(my_item, on='item_sku_id', how='left')
)
# Binary target: 1 when the pair has at least one label row.
df_train['label'] = (df_train['label_cnt'] > 0).astype(int)

# Validation frame, built the same way from the validation split.
df_val = (
    df_valid_label
    .merge(u_fea_val1, on='user_log_acct', how='left')
    .merge(my_user, on='user_log_acct', how='left')
    .merge(my_item, on='item_sku_id', how='left')
)
df_val['label'] = (df_val['label_cnt'] > 0).astype(int)
序列化
def set_tokenizer(docs, split_char=' ', max_len=100):
    """Turn texts into fixed-length integer sequences.

    Args:
        docs: list of texts.
        split_char: separator the tokenizer splits each text on.
        max_len: length every sequence is padded or truncated to.

    Returns:
        (X, word_index): ``X`` is the array produced by ``pad_sequences``
        (fill value 0, length ``max_len``); ``word_index`` is the fitted
        tokenizer's token -> integer mapping.
    """
    fitted = Tokenizer(lower=False, char_level=False, split=split_char)
    fitted.fit_on_texts(docs)
    padded = pad_sequences(fitted.texts_to_sequences(docs), maxlen=max_len, value=0)
    return padded, fitted.word_index
# Per-user item history (in the row order of the feature frame), attached to
# every candidate row of the matching split.
valid_item_seq = data_valid_fea.groupby(['user_log_acct'])['item_sku_id'].agg(list).reset_index()
valid_item_seq.columns = ['user_log_acct', 'item_seq']
df_val = df_val.merge(valid_item_seq, on='user_log_acct', how='left')

train_item_seq = data_train_fea1.groupby(['user_log_acct'])['item_sku_id'].agg(list).reset_index()
train_item_seq.columns = ['user_log_acct', 'item_seq']
df_train = df_train.merge(train_item_seq, on='user_log_acct', how='left')

# Train rows first, then validation rows; the later slicing of x1 relies on
# this order.
df_data = pd.concat([df_train[['item_seq']], df_val[['item_seq']]], axis=0, ignore_index=True)


def _join_item_ids(seq):
    """Serialize one item history as 'id,id,id' for the ','-split tokenizer."""
    # str(list)[1:-1] would give 'id, id, id': after splitting on ',' every
    # token but the first keeps a leading space, so the same item would get
    # two different token ids depending on its position. A missing history
    # (NaN after the left merge) becomes an empty text instead of a bogus
    # token.
    if not isinstance(seq, list):
        return ''
    return ','.join(str(item_id) for item_id in seq)


df_data['item_seq'] = df_data['item_seq'].apply(_join_item_ids)
text_1_list = list(df_data['item_seq'])

print('开始序列化')
x1, index_1 = set_tokenizer(text_1_list, split_char=',', max_len=20)
print('序列化完成')
gc.collect()
开始序列化
序列化完成
0
# Categorical id-like columns and bookkeeping columns; neither group is
# treated as a dense numeric feature.
sparse_col = ['item_sku_id','age','sex','user_level','province','city','county','brand_code','shop_id','item_third_cate_cd','vender_id']
rest_col = ['user_log_acct','label_cnt','label_days','user_item','item_seq','label']

# Dense features: float64/int64 columns of df_train not listed above.
dense_cols = [
    name for name in df_train.columns
    if df_train[name].dtype in ['float64','int64']
    and name not in sparse_col
    and name not in rest_col
]

from sklearn.preprocessing import StandardScaler
ss = StandardScaler()

# Train rows followed by validation rows, missing values filled with 0.
# NOTE(review): the scaler is fitted on train and validation rows together.
df_data = pd.concat([df_train[dense_cols], df_val[dense_cols]], axis=0, ignore_index=True)
df_data = df_data.fillna(0)
dense_feature = ss.fit_transform(df_data)
dense_feature_input = dense_feature.shape[1]

# The first n_train rows of each stacked array belong to the training frame.
n_train = df_train.shape[0]
train_input_1, test_input_1 = x1[:n_train], x1[n_train:]
train_input_2, test_input_2 = dense_feature[:n_train], dense_feature[n_train:]
train_label = df_train['label']
test_label = df_val['label']
LSTM
from keras.initializers import *
def model_1(emb1, dense_feature_input, df_):
    """Build and compile the two-input LSTM binary classifier.

    Args:
        emb1: array-like; only ``emb1.shape[0]`` is read and used as the
            embedding layer's ``input_dim``.
        dense_feature_input: width of the dense-feature input vector.
        df_: not used inside this function.

    Returns:
        A compiled Keras ``Model`` taking ``[sequence (20,), dense features]``
        and producing one sigmoid output.
    """
    K.clear_session()

    # Sequence branch: embedding -> spatial dropout -> LSTM -> dropout ->
    # per-timestep dense, then max- and mean-pooled over the time axis.
    emb_layer_1 = Embedding(
        input_dim=emb1.shape[0],
        output_dim=32,
        input_length=20,
        trainable=True,
    )
    seq1 = Input(shape=(20,))
    embedded = emb_layer_1(seq1)
    embedded = SpatialDropout1D(rate=0.2)(embedded)

    recurrent = LSTM(200, return_sequences=True)(embedded)
    recurrent = Dropout(0.2)(recurrent)
    semantic = TimeDistributed(Dense(100, activation="tanh"))(recurrent)

    merged_1 = Lambda(lambda t: K.max(t, axis=1), output_shape=(100,))(semantic)
    merged_1_avg = Lambda(lambda t: K.mean(t, axis=1), output_shape=(100,))(semantic)

    # Dense-feature branch.
    hin = Input(shape=(dense_feature_input, ))
    htime = Dense(16, activation='relu')(hin)

    # Joint head: two Dense -> BatchNorm -> ReLU stages (dropout after the
    # first only), then the sigmoid output.
    head = concatenate([merged_1, merged_1_avg, htime])
    head = Dense(128)(head)
    head = BatchNormalization()(head)
    head = Activation(activation="relu")(head)
    head = Dropout(0.2)(head)

    head = Dense(64)(head)
    head = BatchNormalization()(head)
    head = Activation(activation="relu")(head)

    pred = Dense(1, activation='sigmoid')(head)

    model = Model(inputs=[seq1, hin], outputs=pred)
    model.compile(
        optimizer="adam",
        loss="binary_crossentropy",
        metrics=["binary_crossentropy", tf.keras.metrics.AUC(name='auc')],
    )
    return model
file_path = "nn_lstm.h5"

# All callbacks track validation AUC (higher is better): stop after 5 epochs
# without improvement, halve the learning rate after 3, and keep the weights
# of the best epoch on disk.
earlystopping = EarlyStopping(monitor="val_auc", patience=5, mode='max')
plateau = ReduceLROnPlateau(monitor="val_auc", verbose=1, mode='max', factor=0.5, patience=3)
checkpoint = ModelCheckpoint(
    file_path, monitor='val_auc', save_weights_only=True, verbose=1, save_best_only=True, mode='max')
callbacks_list = [earlystopping, checkpoint, plateau]

# model_1 reads only .shape[0] of its first argument as the embedding's
# input_dim. That must be the vocabulary size (largest token id + 1, id 0
# being the padding value), not the number of samples in x1, so a zero-width
# placeholder with the right number of rows is passed.
vocab_size = len(index_1) + 1
model_lstm = model_1(np.empty((vocab_size, 0)), dense_feature_input, df_data)

x1_tr, x1_va = np.array(train_input_1), np.array(test_input_1)
x2_tr, x2_va = np.array(train_input_2), np.array(test_input_2)
y_tr, y_va = train_label, test_label

hist = model_lstm.fit([x1_tr, x2_tr],
                      y_tr, batch_size=4096, epochs=10,
                      validation_data=([x1_va, x2_va], y_va),
                      callbacks=callbacks_list, verbose=1, shuffle=True)

# After fit() the model holds the last epoch's weights, which may be worse
# than the checkpointed best epoch; reload the best weights before predicting.
model_lstm.load_weights(file_path)
test_pred = model_lstm.predict([x1_va, x2_va], batch_size=2048, verbose=1)
Epoch 1/10
401/401 [==============================] - 501s 1s/step - loss: 0.1395 - binary_crossentropy: 0.1395 - auc: 0.5903 - val_loss: 0.0302 - val_binary_crossentropy: 0.0302 - val_auc: 0.8218
Epoch 00001: val_auc improved from -inf to 0.82176, saving model to nn_lstm.h5
Epoch 2/10
401/401 [==============================] - 505s 1s/step - loss: 0.0307 - binary_crossentropy: 0.0307 - auc: 0.8247 - val_loss: 0.0284 - val_binary_crossentropy: 0.0284 - val_auc: 0.8047
Epoch 00002: val_auc did not improve from 0.82176
Epoch 3/10
401/401 [==============================] - 498s 1s/step - loss: 0.0273 - binary_crossentropy: 0.0273 - auc: 0.8839 - val_loss: 0.0294 - val_binary_crossentropy: 0.0294 - val_auc: 0.7620
Epoch 00003: val_auc did not improve from 0.82176
Epoch 4/10
401/401 [==============================] - 503s 1s/step - loss: 0.0233 - binary_crossentropy: 0.0233 - auc: 0.9356 - val_loss: 0.0313 - val_binary_crossentropy: 0.0313 - val_auc: 0.7226
Epoch 00004: val_auc did not improve from 0.82176
Epoch 00004: ReduceLROnPlateau reducing learning rate to 0.0005000000237487257.
Epoch 5/10
401/401 [==============================] - 497s 1s/step - loss: 0.0202 - binary_crossentropy: 0.0202 - auc: 0.9637 - val_loss: 0.0334 - val_binary_crossentropy: 0.0334 - val_auc: 0.7039
Epoch 00005: val_auc did not improve from 0.82176
Epoch 6/10
401/401 [==============================] - 497s 1s/step - loss: 0.0184 - binary_crossentropy: 0.0184 - auc: 0.9711 - val_loss: 0.0347 - val_binary_crossentropy: 0.0347 - val_auc: 0.6929
Epoch 00006: val_auc did not improve from 0.82176
864/864 [==============================] - 66s 76ms/step
# Highest validation AUC reached over the recorded epochs.
np.max(hist.history['val_auc'])
0.821759819984436
# Largest validation loss over the recorded epochs.
# NOTE(review): for a loss the maximum is the worst epoch; np.min may have
# been intended if the goal is to report the best value - confirm.
np.max(hist.history['val_loss'])
0.034674450755119324