Background: the task is to retrieve, from a large pool of in-game items (props), the ones a given user is likely to like. For privacy reasons the specific features used are masked in the code, but the whole pipeline runs end to end; in real use you can add early stopping, batch normalization, data sampling and similar steps as needed.
Environment: deepmatch==0.1.3, deepctr[GPU]==0.7.5, pyspark==2.4.0, keras
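As noted in the background, early stopping can be added when needed. A minimal sketch using the standard tf.keras callback (the monitor/patience values are illustrative), which can be passed to model.fit alongside the TensorBoard callback used in the training script below:

import tensorflow as tf
early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
# later: model.fit(..., callbacks=[early_stopping, tensorboard_callback])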
Model file:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import plot_model
from tensorflow.keras import backend
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.preprocessing.sequence import pad_sequences
from deepctr.inputs import SparseFeat, VarLenSparseFeat, build_input_features, DenseFeat
from deepmatch.models import *
from deepctr.inputs import combined_dnn_input, create_embedding_matrix, get_feature_names
from deepctr.layers.core import PredictionLayer, DNN
from deepmatch.inputs import input_from_feature_columns
from deepmatch.layers.core import Similarity
from sklearn.metrics import accuracy_score
# numpy
np.set_printoptions(threshold=np.inf)
# pandas
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 5000)
def DSSM(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(64, 32),
         item_dnn_hidden_units=(64, 32),
         dnn_activation='tanh', dnn_use_bn=False,
         l2_reg_dnn=0, l2_reg_embedding=1e-8, dnn_dropout=0.8, init_std=0.0001, seed=1024, metric='cos'):
    embedding_matrix_dict = create_embedding_matrix(user_feature_columns + item_feature_columns, l2_reg_embedding,
                                                    init_std, seed,
                                                    seq_mask_zero=True)
    # user tower
    user_features = build_input_features(user_feature_columns)
    user_inputs_list = list(user_features.values())
    user_sparse_embedding_list, user_dense_value_list = input_from_feature_columns(user_features,
                                                                                   user_feature_columns,
                                                                                   l2_reg_embedding, init_std, seed,
                                                                                   embedding_matrix_dict=embedding_matrix_dict)
    user_dnn_input = combined_dnn_input(user_sparse_embedding_list, user_dense_value_list)
    # item tower
    item_features = build_input_features(item_feature_columns)
    item_inputs_list = list(item_features.values())
    item_sparse_embedding_list, item_dense_value_list = input_from_feature_columns(item_features,
                                                                                   item_feature_columns,
                                                                                   l2_reg_embedding, init_std, seed,
                                                                                   embedding_matrix_dict=embedding_matrix_dict)
    item_dnn_input = combined_dnn_input(item_sparse_embedding_list, item_dense_value_list)

    user_dnn_out = DNN(user_dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout,
                       dnn_use_bn, seed)(user_dnn_input)
    item_dnn_out = DNN(item_dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout,
                       dnn_use_bn, seed)(item_dnn_input)

    score = Similarity(type=metric)([user_dnn_out, item_dnn_out])
    output = PredictionLayer("binary", False)(score)

    model = Model(inputs=user_inputs_list + item_inputs_list, outputs=output)
    model.__setattr__("user_input", user_inputs_list)
    model.__setattr__("item_input", item_inputs_list)
    model.__setattr__("user_embedding", user_dnn_out)
    model.__setattr__("item_embedding", item_dnn_out)
    return model
def split_and_encoder(x):
    key_ans = x.split('|')
    for key in key_ans:
        if key not in key2index:
            # Notice: input value 0 is a special "padding", so we do not use 0 to encode a valid key for the sequence input
            key2index[key] = len(key2index) + 1
    return list(map(lambda k: key2index[k], key_ans))
def split_and_encoder_pred(x):
    # At inference time we must not grow the training vocabulary: any unseen key is mapped to the
    # special index 0 (padding/unknown) without modifying key2index.
    key_ans = x.split('|')
    return [key2index.get(key, 0) for key in key_ans]
def label_encoder(old_str):
    encoder = {}
    for key in old_str:
        if key not in encoder:
            encoder[key] = len(encoder) + 1
    return encoder
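A quick illustration of the encoding convention these helpers implement (index 0 is reserved for padding and for values unseen at training time; the example values are made up):

key2index = {}
print(split_and_encoder('item_a|item_b|item_a'))   # [1, 2, 1]; key2index is now {'item_a': 1, 'item_b': 2}
print(split_and_encoder_pred('item_b|item_new'))   # [2, 0]; unseen keys fall back to the padding index 0
print(label_encoder(['M', 'F', 'M']))              # {'M': 1, 'F': 2}; unseen values are mapped to 0 at inference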
PySpark training and saving:
"""
获取SparkSession
"""
def get_spark_session(app_name="xxxxxxxxx"):
    spark_session = pyspark.sql.SparkSession.builder \
        .config('spark.driver.extraClassPath', 'xxxxxxxxxxxxxxx') \
        .config('spark.sql.parquet.compression.codec', 'none') \
        .config('spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation', 'true') \
        .config("spark.driver.memory", '8g') \
        .config("spark.executor.memory", '8g') \
        .config("spark.executor.cores", '4') \
        .config("spark.executor.instances", '40') \
        .config("spark.speculation", 'true') \
        .config("spark.kryoserializer.buffer.max", "2000m") \
        .config('spark.ui.showConsoleProgress', 'false') \
        .master("local[*]") \
        .appName(app_name) \
        .enableHiveSupport() \
        .getOrCreate()
    return spark_session
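Note that with .master("local[*]") Spark runs inside a single local process, so the executor instance/core settings above only take effect once the master points at a real cluster manager (e.g. yarn). A minimal usage sketch (the app name is illustrative):

spark = get_spark_session(app_name='dssm_train_demo')
print(spark.sparkContext.master)  # -> local[*]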
if __name__ == '__main__':
#print("1、加载数据")
spark_session = get_spark_session()
sql_train = 'SELECT * from ${tbname}'
df_train = spark_session.sql(sql_train)
dataset_train = df_train.toPandas()
dataset_train.columns = ['a','b',...]
dataset_train = shuffle(dataset_train)
# Get user and item features
user_fixlen_category_features = ['a','b',...]
user_fixlen_number_features = ['a','b',...]
user_varlen_category_features = ['a','b',...]
item_fixlen_category_features = ['a','b',...]
item_fixlen_number_features = ['a','b',...]
target = ['label']
# change the type of dense_features and target features
dataset_train[user_fixlen_number_features] = dataset_train[user_fixlen_number_features].astype('float')
dataset_train[item_fixlen_number_features] = dataset_train[item_fixlen_number_features].astype('float')
dataset_train[user_fixlen_category_features] = dataset_train[user_fixlen_category_features].fillna('-1', )
dataset_train[item_fixlen_category_features] = dataset_train[item_fixlen_category_features].fillna('-1', )
dataset_train[user_varlen_category_features] = dataset_train[user_varlen_category_features].fillna('-1', )
dataset_train[user_fixlen_number_features] = dataset_train[user_fixlen_number_features].fillna(0, )
dataset_train[item_fixlen_number_features] = dataset_train[item_fixlen_number_features].fillna(0, )
# 1.Label Encoding for category features and minmax normalizing for number features
lbe_dic = dict()
for feat in user_fixlen_category_features:
lbe = label_encoder(dataset_train[feat].values)
dataset_train[feat] = [lbe[x] if x in lbe else 0 for x in dataset_train[feat].values]
lbe_dic[feat] = lbe
for feat in item_fixlen_category_features:
lbe = label_encoder(dataset_train[feat].values)
dataset_train[feat] = [lbe[x] if x in lbe else 0 for x in dataset_train[feat].values]
lbe_dic[feat] = lbe
# minmax normalizing for number features
mms_user = MinMaxScaler(feature_range=(0,1))
dataset_train[user_fixlen_number_features] = mms_user.fit_transform(dataset_train[user_fixlen_number_features])
mms_item = MinMaxScaler(feature_range=(0,1))
dataset_train[item_fixlen_number_features] = mms_item.fit_transform(dataset_train[item_fixlen_number_features])
pro_train_data, pro_valid_data = train_test_split(dataset_train, test_size = 0.2, random_state = 2020)
# preprocess the sequence feature
key2index = {}
train_item_id_list = list(map(split_and_encoder, pro_train_data['xxx'].values))
train_item_id_length = np.array(list(map(len, train_item_id_list)))
train_max_len = max(train_item_id_length)
valid_item_id_list = list(map(split_and_encoder, pro_valid_data['xxx'].values))
valid_item_id_length = np.array(list(map(len, valid_item_id_list)))
valid_max_len = max(valid_item_id_length)
# Notice : padding=`post`
max_len = max([train_max_len, valid_max_len])
train_item_id_list = pad_sequences(train_item_id_list, maxlen=max_len, padding='post', )
valid_item_id_list = pad_sequences(valid_item_id_list, maxlen=max_len, padding='post', )
# 2.Count the itme frequency of each discrete feature and construct feature parameters
# for user part
user_fixlen_category_columns = [SparseFeat(feat, vocabulary_size=len(lbe_dic[feat]) + 2)
for feat in user_fixlen_category_features]
user_varlen_category_columns = [VarLenSparseFeat(SparseFeat('xxx', len(key2index) + 1,
), maxlen=max_len, combiner='mean')]
user_fixlen_number_columns = [DenseFeat(feat, 1,)for feat in user_fixlen_number_features]
user_fixlen_columns = user_fixlen_category_columns + user_fixlen_number_columns
user_feature_columns = user_fixlen_columns + user_varlen_category_columns
# for item part
item_fixlen_category_columns = [SparseFeat(feat, vocabulary_size=len(lbe_dic[feat]) + 2)
for feat in item_fixlen_category_features]
item_fixlen_number_columns = [DenseFeat(feat, 1,)for feat in item_fixlen_number_features]
item_feature_columns = item_fixlen_category_columns + item_fixlen_number_columns
# 3.generate input data for model
#data = np.array(data)
feature_names = get_feature_names(user_fixlen_columns + item_feature_columns)
train_model_input = {name:pro_train_data[name] for name in feature_names}#
train_model_input["xxxxx"] = train_item_id_list
valid_model_input = {name:pro_valid_data[name] for name in feature_names}
valid_model_input["xxxxx"] = valid_item_id_list
# Model compilation and training
'''
BATCH_SIZE = 512
train_input_ts = tf.data.Dataset.from_tensor_slices((train_model_input,pro_train_data[target])) \
.shuffle(buffer_size = 1024).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
valid_input_ts = tf.data.Dataset.from_tensor_slices((valid_model_input, pro_valid_data[target])) \
.batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
'''
model = DSSM(user_feature_columns, item_feature_columns, dnn_dropout = 0.5)
model.compile(optimizer= tf.keras.optimizers.Adam(0.0001), loss="binary_crossentropy", metrics=['accuracy', 'AUC'])
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='${LOG_DIR}', histogram_freq=1)
history = model.fit(train_model_input, pro_train_data[target], shuffle=True,
batch_size=64, epochs=10, verbose=2, validation_split=0.1,
use_multiprocessing=True, callbacks=[tensorboard_callback])
# save model
model.save_weights('xxxxxxxxx.ckpt')
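Optionally (not part of the original flow), the preprocessing artifacts can be persisted next to the checkpoint so the loading script does not have to re-read the training table and replay the exact same shuffle/split to rebuild them; the file name is illustrative:

import pickle
with open('xxxxxxxxx_preprocess.pkl', 'wb') as f:
    pickle.dump({'lbe_dic': lbe_dic, 'key2index': key2index, 'max_len': max_len,
                 'mms_user': mms_user, 'mms_item': mms_item}, f)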
Loading the model:
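The code below rebuilds the encoders, scalers and max_len by re-reading the training table and repeating the exact same shuffle and split. If the artifacts were pickled as sketched in the training section, they could simply be reloaded instead (illustrative file name):

import pickle
with open('xxxxxxxxx_preprocess.pkl', 'rb') as f:
    pre = pickle.load(f)
lbe_dic, key2index, max_len = pre['lbe_dic'], pre['key2index'], pre['max_len']
mms_user, mms_item = pre['mms_user'], pre['mms_item']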
spark_session = get_spark_session()
sql_train = 'SELECT * from ${tbname}'  # the original training table, read here only to rebuild dictionaries and feature columns, not to retrain the DSSM model
print(sql_train)
df_train = spark_session.sql(sql_train)
dataset_train = df_train.toPandas()
dataset_train.columns = ['a','b',...]
dataset_train = shuffle(dataset_train, random_state=2020)  # use the same seed as in the training script so the label encodings derived below are identical
# Get user and item features
user_fixlen_category_features = ['a','b',...]
user_fixlen_number_features = ['a','b',...]
user_varlen_category_features = ['a','b',...]
item_fixlen_category_features = ['a','b',...]
item_fixlen_number_features = ['a','b',...]
# change the type of dense_features and target features
dataset_train[user_fixlen_number_features] = dataset_train[user_fixlen_number_features].astype('float')
dataset_train[item_fixlen_number_features] = dataset_train[item_fixlen_number_features].astype('float')
dataset_train[user_fixlen_category_features] = dataset_train[user_fixlen_category_features].fillna('-1', )
dataset_train[item_fixlen_category_features] = dataset_train[item_fixlen_category_features].fillna('-1', )
dataset_train[user_varlen_category_features] = dataset_train[user_varlen_category_features].fillna('-1', )
dataset_train[user_fixlen_number_features] = dataset_train[user_fixlen_number_features].fillna(0, )
dataset_train[item_fixlen_number_features] = dataset_train[item_fixlen_number_features].fillna(0, )
# 1.Label Encoding for category features and minmax normalizing for number features
lbe_dic = dict()
for feat in user_fixlen_category_features:
    lbe = label_encoder(dataset_train[feat].values)
    dataset_train[feat] = [lbe[x] if x in lbe else 0 for x in dataset_train[feat].values]
    lbe_dic[feat] = lbe
for feat in item_fixlen_category_features:
    lbe = label_encoder(dataset_train[feat].values)
    dataset_train[feat] = [lbe[x] if x in lbe else 0 for x in dataset_train[feat].values]
    lbe_dic[feat] = lbe
# minmax normalizing for number features
mms_user = MinMaxScaler(feature_range=(0,1))
dataset_train[user_fixlen_number_features] = mms_user.fit_transform(dataset_train[user_fixlen_number_features])
mms_item = MinMaxScaler(feature_range=(0,1))
dataset_train[item_fixlen_number_features] = mms_item.fit_transform(dataset_train[item_fixlen_number_features])
pro_train_data, pro_valid_data = train_test_split(dataset_train, test_size = 0.2, random_state = 2020)
# preprocess the sequence feature
key2index = {}
train_item_id_list = list(map(split_and_encoder, pro_train_data['item_id_list'].values))
train_item_id_length = np.array(list(map(len, train_item_id_list)))
train_max_len = max(train_item_id_length)
valid_item_id_list = list(map(split_and_encoder, pro_valid_data['item_id_list'].values))
valid_item_id_length = np.array(list(map(len, valid_item_id_list)))
valid_max_len = max(valid_item_id_length)
# Notice : padding=`post`
max_len = max([train_max_len, valid_max_len])
train_item_id_list = pad_sequences(train_item_id_list, maxlen=max_len, padding='post', )
valid_item_id_list = pad_sequences(valid_item_id_list, maxlen=max_len, padding='post', )
# 2. Count the vocabulary size of each discrete feature and construct the feature columns
# for user part
user_fixlen_category_columns = [SparseFeat(feat, vocabulary_size=len(lbe_dic[feat]) + 2)
for feat in user_fixlen_category_features]
user_varlen_category_columns = [VarLenSparseFeat(SparseFeat('item_id_list', len(key2index) + 1,
), maxlen=max_len, combiner='mean')]
user_fixlen_number_columns = [DenseFeat(feat, 1,)for feat in user_fixlen_number_features]
user_fixlen_columns = user_fixlen_category_columns + user_fixlen_number_columns
user_feature_columns = user_fixlen_columns + user_varlen_category_columns
# for item part
item_fixlen_category_columns = [SparseFeat(feat, vocabulary_size=len(lbe_dic[feat]) + 2)
for feat in item_fixlen_category_features]
item_fixlen_number_columns = [DenseFeat(feat, 1,)for feat in item_fixlen_number_features]
item_feature_columns = item_fixlen_category_columns + item_fixlen_number_columns
# 3.generate input data for model
feature_names = get_feature_names(user_fixlen_columns + item_feature_columns)
del dataset_train
del pro_train_data
del pro_valid_data
# Load model
model = DSSM(user_feature_columns, item_feature_columns, dnn_dropout = 0.5)
model.load_weights('xxxxxxxxxx.ckpt')
Getting the user-side embeddings:
sql_pred = 'select * from ${tbname}'
print(sql_pred)
df_pred = spark_session.sql(sql_pred)
dataset_pred = df_pred.toPandas()
dataset_pred.columns = ['a','b',...]
user_id = list(dataset_pred['user_id'])
dataset_pred[user_fixlen_number_features] = dataset_pred[user_fixlen_number_features].astype('float')
dataset_pred[user_fixlen_category_features] = dataset_pred[user_fixlen_category_features].fillna('-1', )
dataset_pred[user_varlen_category_features] = dataset_pred[user_varlen_category_features].fillna('-1', )
dataset_pred[user_fixlen_number_features] = dataset_pred[user_fixlen_number_features].fillna(0, )
for feat in user_fixlen_category_features:
    lbe = lbe_dic[feat]
    dataset_pred[feat] = [lbe[x] if x in lbe else 0 for x in dataset_pred[feat].values]
dataset_pred[user_fixlen_number_features] = mms_user.transform(dataset_pred[user_fixlen_number_features])
test_item_id_list = list(map(split_and_encoder_pred, dataset_pred['item_id_list'].values))
test_item_id_list = pad_sequences(test_item_id_list, maxlen=max_len, padding='post', )
user_feature_names = get_feature_names(user_fixlen_columns)
test_user_model_input = {name:dataset_pred[name] for name in user_feature_names}
test_user_model_input["item_id_list"] = test_item_id_list
user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
user_embs = pd.DataFrame(user_embedding_model.predict(test_user_model_input, batch_size=64))
user_embs['user_id'] = user_id
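Since a SparkSession is already available, the user embeddings can be written back to a Hive table for downstream use. A sketch that packs the embedding into a single comma-separated string column (the target table name is illustrative):

user_embs_out = user_embs.copy()
emb_cols = [c for c in user_embs_out.columns if c != 'user_id']
user_embs_out['user_emb'] = user_embs_out[emb_cols].apply(lambda r: ','.join(map(str, r)), axis=1)
spark_session.createDataFrame(user_embs_out[['user_id', 'user_emb']]) \
    .write.mode('overwrite').saveAsTable('xxxxx_user_embedding')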
Getting the item-side embeddings:
sql_pred = 'select * from ${tbname}'
print(sql_pred)
df_pred = spark_session.sql(sql_pred)
dataset_pred = df_pred.toPandas()
dataset_pred.columns = ['a','b',...]
item_id = list(dataset_pred['item_id'])
dataset_pred[item_fixlen_number_features] = dataset_pred[item_fixlen_number_features].astype('float')
dataset_pred[item_fixlen_category_features] = dataset_pred[item_fixlen_category_features].fillna('-1', )
dataset_pred[item_fixlen_number_features] = dataset_pred[item_fixlen_number_features].fillna(0, )
for feat in item_fixlen_category_features:
    lbe = lbe_dic[feat]
    dataset_pred[feat] = [lbe[x] if x in lbe else 0 for x in dataset_pred[feat].values]
dataset_pred[item_fixlen_number_features] = mms_item.transform(dataset_pred[item_fixlen_number_features])
item_feature_names = get_feature_names(item_feature_columns)
test_item_model_input = {name:dataset_pred[name] for name in item_feature_names}
item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)
item_embs = pd.DataFrame(item_embedding_model.predict(test_item_model_input, batch_size=64))
item_embs['item_id'] = item_id
Once the user embeddings and item embeddings are obtained, they can be used for similarity retrieval or as features for training a downstream classification model.
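For the similarity-retrieval step, a minimal sketch with Faiss (see reference [2]). It L2-normalizes both sides so that inner-product search matches the cosine metric used in the model; the top-k value is illustrative:

import faiss
item_vecs = np.ascontiguousarray(item_embs.drop(columns=['item_id']).values, dtype='float32')
user_vecs = np.ascontiguousarray(user_embs.drop(columns=['user_id']).values, dtype='float32')
faiss.normalize_L2(item_vecs)
faiss.normalize_L2(user_vecs)
index = faiss.IndexFlatIP(item_vecs.shape[1])   # exact inner-product index
index.add(item_vecs)
scores, indices = index.search(user_vecs, 50)   # top-50 item candidates per user
top_item_ids = np.array(item_embs['item_id'])[indices]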
References:
[1] https://github.com/shenweichen/DeepCTR
[2] Faiss (Facebook's open-source library for efficient similarity search), study notes: https://blog.csdn.net/wuzhongqiang/article/details/109718827