PySpark + DSSM (Deep Structured Semantic Model): Extracting Embeddings

Background: the task is to retrieve, from a large pool of items, the ones a user is likely to like. For privacy reasons the specific features have been redacted from the code, but the whole pipeline runs end to end; in real use you can add early stopping, batch normalization, data sampling, and similar refinements as needed.

Environment: deepmatch==0.1.3, deepctr[GPU]==0.7.5, pyspark==2.4.0, keras

Model file:

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers


from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from keras.utils import plot_model
from keras import backend
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.preprocessing.sequence import pad_sequences

from deepctr.inputs import SparseFeat, VarLenSparseFeat, build_input_features, DenseFeat
from deepmatch.models import *

from deepctr.inputs import combined_dnn_input, create_embedding_matrix, get_feature_names
from deepctr.layers.core import PredictionLayer, DNN
from deepmatch.inputs import input_from_feature_columns
from deepmatch.layers.core import Similarity

from sklearn.metrics import accuracy_score

# numpy: print full arrays instead of truncating
np.set_printoptions(threshold=np.inf)

# pandas: show all columns and rows, with a wide display
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.width', 5000)

def DSSM(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(64, 32),
		 item_dnn_hidden_units=(64, 32),
		 dnn_activation='tanh', dnn_use_bn=False,
		 l2_reg_dnn=0, l2_reg_embedding=1e-8, dnn_dropout=0.8, init_std=0.0001, seed=1024, metric='cos'):
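	"""Two-tower DSSM: the user tower and the item tower each encode their
	features with a DNN, and the match score is the similarity between the two
	tower outputs (cosine by default), passed through a binary PredictionLayer."""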

	embedding_matrix_dict = create_embedding_matrix(user_feature_columns + item_feature_columns, l2_reg_embedding,
													init_std, seed,
													seq_mask_zero=True)

	user_features = build_input_features(user_feature_columns)
	user_inputs_list = list(user_features.values())
	user_sparse_embedding_list, user_dense_value_list = input_from_feature_columns(user_features,
																				   user_feature_columns,
																				   l2_reg_embedding, init_std, seed,
																				   embedding_matrix_dict=embedding_matrix_dict)
	user_dnn_input = combined_dnn_input(user_sparse_embedding_list, user_dense_value_list)
	item_features = build_input_features(item_feature_columns)
	item_inputs_list = list(item_features.values())
	item_sparse_embedding_list, item_dense_value_list = input_from_feature_columns(item_features,
																				   item_feature_columns,
																				   l2_reg_embedding, init_std, seed,
																				   embedding_matrix_dict=embedding_matrix_dict)
	item_dnn_input = combined_dnn_input(item_sparse_embedding_list, item_dense_value_list)

	user_dnn_out = DNN(user_dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout,
					   dnn_use_bn, seed, )(user_dnn_input)

	item_dnn_out = DNN(item_dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout,
					   dnn_use_bn, seed)(item_dnn_input)

	score = Similarity(type=metric)([user_dnn_out, item_dnn_out])

	output = PredictionLayer("binary", False)(score)

	model = Model(inputs=user_inputs_list + item_inputs_list, outputs=output)

	model.__setattr__("user_input", user_inputs_list)
	model.__setattr__("item_input", item_inputs_list)
	model.__setattr__("user_embedding", user_dnn_out)
	model.__setattr__("item_embedding", item_dnn_out)

	return model
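
As a quick sanity check, the model can be instantiated with a couple of made-up feature columns (the names and vocabulary sizes below are purely illustrative, not the redacted production features):

# Illustrative only: two toy feature columns per tower
toy_user_cols = [SparseFeat('user_cat', vocabulary_size=100), DenseFeat('user_num', 1)]
toy_item_cols = [SparseFeat('item_cat', vocabulary_size=50), DenseFeat('item_num', 1)]
toy_model = DSSM(toy_user_cols, toy_item_cols, dnn_dropout=0.5)
toy_model.summary()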


def split_and_encoder(x):
	key_ans = x.split('|')
	for key in key_ans:
		if key not in key2index:
			# Notice: input value 0 is reserved for padding, so valid keys are encoded starting from 1
			key2index[key] = len(key2index) + 1
	return list(map(lambda x: key2index[x], key_ans))

def split_and_encoder_pred(x):
	key_ans = x.split('|')
	for key in key_ans:
		if key not in key2index:
			# Notice: at prediction time, keys unseen during training are mapped to 0 (the padding index)
			key2index[key] = 0
	return list(map(lambda x: key2index[x], key_ans))

def label_encoder(old_str):
	# Build a value -> index mapping starting at 1; index 0 is left for values
	# unseen at prediction time.
	encoder = {}
	for key in old_str:
		if key not in encoder:
			encoder[key] = len(encoder) + 1
	return encoder 
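
A quick illustration of how these helpers behave (the keys 'a', 'b', 'c', 'x', 'y' are toy values):

key2index = {}
print(split_and_encoder('a|b|a'))      # [1, 2, 1]; key2index is now {'a': 1, 'b': 2}
print(split_and_encoder_pred('a|c'))   # [1, 0]; the unseen key 'c' falls back to 0
print(label_encoder(['x', 'y', 'x']))  # {'x': 1, 'y': 2}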

PySpark training and saving:

"""
	获取SparkSession
"""
def get_spark_session(app_name="xxxxxxxxx"):
	spark_session = pyspark.sql.SparkSession.builder \
		.config('spark.driver.extraClassPath', 'xxxxxxxxxxxxxxx') \
		.config('spark.sql.parquet.compression.codec', 'none') \
		.config('spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation', 'true') \
		.config("spark.driver.memory", '8g') \
		.config("spark.executor.memory", '8g') \
		.config("spark.executor.cores", '4') \
		.config("spark.executor.instances", '40') \
		.config("spark.speculation", 'true') \
		.config("spark.kryoserializer.buffer.max", "2000m") \
		.config('spark.ui.showConsoleProgress', 'false') \
		.master("local[*]") \
		.appName(app_name) \
		.enableHiveSupport() \
		.getOrCreate()
	return spark_session

if __name__ == '__main__':
	# 1. Load the training data
	spark_session = get_spark_session()
	sql_train = 'SELECT * from ${tbname}'
	df_train = spark_session.sql(sql_train)
	dataset_train = df_train.toPandas()
	dataset_train.columns = ['a','b',...]
	dataset_train = shuffle(dataset_train)

	# Get user and item features
	user_fixlen_category_features = ['a','b',...]
	user_fixlen_number_features = ['a','b',...]
	user_varlen_category_features = ['a','b',...]

	item_fixlen_category_features = ['a','b',...]
	item_fixlen_number_features = ['a','b',...]

	target = ['label']

	# change the type of dense_features and target features
	dataset_train[user_fixlen_number_features] = dataset_train[user_fixlen_number_features].astype('float')
	dataset_train[item_fixlen_number_features] = dataset_train[item_fixlen_number_features].astype('float')

	dataset_train[user_fixlen_category_features] = dataset_train[user_fixlen_category_features].fillna('-1', )
	dataset_train[item_fixlen_category_features] = dataset_train[item_fixlen_category_features].fillna('-1', )
	dataset_train[user_varlen_category_features] = dataset_train[user_varlen_category_features].fillna('-1', )

	dataset_train[user_fixlen_number_features] = dataset_train[user_fixlen_number_features].fillna(0, )
	dataset_train[item_fixlen_number_features] = dataset_train[item_fixlen_number_features].fillna(0, )


	# 1.Label Encoding for category features and minmax normalizing for number features
	lbe_dic = dict()
	for feat in user_fixlen_category_features:
		lbe = label_encoder(dataset_train[feat].values)
		dataset_train[feat] = [lbe[x] if x in lbe else 0 for x in dataset_train[feat].values]
		lbe_dic[feat] = lbe	

	for feat in item_fixlen_category_features:
		lbe = label_encoder(dataset_train[feat].values)
		dataset_train[feat] = [lbe[x] if x in lbe else 0 for x in dataset_train[feat].values]
		lbe_dic[feat] = lbe


	# minmax normalizing for number features
	mms_user = MinMaxScaler(feature_range=(0,1))
	dataset_train[user_fixlen_number_features] = mms_user.fit_transform(dataset_train[user_fixlen_number_features])
	mms_item = MinMaxScaler(feature_range=(0,1))
	dataset_train[item_fixlen_number_features] = mms_item.fit_transform(dataset_train[item_fixlen_number_features])

	
	pro_train_data, pro_valid_data = train_test_split(dataset_train, test_size = 0.2, random_state = 2020)

	# preprocess the sequence feature

	key2index = {}
	train_item_id_list = list(map(split_and_encoder, pro_train_data['xxx'].values))
	train_item_id_length = np.array(list(map(len, train_item_id_list)))
	train_max_len = max(train_item_id_length)
	
	valid_item_id_list = list(map(split_and_encoder, pro_valid_data['xxx'].values))
	valid_item_id_length = np.array(list(map(len, valid_item_id_list)))
	valid_max_len = max(valid_item_id_length)


	# Notice : padding=`post`
	max_len = max([train_max_len, valid_max_len])
	train_item_id_list = pad_sequences(train_item_id_list, maxlen=max_len, padding='post', )
	valid_item_id_list = pad_sequences(valid_item_id_list, maxlen=max_len, padding='post', )
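	# For illustration: with maxlen=3 and padding='post',
	# [[1, 2], [3]] becomes [[1, 2, 0], [3, 0, 0]].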

	# 2. Build the feature-column definitions for each discrete feature (vocabulary sizes come from the fitted encoders)

	# for user part 
	user_fixlen_category_columns = [SparseFeat(feat, vocabulary_size=len(lbe_dic[feat]) + 2)
						   for feat in user_fixlen_category_features] 
	user_varlen_category_columns = [VarLenSparseFeat(SparseFeat('xxx', len(key2index) + 1, 
										), maxlen=max_len, combiner='mean')]
	user_fixlen_number_columns = [DenseFeat(feat, 1,)for feat in user_fixlen_number_features]

	user_fixlen_columns = user_fixlen_category_columns +  user_fixlen_number_columns
	user_feature_columns = user_fixlen_columns + user_varlen_category_columns

	# for item part
	item_fixlen_category_columns = [SparseFeat(feat, vocabulary_size=len(lbe_dic[feat]) + 2)
						   for feat in item_fixlen_category_features]
	item_fixlen_number_columns = [DenseFeat(feat, 1,)for feat in item_fixlen_number_features]
	item_feature_columns = item_fixlen_category_columns + item_fixlen_number_columns

	# 3.generate input data for model
	#data = np.array(data)
	feature_names = get_feature_names(user_fixlen_columns + item_feature_columns)
	train_model_input = {name: pro_train_data[name] for name in feature_names}
	train_model_input["xxxxx"] = train_item_id_list

	valid_model_input = {name:pro_valid_data[name] for name in feature_names}
	valid_model_input["xxxxx"] = valid_item_id_list

	# Model compilation and training
	'''
	BATCH_SIZE = 512
	train_input_ts = tf.data.Dataset.from_tensor_slices((train_model_input,pro_train_data[target])) \
		.shuffle(buffer_size = 1024).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
	valid_input_ts = tf.data.Dataset.from_tensor_slices((valid_model_input, pro_valid_data[target])) \
		.batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
	'''

	model = DSSM(user_feature_columns, item_feature_columns, dnn_dropout = 0.5)
	model.compile(optimizer= tf.keras.optimizers.Adam(0.0001), loss="binary_crossentropy", metrics=['accuracy', 'AUC'])
	tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='${LOG_DIR}', histogram_freq=1)
	history = model.fit(train_model_input, pro_train_data[target], shuffle=True,
						batch_size=64, epochs=10, verbose=2, validation_split=0.1,
						use_multiprocessing=True, callbacks=[tensorboard_callback])

	# save model
	model.save_weights('xxxxxxxxx.ckpt')
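	# (Optional sketch, not part of the original pipeline.) Persist the fitted
	# preprocessing state next to the weights so that the prediction job does not
	# have to rebuild key2index / lbe_dic / the scalers from the training table.
	import pickle
	with open('preprocess_state.pkl', 'wb') as f:  # hypothetical file name
		pickle.dump({'key2index': key2index, 'lbe_dic': lbe_dic,
					 'mms_user': mms_user, 'mms_item': mms_item,
					 'max_len': max_len}, f)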

Loading the model:

	spark_session = get_spark_session()
	sql_train = 'SELECT * from ${tbname}'  # the original training table, read only to rebuild the encoders/scalers, not to retrain the DSSM model
	print(sql_train)
	df_train = spark_session.sql(sql_train)
	dataset_train = df_train.toPandas()
	dataset_train.columns = ['a','b',...]
	dataset_train = shuffle(dataset_train)

	# Get user and item features
	user_fixlen_category_features = ['a','b',...]
	user_fixlen_number_features = ['a','b',...]
	user_varlen_category_features = ['a','b',...]

	item_fixlen_category_features = ['a','b',...]
	item_fixlen_number_features = ['a','b',...]

	# change the type of dense_features and target features
	dataset_train[user_fixlen_number_features] = dataset_train[user_fixlen_number_features].astype('float')
	dataset_train[item_fixlen_number_features] = dataset_train[item_fixlen_number_features].astype('float')

	dataset_train[user_fixlen_category_features] = dataset_train[user_fixlen_category_features].fillna('-1', )
	dataset_train[item_fixlen_category_features] = dataset_train[item_fixlen_category_features].fillna('-1', )
	dataset_train[user_varlen_category_features] = dataset_train[user_varlen_category_features].fillna('-1', )

	dataset_train[user_fixlen_number_features] = dataset_train[user_fixlen_number_features].fillna(0, )
	dataset_train[item_fixlen_number_features] = dataset_train[item_fixlen_number_features].fillna(0, )

	# 1.Label Encoding for category features and minmax normalizing for number features
	lbe_dic = dict()
	for feat in user_fixlen_category_features:
		lbe = label_encoder(dataset_train[feat].values)
		dataset_train[feat] = [lbe[x] if x in lbe else 0 for x in dataset_train[feat].values]
		lbe_dic[feat] = lbe	

	for feat in item_fixlen_category_features:
		lbe = label_encoder(dataset_train[feat].values)
		dataset_train[feat] = [lbe[x] if x in lbe else 0 for x in dataset_train[feat].values]
		lbe_dic[feat] = lbe

	# minmax normalizing for number features
	mms_user = MinMaxScaler(feature_range=(0,1))
	dataset_train[user_fixlen_number_features] = mms_user.fit_transform(dataset_train[user_fixlen_number_features])
	mms_item = MinMaxScaler(feature_range=(0,1))
	dataset_train[item_fixlen_number_features] = mms_item.fit_transform(dataset_train[item_fixlen_number_features])

	pro_train_data, pro_valid_data = train_test_split(dataset_train, test_size = 0.2, random_state = 2020)

	# preprocess the sequence feature

	key2index = {}
	train_item_id_list = list(map(split_and_encoder, pro_train_data['item_id_list'].values))
	train_item_id_length = np.array(list(map(len, train_item_id_list)))
	train_max_len = max(train_item_id_length)
	
	valid_item_id_list = list(map(split_and_encoder, pro_valid_data['item_id_list'].values))
	valid_item_id_length = np.array(list(map(len, valid_item_id_list)))
	valid_max_len = max(valid_item_id_length)


	# Notice : padding=`post`
	max_len = max([train_max_len, valid_max_len])
	train_item_id_list = pad_sequences(train_item_id_list, maxlen=max_len, padding='post', )
	valid_item_id_list = pad_sequences(valid_item_id_list, maxlen=max_len, padding='post', )

	# 2. Build the feature-column definitions for each discrete feature (vocabulary sizes come from the fitted encoders)

	# for user part 
	user_fixlen_category_columns = [SparseFeat(feat, vocabulary_size=len(lbe_dic[feat]) + 2)
						   for feat in user_fixlen_category_features] 
	user_varlen_category_columns = [VarLenSparseFeat(SparseFeat('item_id_list', len(key2index) + 1, 
										), maxlen=max_len, combiner='mean')]
	user_fixlen_number_columns = [DenseFeat(feat, 1,)for feat in user_fixlen_number_features]

	user_fixlen_columns = user_fixlen_category_columns +  user_fixlen_number_columns
	user_feature_columns = user_fixlen_columns + user_varlen_category_columns

	# for item part
	item_fixlen_category_columns = [SparseFeat(feat, vocabulary_size=len(lbe_dic[feat]) + 2)
						   for feat in item_fixlen_category_features]
	item_fixlen_number_columns = [DenseFeat(feat, 1,)for feat in item_fixlen_number_features]
	item_feature_columns = item_fixlen_category_columns + item_fixlen_number_columns

	# 3.generate input data for model
	#data = np.array(data)
	feature_names = get_feature_names(user_fixlen_columns + item_feature_columns)

	del dataset_train
	del pro_train_data
	del pro_valid_data


	# Load model
	model = DSSM(user_feature_columns, item_feature_columns, dnn_dropout = 0.5)
	model.load_weights('xxxxxxxxxx.ckpt')
	

Getting the user-side embeddings:

	sql_pred = 'select * from ${tbname}'
	print(sql_pred)
	df_pred = spark_session.sql(sql_pred)
	dataset_pred = df_pred.toPandas()
	dataset_pred.columns = ['a','b',...]
	user_id = list(dataset_pred['user_id'])

	dataset_pred[user_fixlen_number_features] = dataset_pred[user_fixlen_number_features].astype('float')
	dataset_pred[user_fixlen_category_features] = dataset_pred[user_fixlen_category_features].fillna('-1', )
	dataset_pred[user_varlen_category_features] = dataset_pred[user_varlen_category_features].fillna('-1', )
	dataset_pred[user_fixlen_number_features] = dataset_pred[user_fixlen_number_features].fillna(0, )

	for feat in user_fixlen_category_features:
		lbe = lbe_dic[feat]
		dataset_pred[feat] = [lbe[x] if x in lbe else 0 for x in dataset_pred[feat].values]

	dataset_pred[user_fixlen_number_features] = mms_user.transform(dataset_pred[user_fixlen_number_features])

	test_item_id_list =  list(map(split_and_encoder_pred, dataset_pred['item_id_list'].values))
	test_item_id_list = pad_sequences(test_item_id_list, maxlen=max_len, padding='post', )

	user_feature_names = get_feature_names(user_fixlen_columns)
	test_user_model_input = {name:dataset_pred[name] for name in user_feature_names}
	test_user_model_input["item_id_list"] = test_item_id_list

	user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)

	user_embs = pd.DataFrame(user_embedding_model.predict(test_user_model_input, batch_size=64))
	user_embs['user_id'] = user_id
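	# (Sketch) Write the user embeddings back through Spark so downstream jobs can
	# read them; the embedding column names and target table name are illustrative.
	user_embs.columns = ['user_emb_' + str(i) for i in range(user_embs.shape[1] - 1)] + ['user_id']
	spark_session.createDataFrame(user_embs).write.mode('overwrite').saveAsTable('xxxxx_user_embedding')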

Getting the item-side embeddings:

	sql_pred = 'select * from ${tbname}'
	print(sql_pred)
	df_pred = spark_session.sql(sql_pred)
	dataset_pred = df_pred.toPandas()
	dataset_pred.columns = ['a','b',...]
	item_id = list(dataset_pred['item_id'])

	dataset_pred[item_fixlen_number_features] = dataset_pred[item_fixlen_number_features].astype('float')
	dataset_pred[item_fixlen_category_features] = dataset_pred[item_fixlen_category_features].fillna('-1', )
	dataset_pred[item_fixlen_number_features] = dataset_pred[item_fixlen_number_features].fillna(0, )

	for feat in item_fixlen_category_features:
		lbe = lbe_dic[feat]
		dataset_pred[feat] = [lbe[x] if x in lbe else 0 for x in dataset_pred[feat].values]

	dataset_pred[item_fixlen_number_features] = mms_item.transform(dataset_pred[item_fixlen_number_features])

	item_feature_names = get_feature_names(item_feature_columns)
	test_item_model_input = {name:dataset_pred[name] for name in item_feature_names}

	item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)

	item_embs = pd.DataFrame(item_embedding_model.predict(test_item_model_input, batch_size=64))
	item_embs['item_id'] = item_id

Once the user and item embeddings are available, you can run similarity-based retrieval over them or use them as features to train a downstream classification model.
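
For example, a minimal similarity-search sketch with Faiss [2] (assuming faiss is installed; the index is built over the item embeddings and queried with the user embeddings):

	import faiss
	import numpy as np

	# Drop the id columns and convert to float32, which Faiss expects
	item_vecs = np.ascontiguousarray(item_embs.drop(columns=['item_id']).values.astype('float32'))
	user_vecs = np.ascontiguousarray(user_embs.drop(columns=['user_id']).values.astype('float32'))

	# L2-normalize so that inner product equals cosine similarity (the DSSM metric)
	faiss.normalize_L2(item_vecs)
	faiss.normalize_L2(user_vecs)

	index = faiss.IndexFlatIP(item_vecs.shape[1])
	index.add(item_vecs)
	scores, topk_items = index.search(user_vecs, 10)  # top-10 item rows per user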

References:

[1] https://github.com/shenweichen/DeepCTR

[2] Faiss (Facebook's open-source library for efficient similarity search), notes: https://blog.csdn.net/wuzhongqiang/article/details/109718827
