PySpark + DSSM for Training a Large-Scale Personalized In-Game Item Recommendation Model

Background: the goal is to retrieve, from a large pool of in-game items, the ones a given user is likely to prefer. For privacy reasons the specific features used have been masked in the code, but the full pipeline runs end to end; in practice you can add early stopping, batch normalization (BN), data sampling and similar operations as needed.
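
For example, early stopping can be wired in with a standard Keras callback; the monitored metric and patience values below are illustrative, not part of the original setup:

    # A minimal early-stopping sketch (illustrative values); pass it to model.fit via callbacks=[...]
    import tensorflow as tf

    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',            # stop when the validation loss stops improving
        patience=2,                    # allow 2 epochs without improvement before stopping
        restore_best_weights=True)     # roll back to the best weights seen during training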

Environment: deepmatch==0.1.3, deepctr[GPU]==0.7.5, pyspark==2.4.0, keras

Model file:

    import pandas as pd
    import numpy as np
    import tensorflow as tf
    from tensorflow import keras
    from tensorflow.keras import layers
     
     
    from sklearn.utils import shuffle
    from sklearn.model_selection import train_test_split
    from keras.utils import plot_model
    from keras import backend
    from sklearn.preprocessing import LabelEncoder, MinMaxScaler
    from tensorflow.python.keras.models import Model
    from tensorflow.python.keras.preprocessing.sequence import pad_sequences
     
    from deepctr.inputs import SparseFeat, VarLenSparseFeat, build_input_features, DenseFeat
    from deepmatch.models import *
     
    from deepctr.inputs import combined_dnn_input, create_embedding_matrix, get_feature_names
    from deepctr.layers.core import PredictionLayer, DNN
    from deepmatch.inputs import input_from_feature_columns
    from deepmatch.layers.core import Similarity
     
    from sklearn.metrics import accuracy_score
     
    # numpy
    np.set_printoptions(threshold=np.inf)
     
    # pandas
    pd.set_option('display.max_columns', None)
    pd.set_option('display.max_rows', None)
    pd.set_option('display.width', 5000)
     
    def DSSM(user_feature_columns, item_feature_columns, user_dnn_hidden_units=(64, 32),
             item_dnn_hidden_units=(64, 32),
             dnn_activation='tanh', dnn_use_bn=False,
             l2_reg_dnn=0, l2_reg_embedding=1e-8, dnn_dropout=0.8, init_std=0.0001, seed=1024, metric='cos'):
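        """Standard two-tower DSSM: user and item features are embedded and fed through
        separate DNN towers; the similarity of the two tower outputs (cosine by default)
        is passed through a sigmoid to produce the click probability. The tower inputs and
        outputs are also attached to the returned model as user_input / item_input /
        user_embedding / item_embedding, so that user- and item-side sub-models can be
        built for inference (see the sections below)."""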
     
        embedding_matrix_dict = create_embedding_matrix(user_feature_columns + item_feature_columns, l2_reg_embedding,
                                                        init_std, seed,
                                                        seq_mask_zero=True)
     
        user_features = build_input_features(user_feature_columns)
        user_inputs_list = list(user_features.values())
        user_sparse_embedding_list, user_dense_value_list = input_from_feature_columns(user_features,
                                                                                       user_feature_columns,
                                                                                       l2_reg_embedding, init_std, seed,
                                                                                       embedding_matrix_dict=embedding_matrix_dict)
        user_dnn_input = combined_dnn_input(user_sparse_embedding_list, user_dense_value_list)
        item_features = build_input_features(item_feature_columns)
        item_inputs_list = list(item_features.values())
        item_sparse_embedding_list, item_dense_value_list = input_from_feature_columns(item_features,
                                                                                       item_feature_columns,
                                                                                       l2_reg_embedding, init_std, seed,
                                                                                       embedding_matrix_dict=embedding_matrix_dict)
        item_dnn_input = combined_dnn_input(item_sparse_embedding_list, item_dense_value_list)
     
        user_dnn_out = DNN(user_dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout,
                           dnn_use_bn, seed, )(user_dnn_input)
     
        item_dnn_out = DNN(item_dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout,
                           dnn_use_bn, seed)(item_dnn_input)
     
        score = Similarity(type=metric)([user_dnn_out, item_dnn_out])
     
        output = PredictionLayer("binary", False)(score)
     
        model = Model(inputs=user_inputs_list + item_inputs_list, outputs=output)
     
        model.__setattr__("user_input", user_inputs_list)
        model.__setattr__("item_input", item_inputs_list)
        model.__setattr__("user_embedding", user_dnn_out)
        model.__setattr__("item_embedding", item_dnn_out)
     
        return model
     
     
    def split_and_encoder(x):
        key_ans = x.split('|')
        for key in key_ans:
            if key not in key2index:
                # Note: input value 0 is reserved for padding, so 0 is never used to encode a valid key in the sequence feature
                key2index[key] = len(key2index) + 1
        return list(map(lambda x: key2index[x], key_ans))
     
    def split_and_encoder_pred(x):
        key_ans = x.split('|')
        for key in key_ans:
            if key not in key2index:
                # At prediction time, keys that were not seen during training are mapped to 0 (the padding/unknown index)
                key2index[key] = 0
        return list(map(lambda x: key2index[x], key_ans))
     
    def label_encoder(old_str):
        encoder = {}
        for key in old_str:
            if key not in encoder:
                encoder[key] = len(encoder) + 1
        return encoder
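
As a quick illustration of how these helpers behave (hypothetical keys, not real features): label_encoder assigns indices starting from 1, split_and_encoder grows the shared key2index dictionary during training, and split_and_encoder_pred maps keys unseen at training time to 0.

    # Illustrative usage of the encoder helpers (hypothetical values)
    key2index = {}
    print(split_and_encoder('a|b|a'))      # [1, 2, 1]   -> key2index == {'a': 1, 'b': 2}
    print(split_and_encoder_pred('a|c'))   # [1, 0]      -> the unseen key 'c' is mapped to 0

    enc = label_encoder(['red', 'blue', 'red'])
    print(enc)                             # {'red': 1, 'blue': 2}; unseen values later fall back to 0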

PySpark training and saving the model:

    """
        获取SparkSession
    """
    def get_spark_session(app_name="xxxxxxxxx"):
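        # Note: master('local[*]') runs Spark locally; the executor.* settings below only
        # take effect when the job is actually submitted to a cluster (e.g. on YARN).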
        spark_session = pyspark.sql.SparkSession.builder \
            .config('spark.driver.extraClassPath', 'xxxxxxxxxxxxxxx') \
            .config('spark.sql.parquet.compression.codec', 'none') \
            .config('spark.sql.legacy.allowCreatingManagedTableUsingNonemptyLocation', 'true') \
            .config("spark.driver.memory", '8g') \
            .config("spark.executor.memory", '8g') \
            .config("spark.executor.cores", '4') \
            .config("spark.executor.instances", '40') \
            .config("spark.speculation", 'true') \
            .config("spark.kryoserializer.buffer.max", "2000m") \
            .config('spark.ui.showConsoleProgress', 'false') \
            .master("local[*]") \
            .appName(app_name) \
            .enableHiveSupport() \
            .getOrCreate()
        return spark_session
     
    if __name__ == '__main__':
        # print("1. Load the data")
        spark_session = get_spark_session()
        sql_train = 'SELECT * from ${tbname}'
        df_train = spark_session.sql(sql_train)
        dataset_train = df_train.toPandas()
        dataset_train.columns = ['a','b',...]
        dataset_train = shuffle(dataset_train)
     
        # Get user and item features
        user_fixlen_category_features = ['a','b',...]
        user_fixlen_number_features = ['a','b',...]
        user_varlen_category_features = ['a','b',...]
     
        item_fixlen_category_features = ['a','b',...]
        item_fixlen_number_features = ['a','b',...]
     
        target = ['label']
     
        # change the type of dense_features and target features
        dataset_train[user_fixlen_number_features] = dataset_train[user_fixlen_number_features].astype('float')
        dataset_train[item_fixlen_number_features] = dataset_train[item_fixlen_number_features].astype('float')
     
        dataset_train[user_fixlen_category_features] = dataset_train[user_fixlen_category_features].fillna('-1', )
        dataset_train[item_fixlen_category_features] = dataset_train[item_fixlen_category_features].fillna('-1', )
        dataset_train[user_varlen_category_features] = dataset_train[user_varlen_category_features].fillna('-1', )
     
        dataset_train[user_fixlen_number_features] = dataset_train[user_fixlen_number_features].fillna(0, )
        dataset_train[item_fixlen_number_features] = dataset_train[item_fixlen_number_features].fillna(0, )
     
     
        # 1.Label Encoding for category features and minmax normalizing for number features
        lbe_dic = dict()
        for feat in user_fixlen_category_features:
            lbe = label_encoder(dataset_train[feat].values)
            dataset_train[feat] = [lbe[x] if x in lbe else 0 for x in dataset_train[feat].values]
            lbe_dic[feat] = lbe    
     
        for feat in item_fixlen_category_features:
            lbe = label_encoder(dataset_train[feat].values)
            dataset_train[feat] = [lbe[x] if x in lbe else 0 for x in dataset_train[feat].values]
            lbe_dic[feat] = lbe
     
     
        # minmax normalizing for number features
        mms_user = MinMaxScaler(feature_range=(0,1))
        dataset_train[user_fixlen_number_features] = mms_user.fit_transform(dataset_train[user_fixlen_number_features])
        mms_item = MinMaxScaler(feature_range=(0,1))
        dataset_train[item_fixlen_number_features] = mms_item.fit_transform(dataset_train[item_fixlen_number_features])
     
        
        pro_train_data, pro_valid_data = train_test_split(dataset_train, test_size = 0.2, random_state = 2020)
     
        # preprocess the sequence feature
     
        key2index = {}
        train_item_id_list = list(map(split_and_encoder, pro_train_data['item_id_list'].values))
        train_item_id_length = np.array(list(map(len, train_item_id_list)))
        train_max_len = max(train_item_id_length)
        
        valid_item_id_list = list(map(split_and_encoder, pro_valid_data['item_id_list'].values))
        valid_item_id_length = np.array(list(map(len, valid_item_id_list)))
        valid_max_len = max(valid_item_id_length)
     
     
        # Notice : padding=`post`
        max_len = max([train_max_len, valid_max_len])
        train_item_id_list = pad_sequences(train_item_id_list, maxlen=max_len, padding='post', )
        valid_item_id_list = pad_sequences(valid_item_id_list, maxlen=max_len, padding='post', )
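        # (Optional, not in the original flow) the fitted preprocessing artifacts could be
        # persisted here so the inference job can reload them instead of recomputing them
        # from the training table, e.g. with pickle:
        # import pickle
        # with open('preprocess_artifacts.pkl', 'wb') as f:
        #     pickle.dump({'lbe_dic': lbe_dic, 'key2index': key2index, 'max_len': max_len,
        #                  'mms_user': mms_user, 'mms_item': mms_item}, f)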
     
        # 2. Count the vocabulary size of each categorical feature and construct the feature columns
     
        # for user part
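        # vocabulary_size = len(encoder) + 2: label_encoder assigns 1..N to seen values,
        # 0 is used for missing/unseen values, plus one spare slot as headroom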
        user_fixlen_category_columns = [SparseFeat(feat, vocabulary_size=len(lbe_dic[feat]) + 2)
                               for feat in user_fixlen_category_features]
        user_varlen_category_columns = [VarLenSparseFeat(SparseFeat('item_id_list', len(key2index) + 1,
                                            ), maxlen=max_len, combiner='mean')]
        user_fixlen_number_columns = [DenseFeat(feat, 1,)for feat in user_fixlen_number_features]
     
        user_fixlen_columns = user_fixlen_category_columns +  user_fixlen_number_columns
        user_feature_columns = user_fixlen_columns + user_varlen_category_columns
     
        # for item part
        item_fixlen_category_columns = [SparseFeat(feat, vocabulary_size=len(lbe_dic[feat]) + 2)
                               for feat in item_fixlen_category_features]
        item_fixlen_number_columns = [DenseFeat(feat, 1,)for feat in item_fixlen_number_features]
        item_feature_columns = item_fixlen_category_columns + item_fixlen_number_columns
     
        # 3.generate input data for model
        #data = np.array(data)
        feature_names = get_feature_names(user_fixlen_columns + item_feature_columns)
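        # The variable-length sequence feature is not included in feature_names above,
        # so its padded array is added to the input dictionaries explicitly.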
        train_model_input = {name: pro_train_data[name] for name in feature_names}
        train_model_input["item_id_list"] = train_item_id_list
     
        valid_model_input = {name:pro_valid_data[name] for name in feature_names}
        valid_model_input["xxxxx"] = valid_item_id_list
     
        # Model compilation and training
        '''
        BATCH_SIZE = 512
        train_input_ts = tf.data.Dataset.from_tensor_slices((train_model_input,pro_train_data[target])) \
            .shuffle(buffer_size = 1024).batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
        valid_input_ts = tf.data.Dataset.from_tensor_slices((valid_model_input, pro_valid_data[target])) \
            .batch(BATCH_SIZE).prefetch(tf.data.experimental.AUTOTUNE)
        '''
     
        model = DSSM(user_feature_columns, item_feature_columns, dnn_dropout = 0.5)
        model.compile(optimizer= tf.keras.optimizers.Adam(0.0001), loss="binary_crossentropy", metrics=['accuracy', 'AUC'])
        tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir='${LOG_DIR}', histogram_freq=1)
        history = model.fit(train_model_input, pro_train_data[target], shuffle=True,
                            batch_size=64, epochs=10, verbose=2, validation_split=0.1,
                            use_multiprocessing=True, callbacks=[tensorboard_callback])
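        # Alternatively, the held-out split prepared above could be passed explicitly via
        # validation_data=(valid_model_input, pro_valid_data[target]) instead of validation_split.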
     
        # save model
        model.save_weights('xxxxxxxxx.ckpt')
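
Note that save_weights only stores the model parameters, not the architecture, so the inference job below must rebuild the same DSSM with identical feature columns (the same feature names, vocabulary sizes and max_len) before calling load_weights.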

Loading the model:

        spark_session = get_spark_session()
        sql_train = 'SELECT * from ${tbname}'  # the original training samples, used here only to rebuild the encoders/dictionaries, not to retrain the DSSM model
        print(sql_train)
        df_train = spark_session.sql(sql_train)
        dataset_train = df_train.toPandas()
        dataset_train.columns = ['a','b',...]
        dataset_train = shuffle(dataset_train)
        # Get user and item features
        #user_fixlen_category_features = ['a','b',...]
        user_fixlen_category_features = ['a','b',...]
        user_fixlen_number_features = ['a','b',...]
        user_varlen_category_features = ['a','b',...]
     
        item_fixlen_category_features = ['a','b',...]
        item_fixlen_number_features = ['a','b',...]
        # change the type of dense_features and target features
        dataset_train[user_fixlen_number_features] = dataset_train[user_fixlen_number_features].astype('float')
        dataset_train[item_fixlen_number_features] = dataset_train[item_fixlen_number_features].astype('float')
     
        dataset_train[user_fixlen_category_features] = dataset_train[user_fixlen_category_features].fillna('-1', )
        dataset_train[item_fixlen_category_features] = dataset_train[item_fixlen_category_features].fillna('-1', )
        dataset_train[user_varlen_category_features] = dataset_train[user_varlen_category_features].fillna('-1', )
     
        dataset_train[user_fixlen_number_features] = dataset_train[user_fixlen_number_features].fillna(0, )
        dataset_train[item_fixlen_number_features] = dataset_train[item_fixlen_number_features].fillna(0, )
        # 1.Label Encoding for category features and minmax normalizing for number features
        lbe_dic = dict()
        for feat in user_fixlen_category_features:
            lbe = label_encoder(dataset_train[feat].values)
            dataset_train[feat] = [lbe[x] if x in lbe else 0 for x in dataset_train[feat].values]
            lbe_dic[feat] = lbe    
     
        for feat in item_fixlen_category_features:
            lbe = label_encoder(dataset_train[feat].values)
            dataset_train[feat] = [lbe[x] if x in lbe else 0 for x in dataset_train[feat].values]
            lbe_dic[feat] = lbe
        # minmax normalizing for number features
        mms_user = MinMaxScaler(feature_range=(0,1))
        dataset_train[user_fixlen_number_features] = mms_user.fit_transform(dataset_train[user_fixlen_number_features])
        mms_item = MinMaxScaler(feature_range=(0,1))
        dataset_train[item_fixlen_number_features] = mms_item.fit_transform(dataset_train[item_fixlen_number_features])
     
        pro_train_data, pro_valid_data = train_test_split(dataset_train, test_size = 0.2, random_state = 2020)
        # preprocess the sequence feature
     
        key2index = {}
        train_item_id_list = list(map(split_and_encoder, pro_train_data['item_id_list'].values))
        train_item_id_length = np.array(list(map(len, train_item_id_list)))
        train_max_len = max(train_item_id_length)
        
        valid_item_id_list = list(map(split_and_encoder, pro_valid_data['item_id_list'].values))
        valid_item_id_length = np.array(list(map(len, valid_item_id_list)))
        valid_max_len = max(valid_item_id_length)
     
        # Notice : padding=`post`
        max_len = max([train_max_len, valid_max_len])
        train_item_id_list = pad_sequences(train_item_id_list, maxlen=max_len, padding='post', )
        valid_item_id_list = pad_sequences(valid_item_id_list, maxlen=max_len, padding='post', )
        # 2. Count the vocabulary size of each categorical feature and construct the feature columns
        # for user part
        user_fixlen_category_columns = [SparseFeat(feat, vocabulary_size=len(lbe_dic[feat]) + 2)
                               for feat in user_fixlen_category_features]
        user_varlen_category_columns = [VarLenSparseFeat(SparseFeat('item_id_list', len(key2index) + 1,
                                            ), maxlen=max_len, combiner='mean')]
        user_fixlen_number_columns = [DenseFeat(feat, 1,)for feat in user_fixlen_number_features]
     
        user_fixlen_columns = user_fixlen_category_columns +  user_fixlen_number_columns
        user_feature_columns = user_fixlen_columns + user_varlen_category_columns
        # for item part
        item_fixlen_category_columns = [SparseFeat(feat, vocabulary_size=len(lbe_dic[feat]) + 2)
                               for feat in item_fixlen_category_features]
        item_fixlen_number_columns = [DenseFeat(feat, 1,)for feat in item_fixlen_number_features]
        item_feature_columns = item_fixlen_category_columns + item_fixlen_number_columns
        # 3.generate input data for model
        #data = np.array(data)
        feature_names = get_feature_names(user_fixlen_columns + item_feature_columns)
     
        del dataset_train
        del pro_train_data
        del pro_valid_data
     
        # Load model
        model = DSSM(user_feature_columns, item_feature_columns, dnn_dropout = 0.5)
        model.load_weights('xxxxxxxxxx.ckpt')
        

Getting the user-side embeddings:

        sql_pred = 'select * from ${tbname}'
        print(sql_pred)
        df_pred = spark_session.sql(sql_pred)
        dataset_pred = df_pred.toPandas()
        dataset_pred.columns = ['a','b',...]
        user_id = list(dataset_pred['user_id'])
     
        dataset_pred[user_fixlen_number_features] = dataset_pred[user_fixlen_number_features].astype('float')
        dataset_pred[user_fixlen_category_features] = dataset_pred[user_fixlen_category_features].fillna('-1', )
        dataset_pred[user_varlen_category_features] = dataset_pred[user_varlen_category_features].fillna('-1', )
        dataset_pred[user_fixlen_number_features] = dataset_pred[user_fixlen_number_features].fillna(0, )
     
        for feat in user_fixlen_category_features:
            lbe = lbe_dic[feat]
            dataset_pred[feat] = [lbe[x] if x in lbe else 0 for x in dataset_pred[feat].values]
     
        dataset_pred[user_fixlen_number_features] = mms_user.transform(dataset_pred[user_fixlen_number_features])
     
        test_item_id_list =  list(map(split_and_encoder_pred, dataset_pred['item_id_list'].values))
        test_item_id_list = pad_sequences(test_item_id_list, maxlen=max_len, padding='post', )
     
        user_feature_names = get_feature_names(user_fixlen_columns)
        test_user_model_input = {name:dataset_pred[name] for name in user_feature_names}
        test_user_model_input["item_id_list"] = test_item_id_list
     
        user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
     
        user_embs = pd.DataFrame(user_embedding_model.predict(test_user_model_input, batch_size=64))
        user_embs['user_id'] = user_id

Getting the item-side embeddings:

        sql_pred = 'select * from ${tbname}'
        print(sql_pred)
        df_pred = spark_session.sql(sql_pred)
        dataset_pred = df_pred.toPandas()
        dataset_pred.columns = ['a','b',...]
        item_id = list(dataset_pred['item_id'])
     
        dataset_pred[item_fixlen_number_features] = dataset_pred[item_fixlen_number_features].astype('float')
        dataset_pred[item_fixlen_category_features] = dataset_pred[item_fixlen_category_features].fillna('-1', )
        dataset_pred[item_fixlen_number_features] = dataset_pred[item_fixlen_number_features].fillna(0, )
     
        for feat in item_fixlen_category_features:
            lbe = lbe_dic[feat]
            dataset_pred[feat] = [lbe[x] if x in lbe else 0 for x in dataset_pred[feat].values]
     
        dataset_pred[item_fixlen_number_features] = mms_item.transform(dataset_pred[item_fixlen_number_features])
     
        item_feature_names = get_feature_names(item_feature_columns)
        test_item_model_input = {name:dataset_pred[name] for name in item_feature_names}
     
        item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)
     
        item_embs = pd.DataFrame(item_embedding_model.predict(test_item_model_input, batch_size=64))
        item_embs['item_id'] = item_id

Once the user embeddings and item embeddings have been obtained, they can be used for similarity-based retrieval or as input features for training a downstream classification model.
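
As a rough sketch of the retrieval step, assuming the Faiss library from reference [2] is available (the top-k value and the exact-search index choice are illustrative):

    # Minimal similarity-retrieval sketch with Faiss (illustrative, not from the original code).
    # user_embs / item_embs are the DataFrames produced above, with the id stored in the last column.
    import faiss
    import numpy as np

    item_vecs = np.ascontiguousarray(item_embs.drop(columns=['item_id']).values, dtype='float32')
    user_vecs = np.ascontiguousarray(user_embs.drop(columns=['user_id']).values, dtype='float32')

    # L2-normalize so that inner product equals cosine similarity (matching metric='cos' in DSSM)
    faiss.normalize_L2(item_vecs)
    faiss.normalize_L2(user_vecs)

    index = faiss.IndexFlatIP(item_vecs.shape[1])     # exact inner-product index over item vectors
    index.add(item_vecs)

    scores, positions = index.search(user_vecs, 10)   # top-10 candidate items per user
    # positions[i] are row positions into item_embs; map them back to ids via item_embs['item_id']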

References:

[1] https://github.com/shenweichen/DeepCTR

[2] Faiss (Facebook's open-source library for efficient similarity search), study notes: https://blog.csdn.net/wuzhongqiang/article/details/109718827
