Implementing a Two-Tower Recommendation Algorithm with the TensorFlow Estimator API

The complete code for this article is available at: https://github.com/cdj0311/two_tower_recommendation_system

Distributed training with the TensorFlow Estimator API is straightforward: shard the data appropriately, feed it to the model, and the framework handles the rest. The code below is a complete recommendation-algorithm template; modify the data-reading and model-structure parts to fit your own needs. It targets tensorflow==1.13.1.
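
When running on a cluster, tf.estimator.train_and_evaluate reads the cluster specification from the TF_CONFIG environment variable (most submission platforms set it for you). As a minimal sketch, assuming one parameter server and two workers, one worker's TF_CONFIG might look like this (all hosts, ports, and the task index are illustrative placeholders):

# Sketch only: TF_CONFIG for one worker in an Estimator-based distributed job.
import json, os

os.environ["TF_CONFIG"] = json.dumps({
    "cluster": {
        "chief": ["chief0.example.com:2222"],
        "ps": ["ps0.example.com:2222"],
        "worker": ["worker0.example.com:2222", "worker1.example.com:2222"]
    },
    "task": {"type": "worker", "index": 0}
})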

1. Feature processing, feature_processing.py

#coding:utf-8
import tensorflow as tf
from tensorflow import feature_column as fc
import config

FLAGS = config.FLAGS

class FeatureConfig(object):
    def __init__(self):
        self.user_columns = dict()
        self.item_columns = dict()
        self.feature_spec = dict()

    def create_features_columns(self):
        # Vector features (pre-computed embeddings)
        user_vector = fc.numeric_column(key="user_vector", shape=(128,), default_value=[0.0] * 128, dtype=tf.float32)
        item_vector = fc.numeric_column(key="item_vector", shape=(128,), default_value=[0.0] * 128, dtype=tf.float32)
        
        # Bucketized feature
        age = fc.numeric_column(key="age", shape=(1,), default_value=[0], dtype=tf.int64)
        age = fc.bucketized_column(age, boundaries=[0, 10, 20, 30, 40, 50, 60, 70, 80])
        age = fc.embedding_column(age, dimension=32, combiner='mean')
        
        # Categorical (identity) feature
        city = fc.categorical_column_with_identity(key="city", num_buckets=1000, default_value=0)
        city = fc.embedding_column(city, dimension=32, combiner='mean')
        
        # Hashed features
        device_id = fc.categorical_column_with_hash_bucket(key="device_id", 
                    hash_bucket_size=1000000, dtype=tf.int64)
        device_id = fc.embedding_column(device_id, dimension=32, combiner='mean')

        item_id = fc.categorical_column_with_hash_bucket(key="item_id", 
                    hash_bucket_size=1000000, dtype=tf.int64)
        item_id = fc.embedding_column(item_id, dimension=32, combiner='mean')
        
        self.user_columns["user_vector"] = user_vector
        self.user_columns["age"] = age
        self.user_columns["city"] = city
        self.user_columns["device_id"] = device_id
        self.item_columns["item_vector"] = item_vector
        self.item_columns["item_id"] = item_id

        self.feature_spec = tf.feature_column.make_parse_example_spec(
            list(self.user_columns.values()) + list(self.item_columns.values()))

        return self
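
As a quick sanity check of these columns (not part of the original code), the user-side features can be pushed through tf.feature_column.input_layer; the feature values below are arbitrary placeholders:

# Sketch only: verify the user-side columns produce a dense tensor.
import tensorflow as tf
from feature_processing import FeatureConfig

feature_configs = FeatureConfig().create_features_columns()
features = {
    "user_vector": tf.constant([[0.1] * 128]),
    "age": tf.constant([[25]], dtype=tf.int64),
    "city": tf.constant([[3]], dtype=tf.int64),
    "device_id": tf.constant([[123456]], dtype=tf.int64),
}
user_input = tf.feature_column.input_layer(features, list(feature_configs.user_columns.values()))
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    print(sess.run(user_input).shape)  # (1, 128 + 32 + 32 + 32)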

2. Model, base_model.py

# coding:utf-8
import tensorflow as tf
from tensorflow import feature_column as fc
import config
import feature_processing as fe

FLAGS = config.FLAGS


def build_user_model(features, mode, params):
    # Feature inputs
    feature_inputs = []
    for key, value in params["feature_configs"].user_columns.items():
        feature_inputs.append(tf.feature_column.input_layer(features, value))
    # Concatenate features
    net = tf.concat(feature_inputs, axis=1)
    # Fully-connected layers
    for idx, units in enumerate(params["hidden_units"]):
        net = tf.layers.dense(net, units=units, activation=tf.nn.leaky_relu, name="user_fc_layer_%s" % idx)
        net = tf.layers.dropout(net, 0.4, training=(mode == tf.estimator.ModeKeys.TRAIN))
    # Output layer
    net = tf.layers.dense(net, units=128, name="user_output_layer")
    return net

def build_item_model(features, mode, params):
    # Feature inputs
    feature_inputs = []
    for key, value in params["feature_configs"].item_columns.items():
        feature_inputs.append(tf.feature_column.input_layer(features, value))
    # Concatenate features
    net = tf.concat(feature_inputs, axis=1)
    # Fully-connected layers
    for idx, units in enumerate(params["hidden_units"]):
        net = tf.layers.dense(net, units=units, activation=tf.nn.leaky_relu, name="item_fc_layer_%s" % idx)
        net = tf.layers.dropout(net, 0.4, training=(mode == tf.estimator.ModeKeys.TRAIN))
    # Output layer
    net = tf.layers.dense(net, units=128, name="item_output_layer")
    return net

def model_fn(features, labels, mode, params):
    user_net = build_user_model(features, mode, params)
    item_net = build_item_model(features, mode, params)
    dot = tf.reduce_sum(tf.multiply(user_net, item_net), axis=1, keepdims=True)
    pred = tf.sigmoid(dot)
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode, predictions={"output": pred})
    if mode == tf.estimator.ModeKeys.EVAL:
        loss = tf.losses.log_loss(labels, pred)
        metrics = {"auc": tf.metrics.auc(labels=labels, predictions=pred)}
        return tf.estimator.EstimatorSpec(mode, loss=loss, eval_metric_ops=metrics)
    if mode == tf.estimator.ModeKeys.TRAIN:
        loss = tf.losses.log_loss(labels, pred)
        global_step = tf.train.get_global_step()
        learning_rate = tf.train.exponential_decay(params["learning_rate"], global_step, 100000, 0.9, staircase=True)
        train_op = (tf.train.AdagradOptimizer(learning_rate).minimize(loss, global_step=global_step))
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
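
In a two-tower setup it is often useful to export the tower outputs themselves (e.g. to build an ANN index over item vectors) rather than only the final score. As a sketch, and not part of the original code, the PREDICT branch above could be extended like this:

# Sketch only: also expose the tower embeddings at prediction time
# (possible replacement for the PREDICT branch inside model_fn).
if mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {
        "output": pred,               # sigmoid of the dot product
        "user_embedding": user_net,   # user-tower output
        "item_embedding": item_net,   # item-tower output
    }
    return tf.estimator.EstimatorSpec(mode, predictions=predictions)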

3. Data input, data_input.py

# coding:utf-8
import tensorflow as tf
import config

FLAGS = config.FLAGS

def parse_exp(example):
    features_def = {}
    features_def["label"] = tf.io.FixedLenFeature([1], tf.int64)
    """
    数据解析逻辑
    """
    features = tf.io.parse_single_example(example, features_def)
    label = features["label"]
    return features, label


def train_input_fn(filenames=None, batch_size=128):
    with tf.gfile.Open(filenames) as f:
        filenames = f.read().split()
    files_all = []
    for f in filenames:
        files_all += tf.gfile.Glob(f)
    train_worker_num = len(FLAGS.worker_hosts.split(","))
    hash_id = FLAGS.task_index if FLAGS.job_name == "worker" else train_worker_num - 1
    files_shard = [files for i, files in enumerate(files_all) if i % train_worker_num == hash_id]
    dataset = tf.data.TFRecordDataset(files_shard)
    dataset = dataset.shuffle(batch_size*100)
    dataset = dataset.map(parse_exp, num_parallel_calls=8)
    dataset = dataset.repeat().batch(batch_size).prefetch(1)
    return dataset

def eval_input_fn(filenames=None, batch_size=128):
    with tf.gfile.Open(filenames) as f:
        filenames = f.read().split()
    files = tf.data.Dataset.list_files(filenames)
    dataset = files.apply(tf.contrib.data.parallel_interleave(
        lambda filename: tf.data.TFRecordDataset(filename),
        buffer_output_elements=batch_size * 20, cycle_length=10))
    dataset = dataset.map(parse_exp, num_parallel_calls=4)
    dataset = dataset.batch(batch_size)
    return dataset
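
The parsing logic inside parse_exp is intentionally left open above. Purely as an illustration, assuming the features defined in feature_processing.py are stored as fixed-length fields in the TFRecords, a filled-in version might look like this (shapes and dtypes are assumptions; adjust them to your actual schema):

# Sketch only: one possible parse spec matching feature_processing.py.
def parse_exp(example):
    features_def = {
        "label":       tf.io.FixedLenFeature([1], tf.int64),
        "user_vector": tf.io.FixedLenFeature([128], tf.float32),
        "item_vector": tf.io.FixedLenFeature([128], tf.float32),
        "age":         tf.io.FixedLenFeature([1], tf.int64),
        "city":        tf.io.FixedLenFeature([1], tf.int64),
        "device_id":   tf.io.FixedLenFeature([1], tf.int64),
        "item_id":     tf.io.FixedLenFeature([1], tf.int64),
    }
    features = tf.io.parse_single_example(example, features_def)
    label = features.pop("label")
    return features, label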

4. Main entry point, main.py

# encoding:utf-8
import os
import json
import tensorflow as tf
from tensorflow import feature_column as fc
from feature_processing import FeatureConfig
import base_model
import data_input
import config

FLAGS = config.FLAGS

if FLAGS.run_on_cluster:
    cluster = json.loads(os.environ["TF_CONFIG"])
    task_index = int(os.environ["TF_INDEX"])
    task_type = os.environ["TF_ROLE"]


def main(unused_argv):
    feature_configs = FeatureConfig().create_features_columns()
    classifier = tf.estimator.Estimator(model_fn=base_model.model_fn,
                                        config=tf.estimator.RunConfig(model_dir=FLAGS.model_dir,
                                                                      save_checkpoints_steps=FLAGS.save_checkpoints_steps,
                                                                      keep_checkpoint_max=3),
                                        params={"feature_configs": feature_configs,
                                                "hidden_units": list(map(int, FLAGS.hidden_units.split(","))),
                                                "learning_rate": FLAGS.learning_rate}
                                        )
    def train_eval_model():
        train_spec = tf.estimator.TrainSpec(input_fn=lambda: data_input.train_input_fn(FLAGS.train_data, FLAGS.batch_size),
                                            max_steps=FLAGS.train_steps)
        eval_spec = tf.estimator.EvalSpec(input_fn=lambda: data_input.eval_input_fn(FLAGS.eval_data, FLAGS.batch_size),
                                          start_delay_secs=60,
                                          throttle_secs = 30,
                                          steps=1000)
        tf.estimator.train_and_evaluate(classifier, train_spec, eval_spec)

    def export_model():
        feature_spec = feature_configs.feature_spec
        feature_map = {}
        for key, feature in feature_spec.items():
            if isinstance(feature, tf.io.VarLenFeature):  # variable-length feature
                feature_map[key] = tf.placeholder(dtype=feature.dtype, shape=[1], name=key)
            elif isinstance(feature, tf.io.FixedLenFeature):  # fixed-length feature
                feature_map[key] = tf.placeholder(dtype=feature.dtype, shape=[None, feature.shape[0]], name=key)
        serving_input_receiver_fn = tf.estimator.export.build_raw_serving_input_receiver_fn(feature_map)
        export_dir = classifier.export_saved_model(FLAGS.output_model, serving_input_receiver_fn)

    # Train the model
    train_eval_model()
    
    # Export the model; on a cluster only the chief needs to export
    if FLAGS.run_on_cluster: 
        if task_type == "chief":
            export_model()
    else:
        export_model()


if __name__ == "__main__":
    os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3"
    tf.logging.set_verbosity(tf.logging.INFO)
    tf.app.run(main=main)
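
Once exported, the SavedModel can be smoke-tested locally. A minimal sketch, assuming tf.contrib.predictor (available in TF 1.x); the export path and feature values are placeholders, and the input shapes follow the placeholders defined in export_model (2-D for fixed-length features, 1-D for variable-length ones), so adjust them to your serving setup:

# Sketch only: load the exported SavedModel and run one prediction.
from tensorflow.contrib import predictor

predict_fn = predictor.from_saved_model("/path/to/exported_model/<timestamp>")
result = predict_fn({
    "user_vector": [[0.1] * 128],
    "item_vector": [[0.2] * 128],
    "age": [[25]],
    "city": [3],
    "device_id": [123456],
    "item_id": [987654],
})
print(result["output"])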

5. config.py

import json, os, re, codecs
import tensorflow as tf

flags = tf.app.flags

flags.DEFINE_boolean("run_on_cluster", True, "Whether the cluster info need to be passed in as input")

flags.DEFINE_string("train_dir", "", "")
flags.DEFINE_string("data_dir", "", "")
flags.DEFINE_string("log_dir", "", "")
flags.DEFINE_string("ps_hosts", "","")
flags.DEFINE_string("worker_hosts", "","")
flags.DEFINE_string("job_name", "", "One of 'ps', 'worker'")
flags.DEFINE_integer("task_index", 0, "Index of task within the job")
flags.DEFINE_string("model_dir", "hdfs:/user/ranker/ckpt/", "Base directory for the model.")
flags.DEFINE_string("output_model", "hdfs:/user/ranker/model/", "Saved model.")
flags.DEFINE_string("train_data", "./train_data.txt", "Directory for storing mnist data")
flags.DEFINE_string("eval_data", "./eval_data.txt", "Path to the evaluation data.")
flags.DEFINE_string("hidden_units", "512,256,128", "hidden units.")
flags.DEFINE_integer("train_steps",1000000, "Number of (global) training steps to perform")
flags.DEFINE_integer("batch_size", 256, "Training batch size")
flags.DEFINE_float("learning_rate", 0.0001, "learning rate")
flags.DEFINE_integer("save_checkpoints_steps", 10000, "Save checkpoints every this many steps")

FLAGS = flags.FLAGS

6. Finally, the job-submission command. Every company has its own submission platform, so it is not included here.

Of course, this code also runs on a single machine without any problem; just set run_on_cluster to False.
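
For example, a single-machine run might be launched like this (flag values are illustrative; train_data and eval_data point to text files listing the TFRecord paths, as expected by data_input.py):

python main.py \
    --run_on_cluster=False \
    --train_data=./train_data.txt \
    --eval_data=./eval_data.txt \
    --model_dir=./ckpt \
    --output_model=./saved_model \
    --hidden_units=512,256,128 \
    --batch_size=256 \
    --learning_rate=0.0001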
