Deep Interest Network (DIN) Series, Part 4: Walking Through the Network-Structure Code

Full model code

The previous part focused on the data-loading code. This part walks through the model and training code. The complete model-related code is shown below:

import tensorflow as tf

from Dice import dice

class Model(object):

  def __init__(self, user_count, item_count, cate_count, cate_list, predict_batch_size, predict_ads_num):

    self.u = tf.placeholder(tf.int32, [None,]) # [B] user id
    self.i = tf.placeholder(tf.int32, [None,]) # [B] candidate (positive) item id
    self.j = tf.placeholder(tf.int32, [None,]) # [B] negative item id (used for evaluation)
    self.y = tf.placeholder(tf.float32, [None,]) # [B] click label
    self.hist_i = tf.placeholder(tf.int32, [None, None]) # [B, T] ids of previously clicked items
    self.sl = tf.placeholder(tf.int32, [None,]) # [B] number of previously clicked items
    self.lr = tf.placeholder(tf.float32, []) # learning rate

    hidden_units = 128

    user_emb_w = tf.get_variable("user_emb_w", [user_count, hidden_units]) # user embedding
    item_emb_w = tf.get_variable("item_emb_w", [item_count, hidden_units // 2]) # item embedding
    item_b = tf.get_variable("item_b", [item_count],
                             initializer=tf.constant_initializer(0.0))
    cate_emb_w = tf.get_variable("cate_emb_w", [cate_count, hidden_units // 2])
    cate_list = tf.convert_to_tensor(cate_list, dtype=tf.int64) # category id of every item

    ic = tf.gather(cate_list, self.i)
    i_emb = tf.concat(values = [
        tf.nn.embedding_lookup(item_emb_w, self.i),
        tf.nn.embedding_lookup(cate_emb_w, ic),
        ], axis=1)
        # Embedding of candidate item i concatenated with its category embedding.
        # Both embeddings have dimension hidden_units // 2, so the concatenated vector has shape [B, hidden_units].
    i_b = tf.gather(item_b, self.i)

    jc = tf.gather(cate_list, self.j)
    j_emb = tf.concat([
        tf.nn.embedding_lookup(item_emb_w, self.j),
        tf.nn.embedding_lookup(cate_emb_w, jc),
        ], axis=1)
    j_b = tf.gather(item_b, self.j)

    hc = tf.gather(cate_list, self.hist_i)
    h_emb = tf.concat([
        tf.nn.embedding_lookup(item_emb_w, self.hist_i),
        tf.nn.embedding_lookup(cate_emb_w, hc),
        ], axis=2) # embeddings of previously clicked items + their category embeddings, shape [B, T, hidden_units]

    hist_i = attention(i_emb, h_emb, self.sl)
    # Returns [B, 1, H]: one sum-pooled embedding vector per sample in the batch.
    # The embedding dimension is hidden_units.
    #-- attention end ---
    
    hist_i = tf.layers.batch_normalization(inputs = hist_i)
    hist_i = tf.reshape(hist_i, [-1, hidden_units], name='hist_bn')
    # [B, hidden_units]: one vector of dimension hidden_units per sample.
    hist_i = tf.layers.dense(hist_i, hidden_units, name='hist_fcn')
    u_emb_i = hist_i
    
    hist_j = attention(j_emb, h_emb, self.sl)
    # returns [B, 1, H], as above
    #-- attention end ---
    
    # hist_j = tf.layers.batch_normalization(inputs = hist_j)
    hist_j = tf.layers.batch_normalization(inputs = hist_j, reuse=True)
    hist_j = tf.reshape(hist_j, [-1, hidden_units], name='hist_bn')
    hist_j = tf.layers.dense(hist_j, hidden_units, name='hist_fcn', reuse=True)

    u_emb_j = hist_j
    print(u_emb_i.get_shape().as_list())
    print(u_emb_j.get_shape().as_list())
    print(i_emb.get_shape().as_list())
    print(j_emb.get_shape().as_list())
    #-- fcn begin -------
    din_i = tf.concat([u_emb_i, i_emb, u_emb_i * i_emb], axis=-1)
    din_i = tf.layers.batch_normalization(inputs=din_i, name='b1')
    d_layer_1_i = tf.layers.dense(din_i, 80, activation=tf.nn.sigmoid, name='f1')
    # If you want to try Dice, change the activation to None and add a dice layer as in the two commented lines below. You can also find model_dice.py in this folder.
    # d_layer_1_i = tf.layers.dense(din_i, 80, activation=None, name='f1')
    # d_layer_1_i = dice(d_layer_1_i, name='dice_1_i')
    d_layer_2_i = tf.layers.dense(d_layer_1_i, 40, activation=tf.nn.sigmoid, name='f2')
    # d_layer_2_i = tf.layers.dense(d_layer_1_i, 40, activation=None, name='f2')
    # d_layer_2_i = dice(d_layer_2_i, name='dice_2_i')
    d_layer_3_i = tf.layers.dense(d_layer_2_i, 1, activation=None, name='f3')
    din_j = tf.concat([u_emb_j, j_emb, u_emb_j * j_emb], axis=-1)
    din_j = tf.layers.batch_normalization(inputs=din_j, name='b1', reuse=True)
    d_layer_1_j = tf.layers.dense(din_j, 80, activation=tf.nn.sigmoid, name='f1', reuse=True)
    # d_layer_1_j = tf.layers.dense(din_j, 80, activation=None, name='f1', reuse=True)
    # d_layer_1_j = dice(d_layer_1_j, name='dice_1_j')
    d_layer_2_j = tf.layers.dense(d_layer_1_j, 40, activation=tf.nn.sigmoid, name='f2', reuse=True)
    # d_layer_2_j = tf.layers.dense(d_layer_1_j, 40, activation=None, name='f2', reuse=True)
    # d_layer_2_j = dice(d_layer_2_j, name='dice_2_j')
    d_layer_3_j = tf.layers.dense(d_layer_2_j, 1, activation=None, name='f3', reuse=True)
    d_layer_3_i = tf.reshape(d_layer_3_i, [-1])
    d_layer_3_j = tf.reshape(d_layer_3_j, [-1])
    x = i_b - j_b + d_layer_3_i - d_layer_3_j # [B]
    self.logits = i_b + d_layer_3_i
    
    # prediction for selected items
    # logits for selected item:
    item_emb_all = tf.concat([
        item_emb_w,
        tf.nn.embedding_lookup(cate_emb_w, cate_list)
        ], axis=1)
    item_emb_sub = item_emb_all[:predict_ads_num,:]
    item_emb_sub = tf.expand_dims(item_emb_sub, 0)
    item_emb_sub = tf.tile(item_emb_sub, [predict_batch_size, 1, 1])
    hist_sub =attention_multi_items(item_emb_sub, h_emb, self.sl)
    #-- attention end ---
    
    hist_sub = tf.layers.batch_normalization(inputs = hist_sub, name='hist_bn', reuse=tf.AUTO_REUSE)
    # print hist_sub.get_shape().as_list() 
    hist_sub = tf.reshape(hist_sub, [-1, hidden_units])
    hist_sub = tf.layers.dense(hist_sub, hidden_units, name='hist_fcn', reuse=tf.AUTO_REUSE)

    u_emb_sub = hist_sub
    item_emb_sub = tf.reshape(item_emb_sub, [-1, hidden_units])
    din_sub = tf.concat([u_emb_sub, item_emb_sub, u_emb_sub * item_emb_sub], axis=-1)
    din_sub = tf.layers.batch_normalization(inputs=din_sub, name='b1', reuse=True)
    d_layer_1_sub = tf.layers.dense(din_sub, 80, activation=tf.nn.sigmoid, name='f1', reuse=True)
    #d_layer_1_sub = dice(d_layer_1_sub, name='dice_1_sub')
    d_layer_2_sub = tf.layers.dense(d_layer_1_sub, 40, activation=tf.nn.sigmoid, name='f2', reuse=True)
    #d_layer_2_sub = dice(d_layer_2_sub, name='dice_2_sub')
    d_layer_3_sub = tf.layers.dense(d_layer_2_sub, 1, activation=None, name='f3', reuse=True)
    d_layer_3_sub = tf.reshape(d_layer_3_sub, [-1, predict_ads_num])
    self.logits_sub = tf.sigmoid(item_b[:predict_ads_num] + d_layer_3_sub)
    self.logits_sub = tf.reshape(self.logits_sub, [-1, predict_ads_num, 1])
    #-- fcn end -------

    
    self.mf_auc = tf.reduce_mean(tf.to_float(x > 0))
    self.score_i = tf.sigmoid(i_b + d_layer_3_i)
    self.score_j = tf.sigmoid(j_b + d_layer_3_j)
    self.score_i = tf.reshape(self.score_i, [-1, 1])
    self.score_j = tf.reshape(self.score_j, [-1, 1])
    self.p_and_n = tf.concat([self.score_i, self.score_j], axis=-1)
    print(self.p_and_n.get_shape().as_list())


    # Step variable
    self.global_step = tf.Variable(0, trainable=False, name='global_step')
    self.global_epoch_step = \
        tf.Variable(0, trainable=False, name='global_epoch_step')
    self.global_epoch_step_op = \
        tf.assign(self.global_epoch_step, self.global_epoch_step+1)

    self.loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=self.logits,
            labels=self.y)
        )

    trainable_params = tf.trainable_variables()
    self.opt = tf.train.GradientDescentOptimizer(learning_rate=self.lr)
    gradients = tf.gradients(self.loss, trainable_params)
    clip_gradients, _ = tf.clip_by_global_norm(gradients, 5)
    self.train_op = self.opt.apply_gradients(
        zip(clip_gradients, trainable_params), global_step=self.global_step)


  def train(self, sess, uij, l):
    loss, _ = sess.run([self.loss, self.train_op], feed_dict={
        self.u: uij[0],
        self.i: uij[1],
        self.y: uij[2],
        self.hist_i: uij[3],
        self.sl: uij[4],
        self.lr: l,
        })
    return loss

  def eval(self, sess, uij):
    u_auc, score_p_and_n = sess.run([self.mf_auc, self.p_and_n], feed_dict={
        self.u: uij[0],
        self.i: uij[1],
        self.j: uij[2],
        self.hist_i: uij[3],
        self.sl: uij[4],
        })
    return u_auc, score_p_and_n
  
  def test(self, sess, uij):
    return sess.run(self.logits_sub, feed_dict={
        self.u: uij[0],
        self.i: uij[1],
        self.j: uij[2],
        self.hist_i: uij[3],
        self.sl: uij[4],
        })
  

  def save(self, sess, path):
    saver = tf.train.Saver()
    saver.save(sess, save_path=path)

  def restore(self, sess, path):
    saver = tf.train.Saver()
    saver.restore(sess, save_path=path)

def extract_axis_1(data, ind):
  batch_range = tf.range(tf.shape(data)[0])
  indices = tf.stack([batch_range, ind], axis=1)
  res = tf.gather_nd(data, indices)
  return res

def attention(queries, keys, keys_length):
  '''
    queries:     [B, H]  one embedding vector per sample (the candidate item)
    keys:        [B, T, H]  embeddings of the T previously clicked items per sample
    keys_length: [B]  actual number of previously clicked items per user. Note the
                 difference from T: T is the fixed maximum length over the batch,
                 while keys_length holds the variable, true history length.
  '''
  queries_hidden_units = queries.get_shape().as_list()[-1] # embedding dimension H of the candidate item
  queries = tf.tile(queries, [1, tf.shape(keys)[1]]) # repeat each query T times, T = history length
  # queries becomes [B, T*H]
  queries = tf.reshape(queries, [-1, tf.shape(keys)[1], queries_hidden_units])
  # reshape queries to [B, T, H]: the candidate item embedding is duplicated T times per sample
  din_all = tf.concat([queries, keys, queries-keys, queries*keys], axis=-1)
  # concatenate along the last dimension, giving [B, T, 4*H]
  d_layer_1_all = tf.layers.dense(din_all, 80, activation=tf.nn.sigmoid, name='f1_att', reuse=tf.AUTO_REUSE)
  # first layer, output [B, T, 80]
  d_layer_2_all = tf.layers.dense(d_layer_1_all, 40, activation=tf.nn.sigmoid, name='f2_att', reuse=tf.AUTO_REUSE)
  # second layer, output [B, T, 40]
  d_layer_3_all = tf.layers.dense(d_layer_2_all, 1, activation=None, name='f3_att', reuse=tf.AUTO_REUSE)
  # third layer, output [B, T, 1]
  d_layer_3_all = tf.reshape(d_layer_3_all, [-1, 1, tf.shape(keys)[1]])
  outputs = d_layer_3_all
  # final shape [B, 1, T]
  # Mask
  key_masks = tf.sequence_mask(keys_length, tf.shape(keys)[1])   # [B, T]
  # boolean matrix with B*T entries: True where a clicked item exists, False at padded positions
  # e.g. tf.sequence_mask([1, 3, 2], 5) returns:
  # [[True, False, False, False, False],
  #  [True, True, True, False, False],
  #  [True, True, False, False, False]]
  key_masks = tf.expand_dims(key_masks, 1) # [B, 1, T]
  paddings = tf.ones_like(outputs) * (-2 ** 32 + 1)
  # tensor with the same shape as outputs, [B, 1, T], filled with -2**32 + 1;
  # this value is used because it maps to (almost) zero after the softmax
  outputs = tf.where(key_masks, outputs, paddings)  # [B, 1, T]
  # where key_masks is True keep the value from outputs, where it is False (padding) use -2**32 + 1
  # Scale
  outputs = outputs / (keys.get_shape().as_list()[-1] ** 0.5)
  # scale: outputs = outputs / sqrt(H)
  # Activation
  outputs = tf.nn.softmax(outputs)  # [B, 1, T]
  # softmax over the T positions: each weight measures the relevance between a history item
  # and the candidate item; more relevant items get larger weights
  # Weighted sum
  outputs = tf.matmul(outputs, keys)  # [B, 1, H]
  # sum pooling: [B, 1, T] x [B, T, H] multiplies the last two dimensions, [1, T] * [T, H];
  # each entry of the [1, T] weight vector scales the embedding of the corresponding history
  # item, so more relevant items contribute more to the pooled user-interest vector
  return outputs # returns [B, 1, H]

def attention_multi_items(queries, keys, keys_length):
  '''
    queries:     [B, N, H] N is the number of ads
    keys:        [B, T, H] 
    keys_length: [B]
  '''
  queries_hidden_units = queries.get_shape().as_list()[-1] # embedding dimension H of each candidate item
  queries_nums = queries.get_shape().as_list()[1] # number of candidate items N
  queries = tf.tile(queries, [1, 1, tf.shape(keys)[1]]) # keep dims 1 and 2, repeat dim 3 T times, T = history length
  # [B, N, T*H]
  queries = tf.reshape(queries, [-1, queries_nums, tf.shape(keys)[1], queries_hidden_units])
  # reshape to 4-D: [B, N, T, H]
  max_len = tf.shape(keys)[1]
  keys = tf.tile(keys, [1, queries_nums, 1])
  keys = tf.reshape(keys, [-1, queries_nums, max_len, queries_hidden_units])
  # shape [B, N, T, H]: each candidate item embedding is paired with every history item embedding
  din_all = tf.concat([queries, keys, queries-keys, queries*keys], axis=-1) # concatenate along the last dimension
  d_layer_1_all = tf.layers.dense(din_all, 80, activation=tf.nn.sigmoid, name='f1_att', reuse=tf.AUTO_REUSE)
  d_layer_2_all = tf.layers.dense(d_layer_1_all, 40, activation=tf.nn.sigmoid, name='f2_att', reuse=tf.AUTO_REUSE)
  d_layer_3_all = tf.layers.dense(d_layer_2_all, 1, activation=None, name='f3_att', reuse=tf.AUTO_REUSE)
  d_layer_3_all = tf.reshape(d_layer_3_all, [-1, queries_nums, 1, max_len])
  # [B,N,1,T]
  outputs = d_layer_3_all 
  # Mask
  key_masks = tf.sequence_mask(keys_length, max_len)   # [B, T]
  key_masks = tf.tile(key_masks, [1, queries_nums])
  key_masks = tf.reshape(key_masks, [-1, queries_nums, 1, max_len]) # shape : [B, N, 1, T]
  paddings = tf.ones_like(outputs) * (-2 ** 32 + 1)
  outputs = tf.where(key_masks, outputs, paddings)  # [B, N, 1, T]

  # Scale
  outputs = outputs / (keys.get_shape().as_list()[-1] ** 0.5)

  # Activation
  outputs = tf.nn.softmax(outputs)  # [B, N, 1, T]
  outputs = tf.reshape(outputs, [-1, 1, max_len])
  keys = tf.reshape(keys, [-1, max_len, queries_hidden_units])
  #print outputs.get_shape().as_list()
  #print keys.get_shape().as_list()
  # Weighted sum
  outputs = tf.matmul(outputs, keys)
  outputs = tf.reshape(outputs, [-1, queries_nums, queries_hidden_units])  # [B, N, H]
  print(outputs.get_shape().as_list())
  return outputs

The network structure is defined and initialized in the Model class constructor __init__, and training is driven by the train method of the Model class. Let's go through the code piece by piece:

Training function and inputs:

  def train(self, sess, uij, l):
    loss, _ = sess.run([self.loss, self.train_op], feed_dict={
        self.u: uij[0], # user id
        self.i: uij[1], # candidate item id
        self.y: uij[2], # click label for this item
        self.hist_i: uij[3], # list of previously clicked items
        self.sl: uij[4], # number of previously clicked items
        self.lr: l, # learning rate
        })
    return loss

Among the inputs, the user id, the candidate item id, the click label (0 or 1), and the number of previously clicked items are all 1-D vectors of size batch_size (the number of samples in a batch). The list of previously clicked items is 2-D with shape batch_size * T, where T is the maximum number of clicked items across the batch_size samples. The input placeholders correspond to the arguments fed by the training function (a concrete, made-up batch is sketched right after them):

    self.u = tf.placeholder(tf.int32, [None,]) # [B] user id
    self.i = tf.placeholder(tf.int32, [None,]) # [B] candidate (positive) item id
    self.j = tf.placeholder(tf.int32, [None,]) # [B] negative item id (used for evaluation)
    self.y = tf.placeholder(tf.float32, [None,]) # [B] click label
    self.hist_i = tf.placeholder(tf.int32, [None, None]) # [B, T] ids of previously clicked items
    self.sl = tf.placeholder(tf.int32, [None,]) # [B] number of previously clicked items
    self.lr = tf.placeholder(tf.float32, []) # learning rate
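
To make these shapes concrete, below is a minimal, hypothetical batch; the ids, labels, and the zero padding value are made up for illustration, and the real batches come from the data-loading code covered in the previous part.

import numpy as np

# Hypothetical mini-batch with B = 2 users whose click histories have lengths 3 and 1.
# hist_i is padded up to T = max(sl) = 3; the mask built from sl inside attention()
# ensures the padded positions are ignored.
u_batch    = np.array([10, 11], dtype=np.int32)        # [B] user ids
i_batch    = np.array([5, 7], dtype=np.int32)          # [B] candidate item ids
y_batch    = np.array([1.0, 0.0], dtype=np.float32)    # [B] click labels
hist_batch = np.array([[3, 8, 2],
                       [4, 0, 0]], dtype=np.int32)     # [B, T] padded click history
sl_batch   = np.array([3, 1], dtype=np.int32)          # [B] true history lengths

# loss = model.train(sess, (u_batch, i_batch, y_batch, hist_batch, sl_batch), 0.1)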

Embedding layer definition

    hidden_units = 128

    user_emb_w = tf.get_variable("user_emb_w", [user_count, hidden_units]) # user embedding
    item_emb_w = tf.get_variable("item_emb_w", [item_count, hidden_units // 2]) # item embedding
    item_b = tf.get_variable("item_b", [item_count],
                             initializer=tf.constant_initializer(0.0))
    cate_emb_w = tf.get_variable("cate_emb_w", [cate_count, hidden_units // 2])
    cate_list = tf.convert_to_tensor(cate_list, dtype=tf.int64) # category id of every item

The embedding layer defines the following embedding tables:

        the user id embedding: user_emb_w

        the item id embedding: item_emb_w

        the item category embedding: cate_emb_w

as well as:

        the per-item bias: item_b

        the category of every item (there are roughly a few hundred categories; the list is passed in at construction time and converted into a constant tensor): cate_list, a one-to-one list indexed by item id, so an item's category is obtained directly by indexing into the list (a small lookup sketch follows below)
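
As a quick illustration of the cate_list lookup (all ids below are made up), tf.gather simply maps each item id to its category id by indexing:

import tensorflow as tf

# Hypothetical mapping: item 0 -> category 2, item 1 -> category 0, item 2 -> category 1, item 3 -> category 1
cate_list = tf.convert_to_tensor([2, 0, 1, 1], dtype=tf.int64)
item_ids = tf.constant([3, 0], dtype=tf.int32)   # e.g. self.i for a batch of two samples
cate_ids = tf.gather(cate_list, item_ids)        # -> [1, 2]

with tf.Session() as sess:
    print(sess.run(cate_ids))   # [1 2]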

Concatenating the embedding features

    ic = tf.gather(cate_list, self.i)
    i_emb = tf.concat(values = [
        tf.nn.embedding_lookup(item_emb_w, self.i),
        tf.nn.embedding_lookup(cate_emb_w, ic),
        ], axis=1)
        # Embedding of candidate item i concatenated with its category embedding.
        # Both embeddings have dimension hidden_units // 2, so the concatenated vector has shape [B, hidden_units].
    i_b = tf.gather(item_b, self.i)

    jc = tf.gather(cate_list, self.j)
    j_emb = tf.concat([
        tf.nn.embedding_lookup(item_emb_w, self.j),
        tf.nn.embedding_lookup(cate_emb_w, jc),
        ], axis=1)
    j_b = tf.gather(item_b, self.j)

    hc = tf.gather(cate_list, self.hist_i)
    h_emb = tf.concat([
        tf.nn.embedding_lookup(item_emb_w, self.hist_i),
        tf.nn.embedding_lookup(cate_emb_w, hc),
        ], axis=2) # embeddings of previously clicked items + their category embeddings, shape [B, T, hidden_units]

Here i_emb is the candidate item's id embedding concatenated with its category embedding, with shape B (batch size) * H (embedding dimension, hidden_units = 128), and i_b is the corresponding item bias with shape [B]. h_emb is the concatenation of the id embeddings and category embeddings of the previously clicked items; since each sample may have several clicked items, its shape is B (batch size) * T (maximum number of clicked items across the batch) * H (embedding dimension, 128). The sketch below shows why the history embeddings come out 3-D.
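
The reason h_emb is 3-D while i_emb is 2-D is that tf.nn.embedding_lookup keeps the shape of the id tensor and appends the embedding dimension, which is also why the history embeddings are concatenated on axis=2 instead of axis=1. A small sketch with made-up sizes:

import tensorflow as tf

emb = tf.get_variable("toy_emb", [10, 4])          # toy table: 10 ids, embedding size 4
ids_1d = tf.constant([1, 2, 3])                    # like self.i, shape [B]
ids_2d = tf.constant([[1, 2], [3, 0], [4, 5]])     # like self.hist_i, shape [B, T]

print(tf.nn.embedding_lookup(emb, ids_1d).shape)   # (3, 4)    -> concatenated on axis=1
print(tf.nn.embedding_lookup(emb, ids_2d).shape)   # (3, 2, 4) -> concatenated on axis=2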

Attention layer

The call to the attention function:

    hist_i = attention(i_emb, h_emb, self.sl)
    # Returns [B, 1, H]: one sum-pooled embedding vector per sample in the batch.
    # The embedding dimension is hidden_units.

The definition of the attention function:

def attention(queries, keys, keys_length):
  '''
    queries:     [B, H]  one embedding vector per sample (the candidate item)
    keys:        [B, T, H]  embeddings of the T previously clicked items per sample
    keys_length: [B]  actual number of previously clicked items per user. Note the
                 difference from T: T is the fixed maximum length over the batch,
                 while keys_length holds the variable, true history length.
  '''
  queries_hidden_units = queries.get_shape().as_list()[-1] # embedding dimension H of the candidate item
  queries = tf.tile(queries, [1, tf.shape(keys)[1]]) # repeat each query T times, T = history length
  # queries becomes [B, T*H]
  queries = tf.reshape(queries, [-1, tf.shape(keys)[1], queries_hidden_units])
  # reshape queries to [B, T, H]: the candidate item embedding is duplicated T times per sample
  din_all = tf.concat([queries, keys, queries-keys, queries*keys], axis=-1)
  # concatenate along the last dimension, giving [B, T, 4*H]
  d_layer_1_all = tf.layers.dense(din_all, 80, activation=tf.nn.sigmoid, name='f1_att', reuse=tf.AUTO_REUSE)
  # first layer, output [B, T, 80]
  d_layer_2_all = tf.layers.dense(d_layer_1_all, 40, activation=tf.nn.sigmoid, name='f2_att', reuse=tf.AUTO_REUSE)
  # second layer, output [B, T, 40]
  d_layer_3_all = tf.layers.dense(d_layer_2_all, 1, activation=None, name='f3_att', reuse=tf.AUTO_REUSE)
  # third layer, output [B, T, 1]
  d_layer_3_all = tf.reshape(d_layer_3_all, [-1, 1, tf.shape(keys)[1]])
  outputs = d_layer_3_all
  # final shape [B, 1, T]
  # Mask
  key_masks = tf.sequence_mask(keys_length, tf.shape(keys)[1])   # [B, T]
  # boolean matrix with B*T entries: True where a clicked item exists, False at padded positions
  # e.g. tf.sequence_mask([1, 3, 2], 5) returns:
  # [[True, False, False, False, False],
  #  [True, True, True, False, False],
  #  [True, True, False, False, False]]
  key_masks = tf.expand_dims(key_masks, 1) # [B, 1, T]
  paddings = tf.ones_like(outputs) * (-2 ** 32 + 1)
  # tensor with the same shape as outputs, [B, 1, T], filled with -2**32 + 1;
  # this value is used because it maps to (almost) zero after the softmax
  outputs = tf.where(key_masks, outputs, paddings)  # [B, 1, T]
  # where key_masks is True keep the value from outputs, where it is False (padding) use -2**32 + 1
  # Scale
  outputs = outputs / (keys.get_shape().as_list()[-1] ** 0.5)
  # scale: outputs = outputs / sqrt(H)
  # Activation
  outputs = tf.nn.softmax(outputs)  # [B, 1, T]
  # softmax over the T positions: each weight measures the relevance between a history item
  # and the candidate item; more relevant items get larger weights
  # Weighted sum
  outputs = tf.matmul(outputs, keys)  # [B, 1, H]
  # sum pooling: [B, 1, T] x [B, T, H] multiplies the last two dimensions, [1, T] * [T, H];
  # each entry of the [1, T] weight vector scales the embedding of the corresponding history
  # item, so more relevant items contribute more to the pooled user-interest vector
  return outputs # returns [B, 1, H]

This is the core of the algorithm in the paper, and the code above is commented in detail. The basic idea is to use a three-layer fully connected network to learn the relevance between the candidate item embedding i_emb and the embedding of every item in the click history h_emb. The network is fed the two embeddings together with their difference and element-wise product, which strengthens the interaction features it can learn:

din_all = tf.concat([queries, keys, queries-keys, queries*keys], axis=-1)

The scores are then turned into attention weights with a softmax, and a weighted sum pooling uses these weights to pick out the previously clicked items that are most relevant to the candidate item. The masking step that makes this work with padded histories is illustrated below.
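
The key to handling padded histories is the mask: positions beyond the true history length are overwritten with a very negative number before the softmax, so their attention weights are effectively zero. A standalone sketch with made-up scores and a true history length of 2 out of T = 4:

import tensorflow as tf

scores = tf.constant([[0.5, 1.2, -0.3, 0.8]])      # [1, T] raw attention scores
key_masks = tf.sequence_mask([2], 4)               # [[True, True, False, False]]
paddings = tf.ones_like(scores) * (-2 ** 32 + 1)
masked = tf.where(key_masks, scores, paddings)
weights = tf.nn.softmax(masked)                    # padded positions get ~0 weight

with tf.Session() as sess:
    print(sess.run(weights))   # approximately [[0.33, 0.67, 0.0, 0.0]]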

Fully connected layers

    hist_i = tf.layers.batch_normalization(inputs = hist_i)
    hist_i = tf.reshape(hist_i, [-1, hidden_units], name='hist_bn')
    # [B, hidden_units]: one vector of dimension hidden_units per sample.
    hist_i = tf.layers.dense(hist_i, hidden_units, name='hist_fcn')
    u_emb_i = hist_i
 
    din_i = tf.concat([u_emb_i, i_emb, u_emb_i * i_emb], axis=-1)
    din_i = tf.layers.batch_normalization(inputs=din_i, name='b1')
    d_layer_1_i = tf.layers.dense(din_i, 80, activation=tf.nn.sigmoid, name='f1')
    # If you want to try Dice, change the activation to None and add a dice layer as in the two commented lines below. You can also find model_dice.py in this folder.
    # d_layer_1_i = tf.layers.dense(din_i, 80, activation=None, name='f1')
    # d_layer_1_i = dice(d_layer_1_i, name='dice_1_i')
    d_layer_2_i = tf.layers.dense(d_layer_1_i, 40, activation=tf.nn.sigmoid, name='f2')
    # d_layer_2_i = tf.layers.dense(d_layer_1_i, 40, activation=None, name='f2')
    # d_layer_2_i = dice(d_layer_2_i, name='dice_2_i')
    d_layer_3_i = tf.layers.dense(d_layer_2_i, 1, activation=None, name='f3')
    din_j = tf.concat([u_emb_j, j_emb, u_emb_j * j_emb], axis=-1)
    din_j = tf.layers.batch_normalization(inputs=din_j, name='b1', reuse=True)
    d_layer_1_j = tf.layers.dense(din_j, 80, activation=tf.nn.sigmoid, name='f1', reuse=True)
    # d_layer_1_j = tf.layers.dense(din_j, 80, activation=None, name='f1', reuse=True)
    # d_layer_1_j = dice(d_layer_1_j, name='dice_1_j')
    d_layer_2_j = tf.layers.dense(d_layer_1_j, 40, activation=tf.nn.sigmoid, name='f2', reuse=True)
    # d_layer_2_j = tf.layers.dense(d_layer_1_j, 40, activation=None, name='f2', reuse=True)
    # d_layer_2_j = dice(d_layer_2_j, name='dice_2_j')
    d_layer_3_j = tf.layers.dense(d_layer_2_j, 1, activation=None, name='f3', reuse=True)
    d_layer_3_i = tf.reshape(d_layer_3_i, [-1])
    d_layer_3_j = tf.reshape(d_layer_3_j, [-1])
    x = i_b - j_b + d_layer_3_i - d_layer_3_j # [B]
    self.logits = i_b + d_layer_3_i

The user-interest vector u_emb_i produced by the attention layer, the candidate item feature i_emb, and their element-wise product are concatenated and fed through the fully connected network; the item bias i_b is then added to form the final logit of the logistic output:

self.logits = i_b + d_layer_3_i

Loss function and training

The loss function is:

    self.loss = tf.reduce_mean(
        tf.nn.sigmoid_cross_entropy_with_logits(
            logits=self.logits,
            labels=self.y)
        )
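
sigmoid_cross_entropy_with_logits applies the sigmoid internally and computes the binary cross-entropy in a numerically stable way. A quick sanity-check sketch of the equivalence, using made-up numbers:

import numpy as np

def manual_loss(logit, label):
    # binary cross-entropy of the sigmoid of the logit
    p = 1.0 / (1.0 + np.exp(-logit))
    return -(label * np.log(p) + (1 - label) * np.log(1 - p))

print(manual_loss(2.0, 1.0))   # ~0.1269, the same value that
                               # tf.nn.sigmoid_cross_entropy_with_logits(labels=1.0, logits=2.0) produces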

The training ops are:

    trainable_params = tf.trainable_variables()
    self.opt = tf.train.GradientDescentOptimizer(learning_rate=self.lr)
    gradients = tf.gradients(self.loss, trainable_params)
    clip_gradients, _ = tf.clip_by_global_norm(gradients, 5) # clip the global gradient norm so a single update cannot be too large
    self.train_op = self.opt.apply_gradients( 
        zip(clip_gradients, trainable_params), global_step=self.global_step)

This is exactly what the train() method shown at the beginning of this part runs inside its sess.run() call. A minimal driver loop is sketched below.
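
To tie everything together, here is a minimal, hypothetical driver loop. DataInput and train_set refer to the data-loading code discussed in the previous part, user_count/item_count/cate_count/cate_list come from the dataset, and the batch size and learning rate are illustrative values only:

with tf.Session() as sess:
    model = Model(user_count, item_count, cate_count, cate_list,
                  predict_batch_size=32, predict_ads_num=100)
    sess.run(tf.global_variables_initializer())
    lr = 1.0
    for epoch in range(3):
        for _, uij in DataInput(train_set, 32):   # uij = (u, i, y, hist_i, sl)
            loss = model.train(sess, uij, lr)
        sess.run(model.global_epoch_step_op)      # bump the epoch counter
        print('epoch %d done, last batch loss %.4f' % (epoch, loss))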
