The previous section focused on the data-loading code; this section analyses the model-training code. The complete model code is listed below:
import tensorflow as tf
from Dice import dice
class Model(object):
def __init__(self, user_count, item_count, cate_count, cate_list, predict_batch_size, predict_ads_num):
self.u = tf.placeholder(tf.int32, [None,]) # [B] user id
self.i = tf.placeholder(tf.int32, [None,]) # [B] candidate (recommended) item id
self.j = tf.placeholder(tf.int32, [None,]) # [B] paired item id (used as the negative sample during evaluation)
self.y = tf.placeholder(tf.float32, [None,]) # [B] click label
self.hist_i = tf.placeholder(tf.int32, [None, None]) # [B, T] ids of previously clicked items
self.sl = tf.placeholder(tf.int32, [None,]) # [B] number of previously clicked items
self.lr = tf.placeholder(tf.float32, []) # learning rate
hidden_units = 128
user_emb_w = tf.get_variable("user_emb_w", [user_count, hidden_units]) # user embedding table
item_emb_w = tf.get_variable("item_emb_w", [item_count, hidden_units // 2]) # item embedding table
item_b = tf.get_variable("item_b", [item_count],
initializer=tf.constant_initializer(0.0))
cate_emb_w = tf.get_variable("cate_emb_w", [cate_count, hidden_units // 2]) # category embedding table
cate_list = tf.convert_to_tensor(cate_list, dtype=tf.int64) # category id of every item
ic = tf.gather(cate_list, self.i)
i_emb = tf.concat(values = [
tf.nn.embedding_lookup(item_emb_w, self.i),
tf.nn.embedding_lookup(cate_emb_w, ic),
], axis=1)
# embedding of candidate item i concatenated with its category embedding, shape [B, H]: one vector per sample in the batch.
# both embeddings have dimension hidden_units // 2, so the concatenated embedding vector has dimension hidden_units.
i_b = tf.gather(item_b, self.i)
jc = tf.gather(cate_list, self.j)
j_emb = tf.concat([
tf.nn.embedding_lookup(item_emb_w, self.j),
tf.nn.embedding_lookup(cate_emb_w, jc),
], axis=1)
j_b = tf.gather(item_b, self.j)
hc = tf.gather(cate_list, self.hist_i)
h_emb = tf.concat([
tf.nn.embedding_lookup(item_emb_w, self.hist_i),
tf.nn.embedding_lookup(cate_emb_w, hc),
], axis=2) # embeddings of previously clicked items concatenated with their category embeddings, shape [B, T, H]: BATCH_SIZE samples * T history records * one embedding vector each
hist_i = attention(i_emb, h_emb, self.sl)
# returns [B, 1, H]: one sum-pooled embedding vector for every sample in the batch.
# the embedding dimension is hidden_units.
#-- attention end ---
hist_i = tf.layers.batch_normalization(inputs = hist_i)
hist_i = tf.reshape(hist_i, [-1, hidden_units], name='hist_bn')
# [B, hidden_units]: each embedding vector has dimension hidden_units.
hist_i = tf.layers.dense(hist_i, hidden_units, name='hist_fcn')
u_emb_i = hist_i
hist_j = attention(j_emb, h_emb, self.sl)
#
#-- attention end ---
# hist_j = tf.layers.batch_normalization(inputs = hist_j)
hist_j = tf.layers.batch_normalization(inputs = hist_j, reuse=True)
hist_j = tf.reshape(hist_j, [-1, hidden_units], name='hist_bn')
hist_j = tf.layers.dense(hist_j, hidden_units, name='hist_fcn', reuse=True)
u_emb_j = hist_j
print(u_emb_i.get_shape().as_list())
print(u_emb_j.get_shape().as_list())
print(i_emb.get_shape().as_list())
print(j_emb.get_shape().as_list())
#-- fcn begin -------
din_i = tf.concat([u_emb_i, i_emb, u_emb_i * i_emb], axis=-1)
din_i = tf.layers.batch_normalization(inputs=din_i, name='b1')
d_layer_1_i = tf.layers.dense(din_i, 80, activation=tf.nn.sigmoid, name='f1')
# If you want to try Dice, change the sigmoid activation to None and add a dice layer as in the following two lines. You can also find model_dice.py in this folder.
# d_layer_1_i = tf.layers.dense(din_i, 80, activation=None, name='f1')
# d_layer_1_i = dice(d_layer_1_i, name='dice_1_i')
d_layer_2_i = tf.layers.dense(d_layer_1_i, 40, activation=tf.nn.sigmoid, name='f2')
# d_layer_2_i = tf.layers.dense(d_layer_1_i, 40, activation=None, name='f2')
# d_layer_2_i = dice(d_layer_2_i, name='dice_2_i')
d_layer_3_i = tf.layers.dense(d_layer_2_i, 1, activation=None, name='f3')
din_j = tf.concat([u_emb_j, j_emb, u_emb_j * j_emb], axis=-1)
din_j = tf.layers.batch_normalization(inputs=din_j, name='b1', reuse=True)
d_layer_1_j = tf.layers.dense(din_j, 80, activation=tf.nn.sigmoid, name='f1', reuse=True)
# d_layer_1_j = tf.layers.dense(din_j, 80, activation=None, name='f1', reuse=True)
# d_layer_1_j = dice(d_layer_1_j, name='dice_1_j')
d_layer_2_j = tf.layers.dense(d_layer_1_j, 40, activation=tf.nn.sigmoid, name='f2', reuse=True)
# d_layer_2_j = tf.layers.dense(d_layer_1_j, 40, activation=None, name='f2', reuse=True)
# d_layer_2_j = dice(d_layer_2_j, name='dice_2_j')
d_layer_3_j = tf.layers.dense(d_layer_2_j, 1, activation=None, name='f3', reuse=True)
d_layer_3_i = tf.reshape(d_layer_3_i, [-1])
d_layer_3_j = tf.reshape(d_layer_3_j, [-1])
x = i_b - j_b + d_layer_3_i - d_layer_3_j # [B]
self.logits = i_b + d_layer_3_i
# prediction for selected items
# logits for selected item:
item_emb_all = tf.concat([
item_emb_w,
tf.nn.embedding_lookup(cate_emb_w, cate_list)
], axis=1)
item_emb_sub = item_emb_all[:predict_ads_num,:]
item_emb_sub = tf.expand_dims(item_emb_sub, 0)
item_emb_sub = tf.tile(item_emb_sub, [predict_batch_size, 1, 1])
hist_sub = attention_multi_items(item_emb_sub, h_emb, self.sl)
#-- attention end ---
hist_sub = tf.layers.batch_normalization(inputs = hist_sub, name='hist_bn', reuse=tf.AUTO_REUSE)
# print hist_sub.get_shape().as_list()
hist_sub = tf.reshape(hist_sub, [-1, hidden_units])
hist_sub = tf.layers.dense(hist_sub, hidden_units, name='hist_fcn', reuse=tf.AUTO_REUSE)
u_emb_sub = hist_sub
item_emb_sub = tf.reshape(item_emb_sub, [-1, hidden_units])
din_sub = tf.concat([u_emb_sub, item_emb_sub, u_emb_sub * item_emb_sub], axis=-1)
din_sub = tf.layers.batch_normalization(inputs=din_sub, name='b1', reuse=True)
d_layer_1_sub = tf.layers.dense(din_sub, 80, activation=tf.nn.sigmoid, name='f1', reuse=True)
#d_layer_1_sub = dice(d_layer_1_sub, name='dice_1_sub')
d_layer_2_sub = tf.layers.dense(d_layer_1_sub, 40, activation=tf.nn.sigmoid, name='f2', reuse=True)
#d_layer_2_sub = dice(d_layer_2_sub, name='dice_2_sub')
d_layer_3_sub = tf.layers.dense(d_layer_2_sub, 1, activation=None, name='f3', reuse=True)
d_layer_3_sub = tf.reshape(d_layer_3_sub, [-1, predict_ads_num])
self.logits_sub = tf.sigmoid(item_b[:predict_ads_num] + d_layer_3_sub)
self.logits_sub = tf.reshape(self.logits_sub, [-1, predict_ads_num, 1])
#-- fcn end -------
self.mf_auc = tf.reduce_mean(tf.to_float(x > 0))
self.score_i = tf.sigmoid(i_b + d_layer_3_i)
self.score_j = tf.sigmoid(j_b + d_layer_3_j)
self.score_i = tf.reshape(self.score_i, [-1, 1])
self.score_j = tf.reshape(self.score_j, [-1, 1])
self.p_and_n = tf.concat([self.score_i, self.score_j], axis=-1)
print(self.p_and_n.get_shape().as_list())
# Step variable
self.global_step = tf.Variable(0, trainable=False, name='global_step')
self.global_epoch_step = \
tf.Variable(0, trainable=False, name='global_epoch_step')
self.global_epoch_step_op = \
tf.assign(self.global_epoch_step, self.global_epoch_step+1)
self.loss = tf.reduce_mean(
tf.nn.sigmoid_cross_entropy_with_logits(
logits=self.logits,
labels=self.y)
)
trainable_params = tf.trainable_variables()
self.opt = tf.train.GradientDescentOptimizer(learning_rate=self.lr)
gradients = tf.gradients(self.loss, trainable_params)
clip_gradients, _ = tf.clip_by_global_norm(gradients, 5)
self.train_op = self.opt.apply_gradients(
zip(clip_gradients, trainable_params), global_step=self.global_step)
def train(self, sess, uij, l):
loss, _ = sess.run([self.loss, self.train_op], feed_dict={
self.u: uij[0],
self.i: uij[1],
self.y: uij[2],
self.hist_i: uij[3],
self.sl: uij[4],
self.lr: l,
})
return loss
def eval(self, sess, uij):
u_auc, score_p_and_n = sess.run([self.mf_auc, self.p_and_n], feed_dict={
self.u: uij[0],
self.i: uij[1],
self.j: uij[2],
self.hist_i: uij[3],
self.sl: uij[4],
})
return u_auc, score_p_and_n
def test(self, sess, uij):
return sess.run(self.logits_sub, feed_dict={
self.u: uij[0],
self.i: uij[1],
self.j: uij[2],
self.hist_i: uij[3],
self.sl: uij[4],
})
def save(self, sess, path):
saver = tf.train.Saver()
saver.save(sess, save_path=path)
def restore(self, sess, path):
saver = tf.train.Saver()
saver.restore(sess, save_path=path)
def extract_axis_1(data, ind):
batch_range = tf.range(tf.shape(data)[0])
indices = tf.stack([batch_range, ind], axis=1)
res = tf.gather_nd(data, indices)
return res
def attention(queries, keys, keys_length):
'''
queries: [B, H] one embedding vector per sample in the batch (2-D matrix)
keys: [B, T, H] for each sample, the embeddings of the T previously visited items (3-D tensor)
keys_length: [B] number of items each user in the batch actually clicked. Note the difference from T: T is the maximum over the whole batch and therefore fixed, while keys_length varies per user and gives the real number of clicks.
'''
queries_hidden_units = queries.get_shape().as_list()[-1] # embedding dimension H of the candidate item
queries = tf.tile(queries, [1, tf.shape(keys)[1]]) # keep dim 1 of queries and repeat dim 2 T times, where T is the number of previously clicked items.
# queries now has shape [B, T*H]
queries = tf.reshape(queries, [-1, tf.shape(keys)[1], queries_hidden_units])
# reshape queries to [B, T, H]: for each candidate item, its embedding vector is repeated T times.
din_all = tf.concat([queries, keys, queries-keys, queries*keys], axis=-1)
# concatenate along the last dimension, giving [B, T, 4*H]
d_layer_1_all = tf.layers.dense(din_all, 80, activation=tf.nn.sigmoid, name='f1_att', reuse=tf.AUTO_REUSE)
# first layer, output [B, T, 80]
d_layer_2_all = tf.layers.dense(d_layer_1_all, 40, activation=tf.nn.sigmoid, name='f2_att', reuse=tf.AUTO_REUSE)
# second layer, output [B, T, 40]
d_layer_3_all = tf.layers.dense(d_layer_2_all, 1, activation=None, name='f3_att', reuse=tf.AUTO_REUSE)
# third layer, output [B, T, 1]
d_layer_3_all = tf.reshape(d_layer_3_all, [-1, 1, tf.shape(keys)[1]])
outputs = d_layer_3_all
# the final output has shape [B, 1, T]
# Mask
key_masks = tf.sequence_mask(keys_length, tf.shape(keys)[1]) # [B, T]
# boolean matrix with B * T entries: True where a previously clicked item exists, False where the position is only padding
# e.g. tf.sequence_mask([1, 3, 2], 5) returns:
# [[True, False, False, False, False],
# [True, True, True, False, False],
# [True, True, False, False, False]]
key_masks = tf.expand_dims(key_masks, 1) # [B, 1, T]
# shape becomes [B, 1, T]
paddings = tf.ones_like(outputs) * (-2 ** 32 + 1)
# a tensor with the same shape as outputs, [B, 1, T], filled with -2 ** 32 + 1; this value is used because its softmax weight is essentially 0.
outputs = tf.where(key_masks, outputs, paddings) # [B, 1, T]
# where key_masks is True, keep the value from outputs; where it is False (padding), substitute the value from paddings: -2 ** 32 + 1.
# Scale
outputs = outputs / (keys.get_shape().as_list()[-1] ** 0.5)
# scale the scores: outputs = outputs / sqrt(H)
# Activation
outputs = tf.nn.softmax(outputs) # [B, 1, T]
# softmax: the T values measure the relevance between each historical item and the candidate item; the higher the relevance, the larger the softmax output.
# Weighted sum
outputs = tf.matmul(outputs, keys) # [B, 1, H]
# sum pooling: use the attention weights to combine the history embeddings, keeping mainly the most relevant ones.
# the two matrices have shapes [B, 1, T] and [B, T, H]; effectively the last two dimensions are multiplied: [1, T] * [T, H]. Each entry of the [1, T] vector is a relevance weight,
# so the more relevant a historical item is to the current candidate item, the more its embedding contributes to the weighted sum.
return outputs # returns [B, 1, H]
def attention_multi_items(queries, keys, keys_length):
'''
queries: [B, N, H] N is the number of ads
keys: [B, T, H]
keys_length: [B]
'''
queries_hidden_units = queries.get_shape().as_list()[-1] # embedding dimension H of the candidate items
queries_nums = queries.get_shape().as_list()[1] # number of candidate items N
queries = tf.tile(queries, [1, 1, tf.shape(keys)[1]]) # keep dims 1 and 2 of queries, repeat dim 3 T times, where T is the number of previously viewed items.
# [B, N, T*H]
queries = tf.reshape(queries, [-1, queries_nums, tf.shape(keys)[1], queries_hidden_units])
# now 4-D, shape [B, N, T, H]
max_len = tf.shape(keys)[1]
keys = tf.tile(keys, [1, queries_nums, 1])
keys = tf.reshape(keys, [-1, queries_nums, max_len, queries_hidden_units])
# shape [B, N, T, H]: candidate-item embeddings and history-item embeddings are now aligned one to one
din_all = tf.concat([queries, keys, queries-keys, queries*keys], axis=-1) # concatenate along the last dimension
d_layer_1_all = tf.layers.dense(din_all, 80, activation=tf.nn.sigmoid, name='f1_att', reuse=tf.AUTO_REUSE)
d_layer_2_all = tf.layers.dense(d_layer_1_all, 40, activation=tf.nn.sigmoid, name='f2_att', reuse=tf.AUTO_REUSE)
d_layer_3_all = tf.layers.dense(d_layer_2_all, 1, activation=None, name='f3_att', reuse=tf.AUTO_REUSE)
d_layer_3_all = tf.reshape(d_layer_3_all, [-1, queries_nums, 1, max_len])
# [B,N,1,T]
outputs = d_layer_3_all
# Mask
key_masks = tf.sequence_mask(keys_length, max_len) # [B, T]
key_masks = tf.tile(key_masks, [1, queries_nums])
key_masks = tf.reshape(key_masks, [-1, queries_nums, 1, max_len]) # shape : [B, N, 1, T]
paddings = tf.ones_like(outputs) * (-2 ** 32 + 1)
outputs = tf.where(key_masks, outputs, paddings) # [B, N, 1, T]
# Scale
outputs = outputs / (keys.get_shape().as_list()[-1] ** 0.5)
# Activation
outputs = tf.nn.softmax(outputs) # [B, N, 1, T]
outputs = tf.reshape(outputs, [-1, 1, max_len])
keys = tf.reshape(keys, [-1, max_len, queries_hidden_units])
#print outputs.get_shape().as_list()
#print keys.get_shape().as_list()
# Weighted sum
outputs = tf.matmul(outputs, keys)
outputs = tf.reshape(outputs, [-1, queries_nums, queries_hidden_units]) # [B, N, H]
print(outputs.get_shape().as_list())
return outputs
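For orientation, here is a minimal driver sketch showing how this class is typically instantiated and run. It is hypothetical: the pickle layout and the predict_batch_size / predict_ads_num values are illustrative assumptions, not taken from this article.

import pickle
import tensorflow as tf

# Hypothetical driver code: load the counts and the item-to-category list
# produced by the data-loading step, then build the graph defined by Model.
with open('dataset.pkl', 'rb') as f:
    train_set = pickle.load(f)
    test_set = pickle.load(f)
    cate_list = pickle.load(f)
    user_count, item_count, cate_count = pickle.load(f)

model = Model(user_count, item_count, cate_count, cate_list,
              predict_batch_size=32, predict_ads_num=100)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # model.train(sess, uij, l) would then be called once per mini-batch.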
The network structure is defined and initialized in the Model class constructor __init__, and training is carried out by calling the class's train function. Let us go through the code piece by piece:
def train(self, sess, uij, l):
loss, _ = sess.run([self.loss, self.train_op], feed_dict={
self.u: uij[0], # user ids
self.i: uij[1], # candidate item ids
self.y: uij[2], # click labels
self.hist_i: uij[3], # lists of previously clicked items
self.sl: uij[4], # number of previously clicked items
self.lr: l, # learning rate
})
return loss
Among the inputs, the user id, the item id, the click label (0 or 1) and the number of clicked items are all 1-D vectors of size batch_size (the number of samples in a batch), while the list of previously clicked items is 2-D with shape batch_size * T, where T is the largest number of clicked items among the batch_size samples. The input tensors correspond one to one to the data fed into the train function (a toy batch illustrating these shapes follows the code excerpt below):
self.u = tf.placeholder(tf.int32, [None,]) # [B] user id
self.i = tf.placeholder(tf.int32, [None,]) # [B] candidate (recommended) item id
self.j = tf.placeholder(tf.int32, [None,]) # [B] paired item id (used as the negative sample during evaluation)
self.y = tf.placeholder(tf.float32, [None,]) # [B] click label
self.hist_i = tf.placeholder(tf.int32, [None, None]) # [B, T] ids of previously clicked items
self.sl = tf.placeholder(tf.int32, [None,]) # [B] number of previously clicked items
self.lr = tf.placeholder(tf.float32, []) # learning rate
hidden_units = 128
user_emb_w = tf.get_variable("user_emb_w", [user_count, hidden_units]) # user embedding table
item_emb_w = tf.get_variable("item_emb_w", [item_count, hidden_units // 2]) # item embedding table
item_b = tf.get_variable("item_b", [item_count],
initializer=tf.constant_initializer(0.0))
cate_emb_w = tf.get_variable("cate_emb_w", [cate_count, hidden_units // 2]) # category embedding table
cate_list = tf.convert_to_tensor(cate_list, dtype=tf.int64) # category id of every item
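To make these shapes concrete, here is a hypothetical toy batch with B = 2 samples and a maximum history length of T = 3. All ids and labels are made up; hist_i is padded with 0 up to T, and sl keeps the true lengths.

# Hypothetical toy batch (B = 2, T = 3); the values are illustrative only.
uij = (
    [10, 42],        # self.u:      user ids,             shape [B]
    [5, 77],         # self.i:      candidate item ids,   shape [B]
    [1.0, 0.0],      # self.y:      click labels,         shape [B]
    [[3, 8, 0],      # self.hist_i: clicked-item history, shape [B, T],
     [12, 4, 9]],    #              padded with 0 up to the batch maximum T
    [2, 3],          # self.sl:     true history lengths, shape [B]
)
# loss = model.train(sess, uij, l=1.0)  # l is fed into the self.lr placeholder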
The embedding layer defines the following embedding tables:
user id embedding: user_emb_w
item id embedding: item_emb_w
item category embedding: cate_emb_w
as well as:
the bias for the item id embedding: item_b
the category of every item (roughly a few hundred categories in total; the list is passed in at initialization and converted to a constant tensor): cate_list, a one-to-one list whose index corresponds to the encoded item id, so an item's category can be looked up directly by index
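In other words, cate_list is just an index-aligned lookup table: position k stores the category id of item k. A small NumPy sketch (with made-up sizes) of what tf.gather plus the two embedding_lookups compute:

import numpy as np

# Made-up sizes: 6 items, 3 categories, embedding dimension 4 (hidden_units // 2 in miniature).
cate_list = np.array([0, 2, 1, 1, 0, 2])         # cate_list[item_id] -> category_id
item_emb_w = np.random.randn(6, 4)               # item embedding table
cate_emb_w = np.random.randn(3, 4)               # category embedding table

i = np.array([5, 2])                             # a batch of two candidate item ids
ic = cate_list[i]                                # like tf.gather(cate_list, self.i) -> [2, 1]
i_emb = np.concatenate([item_emb_w[i],           # item-id part,  shape [2, 4]
                        cate_emb_w[ic]], axis=1) # category part, shape [2, 4]
print(i_emb.shape)                               # (2, 8): concatenation restores the full hidden_units width

The corresponding TensorFlow code that builds i_emb, j_emb and h_emb is: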
ic = tf.gather(cate_list, self.i)
i_emb = tf.concat(values = [
tf.nn.embedding_lookup(item_emb_w, self.i),
tf.nn.embedding_lookup(cate_emb_w, ic),
], axis=1)
# embedding of candidate item i concatenated with its category embedding, shape [B, H]: one vector per sample in the batch.
# both embeddings have dimension hidden_units // 2, so the concatenated embedding vector has dimension hidden_units.
i_b = tf.gather(item_b, self.i)
jc = tf.gather(cate_list, self.j)
j_emb = tf.concat([
tf.nn.embedding_lookup(item_emb_w, self.j),
tf.nn.embedding_lookup(cate_emb_w, jc),
], axis=1)
j_b = tf.gather(item_b, self.j)
hc = tf.gather(cate_list, self.hist_i)
h_emb = tf.concat([
tf.nn.embedding_lookup(item_emb_w, self.hist_i),
tf.nn.embedding_lookup(cate_emb_w, hc),
], axis=2) # embeddings of previously clicked items concatenated with their category embeddings, shape [B, T, H]: BATCH_SIZE samples * T history records * one embedding vector each
Here, i_emb is the concatenation of the candidate item's item-id embedding and its category embedding, and i_b is the corresponding bias; i_emb has shape B (BATCH_SIZE) * H (embedding dimension 128), while i_b has shape [B]. h_emb is the concatenation of the item-id embeddings and category embeddings of the previously clicked items; since a user may have clicked several items, its shape is B (BATCH_SIZE) * T (the largest number of clicked items among the batch_size samples) * H (embedding dimension 128).
The code that applies attention:
hist_i = attention(i_emb, h_emb, self.sl)
# returns [B, 1, H]: one sum-pooled embedding vector for every sample in the batch.
# the embedding dimension is hidden_units.
The attention function is defined as follows:
def attention(queries, keys, keys_length):
'''
queries: [B, H] one embedding vector per sample in the batch (2-D matrix)
keys: [B, T, H] for each sample, the embeddings of the T previously visited items (3-D tensor)
keys_length: [B] number of items each user in the batch actually clicked. Note the difference from T: T is the maximum over the whole batch and therefore fixed, while keys_length varies per user and gives the real number of clicks.
'''
queries_hidden_units = queries.get_shape().as_list()[-1] # embedding dimension H of the candidate item
queries = tf.tile(queries, [1, tf.shape(keys)[1]]) # keep dim 1 of queries and repeat dim 2 T times, where T is the number of previously clicked items.
# queries now has shape [B, T*H]
queries = tf.reshape(queries, [-1, tf.shape(keys)[1], queries_hidden_units])
# reshape queries to [B, T, H]: for each candidate item, its embedding vector is repeated T times.
din_all = tf.concat([queries, keys, queries-keys, queries*keys], axis=-1)
# concatenate along the last dimension, giving [B, T, 4*H]
d_layer_1_all = tf.layers.dense(din_all, 80, activation=tf.nn.sigmoid, name='f1_att', reuse=tf.AUTO_REUSE)
# first layer, output [B, T, 80]
d_layer_2_all = tf.layers.dense(d_layer_1_all, 40, activation=tf.nn.sigmoid, name='f2_att', reuse=tf.AUTO_REUSE)
# second layer, output [B, T, 40]
d_layer_3_all = tf.layers.dense(d_layer_2_all, 1, activation=None, name='f3_att', reuse=tf.AUTO_REUSE)
# third layer, output [B, T, 1]
d_layer_3_all = tf.reshape(d_layer_3_all, [-1, 1, tf.shape(keys)[1]])
outputs = d_layer_3_all
# the final output has shape [B, 1, T]
# Mask
key_masks = tf.sequence_mask(keys_length, tf.shape(keys)[1]) # [B, T]
# boolean matrix with B * T entries: True where a previously clicked item exists, False where the position is only padding
# e.g. tf.sequence_mask([1, 3, 2], 5) returns:
# [[True, False, False, False, False],
# [True, True, True, False, False],
# [True, True, False, False, False]]
key_masks = tf.expand_dims(key_masks, 1) # [B, 1, T]
# shape becomes [B, 1, T]
paddings = tf.ones_like(outputs) * (-2 ** 32 + 1)
# a tensor with the same shape as outputs, [B, 1, T], filled with -2 ** 32 + 1; this value is used because its softmax weight is essentially 0.
outputs = tf.where(key_masks, outputs, paddings) # [B, 1, T]
# where key_masks is True, keep the value from outputs; where it is False (padding), substitute the value from paddings: -2 ** 32 + 1.
# Scale
outputs = outputs / (keys.get_shape().as_list()[-1] ** 0.5)
# scale the scores: outputs = outputs / sqrt(H)
# Activation
outputs = tf.nn.softmax(outputs) # [B, 1, T]
# softmax: the T values measure the relevance between each historical item and the candidate item; the higher the relevance, the larger the softmax output.
# Weighted sum
outputs = tf.matmul(outputs, keys) # [B, 1, H]
# sum pooling: use the attention weights to combine the history embeddings, keeping mainly the most relevant ones.
# the two matrices have shapes [B, 1, T] and [B, T, H]; effectively the last two dimensions are multiplied: [1, T] * [T, H]. Each entry of the [1, T] vector is a relevance weight,
# so the more relevant a historical item is to the current candidate item, the more its embedding contributes to the weighted sum.
return outputs # returns [B, 1, H]
This is the most important part of the algorithm in the paper, and the code above is commented in detail. The basic idea is to use a three-layer fully connected network to learn the relevance between the current candidate item's embedding i_emb and the embedding of every previously clicked item in h_emb; feeding the network both embeddings together with their difference and element-wise product strengthens the feature interactions it can learn:
din_all = tf.concat([queries, keys, queries-keys, queries*keys], axis=-1)
The scores are then turned into attention weights via softmax, and sum pooling finally uses those weights to aggregate the embeddings of previously clicked items, so that the more relevant ones dominate the result.
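Stripped of the learned three-layer MLP, the masking, softmax and weighted-sum steps of attention() amount to the following NumPy sketch (the relevance scores are random stand-ins for d_layer_3_all; the shapes are illustrative):

import numpy as np

B, T, H = 2, 4, 8
keys = np.random.randn(B, T, H)                  # h_emb: embeddings of previously clicked items
scores = np.random.randn(B, 1, T)                # stand-in for the MLP output d_layer_3_all
keys_length = np.array([2, 4])                   # true history lengths (self.sl)

# boolean mask [B, 1, T], equivalent to tf.sequence_mask + expand_dims
mask = (np.arange(T)[None, :] < keys_length[:, None])[:, None, :]
scores = np.where(mask, scores, -2.0 ** 32 + 1)  # padded positions get a huge negative score
scores = scores / np.sqrt(H)                     # scale by sqrt(H)
weights = np.exp(scores) / np.exp(scores).sum(axis=-1, keepdims=True)  # softmax over T
outputs = weights @ keys                         # [B, 1, T] x [B, T, H] -> [B, 1, H]
print(outputs.shape)                             # (2, 1, 8)

Back in the model, the attention output hist_i is then batch-normalized, reshaped and passed through a linear layer before entering the fully connected network: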
hist_i = tf.layers.batch_normalization(inputs = hist_i)
hist_i = tf.reshape(hist_i, [-1, hidden_units], name='hist_bn')
# [B, hidden_units]: each embedding vector has dimension hidden_units.
hist_i = tf.layers.dense(hist_i, hidden_units, name='hist_fcn')
u_emb_i = hist_i
din_i = tf.concat([u_emb_i, i_emb, u_emb_i * i_emb], axis=-1)
din_i = tf.layers.batch_normalization(inputs=din_i, name='b1')
d_layer_1_i = tf.layers.dense(din_i, 80, activation=tf.nn.sigmoid, name='f1')
# If you want to try Dice, change the sigmoid activation to None and add a dice layer as in the following two lines. You can also find model_dice.py in this folder.
# d_layer_1_i = tf.layers.dense(din_i, 80, activation=None, name='f1')
# d_layer_1_i = dice(d_layer_1_i, name='dice_1_i')
d_layer_2_i = tf.layers.dense(d_layer_1_i, 40, activation=tf.nn.sigmoid, name='f2')
# d_layer_2_i = tf.layers.dense(d_layer_1_i, 40, activation=None, name='f2')
# d_layer_2_i = dice(d_layer_2_i, name='dice_2_i')
d_layer_3_i = tf.layers.dense(d_layer_2_i, 1, activation=None, name='f3')
din_j = tf.concat([u_emb_j, j_emb, u_emb_j * j_emb], axis=-1)
din_j = tf.layers.batch_normalization(inputs=din_j, name='b1', reuse=True)
d_layer_1_j = tf.layers.dense(din_j, 80, activation=tf.nn.sigmoid, name='f1', reuse=True)
# d_layer_1_j = tf.layers.dense(din_j, 80, activation=None, name='f1', reuse=True)
# d_layer_1_j = dice(d_layer_1_j, name='dice_1_j')
d_layer_2_j = tf.layers.dense(d_layer_1_j, 40, activation=tf.nn.sigmoid, name='f2', reuse=True)
# d_layer_2_j = tf.layers.dense(d_layer_1_j, 40, activation=None, name='f2', reuse=True)
# d_layer_2_j = dice(d_layer_2_j, name='dice_2_j')
d_layer_3_j = tf.layers.dense(d_layer_2_j, 1, activation=None, name='f3', reuse=True)
d_layer_3_i = tf.reshape(d_layer_3_i, [-1])
d_layer_3_j = tf.reshape(d_layer_3_j, [-1])
x = i_b - j_b + d_layer_3_i - d_layer_3_j # [B]
self.logits = i_b + d_layer_3_i
The user-behaviour feature u_emb_i selected by the attention layer, the candidate-item feature i_emb and their element-wise product are fed into the fully connected network; the bias i_b is then added to form the final input of the logistic decision:
self.logits = i_b + d_layer_3_i
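Note that the parallel branch built from self.j does not enter the training loss at all; as the eval() code suggests, j is the paired (negative) item, and self.mf_auc = tf.reduce_mean(tf.to_float(x > 0)) is simply the fraction of pairs in which the positive item i out-scores the negative item j. A tiny sketch with made-up scores:

import numpy as np

# Made-up per-sample scores for a batch of 3 pairs (positive item i vs. negative item j).
score_pos = np.array([2.1, 0.3, 1.5])   # i_b + d_layer_3_i
score_neg = np.array([1.0, 0.8, 0.2])   # j_b + d_layer_3_j
x = score_pos - score_neg               # same as i_b - j_b + d_layer_3_i - d_layer_3_j
mf_auc = (x > 0).astype(np.float32).mean()
print(mf_auc)                           # 0.6666667: 2 of the 3 positives are ranked above their negatives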
The final loss function is:
self.loss = tf.reduce_mean(
tf.nn.sigmoid_cross_entropy_with_logits(
logits=self.logits,
labels=self.y)
)
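tf.nn.sigmoid_cross_entropy_with_logits fuses the sigmoid and the cross-entropy into one numerically stable expression; for logits x and labels z it computes max(x, 0) - x*z + log(1 + exp(-|x|)). A quick NumPy check with made-up numbers:

import numpy as np

logits = np.array([2.0, -1.0, 0.5])   # i_b + d_layer_3_i
labels = np.array([1.0, 0.0, 1.0])    # self.y

# Stable form used by TensorFlow: max(x, 0) - x*z + log(1 + exp(-|x|))
loss_stable = np.maximum(logits, 0) - logits * labels + np.log1p(np.exp(-np.abs(logits)))
# Naive form for comparison: -z*log(sigmoid(x)) - (1-z)*log(1-sigmoid(x))
p = 1.0 / (1.0 + np.exp(-logits))
loss_naive = -labels * np.log(p) - (1 - labels) * np.log(1 - p)

print(np.allclose(loss_stable, loss_naive))   # True
print(loss_stable.mean())                     # this mean is what tf.reduce_mean(...) returns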
The training ops are defined as:
trainable_params = tf.trainable_variables()
self.opt = tf.train.GradientDescentOptimizer(learning_rate=self.lr)
gradients = tf.gradients(self.loss, trainable_params)
clip_gradients, _ = tf.clip_by_global_norm(gradients, 5) # keeps a single update from changing the weights too drastically
self.train_op = self.opt.apply_gradients(
zip(clip_gradients, trainable_params), global_step=self.global_step)
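tf.clip_by_global_norm rescales all gradients jointly: it computes the global norm sqrt(sum of ||g_i||^2) over every gradient tensor and, only if that norm exceeds the threshold (5 here), multiplies each gradient by clip_norm / global_norm. A NumPy sketch of that rule with made-up gradients:

import numpy as np

grads = [np.array([3.0, 4.0]), np.array([12.0])]   # hypothetical per-variable gradients
clip_norm = 5.0

global_norm = np.sqrt(sum(np.sum(g ** 2) for g in grads))   # sqrt(9 + 16 + 144) = 13
scale = clip_norm / max(global_norm, clip_norm)             # only ever shrinks, never amplifies
clipped = [g * scale for g in grads]
print(global_norm)                    # 13.0
print([c.tolist() for c in clipped])  # [[1.15..., 1.53...], [4.61...]] -> the joint norm is now exactly 5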
This ties back to the train() function of the model that was introduced at the beginning of this section.