Attentional Factorization Machines: Learning the Weight of Feature Interactions via Attention Networks
I. Paper Overview
FM estimates the target by modelling all interactions between each pair of features:

$$\hat{y}_{FM}(\mathbf{x}) = w_0 + \sum_{i=1}^{n} w_i x_i + \sum_{i=1}^{n}\sum_{j=i+1}^{n} \hat{w}_{ij}\, x_i x_j$$

where $w_0$ is the global bias, $w_i$ denotes the weight of the $i$-th feature, and $\hat{w}_{ij}$ denotes the weight of the cross feature $x_i x_j$, which is factorized as $\hat{w}_{ij} = \mathbf{v}_i^{\top}\mathbf{v}_j$, where $\mathbf{v}_i \in \mathbb{R}^k$ denotes the embedding vector for feature $i$, and $k$ denotes the size of the embedding vector.
It is worth noting that FM models all feature interactions in the same way: first, a latent vector $\mathbf{v}_i$ is shared in estimating all feature interactions that the $i$-th feature involves; second, all estimated feature interactions have a uniform weight of 1. In practice, it is common that not all features are relevant to prediction, yet FM models all possible feature interactions with the same weight, which may adversely deteriorate its generalization performance.

To address this, the paper proposes the Attentional Factorization Machine (AFM), which learns the importance of each feature interaction from data via a neural attention network.
1. Model Architecture:
AFM stacks a sparse input layer, an embedding layer, a pair-wise interaction layer, an attention-based pooling layer, and a prediction layer; the linear (first-order) part of FM is kept alongside this stack.
2. Computation Flow:
2.1 Pair-wise Interaction Layer
Formally, let the set of non-zero features in the feature vector $\mathbf{x}$ be $\mathcal{X}$, and the output of the embedding layer be $\mathcal{E} = \{\mathbf{v}_i x_i\}_{i \in \mathcal{X}}$. The pair-wise interaction layer expands these embeddings into the element-wise products of all feature pairs:

$$f_{PI}(\mathcal{E}) = \{(\mathbf{v}_i \odot \mathbf{v}_j)\, x_i x_j\}_{(i,j) \in \mathcal{R}_x}$$

where $\mathcal{R}_x = \{(i, j)\}_{i \in \mathcal{X},\, j \in \mathcal{X},\, j > i}$ and $\odot$ denotes the element-wise product.
2.2 Attention-based Pooling Layer
The attention-based pooling layer performs a weighted sum over the interacted vectors:

$$f_{Att}(f_{PI}(\mathcal{E})) = \sum_{(i,j) \in \mathcal{R}_x} a_{ij}\, (\mathbf{v}_i \odot \mathbf{v}_j)\, x_i x_j$$

where $a_{ij}$ is the attention score for feature interaction $\hat{w}_{ij}$, which can be interpreted as the importance of $\hat{w}_{ij}$ in predicting the target.
Formally, the attention network is defined as:

$$a'_{ij} = \mathbf{h}^{\top}\,\mathrm{ReLU}\big(\mathbf{W}\,(\mathbf{v}_i \odot \mathbf{v}_j)\, x_i x_j + \mathbf{b}\big), \qquad a_{ij} = \frac{\exp(a'_{ij})}{\sum_{(i,j) \in \mathcal{R}_x} \exp(a'_{ij})}$$

where $\mathbf{W} \in \mathbb{R}^{t \times k}$, $\mathbf{b} \in \mathbb{R}^{t}$, $\mathbf{h} \in \mathbb{R}^{t}$ are model parameters, and $t$ denotes the hidden layer size of the attention network, which we call the attention factor.
2.3 Prediction Layer
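The prediction layer projects the attention-pooled interaction vector to a scalar score with a weight vector $\mathbf{p} \in \mathbb{R}^k$. Putting the linear part and the attention-pooled interactions together, the overall AFM model is:

$$\hat{y}_{AFM}(\mathbf{x}) = w_0 + \sum_{i=1}^{n} w_i x_i + \mathbf{p}^{\top} \sum_{i=1}^{n}\sum_{j=i+1}^{n} a_{ij}\, (\mathbf{v}_i \odot \mathbf{v}_j)\, x_i x_j$$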
3. Loss Function:
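The original paper optimizes the squared loss for its regression tasks. The implementation below instead treats the task as binary classification and minimizes the log loss (binary cross-entropy); the L2 regularizers attached to the embedding and attention parameters are added on top of it:

$$L = -\frac{1}{N}\sum_{n=1}^{N}\Big[y_n \log \hat{y}_n + (1 - y_n)\log(1 - \hat{y}_n)\Big]$$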
II. Code Implementation
1. Environment
- tensorflow 2.0
- python 3.6.8
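A matching environment can be set up with, for example:

pip install tensorflow==2.0.0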
2. Pair-wise Interaction Layer
class PairWiseLayer(tf.keras.layers.Layer):
    """
    Embedding layer plus pair-wise interaction layer.
    """
    def __init__(self, feature_size, field_size, embedding_size, l2_reg=0.01, **kwargs):
        super().__init__(**kwargs)
        self.feature_size = feature_size
        self.field_size = field_size
        self.embedding_size = embedding_size
        self.l2_reg = l2_reg

    def build(self, input_shape):
        self.embeddings = tf.keras.layers.Embedding(self.feature_size, self.embedding_size,
                                                    embeddings_regularizer=tf.keras.regularizers.l2(self.l2_reg))

    def call(self, inputs):
        feature_ids = inputs['feature_ids']        # [batch_size, field_size]
        feature_vals = inputs['feature_vals']      # [batch_size, field_size]
        embeddings = self.embeddings(feature_ids)  # [batch_size, field_size, embedding_size]
        # TODO: optimize; the nested Python loop over field pairs is inefficient.
        outputs = []
        for i in range(self.field_size):
            for j in range(i + 1, self.field_size):
                # (v_i ⊙ v_j) * x_i * x_j for every example in the batch.
                weight = tf.multiply(embeddings[:, i, :], embeddings[:, j, :])            # [batch_size, embedding_size]
                dot = tf.expand_dims(feature_vals[:, i] * feature_vals[:, j], axis=-1)    # [batch_size, 1]
                outputs.append(weight * dot)
        outputs = tf.transpose(tf.stack(outputs), perm=[1, 0, 2])  # [batch_size, interaction_dim, embedding_size]
        return outputs
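The TODO above flags the nested Python loop. As a possible optimization (a sketch, not part of the original implementation), the same pair-wise interactions can be computed with a single broadcasted multiply after gathering the upper-triangular index pairs:

import itertools
import tensorflow as tf

def pairwise_interactions(embeddings, feature_vals, field_size):
    """Vectorized (v_i * x_i) ⊙ (v_j * x_j) for all pairs i < j.

    embeddings:   [batch_size, field_size, embedding_size]
    feature_vals: [batch_size, field_size]
    returns:      [batch_size, field_size * (field_size - 1) // 2, embedding_size]
    """
    # Scale each embedding by its feature value: v_i * x_i.
    scaled = embeddings * tf.expand_dims(feature_vals, axis=-1)
    # All index pairs (i, j) with i < j.
    idx_i, idx_j = zip(*itertools.combinations(range(field_size), 2))
    # Gather both sides of every pair and take the element-wise product.
    left = tf.gather(scaled, list(idx_i), axis=1)    # [batch, num_pairs, emb]
    right = tf.gather(scaled, list(idx_j), axis=1)   # [batch, num_pairs, emb]
    return left * right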
3. Attention-based Pooling Layer
class AttentionLayer(tf.keras.layers.Layer):
    """
    Computes the attention scores a_ij with a single-hidden-layer attention network.
    """
    def __init__(self, hidden_units=64, embedding_size=32, l2_reg=0.01, **kwargs):
        super().__init__(**kwargs)
        self.hidden_units = hidden_units
        self.embedding_size = embedding_size
        self.l2_reg = l2_reg

    def build(self, input_shape):
        """
        :param input_shape: [batch_size, interaction_dim, embedding_size]
        """
        # Projection vector h; with zero initialization all attention scores start uniform.
        self.h = self.add_weight(name='h',
                                 shape=[self.hidden_units],
                                 initializer=tf.zeros_initializer())
        # The Dense layer plays the role of W and b in the attention network.
        self.dense = tf.keras.layers.Dense(units=self.hidden_units,
                                           activation='relu',
                                           use_bias=True,
                                           kernel_regularizer=tf.keras.regularizers.l2(self.l2_reg),
                                           bias_regularizer=tf.keras.regularizers.l2(self.l2_reg))

    def call(self, inputs):
        """
        :param inputs: [batch_size, interaction_dim, embedding_size]
        :return: attention scores, [batch_size, interaction_dim, 1]
        """
        inner = self.dense(inputs)                                  # [batch_size, interaction_dim, hidden_units]
        outer = tf.matmul(inner, tf.expand_dims(self.h, axis=-1))   # [batch_size, interaction_dim, 1]
        outputs = tf.nn.softmax(outer, axis=1)                      # softmax over the interaction dimension
        return outputs
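A quick shape check on random input (a throwaway sketch, assuming the layer above is in scope):

import tensorflow as tf

# 8 fields give 8 * 7 / 2 = 28 interaction vectors of embedding size 16.
pi_out = tf.random.normal([4, 28, 16])     # [batch_size, interaction_dim, embedding_size]
att = AttentionLayer(hidden_units=64, embedding_size=16)
scores = att(pi_out)                       # [4, 28, 1]
# The softmax runs over the interaction axis, so the scores sum to 1 per example.
print(scores.shape, tf.reduce_sum(scores, axis=1).numpy())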
4. Prediction Layer
class PredictLayer(tf.keras.layers.Layer):
    def __init__(self, embedding_size=32, **kwargs):
        super().__init__(**kwargs)
        self.embedding_size = embedding_size

    def build(self, input_shape):
        # Projection vector p that maps the pooled interaction vector to a scalar score.
        self.p = self.add_weight(name='p',
                                 shape=[self.embedding_size],
                                 initializer=tf.zeros_initializer())

    def call(self, pi_out, att_out):
        # Attention-weighted sum of the interaction vectors: [batch_size, embedding_size]
        inner = tf.reduce_sum(att_out * pi_out, axis=1)
        # Project to a scalar score: [batch_size, 1]
        outputs = tf.matmul(inner, tf.expand_dims(self.p, axis=-1))
        return outputs
5. Model
class AFM(tf.keras.Model):
    def __init__(self, feature_size, field_size, embedding_size=32, l2_reg=0.01, attention_units=64, **kwargs):
        super().__init__(**kwargs)
        self.pi_layer = PairWiseLayer(feature_size=feature_size, field_size=field_size,
                                      embedding_size=embedding_size, l2_reg=l2_reg)
        self.attention_layer = AttentionLayer(hidden_units=attention_units, embedding_size=embedding_size,
                                              l2_reg=l2_reg)
        # Linear layer approximating the first-order term of FM on the field values.
        self.dense = tf.keras.layers.Dense(units=1, activation=None)
        self.predict_layer = PredictLayer(embedding_size=embedding_size)

    def call(self, inputs):
        pi_out = self.pi_layer(inputs)                # [batch_size, interaction_dim, embedding_size]
        att_out = self.attention_layer(pi_out)        # [batch_size, interaction_dim, 1]
        den_out = self.dense(inputs['feature_vals'])  # [batch_size, 1]
        logits = den_out + self.predict_layer(pi_out, att_out)
        # Output probabilities so binary cross-entropy and AUC can be applied directly.
        preds = tf.nn.sigmoid(logits)
        return preds
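A minimal smoke test with made-up sizes (a sketch, assuming the classes above are importable):

import tensorflow as tf

model = AFM(feature_size=100, field_size=8, embedding_size=16, attention_units=32)
inputs = {
    'feature_ids': tf.random.uniform([4, 8], maxval=100, dtype=tf.int32),
    'feature_vals': tf.random.uniform([4, 8], dtype=tf.float32),
}
preds = model(inputs)   # [4, 1] predicted probabilities
print(preds.shape)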
6. Train
import argparse
import shutil

import tensorflow as tf

from afm import AFM
parser = argparse.ArgumentParser()
parser.add_argument('--data_dir', type=str, default=r'', help='Data dir.')
parser.add_argument('--feature_size', type=int, default=454, help='Number of features.')
parser.add_argument('--field_size', type=int, default=196, help='Number of fields.')
parser.add_argument('--embedding_size', type=int, default=16, help='Embedding size.')
parser.add_argument('--attention_units', type=int, default=64, help='Hidden units of the attention network.')
parser.add_argument('--l2_reg', type=float, default=0.01, help='L2 regularizer for trainable variables.')
parser.add_argument('--batch_size', type=int, default=64, help='Batch size.')
parser.add_argument('--num_epochs', type=int, default=1, help='Number of epochs.')
parser.add_argument('--learning_rate', type=float, default=0.01, help='Learning rate.')
parser.add_argument('--task_type', type=str, default='train', help='Task type {train, eval, export, predict}.')
parser.add_argument('--model_dir', type=str, default=r'', help='Model checkpoint dir.')
parser.add_argument('--servable_model_dir', type=str, default=r'', help='Model dir for TensorFlow Serving.')
parser.add_argument('--clear_existing_model', default=True, help='Whether to clear the old model.')
def input_fn(filename, batch_size, num_epochs=1, shuffle=False):
    print('Parsing: ', filename)

    def decode_libsvm(line):
        """Parse one line of a libsvm file."""
        columns = tf.strings.split(line, sep=' ')
        labels = tf.strings.to_number(columns[0], out_type=tf.int32)
        id_vals = tf.strings.split(columns[1:], sep=':').to_tensor()
        feature_ids, feature_vals = tf.split(id_vals, num_or_size_splits=2, axis=1)
        feature_ids = tf.squeeze(tf.strings.to_number(feature_ids, out_type=tf.int32))
        feature_vals = tf.squeeze(tf.strings.to_number(feature_vals, out_type=tf.float32))
        return {'feature_ids': feature_ids, 'feature_vals': feature_vals}, labels

    dataset = tf.data.TextLineDataset(filename).map(decode_libsvm, num_parallel_calls=10)
    if shuffle:
        dataset = dataset.shuffle(buffer_size=256)
    dataset = dataset.repeat(num_epochs)
    dataset = dataset.batch(batch_size)
    dataset = dataset.prefetch(buffer_size=tf.data.experimental.AUTOTUNE)
    return dataset
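Each line of the libsvm files carries a binary label followed by space-separated feature_id:feature_value pairs, one pair per field; since the batched tensors have a fixed shape, every line must contain exactly field_size pairs. An illustrative (made-up) example with four fields:

1 3:1.0 27:0.5 104:1.0 311:0.2
0 5:1.0 27:0.3 98:1.0 402:1.0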
def model_train(args):
    feature_size = args.feature_size
    field_size = args.field_size
    embedding_size = args.embedding_size
    attention_units = args.attention_units
    l2_reg = args.l2_reg
    batch_size = args.batch_size
    num_epochs = args.num_epochs
    learning_rate = args.learning_rate
    data_dir = args.data_dir
    model_dir = args.model_dir
    servable_model_dir = args.servable_model_dir

    if args.clear_existing_model:
        try:
            shutil.rmtree(model_dir)
        except Exception as e:
            print(e, ' at clear existing model.')
        else:
            print('Existing model cleared at ', model_dir)

    train_file = data_dir + 'tr.libsvm'
    valid_file = data_dir + 'va.libsvm'
    train_data = input_fn(train_file, batch_size=batch_size, num_epochs=num_epochs, shuffle=True)
    valid_data = input_fn(valid_file, batch_size=64, num_epochs=1, shuffle=False)

    model = AFM(feature_size, field_size, embedding_size, l2_reg, attention_units)
    optimizer = tf.keras.optimizers.SGD(learning_rate=learning_rate)
    checkpoint = tf.train.Checkpoint(model=model)

    for batch_index, (inputs, y) in enumerate(train_data):
        y = tf.reshape(y, shape=[-1, 1])  # match the [batch_size, 1] shape of the predictions
        with tf.GradientTape() as tape:
            y_pred = model(inputs, training=True)
            loss = tf.reduce_mean(tf.keras.losses.binary_crossentropy(y_true=y, y_pred=y_pred))
            # Add the L2 regularization losses collected from the embedding and attention layers.
            if model.losses:
                loss += tf.add_n(model.losses)
        grads = tape.gradient(target=loss, sources=model.trainable_variables)
        optimizer.apply_gradients(grads_and_vars=zip(grads, model.trainable_variables))
        if batch_index % 500 == 0:
            print('batch: %d, loss: %f' % (batch_index, loss))
        if batch_index % 1000 == 0:
            checkpoint.save(model_dir + 'model.ckpt')

    # export
    # tf.saved_model.save(model, export_dir=servable_model_dir, signatures={'call': model.call})

    # validation
    auc = tf.keras.metrics.AUC()
    for inputs, y in valid_data:
        y = tf.reshape(y, shape=[-1, 1])
        y_pred = model(inputs, training=False)
        auc.update_state(y_true=y, y_pred=y_pred)
    print('*' * 100)
    print('Test auc: %f\n' % auc.result())
if __name__ == '__main__':
    args = parser.parse_args()
    task_type = args.task_type
    if task_type == 'train':
        model_train(args)
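Assuming the script above is saved as train.py (the file name is an assumption), training can be launched with, for example:

python train.py --data_dir=./data/ --model_dir=./ckpt/ --task_type=train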
References
- Jun Xiao et al., Attentional Factorization Machines: Learning the Weight of Feature Interactions via Attention Networks. https://arxiv.org/abs/1708.04617