The code for this post has been uploaded to GitHub; feel free to follow and star!
Dataset: https://www.kaggle.com/c/porto-seguro-safe-driver-prediction/data
The task is to predict whether a driver will file an insurance claim, which makes it a binary classification problem; this experiment uses Logloss as the evaluation metric.
The data preprocessing can be found directly in the GitHub code; the main step is encoding the categorical features in FM format: each categorical feature is turned into a (feature index, feature value) pair, where the index comes from a global feature dictionary and the value is 1.0, while numeric features keep their raw values.
import numpy as np
import pandas as pd

class FeatureDictionary(object):
    def __init__(self, trainfile=None, testfile=None,
                 numeric_cols=[],
                 ignore_cols=[],
                 cate_cols=[]):
        # trainfile / testfile are expected to be pandas DataFrames here
        self.trainfile = trainfile
        self.testfile = testfile
        self.cate_cols = cate_cols
        self.numeric_cols = numeric_cols
        self.ignore_cols = ignore_cols
        self.gen_feat_dict()

    def gen_feat_dict(self):
        df = pd.concat([self.trainfile, self.testfile])
        self.feat_dict = {}
        self.feat_len = {}
        tc = 0
        for col in df.columns:
            if col in self.ignore_cols or col in self.numeric_cols:
                continue
            else:
                # assign each distinct value of a categorical column a global index
                us = df[col].unique()
                self.feat_dict[col] = dict(zip(us, range(tc, len(us) + tc)))
                tc += len(us)
        self.feat_dim = tc
class DataParser(object):
    def __init__(self, feat_dict):
        self.feat_dict = feat_dict

    def parse(self, infile=None, df=None, has_label=False):
        assert not ((infile is None) and (df is None)), "infile or df at least one is set"
        assert not ((infile is not None) and (df is not None)), "only one can be set"
        if infile is None:
            dfi = df.copy()
        else:
            dfi = pd.read_csv(infile)
        if has_label:
            y = dfi["target"].values.tolist()
            dfi.drop(["id", "target"], axis=1, inplace=True)
        else:
            ids = dfi["id"].values.tolist()
            dfi.drop(["id"], axis=1, inplace=True)
        # dfi for feature index
        # dfv for feature value which can be either binary (1/0) or float (e.g., 10.24)
        numeric_Xv = dfi[self.feat_dict.numeric_cols].values.tolist()
        dfi.drop(self.feat_dict.numeric_cols, axis=1, inplace=True)
        dfv = dfi.copy()
        for col in dfi.columns:
            if col in self.feat_dict.ignore_cols:
                dfi.drop(col, axis=1, inplace=True)
                dfv.drop(col, axis=1, inplace=True)
            else:
                # map each categorical value to its global feature index;
                # the corresponding feature value is simply 1.0
                dfi[col] = dfi[col].map(self.feat_dict.feat_dict[col])
                dfv[col] = 1.
        # list of list of feature indices of each sample in the dataset
        cate_Xi = dfi.values.tolist()
        # list of list of feature values of each sample in the dataset
        cate_Xv = dfv.values.tolist()
        if has_label:
            return cate_Xi, cate_Xv, numeric_Xv, y
        else:
            return cate_Xi, cate_Xv, numeric_Xv, ids
Next, configure the network, the data paths, and the training set information.
TRAIN_FILE = "data/train.csv"
TEST_FILE = "data/test.csv"
SUB_DIR = "output"
NUM_SPLITS = 3
RANDOM_SEED = 2017
# types of columns of the dataset dataframe
CATEGORICAL_COLS = [
'ps_ind_02_cat', 'ps_ind_04_cat', 'ps_ind_05_cat',
'ps_car_01_cat', 'ps_car_02_cat', 'ps_car_03_cat',
'ps_car_04_cat', 'ps_car_05_cat', 'ps_car_06_cat',
'ps_car_07_cat', 'ps_car_08_cat', 'ps_car_09_cat',
'ps_car_10_cat', 'ps_car_11_cat',
]
NUMERIC_COLS = [
# # binary
# "ps_ind_06_bin", "ps_ind_07_bin", "ps_ind_08_bin",
# "ps_ind_09_bin", "ps_ind_10_bin", "ps_ind_11_bin",
# "ps_ind_12_bin", "ps_ind_13_bin", "ps_ind_16_bin",
# "ps_ind_17_bin", "ps_ind_18_bin",
# "ps_calc_15_bin", "ps_calc_16_bin", "ps_calc_17_bin",
# "ps_calc_18_bin", "ps_calc_19_bin", "ps_calc_20_bin",
# numeric
"ps_reg_01", "ps_reg_02", "ps_reg_03",
"ps_car_12", "ps_car_13", "ps_car_14", "ps_car_15",
# feature engineering
"missing_feat", "ps_car_13_x_ps_reg_03",
]
IGNORE_COLS = [
"id", "target",
"ps_calc_01", "ps_calc_02", "ps_calc_03", "ps_calc_04",
"ps_calc_05", "ps_calc_06", "ps_calc_07", "ps_calc_08",
"ps_calc_09", "ps_calc_10", "ps_calc_11", "ps_calc_12",
"ps_calc_13", "ps_calc_14",
"ps_calc_15_bin", "ps_calc_16_bin", "ps_calc_17_bin",
"ps_calc_18_bin", "ps_calc_19_bin", "ps_calc_20_bin"
]
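Putting the pieces together, here is a minimal sketch of how FeatureDictionary and DataParser might be driven by this config (the variable names are illustrative, and it assumes the feature-engineering columns "missing_feat" and "ps_car_13_x_ps_reg_03" have already been added to both DataFrames; see the GitHub code for that step):
dfTrain = pd.read_csv(TRAIN_FILE)
dfTest = pd.read_csv(TEST_FILE)
# ... feature engineering that creates "missing_feat" and "ps_car_13_x_ps_reg_03" ...
fd = FeatureDictionary(trainfile=dfTrain, testfile=dfTest,
                       numeric_cols=NUMERIC_COLS,
                       ignore_cols=IGNORE_COLS,
                       cate_cols=CATEGORICAL_COLS)
parser = DataParser(feat_dict=fd)
cate_Xi, cate_Xv, numeric_Xv, y = parser.parse(df=dfTrain, has_label=True)
cate_Xi_test, cate_Xv_test, numeric_Xv_test, ids = parser.parse(df=dfTest, has_label=False)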
Model inputs:
The model input mainly consists of the following parts:
self.feat_index = tf.placeholder(tf.int32,
                                 shape=[None, None],
                                 name='feat_index')
self.feat_value = tf.placeholder(tf.float32,
                                 shape=[None, None],
                                 name='feat_value')
self.numeric_value = tf.placeholder(tf.float32, [None, None], name='num_value')
self.label = tf.placeholder(tf.float32, shape=[None, 1], name='label')
self.dropout_keep_deep = tf.placeholder(tf.float32, shape=[None], name='dropout_keep_deep')
Compared with DeepFM, an obvious change here is that the categorical and numeric features are handled separately: the numeric features are no longer converted into embeddings before being fed in. The model therefore has five inputs in total.
feat_index holds the global indices of the categorical features and is mainly used to select their embeddings via embedding_lookup; feat_value holds the corresponding categorical feature values; numeric_value holds the raw numeric features; label is the ground truth. We also define a dropout placeholder to prevent overfitting.
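For reference, a batch could be fed to these placeholders roughly like this (a sketch assuming cate_Xi, cate_Xv, numeric_Xv, and y come from DataParser.parse above, and that dropout is disabled, i.e. all keep probabilities are 1.0):
feed_dict = {self.feat_index: cate_Xi,
             self.feat_value: cate_Xv,
             self.numeric_value: numeric_Xv,
             self.label: np.reshape(y, (-1, 1)),
             self.dropout_keep_deep: [1.0] * (len(self.deep_layers) + 1)}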
Weight construction:
The weights consist of four parts: the embedding layer weights, the cross network weights, the deep network weights, and the final concatenation (projection) layer weights. We store them in a dict:
def _initialize_weights(self):
    weights = dict()
    # embeddings
    weights['feature_embeddings'] = tf.Variable(
        tf.random_normal([self.cate_feature_size, self.embedding_size], 0.0, 0.01),
        name='feature_embeddings')
    weights['feature_bias'] = tf.Variable(
        tf.random_normal([self.cate_feature_size, 1], 0.0, 1.0), name='feature_bias')

    # deep layers
    num_layer = len(self.deep_layers)
    glorot = np.sqrt(2.0 / (self.total_size + self.deep_layers[0]))
    weights['deep_layer_0'] = tf.Variable(
        np.random.normal(loc=0, scale=glorot, size=(self.total_size, self.deep_layers[0])),
        dtype=np.float32)
    weights['deep_bias_0'] = tf.Variable(
        np.random.normal(loc=0, scale=glorot, size=(1, self.deep_layers[0])),
        dtype=np.float32)
    for i in range(1, num_layer):
        glorot = np.sqrt(2.0 / (self.deep_layers[i - 1] + self.deep_layers[i]))
        weights["deep_layer_%d" % i] = tf.Variable(
            np.random.normal(loc=0, scale=glorot, size=(self.deep_layers[i - 1], self.deep_layers[i])),
            dtype=np.float32)  # layers[i-1] * layers[i]
        weights["deep_bias_%d" % i] = tf.Variable(
            np.random.normal(loc=0, scale=glorot, size=(1, self.deep_layers[i])),
            dtype=np.float32)  # 1 * layers[i]

    # cross layers: each layer has a weight vector and a bias, both of shape total_size * 1
    for i in range(self.cross_layer_num):
        weights["cross_layer_%d" % i] = tf.Variable(
            np.random.normal(loc=0, scale=glorot, size=(self.total_size, 1)),
            dtype=np.float32)
        weights["cross_bias_%d" % i] = tf.Variable(
            np.random.normal(loc=0, scale=glorot, size=(self.total_size, 1)),
            dtype=np.float32)  # total_size * 1

    # final concat projection layer
    input_size = self.total_size + self.deep_layers[-1]
    glorot = np.sqrt(2.0 / (input_size + 1))
    weights['concat_projection'] = tf.Variable(
        np.random.normal(loc=0, scale=glorot, size=(input_size, 1)), dtype=np.float32)
    weights['concat_bias'] = tf.Variable(tf.constant(0.01), dtype=np.float32)

    return weights
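To make total_size concrete: it is the length of the vector x0 that feeds both networks, i.e. field_size * embedding_size + numeric_size. With the 14 categorical columns and 9 numeric columns configured above, and a hypothetical embedding size of 8:
# hypothetical sizes for illustration
field_size = 14      # number of categorical columns
embedding_size = 8   # assumed hyperparameter
numeric_size = 9     # number of numeric columns
total_size = field_size * embedding_size + numeric_size  # 14 * 8 + 9 = 121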
Network input:
Here we compute x0, the shared input to the two parallel networks: the categorical features are converted into embeddings and concatenated with the numeric features:
# model
self.embeddings = tf.nn.embedding_lookup(self.weights['feature_embeddings'],
                                         self.feat_index)  # N * F * K
feat_value = tf.reshape(self.feat_value, shape=[-1, self.field_size, 1])
self.embeddings = tf.multiply(self.embeddings, feat_value)

# x0: numeric features concatenated with the flattened embeddings,
# shape N * total_size
self.x0 = tf.concat([self.numeric_value,
                     tf.reshape(self.embeddings, shape=[-1, self.field_size * self.embedding_size])],
                    axis=1)
Cross Network:
Following the formula in the paper, each layer computes x_{l+1} = x_0 · x_l^T · w_l + b_l + x_l, and we build up the cross network output step by step:
# cross_part
self._x0 = tf.reshape(self.x0, (-1, self.total_size, 1))
x_l = self._x0
for l in range(self.cross_layer_num):
    x_l = tf.tensordot(tf.matmul(self._x0, x_l, transpose_b=True),
                       self.weights["cross_layer_%d" % l], 1) \
          + self.weights["cross_bias_%d" % l] + x_l
self.cross_network_out = tf.reshape(x_l, (-1, self.total_size))
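As a sanity check on the cross-layer formula, here is a small NumPy sketch (toy sizes, separate from the model code) that computes one cross layer both directly, mirroring the TensorFlow code above, and in an equivalent memory-efficient form: since x_l^T · w_l is a scalar, associativity lets us avoid forming the d x d outer product:
import numpy as np

d = 4                        # stands in for total_size
x0 = np.random.randn(d, 1)   # input column vector
w = np.random.randn(d, 1)    # cross-layer weight
b = np.random.randn(d, 1)    # cross-layer bias
x_l = x0.copy()

# direct form: build the d x d matrix x0 @ x_l.T, then project it onto w
direct = (x0 @ x_l.T) @ w + b + x_l
# cheap form: x_l.T @ w is a 1 x 1 scalar, so only vectors are ever formed
cheap = x0 * (x_l.T @ w).item() + b + x_l
assert np.allclose(direct, cheap)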
Deep Network:
This part is a standard multi-layer fully connected network:
self.y_deep = tf.nn.dropout(self.x0, self.dropout_keep_deep[0])
for i in range(0, len(self.deep_layers)):
    self.y_deep = tf.add(tf.matmul(self.y_deep, self.weights["deep_layer_%d" % i]),
                         self.weights["deep_bias_%d" % i])
    self.y_deep = self.deep_layers_activation(self.y_deep)
    self.y_deep = tf.nn.dropout(self.y_deep, self.dropout_keep_deep[i + 1])
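One detail worth noting: dropout_keep_deep is indexed once before the first layer and once after every hidden layer, so it must contain len(self.deep_layers) + 1 keep probabilities, e.g. (hypothetical hyperparameters):
deep_layers = [32, 32]
dropout_deep = [0.5, 0.5, 0.5]   # len(deep_layers) + 1 entries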
Combination Layer:
Finally, the outputs of the two networks are concatenated and passed through one fully connected layer to produce the final output:
# concat_part
concat_input = tf.concat([self.cross_network_out, self.y_deep], axis=1)
self.out = tf.add(tf.matmul(concat_input,self.weights['concat_projection']),self.weights['concat_bias'])
Defining the loss:
Here we can choose either logloss or MSE, plus an L2 regularization term on the weights:
# loss
if self.loss_type == "logloss":
    self.out = tf.nn.sigmoid(self.out)
    self.loss = tf.losses.log_loss(self.label, self.out)
elif self.loss_type == "mse":
    self.loss = tf.nn.l2_loss(tf.subtract(self.label, self.out))
# l2 regularization on weights
if self.l2_reg > 0:
    self.loss += tf.contrib.layers.l2_regularizer(
        self.l2_reg)(self.weights["concat_projection"])
    for i in range(len(self.deep_layers)):
        self.loss += tf.contrib.layers.l2_regularizer(
            self.l2_reg)(self.weights["deep_layer_%d" % i])
    for i in range(self.cross_layer_num):
        self.loss += tf.contrib.layers.l2_regularizer(
            self.l2_reg)(self.weights["cross_layer_%d" % i])
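The code above only defines the loss; an optimizer still has to minimize it during training. A minimal sketch (the choice of Adam here is an assumption, not necessarily what the GitHub code uses):
# learning_rate is a model hyperparameter (assumed)
self.optimizer = tf.train.AdamOptimizer(
    learning_rate=self.learning_rate).minimize(self.loss)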
Training results:
epoch 0 loss [1.206366]
epoch 1 loss [0.80746907]
epoch 2 loss [0.6729279]
epoch 3 loss [0.6108781]
epoch 4 loss [0.5734275]
epoch 5 loss [0.5456743]
epoch 6 loss [0.52213466]
epoch 7 loss [0.50080884]
epoch 8 loss [0.48109168]
epoch 9 loss [0.46284634]
epoch 10 loss [0.44603518]
epoch 11 loss [0.43057054]
epoch 12 loss [0.41643652]
epoch 13 loss [0.4034918]
epoch 14 loss [0.39158794]
epoch 15 loss [0.38059843]
epoch 16 loss [0.37042868]
epoch 17 loss [0.3610017]
epoch 18 loss [0.35223407]
epoch 19 loss [0.34409672]
epoch 20 loss [0.33653095]
epoch 21 loss [0.32946247]
epoch 22 loss [0.32286808]
epoch 23 loss [0.31669495]
epoch 24 loss [0.31092188]
epoch 25 loss [0.30550152]
epoch 26 loss [0.3004182]
epoch 27 loss [0.2956318]
epoch 28 loss [0.29113483]
epoch 29 loss [0.28688678]