用的是Kaggle上的比赛“Quora Question Pairs: Can you identify question pairs that have the same intent?”
评估指标: log loss
训练集:
- 大小:40.4万
- 属性:6列,分别是id, qid1, qid2, question1, question2, is_duplicate
测试集:
- 大小:235万
- 属性:3列,分别是test_id, question1, question2
这里用到的第一个方法是孪生网络 Siamese Network,白话点就是我要看看这两个句子是否一样,就将两个输入feed进两个神经网络,word embedding后,通过Loss的计算,评价两个输入的相似度。
在之前word embedding中有提到,cosine值是通过计算两个向量的夹角来判断两个词的相似性,那么句子呢?段落呢?可以用exp来保留两个向量的长度信息(见下图)。
两边都用了LSTM。仔细看,LSTMa一开始并不知道LSTMb的存在,直到进行到h3(a)时,才会和LSTMb中的h4(b)进行匹配。h3(a)与h4(b)之间用曼哈顿距离(L1距离)来度量两个句子的空间相似度,例如相似度可以写成 $\exp(-\lVert h_3^{(a)} - h_4^{(b)} \rVert_1)$,取值范围是(0, 1]。
当两边都是LSTM时
# Siamese LSTM graph: both sentences are embedded with the same table and
# encoded by the same LSTM cell; the two encodings are combined into one
# feature vector and classified by a single linear layer.
with tf.name_scope('embeddings'):
    # One embedding matrix of shape [vocab_size, embedding_dim], shared by
    # both sentences.
    self._m_token_embeddings = tf.Variable(
        tf.truncated_normal(
            [self._m_config["vocab_size"], self._m_config["embedding_dim"]],
            stddev=0.1
        ),
        name="token_embeddings"
    )
    embedded_sent1 = tf.nn.embedding_lookup(self._m_token_embeddings, self._m_ph_sent1)
    embedded_sent2 = tf.nn.embedding_lookup(self._m_token_embeddings, self._m_ph_sent2)
    self._m_embedded_sent1 = embedded_sent1

with tf.name_scope('lstm_layer'):
    # A single cell object is passed to both dynamic_rnn calls, so the two
    # towers share their LSTM weights.
    shared_cell = tf.nn.rnn_cell.LSTMCell(
        self._m_config["lstm_dim"],
        state_is_tuple=True,
        reuse=tf.AUTO_REUSE
    )
    # dynamic_rnn returns (outputs, final_state); only the second element of
    # the final LSTM state tuple is kept as the sentence encoding.
    _, (_, output_cell1) = tf.nn.dynamic_rnn(
        shared_cell, embedded_sent1, dtype=tf.float32,
        sequence_length=self._m_ph_sent1_size)
    _, (_, output_cell2) = tf.nn.dynamic_rnn(
        shared_cell, embedded_sent2, dtype=tf.float32,
        sequence_length=self._m_ph_sent2_size)

with tf.name_scope("feature_mapping"):
    # Matching features: element-wise difference and product of the two
    # encodings, concatenated with the encodings themselves.
    sent_diff = output_cell1 - output_cell2
    sent_mul = tf.multiply(output_cell1, output_cell2)
    features = tf.concat([sent_diff, sent_mul, output_cell1, output_cell2], axis=1)
    # Four lstm_dim-wide tensors are concatenated, hence lstm_dim * 4 inputs.
    W = tf.Variable(tf.truncated_normal(
        shape=[self._m_config["lstm_dim"] * 4, self._m_config["label_num"]],
        stddev=0.1, mean=0.0))
    b = tf.Variable(tf.truncated_normal(
        shape=[self._m_config["label_num"]], stddev=0.1, mean=0.0))
    self._m_logits = tf.nn.xw_plus_b(features, W, b)

with tf.name_scope("loss"):
    # NOTE(review): labels are presumably one-hot rows of width label_num
    # (argmax over axis 1 is taken below) -- confirm against the feed code.
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=self._m_ph_label, logits=self._m_logits)
    self._m_loss = tf.reduce_mean(cross_entropy)

with tf.name_scope("accuracy"):
    self._m_prediction = tf.argmax(self._m_logits, axis=1)
    correct = tf.equal(self._m_prediction, tf.argmax(self._m_ph_label, axis=1))
    self._m_accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

with tf.name_scope("optimizer"):
    # global_step is incremented by minimize() on every training step.
    self._m_global_step = tf.Variable(0, name="global_step", trainable=False)
    self._m_optimizer = tf.train.AdamOptimizer(self._m_config["learning_rate"])
    self._m_train_op = self._m_optimizer.minimize(
        self._m_loss, global_step=self._m_global_step)
当两边都用CNN时
# Siamese CNN graph: both sentences are embedded with the same table, passed
# through self._build_conv_features, combined into one feature vector and
# classified by a single linear layer.
with tf.name_scope('embeddings'):
    # One embedding matrix of shape [vocab_size, embedding_dim], shared by
    # both sentences.
    self._m_token_embeddings = tf.Variable(
        tf.truncated_normal(
            [self._m_config["vocab_size"], self._m_config["embedding_dim"]],
            stddev=0.1
        ),
        name="token_embeddings"
    )
    embedded_sent1 = tf.nn.embedding_lookup(self._m_token_embeddings, self._m_ph_sent1)
    embedded_sent2 = tf.nn.embedding_lookup(self._m_token_embeddings, self._m_ph_sent2)
    # Dropout on the embedded tokens; keep_prob is fed through a placeholder.
    dropout_embedded_sent1 = tf.nn.dropout(embedded_sent1, keep_prob=self._m_ph_keep_prob)
    dropout_embedded_sent2 = tf.nn.dropout(embedded_sent2, keep_prob=self._m_ph_keep_prob)

with tf.name_scope('sentence_features'):
    sent1_features = self._build_conv_features(dropout_embedded_sent1)
    sent2_features = self._build_conv_features(dropout_embedded_sent2)
    # Dropout on the per-sentence conv features is switched off: the
    # features are passed through unchanged. To re-enable it, replace
    # tf.identity with tf.nn.dropout(..., keep_prob=self._m_ph_keep_prob).
    dropout_sent1_features = tf.identity(sent1_features)
    dropout_sent2_features = tf.identity(sent2_features)

with tf.name_scope("feature_mapping"):
    # Matching features: element-wise difference and product of the two
    # sentence vectors, concatenated with the vectors themselves.
    sent_diff = dropout_sent1_features - dropout_sent2_features
    sent_mul = tf.multiply(dropout_sent1_features, dropout_sent2_features)
    features = tf.concat([sent_diff, sent_mul, dropout_sent1_features, dropout_sent2_features], axis=1)
    dropout_features = tf.nn.dropout(features, keep_prob=self._m_ph_keep_prob)
    # NOTE(review): assumes _build_conv_features returns
    # num_filters * len(filter_sizes) values per sentence -- confirm
    # against that helper.
    cnn_feature_num = self._m_config["num_filters"] * len(self._m_config["filter_sizes"])
    W = tf.Variable(tf.truncated_normal(
        shape=[cnn_feature_num * 4, self._m_config["label_num"]],
        stddev=0.1, mean=0.0))
    b = tf.Variable(tf.truncated_normal(
        shape=[self._m_config["label_num"]], stddev=0.1, mean=0.0))
    # Feed the dropped-out features to the classifier. Previously the
    # un-dropped `features` tensor was used, so the dropout op above had
    # no effect on the logits.
    self._m_logits = tf.nn.xw_plus_b(dropout_features, W, b)

with tf.name_scope("loss"):
    # NOTE(review): labels are presumably one-hot rows of width label_num
    # (argmax over axis 1 is taken below) -- confirm against the feed code.
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
        labels=self._m_ph_label, logits=self._m_logits)
    self._m_loss = tf.reduce_mean(cross_entropy)

with tf.name_scope("accuracy"):
    self._m_prediction = tf.argmax(self._m_logits, axis=1)
    correct = tf.equal(self._m_prediction, tf.argmax(self._m_ph_label, axis=1))
    self._m_accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

with tf.name_scope("optimizer"):
    # global_step is incremented by minimize() on every training step.
    self._m_global_step = tf.Variable(0, name="global_step", trainable=False)
    self._m_optimizer = tf.train.AdamOptimizer(self._m_config["learning_rate"])
    self._m_train_op = self._m_optimizer.minimize(
        self._m_loss, global_step=self._m_global_step)
孪生网络是先建模再匹配,LSTMa一直到h3(a)才知道有h4(b)。有没有可能让一个句子从一开始就知道另外一个句子,并进行匹配呢?下一节的 Match Pyramid 就是先匹配再建模。