11句对匹配实战-(1)Siamese

用的是Kaggle上的比赛“Quora Question Pairs: Can you identify question pairs that have the same intent?”
评估指标：log loss
训练集：

大小：40.4万
属性：6列，分别是id, qid1, qid2, question1, question2, is_duplicate
测试集：
大小：235万
属性：3列，分别是test_id, question1, question2

这里用到的第一个方法是孪生网络 Siamese Network,白话点就是我要看看这两个句子是否一样，就将两个输入feed进两个神经网络，word embedding后，通过Loss的计算，评价两个输入的相似度。

image.png

上图可以看出，左右两边可以是同一种神经网络（如都是CNN），也可以是不同的（一个LSTM，一个CNN）；当两边结构相同时，两边共享同一套权重。关于loss，softmax是一种好的选择，但不一定是最优的。siamese网络的初衷是计算两个输入的相似度，可以简单点，直接求embedding向量的cosine值就好。

在之前word embedding中有提到，cosine值是通过计算两个向量的夹角来判断两个词的相似性，那么句子呢？段落呢？可以用exp保留两个向量的长度信息（见下图）

image.png

两边都用了LSTM，仔细看下，LSTMa一开始是不知道LSTMb的存在的，直到进行到h3(a)时，才会和LSTMb中的h4(b)进行匹配。h3(a)与h4(b)之间用曼哈顿距离来度量两个句子的空间相似度，即 $\exp(-\lVert h_3^{(a)} - h_4^{(b)} \rVert_1)$。

当两边都是LSTM时

# Build the token-embedding table and look up both input sentences in it.
with tf.name_scope('embeddings'):
    # One row per vocabulary entry, initialised from a truncated normal.
    table_shape = [self._m_config["vocab_size"], self._m_config["embedding_dim"]]
    initial_table = tf.truncated_normal(table_shape, stddev=0.1)
    self._m_token_embeddings = tf.Variable(initial_table, name="token_embeddings")

    # Both sentences read from the same embedding table.
    embedded_sent1 = tf.nn.embedding_lookup(
        self._m_token_embeddings, self._m_ph_sent1)
    embedded_sent2 = tf.nn.embedding_lookup(
        self._m_token_embeddings, self._m_ph_sent2)
# Keep the embedded first sentence reachable from the model object.
self._m_embedded_sent1 = embedded_sent1

# Encode both sentences with one LSTM and keep the final hidden state of each.
with tf.name_scope('lstm_layer'):
    # Only one cell is created: the same cell object is passed to both
    # dynamic_rnn calls below, so the two sentence encoders share weights.
    cell1 = tf.nn.rnn_cell.LSTMCell(
        self._m_config["lstm_dim"],
        state_is_tuple=True,
        reuse=tf.AUTO_REUSE
    )
    # With state_is_tuple=True the final state is a (c, h) pair; the per-step
    # outputs and c are discarded and h is kept as the sentence vector.
    # sequence_length is fed from the per-sentence size placeholders.
    _, (_, output_cell1) = tf.nn.dynamic_rnn(
        cell1, embedded_sent1, dtype=tf.float32, sequence_length=self._m_ph_sent1_size)
    _, (_, output_cell2) = tf.nn.dynamic_rnn(
        cell1, embedded_sent2, dtype=tf.float32, sequence_length=self._m_ph_sent2_size)

# Turn the two sentence vectors into matching features and project to logits.
with tf.name_scope("feature_mapping"):
    # Element-wise difference and product, concatenated with both raw vectors.
    difference = output_cell1 - output_cell2
    product = tf.multiply(output_cell1, output_cell2)
    pair_features = tf.concat(
        [difference, product, output_cell1, output_cell2], axis=1)

    # Four lstm_dim-wide pieces were concatenated above.
    weight_shape = [self._m_config["lstm_dim"] * 4, self._m_config["label_num"]]
    weights = tf.Variable(
        tf.truncated_normal(shape=weight_shape, mean=0.0, stddev=0.1))
    bias_shape = [self._m_config["label_num"]]
    bias = tf.Variable(
        tf.truncated_normal(shape=bias_shape, mean=0.0, stddev=0.1))
    self._m_logits = tf.nn.xw_plus_b(pair_features, weights, bias)

# Mean softmax cross-entropy between the logits and the label placeholder.
with tf.name_scope("loss"):
    per_example_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=self._m_logits, labels=self._m_ph_label)
    self._m_loss = tf.reduce_mean(per_example_loss)

# Fraction of examples whose arg-max logit matches the arg-max of the label.
with tf.name_scope("accuracy"):
    self._m_prediction = tf.argmax(self._m_logits, axis=1)
    gold_label = tf.argmax(self._m_ph_label, axis=1)
    is_correct = tf.equal(self._m_prediction, gold_label)
    # Cast the boolean matches to float so that their mean is the accuracy.
    self._m_accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

# Adam training op; global_step is passed to minimize() alongside the loss.
with tf.name_scope("optimizer"):
    self._m_global_step = tf.Variable(0, trainable=False, name="global_step")
    learning_rate = self._m_config["learning_rate"]
    self._m_optimizer = tf.train.AdamOptimizer(learning_rate)
    self._m_train_op = self._m_optimizer.minimize(
        self._m_loss,
        global_step=self._m_global_step,
    )

当两边都用CNN时

# Build the token-embedding table, look up both sentences, and apply dropout.
with tf.name_scope('embeddings'):
    # One row per vocabulary entry, initialised from a truncated normal.
    table_shape = [self._m_config["vocab_size"], self._m_config["embedding_dim"]]
    initial_table = tf.truncated_normal(table_shape, stddev=0.1)
    self._m_token_embeddings = tf.Variable(initial_table, name="token_embeddings")

    # Both sentences read from the same embedding table.
    embedded_sent1 = tf.nn.embedding_lookup(
        self._m_token_embeddings, self._m_ph_sent1)
    embedded_sent2 = tf.nn.embedding_lookup(
        self._m_token_embeddings, self._m_ph_sent2)

    # Dropout on the embedded inputs, controlled by the keep_prob placeholder.
    dropout_embedded_sent1 = tf.nn.dropout(
        embedded_sent1, keep_prob=self._m_ph_keep_prob)
    dropout_embedded_sent2 = tf.nn.dropout(
        embedded_sent2, keep_prob=self._m_ph_keep_prob)

# Convolutional sentence encoders: one feature tensor per input sentence.
with tf.name_scope('sentence_features'):
    # NOTE(review): for a Siamese setup both calls should share convolution
    # weights - confirm inside _build_conv_features.
    conv_sent1 = self._build_conv_features(dropout_embedded_sent1)
    conv_sent2 = self._build_conv_features(dropout_embedded_sent2)
    # Dropout on the per-sentence features is disabled here; tf.identity
    # passes them through unchanged under the dropout_* names.
    dropout_sent1_features = tf.identity(conv_sent1)
    dropout_sent2_features = tf.identity(conv_sent2)

# Turn the two sentence vectors into matching features and project to logits.
with tf.name_scope("feature_mapping"):
    # Element-wise difference and product, concatenated with both raw vectors.
    sent_diff = dropout_sent1_features - dropout_sent2_features
    sent_mul = tf.multiply(dropout_sent1_features, dropout_sent2_features)
    features = tf.concat([sent_diff, sent_mul, dropout_sent1_features, dropout_sent2_features], axis=1)
    dropout_features = tf.nn.dropout(features, keep_prob=self._m_ph_keep_prob)

    # Width of one sentence vector as configured: num_filters per filter size.
    cnn_feature_num = self._m_config["num_filters"] * len(self._m_config["filter_sizes"])
    # Four sentence-width pieces were concatenated above.
    W = tf.Variable(tf.truncated_normal(
                    shape=[cnn_feature_num * 4, self._m_config["label_num"]],
                    stddev=0.1, mean=0.0))
    b = tf.Variable(tf.truncated_normal(
                    shape=[self._m_config["label_num"]], stddev=0.1, mean=0.0))
    # Feed the dropped-out features to the output layer; projecting the raw
    # `features` instead would leave the dropout op above without any effect.
    self._m_logits = tf.nn.xw_plus_b(dropout_features, W, b)

# Mean softmax cross-entropy between the logits and the label placeholder.
with tf.name_scope("loss"):
    per_example_loss = tf.nn.softmax_cross_entropy_with_logits_v2(
        logits=self._m_logits, labels=self._m_ph_label)
    self._m_loss = tf.reduce_mean(per_example_loss)

# Fraction of examples whose arg-max logit matches the arg-max of the label.
with tf.name_scope("accuracy"):
    self._m_prediction = tf.argmax(self._m_logits, axis=1)
    gold_label = tf.argmax(self._m_ph_label, axis=1)
    is_correct = tf.equal(self._m_prediction, gold_label)
    # Cast the boolean matches to float so that their mean is the accuracy.
    self._m_accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))

# Adam training op; global_step is passed to minimize() alongside the loss.
with tf.name_scope("optimizer"):
    self._m_global_step = tf.Variable(0, trainable=False, name="global_step")
    learning_rate = self._m_config["learning_rate"]
    self._m_optimizer = tf.train.AdamOptimizer(learning_rate)
    self._m_train_op = self._m_optimizer.minimize(
        self._m_loss,
        global_step=self._m_global_step,
    )

孪生网络是先建模再匹配，LSTMa一直到h3(a)才知道有h4(b)，有没有可能句子一开始就知道另外一条句子，并进行匹配呢？下一节的MatchPyramid是先匹配再建模。

11句对匹配实战-(1)Siamese

你可能感兴趣的:(11句对匹配实战-(1)Siamese)