Source
https://github.com/galsang/ABCNN/edit/master/ABCNN.py
#!/usr/bin/env python
# encoding: utf-8
import tensorflow as tf
import numpy as np
class ABCNN():
    def __init__(self, s, w, l2_reg, model_type, num_features, d0=300, di=50, num_classes=2, num_layers=2):
        """
        Implementation of ABCNNs
        (https://arxiv.org/pdf/1512.05193.pdf)
        :param s: sentence length
        :param w: filter width
        :param l2_reg: L2 regularization coefficient
        :param model_type: type of the network (BCNN, ABCNN1, ABCNN2, ABCNN3)
        :param num_features: the number of pre-set features (not coming from the CNN) used in the output layer
        :param d0: dimensionality of the word embeddings (default: 300)
        :param di: the number of convolution kernels (default: 50)
        :param num_classes: the number of classes for answers
        :param num_layers: the number of convolution layers
        """
        # [batch_size, d0, s]: s is the sentence length (every sentence is padded to the
        # same length) and d0 is the embedding dimensionality.
        self.x1 = tf.placeholder(tf.float32, shape=[None, d0, s], name="x1")
        # [batch_size, d0, s]: the second sentence, padded to the same length s.
        self.x2 = tf.placeholder(tf.float32, shape=[None, d0, s], name="x2")
        # [batch_size]: the labels.
        self.y = tf.placeholder(tf.int32, shape=[None], name="y")
        # [batch_size, num_features]: the pre-set (hand-crafted) features.
        self.features = tf.placeholder(tf.float32, shape=[None, num_features], name="features")
        # Zero-pad the inputs along the sentence axis for wide convolution.
        # tf.pad takes a paddings tensor with one (before, after) pair per dimension of
        # the input, so its first dimension must equal the rank of the input tensor.
        def pad_for_wide_conv(x):
            return tf.pad(x, np.array([[0, 0], [0, 0], [w - 1, w - 1], [0, 0]]), "CONSTANT", name="pad_wide_conv")
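        # With filter width w, padding w - 1 columns on each side turns a VALID
        # convolution over width s into one with s + w - 1 output columns (the
        # paper's "wide convolution").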
        def cos_sim(v1, v2):
            norm1 = tf.sqrt(tf.reduce_sum(tf.square(v1), axis=1))  # L2 norm along axis 1
            norm2 = tf.sqrt(tf.reduce_sum(tf.square(v2), axis=1))
            dot_products = tf.reduce_sum(v1 * v2, axis=1, name="cos_sim")
            return dot_products / (norm1 * norm2)
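        # v1 and v2 are [batch, dim] sentence vectors; the result is a [batch] vector
        # of cosine similarities. Note there is no epsilon guard, so an all-zero
        # vector would cause a division by zero.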
        def euclidean_score(v1, v2):
            euclidean = tf.sqrt(tf.reduce_sum(tf.square(v1 - v2), axis=1))
            return 1 / (1 + euclidean)
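        # euclidean_score maps a distance e to 1 / (1 + e) in (0, 1]; the same scoring
        # function is used below to build the attention matrix, as in the paper.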
        def make_attention_mat(x1, x2):
            # x1, x2 = [batch, height, width, 1] = [batch, d, s, 1]
            # x2 => [batch, height, 1, width]
            # [batch, width, width] = [batch, s, s]
            euclidean = tf.sqrt(tf.reduce_sum(tf.square(x1 - tf.matrix_transpose(x2)), axis=1))
            return 1 / (1 + euclidean)
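        # Broadcasting does the pairwise work above: x1 is [batch, d, s, 1] and
        # tf.matrix_transpose(x2) is [batch, d, 1, s], so the difference broadcasts to
        # [batch, d, s, s]; reducing over the feature axis leaves an attention matrix
        # whose (i, j) entry scores columns x1[:, i] and x2[:, j].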
        def convolution(name_scope, x, d, reuse):
            with tf.name_scope(name_scope + "-conv"):
                with tf.variable_scope("conv") as scope:
                    conv = tf.contrib.layers.conv2d(
                        inputs=x,
                        num_outputs=di,
                        kernel_size=(d, w),
                        stride=1,
                        padding="VALID",
                        activation_fn=tf.nn.tanh,
                        weights_initializer=tf.contrib.layers.xavier_initializer_conv2d(),
                        weights_regularizer=tf.contrib.layers.l2_regularizer(scale=l2_reg),
                        biases_initializer=tf.constant_initializer(1e-04),
                        reuse=reuse,
                        trainable=True,
                        scope=scope
                    )
                    # Weight: [filter_height, filter_width, in_channels, out_channels]
                    # output: [batch, 1, s+w-1, di] (wide convolution over the padded input)
                    # transposed to [batch, di, s+w-1, 1]
                    conv_trans = tf.transpose(conv, [0, 3, 2, 1], name="conv_trans")
                    return conv_trans
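        # reuse=False creates the convolution weights on the first ("left") call;
        # reuse=True makes the second ("right") call share them, so both sentences
        # are processed by the same set of filters.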
        def w_pool(variable_scope, x, attention):
            # x: [batch, di, s+w-1, 1]
            # attention: [batch, s+w-1]
            with tf.variable_scope(variable_scope + "-w_pool"):
                if model_type == "ABCNN2" or model_type == "ABCNN3":
                    pools = []
                    # [batch, s+w-1] => [batch, 1, s+w-1, 1]
                    attention = tf.transpose(tf.expand_dims(tf.expand_dims(attention, -1), -1), [0, 2, 1, 3])
                    for i in range(s):
                        # [batch, di, w, 1], [batch, 1, w, 1] => [batch, di, 1, 1]
                        pools.append(tf.reduce_sum(x[:, :, i:i + w, :] * attention[:, :, i:i + w, :],
                                                   axis=2,
                                                   keep_dims=True))
                    # [batch, di, s, 1]
                    w_ap = tf.concat(pools, axis=2, name="w_ap")
                else:
                    w_ap = tf.layers.average_pooling2d(
                        inputs=x,
                        # (pool_height, pool_width)
                        pool_size=(1, w),
                        strides=1,
                        padding="VALID",
                        name="w_ap"
                    )
                # [batch, di, s, 1]
                return w_ap
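        # w-ap pools over windows of width w, bringing the width back from s + w - 1
        # to s so that layers can be stacked; in ABCNN-2/3 each window is weighted by
        # the attention scores instead of being a plain average.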
        def all_pool(variable_scope, x):
            with tf.variable_scope(variable_scope + "-all_pool"):
                if variable_scope.startswith("input"):
                    pool_width = s
                    d = d0
                else:
                    pool_width = s + w - 1
                    d = di
                all_ap = tf.layers.average_pooling2d(
                    inputs=x,
                    # (pool_height, pool_width)
                    pool_size=(1, pool_width),
                    strides=1,
                    padding="VALID",
                    name="all_ap"
                )
                # [batch, di, 1, 1] => [batch, di]
                all_ap_reshaped = tf.reshape(all_ap, [-1, d])
                # all_ap_reshaped = tf.squeeze(all_ap, [2, 3])
                return all_ap_reshaped
        def CNN_layer(variable_scope, x1, x2, d):
            # x1, x2 = [batch, d, s, 1]
            with tf.variable_scope(variable_scope):
                if model_type == "ABCNN1" or model_type == "ABCNN3":
                    with tf.name_scope("att_mat"):
                        aW = tf.get_variable(name="aW",
                                             shape=(s, d),
                                             initializer=tf.contrib.layers.xavier_initializer(),
                                             regularizer=tf.contrib.layers.l2_regularizer(scale=l2_reg))
                        # [batch, s, s]
                        att_mat = make_attention_mat(x1, x2)
                        # [batch, s, s] * [s, d] => [batch, s, d]
                        # matrix transpose => [batch, d, s]
                        # expand dims => [batch, d, s, 1]
                        x1_a = tf.expand_dims(tf.matrix_transpose(tf.einsum("ijk,kl->ijl", att_mat, aW)), -1)
                        x2_a = tf.expand_dims(tf.matrix_transpose(
                            tf.einsum("ijk,kl->ijl", tf.matrix_transpose(att_mat), aW)), -1)
                        # [batch, d, s, 2]
                        x1 = tf.concat([x1, x1_a], axis=3)
                        x2 = tf.concat([x2, x2_a], axis=3)
                left_conv = convolution(name_scope="left", x=pad_for_wide_conv(x1), d=d, reuse=False)
                right_conv = convolution(name_scope="right", x=pad_for_wide_conv(x2), d=d, reuse=True)
                left_attention, right_attention = None, None
                if model_type == "ABCNN2" or model_type == "ABCNN3":
                    # [batch, s+w-1, s+w-1]
                    att_mat = make_attention_mat(left_conv, right_conv)
                    # [batch, s+w-1], [batch, s+w-1]
                    left_attention, right_attention = tf.reduce_sum(att_mat, axis=2), tf.reduce_sum(att_mat, axis=1)
                left_wp = w_pool(variable_scope="left", x=left_conv, attention=left_attention)
                left_ap = all_pool(variable_scope="left", x=left_conv)
                right_wp = w_pool(variable_scope="right", x=right_conv, attention=right_attention)
                right_ap = all_pool(variable_scope="right", x=right_conv)
                return left_wp, left_ap, right_wp, right_ap
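        # Each CNN layer returns (left w-ap, left all-ap, right w-ap, right all-ap);
        # the w-ap outputs feed the next layer, while the all-ap vectors are compared
        # via cosine similarity below.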
        x1_expanded = tf.expand_dims(self.x1, -1)
        x2_expanded = tf.expand_dims(self.x2, -1)
        LO_0 = all_pool(variable_scope="input-left", x=x1_expanded)
        RO_0 = all_pool(variable_scope="input-right", x=x2_expanded)
        LI_1, LO_1, RI_1, RO_1 = CNN_layer(variable_scope="CNN-1", x1=x1_expanded, x2=x2_expanded, d=d0)
        sims = [cos_sim(LO_0, RO_0), cos_sim(LO_1, RO_1)]
        if num_layers > 1:
            _, LO_2, _, RO_2 = CNN_layer(variable_scope="CNN-2", x1=LI_1, x2=RI_1, d=di)
            self.test = LO_2
            self.test2 = RO_2
            sims.append(cos_sim(LO_2, RO_2))
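        # sims collects one cosine-similarity score per level (raw embeddings plus each
        # convolution layer); these scores, concatenated with the pre-set features,
        # form the input to the final classifier.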
        with tf.variable_scope("output-layer"):
            self.output_features = tf.concat([self.features, tf.stack(sims, axis=1)], axis=1, name="output_features")
            self.estimation = tf.contrib.layers.fully_connected(
                inputs=self.output_features,
                num_outputs=num_classes,
                activation_fn=None,
                weights_initializer=tf.contrib.layers.xavier_initializer(),
                weights_regularizer=tf.contrib.layers.l2_regularizer(scale=l2_reg),
                biases_initializer=tf.constant_initializer(1e-04),
                scope="FC"
            )
        self.prediction = tf.contrib.layers.softmax(self.estimation)[:, 1]
        self.cost = tf.add(
            tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.estimation, labels=self.y)),
            tf.reduce_sum(tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)),
            name="cost")
        tf.summary.scalar("cost", self.cost)
        self.merged = tf.summary.merge_all()
        print("=" * 50)
        print("List of Variables:")
        for v in tf.trainable_variables():
            print(v.name)
        print("=" * 50)