本文的主要目的是实现韩国 Naver Corporation 和 Seoul National University 发表的一篇论文中的模型,该模型主要用于句子匹配任务。模型使用 Keras 实现,后端为 TensorFlow。论文地址:(原文此处的链接已缺失)
$$e_{p_i}^{fix} = emb_{fix}(p_i)$$
$$c_{p_i} = char_{conv}(p_i)$$
$$e_{p_i}^{tr} = emb_{train}(p_i)$$
$$p_i^{w} = [e_{p_i}^{tr}, e_{p_i}^{fix}, c_{p_i}, f_{p_i}]$$
$$x_t^{l} = [h_t^{l-1}, x_t^{l-1}]$$
$$h_t^{l} = H_l(x_t^{l}, h_{t-1}^{l})$$
$$o_{i,j} = \frac{\exp(e_{i,j})}{\sum_{k=1}^{J} \exp(e_{i,k})}$$
$$a_{p_i} = \sum_{j=1}^{J} o_{i,j} \, h_{q_j}$$
$$e_{i,j} = \cos(h_{p_i}, h_{q_j})$$
$$x_t^{l} = [h_t^{l-1}, a_t^{l-1}, x_t^{l-1}]$$
$$h_t^{l} = H_l(x_t^{l}, h_{t-1}^{l})$$
import os
import jieba
import numpy as np
import pandas as pd
import tensorflow as tf
from multiprocessing import Pool
# Width of each word vector; used as the column count of the matrix built in emd_matrix.
EMB_DIM = 300
class Similar:
    """Compute exact-match flags between two aligned matrices of token ids.

    For every row, a position is flagged with 1 when the token at that
    position in ``source`` is a real token (id != 0) that also occurs anywhere
    in the corresponding row of ``syn``. Id 0 is treated as padding and is
    never flagged.

    Args:
        source: 2-D integer array of token ids, one row per sentence.
        syn: 2-D integer array of token ids with the same number of rows.
    """

    def __init__(self, source, syn):
        self.source = source
        self.syn = syn
        # x = number of rows (sentences), y = padded sentence length.
        self.x, self.y = self.source.shape

    def get_similar_cell(self, index):
        """Return ``(index, flags)`` for one row; ``flags`` has shape (1, y)."""
        row = self.source[index]
        # The original tested ``== 0``, which could only ever flag padding
        # positions; the flag is meant for real tokens shared with ``syn``.
        matched = (row != 0) & np.isin(row, self.syn[index])
        arr = np.zeros((1, self.y))
        arr[0, matched] = 1
        return index, arr

    def get_similar(self):
        """Return a matrix shaped like ``source`` holding the flags of every row.

        Rows are processed in a ``multiprocessing.Pool``; the row index travels
        with each result so that it can be written back to the right place.
        """
        similar = np.zeros_like(self.source)
        with Pool() as pool:
            results = pool.map(self.get_similar_cell, range(self.x))
        for key, value in results:
            similar[key] = value[0]
        return similar
def get_data():
    """Read the tab-separated ``.csv`` files found in the ``log`` directory.

    Returns:
        A tuple ``(sources, synonymous, sources_char, synonymous_char, labels)``
        where the first four items are lists and ``labels`` is an ``int32``
        NumPy array.
    """
    sources = []
    synonymous = []
    sources_char = []
    synonymous_char = []
    labels = []
    base_path = 'log'
    for path in os.listdir(base_path):
        if not path.endswith('.csv'):
            continue
        # ``error_bad_lines=True`` was dropped: raising on malformed rows is
        # already the default, and the keyword no longer exists in pandas 2.x.
        dataset = pd.read_csv(os.path.join(base_path, path), sep='\t')
        for data in dataset.values:
            # Columns 0 and 1 are segmented into words with jieba.
            sou = list(jieba.cut(data[0]))
            syn = list(jieba.cut(data[1]))
            # NOTE(review): nothing is appended to the result lists here, so
            # the function returns empty containers as written. The statements
            # that fill sources/synonymous/*_char/labels appear to be missing
            # from this copy of the code -- restore them from the original.
    return sources, synonymous, sources_char, synonymous_char, np.array(labels, dtype='int32')
def tokenize(source, syn):
    """Fit one Keras tokenizer on both corpora and encode each of them.

    Args:
        source: list of texts for the first sentence of every pair.
        syn: list of texts for the second sentence of every pair.

    Returns:
        ``(tokenizer, source_tensor, syn_tensor)`` where the two tensors are
        the id sequences produced by ``texts_to_sequences``.
    """
    # filters='' disables the tokenizer's default character filtering.
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    # The vocabulary is shared: it is fitted on the concatenation of both lists.
    tokenizer.fit_on_texts(source + syn)
    source_tensor, syn_tensor = (
        tokenizer.texts_to_sequences(texts) for texts in (source, syn)
    )
    return tokenizer, source_tensor, syn_tensor
def align(tensor):
    """Pad four collections of id sequences to one common length.

    Args:
        tensor: a 4-item sequence ``(source, syn, source_char, syn_char)`` of
            lists of id sequences.

    Returns:
        The four padded arrays in the same order as the input. Padding is
        appended after each sequence (``padding='post'``).
    """
    source_seqs, syn_seqs, source_char_seqs, syn_char_seqs = tensor
    # A single target length is used for the word-level and char-level inputs.
    max_len = max_length(source_seqs + syn_seqs + source_char_seqs + syn_char_seqs)

    def pad(sequences):
        return tf.keras.preprocessing.sequence.pad_sequences(
            sequences, maxlen=max_len, padding='post')

    return pad(source_seqs), pad(syn_seqs), pad(source_char_seqs), pad(syn_char_seqs)
def emd_matrix(max_words, word_index):
    """Build an embedding matrix from the pretrained 300-d GloVe text file.

    Args:
        max_words: highest token index that should receive a vector.
        word_index: mapping from integer token index to word.

    Returns:
        A ``(max_words + 1, EMB_DIM)`` float array. Row ``i`` holds the
        pretrained vector of the word with index ``i``; rows whose word is not
        in the vector file (and row 0) stay all-zero.
    """
    embedding_index = {}
    # The vector file contains Chinese text, so read it as UTF-8 explicitly
    # instead of relying on the platform default encoding.
    with open('zhs_wiki_glove.vectors.300d.txt', 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # Skip blank lines rather than failing on values[0].
                continue
            embedding_index[values[0]] = np.asarray(values[1:], dtype='float32')
    embedding_matrix = np.zeros((max_words + 1, EMB_DIM))
    for i, word in word_index.items():
        # The matrix has rows 0..max_words, so index max_words is valid; the
        # original ``i < max_words`` left the last word without its vector.
        if i <= max_words:
            embedding_vector = embedding_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix
def max_length(tensor):
    """Return the length of the longest item in ``tensor``."""
    return max(map(len, tensor))
def main():
    """Run the whole data pipeline and return everything the model needs.

    Returns:
        A tuple ``(source_tensor, syn_tensor, sou_char_tensor, syn_char_tensor,
        emb_matrix, vocab_size, char_size, similar, label)``.
    """
    source, syn, source_char, syn_char, label = get_data()
    tokenizer, source_tensor, syn_tensor = tokenize(source, syn)
    # The character-level tokenizer is used below, so it gets a real name
    # instead of the throwaway ``_``.
    char_tokenizer, sou_char_tensor, syn_char_tensor = tokenize(source_char, syn_char)
    # The original statement was cut off after ``sou_char_tensor,``; the call
    # is completed with the fourth tensor that the unpacking on the left needs.
    source_tensor, syn_tensor, sou_char_tensor, syn_char_tensor = align(
        [source_tensor, syn_tensor, sou_char_tensor, syn_char_tensor])
    similar = Similar(source_tensor, syn_tensor).get_similar()
    vocab_size = len(tokenizer.index_word)
    char_size = len(char_tokenizer.index_word)
    emb_matrix = emd_matrix(vocab_size, tokenizer.index_word)
    return source_tensor, syn_tensor, sou_char_tensor, syn_char_tensor, emb_matrix, vocab_size, char_size, similar, label


if __name__ == '__main__':
    # NOTE(review): the guard had no body in this copy of the code; calling
    # main() is the presumed original intent -- confirm.
    main()
import os
import numpy as np
import tensorflow as tf
import keras.backend as K
from keras import Model, regularizers
from keras.layers import Bidirectional, LSTM, merge
from keras.layers import Input, Embedding, Conv1D, MaxPooling1D, Concatenate, Dense, Dropout
from .datasets import main
def expand_dims(tensor):
    """Return ``tensor`` with a new size-1 axis inserted at position 2."""
    expanded = K.expand_dims(tensor, axis=2)
    return expanded
def soft_max(x, axis=1):
    """Apply softmax to ``x``.

    Args:
        x: backend tensor with at least two dimensions.
        axis: axis to normalise over when ``x`` has more than two dimensions.
            For a 2-D tensor the backend ``K.softmax`` is called and ``axis``
            is not passed to it.

    Raises:
        ValueError: if ``x`` has fewer than two dimensions.
    """
    rank = K.ndim(x)
    if rank < 2:
        raise ValueError('Cannot apply softmax to a tensor that is 1D')
    if rank == 2:
        return K.softmax(x)
    # Subtract the per-axis maximum before exponentiating.
    exps = K.exp(x - K.max(x, axis=axis, keepdims=True))
    return exps / K.sum(exps, axis=axis, keepdims=True)
class Output(merge._Merge):
    """Merge layer that builds the interaction features of two pooled sentences.

    Takes ``[p, q]`` and, after removing axis 1 from each, returns the
    concatenation ``[p, q, p + q, p - q, |p - q|]`` along the last axis.
    """

    def _merge_function(self, inputs):
        last_source_max = inputs[0]
        # The original read inputs[0] here as well, which made q identical to
        # p and turned the difference features into zeros.
        last_syn_max = inputs[1]
        # Axis 1 is squeezed out; it must therefore have size 1.
        p_ = K.squeeze(last_source_max, axis=1)
        q_ = K.squeeze(last_syn_max, axis=1)
        output = K.concatenate([p_, q_, p_ + q_, p_ - q_, K.abs(p_ - q_)], axis=-1)
        return output

    def compute_output_shape(self, input_shape):
        # Five feature groups, each as wide as the last axis of one input.
        output_shape = input_shape[0][0], input_shape[0][2] * 5
        return output_shape
class Attention(merge._Merge):
    """Cosine-similarity attention of the first input over the second.

    For inputs ``[x, y]`` the layer computes the cosine similarity between
    every vector of ``x`` and every vector of ``y``, normalises those scores
    with a softmax over the positions of ``y`` and returns the resulting
    weighted sum of ``y`` for each position of ``x``.

    Args:
        **kwargs: forwarded unchanged to ``tf.matmul`` when the similarity
            matrix is computed (the script passes ``transpose_b=True``).
    """

    def __init__(self, **kwargs):
        # The base initializer was never called in the original, leaving the
        # layer without its Keras bookkeeping. kwargs are matmul options, not
        # layer options, so they are deliberately not forwarded to it.
        super(Attention, self).__init__()
        self.matmul = kwargs

    def _merge_function(self, inputs):
        x = inputs[0]
        y = inputs[1]
        # Normalising both operands to unit length makes the dot product a
        # true cosine. The original divided by the product of the *squared*
        # norms (no square root), which is not a cosine.
        x_unit = K.l2_normalize(x, axis=-1)
        y_unit = K.l2_normalize(y, axis=-1)
        cos = tf.matmul(x_unit, y_unit, **self.matmul)
        # Weights must sum to one over the positions of y (the last axis of
        # the score matrix); the default axis=1 normalised over x instead.
        activator = soft_max(cos, axis=-1)
        output = tf.matmul(activator, y)
        return output
class EmbConcatenate(merge._Merge):
    """Merge layer joining three 3-D feature tensors with one 2-D flag tensor.

    Expects four inputs ``[emb_train, emb_fix, char_max, similar]``. The last
    one gets a trailing axis added and everything is concatenated with the
    backend's default concatenation axis.
    """

    def build(self, input_shape):
        # Only records that no reshaping is required; nothing else is built.
        self._reshape_required = False

    def compute_output_shape(self, input_shape):
        if not isinstance(input_shape, list):
            raise TypeError
        # Every input except the last contributes its feature width; the last
        # input contributes exactly one extra feature.
        feature_width = sum(shape[2] for shape in input_shape[:-1])
        first = input_shape[0]
        return first[0], first[1], feature_width + 1

    def _merge_function(self, inputs):
        emb_train, emb_fix, char_max = inputs[0], inputs[1], inputs[2]
        similar_dim = K.expand_dims(inputs[3])
        return K.concatenate([emb_train, emb_fix, char_max, similar_dim])
# Width of the word-embedding vectors used by the Embedding layers below.
EMB_DIM = 300
# tf.enable_eager_execution()
# NOTE(review): presumably a workaround for a duplicate OpenMP runtime error -- confirm.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
# Used both as the fixed batch dimension of every Input and as the fit() batch size.
batch_size = 64
# Run the data pipeline at import time; all arrays below come from it.
source_ten, syn_ten, source_char_ten, syn_char_ten, vocab_emb_matrix, vocab_size, char_size, similar, labels = main()
# Padded sequence length, shared by all five inputs.
middle_shape = syn_ten.shape[1]
# Input layer
# NOTE(review): both shape= and batch_shape= are passed to every Input; verify
# that the Keras version in use accepts the pair.
source = Input(shape=(middle_shape,), batch_shape=(batch_size, middle_shape), name='source')
syn = Input(shape=(middle_shape,), batch_shape=(batch_size, middle_shape), name='syn')
syn_char = Input(shape=(middle_shape,), batch_shape=(batch_size, middle_shape), name='syn_char')
source_char = Input(shape=(middle_shape,), batch_shape=(batch_size, middle_shape), name='source_char')
similar_input = Input(shape=(middle_shape,), batch_shape=(batch_size, middle_shape), name='similar')
# Embedding layer
# Token ids run from 1 to vocab_size / char_size (0 is padding), and the
# pretrained matrix has vocab_size + 1 rows, so every Embedding needs an input
# dimension of size + 1. The original used the bare sizes, which does not
# match the weight matrix and leaves the highest id out of range.
# Source branch: frozen pretrained vectors, a trainable embedding, and a
# character-level Conv1D + max-pooling feature, joined with the match flags.
source_emb_train = Embedding(vocab_size + 1, EMB_DIM, weights=[vocab_emb_matrix], trainable=False,
                             name='sou_emb_1')(source)
source_emb_fix = Embedding(vocab_size + 1, EMB_DIM, name='sou_emb_2')(source)
source_char_emb = Embedding(char_size + 1, 16, name='sou_emb_3')(source_char)
# NOTE(review): the number of convolution filters is batch_size (64); this
# looks coincidental rather than intended -- confirm.
source_char_cnn = Conv1D(batch_size, 7, activation='relu', padding='same', name='sou_con')(source_char_emb)
source_char_max = MaxPooling1D(data_format='channels_first', name='sou_max')(source_char_cnn)
source_cont = EmbConcatenate()([source_emb_train, source_emb_fix, source_char_max, similar_input])
source_emb_out = Dropout(0.5, name='sou_dro_1')(source_cont)
# Syn branch: same structure with its own, separately named layers.
syn_emb_train = Embedding(vocab_size + 1, EMB_DIM, weights=[vocab_emb_matrix], trainable=False,
                          name='syn_emb_1')(syn)
syn_emb_fix = Embedding(vocab_size + 1, EMB_DIM, name='syn_emb_2')(syn)
syn_char_emb = Embedding(char_size + 1, 16, name='syn_emb_3')(syn_char)
syn_char_cnn = Conv1D(batch_size, 7, activation='relu', padding='same', name='syn_con')(syn_char_emb)
syn_char_max = MaxPooling1D(data_format='channels_first', name='syn_max')(syn_char_cnn)
syn_cont = EmbConcatenate()([syn_emb_train, syn_emb_fix, syn_char_max, similar_input])
syn_emb_out = Dropout(0.5, name='syn_dro_1')(syn_cont)
# First layer
# Each layer runs a bidirectional LSTM over both branches, computes attention
# between the two LSTM outputs, and concatenates the layer input, the attention
# output and the LSTM output along the last axis. The same attention tensor is
# concatenated into both the source and the syn branch.
# NOTE(review): in every layer below, the two Bidirectional(...) statements end
# with a trailing comma and an unclosed parenthesis in this copy -- the rest of
# each call (presumably further arguments and the input tensor) is missing and
# must be restored from the original before the script can run.
rnn_first_source = Bidirectional(LSTM(100, return_sequences=True, activity_regularizer=regularizers.l2(0.0000001)),
rnn_first_syn = Bidirectional(LSTM(100, return_sequences=True, activity_regularizer=regularizers.l2(0.0000001)),
attention_weights_first = Attention(transpose_b=True)([rnn_first_source, rnn_first_syn])
first_source_out = Concatenate(axis=-1, name='sou_out_1')([source_emb_out, attention_weights_first, rnn_first_source])
first_syn_out = Concatenate(axis=-1, name='syn_out_1')([syn_emb_out, attention_weights_first, rnn_first_syn])
# Second layer
rnn_second_source = Bidirectional(LSTM(100, return_sequences=True, activity_regularizer=regularizers.l2(0.0000001)),
rnn_second_syn = Bidirectional(LSTM(100, return_sequences=True, activity_regularizer=regularizers.l2(0.0000001)),
attention_weights_second = Attention(transpose_b=True)([rnn_second_source, rnn_second_syn])
second_source_out = Concatenate(axis=-1, name='sou_out_2')(
    [first_source_out, attention_weights_second, rnn_second_source])
second_syn_out = Concatenate(axis=-1, name='syn_out_2')([first_syn_out, attention_weights_second, rnn_second_syn])
# Third layer
rnn_three_source = Bidirectional(LSTM(100, return_sequences=True, activity_regularizer=regularizers.l2(0.0000001)),
rnn_three_syn = Bidirectional(LSTM(100, return_sequences=True, activity_regularizer=regularizers.l2(0.0000001)),
attention_weights_three = Attention(transpose_b=True)([rnn_three_source, rnn_three_syn])
three_source_out = Concatenate(axis=-1, name='sou_out_3')(
    [second_source_out, attention_weights_three, rnn_three_source])
three_syn_out = Concatenate(axis=-1, name='syn_out_3')([second_syn_out, attention_weights_three, rnn_three_syn])
# Fourth layer
# Autoencoder layer
# Layers four and five first pass the previous output through dropout and two
# Dense layers (200 units, then 1000 units) before the LSTM/attention step;
# the second Dense output is what gets concatenated with the attention and
# LSTM outputs.
# NOTE(review): every Dense(...) and Bidirectional(...) statement in this
# section ends with a trailing comma and an unclosed parenthesis in this copy --
# the remaining arguments and the input tensor are missing and must be restored
# from the original before the script can run.
source_four_dropout = Dropout(rate=0.2, name='sou_dro_2')(three_source_out)
source_four_auto_dense1 = Dense(200, activation='relu', name='sou_dense_1',
source_four_auto_dense2 = Dense(1000, activation='relu', name='sou_dense_2',
syn_four_dropout = Dropout(rate=0.2, name='syn_dro_2')(three_syn_out)
syn_four_auto_dense1 = Dense(200, activation='relu', name='syn_dense_1',
syn_four_auto_dense2 = Dense(1000, activation='relu', name='syn_dense_2',
rnn_four_source = Bidirectional(LSTM(100, return_sequences=True, activity_regularizer=regularizers.l2(0.0000001)),
rnn_four_syn = Bidirectional(LSTM(100, return_sequences=True, activity_regularizer=regularizers.l2(0.0000001)),
attention_weights_four = Attention(transpose_b=True)([rnn_four_source, rnn_four_syn])
four_source_out = Concatenate(axis=-1, name='sou_out_4')(
    [source_four_auto_dense2, attention_weights_four, rnn_four_source])
four_syn_out = Concatenate(axis=-1, name='syn_out_4')([syn_four_auto_dense2, attention_weights_four, rnn_four_syn])
# Fifth layer
# Autoencoder layer
source_five_dropout = Dropout(rate=0.2, name='sou_dro_3')(four_source_out)
source_five_auto_dense1 = Dense(200, activation='relu', name='sou_dense_5',
source_five_auto_dense2 = Dense(1000, activation='relu', name='sou_dense_6',
syn_five_dropout = Dropout(rate=0.2, name='syn_dro_3')(four_syn_out)
syn_five_auto_dense1 = Dense(200, activation='relu', name='syn_dense_5',
syn_five_auto_dense2 = Dense(1000, activation='relu', name='syn_dense_6',
rnn_five_source = Bidirectional(LSTM(100, return_sequences=True, activity_regularizer=regularizers.l2(0.0000001)),
rnn_five_syn = Bidirectional(LSTM(100, return_sequences=True, activity_regularizer=regularizers.l2(0.0000001)),
attention_weights_five = Attention(transpose_b=True)([rnn_five_source, rnn_five_syn])
five_source_out = Concatenate(axis=-1, name='sou_out_5')(
    [source_five_auto_dense2, attention_weights_five, rnn_five_source])
five_syn_out = Concatenate(axis=-1, name='syn_out_5')([syn_five_auto_dense2, attention_weights_five, rnn_five_syn])
# Max-pooling layer
# The pool size equals the padded sequence length, so each branch is pooled
# over the whole sequence in a single window.
p = MaxPooling1D(syn_ten.shape[1], name='out_max_1')(five_source_out)
q = MaxPooling1D(syn_ten.shape[1], name='out_max_2')(five_syn_out)
# Output layer
# Output() combines the two pooled vectors into one feature vector.
v = Output()([p, q])
output_dropout = Dropout(0.8, name='out_dro_1')(v)
# NOTE(review): the three Dense(...) statements below end with a trailing comma
# and an unclosed parenthesis in this copy -- the remaining arguments and the
# input tensor are missing and must be restored from the original.
output_dense1 = Dense(units=1000, activation='relu', name='out_dense_1',
output_dense2 = Dense(units=1000, activation='relu', name='out_dense_2',
output_sigmoid = Dense(units=1, activation='sigmoid', name='out_dense_3',
# Model
# Five inputs, one sigmoid output, trained with binary cross-entropy.
model = Model([source, syn, source_char, syn_char, similar_input], output_sigmoid)
model.compile(optimizer='rmsprop', loss='binary_crossentropy')
# The dict keys match the name= given to each Input layer.
history = model.fit({'source': source_ten, 'syn': syn_ten, 'source_char': source_char_ten,
                     'syn_char': syn_char_ten, 'similar': similar}, labels, epochs=10, batch_size=batch_size)