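本文的主要目的是实现韩国 Naver Corporation 与 Seoul National University 发表的一篇论文中的模型,该模型主要用于句子匹配任务。模型使用 Keras 实现,后端为 TensorFlow。论文地址:(原文此处链接缺失;从下文公式看应为 DRCN 论文《Semantic Sentence Matching with Densely-connected Recurrent and Co-attentive Information》,请核对)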
$$e_{p_i}^{fix} = \mathrm{emb}_{fix}(p_i)$$
$$c_{p_i} = \mathrm{char}_{conv}(p_i)$$
$$e_{p_i}^{tr} = \mathrm{emb}_{train}(p_i)$$
$$p_i^{w} = [e_{p_i}^{tr}, e_{p_i}^{fix}, c_{p_i}, f_{p_i}]$$
在每个时间步都使用双向 RNN;层与层之间的拼接(concatenation)操作使各层的隐藏特征能够一直保留到最上层,此前所有层的特征作为"集体知识"共同参与最终的预测。
$$x_t^{l} = [h_t^{l-1}, x_t^{l-1}]$$
$$h_t^{l} = H_l(x_t^{l}, h_{t-1}^{l})$$
$$o_{i,j} = \frac{\exp(e_{i,j})}{\sum_{k=1}^{J} \exp(e_{i,k})}$$
$$a_{p_i} = \sum_{j=1}^{J} o_{i,j} \, h_{q_j}$$
$$e_{i,j} = \cos(h_{p_i}, h_{q_j})$$
$$x_t^{l} = [h_t^{l-1}, a_t^{l-1}, x_t^{l-1}]$$
$$h_t^{l} = H_l(x_t^{l}, h_{t-1}^{l})$$
import os
import jieba
import numpy as np
import pandas as pd
import tensorflow as tf
from multiprocessing import Pool
# Switch TensorFlow to eager execution (TF 1.x API).
# NOTE(review): TensorFlow expects this call at program start-up, before any
# other TensorFlow op is created — keep it directly after the imports.
tf.enable_eager_execution()
# Width of one word-embedding vector; emd_matrix() uses it as the number of
# columns of the embedding matrix (the GloVe file name also says "300d").
EMB_DIM = 300
class Similar:
    """Flag the tokens of each source sentence that also occur in its paired sentence.

    Both arguments are 2-D integer arrays of padded token ids with one
    sentence per row; id ``0`` is treated as padding and is never flagged.
    """

    def __init__(self, source, syn):
        # Padded token ids of the sentences being flagged.
        self.source = source
        # Padded token ids of the sentences they are compared against.
        self.syn = syn
        # x: number of sentences (rows); y: padded sentence length (columns).
        self.x, self.y = self.source.shape

    def get_similar_cell(self, index):
        """Compute the exact-match flags for one sentence pair.

        Args:
            index: Row index of the sentence pair.

        Returns:
            ``(index, flags)`` where ``flags`` is a float array of shape
            ``(1, y)`` holding 1 where the source token is not padding and
            also appears in row ``index`` of ``syn``, 0 elsewhere.
        """
        row = np.asarray(self.source[index])
        # One vectorised membership test replaces the per-token Python loop.
        hits = (row != 0) & np.isin(row, self.syn[index])
        return index, hits.astype('float64').reshape(1, self.y)

    def get_similar(self):
        """Return the match flags for every row, shaped and typed like ``source``.

        The rows are processed in-process. The previous implementation used a
        ``multiprocessing.Pool``, which pickled both full arrays for the
        workers and, because the model script runs ``main()`` at import time,
        could not work under the "spawn" start method (workers re-import the
        main module). The vectorised row test makes the pool unnecessary.
        """
        similar = np.zeros_like(self.source)
        for index in range(self.x):
            similar[index] = self.get_similar_cell(index)[1][0]
        return similar
def get_data(base_path='log'):
    """Load the sentence pairs from every tab-separated ``.csv`` file in a directory.

    Each data row is read as ``(sentence_a, sentence_b, label)``. Both
    sentences are segmented into words with ``jieba.cut`` and, separately,
    split into single characters.

    Args:
        base_path: Directory that is scanned (non-recursively) for ``.csv``
            files. Defaults to ``'log'``, the location used originally.

    Returns:
        A 5-tuple ``(sources, synonymous, sources_char, synonymous_char,
        labels)``: four lists with one token list per row, and an ``int32``
        NumPy array of labels.
    """
    sources = []
    synonymous = []
    sources_char = []
    synonymous_char = []
    labels = []
    # os.listdir() returns entries in arbitrary order; sorting keeps the
    # sample order reproducible between runs and machines.
    for path in sorted(os.listdir(base_path)):
        if not path.endswith('.csv'):
            continue
        # Malformed rows already raise by default. The old
        # ``error_bad_lines=True`` keyword only restated that default and was
        # removed in pandas 2.0, where passing it raises TypeError.
        dataset = pd.read_csv(os.path.join(base_path, path), sep='\t')
        for data in dataset.values:
            sources.append(list(jieba.cut(data[0])))
            synonymous.append(list(jieba.cut(data[1])))
            labels.append(data[2])
            sources_char.append(list(data[0]))
            synonymous_char.append(list(data[1]))
    return sources, synonymous, sources_char, synonymous_char, np.array(labels, dtype='int32')
def tokenize(source, syn):
    """Fit one tokenizer on both sentence lists and encode each list with it.

    The vocabulary is built from ``source + syn`` so that the two sides share
    the same token ids. ``filters=''`` turns off the tokenizer's character
    filtering.

    Returns:
        ``(tokenizer, source_tensor, syn_tensor)`` — the fitted tokenizer and
        the id sequences of ``source`` and ``syn``.
    """
    tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
    tokenizer.fit_on_texts(source + syn)
    source_tensor, syn_tensor = (
        tokenizer.texts_to_sequences(texts) for texts in (source, syn)
    )
    return tokenizer, source_tensor, syn_tensor
def align(tensor):
    """Post-pad all four sequence collections to one common length.

    Args:
        tensor: A sequence of four lists of id sequences, in the order
            ``(source words, syn words, source chars, syn chars)``.

    Returns:
        The four padded arrays in the same order. The common length is the
        longest sequence found in any of the four collections.
    """
    source_tensor, syn_tensor, source_char_tensor, syn_char_tensor = tensor
    max_len = max_length(source_tensor + syn_tensor + source_char_tensor + syn_char_tensor)
    pad = tf.keras.preprocessing.sequence.pad_sequences
    syn_tensor, source_tensor, sou_char_tensor, syn_char_tensor = [
        pad(sequences, maxlen=max_len, padding='post')
        for sequences in (syn_tensor, source_tensor, source_char_tensor, syn_char_tensor)
    ]
    return source_tensor, syn_tensor, sou_char_tensor, syn_char_tensor
def emd_matrix(max_words, word_index, path='zhs_wiki_glove.vectors.300d.txt', emb_dim=None):
    """Build the pretrained embedding matrix from a GloVe-style text file.

    Args:
        max_words: Largest token id to fill; the matrix has ``max_words + 1``
            rows so that ids ``0..max_words`` are all valid row indices.
        word_index: Mapping from token id to word (the caller passes the
            tokenizer's ``index_word``).
        path: Vector file with one ``word v1 v2 ...`` entry per line.
            Defaults to the self-trained GloVe file used originally.
        emb_dim: Vector width. ``None`` means the module constant ``EMB_DIM``.

    Returns:
        A float array of shape ``(max_words + 1, emb_dim)``. Rows of words
        without a pretrained vector, and row 0, stay all-zero.
    """
    if emb_dim is None:
        emb_dim = EMB_DIM
    embedding_index = {}
    # The vectors are keyed by Chinese words, so the encoding must not depend
    # on the platform default (which is not UTF-8 on every system).
    with open(path, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            # Skip blank lines and rows of the wrong width (for example a
            # "count dim" header line) instead of crashing or storing a
            # vector that cannot fill a matrix row.
            if len(values) != emb_dim + 1:
                continue
            embedding_index[values[0]] = np.asarray(values[1:], dtype='float32')
    embedding_matrix = np.zeros((max_words + 1, emb_dim))
    for i, word in word_index.items():
        # Inclusive bound: row ``max_words`` exists, and with ids starting at
        # 1 it belongs to the last vocabulary entry, which ``<`` left empty.
        if i <= max_words:
            embedding_vector = embedding_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
    return embedding_matrix
def max_length(tensor):
    """Return the length of the longest sequence contained in ``tensor``."""
    return max(map(len, tensor))
def main():
    """Run the whole preprocessing pipeline and return the model inputs.

    Returns:
        ``(source_tensor, syn_tensor, sou_char_tensor, syn_char_tensor,
        emb_matrix, vocab_size, char_size, similar, label)``.
    """
    source, syn, source_char, syn_char, label = get_data()

    # Word-level and character-level vocabularies are fitted separately.
    word_tokenizer, source_tensor, syn_tensor = tokenize(source, syn)
    char_tokenizer, sou_char_tensor, syn_char_tensor = tokenize(source_char, syn_char)

    padded = align([source_tensor, syn_tensor, sou_char_tensor, syn_char_tensor])
    source_tensor, syn_tensor, sou_char_tensor, syn_char_tensor = padded

    # Flags marking source tokens that also appear in the paired sentence.
    similar = Similar(source_tensor, syn_tensor).get_similar()

    vocab_size = len(word_tokenizer.index_word)
    char_size = len(char_tokenizer.index_word)
    print(char_size)

    emb_matrix = emd_matrix(vocab_size, word_tokenizer.index_word)
    return (source_tensor, syn_tensor, sou_char_tensor, syn_char_tensor,
            emb_matrix, vocab_size, char_size, similar, label)


# Run the pipeline only when this file is executed directly.
if __name__ == '__main__':
    main()
import os
import numpy as np
import tensorflow as tf
import keras.backend as K
from keras import Model, regularizers
from keras.layers import Bidirectional, LSTM, merge
from keras.layers import Input, Embedding, Conv1D, MaxPooling1D, Concatenate, Dense, Dropout
from .datasets import main
def expand_dims(tensor):
    """Insert a new size-1 axis at position 2 of ``tensor``."""
    expanded = K.expand_dims(tensor, axis=2)
    return expanded
def soft_max(x, axis=1):
    """Numerically stable softmax of ``x`` along ``axis``.

    Args:
        x: Tensor with at least two dimensions.
        axis: Axis along which the result sums to 1. Defaults to 1.

    Returns:
        A tensor shaped like ``x``.

    Raises:
        ValueError: If ``x`` has fewer than two dimensions.
    """
    if K.ndim(x) < 2:
        raise ValueError('Cannot apply softmax to a tensor that is 1D')
    # A single code path for every rank: the former 2-D shortcut called
    # K.softmax(x) and silently ignored ``axis``. For the default axis=1 on a
    # 2-D tensor the result is the same softmax over the last axis.
    # Subtracting the per-slice maximum keeps exp() from overflowing.
    e = K.exp(x - K.max(x, axis=axis, keepdims=True))
    return e / K.sum(e, axis=axis, keepdims=True)
class Output(merge._Merge):
    """Combine the two pooled sentence vectors into one feature vector.

    Expects two inputs ``[p, q]`` that each carry a size-1 axis at position 1
    (it is squeezed away) and returns ``[p, q, p + q, p - q, |p - q|]``
    concatenated along the last axis.
    """

    def _merge_function(self, inputs):
        last_source_max = inputs[0]
        # The second input is the pooled syn sentence. Reading inputs[0] here
        # made q a copy of p, so ``p - q`` and ``|p - q|`` were always zero.
        last_syn_max = inputs[1]
        p_ = K.squeeze(last_source_max, axis=1)
        q_ = K.squeeze(last_syn_max, axis=1)
        return K.concatenate([p_, q_, p_ + q_, p_ - q_, K.abs(p_ - q_)], axis=-1)

    def compute_output_shape(self, input_shape):
        """Return ``(batch, 5 * features)`` derived from the first input's shape.

        The original author noted that this calculation is rather ad hoc:
        only the first input's shape is consulted.
        """
        return input_shape[0][0], input_shape[0][2] * 5
class Attention(merge._Merge):
    """Cosine-similarity attention of the first input over the second.

    For inputs ``[x, y]`` the layer scores every position of ``x`` against
    every position of ``y`` by cosine similarity, softmax-normalises the
    scores over the positions of ``y`` and returns, for each position of
    ``x``, the weighted sum of the ``y`` vectors.

    All constructor keyword arguments are forwarded to ``tf.matmul`` when the
    score matrix is computed; the visible callers pass ``transpose_b=True``.
    """

    def __init__(self, **kwargs):
        super().__init__()
        # Keyword arguments for tf.matmul (e.g. transpose_b=True).
        self.matmul = kwargs

    def _merge_function(self, inputs):
        x = inputs[0]
        y = inputs[1]
        # Dot products of unit-length vectors are cosine similarities.
        # l2_normalize divides by the true L2 norm (the old code divided by
        # the product of *squared* norms) and guards against zero vectors.
        x_unit = K.l2_normalize(x, axis=-1)
        y_unit = K.l2_normalize(y, axis=-1)
        cos = tf.matmul(x_unit, y_unit, **self.matmul)
        # Normalise over the attended positions (last axis) so that each
        # query position gets weights summing to 1; soft_max's default axis=1
        # normalised over the query positions instead.
        activator = soft_max(cos, axis=-1)
        return tf.matmul(activator, y)
class EmbConcatenate(merge._Merge):
    """Concatenate three per-token feature tensors with a per-token flag.

    Inputs, in order: two word-embedding tensors, the character feature
    tensor and the match-flag tensor. The flag tensor receives a trailing
    size-1 axis before everything is joined along the last axis.
    """

    def build(self, input_shape):
        # The inputs are concatenated as they are; skip the base class's
        # reshape handling.
        self._reshape_required = False

    def compute_output_shape(self, input_shape):
        """Sum the feature widths of all inputs but the last, plus one for the flag."""
        if not isinstance(input_shape, list):
            raise TypeError
        feature_width = sum(shape[2] for shape in input_shape[:-1])
        batch, steps = input_shape[0][0], input_shape[0][1]
        return batch, steps, feature_width + 1

    def _merge_function(self, inputs):
        flag = K.expand_dims(inputs[3])
        emb_train, emb_fix, char_max = inputs[0], inputs[1], inputs[2]
        return K.concatenate([emb_train, emb_fix, char_max, flag])
# Width of the word-embedding vectors used by the Embedding layers below.
EMB_DIM = 300
# NOTE(review): VOCAB_SIZE is not referenced anywhere in the visible code;
# the vocabulary size actually used is the one returned by main().
VOCAB_SIZE = 1000
# tf.enable_eager_execution()
# NOTE(review): presumably a workaround for the "duplicate OpenMP runtime"
# abort seen with some MKL builds — confirm it is still needed.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
# Mini-batch size used for training.
batch_size = 64
# Runs the whole preprocessing pipeline (CSV loading, tokenising, padding,
# embedding lookup) as soon as this module is executed or imported.
source_ten, syn_ten, source_char_ten, syn_char_ten, vocab_emb_matrix, vocab_size, char_size, similar, labels = main()
# Padded sequence length (second dimension of the padded syn array).
middle_shape = syn_ten.shape[1]
# Input layers: word ids, character ids and match flags, all padded to
# ``middle_shape`` steps.
# Only ``shape`` is given so that the batch dimension stays dynamic. Pinning
# it with ``batch_shape=(batch_size, middle_shape)`` makes training fail on a
# final batch smaller than ``batch_size`` (the sample count is rarely an
# exact multiple), and tf.keras rejects ``shape`` and ``batch_shape``
# passed together.
source = Input(shape=(middle_shape,), name='source')
syn = Input(shape=(middle_shape,), name='syn')
syn_char = Input(shape=(middle_shape,), name='syn_char')
source_char = Input(shape=(middle_shape,), name='source_char')
similar_input = Input(shape=(middle_shape,), name='similar')
# Embedding layers.
# Tokenizer ids run from 1 to vocab_size / char_size and 0 is the padding id,
# so every lookup table needs one extra row. vocab_emb_matrix is built with
# vocab_size + 1 rows, so the previous ``input_dim=vocab_size`` could not
# load it and left the largest id out of range.
# NOTE(review): despite their names, the ``*_emb_train`` layers are the
# frozen pretrained ones and the ``*_emb_fix`` layers are the trainable ones.
# Number of character-convolution filters (previously borrowed from
# ``batch_size``, which only happened to hold the same value).
char_filters = 64
source_emb_train = Embedding(vocab_size + 1, EMB_DIM, weights=[vocab_emb_matrix], trainable=False,
                             name='sou_emb_1')(source)
source_emb_fix = Embedding(vocab_size + 1, EMB_DIM, name='sou_emb_2')(source)
source_char_emb = Embedding(char_size + 1, 16, name='sou_emb_3')(source_char)
source_char_cnn = Conv1D(char_filters, 7, activation='relu', padding='same', name='sou_con')(source_char_emb)
# With data_format='channels_first' the pooling runs across the filter axis,
# keeping one feature vector per time step.
source_char_max = MaxPooling1D(data_format='channels_first', name='sou_max')(source_char_cnn)
source_cont = EmbConcatenate()([source_emb_train, source_emb_fix, source_char_max, similar_input])
source_emb_out = Dropout(0.5, name='sou_dro_1')(source_cont)
syn_emb_train = Embedding(vocab_size + 1, EMB_DIM, weights=[vocab_emb_matrix], trainable=False,
                          name='syn_emb_1')(syn)
syn_emb_fix = Embedding(vocab_size + 1, EMB_DIM, name='syn_emb_2')(syn)
syn_char_emb = Embedding(char_size + 1, 16, name='syn_emb_3')(syn_char)
syn_char_cnn = Conv1D(char_filters, 7, activation='relu', padding='same', name='syn_con')(syn_char_emb)
syn_char_max = MaxPooling1D(data_format='channels_first', name='syn_max')(syn_char_cnn)
# NOTE(review): the syn branch receives the same ``similar_input`` flags as
# the source branch; main() computes them for the source sentence only.
# Confirm whether syn-side flags should be computed and fed separately.
syn_cont = EmbConcatenate()([syn_emb_train, syn_emb_fix, syn_char_max, similar_input])
syn_emb_out = Dropout(0.5, name='syn_dro_1')(syn_cont)
# Layers 1-3 share one pattern: a BiLSTM per sentence, co-attention between
# the two BiLSTM outputs, then a dense concatenation of
# [layer input, attention, BiLSTM output].
# Attention is directional (query sentence first, attended sentence second),
# so each branch needs its own layer: source attends over syn and syn attends
# over source. Previously the source->syn result was fed to both branches.
# Layer 1
rnn_first_source = Bidirectional(LSTM(100, return_sequences=True, activity_regularizer=regularizers.l2(0.0000001)),
                                 name='sou_rnn_1')(source_emb_out)
rnn_first_syn = Bidirectional(LSTM(100, return_sequences=True, activity_regularizer=regularizers.l2(0.0000001)),
                              name='syn_rnn_1')(syn_emb_out)
attention_weights_first = Attention(transpose_b=True)([rnn_first_source, rnn_first_syn])
attention_syn_first = Attention(transpose_b=True)([rnn_first_syn, rnn_first_source])
first_source_out = Concatenate(axis=-1, name='sou_out_1')([source_emb_out, attention_weights_first, rnn_first_source])
first_syn_out = Concatenate(axis=-1, name='syn_out_1')([syn_emb_out, attention_syn_first, rnn_first_syn])
# Layer 2
rnn_second_source = Bidirectional(LSTM(100, return_sequences=True, activity_regularizer=regularizers.l2(0.0000001)),
                                  name='sou_rnn_2')(first_source_out)
rnn_second_syn = Bidirectional(LSTM(100, return_sequences=True, activity_regularizer=regularizers.l2(0.0000001)),
                               name='syn_rnn_2')(first_syn_out)
attention_weights_second = Attention(transpose_b=True)([rnn_second_source, rnn_second_syn])
attention_syn_second = Attention(transpose_b=True)([rnn_second_syn, rnn_second_source])
second_source_out = Concatenate(axis=-1, name='sou_out_2')(
    [first_source_out, attention_weights_second, rnn_second_source])
second_syn_out = Concatenate(axis=-1, name='syn_out_2')([first_syn_out, attention_syn_second, rnn_second_syn])
# Layer 3
rnn_three_source = Bidirectional(LSTM(100, return_sequences=True, activity_regularizer=regularizers.l2(0.0000001)),
                                 name='sou_rnn_3')(second_source_out)
rnn_three_syn = Bidirectional(LSTM(100, return_sequences=True, activity_regularizer=regularizers.l2(0.0000001)),
                              name='syn_rnn_3')(second_syn_out)
attention_weights_three = Attention(transpose_b=True)([rnn_three_source, rnn_three_syn])
attention_syn_three = Attention(transpose_b=True)([rnn_three_syn, rnn_three_source])
three_source_out = Concatenate(axis=-1, name='sou_out_3')(
    [second_source_out, attention_weights_three, rnn_three_source])
three_syn_out = Concatenate(axis=-1, name='syn_out_3')([second_syn_out, attention_syn_three, rnn_three_syn])
# Layers 4 and 5 first pass the densely concatenated features through a
# dropout + two-Dense bottleneck (200 units, then 1000 units) and then repeat
# the BiLSTM / co-attention / concatenation pattern.
# Attention is directional (query sentence first, attended sentence second),
# so each branch gets its own layer: source attends over syn and syn attends
# over source. Previously the source->syn result was fed to both branches.
# Layer 4
# Autoencoder (bottleneck) layers
source_four_dropout = Dropout(rate=0.2, name='sou_dro_2')(three_source_out)
source_four_auto_dense1 = Dense(200, activation='relu', name='sou_dense_1',
                                activity_regularizer=regularizers.l2(0.0000001))(source_four_dropout)
source_four_auto_dense2 = Dense(1000, activation='relu', name='sou_dense_2',
                                activity_regularizer=regularizers.l2(0.0000001))(source_four_auto_dense1)
syn_four_dropout = Dropout(rate=0.2, name='syn_dro_2')(three_syn_out)
syn_four_auto_dense1 = Dense(200, activation='relu', name='syn_dense_1',
                             activity_regularizer=regularizers.l2(0.0000001))(syn_four_dropout)
syn_four_auto_dense2 = Dense(1000, activation='relu', name='syn_dense_2',
                             activity_regularizer=regularizers.l2(0.0000001))(syn_four_auto_dense1)
rnn_four_source = Bidirectional(LSTM(100, return_sequences=True, activity_regularizer=regularizers.l2(0.0000001)),
                                name='sou_rnn_4')(source_four_auto_dense2)
rnn_four_syn = Bidirectional(LSTM(100, return_sequences=True, activity_regularizer=regularizers.l2(0.0000001)),
                             name='syn_rnn_4')(syn_four_auto_dense2)
attention_weights_four = Attention(transpose_b=True)([rnn_four_source, rnn_four_syn])
attention_syn_four = Attention(transpose_b=True)([rnn_four_syn, rnn_four_source])
four_source_out = Concatenate(axis=-1, name='sou_out_4')(
    [source_four_auto_dense2, attention_weights_four, rnn_four_source])
four_syn_out = Concatenate(axis=-1, name='syn_out_4')([syn_four_auto_dense2, attention_syn_four, rnn_four_syn])
# Layer 5
# Autoencoder (bottleneck) layers
source_five_dropout = Dropout(rate=0.2, name='sou_dro_3')(four_source_out)
source_five_auto_dense1 = Dense(200, activation='relu', name='sou_dense_5',
                                activity_regularizer=regularizers.l2(0.0000001))(source_five_dropout)
source_five_auto_dense2 = Dense(1000, activation='relu', name='sou_dense_6',
                                activity_regularizer=regularizers.l2(0.0000001))(source_five_auto_dense1)
syn_five_dropout = Dropout(rate=0.2, name='syn_dro_3')(four_syn_out)
syn_five_auto_dense1 = Dense(200, activation='relu', name='syn_dense_5',
                             activity_regularizer=regularizers.l2(0.0000001))(syn_five_dropout)
syn_five_auto_dense2 = Dense(1000, activation='relu', name='syn_dense_6',
                             activity_regularizer=regularizers.l2(0.0000001))(syn_five_auto_dense1)
rnn_five_source = Bidirectional(LSTM(100, return_sequences=True, activity_regularizer=regularizers.l2(0.0000001)),
                                name='sou_rnn_5')(source_five_auto_dense2)
rnn_five_syn = Bidirectional(LSTM(100, return_sequences=True, activity_regularizer=regularizers.l2(0.0000001)),
                             name='syn_rnn_5')(syn_five_auto_dense2)
attention_weights_five = Attention(transpose_b=True)([rnn_five_source, rnn_five_syn])
attention_syn_five = Attention(transpose_b=True)([rnn_five_syn, rnn_five_source])
five_source_out = Concatenate(axis=-1, name='sou_out_5')(
    [source_five_auto_dense2, attention_weights_five, rnn_five_source])
five_syn_out = Concatenate(axis=-1, name='syn_out_5')([syn_five_auto_dense2, attention_syn_five, rnn_five_syn])
# Max pooling over time: a pool size equal to the padded sequence length
# leaves a single step per sentence.
p = MaxPooling1D(pool_size=syn_ten.shape[1], name='out_max_1')(five_source_out)
q = MaxPooling1D(pool_size=syn_ten.shape[1], name='out_max_2')(five_syn_out)
# Output head: combine both sentence vectors, then dropout, two 1000-unit
# ReLU layers and a single sigmoid unit.
v = Output()([p, q])
output_dropout = Dropout(rate=0.8, name='out_dro_1')(v)
output_dense1 = Dense(
    1000,
    activation='relu',
    activity_regularizer=regularizers.l2(1e-7),
    name='out_dense_1',
)(output_dropout)
output_dense2 = Dense(
    1000,
    activation='relu',
    activity_regularizer=regularizers.l2(1e-7),
    name='out_dense_2',
)(output_dense1)
output_sigmoid = Dense(
    1,
    activation='sigmoid',
    activity_regularizer=regularizers.l2(1e-7),
    name='out_dense_3',
)(output_dense2)
# Model: build, compile, train for 10 epochs and save to 'second.h5'.
model = Model([source, syn, source_char, syn_char, similar_input], output_sigmoid)
model.compile(optimizer='rmsprop', loss='binary_crossentropy')
# The dictionary keys must match the ``name`` of each Input layer.
history = model.fit({'source': source_ten, 'syn': syn_ten, 'source_char': source_char_ten,
                     'syn_char': syn_char_ten, 'similar': similar}, labels, epochs=10, batch_size=batch_size)
model.save('second.h5')
# The per-epoch metrics live in ``history.history``; printing the History
# object itself only shows its repr.
print(history.history)