Keras框架(二):实现文本相似度的几种模型(代码)

根据已学的知识,将几种深度学习模型运用到自己最近的科研项目——文本相似度:

使用框架:Keras

模型:深度学习相关模型

1. LSTM实现文本相似度:

def get_model(nb_words, EMBEDDING_DIM, embedding_matrix, MAX_SEQUENCE_LENGTH,
              num_lstm, rate_drop_lstm, rate_drop_dense, num_dense, act):
    """Build and compile a Siamese LSTM classifier for a pair of token sequences.

    Both inputs go through the same frozen embedding layer and the same LSTM
    layer. The two LSTM outputs are concatenated and passed to a small dense
    head that ends in a single sigmoid unit.

    Args:
        nb_words: Input dimension of the embedding layer.
        EMBEDDING_DIM: Output dimension of the embedding layer.
        embedding_matrix: Initial embedding weights; presumably shaped
            ``(nb_words, EMBEDDING_DIM)`` -- confirm against the caller.
        MAX_SEQUENCE_LENGTH: Length of each int32 input sequence.
        num_lstm: Number of units in the shared LSTM.
        rate_drop_lstm: Rate used for both the input dropout and the
            recurrent dropout of the LSTM.
        rate_drop_dense: Dropout rate used in the classifier head.
        num_dense: Width of the hidden dense layer.
        act: Activation of the hidden dense layer.

    Returns:
        The compiled model (binary cross-entropy loss, ``nadam`` optimizer,
        ``acc`` metric). The model summary is printed as a side effect.
    """
    seq_shape = (MAX_SEQUENCE_LENGTH,)
    token_inputs = [
        Input(shape=seq_shape, dtype='int32'),
        Input(shape=seq_shape, dtype='int32'),
    ]

    # One embedding layer serves both branches; its weights are not trained.
    embed = Embedding(
        nb_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False,
    )
    embedded = [embed(ids) for ids in token_inputs]

    # One LSTM serves both branches as well, so the two encodings share weights.
    encoder = LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)
    encoded = [encoder(seq) for seq in embedded]

    # Classifier head: layers are constructed and applied in this exact order.
    features = concatenate(encoded)
    head = (
        Dropout(rate_drop_dense),
        BatchNormalization(),
        Dense(num_dense, activation=act),
        Dropout(rate_drop_dense),
        BatchNormalization(),
    )
    for layer in head:
        features = layer(features)
    similarity = Dense(1, activation='sigmoid')(features)

    model = Model(inputs=token_inputs, outputs=similarity)
    model.compile(loss='binary_crossentropy', optimizer='nadam', metrics=['acc'])
    model.summary()
    return model

2. BiLSTM实现文本相似度

def get_model(nb_words, EMBEDDING_DIM, embedding_matrix, MAX_SEQUENCE_LENGTH,
              num_lstm, rate_drop_lstm, rate_drop_dense, num_dense, act):
    """Build and compile a Siamese bidirectional-LSTM classifier for a sequence pair.

    A frozen embedding layer and a bidirectional LSTM are shared by both
    inputs. The two encodings are concatenated and classified by a dense head
    with a single sigmoid output.

    Args:
        nb_words: Input dimension of the embedding layer.
        EMBEDDING_DIM: Output dimension of the embedding layer.
        embedding_matrix: Initial embedding weights; presumably shaped
            ``(nb_words, EMBEDDING_DIM)`` -- confirm against the caller.
        MAX_SEQUENCE_LENGTH: Length of each int32 input sequence.
        num_lstm: Number of units of the wrapped LSTM.
        rate_drop_lstm: Rate used for both the input dropout and the
            recurrent dropout of the LSTM.
        rate_drop_dense: Dropout rate used in the classifier head.
        num_dense: Width of the hidden dense layer.
        act: Activation of the hidden dense layer.

    Returns:
        The compiled model (binary cross-entropy loss, ``adam`` optimizer,
        ``acc`` metric). The model summary is printed as a side effect.
    """
    embed = Embedding(
        nb_words,
        EMBEDDING_DIM,
        weights=[embedding_matrix],
        input_length=MAX_SEQUENCE_LENGTH,
        trainable=False,
    )
    encoder = Bidirectional(
        LSTM(num_lstm, dropout=rate_drop_lstm, recurrent_dropout=rate_drop_lstm)
    )

    def _encode_branch():
        # Create one input and run it through the shared embedding and encoder.
        token_ids = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
        return token_ids, encoder(embed(token_ids))

    first_ids, first_vec = _encode_branch()
    second_ids, second_vec = _encode_branch()

    # Classifier head on the concatenated pair encoding.
    features = concatenate([first_vec, second_vec])
    features = Dropout(rate_drop_dense)(features)
    features = BatchNormalization()(features)

    features = Dense(num_dense, activation=act)(features)
    features = Dropout(rate_drop_dense)(features)
    features = BatchNormalization()(features)
    similarity = Dense(1, activation='sigmoid')(features)

    model = Model(inputs=[first_ids, second_ids], outputs=similarity)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
    model.summary()
    return model

3. ESIM实现文本相似度

def get_model(embedding_matrix_file, MAX_SEQUENCE_LENGTH, num_lstm, rate_drop_dense, num_dense):
    """Build and compile an ESIM-style model that scores a pair of token sequences.

    Pipeline: shared embedding + batch normalisation, shared BiLSTM encoding,
    soft attention alignment between the two sequences, shared BiLSTM
    composition, pooling, and a dense classifier with one sigmoid output.

    Relies on helpers defined outside this block
    (``create_pretrained_embedding``, ``soft_attention_alignment``,
    ``submult``, ``apply_multiple``); their exact behaviour is not visible here.

    Args:
        embedding_matrix_file: Passed to ``create_pretrained_embedding``;
            presumably a path to stored embedding weights -- confirm.
        MAX_SEQUENCE_LENGTH: Length of each int32 input sequence.
        num_lstm: Number of units of each wrapped LSTM.
        rate_drop_dense: Dropout rate used in the classifier head.
        num_dense: Width of each hidden dense layer.

    Returns:
        The compiled model (Adam with ``lr=1e-3``, binary cross-entropy loss,
        ``binary_crossentropy`` and ``accuracy`` metrics).
    """
    seq_shape = (MAX_SEQUENCE_LENGTH,)
    first_ids = Input(shape=seq_shape, dtype='int32')
    second_ids = Input(shape=seq_shape, dtype='int32')

    # Shared embedding, followed by one shared BatchNormalization on axis 2.
    embed = create_pretrained_embedding(embedding_matrix_file, mask_zero=False)
    embed_norm = BatchNormalization(axis=2)
    first_embedded = embed_norm(embed(first_ids))
    second_embedded = embed_norm(embed(second_ids))

    # Input encoding: the same BiLSTM returns one vector per time step.
    input_encoder = Bidirectional(LSTM(num_lstm, return_sequences=True))
    first_encoded = input_encoder(first_embedded)
    second_encoded = input_encoder(second_embedded)

    # Attention: align the two encoded sequences against each other.
    first_aligned, second_aligned = soft_attention_alignment(first_encoded, second_encoded)

    # Composition input: each encoding is joined with the *other* side's
    # aligned sequence and with their ``submult`` combination.
    first_combined = Concatenate()(
        [first_encoded, second_aligned, submult(first_encoded, second_aligned)]
    )
    second_combined = Concatenate()(
        [second_encoded, first_aligned, submult(second_encoded, first_aligned)]
    )

    composer = Bidirectional(LSTM(num_lstm, return_sequences=True))
    first_composed = composer(first_combined)
    second_composed = composer(second_combined)

    # Aggregation: average and max pooling layers handed to ``apply_multiple``.
    first_rep = apply_multiple(first_composed, [GlobalAvgPool1D(), GlobalMaxPool1D()])
    second_rep = apply_multiple(second_composed, [GlobalAvgPool1D(), GlobalMaxPool1D()])

    # Classifier: BN, then two (Dense -> BN -> Dropout) stages, then sigmoid.
    hidden = Concatenate()([first_rep, second_rep])
    hidden = BatchNormalization()(hidden)
    for _ in range(2):
        hidden = Dense(num_dense, activation='elu')(hidden)
        hidden = BatchNormalization()(hidden)
        hidden = Dropout(rate_drop_dense)(hidden)
    score = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=[first_ids, second_ids], outputs=score)
    optimizer = Adam(lr=1e-3)
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['binary_crossentropy', 'accuracy'],
    )
    return model

4. DSSM实现文本相似度

def get_model(embedding_matrix, nb_words, EMBEDDING_DIM, MAX_SEQUENCE_LENGTH, num_lstm, rate_drop_dense):
    """Build and compile a two-input matching network over word-level token ids.

    Both inputs share a frozen embedding layer, an attention layer, a
    bidirectional LSTM encoder and a stacked pair of LSTMs. Per input, the
    attention output and the BiLSTM output are concatenated into one vector.
    The two vectors are compared with an element-wise product, an absolute
    difference and an element-wise maximum of squares; the absolute difference
    of the stacked-LSTM outputs is appended. After dropout, two parallel dense
    layers (32 relu units, 48 sigmoid units) are concatenated and reduced to a
    single sigmoid output.

    A character-level branch is present only as commented-out code.

    Args:
        embedding_matrix: Initial embedding weights; presumably shaped
            ``(nb_words, EMBEDDING_DIM)`` -- confirm against the caller.
        nb_words: Input dimension of the embedding layer.
        EMBEDDING_DIM: Output dimension of the embedding layer.
        MAX_SEQUENCE_LENGTH: Length of each int32 input sequence; also passed
            to the attention layer constructor.
        num_lstm: Number of units of every LSTM in the model.
        rate_drop_dense: Dropout rate applied to the concatenated match features.

    Returns:
        The compiled model (Adam with ``lr=0.001``, binary cross-entropy loss,
        ``acc`` metric). The model summary is printed as a side effect.
    """

    # Project-local attention layer, shared by both inputs.
    # NOTE(review): its output is concatenated on axis 1 with the 2-D BiLSTM
    # output below, so it presumably pools over the time axis -- confirm in Attention.
    att1_layer = Attention.Attention(MAX_SEQUENCE_LENGTH)

    sequence_1_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')   # encoded word-level features of question 1
    sequence_2_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')   # encoded word-level features of question 2

    # embedding (shared by both inputs, weights frozen)
    embedding_layer = Embedding(nb_words,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)
    embedded_sequences_1 = embedding_layer(sequence_1_input)
    embedded_sequences_2 = embedding_layer(sequence_2_input)

    # encode: shared BiLSTM, one output vector per input sequence
    lstm1_layer = Bidirectional(LSTM(num_lstm))
    encode_sequences_1 = lstm1_layer(embedded_sequences_1)
    encode_sequences_2 = lstm1_layer(embedded_sequences_2)

    # lstm: shared two-layer stack (sequence-returning LSTM feeding a second LSTM)
    lstm0_layer = LSTM(num_lstm, return_sequences=True)
    lstm2_layer = LSTM(num_lstm)
    v1ls = lstm2_layer(lstm0_layer(embedded_sequences_1))
    v2ls = lstm2_layer(lstm0_layer(embedded_sequences_2))
    # Per-input representation: attention over the embeddings joined with the BiLSTM output.
    v1 = Concatenate(axis=1)([att1_layer(embedded_sequences_1), encode_sequences_1])
    v2 = Concatenate(axis=1)([att1_layer(embedded_sequences_2), encode_sequences_2])

    # Disabled character-level branch (inputs, embedding, encoding) kept for reference.
    # sequence_1c_input = Input(shape=(MAX_SEQUENCE_LENGTH_CHAR,), dtype='int32')  # encoded character-level features of question 1
    # sequence_2c_input = Input(shape=(MAX_SEQUENCE_LENGTH_CHAR,), dtype='int32')  # encoded character-level features of question 2

    # embedding_char_layer = Embedding(char_words,
    #                             EMBEDDING_DIM)

    # embedded_sequences_1c = embedding_char_layer(sequence_1c_input)
    # embedded_sequences_2c = embedding_char_layer(sequence_2c_input)

    # x1c = lstm1_layer(embedded_sequences_1c)
    # x2c = lstm1_layer(embedded_sequences_2c)
    # v1c = Concatenate(axis=1)([att1_layer(embedded_sequences_1c), x1c])
    # v2c = Concatenate(axis=1)([att1_layer(embedded_sequences_2c), x2c])

    # compose: pairwise match features between the two representations
    mul = Multiply()([v1, v2])  # element-wise product
    sub = Lambda(lambda x: K.abs(x))(Subtract()([v1, v2]))  # absolute difference
    maximum = Maximum()([Multiply()([v1, v1]), Multiply()([v2, v2])])  # element-wise max of the squares
    # mulc = Multiply()([v1c, v2c])
    # subc = Lambda(lambda x: K.abs(x))(Subtract()([v1c, v2c]))
    # maximumc = Maximum()([Multiply()([v1c, v1c]), Multiply()([v2c, v2c])])
    sub2 = Lambda(lambda x: K.abs(x))(Subtract()([v1ls, v2ls]))  # absolute difference of the stacked-LSTM outputs
    # matchlist = Concatenate(axis=1)([mul, sub, mulc, subc, maximum, maximumc, sub2])
    matchlist = Concatenate(axis=1)([mul, sub, maximum, sub2])
    matchlist = Dropout(rate_drop_dense)(matchlist)

    # Two parallel dense projections of the same features, joined before the output unit.
    matchlist = Concatenate(axis=1)(
        [Dense(32, activation='relu')(matchlist), Dense(48, activation='sigmoid')(matchlist)])
    res = Dense(1, activation='sigmoid')(matchlist)

    # model = Model(inputs=[sequence_1_input, sequence_2_input,
    #                       sequence_1c_input, sequence_2c_input], outputs=res)
    model = Model(inputs=[sequence_1_input, sequence_2_input], outputs=res)
    # NOTE(review): newer Keras releases name this Adam argument `learning_rate`
    # instead of `lr` -- confirm against the installed version.
    model.compile(optimizer=Adam(lr=0.001), loss="binary_crossentropy", metrics=['acc'])
    model.summary()
    return model

5. Decomposable Attention实现文本相似度

def get_model(embedding_matrix_file, MAX_SEQUENCE_LENGTH,
              rate_drop_projction, num_projction, hidden_projction,
              rate_drop_compare, num_compare,
              rate_drop_dense, num_dense):
    """Build and compile a decomposable-attention style model for a sequence pair.

    Pipeline: shared embedding, shared per-step projection, soft attention
    alignment, shared per-step comparison network, pooling, and a dense
    classifier with one sigmoid output.

    Relies on helpers defined outside this block
    (``create_pretrained_embedding``, ``time_distributed``,
    ``soft_attention_alignment``, ``submult``, ``apply_multiple``); their
    exact behaviour is not visible here.

    Args:
        embedding_matrix_file: Passed to ``create_pretrained_embedding``;
            presumably a path to stored embedding weights -- confirm.
        MAX_SEQUENCE_LENGTH: Length of each int32 input sequence.
        rate_drop_projction: Dropout rate used inside the projection stack.
        num_projction: Width of the final (linear) projection layer.
        hidden_projction: Width of an extra ``elu`` projection layer that is
            added in front only when this value is greater than zero.
        rate_drop_compare: Dropout rate used inside the comparison stack.
        num_compare: Width of both comparison dense layers.
        rate_drop_dense: Dropout rate used in the classifier head.
        num_dense: Width of each hidden dense layer of the classifier.

    Returns:
        The compiled model (Adam with ``lr=1e-3``, binary cross-entropy loss,
        ``binary_crossentropy`` and ``accuracy`` metrics).
    """
    seq_shape = (MAX_SEQUENCE_LENGTH,)
    first_ids = Input(shape=seq_shape, dtype='int32')
    second_ids = Input(shape=seq_shape, dtype='int32')

    # Shared embedding for both inputs.
    embed = create_pretrained_embedding(embedding_matrix_file, mask_zero=False)
    first_embedded = embed(first_ids)
    second_embedded = embed(second_ids)

    # Projection stack: optional hidden stage first, then the linear stage.
    hidden_stage = []
    if hidden_projction > 0:
        hidden_stage = [
            Dense(hidden_projction, activation='elu'),
            Dropout(rate=rate_drop_projction),
        ]
    projection_layers = hidden_stage + [
        Dense(num_projction, activation=None),
        Dropout(rate=rate_drop_projction),
    ]
    first_encoded = time_distributed(first_embedded, projection_layers)
    second_encoded = time_distributed(second_embedded, projection_layers)

    # Attention: align the two projected sequences against each other.
    first_aligned, second_aligned = soft_attention_alignment(first_encoded, second_encoded)

    # Comparison input: each projection is joined with the *other* side's
    # aligned sequence and with their ``submult`` combination.
    first_combined = Concatenate()(
        [first_encoded, second_aligned, submult(first_encoded, second_aligned)]
    )
    second_combined = Concatenate()(
        [second_encoded, first_aligned, submult(second_encoded, first_aligned)]
    )
    compare_layers = [
        Dense(num_compare, activation='elu'),
        Dropout(rate_drop_compare),
        Dense(num_compare, activation='elu'),
        Dropout(rate_drop_compare),
    ]
    first_compared = time_distributed(first_combined, compare_layers)
    second_compared = time_distributed(second_combined, compare_layers)

    # Aggregation: average and max pooling layers handed to ``apply_multiple``.
    first_rep = apply_multiple(first_compared, [GlobalAvgPool1D(), GlobalMaxPool1D()])
    second_rep = apply_multiple(second_compared, [GlobalAvgPool1D(), GlobalMaxPool1D()])

    # Classifier: two (BN -> Dense -> Dropout) stages, then the sigmoid unit.
    hidden = Concatenate()([first_rep, second_rep])
    for _ in range(2):
        hidden = BatchNormalization()(hidden)
        hidden = Dense(num_dense, activation='elu')(hidden)
        hidden = Dropout(rate_drop_dense)(hidden)
    score = Dense(1, activation='sigmoid')(hidden)

    model = Model(inputs=[first_ids, second_ids], outputs=score)
    optimizer = Adam(lr=1e-3)
    model.compile(
        optimizer=optimizer,
        loss='binary_crossentropy',
        metrics=['binary_crossentropy', 'accuracy'],
    )
    return model

6. 使用多头自注意力机制的简单网络实现文本相似度

def get_model(embedding_matrix_file, MAX_SEQUENCE_LENGTH, rate_drop_dense):
    """Build and compile a small self-attention model that scores a sequence pair.

    Each input is embedded with a shared embedding layer, passed through its
    own ``Attention.Attention(8, 16)`` layer (the same tensor is supplied
    three times), average-pooled over the sequence, and the two pooled
    vectors are concatenated, dropped out and mapped to one sigmoid unit.

    ``create_pretrained_embedding`` and ``Attention.Attention`` are defined
    outside this block; the meaning of the ``(8, 16)`` arguments is
    presumably (number of heads, size per head) -- confirm in Attention.

    Args:
        embedding_matrix_file: Passed to ``create_pretrained_embedding``;
            presumably a path to stored embedding weights -- confirm.
        MAX_SEQUENCE_LENGTH: Length of each int32 input sequence.
        rate_drop_dense: Dropout rate applied before the output unit.

    Returns:
        The compiled model (binary cross-entropy loss, ``adam`` optimizer,
        ``accuracy`` metric).
    """
    token_inputs = [
        Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
        for _ in range(2)
    ]

    # Shared embedding for both inputs.
    embed = create_pretrained_embedding(embedding_matrix_file, mask_zero=False)
    embedded = [embed(ids) for ids in token_inputs]

    # An optional position-embedding step (pos_embed.Position_Embedding applied
    # to each embedded sequence) is disabled in this version.

    # Self-attention: a separate attention layer instance per branch, so the
    # attention weights are not shared between the two inputs.
    attended = [
        Attention.Attention(8, 16)([seq, seq, seq])
        for seq in embedded
    ]

    # Aggregation: plain average pooling per branch. The ESIM-style
    # avg + max pooling via apply_multiple is not used here.
    pooled = [GlobalAveragePooling1D()(seq) for seq in attended]

    # Classifier: dropout on the concatenated pair, then the sigmoid unit.
    pair_features = Concatenate()(pooled)
    pair_features = Dropout(rate_drop_dense)(pair_features)
    outputs = Dense(1, activation='sigmoid')(pair_features)

    model = Model(inputs=token_inputs, outputs=outputs)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

 

 

 

你可能感兴趣的:(deep,learning,Python)