The bidirectional long short-term memory attention network (BAN) targets SMILES prediction and classification tasks, combining SMILES Enumeration data augmentation with an attention-based LSTM. Paper: Learning to SMILES: BAN-based strategies to improve latent representation learning from molecules; code: smiles-biLSTM-attention-network. Since part of the data is missing, logD7.4 is used as the example and we walk through the regression (reg) task. The walkthrough starts from dataset, because the reg script first builds the Dataset and then calls get_data, while preprocessing contains many functions that are never used. The model pipeline is as follows:
import re
from copy import deepcopy

import numpy as np
import pandas as pd
from rdkit import Chem  # needed by randomize_smile, reproduced below for reference
from preprocessing import randomize_smile

# Regex that tokenizes a SMILES string: the two-character tokens Cl/Br first, then single characters
regex_pattern = r'Cl|Br|[#%\)\(\+\-1032547698:=@CBFIHONPS\[\]cionps]'
def randomize_smile(sml, max_len=100):
    """Randomize a SMILES sequence. Adapted from the implementation of
    E. Bjerrum 2017, "SMILES Enumeration as Data Augmentation for Neural
    Network Modeling of Molecules".

    Args:
        sml: SMILES sequence to randomize.
    Returns:
        Randomized SMILES sequence, or nan if the SMILES is not interpretable.
    """
    try:
        # Shuffle the atom order and write the molecule back out non-canonically
        m = Chem.MolFromSmiles(sml)
        ans = list(range(m.GetNumAtoms()))
        np.random.shuffle(ans)
        nm = Chem.RenumberAtoms(m, ans)
        smiles = Chem.MolToSmiles(nm, canonical=False)
        # Retry up to 5 times if the randomized string exceeds max_len
        i = 0
        while len(smiles) > max_len:
            m = Chem.MolFromSmiles(sml)
            ans = list(range(m.GetNumAtoms()))
            np.random.shuffle(ans)
            nm = Chem.RenumberAtoms(m, ans)
            smiles = Chem.MolToSmiles(nm, canonical=False)
            i = i + 1
            if i > 5:
                break
        if len(smiles) > max_len:
            # Fall back to the original SMILES if no short-enough variant was found
            return sml
        else:
            return smiles
    except Exception:
        return np.nan
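A quick look at what the enumeration produces (a minimal sketch; assumes RDKit is available, and the molecule is an arbitrary example rather than one from the paper's data):

example = 'CC(=O)Oc1ccccc1C(=O)O'   # aspirin, used only for illustration
for _ in range(3):
    print(randomize_smile(example))
# Each call prints a different but equivalent SMILES for the same molecule,
# and all of them parse back to the identical structure with RDKit.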
class Dataset(object):
    def __init__(self, filename,
                 smile_field,
                 label_field,
                 max_len=100,
                 train_augment_times=1,
                 test_augment_times=1,
                 random_state=0):
        df = pd.read_csv(filename, sep='\t')
        # Token-aware length: Cl/Br count as one token, so map them to single placeholders first
        df['length'] = df[smile_field].map(lambda x: len(x.replace('Cl', 'X').replace('Br', 'Y')))
        # Keep only molecules whose tokenized length fits within max_len
        self.df = deepcopy(df[df.length <= max_len])
        self.smile_field = smile_field
        self.label_field = label_field
        self.max_len = max_len
        self.train_augment_times = train_augment_times
        self.test_augment_times = test_augment_times
        self.random_state = random_state
        # Vocabulary mapping SMILES tokens to integer indices
        vocab = np.load('./data/vocab.npy', allow_pickle=True)
        self.vocab = vocab.item()
from dataset import Dataset

seed = 1
task = "logD"
# max token length 100, 100-fold augmentation for both train and test
dataset = Dataset('data/reg/{}.txt'.format(task), 'SMILES', 'Label', 100, 100, 100, seed)
dataset.df
| index | Index | SMILES | Label |
|---|---|---|---|
| 0 | 0 | Fc1cc2c(N(C=C(C(O)=O)C2=O)C2CC2)cc1N1CCNCC1 | -0.96 |
| 1 | 1 | S1(=O)(=O)N(CCN1C)c1cc2c([nH]cc2CCN(C)C)cc1 | -0.92 |
| 2 | 2 | OC=1C(=O)C=CN(CCCO)C=1C | -0.9 |
| 3 | 3 | Fc1cc2c(N(C=C(C(O)=O)C2=O)c2ccccc2C)cc1N1CCNCC1 | -0.83 |
{0: '', 1: '#', 2: '%', 3: ')', 4: '(', 5: '+', 6: '-', 7: '1', 8: '0', 9: '3', 10: '2', 11: '5', 12: '4', 13: '7', 14: '6', 15: '9', 16: '8', 17: ':', 18: '=', 19: '@', 20: 'C', 21: 'B', 22: 'F', 23: 'I', 24: 'H', 25: 'O', 26: 'N', 27: 'P', 28: 'S', 29: '[', 30: ']', 31: 'c', 32: 'i', 33: 'o', 34: 'n', 35: 'p', 36: 's', 37: 'Cl', 38: 'Br', 39: ''}
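To see how a SMILES string is mapped onto these indices, the regex first captures the two-character tokens Cl and Br and then single characters (an illustrative snippet; the molecule is arbitrary):

tokens = re.findall(regex_pattern, 'Clc1ccccc1Br')
print(tokens)
# ['Cl', 'c', '1', 'c', 'c', 'c', 'c', 'c', '1', 'Br']
# With the vocabulary above ('Cl' -> 37, 'c' -> 31, '1' -> 7, 'Br' -> 38) this becomes
# [37, 31, 7, 31, 31, 31, 31, 31, 7, 38] before the start/end tokens and zero padding are added.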
def get_data(self):
    data = self.df
    # Stratify the train/test split by tokenized SMILES length: for each length,
    # 80% of the molecules go to training (at least one if fewer than 3 exist);
    # a standalone toy example follows after the class methods.
    length_count = data.length.value_counts()
    train_idx = []
    for k, v in length_count.items():
        if v >= 3:
            idx = data[data.length == k].sample(frac=0.8, random_state=self.random_state).index
        else:
            idx = data[data.length == k].sample(n=1, random_state=self.random_state).index
        train_idx.extend(idx)
    X_train = deepcopy(data[data.index.isin(train_idx)])
    X_test = deepcopy(data[~data.index.isin(train_idx)])
    # length_count = data.length.value_counts()
    # test_idx = []
    # for k, v in length_count.items():
    #     if v >= 3:
    #         idx = data[data.length == k].sample(frac=0.5, random_state=random_state).index
    #     else:
    #         idx = data[data.length == k].sample(n=1, random_state=random_state).index
    #     test_idx.extend(idx)
    # X_test = data[data.index.isin(test_idx)]
    # X_valid = data[~data.index.isin(test_idx)]
    # Training-set augmentation: append (train_augment_times - 1) randomized copies
    if self.train_augment_times > 1:
        train_temp = pd.concat([X_train] * (self.train_augment_times - 1), axis=0)
        train_temp[self.smile_field] = train_temp[self.smile_field].map(lambda x: randomize_smile(x))
        train_set = pd.concat([train_temp, X_train], ignore_index=True)
    else:
        train_set = X_train
    # Drop molecules whose randomized SMILES could not be generated, and re-filter by length
    train_set.dropna(inplace=True)
    train_set = deepcopy(train_set)
    train_set['length'] = train_set[self.smile_field].map(lambda x: len(x.replace('Cl', 'X').replace('Br', 'Y')))
    train_set = train_set[train_set.length <= self.max_len]
    # Test-set augmentation works the same way; the original SMILES are kept as the last block
    if self.test_augment_times > 1:
        test_temp = pd.concat([X_test] * (self.test_augment_times - 1), axis=0)
        test_temp[self.smile_field] = test_temp[self.smile_field].map(lambda x: randomize_smile(x))
        test_set = pd.concat([test_temp, X_test], ignore_index=True)
        # test_set['length'] = test_set[self.smile_field].map(lambda x: len(x.replace('Cl', 'X').replace('Br', 'Y')))
        # test_set = test_set[test_set.length <= self.max_len]
    else:
        test_set = X_test
    test_set = deepcopy(test_set)
    x_train, y_train = self.numerical_smiles(train_set)
    x_test, y_test = self.numerical_smiles(test_set)
    print(len(X_train) / len(X_train[self.smile_field].unique()))
    print(x_test.shape)
    return x_train, y_train, x_test, y_test

def numerical_smiles(self, data):
    # Encode each SMILES as a fixed-length integer sequence: start token, tokens, end token, zero padding
    x = np.zeros((len(data), (self.max_len + 2)), dtype='int32')
    y = np.array(data[self.label_field]).astype('float32')
    for i, smiles in enumerate(data[self.smile_field].tolist()):
        smiles = self._char_to_idx(seq=smiles)
        smiles = self._pad_start_end_token(smiles)
        x[i, :len(smiles)] = np.array(smiles)
    return x, y

def _pad_start_end_token(self, seq):
    # Prepend the start token and append the end token defined in the vocabulary
    seq.insert(0, self.vocab[''])
    seq.append(self.vocab[''])
    return seq

def _char_to_idx(self, seq):
    # Tokenize the SMILES with the regex and map each token to its vocabulary index
    char_list = re.findall(regex_pattern, seq)
    return [self.vocab[char_list[j]] for j in range(len(char_list))]
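A standalone sketch of the length-stratified split used in get_data (toy data, not the logD set): each length bucket contributes roughly 80% of its rows to the training index, and rare lengths contribute exactly one row.

import pandas as pd

# Toy frame standing in for dataset.df: four molecules of length 20, one each of lengths 15 and 30
toy = pd.DataFrame({'length': [20, 20, 20, 20, 15, 30]})
train_idx = []
for k, v in toy.length.value_counts().items():
    if v >= 3:
        idx = toy[toy.length == k].sample(frac=0.8, random_state=0).index  # 3 of the 4 length-20 rows
    else:
        idx = toy[toy.length == k].sample(n=1, random_state=0).index       # one row per rare length
    train_idx.extend(idx)
print(len(train_idx))   # 5: five of the six toy rows land in the training split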
# Build the char -> index vocabulary from an existing index -> char mapping
vocab = np.load('./data/vocab.npy', allow_pickle=True).item()
vocab[0] = ''
vocab[39] = ''
vocab = dict(zip(vocab.values(), vocab.keys()))
print(vocab)
np.save('F:\\vocab.npy', vocab)

data = dataset.get_data()
data
"""
1.0116686114352391
(22100, 102)
(array([[40, 20, 7, ..., 0, 0, 0],
[40, 20, 7, ..., 0, 0, 0],
[40, 31, 7, ..., 0, 0, 0],
...,
[40, 25, 7, ..., 0, 0, 0],
[40, 37, 31, ..., 0, 0, 0],
[40, 25, 20, ..., 0, 0, 0]], dtype=int32),
array([-0.96, -0.92, -0.9 , ..., 1.14, 1.67, 1.53], dtype=float32),
array([[40, 20, 4, ..., 0, 0, 0],
[40, 20, 26, ..., 0, 0, 0],
[40, 33, 7, ..., 0, 0, 0],
...,
[40, 25, 7, ..., 0, 0, 0],
[40, 25, 18, ..., 0, 0, 0],
[40, 25, 7, ..., 0, 0, 0]], dtype=int32),
array([-0.67 , -0.2 , 0.4 , ..., -3.34 , 1.81 , 1.415], dtype=float32))
"""
x_train,y_train,x_test,y_test=data
x_train.shape,y_train.shape,x_test.shape,y_test.shape
"""
((86700, 102), (86700,), (22100, 102), (22100,))
"""
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

os.environ["TF_FORCE_GPU_ALLOW_GROWTH"] = "True"
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
activation = 'relu'
dropout_rate = 0.25

dataset = Dataset('data/reg/{}.txt'.format(task), 'SMILES', 'Label', 100, 100, 100, seed)
test_aug_times = dataset.test_augment_times
train_aug_times = dataset.train_augment_times
data = dataset.get_data()
x_train = data[0].astype('int32')
y_train = data[1].astype('float32').reshape(-1, 1)
# Scale labels with the training mean and max; the test labels use the same statistics
y_mean = y_train.mean()
y_max = y_train.max()
y_train = (y_train - y_mean) / y_max
x_test = data[2].astype('int32')
y_test = data[3].astype('float32').reshape(-1, 1)
y_test = (y_test - y_mean) / y_max
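Because the labels are only affinely rescaled, the R^2 reported below is the same on the scaled and the raw logD scale. If raw-scale predictions are ever needed, the transform can be inverted; a small helper not present in the original script, using y_mean and y_max from above:

def to_logD(y_scaled, mean=y_mean, scale=y_max):
    # Inverse of y_scaled = (y - y_mean) / y_max
    return y_scaled * scale + mean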
# Per-epoch histories of test R^2 when averaging over 1/5/10/20/50/100 augmented copies
ch1, ch5, ch10, ch20, ch50, ch100 = [], [], [], [], [], []
cbk = keras.callbacks.LambdaCallback(on_epoch_end=lambda epoch, logs: myfunc(epoch, logs))

def myfunc(epoch, logs):
    # Test-time augmentation: average predictions over the last k copies of each molecule
    # (the original SMILES is always the last block) and report R^2 for several k
    global x_test, y_test, model, ch, test_aug_times, ch1, ch5, ch10, ch20, ch50, ch100
    y_pred = model.predict(x_test)
    y_test1 = y_test.reshape(test_aug_times, -1)[-1:, :].mean(0)
    y_pred1 = y_pred.reshape(test_aug_times, -1)[-1:, :].mean(0)
    y_test5 = y_test.reshape(test_aug_times, -1)[-5:, :].mean(0)
    y_pred5 = y_pred.reshape(test_aug_times, -1)[-5:, :].mean(0)
    y_test10 = y_test.reshape(test_aug_times, -1)[-10:, :].mean(0)
    y_pred10 = y_pred.reshape(test_aug_times, -1)[-10:, :].mean(0)
    y_test20 = y_test.reshape(test_aug_times, -1)[-20:, :].mean(0)
    y_pred20 = y_pred.reshape(test_aug_times, -1)[-20:, :].mean(0)
    y_test50 = y_test.reshape(test_aug_times, -1)[-50:, :].mean(0)
    y_pred50 = y_pred.reshape(test_aug_times, -1)[-50:, :].mean(0)
    y_test100 = y_test.reshape(test_aug_times, -1).mean(0)
    y_pred100 = y_pred.reshape(test_aug_times, -1).mean(0)
    r1 = r2_keras(y_test1, y_pred1)
    r5 = r2_keras(y_test5, y_pred5)
    r10 = r2_keras(y_test10, y_pred10)
    r20 = r2_keras(y_test20, y_pred20)
    r50 = r2_keras(y_test50, y_pred50)
    r100 = r2_keras(y_test100, y_pred100)
    print('\n')
    print(r1, r5, r10, '\n', r20, r50, r100)
    print('\n')
    ch1.append(r1.numpy())
    ch5.append(r5.numpy())
    ch10.append(r10.numpy())
    ch20.append(r20.numpy())
    ch50.append(r50.numpy())
    ch100.append(r100.numpy())
    return r1
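The reshape trick above works because the test arrays are stacked as test_aug_times equal-sized blocks, augmented copies first and the original SMILES as the last block. A small numpy illustration with 3 copies of 2 molecules (toy numbers):

import numpy as np

aug_times = 3                      # toy value; the script above uses 100
y_pred = np.array([0.9, 2.1,       # copy 1 (augmented)
                   1.1, 1.9,       # copy 2 (augmented)
                   1.0, 2.0])      # copy 3 (original SMILES)
per_copy = y_pred.reshape(aug_times, -1)    # shape (copies, molecules)
print(per_copy[-1:, :].mean(0))    # original only          -> [1.0, 2.0]
print(per_copy[-2:, :].mean(0))    # average of last 2 copies -> [1.05, 1.95]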
def r2_keras(y_true, y_pred):
    # Coefficient of determination: R^2 = 1 - SS_res / SS_tot
    y_true = tf.reshape(y_true, (-1, 1))
    y_pred = tf.reshape(y_pred, (-1, 1))
    SS_res = tf.reduce_sum(tf.square(y_true - y_pred))
    SS_tot = tf.reduce_sum(tf.square(y_true - tf.reduce_mean(y_true)))
    return (1 - SS_res / (SS_tot + 10e-8))  # small constant avoids division by zero
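A quick sanity check of the metric against scikit-learn's implementation (arbitrary numbers; assumes scikit-learn is installed):

import numpy as np
from sklearn.metrics import r2_score

y_true = np.array([1.0, 2.0, 3.0, 4.0], dtype='float32')
y_pred = np.array([1.1, 1.9, 3.2, 3.8], dtype='float32')
print(float(r2_keras(y_true, y_pred)))   # ~0.98, the standard R^2 definition
print(r2_score(y_true, y_pred))          # reference value from scikit-learn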
The network is assembled by build_model; it relies on the RemoveMask layer and the AttentionWithContext pooling block, so the model is instantiated after those are defined below.
def build_model():
    inp = keras.Input(shape=[102, ], dtype=tf.int32)  # max_len + 2 for the start/end tokens
    # Token embedding; index 0 is the padding token
    emb = layers.Embedding(41, 64, mask_zero=True,
                           embeddings_regularizer=keras.regularizers.l2(1e-5),
                           embeddings_constraint=keras.constraints.max_norm(3)
                           )(inp)
    # Boolean mask marking padded positions, reused by the attention block
    mask = tf.equal(inp, 0)
    emb = layers.Masking(mask_value=0.0)(emb)
    emb = layers.Dropout(dropout_rate)(emb)
    # Two stacked bidirectional LSTMs return the full sequence for attention pooling
    x = layers.Bidirectional(layers.LSTM(128, return_sequences=True))(emb)
    x = layers.Dropout(dropout_rate)(x)
    x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
    x = RemoveMask()(x)
    x = AttentionWithContext(x, mask)
    x = layers.Dense(256, activation='relu')(x)
    x = layers.Dropout(0.25)(x)
    x = layers.Dense(64, activation='relu')(x)
    x = layers.Dropout(0.5)(x)
    y = layers.Dense(1)(x)
    model = keras.Model(inputs=inp, outputs=y)
    # Exponentially decaying learning rate
    learning_rate = tf.keras.optimizers.schedules.ExponentialDecay(
        0.005,
        decay_steps=3000,
        decay_rate=0.96,
        staircase=True)
    optimizer = keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(loss='mse', optimizer=optimizer, metrics=[r2_keras])
    return model
class RemoveMask(keras.layers.Layer):
    """Pass-through layer that drops the Keras mask so downstream ops see plain tensors."""
    def __init__(self, return_masked=False, no_mask=False, **kwargs):
        super(RemoveMask, self).__init__(**kwargs)
        self.supports_masking = True
        self.no_mask = no_mask

    def compute_mask(self, inputs, mask=None):
        return None
def AttentionWithContext(x, mask):
    # Score each timestep, push padded positions to -1e8 so the softmax ignores them,
    # then return the attention-weighted sum over the sequence
    # (a toy check of this masking appears after the model summary below)
    att = layers.Dense(64, activation=activation)(x)
    att = layers.Dropout(0.25)(att)
    att = layers.Dense(1)(att)
    mask = tf.expand_dims(tf.cast(mask, tf.float32), 2)
    att = att + mask * (-1e8)
    att = layers.Softmax(1)(att)
    context_vector = att * x
    context_vector = tf.reduce_sum(context_vector, 1)
    context_vector = layers.Flatten()(context_vector)
    return context_vector

model = build_model()
model.summary()
Model: "model"
__________________________________________________________________________________________________
 Layer (type)                        Output Shape         Param #     Connected to
==================================================================================================
 input_1 (InputLayer)                [(None, 102)]        0           []
 embedding (Embedding)               (None, 102, 64)      2624        ['input_1[0][0]']
 masking (Masking)                   (None, 102, 64)      0           ['embedding[0][0]']
 dropout (Dropout)                   (None, 102, 64)      0           ['masking[0][0]']
 bidirectional (Bidirectional)       (None, 102, 256)     197632      ['dropout[0][0]']
 dropout_1 (Dropout)                 (None, 102, 256)     0           ['bidirectional[0][0]']
 bidirectional_1 (Bidirectional)     (None, 102, 128)     164352      ['dropout_1[0][0]']
 remove_mask (RemoveMask)            (None, 102, 128)     0           ['bidirectional_1[0][0]']
 tf.math.equal (TFOpLambda)          (None, 102)          0           ['input_1[0][0]']
 dense (Dense)                       (None, 102, 64)      8256        ['remove_mask[0][0]']
 tf.cast (TFOpLambda)                (None, 102)          0           ['tf.math.equal[0][0]']
 dropout_2 (Dropout)                 (None, 102, 64)      0           ['dense[0][0]']
 tf.expand_dims (TFOpLambda)         (None, 102, 1)       0           ['tf.cast[0][0]']
 dense_1 (Dense)                     (None, 102, 1)       65          ['dropout_2[0][0]']
 tf.math.multiply (TFOpLambda)       (None, 102, 1)       0           ['tf.expand_dims[0][0]']
 tf.__operators__.add (TFOpLambda)   (None, 102, 1)       0           ['dense_1[0][0]', 'tf.math.multiply[0][0]']
 softmax (Softmax)                   (None, 102, 1)       0           ['tf.__operators__.add[0][0]']
 tf.math.multiply_1 (TFOpLambda)     (None, 102, 128)     0           ['softmax[0][0]', 'remove_mask[0][0]']
 tf.math.reduce_sum (TFOpLambda)     (None, 128)          0           ['tf.math.multiply_1[0][0]']
 flatten (Flatten)                   (None, 128)          0           ['tf.math.reduce_sum[0][0]']
 dense_2 (Dense)                     (None, 256)          33024       ['flatten[0][0]']
 dropout_3 (Dropout)                 (None, 256)          0           ['dense_2[0][0]']
 dense_3 (Dense)                     (None, 64)           16448       ['dropout_3[0][0]']
 dropout_4 (Dropout)                 (None, 64)           0           ['dense_3[0][0]']
 dense_4 (Dense)                     (None, 1)            65          ['dropout_4[0][0]']
==================================================================================================
Total params: 422,466
Trainable params: 422,466
Non-trainable params: 0
__________________________________________________________________________________________________
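Before training, a toy check of the masked softmax used in AttentionWithContext (made-up scores, not part of the original script): padded positions receive a -1e8 penalty before the softmax, so their attention weights collapse to zero and the context vector mixes only real tokens.

import tensorflow as tf

scores = tf.constant([[[2.0], [1.0], [0.5], [0.3]]])    # (batch=1, timesteps=4, 1)
pad = tf.constant([[0, 0, 1, 1]], dtype=tf.float32)     # last two positions are padding
scores = scores + tf.expand_dims(pad, 2) * (-1e8)       # same penalty as in the model
weights = tf.nn.softmax(scores, axis=1)
print(weights[0, :, 0].numpy())                          # ~[0.73, 0.27, 0.0, 0.0]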
h = model.fit(x_train, y_train, epochs=150, batch_size=512,
              validation_data=(x_test, y_test), callbacks=[cbk, ])

# Attach the per-epoch test-time-augmentation R^2 curves to the Keras history and save
history = h.history
history['r1'] = ch1
history['r5'] = ch5
history['r10'] = ch10
history['r20'] = ch20
history['r50'] = ch50
history['r100'] = ch100
res = pd.DataFrame(history)
res.to_csv('result/{}_{}_{}_{}.csv'.format(task, seed, train_aug_times, test_aug_times))
keras.backend.clear_session()