2021SC@SDUSC
Load the affinity data:
affinity = pickle.load(open(dataset_path + 'Y', 'rb'), encoding='latin1')
Set up four lists to hold the canonical drug SMILES, the protein sequences, the protein keys, and the original SMILES strings, then fill them by looping over the dataset:
drugs = []
prots = []
prot_keys = []
drug_smiles = []
# smiles
for d in ligands.keys():
    lg = Chem.MolToSmiles(Chem.MolFromSmiles(ligands[d]), isomericSmiles=True)
    drugs.append(lg)
    drug_smiles.append(ligands[d])
# seqs
for t in proteins.keys():
    prots.append(proteins[t])
    prot_keys.append(t)
Note the use of Chem here: RDKit's Chem module converts between molecule objects and SMILES strings, so this line parses each ligand and re-emits it in canonical isomeric form:
lg = Chem.MolToSmiles(Chem.MolFromSmiles(ligands[d]), isomericSmiles=True)
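A minimal standalone illustration of this round trip (the example molecule is chosen here, not taken from the dataset): equivalent SMILES spellings collapse to one canonical string.

from rdkit import Chem

mol = Chem.MolFromSmiles('OCC')                         # parse a SMILES string into a Mol object
canonical = Chem.MolToSmiles(mol, isomericSmiles=True)  # write it back as canonical isomeric SMILES
print(canonical)  # CCO -- 'OCC' and 'CCO' both denote ethanol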
Process the affinity values and convert them to a NumPy array; for the Davis dataset, the raw Kd values are first transformed to pKd:
if dataset == 'davis':
    affinity = [-np.log10(y / 1e9) for y in affinity]
affinity = np.asarray(affinity)
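Since the Davis affinities are Kd values in nanomolar, dividing by 1e9 converts them to molar before the negative log, i.e. pKd = -log10(Kd / 1e9). A quick sanity check with an illustrative value:

import numpy as np

kd_nm = 100.0                  # a Kd of 100 nM is 1e-7 M
print(-np.log10(kd_nm / 1e9))  # 7.0, i.e. pKd = 7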
Next, the training-fold and validation-fold data are packaged and written out to CSV files (see the comments below):
opts = ['train', 'valid']
valid_train_count = 0
valid_valid_count = 0
for opt in opts:
    if opt == 'train':
        rows, cols = np.where(np.isnan(affinity) == False)
        rows, cols = rows[train_folds], cols[train_folds]
        train_fold_entries = []
        for pair_ind in range(len(rows)):
            if not valid_target(prot_keys[cols[pair_ind]], dataset):  # ensure the contact and aln files exist
                continue
            # pack the four fields for this pair into a list
            ls = []
            ls += [drugs[rows[pair_ind]]]
            ls += [prots[cols[pair_ind]]]
            ls += [prot_keys[cols[pair_ind]]]
            ls += [affinity[rows[pair_ind], cols[pair_ind]]]
            train_fold_entries.append(ls)
            valid_train_count += 1
        # create a CSV file and write the packaged rows into it
        csv_file = 'data/' + dataset + '_' + 'fold_' + str(fold) + '_' + opt + '.csv'
        data_to_csv(csv_file, train_fold_entries)
    # the validation fold gets the same treatment
    elif opt == 'valid':
        rows, cols = np.where(np.isnan(affinity) == False)
        rows, cols = rows[valid_fold], cols[valid_fold]
        valid_fold_entries = []
        for pair_ind in range(len(rows)):
            if not valid_target(prot_keys[cols[pair_ind]], dataset):
                continue
            ls = []
            ls += [drugs[rows[pair_ind]]]
            ls += [prots[cols[pair_ind]]]
            ls += [prot_keys[cols[pair_ind]]]
            ls += [affinity[rows[pair_ind], cols[pair_ind]]]
            valid_fold_entries.append(ls)
            valid_valid_count += 1
        csv_file = 'data/' + dataset + '_' + 'fold_' + str(fold) + '_' + opt + '.csv'
        data_to_csv(csv_file, valid_fold_entries)
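data_to_csv is a project helper whose body is not shown in this post; a minimal sketch of what it plausibly does, assuming a header row that matches the columns read back with pd.read_csv later:

def data_to_csv(csv_file, datalist):
    # write the header, then one comma-separated row per (drug, protein, key, affinity) entry
    with open(csv_file, 'w') as f:
        f.write('compound_iso_smiles,target_sequence,target_key,affinity\n')
        for data in datalist:
            f.write(','.join(map(str, data)) + '\n')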
Reassign drugs and prot_keys under new names for the subsequent processing:
compound_iso_smiles = drugs
target_key = prot_keys
Loop over compound_iso_smiles and convert each SMILES string into a molecular graph via the smile_to_graph function:
# create smile graph
smile_graph = {}
for smile in compound_iso_smiles:
    g = smile_to_graph(smile)
    smile_graph[smile] = g
The smile_to_graph function:
# mol smile to mol graph edge index
def smile_to_graph(smile):
    mol = Chem.MolFromSmiles(smile)  # parse the SMILES string into an RDKit Mol object
    c_size = mol.GetNumAtoms()  # number of atoms
    # features holds every atom's feature vector, i.e. the node features of the molecular graph
    features = []
    for atom in mol.GetAtoms():
        feature = atom_features(atom)  # per-atom feature vector
        features.append(feature / sum(feature))  # normalize so each vector sums to 1
    # edges of the molecular graph
    edges = []
    for bond in mol.GetBonds():  # mol.GetBonds() returns all bonds of the molecule
        edges.append([bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()])  # add the bond as an edge
    g = nx.Graph(edges).to_directed()  # convert to a directed graph (both directions per bond)
    edge_index = []
    mol_adj = np.zeros((c_size, c_size))
    for e1, e2 in g.edges:
        mol_adj[e1, e2] = 1
        # edge_index.append([e1, e2])
    mol_adj += np.matrix(np.eye(mol_adj.shape[0]))  # add self-loops
    index_row, index_col = np.where(mol_adj >= 0.5)
    for i, j in zip(index_row, index_col):
        edge_index.append([i, j])
    # print('smile_to_graph')
    # print(np.array(features).shape)
    return c_size, features, edge_index
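A usage example on a small molecule (illustrative, not from the project): ethanol has 3 heavy atoms and 2 bonds, so after adding both directions of each bond plus one self-loop per atom the edge list has 2 * 2 + 3 = 7 entries.

c_size, features, edge_index = smile_to_graph('CCO')
print(c_size)           # 3 atoms
print(len(features))    # 3 normalized 78-dimensional feature vectors
print(len(edge_index))  # 7 = 2 directed edges per bond + 1 self-loop per atom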
The atom_features function returns the feature vector of a single atom:
# mol atom feature for mol graph
def atom_features(atom):
    # 44 + 11 + 11 + 11 + 1 = 78 dimensions
    return np.array(one_of_k_encoding_unk(atom.GetSymbol(),
                                          ['C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na', 'Ca', 'Fe', 'As',
                                           'Al', 'I', 'B', 'V', 'K', 'Tl', 'Yb', 'Sb', 'Sn', 'Ag', 'Pd', 'Co', 'Se',
                                           'Ti', 'Zn', 'H', 'Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In', 'Mn', 'Zr', 'Cr',
                                           'Pt', 'Hg', 'Pb', 'X']) +
                    one_of_k_encoding(atom.GetDegree(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) +
                    one_of_k_encoding_unk(atom.GetTotalNumHs(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) +
                    one_of_k_encoding_unk(atom.GetImplicitValence(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) +
                    [atom.GetIsAromatic()])
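one_of_k_encoding and one_of_k_encoding_unk are the usual one-hot helpers; judging by the trailing 'X' in the symbol list, the _unk variant maps anything outside the allowable set to the last slot instead of raising. A typical implementation under that assumption:

def one_of_k_encoding(x, allowable_set):
    # strict one-hot encoding: unknown values are an error
    if x not in allowable_set:
        raise Exception('input {0} not in allowable set {1}'.format(x, allowable_set))
    return list(map(lambda s: x == s, allowable_set))

def one_of_k_encoding_unk(x, allowable_set):
    # tolerant one-hot encoding: map unknown values to the last element
    if x not in allowable_set:
        x = allowable_set[-1]
    return list(map(lambda s: x == s, allowable_set))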
The protein graphs are built analogously; unlike the molecular graphs, however, a protein graph only requires looking up precomputed contact-map and alignment files:
# create target graph
target_graph = {}
for key in target_key:
    if not valid_target(key, dataset):  # ensure the contact and aln files exist
        continue
    g = target_to_graph(key, proteins[key], contac_path, msa_path)
    target_graph[key] = g
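target_to_graph is not reproduced in full here. Conceptually it builds the protein graph from a precomputed PconsC4 contact map instead of from chemistry, which is why it amounts to a table lookup. A hedged sketch of the core idea (the file layout and the residue_features helper are assumptions for illustration):

import os
import numpy as np

def target_to_graph_sketch(target_key, target_sequence, contact_dir, aln_dir):
    target_size = len(target_sequence)
    # load the predicted residue-residue contact map for this protein
    contact_map = np.load(os.path.join(contact_dir, target_key + '.npy'))
    contact_map += np.eye(contact_map.shape[0])          # add self-loops
    index_row, index_col = np.where(contact_map >= 0.5)  # threshold contacts into edges
    target_edge_index = [[i, j] for i, j in zip(index_row, index_col)]
    target_feature = residue_features(target_sequence, aln_dir)  # hypothetical per-residue features from the MSA
    return target_size, target_feature, target_edge_index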
Further process the proteins, drugs, and affinities, and finally wrap them in a DTADataset object:
# pull the drug, protein-key, and affinity columns out as lists
train_drugs, train_prot_keys, train_Y = list(df_train_fold['compound_iso_smiles']), list(
    df_train_fold['target_key']), list(df_train_fold['affinity'])
# convert to NumPy arrays
train_drugs, train_prot_keys, train_Y = np.asarray(train_drugs), np.asarray(train_prot_keys), np.asarray(train_Y)
# pack the three arrays into a DTADataset object
train_dataset = DTADataset(root='data', dataset=dataset + '_' + 'train', xd=train_drugs, target_key=train_prot_keys,
                           y=train_Y, smile_graph=smile_graph, target_graph=target_graph)
The initializer of the DTADataset class:
class DTADataset(InMemoryDataset):
    def __init__(self, root='/tmp', dataset='davis',
                 xd=None, y=None, transform=None,
                 pre_transform=None, smile_graph=None, target_key=None, target_graph=None):
        super(DTADataset, self).__init__(root, transform, pre_transform)
        self.dataset = dataset
        self.process(xd, target_key, y, smile_graph, target_graph)
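The body of process is not shown in this post; below is a simplified sketch consistent with how the batches are consumed later (data[0] is the molecular graph, data[1] the protein graph), with everything beyond that being an assumption:

import torch
from torch_geometric.data import Data

def process(self, xd, target_key, y, smile_graph, target_graph):
    data_list_mol = []
    data_list_pro = []
    for i in range(len(xd)):
        # look up the precomputed molecular graph for this sample
        c_size, features, edge_index = smile_graph[xd[i]]
        data_mol = Data(x=torch.Tensor(features),
                        edge_index=torch.LongTensor(edge_index).transpose(1, 0),
                        y=torch.FloatTensor([y[i]]))
        # look up the precomputed protein graph for this sample
        t_size, t_features, t_edge_index = target_graph[target_key[i]]
        data_pro = Data(x=torch.Tensor(t_features),
                        edge_index=torch.LongTensor(t_edge_index).transpose(1, 0),
                        y=torch.FloatTensor([y[i]]))
        data_list_mol.append(data_mol)
        data_list_pro.append(data_pro)
    self.data_mol = data_list_mol
    self.data_pro = data_list_pro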
The validation set receives the same treatment:
df_valid_fold = pd.read_csv('data/' + dataset + '_' + 'fold_' + str(fold) + '_' + 'valid' + '.csv')
valid_drugs, valid_prots_keys, valid_Y = list(df_valid_fold['compound_iso_smiles']), list(
    df_valid_fold['target_key']), list(df_valid_fold['affinity'])
valid_drugs, valid_prots_keys, valid_Y = np.asarray(valid_drugs), np.asarray(valid_prots_keys), np.asarray(valid_Y)
valid_dataset = DTADataset(root='data', dataset=dataset + '_' + 'train', xd=valid_drugs,
                           target_key=valid_prots_keys, y=valid_Y, smile_graph=smile_graph,
                           target_graph=target_graph)
Finally, the function returns: create_dataset_for_5folds hands back the training and validation datasets.
return train_dataset, valid_dataset
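Downstream, the two returned datasets are wrapped in DataLoaders before training. A sketch assuming a project-specific collate function that batches the molecular and protein graphs side by side (batch size and shuffle settings are illustrative):

from torch.utils.data import DataLoader
from torch_geometric.data import Batch

def collate(data_list):
    # batch the molecule graphs and the protein graphs separately
    batch_mol = Batch.from_data_list([data[0] for data in data_list])
    batch_pro = Batch.from_data_list([data[1] for data in data_list])
    return batch_mol, batch_pro

train_dataset, valid_dataset = create_dataset_for_5folds(dataset, fold)
train_loader = DataLoader(train_dataset, batch_size=512, shuffle=True, collate_fn=collate)
valid_loader = DataLoader(valid_dataset, batch_size=512, shuffle=False, collate_fn=collate)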
The test set is processed the same way as the training and validation sets above (partial code shown):
def create_dataset_for_test(dataset):
    # load dataset
    dataset_path = 'data/' + dataset + '/'
    test_fold = json.load(open(dataset_path + 'folds/test_fold_setting1.txt'))  # load the test-fold indices
    ligands = json.load(open(dataset_path + 'ligands_can.txt'), object_pairs_hook=OrderedDict)  # load the ligand data
    proteins = json.load(open(dataset_path + 'proteins.txt'), object_pairs_hook=OrderedDict)  # load the protein data
    affinity = pickle.load(open(dataset_path + 'Y', 'rb'), encoding='latin1')  # load the affinity data
    # load contact and aln
    msa_path = 'data/' + dataset + '/aln'
    contac_path = 'data/' + dataset + '/pconsc4'
    msa_list = []
Training, validation, and printing of both results:
for epoch in range(NUM_EPOCHS):
    train(model, device, train_loader, optimizer, epoch + 1)  # train the model for one epoch
    print('predicting for valid data')
    G, P = predicting(model, device, valid_loader)  # predict on the validation set
    val = get_mse(G, P)  # compute the mean squared error
    print('valid result:', val, best_mse)  # print the validation result
    # if the MSE beats the best so far, record it and save a checkpoint
    if val < best_mse:
        best_mse = val
        best_epoch = epoch + 1
        torch.save(model.state_dict(), model_file_name)  # save the state dict
        print('rmse improved at epoch ', best_epoch, '; best_test_mse', best_mse, model_st, dataset, fold)
    else:
        print('No improvement since epoch ', best_epoch, '; best_test_mse', best_mse, model_st, dataset, fold)
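get_mse compares the label array G with the prediction array P; its likely implementation is a one-liner:

import numpy as np

def get_mse(y, f):
    # mean squared error between labels y and predictions f
    return np.mean((y - f) ** 2)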
The training function: run a forward pass over the training data, compute the mean squared error, back-propagate, and update the parameters.
# training function for one epoch
def train(model, device, train_loader, optimizer, epoch):
    print('Training on {} samples...'.format(len(train_loader.dataset)))
    model.train()
    LOG_INTERVAL = 10
    TRAIN_BATCH_SIZE = 512
    loss_fn = torch.nn.MSELoss()
    for batch_idx, data in enumerate(train_loader):
        data_mol = data[0].to(device)  # molecular-graph half of the batch
        data_pro = data[1].to(device)  # protein-graph half of the batch
        optimizer.zero_grad()  # zero the gradients
        output = model(data_mol, data_pro)  # forward pass
        loss = loss_fn(output, data_mol.y.view(-1, 1).float().to(device))  # MSE against the labels
        loss.backward()  # back-propagation
        optimizer.step()  # parameter update
        if batch_idx % LOG_INTERVAL == 0:
            print('Train epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch,
                                                                           batch_idx * TRAIN_BATCH_SIZE,
                                                                           len(train_loader.dataset),
                                                                           100. * batch_idx / len(train_loader),
                                                                           loss.item()))
Prediction on the prepared validation data; the function returns the labels and the predictions as two flat arrays:
# predict on the validation set
def predicting(model, device, loader):
    model.eval()
    total_preds = torch.Tensor()
    total_labels = torch.Tensor()
    print('Make prediction for {} samples...'.format(len(loader.dataset)))
    with torch.no_grad():
        for data in loader:
            data_mol = data[0].to(device)
            data_pro = data[1].to(device)
            output = model(data_mol, data_pro)
            total_preds = torch.cat((total_preds, output.cpu()), 0)
            total_labels = torch.cat((total_labels, data_mol.y.view(-1, 1).cpu()), 0)
    return total_labels.numpy().flatten(), total_preds.numpy().flatten()
This concludes the training and validation code.