DGraphDTA Training Code Walkthrough and Analysis (Part 2) 2021SC@SDUSC

Load the affinity data:

affinity = pickle.load(open(dataset_path + 'Y', 'rb'), encoding='latin1') 

Four lists are created to hold the canonicalized drug SMILES, the protein sequences, the protein keys, and the raw drug SMILES as loaded; they are filled by looping over the dataset:

drugs = []
prots = []
prot_keys = []
drug_smiles = []
# smiles
for d in ligands.keys():
    lg = Chem.MolToSmiles(Chem.MolFromSmiles(ligands[d]), isomericSmiles=True)
    drugs.append(lg)
    drug_smiles.append(ligands[d])
# seqs
for t in proteins.keys():
    prots.append(proteins[t])
    prot_keys.append(t)

This line shows the use of RDKit's Chem module, which converts between SMILES strings and molecule objects. Here the SMILES is round-tripped through a Mol object to canonicalize it:

lg = Chem.MolToSmiles(Chem.MolFromSmiles(ligands[d]), isomericSmiles=True)
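
For intuition, a minimal standalone sketch of this round trip (assuming RDKit is installed; the input SMILES is a made-up example):

from rdkit import Chem

# Parse a non-canonical SMILES into a Mol object, then write it back out.
# isomericSmiles=True preserves stereochemistry information in the output.
mol = Chem.MolFromSmiles('OCC')  # ethanol, written with a non-canonical atom order
canonical = Chem.MolToSmiles(mol, isomericSmiles=True)
print(canonical)  # 'CCO' -- every equivalent input maps to one canonical form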

Process the affinity values and convert them to a NumPy array. For the Davis dataset, the raw values are Kd measurements in nM, which are mapped onto the pKd scale:

    if dataset == 'davis':
        affinity = [-np.log10(y / 1e9) for y in affinity]
    affinity = np.asarray(affinity) 
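
As a quick worked example of the transform (the value is illustrative):

import numpy as np

kd_nM = 10000.0               # a Kd of 10 uM, stored in the dataset as 10000 nM
pkd = -np.log10(kd_nM / 1e9)  # divide by 1e9 to convert nM to M before taking -log10
print(pkd)                    # 5.0 -- a larger pKd means stronger binding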

Next, the training and validation fold data are packaged and persisted to CSV files (see the comments in the code):

    opts = ['train', 'valid']
    valid_train_count = 0
    valid_valid_count = 0
    for opt in opts:
        if opt == 'train':
            rows, cols = np.where(np.isnan(affinity) == False)
            rows, cols = rows[train_folds], cols[train_folds]
            train_fold_entries = []
            for pair_ind in range(len(rows)):
                if not valid_target(prot_keys[cols[pair_ind]], dataset):  # ensure the contact and aln files exist
                    continue
                # pack the four items (SMILES, sequence, key, affinity) into one entry
                ls = []
                ls += [drugs[rows[pair_ind]]]
                ls += [prots[cols[pair_ind]]]
                ls += [prot_keys[cols[pair_ind]]]
                ls += [affinity[rows[pair_ind], cols[pair_ind]]]
                train_fold_entries.append(ls)
                valid_train_count += 1

            # create a CSV file and write the packaged entries into it
            csv_file = 'data/' + dataset + '_' + 'fold_' + str(fold) + '_' + opt + '.csv'
            data_to_csv(csv_file, train_fold_entries)

        # the validation fold gets the same treatment
        elif opt == 'valid':
            rows, cols = np.where(np.isnan(affinity) == False)
            rows, cols = rows[valid_fold], cols[valid_fold]
            valid_fold_entries = []
            for pair_ind in range(len(rows)):
                if not valid_target(prot_keys[cols[pair_ind]], dataset):
                    continue
                ls = []
                ls += [drugs[rows[pair_ind]]]
                ls += [prots[cols[pair_ind]]]
                ls += [prot_keys[cols[pair_ind]]]
                ls += [affinity[rows[pair_ind], cols[pair_ind]]]
                valid_fold_entries.append(ls)
                valid_valid_count += 1

            csv_file = 'data/' + dataset + '_' + 'fold_' + str(fold) + '_' + opt + '.csv'
            data_to_csv(csv_file, valid_fold_entries)
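
data_to_csv itself is not shown in this excerpt. A minimal sketch consistent with the four-column entries built above and the column names read back later (compound_iso_smiles, target_key, affinity; the sequence column name is my assumption):

def data_to_csv(csv_file, datalist):
    # write one header row, then one comma-separated row per (drug, protein) pair
    with open(csv_file, 'w') as f:
        f.write('compound_iso_smiles,target_sequence,target_key,affinity\n')
        for data in datalist:
            f.write(','.join(map(str, data)) + '\n')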

drugs and prot_keys are then aliased for the subsequent processing steps:

compound_iso_smiles = drugs 
target_key = prot_keys 

Loop over compound_iso_smiles, converting each SMILES string into a molecular graph via the smile_to_graph function:

# create smile graph
smile_graph = {}
for smile in compound_iso_smiles:
    g = smile_to_graph(smile)
    smile_graph[smile] = g

The smile_to_graph function:

# mol smile to mol graph edge index
def smile_to_graph(smile):
    mol = Chem.MolFromSmiles(smile)  # parse the SMILES string into an RDKit Mol object

    c_size = mol.GetNumAtoms()  # number of atoms = number of graph nodes

    # features holds the feature vector of every atom, i.e. the node features of the molecular graph
    features = []
    for atom in mol.GetAtoms():
        feature = atom_features(atom)  # compute this atom's feature vector
        features.append(feature / sum(feature))  # normalize so each vector sums to 1

    # edge list of the molecular graph (one entry per bond)
    edges = []
    for bond in mol.GetBonds():  # mol.GetBonds() returns all bonds of the molecule
        edges.append([bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()])  # record each bond as a pair of atom indices
    g = nx.Graph(edges).to_directed()  # convert to a directed graph (both directions per bond)
    edge_index = []
    mol_adj = np.zeros((c_size, c_size))
    for e1, e2 in g.edges:
        mol_adj[e1, e2] = 1
        # edge_index.append([e1, e2])
    mol_adj += np.matrix(np.eye(mol_adj.shape[0]))  # add self-loops on the diagonal
    index_row, index_col = np.where(mol_adj >= 0.5)
    for i, j in zip(index_row, index_col):
        edge_index.append([i, j])
    # print('smile_to_graph')
    # print(np.array(features).shape)
    return c_size, features, edge_index
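
A quick sanity check of the output (a hypothetical usage on ethanol):

c_size, features, edge_index = smile_to_graph('CCO')
print(c_size)         # 3 atoms (hydrogens are implicit)
print(len(features))  # 3 node feature vectors, each of length 78 (see atom_features below)
print(edge_index)     # both directions of each bond plus the self-loops:
                      # [[0, 0], [0, 1], [1, 0], [1, 1], [1, 2], [2, 1], [2, 2]]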

The atom_features function returns the feature vector of a single atom:

# mol atom feature for mol graph
def atom_features(atom):
    # feature length: 44 (symbol) + 11 (degree) + 11 (total Hs) + 11 (implicit valence) + 1 (aromatic) = 78
    return np.array(one_of_k_encoding_unk(atom.GetSymbol(),
                                          ['C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na', 'Ca', 'Fe', 'As',
                                           'Al', 'I', 'B', 'V', 'K', 'Tl', 'Yb', 'Sb', 'Sn', 'Ag', 'Pd', 'Co', 'Se',
                                           'Ti', 'Zn', 'H', 'Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In', 'Mn', 'Zr', 'Cr',
                                           'Pt', 'Hg', 'Pb', 'X']) +
                    one_of_k_encoding(atom.GetDegree(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) +
                    one_of_k_encoding_unk(atom.GetTotalNumHs(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) +
                    one_of_k_encoding_unk(atom.GetImplicitValence(), [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) +
                    [atom.GetIsAromatic()])
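
The helpers one_of_k_encoding and one_of_k_encoding_unk are not shown in this excerpt; a typical implementation, as seen in similar DTA codebases (so treat this as a sketch rather than the repo's exact code):

def one_of_k_encoding(x, allowable_set):
    # strict one-hot: raise if the value falls outside the allowed set
    if x not in allowable_set:
        raise Exception('input {0} not in allowable set {1}'.format(x, allowable_set))
    return list(map(lambda s: x == s, allowable_set))

def one_of_k_encoding_unk(x, allowable_set):
    # tolerant one-hot: map unknown values to the last element (e.g. the 'X' symbol above)
    if x not in allowable_set:
        x = allowable_set[-1]
    return list(map(lambda s: x == s, allowable_set))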

Protein graphs are built analogously to the molecular graphs; the difference is that building a protein graph is essentially a lookup of precomputed files (the pconsc4 contact maps and the aln alignment files):

# create target graph
target_graph = {}
for key in target_key:
    if not valid_target(key, dataset):  # ensure the contact and aln files exist
        continue
    g = target_to_graph(key, proteins[key], contac_path, msa_path)
    target_graph[key] = g
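
The internals of target_to_graph are beyond this excerpt. The core idea is to threshold a predicted residue-residue contact map into an edge list, roughly like the simplified sketch below (the function and file names are illustrative assumptions; the real function also builds PSSM-based node features from the aln files):

import numpy as np

def contact_map_to_edge_index(contact_file, threshold=0.5):
    contact_map = np.load(contact_file)          # predicted residue-residue contact probabilities
    contact_map += np.eye(contact_map.shape[0])  # add self-loops, mirroring smile_to_graph
    index_row, index_col = np.where(contact_map >= threshold)
    return [[i, j] for i, j in zip(index_row, index_col)]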

The drugs, proteins, and affinities are processed further and finally wrapped into a DTADataset object:

# read the drug SMILES, protein keys, and affinities back from the CSV as lists
train_drugs, train_prot_keys, train_Y = list(df_train_fold['compound_iso_smiles']), list(
    df_train_fold['target_key']), list(df_train_fold['affinity'])
# convert to NumPy arrays
train_drugs, train_prot_keys, train_Y = np.asarray(train_drugs), np.asarray(train_prot_keys), np.asarray(train_Y)
# wrap the three arrays, plus the graph lookup tables, into a DTADataset object
train_dataset = DTADataset(root='data', dataset=dataset + '_' + 'train', xd=train_drugs, target_key=train_prot_keys,
                           y=train_Y, smile_graph=smile_graph, target_graph=target_graph)

The initialization part of the DTADataset class:

class DTADataset(InMemoryDataset):
    def __init__(self, root='/tmp', dataset='davis',
                 xd=None, y=None, transform=None,
                 pre_transform=None, smile_graph=None, target_key=None, target_graph=None):

        super(DTADataset, self).__init__(root, transform, pre_transform)
        self.dataset = dataset
        self.process(xd, target_key, y, smile_graph, target_graph)

The validation set gets the same treatment (note that the dataset name string passed below is still dataset + '_' + 'train', apparently carried over from the training block):

df_valid_fold = pd.read_csv('data/' + dataset + '_' + 'fold_' + str(fold) + '_' + 'valid' + '.csv')
valid_drugs, valid_prots_keys, valid_Y = list(df_valid_fold['compound_iso_smiles']), list(
    df_valid_fold['target_key']), list(df_valid_fold['affinity'])
valid_drugs, valid_prots_keys, valid_Y = np.asarray(valid_drugs), np.asarray(valid_prots_keys), np.asarray(
    valid_Y)
valid_dataset = DTADataset(root='data', dataset=dataset + '_' + 'train', xd=valid_drugs,
                           target_key=valid_prots_keys, y=valid_Y, smile_graph=smile_graph,
                           target_graph=target_graph)

Finally, create_dataset_for_5folds returns the training and validation datasets:

return train_dataset, valid_dataset

The test set is handled the same way as the training and validation sets above (partial code shown):

def create_dataset_for_test(dataset):
    # load dataset
    dataset_path = 'data/' + dataset + '/'
    test_fold = json.load(open(dataset_path + 'folds/test_fold_setting1.txt'))  # load the test fold indices
    ligands = json.load(open(dataset_path + 'ligands_can.txt'), object_pairs_hook=OrderedDict)  # load the ligand (drug) SMILES
    proteins = json.load(open(dataset_path + 'proteins.txt'), object_pairs_hook=OrderedDict)  # load the protein sequences
    affinity = pickle.load(open(dataset_path + 'Y', 'rb'), encoding='latin1')  # load the affinity matrix
    # load contact and aln
    msa_path = 'data/' + dataset + '/aln'
    contac_path = 'data/' + dataset + '/pconsc4'
    msa_list = []
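
The epoch loop below consumes train_loader and valid_loader, whose construction is not part of this excerpt. A sketch consistent with how train() and predicting() unpack each batch into data[0] (molecule graphs) and data[1] (protein graphs); the collate function and the pairing of the two graphs are assumptions based on that batch handling:

import torch
from torch_geometric.data import Batch

def collate(data_list):
    # each dataset item is assumed to be a (molecule graph, protein graph) pair;
    # the molecule graphs and the protein graphs are batched separately
    batch_mol = Batch.from_data_list([data[0] for data in data_list])
    batch_pro = Batch.from_data_list([data[1] for data in data_list])
    return batch_mol, batch_pro

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=512,
                                           shuffle=True, collate_fn=collate)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=512,
                                           shuffle=False, collate_fn=collate)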

The epoch loop: train, validate, and report the results:

    for epoch in range(NUM_EPOCHS):
        train(model, device, train_loader, optimizer, epoch + 1)  # train for one epoch
        print('predicting for valid data')
        G, P = predicting(model, device, valid_loader)  # predict on the validation set: G = ground truth, P = predictions
        val = get_mse(G, P)  # compute the mean squared error
        print('valid result:', val, best_mse)  # print the validation result

        # if the MSE improves on the best seen so far
        if val < best_mse:
            best_mse = val  # record the new best
            best_epoch = epoch + 1
            torch.save(model.state_dict(), model_file_name)  # save the model's state dict
            print('rmse improved at epoch ', best_epoch, '; best_test_mse', best_mse, model_st, dataset, fold)
        else:
            print('No improvement since epoch ', best_epoch, '; best_test_mse', best_mse, model_st, dataset, fold)
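
get_mse is defined elsewhere in the repo; a minimal equivalent sketch:

import numpy as np

def get_mse(y_true, y_pred):
    # mean squared error between the label and prediction arrays
    return np.mean((y_true - y_pred) ** 2)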

The training function: run the model over the training set batch by batch, compute the MSE loss, backpropagate, and update the parameters:

# training function: one full pass (epoch) over the training set
def train(model, device, train_loader, optimizer, epoch):
    print('Training on {} samples...'.format(len(train_loader.dataset)))
    model.train()
    LOG_INTERVAL = 10
    TRAIN_BATCH_SIZE = 512
    loss_fn = torch.nn.MSELoss()
    for batch_idx, data in enumerate(train_loader):
        data_mol = data[0].to(device)  # batched molecule graphs
        data_pro = data[1].to(device)  # batched protein graphs
        optimizer.zero_grad()  # zero the gradients
        output = model(data_mol, data_pro)  # forward pass
        loss = loss_fn(output, data_mol.y.view(-1, 1).float().to(device))  # MSE between predictions and labels
        loss.backward()  # backpropagation
        optimizer.step()  # update the parameters
        if batch_idx % LOG_INTERVAL == 0:
            print('Train epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(epoch,
                                                                           batch_idx * TRAIN_BATCH_SIZE,
                                                                           len(train_loader.dataset),
                                                                           100. * batch_idx / len(train_loader),
                                                                           loss.item()))

The predicting function runs the model over the prepared validation data and returns the labels and predictions as flat arrays:

# prediction over a loader (here, the validation set)
def predicting(model, device, loader):
    model.eval()
    total_preds = torch.Tensor()
    total_labels = torch.Tensor()
    print('Make prediction for {} samples...'.format(len(loader.dataset)))
    with torch.no_grad():  # no gradients needed for inference
        for data in loader:
            data_mol = data[0].to(device)
            data_pro = data[1].to(device)
            output = model(data_mol, data_pro)
            total_preds = torch.cat((total_preds, output.cpu()), 0)  # accumulate predictions
            total_labels = torch.cat((total_labels, data_mol.y.view(-1, 1).cpu()), 0)  # accumulate labels
    return total_labels.numpy().flatten(), total_preds.numpy().flatten()

This concludes the training and validation code.
