其实将分子的SMILES转化为图是很简单的,也是很便捷的,主要有以下几步:
import numpy as np
import torch
import dgl
from dgl import DGLGraph
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors as rdDesc
# SMILES string of the example molecule that is converted into a graph.
molecule_smiles = '[C@@H](Cl)(F)Br'
G = DGLGraph()
# Parse the SMILES string into an RDKit Mol object.
molecule = Chem.MolFromSmiles(molecule_smiles)
if molecule is None:
    # MolFromSmiles signals a parse failure by returning None instead of
    # raising, so fail here with a clear message rather than with an
    # AttributeError on the next line.
    raise ValueError("Invalid SMILES string: {!r}".format(molecule_smiles))
# One graph node per atom; node ids follow the RDKit atom indices.
G.add_nodes(molecule.GetNumAtoms())
get_atom_features和get_bond_features分别为提取原子特征和化学键特征的函数,如下:
这里先插入一个辅助函数:
#辅助函数
def one_of_k_encoding_unk(x, allowable_set):
    """One-hot encode ``x`` against ``allowable_set`` with an "unknown" slot.

    Each entry of the result is ``x == s`` for the corresponding element
    ``s`` of ``allowable_set``. If ``x`` is not in ``allowable_set`` it is
    treated as the last element, so the last position acts as the
    catch-all bucket.

    Args:
        x: The value to encode.
        allowable_set: Sequence of allowed values; the last one doubles as
            the fallback for unknown values.

    Returns:
        A list with one comparison result per element of ``allowable_set``;
        an empty list when ``allowable_set`` is empty.
    """
    # Guard on emptiness first: there is no "last element" to fall back to
    # in an empty set, and the encoding of anything against it is [].
    if allowable_set and x not in allowable_set:
        x = allowable_set[-1]
    return [x == s for s in allowable_set]
获取原子的特征,特征包括:元素种类、隐式价、自由基电子数、度(成键数)、形式电荷、杂化类型
def get_atom_features(atom):
    """Encode one atom as a flat feature vector.

    The vector is the concatenation, in this order, of the
    ``one_of_k_encoding_unk`` encodings of: element symbol, implicit
    valence, number of radical electrons, degree, formal charge and
    hybridization type (9 + 2 + 2 + 7 + 2 + 4 = 26 entries).

    Values outside an allowable set land in that set's last slot.
    NOTE(review): the formal-charge set is ``[-1, 1]``, so a charge of 0
    is encoded in the ``+1`` slot - confirm this is intended.

    Args:
        atom: Atom object exposing the RDKit-style getters used below.

    Returns:
        ``np.ndarray`` built from the concatenated encodings.
    """
    hybridization = Chem.rdchem.HybridizationType
    # (observed value, allowable set) pairs; order defines the layout.
    value_and_choices = [
        # 'DU' is the catch-all bucket for any other element.
        (atom.GetSymbol(),
         ['C', 'N', 'O', 'F', 'P', 'Cl', 'Br', 'I', 'DU']),
        (atom.GetImplicitValence(), [0, 1]),
        (atom.GetNumRadicalElectrons(), [0, 1]),
        (atom.GetDegree(), [0, 1, 2, 3, 4, 5, 6]),
        (atom.GetFormalCharge(), [-1, 1]),
        (atom.GetHybridization(),
         [hybridization.SP,
          hybridization.SP2,
          hybridization.SP3,
          hybridization.SP3D]),
    ]
    encoded = []
    for value, choices in value_and_choices:
        encoded.extend(one_of_k_encoding_unk(value, choices))
    return np.array(encoded)
获取边特征,依次包括:是否为单键、双键、三键、芳香键,是否共轭,是否成环
def get_bond_features(bond):
    """Encode one bond as a six-entry feature vector.

    Entries, in order: bond type is SINGLE, DOUBLE, TRIPLE, AROMATIC,
    then ``bond.GetIsConjugated()`` and ``bond.IsInRing()``.

    Args:
        bond: Bond object exposing the RDKit-style getters used below.

    Returns:
        ``np.ndarray`` holding the six flags.
    """
    kinds = Chem.rdchem.BondType
    observed = bond.GetBondType()
    # One flag per bond type of interest.
    flags = [
        observed == kind
        for kind in (kinds.SINGLE, kinds.DOUBLE, kinds.TRIPLE, kinds.AROMATIC)
    ]
    flags.append(bond.GetIsConjugated())
    flags.append(bond.IsInRing())
    return np.array(flags)
# Build per-atom (node) and per-bond (edge) features and attach them to G.
node_features = []
edge_features = []
edge_src = []
edge_dst = []
for atom_i in molecule.GetAtoms():
    i = atom_i.GetIdx()
    node_features.append(get_atom_features(atom_i))
    # Only visit the bonds this atom actually has instead of probing every
    # (i, j) atom pair. Keyed by the neighbour's index so edges are emitted
    # in ascending j, i.e. the same (i, j) order as a full pair scan.
    bonds_by_neighbor = {
        bond.GetOtherAtomIdx(i): bond for bond in atom_i.GetBonds()
    }
    for j in sorted(bonds_by_neighbor):
        # Each bond is seen from both of its atoms, so both directed edges
        # i->j and j->i are created, each carrying the same bond features.
        edge_src.append(i)
        edge_dst.append(j)
        edge_features.append(get_bond_features(bonds_by_neighbor[j]))
# Add all edges in one call; skip it for molecules without bonds.
if edge_src:
    G.add_edges(edge_src, edge_dst)
G.ndata['x'] = torch.from_numpy(np.array(node_features))  # atom/node features
G.edata['w'] = torch.from_numpy(np.array(edge_features))  # bond/edge features
# Bare expression: presumably shows the graph summary in a notebook/REPL.
G