让计算机识别分子结构是计算化学码农的必备技能,也是对分子进行后续操作的基础。本文整理和总结了rdkit进行读取、输出和可视化的一些方法,包含对SMILES、SDF、MOL、MOL2、CSV等文件的处理,以及分子的结构展示。
>>> from rdkit import Chem
>>> m = Chem.MolFromSmiles('C[C@H](O)c1ccccc1')
>>> m = Chem.MolFromSmarts('Cc1ccccc1')
文件类似这样(格式化的就行):
SMILES | Name |
---|---|
C1=CC=CC=CC=C1 | 0 |
c1ccccc1 | 1 |
c1cocc1 | 2 |
>>> suppl = Chem.SmilesMolSupplier('data/batch_smiles.smi', delimiter='\t')
>>> mols = [Chem.MolToSmiles(mol) for mol in suppl]
>>> print(mols)
['C1=CC=CC=CC=C1', 'c1ccccc1', 'c1ccoc1']
>>> with open('data/batch_smiles.smi', 'r') as f:
>>> mols_text = f.read()
>>> suppl = Chem.SmilesMolSupplierFromText(mols_text, delimiter='\t')
>>> mols = [Chem.MolToSmiles(mol) for mol in suppl]
>>> print(mols)
['C1=CC=CC=CC=C1', 'c1ccccc1', 'c1ccoc1']
>>> import pandas as pd
>>> from rdkit.Chem import Descriptors
>>> from rdkit.Chem import PandasTools
>>> df = pd.read_csv('data/smiles_df.csv')
>>> PandasTools.AddMoleculeColumnToFrame(df,'SMILES','mol',includeFingerprints=True)
>>> df['MW'] = df['mol'].apply(Descriptors.MolWt)
>>> df.head(2)
Name SMILES mol MW
0 Lanreotide c1(c2c(cccc2)[nH... <img data-content="rd... 1096.347
1 Lansoprazole Cc1c(OCC(F)(F)F)... <img data-content="rd... 369.368
>>> suppl = Chem.SDMolSupplier('data/batch.sdf')
>>> mols = [Chem.MolToSmiles(mol) for mol in suppl if mol]
>>> print(mols)
['C1=C\\C=C/C=C\\C=C/1', 'c1ccccc1', 'c1ccoc1']
>>> import gzip
>>> gz_file = gzip.open('data/batch.sdf.gz', 'r')
>>> suppl = Chem.ForwardSDMolSupplier(gz_file)
>>> mols = [Chem.MolToSmiles(mol) for mol in suppl if mol]
>>> print(mols)
>>> f.close()
['C1=C\\C=C/C=C\\C=C/1', 'c1ccccc1', 'c1ccoc1']
>>> m = Chem.MolFromMolFile('data/output.mol')
>>> print(Chem.MolToSmiles(mol))
c1cocc1
>>> m = Chem.MolFromMol2File('data/output.mol2')
>>> print(Chem.MolToSmiles(mol))
c1cocc1
# PDB
>>> Chem.MolFromPDBFile()
>>> Chem.MolFromPDBBlock()
# FASTA
>>> Chem.MolFromFASTA()
# peptide
>>> Chem.MolFromSequence()
>>> m1 = Chem.MolFromSmiles('C1=CC=CC=CC=C1')
>>> m2 = Chem.MolFromSmiles('C1=CC=CC=C1')
>>> m3 = Chem.MolFromSmiles('C1=COC=C1')
>>> mols = [m1, m2, m3]
>>> print([Chem.MolToSmiles(mol) for mol in mols])
['C1=CC=CC=CC=C1', 'c1ccccc1', 'c1ccoc1']
>>> for mol in mols:
>>> Chem.Kekulize(mol)
>>> print([Chem.MolToSmiles(mol, kekuleSmiles=True) for mol in mols])
['C1=CC=CC=CC=C1', 'C1=CC=CC=C1', 'C1=COC=C1']
>>> m4 = Chem.MolFromSmiles('C[C@H](O)c1ccccc1')
>>> print(Chem.MolToSmiles(m4))
C[C@H](O)c1ccccc1
>>> print(Chem.MolToSmiles(m4, isomericSmiles=False))
CC(O)c1ccccc1
>>> writer = Chem.SmilesWriter('data/batch.smi', delimiter='\t')
>>> for i, mol in enumerate(mols):
>>> writer.write(mol)
>>> writer.close()
批量输出SMILES及属性,通过以下函数进行操作:
mol.GetPropNames(),查看分子属性列表
mol.GetProp(),获取相应属性
mol.SetProp(key, val),新增属性名key、对应属性值val
writer.SetProps(),设置哪些属性要输出
以输出分子量和LogP为例
使用Descriptors计算属性,并添加
>>> writer = Chem.SmilesWriter('data/batch_smiles.smi', delimiter='\t', nameHeader='mol_id')
>>> writer.SetProps(['LOGP', 'MW'])
>>> for i, mol in enumerate(mols):
>>> mw = Descriptors.ExactMolWt(mol)
>>> logp = Descriptors.MolLogP(mol)
>>> mol.SetProp('MW', '%.2f' %(mw))
>>> mol.SetProp('LOGP', '%.2f' %(logp))
>>> mol.SetProp('_Name', 'No_%s' %(i))
>>> writer.write(mol)
>>> writer.close()
>>> print('number of mols:', writer.NumMols())
number of mols: 3
>>> print('mol properties:', [i for i in mol.GetPropNames()])
mol properties: ['MW', 'LOGP']
>>> Chem.MolToSmarts(m3, isomericSmiles=True)
'[#6]1:[#6]:[#8]:[#6]:[#6]:1'
>>> writer = Chem.SDWriter('data/batch.sdf')
>>> writer.SetProps(['LOGP', 'MW'])
>>> for i, mol in enumerate(mols):
>>> mw = Descriptors.ExactMolWt(mol)
>>> logp = Descriptors.MolLogP(mol)
>>> mol.SetProp('MW', '%.2f' %(mw))
>>> mol.SetProp('LOGP', '%.2f' %(logp))
>>> mol.SetProp('_Name', 'No_%s' %(i))
>>> writer.write(mol)
>>> writer.close()
>>> outf = gzip.open('data/batch.sdf.gz','wt+')
>>> writer = Chem.SDWriter(outf)
>>> for mol in mols:
>>> writer.write(mol)
>>> writer.close()
>>> outf.close()
>>> m1 = Chem.MolFromSmiles('C1CCC1')
>>> print(Chem.MolToMolBlock(m1))
RDKit 2D
4 4 0 0 0 0 0 0 0 0999 V2000
1.0607 0.0000 0.0000 C 0 0 0 0 0 0 0 0 0 0 0 0
...
4 1 1 0
M END
>>> m1.SetProp('_Name', 'cyclobutane')
>>> Chem.MolToMolFile(m1, 'data/output.mol')
# PDB
>>> Chem.MolToPDBBlock()
>>> Chem.MolToPDBFile()
>>> Chem.PDBWriter()
# FASTA
>>> Chem.MolToFASTA()
# XYZ
>>> Chem.MolToXYZBlock()
>>> Chem.MolToXYZFile()
该部分只介绍方法,不贴图了,代码源文件可自行查看。
>>> from rdkit.Chem import Draw
>>> mol = Chem.MolFromSmiles('C[C@H](O)c1ccccc1')
>>> Draw.MolToImage(mol, size=(150,150), kekulize=True)
>>> Draw.ShowMol(mol, size=(150,150), kekulize=False)
>>> Draw.MolToFile(mol, 'data/output.png', size=(150, 150))
>>> df = pd.read_csv('data/smiles_df.csv')
>>> PandasTools.AddMoleculeColumnToFrame(df,'SMILES','mol',includeFingerprints=True)
>>> PandasTools.FrameToGridImage(df, column='mol', molsPerRow=5, subImgSize=(200,200), legendsCol='Name')
>>> mols = df.mol.tolist()
>>> legends = df.Name.tolist()
>>> Draw.MolsToGridImage(mols, maxMols=2, molsPerRow=2, subImgSize=(300,300), legends=legends)
>>> m3d = Chem.MolFromSmiles('CNC(=O)N(N(CCCl)S(C)(=O)=O)S(C)(=O)=O')
>>> m3d = Chem.AddHs(m3d)
>>> AllChem.EmbedMolecule(m3d, randomSeed=3)
>>> AllChem.MMFFOptimizeMolecule(m3d)
>>> Draw.MolToImage(m3d, size=(250,250))
本文参考自rdkit官方文档。
代码及源文件在这里。