首先看一下文件夹目录:
获取地址为:https://github.com/nyu-dl/dl4chem-mgm
直接下载这个txt文件就可以了,然后直接把后缀改为csv
获取地址为:https://github.com/BenevolentAI/guacamol
或者直接按照下图所示的方式进行下载,然后把每个文件的后缀改为csv
You can download pre-built datasets here:
md5 05ad85d871958a05c02ab51a4fde8530
training
md5 e53db4bff7dc4784123ae6df72e3b1f0
validation
md5 677b757ccec4809febd83850b43e1616
test
md5 7d45bc95c33c10cb96ef5e78c38ac0b6
all
运行的命令为:
python smiles2images.py --smiles_dataroot ./datasets_smiles/ChEMBL/guacamol_v1_all.csv --images_datafolder ./datasets_images/ChEMBL/ --smiles_first_line CCCC(=O)NNC(=O)Nc1ccccc1
代码为:
import argparse
import os
import pandas as pd
from tqdm import tqdm
from rdkit import Chem
from rdkit.Chem.Draw import rdMolDraw2D
def main():
    """Render each SMILES string from one column of a CSV file to a PNG image.

    Command-line options:
        --smiles_dataroot: path of the CSV file holding the SMILES strings.
        --images_datafolder: folder that receives the PNG files (created if
            it does not exist).
        --smiles_first_line: label of the column to read. The file is read
            with the default ``pandas.read_csv`` header handling, so its
            first line is consumed as the column label and is not rendered.

    Images are 256x256 and named ``<n>.png``, where ``n`` is the 1-based
    position of the row in the column. Rows that are not strings (e.g. NaN
    from an empty cell) or that RDKit cannot parse are skipped instead of
    aborting the run; their numbers are simply left unused so the remaining
    file names still line up with the row positions.
    """
    parser = argparse.ArgumentParser(description='SMILES to images')
    # CSV file that holds the SMILES data set.
    parser.add_argument('--smiles_dataroot', type=str, default="./datasets_smiles/ChEMBL/guacamol_v1_all.csv", help='data root')
    parser.add_argument('--images_datafolder', type=str, default="./datasets_images/ChEMBL/", help='data root')
    parser.add_argument('--smiles_first_line', type=str, default="CCCC(=O)NNC(=O)Nc1ccccc1", help='这是csv文件的第一行索引')
    args = parser.parse_args()

    # SMILES data set.
    raw_file_path = os.path.join(args.smiles_dataroot)
    # Output image folder; exist_ok avoids the check-then-create race.
    opt_img = os.path.join(args.images_datafolder)
    os.makedirs(opt_img, exist_ok=True)

    df = pd.read_csv(raw_file_path)
    if args.smiles_first_line not in df.columns:
        # Report a usable CLI error instead of a bare KeyError traceback.
        parser.error(
            "column {!r} not found in {}; --smiles_first_line must match the "
            "first line of the CSV file".format(args.smiles_first_line, raw_file_path)
        )
    smiles_values = df[args.smiles_first_line].values

    # The drawing options are identical for every molecule, so build them once.
    opts = rdMolDraw2D.MolDrawOptions()
    # Bond line thickness.
    opts.bondLineWidth = 3
    # Minimum font size of the atom labels.
    opts.minFontSize = 15

    skipped = 0
    for index, smiles in enumerate(tqdm(smiles_values, total=len(smiles_values)), start=1):
        # MolFromSmiles returns None for SMILES it cannot parse; non-string
        # cells are not valid input for it at all.
        mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
        if mol is None:
            skipped += 1
            continue

        opt_img_save_path = os.path.join(opt_img, "{}.png".format(index))
        drawer = rdMolDraw2D.MolDraw2DCairo(256, 256)
        drawer.SetDrawOptions(opts)
        rdMolDraw2D.PrepareAndDrawMolecule(drawer, mol)
        drawer.FinishDrawing()
        drawer.WriteDrawingText(opt_img_save_path)

    if skipped:
        print("Skipped {} row(s) that could not be parsed as SMILES".format(skipped))


if __name__ == '__main__':
    main()
这里直接看链接:生成的分子图像是否可以识别为SMILES,然后再将识别后的SMILES转换为图像?_马鹏森的博客-CSDN博客
运行的命令为:
python distribution_learning_file.py --generate_smiles_file image2smiles_validity.csv --train_smiles_file 100k_moleculars.csv --output_dir ./ --suite v2 --number_samples 10000
代码为:
import argparse
import os
from guacamol.assess_distribution_learning import _assess_distribution_learning
from guacamol.utils.helpers import setup_default_logger
from typing import List
from guacamol.distribution_matching_generator import DistributionMatchingGenerator
class AllGeneratorSmiles(DistributionMatchingGenerator):
    """
    Generator backed by a fixed, pre-computed list of SMILES strings.

    Every call to ``generate`` hands back the complete list; nothing is
    sampled and the requested sample count is not consulted.
    """

    def __init__(self, molecules: List[str]) -> None:
        """
        Args:
            molecules: the SMILES strings this generator will return
        """
        self.molecules = molecules

    def generate(self, number_samples: int) -> List[str]:
        """
        Return a new list holding all stored molecules.

        Args:
            number_samples: accepted for interface compatibility; ignored here

        Returns:
            A fresh list with every stored molecule, in stored order.
        """
        everything = [*self.molecules]
        return everything
def main():
    """Run the distribution-learning assessment on a file of generated SMILES.

    Reads one SMILES string per line from ``--generate_smiles_file``, wraps
    them in an ``AllGeneratorSmiles`` and passes that generator, together with
    the training-set file, suite name and sample count, to
    ``_assess_distribution_learning``. The JSON output path handed to that
    call is ``<output_dir>/distribution_learning_results.json``.
    """
    setup_default_logger()

    cli = argparse.ArgumentParser(
        description='Molecule distribution learning benchmark for random smiles sampler',
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    cli.add_argument('--generate_smiles_file', default='image2smiles_validity.csv', help='这个是生成的N个SMIMES文件')
    cli.add_argument('--train_smiles_file', default='100k_moleculars.csv', help='这个是训练集的SMIMES文件')
    cli.add_argument('--output_dir', default='./', help='Output directory')
    cli.add_argument('--suite', default='v2')
    cli.add_argument('--number_samples', default='10000', type=int, help='Number of samples to validity generate')
    options = cli.parse_args()

    # Fall back to the directory of this script when no output directory is set.
    if options.output_dir is None:
        options.output_dir = os.path.dirname(os.path.realpath(__file__))

    # One molecule per line; surrounding whitespace is removed from each line.
    with open(options.generate_smiles_file, 'r') as handle:
        candidates = [row.strip() for row in handle]

    results_path = os.path.join(options.output_dir, 'distribution_learning_results.json')

    # number_samples is forwarded to the benchmark suite. Note that
    # AllGeneratorSmiles.generate returns its whole list regardless of the
    # count it is asked for.
    _assess_distribution_learning(
        model=AllGeneratorSmiles(molecules=candidates),
        chembl_training_file=options.train_smiles_file,
        json_output_file=results_path,
        benchmark_version=options.suite,
        number_samples=options.number_samples,
    )


if __name__ == '__main__':
    main()