根据 SID 或者 CID 下载 PubChem 数据库的 SMILES 信息(总结版)

PUBCHEM下载smile

      • 使用PUBCHEMbulk download功能
      • 把smile列加入到源文件中

使用PUBCHEMbulk download功能

参见我的另一篇博文《如何使用 PubChem 的 bulk download 功能》。

把smile列加入到源文件中

废话少说 上代码

# -*- coding: utf-8 -*-
"""
Created on Wed Sep 11 10:16:11 2019

@author: 86177
"""

import os
import pandas as pd
import sys


def get_relationship_dic(txt_path):
    """Build a lookup from the first column to the second column of a file.

    The file at ``txt_path`` is read as tab-separated text with pandas'
    default header handling, so its first line is treated as column names
    and is not part of the returned mapping.
    NOTE(review): if the mapping file has no header line, its first record
    is lost here -- confirm against the actual download format.

    Parameters
    ----------
    txt_path : path to the tab-separated identifier/SMILES file.

    Returns
    -------
    dict mapping each value of column 0 to the matching value of column 1;
    when a key repeats, the last row wins.
    """
    table = pd.read_csv(txt_path, sep='\t')
    identifiers = table.iloc[:, 0]
    smiles = table.iloc[:, 1]
    return {key: value for key, value in zip(identifiers, smiles)}


def _append_smiles_to_file(csv_path, sid_name, dic):
    """Add a ``SMILES`` column to one CSV file and overwrite it in place."""
    name = os.path.basename(csv_path)
    print('{} started'.format(name))
    # Rows without an identifier cannot be matched, so they are dropped.
    df = pd.read_csv(csv_path).dropna(axis=0, subset=[sid_name])
    # dict.get yields None for identifiers missing from the lookup, without
    # mutating the shared dictionary or raising KeyError.
    df['SMILES'] = df[sid_name].map(lambda x: dic.get(x))
    df.to_csv(csv_path, index=False)
    print('{} finished'.format(name))


def append_smiles(txt_path,f_path,sid_name):
    """Append a ``SMILES`` column to a CSV file or to every file in a folder.

    The identifier -> SMILES lookup is built from ``txt_path`` with
    ``get_relationship_dic``. Each processed CSV is rewritten in place:
    rows whose ``sid_name`` value is missing are dropped, and identifiers
    not present in the lookup get an empty ``SMILES`` value.

    Parameters
    ----------
    txt_path : path to the tab-separated identifier/SMILES file.
    f_path : path to a single CSV file, or to a folder whose files are all
        read as CSV.
    sid_name : name of the column holding the identifiers (CID or SID).

    Raises
    ------
    Whatever pandas raises when a file cannot be read or lacks the
    ``sid_name`` column; errors are no longer silently swallowed.
    """
    dic = get_relationship_dic(txt_path)

    if os.path.isdir(f_path):
        # os.listdir accepts only the directory path; the previous extra
        # argument raised TypeError, so folders were never processed.
        for file in os.listdir(f_path):
            csv_path = os.path.join(f_path, file)
            if os.path.isfile(csv_path):
                _append_smiles_to_file(csv_path, sid_name, dic)
    else:
        _append_smiles_to_file(f_path, sid_name, dic)

def main(f_path,txt_path,sid_name):
    """Run the SMILES annotation for the given CSV path.

    Forwards its arguments to ``append_smiles``; note that the argument
    order differs between the two functions.
    """
    append_smiles(txt_path=txt_path, f_path=f_path, sid_name=sid_name)
        
if __name__ == '__main__':
    # Command-line arguments, in this order:
    #   1. path to a CSV file or to a folder of CSV files
    #   2. name of the CID or SID column
    #   3. path to the SMILES mapping file
    if len(sys.argv) < 4:
        # Fail with a usage hint instead of a bare IndexError.
        sys.exit(
            'usage: {} <csv file or folder> <cid/sid column name> '
            '<smiles file>'.format(sys.argv[0])
        )
    f_path, sid_name, txt_path = sys.argv[1:4]
    main(f_path, txt_path, sid_name)

你可能感兴趣的:(药用数据库的数据提取与使用,PUBCHEM,SMILE)