从大量 PubChem 实验中获取化合物的 SMILES，并将其归属到各个实验。

目标：已获取多个 PubChem 实验的 CSV 文件，需要得到其中各化合物的 SMILES。

import pandas as pd
import os
import re

def Summary_cid(path, out_path, batch_size=500000):
    """Collect every PUBCHEM_CID found under *path* and write them in batches.

    Each file in *path* is read with ``pandas.read_csv``; rows whose
    ``PUBCHEM_CID`` cell is empty are dropped. The collected CIDs are then
    written to *out_path* as CSV files named ``<start>-<start + batch_size>.csv``,
    each with a single ``CID`` column (plus the default pandas index column).

    Args:
        path: Directory containing the assay CSV files.
        out_path: Existing directory that receives the batch files.
        batch_size: Maximum number of CIDs per output file.

    Raises:
        ValueError: If *batch_size* is not a positive integer.
    """
    if batch_size <= 0:
        raise ValueError('batch_size must be a positive integer')

    cids = []
    # Sorted so the batch contents are reproducible across runs/platforms.
    for file in sorted(os.listdir(path)):
        df = pd.read_csv(os.path.join(path, file))
        df = df.dropna(axis=0, subset=["PUBCHEM_CID"])
        # A column that contained empty cells is parsed as float; cast back
        # to int so CIDs are written as "123" rather than "123.0".
        cids.extend(df['PUBCHEM_CID'].astype('int64').tolist())

    for start in range(0, len(cids), batch_size):
        # The slice end is exclusive, so this yields exactly batch_size CIDs
        # with no overlap between consecutive files.
        batch = pd.DataFrame({'CID': cids[start:start + batch_size]})
        batch.to_csv(
            os.path.join(out_path, '{}-{}.csv'.format(start, start + batch_size))
        )


# Manual step: use the CID batch CSV files on the PubChem website to download the matching SMILES as txt files.

def get_relationship_dic(path):
    """Build a CID -> SMILES dictionary from the text files under *path*.

    Every file in *path* is read line by line. A line is split on tabs or
    commas; the first field is used as the CID and the second as the SMILES,
    both stripped of surrounding whitespace. Lines with fewer than two
    fields (e.g. blank lines) are skipped. If a CID occurs more than once,
    the entry read last wins.

    Args:
        path: Directory containing the downloaded CID/SMILES text files.

    Returns:
        dict mapping CID strings to SMILES strings.
    """
    dic = {}
    for file in os.listdir(path):
        # Read-only access is enough; 'r+' would demand write permission.
        with open(os.path.join(path, file), 'r') as f:
            for line in f:
                fields = line.replace('\t', ',').split(',')
                if len(fields) < 2:
                    continue
                dic[fields[0].strip()] = fields[1].strip()
    return dic

def append_smile_column(path, cid_to_smiles=None):
    """Add a ``SMILES`` column to every CSV file under *path*, in place.

    Each file is read, rows with an empty ``PUBCHEM_CID`` are dropped, the
    SMILES for each remaining CID is looked up under the key
    ``str(int(cid))``, and the file is overwritten without the index column.

    Args:
        path: Directory containing the assay CSV files to update.
        cid_to_smiles: Mapping from CID string to SMILES string. When
            omitted, the module-level ``dic`` is used, which keeps the
            original one-argument call working.
    """
    mapping = dic if cid_to_smiles is None else cid_to_smiles
    for file in os.listdir(path):
        print('{} started'.format(file))
        df = pd.read_csv(os.path.join(path, file)).dropna(axis=0, subset=['PUBCHEM_CID'])
        # .get() leaves unknown CIDs empty instead of raising KeyError and
        # aborting after earlier files have already been rewritten.
        df['SMILES'] = df['PUBCHEM_CID'].map(lambda x: mapping.get(str(int(x))))
        missing = int(df['SMILES'].isna().sum())
        if missing:
            print('{}: {} CID(s) have no SMILES entry'.format(file, missing))
        df.to_csv(os.path.join(path, file), index=False)
        print('{} finished'.format(file))

if __name__ == '__main__':
    # Ask for the three working directories instead of relying on
    # hand-edited placeholders.
    ori_path = input('Directory with the PubChem assay CSV files: ').strip()
    out_path = input('Directory for the CID batch CSV files: ').strip()
    txt_path = input('Directory with the downloaded CID/SMILES txt files: ').strip()

    Summary_cid(ori_path, out_path)

    # The SMILES txt files are downloaded by hand from the PubChem website
    # using the batch files written above, so wait for that step.
    input('Download the SMILES txt files into {} and press Enter to continue: '.format(txt_path))

    # `dic` must stay a module-level name: append_smile_column reads it.
    dic = get_relationship_dic(txt_path)
    append_smile_column(ori_path)

你可能感兴趣的:(药用数据库的数据提取与使用)