用http协议在rcsb.org自动下载pdb文件

接口说明请参考 RCSB 官方提供的文档。
由于python提供接口简单,所以选择了它

程序流程

1.从thefile.txt 文件中读取蛋白序列
2.然后从rcsb 查询,获取查询XML数据
3.解析XML文件
4.下载pdb文件

python代码

将下面的代码保存为 name.py 后,可以用 python name.py 直接运行(代码使用 Python 2 语法)。

#!/usr/bin/python
# -*- coding: UTF-8 -*-
import urllib
import urllib2
import xml.sax

# Base URL that download_pdb() prefixes to "<file_name>.pdb" when fetching a file.
global_url = "https://files.rcsb.org/download/"

class PdbHandler(xml.sax.ContentHandler):
    """SAX content handler that records selected BLAST XML fields.

    The text of every element listed in ``_TEXT_FIELDS`` is stored on the
    matching attribute once the element closes.  When a ``Hit_def`` element
    closes, the first four characters of its text are used as the file name
    passed to ``download_pdb``.
    """

    # XML element name -> name of the attribute that receives its text.
    _TEXT_FIELDS = {
        "BlastOutput_program": "BlastOutput_program",
        "BlastOutput_version": "BlastOutput_version",
        "BlastOutput_db": "BlastOutput_db",
        "BlastOutput_query-ID": "BlastOutput_query_ID",
        "BlastOutput_query-def": "BlastOutput_query_def",
        "BlastOutput_query-len": "BlastOutput_query_len",
        "Parameters_matrix": "Parameters_matrix",
        "Hit_def": "Hit_def",
    }

    def __init__(self):
        xml.sax.ContentHandler.__init__(self)
        # Name of the element whose text is currently being collected;
        # empty while the parser is between elements.
        self.CurrentData = ""
        # Text chunks received so far for the current element.
        self._text_parts = []
        self.BlastOutput_program = ""
        self.BlastOutput_version = ""
        self.BlastOutput_db = ""
        self.BlastOutput_query_ID = ""
        self.BlastOutput_query_def = ""
        self.BlastOutput_query_len = ""
        self.blastOutput_iterations = ""
        self.blastOutput_query_len = 0
        self.BlastOutput_param = ""
        self.Parameters_matrix = ""
        self.Parameters_expect = ""
        self.Hit_def = ""

    # Element start event
    def startElement(self, tag, attributes):
        """Remember the element being read and start collecting its text."""
        self.CurrentData = tag
        self._text_parts = []
        if tag == "BlastOutput":
            print("-------start---BlastOutput--")

    # Element end event
    def endElement(self, tag):
        """Store the collected text and act on the element that just closed."""
        # Only an element that closes right after it was opened (no child
        # elements in between) carries collected text.
        closes_current = tag == self.CurrentData
        # Reset so whitespace between elements is not attributed to this tag.
        self.CurrentData = ""
        if not closes_current:
            return

        attr_name = self._TEXT_FIELDS.get(tag)
        if attr_name is None:
            return
        setattr(self, attr_name, "".join(self._text_parts))

        if tag == "BlastOutput_program":
            print("BlastOutput_program: %s" % self.BlastOutput_program)
        elif tag == "Hit_def":
            file_name = self.Hit_def.strip()[0:4]
            # An empty Hit_def gives nothing to download.
            if file_name:
                print("file name : %s" % file_name)
                download_pdb(file_name)

    # Character data event
    def characters(self, content):
        """Collect a chunk of text for the current element.

        The parser may deliver the text of one element in several calls, so
        the chunks are accumulated and joined in ``endElement``.
        """
        if self.CurrentData in self._TEXT_FIELDS:
            self._text_parts.append(content)

def download_pdb(file_name):
    """Download ``<file_name>.pdb`` from ``global_url`` into ``pdb/``.

    The download is best-effort: any failure is reported on stdout together
    with the URL, and the function returns normally.

    :param file_name: name of the file to fetch, without the ".pdb" suffix.
    """
    import os  # local import: the file's import block does not provide os

    url = global_url + file_name + ".pdb"
    try:
        # open() does not create missing directories.
        if not os.path.isdir("pdb"):
            os.makedirs("pdb")
        response = urllib2.urlopen(url)
        try:
            data = response.read()
        finally:
            response.close()
        with open("pdb/" + file_name + ".pdb", "wb") as out:
            out.write(data)
    except Exception as e:
        print("%s download failed: %s" % (url, e))


def setup_connect(seq=None):
    """POST a BLAST query to the RCSB REST service and return the reply.

    :param seq: sequence to search for.  When omitted, the built-in sample
        sequence is used, which is what a call without arguments always did.
    :returns: the raw response body; XML is requested via ``outputFormat``.
    """
    test_data = {
        'sequence': 'TDMLTLTRYVMEKGRQAKGTGELTQLLNSMLTAIKAISSAVRKAGLAHLYGIAGSVNVDQ',
        'eCutOff': '10.0',
        'matrix': 'BLOSUM62',
        'outputFormat': 'XML',
    }
    if seq is not None:
        test_data['sequence'] = seq

    test_data_urlencode = urllib.urlencode(test_data)

    # NOTE(review): RCSB has announced the retirement of its legacy
    # /pdb/rest services -- confirm this endpoint still answers.
    requrl = "https://www.rcsb.org/pdb/rest/postBLAST"

    req = urllib2.Request(url=requrl, data=test_data_urlencode)
    print(req)

    res_data = urllib2.urlopen(req)
    print(res_data)
    try:
        return res_data.read()
    finally:
        # Release the connection even when read() fails.
        res_data.close()

def save_to_file(file_name, contents):
    """Write ``contents`` to ``file_name``, replacing any existing file.

    Saving is best-effort: a failure is reported on stdout with the file name
    and the cause instead of being raised.

    :param file_name: path of the file to write.
    :param contents: text to store in the file.
    """
    try:
        # "with" closes the handle even when write() raises.
        with open(file_name, 'w') as fh:
            fh.write(contents)
    except Exception as e:
        print("save to file error! %s: %s" % (file_name, e))

def read_file(file_name='thefile.txt'):
    """Read the whitespace-separated fields of every non-blank line.

    :param file_name: path of the input file; defaults to 'thefile.txt'.
    :returns: a list with one entry per non-blank line, each entry being the
        list of fields produced by ``line.split()``.
    :raises IOError: if the file cannot be opened.
    """
    with open(file_name) as file_object:
        return [line.split() for line in file_object if line.strip()]

if __name__ == "__main__":
    print('__main__')

    # Handler that receives the parse events.
    handler = PdbHandler()

    # Build the XMLReader with namespace processing switched off.
    reader = xml.sax.make_parser()
    reader.setFeature(xml.sax.handler.feature_namespaces, 0)

    # Feed output1.xml through the handler.
    reader.setContentHandler(handler)
    reader.parse("output1.xml")

# Disabled draft of the full pipeline: query the service for each line of
# thefile.txt and parse the reply.  It is a bare string literal, so none of
# it is executed.
# NOTE(review): before re-enabling, check two things.  The draft calls
# parser.parse() on the result of xml.sax.parseString() -- confirm that call
# returns a parser at all.  It also passes line[1] as the sequence, while the
# sample thefile.txt line puts the sequence in the first field.
'''
    xml_data = setup_connect()
    save_to_file('output1.xml', xml_data)

    file_object = open('thefile.txt')
    try:
        while 1:
            line = file_object.readline()
            if not line:
                break
            line = line.split()
            print( line )
            print( line[1] )

            xml_data = setup_connect(line[1])
#   save_to_file('output.txt', data)
            parser = xml.sax.parseString(xml_data, PdbHandler())
    #parser.setFeature(xml.sax.handler.feature_namespaces, 0)
    #Handler = 
    #parser.setContentHandler( Handler )
            parser.parse()
    finally:
        file_object.close()
'''

下面是thefile.txt文件格式,字段之间用空格分隔.

TDMLTLTRYVMEKGRQAKGTGELTQLLNSMLTAIKAISSAVRKAGLAHLYGIAGSVNVDQ  1

下面是我的运行结果,下载了很多PDB文件.
(此处原为运行结果截图)

你可能感兴趣的:(Python)