利用 Python 多线程抓取 MeSH 关键字

# -*- coding: utf-8 -*-
"""
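Sinomed subject-category scraping script  by: 橘子一方
======================================
Each run crawls one top-level category ('A', 'B', ..., 'TL', 'TN') using multiple threads.
To crawl a different category, change the category letter passed to main().
The output is JSON; every element has three fields: name, node and p_node.
node   : the node's code string (empty for leaf nodes)
p_node : the parent node's code string
name   : the node's name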
"""
import urllib2
import cookielib
import json
import threading
import xml.etree.ElementTree as ET

# File the crawl result is written to by the script entry point.
SAVE_PATH = 'mesh.json'

# Scheme and host that relative ``src`` addresses are appended to.
url_prefix = 'http://www.sinomed.ac.cn'

# Address of the XML tree service; a node code is appended to it to request
# that node's children.
tree_root_prefix = (
    url_prefix
    + '/cross/subjectSearch.do?method=xml&db=me_MESHE&treenumber='
)

# Top-level category codes accepted by thread_key().
tree_root_list = (
    'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'N', 'V', 'Z',
    'TA', 'TB', 'TC', 'TD', 'TE', 'TF', 'TG', 'TH', 'TI', 'TK', 'TL', 'TN',
)

# Accumulator that get_tree() appends one dict per crawled node to.
ResultContent = list()

# Cookie jar and an opener wired to it: requests made through ``opener`` store
# cookies in ``cj`` and send them back on later requests. get_tree() issues all
# of its requests through this module-level opener.
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))

def get_tree(currentNode, url):
    """Recursively crawl the XML tree served at ``url`` and record every node.

    Each child element of the fetched XML document is appended to the
    module-level ``ResultContent`` list as a dict with the keys ``name``,
    ``node`` and ``p_node``. An element carrying a non-empty ``src``
    attribute has children of its own: its code is the text following
    ``&treenumber=`` in ``src``, and the function recurses into
    ``url_prefix + src``. An element without ``src`` is recorded with an
    empty ``node``.

    Args:
        currentNode: code of the node whose children are listed at ``url``;
            stored as ``p_node`` on every record produced by this call.
        url: address of the XML document listing those children.

    Raises:
        KeyError: if a child element has no ``text`` attribute.
    """
    marker = '&treenumber='

    response = opener.open(urllib2.Request(url))
    try:
        xml = response.read()
    finally:
        # Close the per-request response. The opener itself is a module-level
        # object reused by every call, so it must stay usable.
        response.close()

    tree = ET.fromstring(xml)
    for e in tree:
        name = e.attrib['text']
        src = e.attrib.get('src')  # present only when the node has children

        if not src:
            ResultContent.append({'name': name, 'node': '', 'p_node': currentNode})
            continue

        # The node code is everything after the '&treenumber=' marker.
        node = src[src.find(marker) + len(marker):]
        ResultContent.append({'name': name, 'node': node, 'p_node': currentNode})
        get_tree(node, url_prefix + src)

def thread_key(prefix):
    """Build the list of node codes that main() starts one thread for.

    Fetches the XML document for the top-level category ``prefix`` and returns
    one code per child element: the prefix followed by the child's 1-based
    position, zero-padded to at least two digits ('C01', ..., 'C09', 'C10').

    NOTE(review): the codes are derived from the number of children, not read
    from the XML itself; this assumes the children are numbered consecutively
    starting at 01 -- confirm against the service.

    Args:
        prefix: top-level category code; must be a member of
            ``tree_root_list``.

    Returns:
        A list of code strings, in position order.

    Raises:
        AssertionError: if ``prefix`` is not in ``tree_root_list`` (the check
            is skipped when Python runs with ``-O``).
    """
    assert prefix in tree_root_list

    response = urllib2.urlopen(tree_root_prefix + prefix)
    try:
        xml = response.read()
    finally:
        # Release the connection even if reading fails.
        response.close()

    child_count = len(ET.fromstring(xml))
    # '%02d' pads 1-9 with a leading zero and leaves larger numbers untouched.
    return ['%s%02d' % (prefix, position)
            for position in range(1, child_count + 1)]

def main(prefix):
    """Crawl one top-level category with one thread per code from thread_key().

    Every thread runs get_tree() on its own code; the function returns once
    all threads have finished.

    Args:
        prefix: top-level category code, passed on to thread_key().
    """
    codes = thread_key(prefix)

    # Single-argument parenthesised print emits the same text as the bare
    # print statement.
    print('defining thread...')
    workers = [
        threading.Thread(target=get_tree, args=(code, tree_root_prefix + code))
        for code in codes
    ]
    print('done!')

    print('start...')
    for worker in workers:
        worker.start()

    # Wait for every crawl thread before returning to the caller.
    for worker in workers:
        worker.join()

def write_to(content, file_path):
    """Write ``content`` to ``file_path``, replacing any existing file.

    Prints 'writing...' before the write and 'done!' after it.

    Args:
        content: the string to write.
        file_path: path of the file to create or overwrite.
    """
    print('writing...')
    # The with-block closes the file even when write() raises.
    with open(file_path, 'w') as f:
        f.write(content)
    print('done!')

if __name__ == '__main__':
    # Crawl top-level category 'C'; results accumulate in ResultContent.
    main('C')
    # To crawl a single subtree instead:
    # get_tree('C01',tree_root_prefix+'C01')

    # Serialize compactly, keeping non-ASCII characters unescaped, then encode
    # to UTF-8 before writing.
    serialized = json.dumps(ResultContent, ensure_ascii=False, separators=(',', ':'))
    content = serialized.encode('utf-8')
    write_to(content, SAVE_PATH)

你可能感兴趣的:(python,抓取)