# -*- coding: utf-8 -*-
"""
Sinomed主题分类抓取脚本 by: 橘子一方
======================================
多线程每次抓取一个大分类('A','B',...,'TL','TN'),修改分类直接在main函数中修改传入参数为大分类号字母即可。
输出为JSON,每个元素包括name, node, p_node三个字段
node : 节点编码字符串(末端此字段为空)
p_node : 父节点的编码字符串
name : 名称
"""
import urllib2
import cookielib
import json
import threading
import xml.etree.ElementTree as ET
SAVE_PATH = 'mesh.json'
url_prefix = 'http://www.sinomed.ac.cn'
tree_root_prefix = 'http://www.sinomed.ac.cn/cross/subjectSearch.do?method=xml&db=me_MESHE&treenumber='
tree_root_list = ('A','B','C','D','E','F','G','H','I','J','K','L','N','V','Z','TA','TB','TC','TD','TE','TF','TG','TH','TI','TK','TL','TN')
ResultContent = []
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
def get_tree(currentNode, url):
    """Recursively crawl the XML subject tree served at ``url``.

    Each child element of the fetched document is appended to the
    module-level ``ResultContent`` list as a dict with the keys ``name``,
    ``node`` and ``p_node``.  An element carrying a non-empty ``src``
    attribute is treated as a branch and crawled in turn; any other element
    is recorded as a leaf with an empty ``node``.

    Args:
        currentNode: tree number of the node whose children are being
            fetched; stored as ``p_node`` of every child.
        url: address of the XML document listing those children.

    Raises:
        KeyError: if a child element has no ``text`` attribute.
    """
    marker = '&treenumber='

    response = opener.open(urllib2.Request(url))
    try:
        xml = response.read()
    finally:
        # Close the response itself: OpenerDirector.close() is a no-op, so
        # calling it on the shared opener never released the connection.
        response.close()

    for e in ET.fromstring(xml):
        name = e.attrib['text']
        src = e.attrib.get('src')  # only branch nodes carry a link to children
        if not src:
            ResultContent.append({'name': name, 'node': '', 'p_node': currentNode})
            continue

        # Everything after the marker is the child's tree number.  partition()
        # yields '' when the marker is absent, instead of the arbitrary slice
        # the old ``find(...) + 12`` arithmetic produced for a -1 result.
        node = src.partition(marker)[2]
        ResultContent.append({'name': name, 'node': node, 'p_node': currentNode})
        get_tree(node, url_prefix + src)
def thread_key(prefix):
    """Build the second-level tree numbers for one top-level category.

    Fetches the XML listing for ``prefix``, counts its child elements and
    returns one key per child: ``prefix + '01'``, ``prefix + '02'``, ...

    Args:
        prefix: category code; must be one of ``tree_root_list``.

    Returns:
        List of tree-number strings, one per child of the category.

    Raises:
        AssertionError: if ``prefix`` is not in ``tree_root_list``.
    """
    # Explicit raise so the check still runs under ``python -O``; the
    # exception type is kept for callers that already catch AssertionError.
    if prefix not in tree_root_list:
        raise AssertionError('unknown category prefix: %r' % (prefix,))

    response = urllib2.urlopen(tree_root_prefix + prefix)
    try:
        xml = response.read()
    finally:
        response.close()  # release the connection even if read() fails

    child_count = len(ET.fromstring(xml))
    # NOTE(review): keys come from the child count alone, which assumes the
    # children are numbered consecutively from 01 - confirm against the site.
    return ['%s%02d' % (prefix, number) for number in range(1, child_count + 1)]
def main(prefix):
    """Crawl one top-level category with one thread per second-level node.

    Args:
        prefix: category code passed on to ``thread_key``.

    Starts a ``get_tree`` thread for every key returned by ``thread_key``
    and returns once all of them have been joined.
    """
    sub_roots = thread_key(prefix)

    print('defining thread...')
    workers = [
        threading.Thread(target=get_tree, args=(code, tree_root_prefix + code))
        for code in sub_roots
    ]
    print('done!')

    print('start...')
    for worker in workers:
        worker.start()
    # Block until every subtree has been crawled.
    for worker in workers:
        worker.join()
def write_to(content, file_path):
    """Write a string to the given file, replacing any existing content.

    Args:
        content: the string to write.
        file_path: path of the file to create or overwrite.
    """
    print('writing...')
    # ``with`` closes the handle even when write() raises.
    with open(file_path, 'w') as f:
        f.write(content)
    print('done!')
if __name__ == '__main__':
    # Crawl one top-level category.  To fetch a single subtree instead, call
    # get_tree('C01', tree_root_prefix + 'C01').
    main('C')

    # Compact JSON with non-ASCII names kept as-is, encoded to UTF-8 for the file.
    serialized = json.dumps(
        ResultContent,
        separators=(',', ':'),
        ensure_ascii=False,
    )
    content = serialized.encode('utf-8')
    write_to(content, SAVE_PATH)