【requests 爬虫 3】批量爬取 Cell BLAST

特别声明:

  1. 供交流学习使用,不得用作商业用途。
  2. 如有违规侵权,请联系删除。
import json
import os
import re
import subprocess
import sys
import time

import requests
# from pyquery import PyQuery as pq

# Output root directory for downloaded data, and the metadata endpoint to query.
wd = r"/share/disk1/Data/Users/luohb/spider/Cell_BLAST/result/"
url = "https://cblast.gao-lab.org/datasets_meta"

# HTTP headers sent with the metadata request (mimics a desktop Chrome browser).
headers = {
    "Accept": "application/json, text/plain, */*",
    "Accept-Encoding": "gzip, deflate",
    "Accept-Language": "zh-CN,zh;q=0.9,en;q=0.8,ja;q=0.7",
    "Connection": "keep-alive",
    # Long value split with implicit concatenation; the resulting string is unchanged.
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/80.0.3987.122 Safari/537.36"
    ),
}

# Fetch the dataset metadata (JSON) from the server.
# A timeout keeps the script from hanging forever on a dead connection, and
# raise_for_status() turns HTTP 4xx/5xx responses into errors instead of
# letting a non-JSON error page reach the parser further down.
try:
    res = requests.post(url=url, headers=headers, timeout=30)
    print(res.status_code)
    res.raise_for_status()
    # print(res.text)
except requests.RequestException as err:
    print('request fail...please check!')
    # Without a response there is nothing to parse; stop here rather than
    # crashing later with a NameError on `res`.
    sys.exit('metadata request failed: {}'.format(err))

# Parse the JSON metadata and download the files of every dataset.
# For each item: build a directory named
# "<dataset>-<organism>-<organ>-<platform>-<cell_number>" under `wd`, then
# download the dataset's .h5 file and each listed visualization file into it.
i = 0  # number of items whose metadata was parsed successfully
json_list = json.loads(res.text)
for index, item in enumerate(json_list):
    time.sleep(2)  # throttle requests so the server is not hit too quickly
    try:
        dataset_name = item['dataset_name'].replace(' ', '_')
        organism = item['organism'].replace(' ', '_')
        organ = item['organ'].replace(' ', '_')
        platform = str(item['platform'])
        cell_number = str(item['cell_number'])
        visualization = item['visualization'].split(',')
    except KeyError:
        print('item {} has key error, please check!'.format(index))
        # Skip this item: continuing would reuse the previous item's values
        # (or hit undefined names on the first item).
        continue
    i += 1

    # create & change directory
    dir_name = '-'.join([dataset_name, organism, organ, platform, cell_number])
    path = os.path.join(wd, dir_name)
    # exist_ok lets the script be re-run without crashing on existing folders.
    os.makedirs(path, exist_ok=True)
    os.chdir(path)
    print(os.getcwd())

    # download h5 file
    h5_url = 'https://cblast.gao-lab.org/{name}/{name}.h5'.format(name=dataset_name)
    # print(h5_url)
    # Argument list (no shell): names come from the server response and must
    # not be interpreted by a shell. Failures are tolerated, as before.
    subprocess.run(['wget', h5_url], check=False)

    # download SVG files
    for viz in visualization:
        viz = viz.strip()
        if not viz:
            # An empty entry would only point at the dataset directory URL.
            continue
        svg_path = 'https://cblast.gao-lab.org/{name}/{svg_type}'.format(name=dataset_name, svg_type=viz)
        print(svg_path)
        subprocess.run(['wget', svg_path], check=False)

你可能感兴趣的:(【requests 爬虫 3】批量爬取 Cell BLAST)