下载空气质量数据

最近需要从网上下载空气质量数据,看到真气网的历史数据整理得不错,因此想利用脚本下载;查看网站之后发现通过数据流隐藏了数据,又懒得通过抓包分析,于是使用 selenium 来模拟浏览器进行下载。具体代码如下:
ChromeDriver
历史空气质量数据

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Date    : 2019-09-14 22:22:06
# @Author  : Your Name ([email protected])
# @Link    : http://example.org
# @Version : $Id$

import datetime
import io
import os
import time
from urllib.parse import urlencode

import pandas as pd
import requests
from selenium import webdriver



# Endpoint of the per-day history page. A complete request looks like:
# https://www.aqistudy.cn/historydata/daydata.php?city=%E5%8C%97%E4%BA%AC&month=2016-11
url_base = 'https://www.aqistudy.cn/historydata/daydata.php'

# Cities to download; each entry is sent as the `city` query parameter.
citys = [
    '北京',
    '天津',
    '石家庄',
    '保定',
    '唐山',
    '秦皇岛',
    '邢台',
    '邯郸',
    '沧州',
    ]

sdt = datetime.datetime(2013, 12, 1)  # start of the download window
edt = datetime.datetime.now()  # end of the download window, e.g. datetime.datetime(2013, 12, 1)
# One timestamp per calendar month from sdt's month through edt's month.
# Month-start ('MS') stamps are used instead of month-end ones: a month-end
# stamp for the month containing `edt` lies after `edt`, so that month would be
# dropped from the range even though the output file name includes it.
# Anchoring the start on day 1 keeps sdt's own month when sdt is mid-month.
months = pd.date_range(sdt.replace(day=1), edt, freq='MS', normalize=True)

# Results are written to a "data" directory located next to this script.
homedir = os.path.dirname(os.path.realpath(__file__))
output_path = os.path.join(homedir, 'data')
print(homedir)
print(output_path)
# exist_ok=True already makes this a no-op when the directory exists, so no
# separate existence check is needed; it also fails right here (instead of
# later, while writing a CSV) if "data" exists but is not a directory.
os.makedirs(output_path, exist_ok=True)



# Start a Chrome session through selenium. ChromeDriver can be downloaded from
# http://npm.taobao.org/mirrors/chromedriver/ ; after downloading, put it in
# the directory that contains this script.
driver = webdriver.Chrome()

try:
    for vcity in citys:
        # One CSV per city, named after the requested date window.
        output_filename = os.path.join(output_path, '{}_{}_{}.csv'.format(
            vcity, sdt.strftime('%Y%m'), edt.strftime('%Y%m')))
        for vmonth in months:
            # Build the whole query string with urlencode so both parameters
            # are percent-encoded (UTF-8 is urlencode's default encoding).
            url = '{}?{}'.format(
                url_base,
                urlencode({'city': vcity, 'month': vmonth.strftime('%Y-%m')}),
                )

            driver.get(url)
            # Fixed pause before reading the DOM; presumably gives the page's
            # scripts time to fill in the table.
            time.sleep(5)
            try:
                # header=0: use the first table row as column names.
                # StringIO: pandas deprecates passing raw HTML as a plain string.
                tables = pd.read_html(io.StringIO(driver.page_source), header=0)
            except ValueError as err:
                # read_html raises ValueError when the page contains no table;
                # skip this month rather than aborting the whole download.
                print(vcity, vmonth, url, 'skipped:', err)
                time.sleep(2.5)
                continue
            data = tables[0]

            # Append to the city's CSV; the column header is written only when
            # the file is created.
            file_exists = os.path.exists(output_filename)
            data.to_csv(
                output_filename,
                index=False,
                header=not file_exists,
                mode='a' if file_exists else 'w',
                )
            print(vcity, vmonth, url)
            # Pause between requests.
            time.sleep(2.5)
finally:
    # Always close the browser and the chromedriver process, even on errors.
    driver.quit()

其他问题自行解决!

你可能感兴趣的:(下载空气质量数据)