1.1网站分析(抓包)
使用谷歌浏览器(火狐浏览器)的开发者工具,通过刷新监听抓取浏览器请求的响应包,找到历史数据及所有地区url等对应的包
1.1.1历史数据的数据包(get请求方式)
重庆安康: https://api.waqi.info/api/attsse/9239/yd.json 贵阳马鞍:https://api.waqi.info/api/attsse/1368/yd.json
通过对比两个地区,可发现url里面的数字(即idx)是变化的。使用谷歌浏览器或者postman打开包后可看出数据是按照月份分的,一个data对应一个月,且里面的数据是加密的。
1.1.2世界所有地区idx的包(post请求方式)
Url: https://api.waqi.info/mapq2/bounds
同1.1.1方法一样可发现包里面字典键值data对应的就是不同地区的idx
1.1.3解密函数的包Url:https://aqicn.org/webapp/dist/historic-module-dyn.2b2626b6ef49374f9dcd.js
通过对网站进行断点调试,可查看到数据经过这个包里面的一些函数时,加密数据会被解析成一个月每一天的对应指标的值,
完整项目:基于python 的爬虫及flask框架web大屏交互式可视化https://download.csdn.net/download/weixin_66397563/87651644?spm=1001.2014.3001.5503
注意: 本文以学习技术为主,不可以用于非法行为, 如有侵权请联系删除
import requests
import json
import subprocess
from functools import partial
import time
import os
import pandas as pd
import re
import random
# Desktop-Chrome User-Agent so the waqi endpoints treat us as a browser.
headers = {
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/101.0.4951.7 Safari/537.36'
}
# Force UTF-8 on subprocess pipes: Windows defaults to GBK, which garbles the
# JS-engine I/O used by execjs below.
# NOTE(review): this monkey-patches Popen process-wide, so it affects every
# subprocess call in this process, not just execjs.
subprocess.Popen = partial(subprocess.Popen, encoding="utf-8")
import execjs
def get_idx():
    """
    Fetch the global station list from the waqi map-bounds endpoint.

    :return: list of station idx values whose string form is all digits.
        (The previous docstring claimed ``[(idx, name), ...]`` tuples, but
        only the idx values were ever collected — fixed to match the code.)
    """
    url = 'https://api.waqi.info/mapq2/bounds'
    data = {
        'bounds': "-306.21093750000006,-62.10388252289787,306.5625,78.42019327591201",
        'country': "",
        'inc': "placeholders",
        'viewer': "webgl",
        'zoom': 2
    }
    # timeout added so a stalled request cannot hang the crawler forever
    payload = requests.post(url, data=data, headers=headers, timeout=30).json()
    result = []
    for station in payload["data"]:
        idx = station['idx']
        # idx can arrive as an int for some stations; normalize before the
        # digit check instead of raising AttributeError on `.isdigit()`
        if str(idx).isdigit():
            result.append(idx)
    return result
def get_py_json(url):
    '''
    Download one station's history stream and pull out the encrypted payloads.

    :param url: a station's yd.json endpoint URL
    :return: the "msg" fields of every event in the stream, minus the first
        one (presumably a header/placeholder event — TODO confirm)
    '''
    body = requests.get(url, headers=headers).text
    # Events are blank-line separated; real data sits on every second chunk.
    chunks = body.split('\n\n')
    msgs = []
    for chunk in chunks[1:-1:2]:
        # Skip the fixed 18-char event prefix before the JSON body.
        event = json.loads(chunk[18:])
        if 'msg' in event:
            msgs.append(event['msg'])
    return msgs[1:] if msgs else msgs
def get_js_function(js_path, func_name, func_args):
    '''
    Evaluate a JS file with execjs and call one of its functions.

    :param js_path: path to the JS file holding the decryption routine
    :param func_name: name of the JS function to invoke
    :param func_args: single argument passed through to that function
    :return: whatever the JS function returns, marshalled back to Python
    '''
    with open(js_path, encoding="utf-8") as source_file:
        source = source_file.read()
    return execjs.compile(source).call(func_name, func_args)
def get_decode_data(encrypted_items):
    '''
    Decrypt a list of month payloads via the site's own JS routine.

    :param encrypted_items: encrypted month dicts from get_py_json().
        (Parameter was named ``json``, shadowing the stdlib module imported
        at file level — renamed; all in-file callers pass it positionally.)
    :return: list of decoded month dicts
    '''
    items = []
    for item in encrypted_items:
        # "ps" appears to gate whether a month actually carries data —
        # TODO confirm against the site's JS
        if item["ps"]:
            items.append(get_js_function('./static/js/test.js', 's', item))
    return items
def get_index_data(items):
    '''
    Pivot decoded month payloads into one date-sorted table of pollutants.

    :param items: decoded month dicts (output of get_decode_data)
    :return: tuple (data_list, city) where data_list is a header row
        ['时间', 'pm25', 'pm10', 'O3', 'NO2', 'SO2', 'CO'] followed by one row
        per distinct date, missing readings left as '' — and city is the
        station name taken from the first payload.

    Refactor: the original had six copy-pasted branches (one per pollutant)
    plus six linear scans per output row (O(rows × readings)); a lookup
    table keyed by pollutant name preserves the exact output in one pass.
    '''
    # Order matters: column i+1 of each row corresponds to pollutants[i].
    pollutants = ['PM2.5', 'PM10', 'O3', 'NO2', 'SO2', 'CO']
    # pollutant name -> {raw timestamp -> value}; a duplicate timestamp
    # overwrites, matching the original's "last match wins" scan behavior.
    series = {name: {} for name in pollutants}
    timestamps = set()
    city = items[0].get('source').get('city').get('name')
    for item in items:
        for species in item["species"]:
            readings = series.get(species['name'])
            if readings is None:
                continue  # other species (e.g. AQI) were ignored before too
            for reading in species['values']:
                stamp = reading["t"]["d"]
                timestamps.add(stamp)
                readings[stamp] = reading['v']
    # Sort by calendar date, then rebuild the canonical midnight-UTC form.
    # NOTE(review): as in the original, a reading whose raw timestamp is not
    # exactly "...T00:00:00.000Z" will produce a row with blank cells, since
    # lookup uses the rebuilt string.
    ordered = sorted(time.strptime(stamp[:10], "%Y-%m-%d") for stamp in timestamps)
    dates = [time.strftime("%Y-%m-%d", day) + 'T00:00:00.000Z' for day in ordered]
    head_list = ['时间', 'pm25', 'pm10', 'O3', 'NO2', 'SO2', 'CO']
    data_list = [head_list, ]
    for date in dates:
        row = [date] + [series[name].get(date, '') for name in pollutants]
        data_list.append(row)
    return data_list, city
def write_file(city, data_list):
    '''
    Write one station's table to data/空气污染指数历史数据/.../<name>.csv.

    :param city: station name; characters illegal in Windows filenames are
        replaced with '_', and ':' becomes a path separator so names like
        "Country: City" nest into subdirectories.
    :param data_list: rows (header first) as produced by get_index_data.

    Fixes: os.path.join() was called on a single pre-concatenated string (a
    no-op); the exists()/isdir() check before makedirs was racy — replaced
    with makedirs(exist_ok=True), which keeps the same outcomes.
    '''
    parts = re.sub(r'[?、 .╲*"<>|,]', '_', city).replace(":", "/").split("/")
    sub_dir = '/'
    for part in parts[:-1]:
        sub_dir += part + '/'
    path = "data/空气污染指数历史数据" + sub_dir
    os.makedirs(path, exist_ok=True)
    file_name = path + parts[-1] + '.csv'
    print(file_name)
    pd.DataFrame(data_list).to_csv(file_name, encoding='utf-8', index=False, header=False)
def main():
    """
    Entry point: record this process's pid, then loop (practically forever)
    re-fetching the station list and crawling each station not yet done.

    Completed URLs are appended to ./data/url.txt, which doubles as a resume
    log across restarts. Fix: the data directory and url.txt are now created
    up front — the original crashed with FileNotFoundError on a fresh run.
    """
    print("Air爬虫启动")
    pid = os.getpid()
    print("pid:", pid)
    os.makedirs("./data", exist_ok=True)
    with open("./data/air_pid.txt", "w") as f:
        f.write(str(pid))
    # Touch the seen-URL log so the read below cannot fail on first run.
    with open('./data/url.txt', 'a', encoding='utf-8'):
        pass
    for j in range(100000):
        idx = get_idx()
        for i in idx:
            url = f'https://api.waqi.info/api/attsse/{i}/yd.json'
            # Re-read each time so an external process can extend the log.
            with open('./data/url.txt', 'r', encoding='utf-8') as f:
                done = f.read().splitlines()
            if url not in done:
                try:
                    encryption_list = get_py_json(url)
                    decode_data = get_decode_data(encryption_list)
                    data_list1, city = get_index_data(decode_data)
                    print(city)
                    write_file(city, data_list1)
                    with open('./data/url.txt', 'a', encoding='utf-8') as f:
                        f.write(f'{url}\n')
                except Exception as e:
                    # Best-effort: log, back off briefly, move to next station.
                    print(f"请求错误:{e}")
                    time.sleep(5)
            # Randomized politeness delay between stations.
            time.sleep(random.randint(5, 10))
if __name__ == '__main__':
    # The guard previously duplicated main()'s whole crawl loop verbatim
    # (minus the pid bookkeeping), leaving main() dead code; delegate instead.
    main()
注意: 本文以学习技术为主,不可以用于非法行为, 如有侵权请联系删除