网络爬虫系列3:爬取全国各地区疫情风险等级json数据

参照《Python爬取全国各地区疫情风险等级》(pudn.com)一文,对原代码进行了优化,详细内容可参照原网页。

import hashlib
import os
import requests
import time
import sys
import json
import csv


# Show the subtotal of areas for one risk level
def show_level_count(x_list):
    """Print and return the total number of communities in ``x_list``.

    Args:
        x_list: A sequence of mappings, each holding a sized value under
            the key "communitys".

    Returns:
        The sum of the lengths of every entry's "communitys" value.
    """
    total = sum(len(entry["communitys"]) for entry in x_list)
    print(total)
    return total

# Parse the risk_data JSON text and write its rows to a risk_data CSV file
def writer_to_csv(risk_txt):
    """Write every community listed in ``risk_txt`` to a CSV file.

    The output file is ``./risk_data_2022/risk_data_<end_update_time>.csv``
    (encoded as utf_8_sig). One row is written per community:
    ``[risk level, province, city, county, community]``. High-risk rows come
    first, then medium-risk, then low-risk.

    Args:
        risk_txt: JSON text whose "data" object contains "end_update_time",
            "highlist", "middlelist" and "lowlist". Each list element is a
            dict with "province", "city", "county" and "communitys" keys.

    Raises:
        json.JSONDecodeError: If ``risk_txt`` is not valid JSON.
        KeyError: If one of the expected keys is missing.
    """
    data = json.loads(risk_txt)["data"]
    so_far_time = data["end_update_time"]

    # Map the label written in the first CSV column to the matching list.
    # Dict insertion order fixes the order of the sections in the file.
    level_dict = {
        "高风险": data["highlist"],
        "中风险": data["middlelist"],
        "低风险": data["lowlist"],
    }

    # Create the output directory on first use so open() cannot fail on it.
    out_dir = './risk_data_2022'
    os.makedirs(out_dir, exist_ok=True)
    path1 = out_dir + '/risk_data_' + so_far_time + '.csv'

    # The context manager closes the file even if a row lookup raises.
    with open(path1, 'w', encoding='utf_8_sig', newline='') as f:
        csv_writer = csv.writer(f)
        for risk_level, areas in level_dict.items():
            for area in areas:
                province = area["province"]
                city = area["city"]
                county = area["county"]
                for community in area["communitys"]:
                    csv_writer.writerow(
                        [risk_level, province, city, county, community])

    print("写入risk_data.csv完成.")

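# NOTE: the description below refers to get_risk_area_data(), defined further down.
# That function fetches the JSON data from the web service (the decoded payload is a
# JSON object). For details see "Python爬取全国各地区疫情风险等级"
# (https://www.pudn.com/news/631b15f60d6a7b3b66c9deca.html).
# On success it returns two values: response.text with every '\u2022' removed,
# and response.status_code.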


# Alternative way to parse the risk_data JSON text and write it to a CSV file
def writer_to_csv(risk_txt):
    """Write every community listed in ``risk_txt`` to a CSV file.

    The output file is ``./risk_data_2022/risk_data_<end_update_time>.csv``
    (encoded as utf_8_sig). One row is written per community:
    ``[risk level, province, city, county, community]``. High-risk rows come
    first, then medium-risk, then low-risk. The grouped data is also printed
    to stdout before the rows are written.

    Args:
        risk_txt: JSON text whose "data" object contains "end_update_time",
            "highlist", "middlelist" and "lowlist". Each list element is a
            dict with "province", "city", "county" and "communitys" keys.

    Raises:
        json.JSONDecodeError: If ``risk_txt`` is not valid JSON.
        KeyError: If one of the expected keys is missing.
    """
    # data_all is the dict stored under "data" in the decoded JSON.
    data_all = json.loads(risk_txt)["data"]
    so_far_time = data_all["end_update_time"]

    # Map the label written in the first CSV column to the matching list.
    # Dict insertion order fixes the order of the sections in the file.
    level_dict = {
        "高风险": data_all['highlist'],
        "中风险": data_all['middlelist'],
        "低风险": data_all['lowlist'],
    }
    print(level_dict)

    # Create the output directory on first use so open() cannot fail on it.
    out_dir = './risk_data_2022'
    os.makedirs(out_dir, exist_ok=True)
    path1 = out_dir + '/risk_data_' + so_far_time + '.csv'

    # The context manager closes the file even if a row lookup raises.
    with open(path1, 'w', encoding='utf_8_sig', newline='') as f:
        csv_writer = csv.writer(f)
        for risk_level, areas in level_dict.items():
            for area in areas:
                prefix = [risk_level, area["province"], area["city"], area["county"]]
                for community in area["communitys"]:
                    csv_writer.writerow(prefix + [community])

    print("写入risk_data.csv完成.")

def get_risk_area_data():
    """Request the risk-area JSON from the bmfw.www.gov.cn interface.

    Two signatures are sent, each the upper-case hex SHA-256 digest of
    ``timestamp + <fixed string> + timestamp``: one in the
    ``x-wif-signature`` header and one in the ``signatureHeader`` field of
    the JSON body.

    Returns:
        A ``(text, status_code)`` tuple. When the status code is 200,
        ``text`` is the response body with every '\u2022' character removed;
        otherwise ``text`` is the empty string.

    Raises:
        requests.RequestException: If the request cannot be completed,
            including when the server does not answer within the timeout.
    """
    # time.time() is wall-clock time: seconds since 1970-01-01 00:00:00 UTC
    # as a float. The service expects it as a whole-second string.
    timestamp = str(int(time.time()))

    x_wif_nonce = 'QkjjtiLM2dCratiA'
    x_wif_paasid = 'smt-application'
    nonce_header = '123456789abcdefg'

    # Signature sent in the request headers.
    x_wif_signature_str = timestamp + \
        'fTN2pfuisxTavbTuYVSsNJHetwq5bJvCQkjjtiLM2dCratiA' + timestamp
    x_wif_signature = hashlib.sha256(
        x_wif_signature_str.encode('utf-8')).hexdigest().upper()

    # Signature sent inside the JSON body.
    signatureHeader_str = timestamp + \
        '23y0ufFl5YxIyGrI8hWRUZmKkvtSjLQA' + nonce_header + timestamp
    signatureHeader = hashlib.sha256(
        signatureHeader_str.encode('utf-8')).hexdigest().upper()

    url = 'http://bmfw.www.gov.cn/bjww/interface/interfaceJson'

    headers = {
        'Accept': "application/json, text/plain, */*",
        'Content-Type': "application/json;charset=utf-8",
        'x-wif-nonce': x_wif_nonce,
        'x-wif-paasid': x_wif_paasid,
        'x-wif-signature': x_wif_signature,
        'x-wif-timestamp': timestamp,
    }

    # Build the body with json.dumps instead of hand-escaped concatenation;
    # the keys and values are the same as before.
    payload = {
        "key": "3C502C97ABDA40D0A60FBEE50FAAD1DA",
        "appId": "NcApplication",
        "paasHeader": "zdww",
        "timestampHeader": timestamp,
        "nonceHeader": nonce_header,
        "signatureHeader": signatureHeader,
    }

    # Without a timeout, requests may wait forever on an unresponsive server.
    response = requests.post(
        url=url, data=json.dumps(payload), headers=headers, timeout=30)
    if response.status_code != 200:
        return "", response.status_code

    return response.text.replace('\u2022', ''), response.status_code


if __name__ == '__main__':
    # Script entry point: fetch the data, cache it as JSON, then export CSV.
    data_dir = './risk_data_2022'
    json_path = data_dir + '/risk_data.json'

    # Make sure the output directory exists before any file is written.
    os.makedirs(data_dir, exist_ok=True)

    risk_data = get_risk_area_data()
    if risk_data[1] == 200:
        # On success the first element is the response text; cache it.
        with open(json_path, 'w', encoding='utf-8') as f:
            f.write(risk_data[0])
        print("写入risk_data.json完成.")
    elif not os.path.exists(json_path):
        # The fetch failed and there is no earlier JSON file to fall back
        # on; stop with a message instead of an open() traceback.
        print('获取数据失败(状态码: ' + str(risk_data[1]) + '),且本地没有risk_data.json.')
        sys.exit(1)

    # Read the cached JSON; after a failed fetch this is the earlier file.
    with open(json_path, 'r', encoding='utf-8') as f:
        risk_txt = f.read()

    writer_to_csv(risk_txt)

    print('全部程序完成,请勿频繁使用!')
    # "pause" is a Windows shell command; skip it on other systems.
    if os.name == 'nt':
        os.system('pause')

你可能感兴趣的:(python基础学习,爬虫,python,数据挖掘)