Python爬取百度疫情数据并预测未来七天及其可视化

本文采用爬虫、AR模型(代码中通过 statsmodels 的 ARIMA(2, 0, 1) 实现)以及 pyecharts 技术,实现从百度上爬取疫情数据(注:下面的代码实际请求的是腾讯新闻的接口 api.inews.qq.com),并将数据用于训练模型,最后将模型的预测结果可视化展示。代码如下:
from random import random
import pandas as pd
import requests
import numpy as np
import pyecharts.options as opts
from pyecharts.globals import ThemeType
from pyecharts.commons.utils import JsCode
from pyecharts.charts import Timeline, Grid, Bar, Map, Pie, Line
import datetime
from typing import List
from datetime import date
from requests.models import codes
from lxml import etree
from urllib import parse
from statsmodels.tsa.arima.model import ARIMA


# Overview endpoint. Only the `provinceCompare` module of the reply is used
# below, and only for its keys (the province names).
url = "https://api.inews.qq.com/newsqa/v1/query/inner/publish/modules/list?modules=chinaDayList,chinaDayAddList,nowConfirmStatis,provinceCompare"
headers = {
    "Referer": "https://news.qq.com/",
    "Host": "api.inews.qq.com",
    "Origin": "https://news.qq.com",
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.182 Safari/537.36"
}

# NOTE(security): the response body used to be passed to eval(), which runs
# whatever text the server returns as Python code and also raises NameError on
# the JSON literals true/false/null. Parse it as JSON instead.
response = requests.post(url, headers=headers, timeout=10)
response.raise_for_status()
datas = response.json()
datas = datas['data']['provinceCompare']
# All province names.
key = list(datas.keys())
# Daily new confirmed cases per province: {province name: [counts, ...]}.
provinceAdd = {}
# Per-province daily history endpoint (loop-invariant, so defined once).
# NOTE(review): the "pubished" spelling is kept exactly as in the original
# URL -- presumably that is the real path; confirm before "correcting" it.
urls = "https://api.inews.qq.com/newsqa/v1/query/pubished/daily/list?province=%s&"
for i in key:
    URL = urls % (parse.quote(i))
    responseProvince = requests.post(URL, headers=headers, timeout=10)
    responseProvince.raise_for_status()
    # [-30:-1] keeps at most 29 records and drops the most recent one.
    # NOTE(review): presumably the latest record is skipped because it may be
    # incomplete -- confirm with the data source.
    provinceAdd[i] = [
        int(j['confirm_add']) for j in responseProvince.json()['data'][-30:-1]
    ]


"""时间序列模型"""


def AR(data, steps=7):
    """Forecast the next ``steps`` values of a series, one step at a time.

    For every step an ``ARIMA`` model with ``order=[2, 0, 1]`` is fitted on
    the current window, the one-step-ahead prediction is passed through
    ``round()``, appended to the window, and the oldest observation is
    removed so the window keeps its length.

    Args:
        data: Sequence of numeric observations, oldest first. The sequence is
            copied, so the caller's object is no longer modified (the
            previous implementation appended to and deleted from it).
        steps: Number of future values to produce. Defaults to 7, the
            horizon that was previously hard-coded.

    Returns:
        A list of ``steps`` rounded forecasts in chronological order.
    """
    # Work on a private copy so the rolling window does not leak to callers.
    window = list(data)
    result = []
    for _ in range(steps):
        model_fit = ARIMA(window, order=[2, 0, 1]).fit()
        # start == end == len(window) requests exactly one value: the first
        # point after the end of the fitted sample.
        yhat = round(model_fit.predict(len(window), len(window))[0])
        # Slide the window: feed the forecast back in, drop the oldest point.
        window.append(yhat)
        del window[0]
        result.append(yhat)
    return result


# Forecasts produced by the time-series model for every province:
# {province name: [prediction for day 0, ..., prediction for day 6]}.
result = {}
for i in key:
    data = provinceAdd[i]
    result[i] = AR(data)

# Sum of the predictions of all provinces, one entry per forecast day.
total_num = []

# One entry per forecast day:
# {"time": <date label>, "data": [{"name": province, "value": [...]}, ...]}
DATA = []
# Read the current date once so all seven labels share the same base date.
base_date = datetime.date.today()
for i in range(7):
    dates = base_date + datetime.timedelta(days=i)
    num = sum(result[j][i] for j in key)
    total_num.append(num)
    temp = {
        "time": "%s年%s月%s日" % (dates.year, dates.month, dates.day),
        "data": [
            {
                "name": j,
                # [predicted count, share of that day's total, province name].
                # The share falls back to 0 when the total is 0; the plain
                # division used before raised ZeroDivisionError in that case.
                "value": [
                    result[j][i],
                    round(result[j][i] / num, 4) if num else 0,
                    j,
                ],
            }
            for j in key
        ],
    }
    DATA.append(temp)


# Date labels of the seven forecast days (today through today + 6 days).
# today() is evaluated once; the previous expression called it three times
# per label, so labels could disagree if the date changed mid-evaluation.
_first_day = datetime.date.today()
time_list = [
    "%s年%s月%s日" % (day.year, day.month, day.day)
    for day in (_first_day + datetime.timedelta(days=d) for d in range(7))
]

# Fixed bounds read by get_year_chart for the visual-map range and the bar
# chart's value-axis maximum.
maxNum = 300
minNum = 0


# 画图

def get_year_chart(year: str):
    """Build the combined map/line/bar/pie chart for one forecast day.

    Reads the module-level ``DATA``, ``time_list``, ``total_num``, ``minNum``
    and ``maxNum``.

    Args:
        year: Date label of the day to render. It is compared against the
            ``"time"`` field of the entries in ``DATA`` and against the
            labels in ``time_list``. (The parameter name is kept unchanged
            for existing callers even though the value is a full date.)

    Returns:
        A pyecharts ``Grid`` containing the bar, line, pie and map charts.

    Raises:
        IndexError: If no entry of ``DATA`` has ``"time"`` equal to ``year``.
    """
    matches = [d for d in DATA if d["time"] == year]
    if not matches:
        # Same exception type the bare ``[...][0]`` lookup used to raise,
        # but with a message that says what is missing.
        raise IndexError("no forecast data for time label %r" % (year,))
    # Each item is [province name, [count, share, province name]].
    map_data = [[x["name"], x["value"]] for x in matches[0]["data"]]
    min_data, max_data = (minNum, maxNum)

    # Second line series: only the selected day carries a value, so the
    # "max" mark point lands on that day.
    data_mark: List = [
        total if label == year else ""
        for label, total in zip(time_list, total_num)
    ]

    # Date range shown in the line chart title.
    first_day = datetime.date.today()
    last_day = first_day + datetime.timedelta(days=6)

    map_chart = (
        Map()
        .add(
            series_name="",
            data_pair=map_data,
            zoom=1,
            center=[119.5, 34.5],
            is_map_symbol_show=False,
            itemstyle_opts={
                "normal": {"areaColor": "#323c48", "borderColor": "#404a59"},
                "emphasis": {
                    # `show` is a boolean flag; the `Timeline` class object
                    # was passed here before.
                    "label": {"show": True},
                    "areaColor": "rgba(255,255,255, 0.5)",
                },
            },
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(
                title="预测" + str(year) + "全国分地区新冠患病人数",
                subtitle="",
                pos_left="center",
                pos_top="top",
                title_textstyle_opts=opts.TextStyleOpts(
                    font_size=25, color="rgba(255,255,255, 0.9)"
                ),
            ),
            tooltip_opts=opts.TooltipOpts(
                is_show=True,
                # value[2] is the province name, value[0] the predicted count.
                formatter=JsCode(
                    """function(params) {
                    if ('value' in params.data) {
                        return params.data.value[2] + ': ' + params.data.value[0];
                    }
                }"""
                ),
            ),
            visualmap_opts=opts.VisualMapOpts(
                is_calculable=True,
                dimension=0,
                pos_left="30",
                pos_top="center",
                range_text=["High", "Low"],
                range_color=["lightskyblue", "yellow", "orangered"],
                textstyle_opts=opts.TextStyleOpts(color="#ddd"),
                min_=min_data,
                max_=max_data,
            ),
        )
    )

    line_chart = (
        Line()
        .add_xaxis(time_list)
        .add_yaxis("", total_num)
        .add_yaxis(
            "",
            data_mark,
            markpoint_opts=opts.MarkPointOpts(
                data=[opts.MarkPointItem(type_="max")]),
        )
        .set_series_opts(label_opts=opts.LabelOpts(is_show=False))
        .set_global_opts(
            title_opts=opts.TitleOpts(
                # The end of the range is a date (today + 6 days). The old
                # code stringified the bare timedelta, which rendered as
                # "6 days, 0:00:00" in the title.
                title="预测" + str(first_day) + "~" + str(last_day) + "号全国新冠患者人数",
                pos_left="72%",
                pos_top="5%",
            )
        )
    )

    bar_x_data = [x[0] for x in map_data]
    bar_y_data = [{"name": x[0], "value": x[1][0]} for x in map_data]
    bar = (
        Bar()
        .add_xaxis(xaxis_data=bar_x_data)
        .add_yaxis(
            series_name="",
            y_axis=bar_y_data,
            label_opts=opts.LabelOpts(
                is_show=True, position="right", formatter="{b} : {c}"
            ),
        )
        .reversal_axis()
        .set_global_opts(
            xaxis_opts=opts.AxisOpts(
                max_=maxNum, axislabel_opts=opts.LabelOpts(is_show=False)
            ),
            yaxis_opts=opts.AxisOpts(
                axislabel_opts=opts.LabelOpts(is_show=False)),
            tooltip_opts=opts.TooltipOpts(is_show=False),
            visualmap_opts=opts.VisualMapOpts(
                is_calculable=True,
                dimension=0,
                pos_left="10",
                pos_top="top",
                range_text=["High", "Low"],
                range_color=["lightskyblue", "yellow", "orangered"],
                textstyle_opts=opts.TextStyleOpts(color="#ddd"),
                min_=min_data,
                max_=max_data,
            ),
        )
    )

    # Pie slices use the predicted count of each province.
    pie_data = [[x[0], x[1][0]] for x in map_data]
    pie = (
        Pie()
        .add(
            series_name="",
            data_pair=pie_data,
            radius=["15%", "35%"],
            center=["80%", "82%"],
            itemstyle_opts=opts.ItemStyleOpts(
                border_width=1, border_color="rgba(0,0,0,0.3)"
            ),
        )
        .set_global_opts(
            tooltip_opts=opts.TooltipOpts(is_show=True, formatter="{b} {d}%"),
            legend_opts=opts.LegendOpts(is_show=False),
        )
    )

    # Lay the four charts out on a single canvas.
    grid_chart = (
        Grid()
        .add(
            bar,
            grid_opts=opts.GridOpts(
                pos_left="10", pos_right="45%", pos_top="50%", pos_bottom="5"
            ),
        )
        .add(
            line_chart,
            grid_opts=opts.GridOpts(
                pos_left="65%", pos_right="80", pos_top="10%", pos_bottom="50%"
            ),
        )
        .add(pie, grid_opts=opts.GridOpts(pos_left="45%", pos_top="60%"))
        .add(map_chart, grid_opts=opts.GridOpts())
    )

    return grid_chart


if __name__ == "__main__":
    # Canvas settings for the timeline container.
    canvas_opts = opts.InitOpts(
        width="1600px", height="900px", theme=ThemeType.DARK
    )
    timeline = Timeline(init_opts=canvas_opts)

    # One frame per forecast-day label, in the order given by time_list.
    for day_label in time_list:
        timeline.add(get_year_chart(year=day_label), time_point=str(day_label))

    # Options of the timeline control itself.
    schema_options = {
        "orient": "vertical",
        "is_auto_play": True,
        "is_inverse": True,
        "play_interval": 5000,
        "pos_left": "null",
        "pos_right": "5",
        "pos_top": "20",
        "pos_bottom": "20",
        "width": "60",
        "label_opts": opts.LabelOpts(is_show=True, color="#fff"),
    }
    timeline.add_schema(**schema_options)

    # Write the finished chart to an HTML file.
    timeline.render("2018.html")
最后生成的是一个 HTML 文件(文件名为 2018.html),展示结果如下:

Python爬取百度疫情数据并预测未来七天及其可视化_第1张图片

你可能感兴趣的:(Python,机器学习,#,爬虫,python,百度,数据挖掘)