大数据分析实训——使用Spark SQL分析美国新冠肺炎疫情

项目思路:

使用Spark SQL读取文件数据集生成DataFrame对象,再利用Spark SQL函数对DataFrame对象进行数据分析,并将结果存入MySQL数据库;然后以Web网页的形式对分析结果进行可视化:由Spring Boot读取数据库并把数据以JSON形式返回给Vue前端,再由ECharts根据JSON结果绘制图表。

项目环境:

JDK(v11.0.13)

Spark(v3.2.1)

Scala(v2.13.8)

Spring Boot(v2.6.7)

Vue(v3.2.23)

ECharts(v5.3.2)

Maven(v3.8.3)

MySQL(v8.0.26)

项目演示:

项目已经部署,就不一一截图说明了。在首页点击任意一个州,即可查看该州下每个县数据的柱状图(暂未实现下钻到县级地图)。

spark.znzzi.com
未实现移动端适配,建议PC端访问
每日新增折线图可能会出现错误,部署后测试发现此时会卡住,退出网页重进即可 (已修复)

项目代码:

数据分析部分

1 生成DataFrame对象

// Load the county-level CSV file into a DataFrame, taking column names from the header row.
// NOTE(review): schema inference is not enabled here, so the columns are presumably read
// as strings and "timestampFormat" may have no effect — confirm.
val readDF = sqlContext.read
  .options(Map(
    "header" -> "true",
    "timestampFormat" -> "yyyy-MM-dd"
  ))
  .csv("src/main/resources/us-counties.csv")

2 数据库配置

// JDBC connection settings for the MySQL database that stores the analysis results.
// Each value can be overridden through an environment variable so that credentials do
// not have to be edited in source; the fallbacks keep the original local defaults.
val url = sys.env.getOrElse("USCOVID19_JDBC_URL", "jdbc:mysql://localhost:3306/uscovid19")
val prop = new java.util.Properties()
prop.setProperty("user", sys.env.getOrElse("USCOVID19_DB_USER", "uscovid19"))
prop.setProperty("password", sys.env.getOrElse("USCOVID19_DB_PASSWORD", "zhang"))

3 统计美国每个州下每个县的累计确诊人数和死亡人数

// Cumulative confirmed cases and deaths for every county of every state.
// The per-date figures are running totals (the daily-increase queries in this file
// difference consecutive days), so adding them up across dates would count the same
// cases once per day. The highest value reported for each county is used instead.
// The explicit numeric cast keeps max() from comparing the values as text when the
// columns were loaded as strings.
// NOTE(review): assumes a county's running total never decreases — confirm against the data set.
import org.apache.spark.sql.functions.max
val usCountyDF = covidDF
  .groupBy("state", "county")
  .agg(
    max(covidDF("cases").cast("double")).as("cases"),
    max(covidDF("deaths").cast("double")).as("deaths")
  )
  .orderBy("state", "county")

4 统计每日累计确诊人数和累计死亡人数

// Nationwide totals of confirmed cases and deaths for each reporting date, oldest first.
val covidSum = covidDF
  .groupBy("date")
  .agg(
    sum("cases").as("cases"),
    sum("deaths").as("deaths")
  )
  .sort("date")

5 统计每日的新增确诊人数

// Window over all rows in date order; lets each day look up the previous day's value.
val last_day_cases = Window.orderBy("date")
// Daily new confirmed cases: each date's nationwide total minus the previous date's total.
// The earliest date has no previous row, so its null difference is replaced with 0.
val covidAdd = {
  val dailyCases = covidDF.groupBy("date").agg(sum("cases").as("cases"))
  val previousCases = lag("cases", 1).over(last_day_cases)
  dailyCases
    .withColumn("variation", dailyCases("cases") - previousCases)
    .na.fill(0.0, Seq("variation"))
    .sort("date")
}

6 统计每日的新增死亡人数

// Daily new deaths: each date's nationwide total minus the previous date's total,
// using the date-ordered window `last_day_cases` defined earlier in the file.
// The earliest date has no previous row, so its null difference is replaced with 0.
val covidAddDeath = {
  val dailyDeaths = covidDF.groupBy("date").agg(sum("deaths").as("deaths"))
  val previousDeaths = lag("deaths", 1).over(last_day_cases)
  dailyDeaths
    .withColumn("variation", dailyDeaths("deaths") - previousDeaths)
    .na.fill(0.0, Seq("variation"))
    .sort("date")
}

7 统计截至2020年5月19日,美国每个州的累计确诊人数和死亡人数

// Cumulative confirmed cases and deaths per state as of 2020-05-19, ascending by cases.
// The per-date figures are running totals, so only the rows dated exactly 2020-05-19 are
// summed across counties; summing every date up to the cut-off would count the same
// cases once per day.
val covidState = covidDF
  .filter(to_date(to_timestamp(col("date"))) === lit("2020-05-19"))
  .groupBy("state")
  .agg(sum("cases").alias("cases"), sum("deaths").alias("deaths"))
  .orderBy("cases")

8 统计截至2020年5月19日,全美和各州的病死率

// Case-fatality rate (deaths / cases in percent, rounded to 2 decimals) as of 2020-05-19,
// for the whole country and for each state.
// The per-date figures are running totals, so both aggregates are computed from the rows
// dated exactly 2020-05-19. This avoids counting the same cases once per day and keeps the
// state rows and the USA row on the same cut-off date (the state rates used to be computed
// over every date in the data set).
val covidAtCutoff = covidDF.filter(to_date(to_timestamp(col("date"))) === lit("2020-05-19"))

// Nationwide totals labelled "USA"; the column order matches deathRate because the
// union below combines rows by position.
val usaRate = covidAtCutoff
  .agg(sum("cases").alias("cases"), sum("deaths").alias("deaths"))
  .withColumn("rate", round(col("deaths") / col("cases") * 100, 2))
  .withColumn("state", lit("USA"))
  .select("state", "cases", "deaths", "rate")

// Per-state totals and case-fatality rate.
val deathRate = covidAtCutoff
  .groupBy("state")
  .agg(sum("cases").alias("cases"), sum("deaths").alias("deaths"))
  .withColumn("rate", round(col("deaths") / col("cases") * 100, 2))

// Ten states with the highest case-fatality rate, written to the top10_rate table.
val top10DeathRate = deathRate.orderBy(desc("rate")).limit(10)
top10DeathRate.write.mode("overwrite").jdbc(url, "top10_rate", prop)

// Every state plus the nationwide row, ordered by state name.
val totalDeathRate = deathRate
  .union(usaRate)
  .orderBy("state")

可视化后端部分

1 每日累计确诊人数和累计死亡人数接口的实现

实体类:

/**
 * One row of the daily totals: a date together with the confirmed-case and
 * death counts for that date. All three values are carried as strings.
 */
public class DatesCasesDeaths {
    private String date;
    private String cases;
    private String deaths;

    /** Creates an empty row; fields are populated through the setters. */
    public DatesCasesDeaths() {
    }

    /**
     * Creates a fully populated row.
     *
     * @param date   the reporting date
     * @param cases  the confirmed-case count for that date
     * @param deaths the death count for that date
     */
    public DatesCasesDeaths(String date, String cases, String deaths) {
        this.date = date;
        this.cases = cases;
        this.deaths = deaths;
    }

    /** @return the reporting date */
    public String getDate() {
        return date;
    }

    /** @param date the reporting date */
    public void setDate(String date) {
        this.date = date;
    }

    /** @return the confirmed-case count */
    public String getCases() {
        return cases;
    }

    /** @param cases the confirmed-case count */
    public void setCases(String cases) {
        this.cases = cases;
    }

    /** @return the death count */
    public String getDeaths() {
        return deaths;
    }

    /** @param deaths the death count */
    public void setDeaths(String deaths) {
        this.deaths = deaths;
    }

    /**
     * Renders the row as {@code {date='...', cases=..., deaths=...}}.
     * The quote opened before the date value is now closed after it.
     */
    @Override
    public String toString() {
        return "{" +
                "date='" + date + '\'' +
                ", cases=" + cases +
                ", deaths=" + deaths +
                '}';
    }
}

Service类:

/**
 * Service-layer contract for the chart data queries.
 */
public interface ChartService {
    /**
     * Daily cumulative confirmed cases and deaths.
     *
     * @return the list of {@link DatesCasesDeaths} rows
     */
    List<DatesCasesDeaths> dayCasesDeaths();
}

ServiceImpl类:

/**
 * {@link ChartService} implementation that delegates every query to the
 * {@link ChartMapper} it is constructed with.
 *
 * <p>The class implements only {@link ChartService}. It previously also declared
 * {@code ChartMapper}, which made the service a candidate for the very mapper
 * type it receives through its constructor.
 */
public class ChartServiceImpl implements ChartService {
    private final ChartMapper chartMapper;

    /**
     * @param chartMapper the mapper used to load chart data
     */
    public ChartServiceImpl(ChartMapper chartMapper) {
        this.chartMapper = chartMapper;
    }

    /**
     * Daily cumulative confirmed cases and deaths, as returned by the mapper.
     *
     * @return the list of {@link DatesCasesDeaths} rows
     */
    @Override
    public List<DatesCasesDeaths> dayCasesDeaths() {
        return chartMapper.dayCasesDeaths();
    }
}

Mapper类:

/**
 * Data-access contract for the chart data queries.
 * NOTE(review): presumably bound to SQL by a mapper framework — the binding is not
 * visible here; confirm.
 */
public interface ChartMapper {

    /**
     * Daily cumulative confirmed cases and deaths.
     *
     * @return the list of {@link DatesCasesDeaths} rows
     */
    List<DatesCasesDeaths> dayCasesDeaths();
}

Controller类:

/**
 * HTTP entry point for the chart data.
 */
public class ChartController {
    @Resource
    private ChartService chartService;

    /**
     * Handles {@code POST /dayCasesDeaths}: wraps the service result in a
     * {@link RestResponse} with status 200 and message "success".
     *
     * @return the response envelope carrying the daily totals
     */
    @ResponseBody
    @RequestMapping(value = "/dayCasesDeaths", method = RequestMethod.POST)
    public RestResponse dayCasesDeaths() {
        final RestResponse response = new RestResponse();
        response.setStatus(200);
        response.setMessage("success");
        response.setData(chartService.dayCasesDeaths());
        return response;
    }
}

可视化前端部分

1 每日累计确诊人数和累计死亡人数

<template>
  <!-- Empty-state placeholder, shown when nullData is set. -->
  <el-empty description="暂无数据" v-if="nullData"/>
  <h3 style="color: #8f9298;position: fixed;top: 1%;right: 46.5%;">美国每日疫情</h3>
  <!-- Bar chart; covered by the loading indicator while `loading` is true. -->
  <Bar v-loading="loading"
       :chart-options="chartOptions"
       :chart-data="chartData"
       :chart-id="chartId"
       :dataset-id-key="datasetIdKey"
       :plugins="plugins"
       :css-classes="cssClasses"
       :styles="styles"
       :width="width"
       :height="height"
  />
</template>

<script>
import {Bar} from 'vue-chartjs'
import {Chart as ChartJS, Title, Tooltip, Legend, BarElement, CategoryScale, LinearScale} from 'chart.js'

ChartJS.register(Title, Tooltip, Legend, BarElement, CategoryScale, LinearScale)

/**
 * Bar chart of the daily confirmed cases and deaths.
 * The data is requested from `api/dayCasesDeaths` when the component is created.
 */
export default {
  name: 'DayCasesDeaths',
  components: {Bar},
  props: {
    chartId: {
      type: String,
      default: 'bar-chart'
    },
    datasetIdKey: {
      type: String,
      default: 'label'
    },
    width: {
      type: Number,
      default: 900
    },
    height: {
      type: Number,
      default: 400
    },
    cssClasses: {
      default: '',
      type: String
    },
    styles: {
      type: Object,
      // Parenthesised so the factory returns an empty object; `() => {}` is a
      // function with an empty body and returns undefined.
      default: () => ({})
    },
    plugins: {
      type: Array,
      default: () => []
    }
  },
  data() {
    return {
      nullData: false,
      loading: true,
      chartData: {
        labels: [],
        datasets: []
      },
      chartOptions: {
        responsive: true
      }
    }
  },
  methods: {
    /**
     * Requests the daily totals and builds the two bar datasets from them.
     * The empty state is shown when the response has an error status, carries no
     * rows, or the request itself fails. The loading indicator is cleared once the
     * request has settled, whatever the outcome.
     */
    getDayCasesDeaths() {
      // Both datasets share one colour for every fill/border setting.
      const buildDataset = (label, color, data) => ({
        label,
        backgroundColor: color,
        borderColor: color,
        borderWidth: 0,
        hoverBackgroundColor: color,
        hoverBorderColor: color,
        data
      })

      this.axios.post("api/dayCasesDeaths").then(res => {
        if (res.data.status !== 200) {
          this.nullData = true
          this.$message.error(res.data.message)
          return
        }
        const rows = Array.isArray(res.data.data) ? res.data.data : []
        if (rows.length === 0) {
          this.nullData = true
          return
        }
        // Replace the whole object so the chart receives labels and datasets together.
        this.chartData = {
          labels: rows.map(item => item.date),
          datasets: [
            buildDataset('确诊', 'rgb(255,221,99)', rows.map(item => item.cases)),
            buildDataset('死亡', 'rgb(255,0,0)', rows.map(item => item.deaths))
          ]
        }
      }).catch(err => {
        this.nullData = true
        this.$message.error(err.message)
      }).finally(() => {
        this.loading = false
      })
    }
  },
  created() {
    this.getDayCasesDeaths()
  }
}
</script>

每日的累计确诊人数和累计死亡人数展示效果:http://spark.znzzi.com/dayCasesDeaths

项目仓库地址:
数据分析部分:https://github.com/z1zhang/us-covid19-sparksql

数据可视化后端:https://github.com/z1zhang/us-covid19-springboot

数据可视化前端:https://github.com/z1zhang/us-covid19-vue3
欢迎Star和Fork

项目完整源代码(带简要说明)

你可能感兴趣的:(大数据,spark,大数据,数据分析)