Spark SQL Case Study

Requirement: Top 3 popular products per region

Requirement details

  • "Popular" here is measured by click count.
  • For each region, compute the top three most popular products, and annotate each product with its click distribution across the main cities; any cities beyond the top two are lumped together as 其他 ("other"), as shown in the figure below.

[Figure 1: sample output — per-region top 3 products with click counts and a per-city distribution remark]

Data preparation

Data download

Link: https://pan.baidu.com/s/1TuSGlD53Vc8rnH2UhC8TCg
Extraction code: tvmi

Create the tables

-- Create the tables in the sparkpractice database so the LOAD statements below resolve correctly
CREATE TABLE `sparkpractice`.`user_visit_action`(
  `date` string,
  `user_id` bigint,
  `session_id` string,
  `page_id` bigint,
  `action_time` string,
  `search_keyword` string,
  `click_category_id` bigint,
  `click_product_id` bigint,
  `order_category_ids` string,
  `order_product_ids` string,
  `pay_category_ids` string,
  `pay_product_ids` string,
  `city_id` bigint)
row format delimited fields terminated by '\t';
load data local inpath '/home/hadoop/data/sparksql/user_visit_action.txt' into table sparkpractice.user_visit_action;

CREATE TABLE `sparkpractice`.`product_info`(
  `product_id` bigint,
  `product_name` string,
  `extend_info` string)
row format delimited fields terminated by '\t';
load data local inpath '/home/hadoop/data/sparksql/product_info.txt' into table sparkpractice.product_info;

CREATE TABLE `sparkpractice`.`city_info`(
  `city_id` bigint,
  `city_name` string,
  `area` string)
row format delimited fields terminated by '\t';
load data local inpath '/home/hadoop/data/sparksql/city_info.txt' into table sparkpractice.city_info;
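As a quick sanity check after the loads, you can count the rows in each table; a minimal sketch, assuming a spark-shell or SparkSession started with Hive support:

// row counts should be non-zero if the loads succeeded
spark.sql("SELECT count(*) FROM sparkpractice.user_visit_action").show()
spark.sql("SELECT count(*) FROM sparkpractice.product_info").show()
spark.sql("SELECT count(*) FROM sparkpractice.city_info").show()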

Requirement analysis

  • 1. Join the three tables to get region, city name, and product name.
  • 2. Group by region and product, and count the clicks per product in each region.
  • 3. Rank the products within each region by click count and keep the top three (steps 1–3 can also be composed into a single nested query; see the sketch after this list).
  • 4. Write a custom UDAF that takes a city name and returns each city's share of the clicks.
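Before splitting the work into temp views as the code below does, it may help to see steps 1–3 as one nested query. A sketch, assuming the cityCount UDAF from step 4 is already registered (the t1/t2/t3 aliases mirror the temp-view names used later):

// single-statement equivalent of the t1/t2/t3 pipeline below
spark.sql(
  """
    |SELECT *
    |FROM (
    |  SELECT *,
    |         ROW_NUMBER() OVER(PARTITION BY area ORDER BY click_count DESC) rk
    |  FROM (
    |    SELECT area, product_name,
    |           count(*) click_count,
    |           cityCount(city_name) city_count
    |    FROM (
    |      SELECT area, city_name, product_name
    |      FROM sparkpractice.city_info c
    |      JOIN sparkpractice.user_visit_action u ON c.city_id = u.city_id
    |      JOIN sparkpractice.product_info p ON u.click_product_id = p.product_id
    |    ) t1
    |    GROUP BY area, product_name
    |  ) t2
    |) t3
    |WHERE rk <= 3
  """.stripMargin).show(false)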

Code implementation

Main class code

import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object Spark06_Project {
  def main(args: Array[String]): Unit = {
    val conf: SparkConf = new SparkConf().setAppName(this.getClass.getSimpleName).setMaster("local[*]")
    val spark: SparkSession = SparkSession.builder().enableHiveSupport().config(conf).getOrCreate()
    // Register the custom UDAF
    spark.udf.register("cityCount", new CityClickUDAF)
    // 1. Join the three tables to get region, city name, and product name
    spark.sql(
      """
        |SELECT
        |area,city_name,product_name
        |FROM
        |sparkpractice.city_info c
        |JOIN sparkpractice.user_visit_action u
        |ON c.city_id = u.city_id
        |JOIN sparkpractice.product_info p
        |ON u.click_product_id = p.product_id
      """.stripMargin).createOrReplaceTempView("t1")

    // 2. Group by region and product, and count the clicks per product in each region
    spark.sql(
      """
        |SELECT
        |area,product_name,count(*) click_count,cityCount(city_name) city_count
        |FROM t1
        |GROUP BY area,product_name
      """.stripMargin).createOrReplaceTempView("t2")
    // 3. Rank the products within each region by click count
    spark.sql(
      """
        |SELECT *,
        |ROW_NUMBER() OVER(PARTITION BY area ORDER BY click_count DESC) rk
        |FROM t2
      """.stripMargin).createOrReplaceTempView("t3")
    // Keep the top three per region
    spark.sql(
      """
        |SELECT *
        |FROM t3
        |WHERE t3.rk<=3
      """.stripMargin).show(false)


    spark.stop()
  }
}
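A note on running this locally: enableHiveSupport() requires the spark-hive module on the classpath and a hive-site.xml pointing at your metastore. With sbt, the dependencies look roughly like the following (the 2.4.8 version is an assumption to match the pre-3.0 UDAF API used here; align it with your Spark install):

// build.sbt (version numbers are assumptions; match your cluster)
libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-sql"  % "2.4.8",
  "org.apache.spark" %% "spark-hive" % "2.4.8"
)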

Custom UDAF

import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._

// 4. Custom UDAF: takes a city name, returns each city's share of the clicks
class CityClickUDAF extends UserDefinedAggregateFunction {
  // Input type: city name
  override def inputSchema: StructType = {
    StructType(Array(StructField("cityName", StringType)))
  }

  // Buffer type: map of (city name -> click count) plus the total click count
  override def bufferSchema: StructType = {
    StructType(Array(
      StructField("city_count", MapType(StringType, LongType)),
      StructField("totalCount", LongType)
    ))
  }

  // Output type
  override def dataType: DataType = StringType

  // The same input always produces the same output
  override def deterministic: Boolean = true

  // Initialize the buffer
  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = Map[String, Long]()
    buffer(1) = 0L
  }

  // Within-partition update: bump the count for the incoming city
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    // Read the input city name
    val cityName: String = input.getString(0)
    val map = buffer.getMap[String, Long](0)
    // The map is immutable, so build a new one with the incremented count
    buffer(0) = map + (cityName -> (map.getOrElse(cityName, 0L) + 1L))
    // Total click count
    buffer(1) = buffer.getLong(1) + 1
  }

  // Cross-partition merge
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    val map1 = buffer1.getMap[String, Long](0)
    val map2 = buffer2.getMap[String, Long](0)
    // Merge the per-city click counts
    buffer1(0) = map1.foldLeft(map2)(
      (m2, cityCount) => {
        m2 + (cityCount._1 -> (m2.getOrElse(cityCount._1, 0L) + cityCount._2))
      }
    )
    // Merge the total click counts
    buffer1(1) = buffer1.getLong(1) + buffer2.getLong(1)
  }

  // Produce the final result string
  override def evaluate(buffer: Row): Any = {
    // map of (city -> click count)
    val city_count = buffer.getMap[String, Long](0)
    // total click count
    val totalCount: Long = buffer.getLong(1)
    // Sort descending by clicks, take the top 2 cities, compute their percentages
    var remarks: List[CityRemark] = city_count.toList.sortBy(-_._2).take(2).map {
      case (city, count) =>
        // Wrap in an object to standardize the output format
        CityRemark(city, count.toDouble / totalCount)
    }
    // Fold all remaining cities into an 其他 ("other") bucket
    if (city_count.size > 2) {
      val ratio: Double = remarks.foldLeft(1D)(_ - _.cityRatio)
      remarks = remarks :+ CityRemark("其他", ratio)
    }

    remarks.mkString(",")
  }
}
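The post never shows the CityRemark class that evaluate relies on. From its usage (constructed with a city name and a ratio, a cityRatio field, and mkString depending on toString), a minimal sketch could be:

import java.text.DecimalFormat

// Minimal sketch of the missing CityRemark case class; the two-decimal
// percentage format is an assumption based on the expected output style.
case class CityRemark(cityName: String, cityRatio: Double) {
  private val formatter = new DecimalFormat("0.00%")
  override def toString: String = s"$cityName:${formatter.format(cityRatio)}"
}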

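One caveat: UserDefinedAggregateFunction is deprecated as of Spark 3.0 in favor of the strongly typed org.apache.spark.sql.expressions.Aggregator. A sketch of the same logic in that style, reusing the CityRemark sketch above (the CityBuffer case class and the CityClickAggregator name are mine):

import org.apache.spark.sql.{Encoder, Encoders, functions}
import org.apache.spark.sql.expressions.Aggregator

// Typed buffer: per-city click counts plus the running total
case class CityBuffer(cityMap: Map[String, Long], total: Long)

class CityClickAggregator extends Aggregator[String, CityBuffer, String] {
  override def zero: CityBuffer = CityBuffer(Map.empty, 0L)

  // Fold one input city into the buffer
  override def reduce(b: CityBuffer, city: String): CityBuffer =
    CityBuffer(b.cityMap + (city -> (b.cityMap.getOrElse(city, 0L) + 1L)), b.total + 1L)

  // Merge two partial buffers
  override def merge(b1: CityBuffer, b2: CityBuffer): CityBuffer = {
    val merged = b2.cityMap.foldLeft(b1.cityMap) { case (m, (city, cnt)) =>
      m + (city -> (m.getOrElse(city, 0L) + cnt))
    }
    CityBuffer(merged, b1.total + b2.total)
  }

  // Same top-2-plus-其他 formatting as the UDAF above
  override def finish(b: CityBuffer): String = {
    val top2 = b.cityMap.toList.sortBy(-_._2).take(2).map {
      case (city, cnt) => CityRemark(city, cnt.toDouble / b.total)
    }
    val all =
      if (b.cityMap.size > 2) top2 :+ CityRemark("其他", 1.0 - top2.map(_.cityRatio).sum)
      else top2
    all.mkString(",")
  }

  override def bufferEncoder: Encoder[CityBuffer] = Encoders.product[CityBuffer]
  override def outputEncoder: Encoder[String] = Encoders.STRING
}

// Registration for SQL use on Spark 3.x:
// spark.udf.register("cityCount", functions.udaf(new CityClickAggregator, Encoders.STRING))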