链接:https://pan.baidu.com/s/1TuSGlD53Vc8rnH2UhC8TCg
提取码:tvmi
-- User visit action table, stored as tab-delimited text.
-- The table is created in `sparkpractice` explicitly so that it lands in the same
-- database the LOAD statement below and the Spark job's queries refer to, regardless
-- of the session's current database. The `sparkpractice` database must already exist.
CREATE TABLE `sparkpractice`.`user_visit_action`(
  `date` string,
  `user_id` bigint,
  `session_id` string,
  `page_id` bigint,
  `action_time` string,
  `search_keyword` string,
  `click_category_id` bigint,
  `click_product_id` bigint,
  `order_category_ids` string,
  `order_product_ids` string,
  `pay_category_ids` string,
  `pay_product_ids` string,
  `city_id` bigint)
row format delimited fields terminated by '\t';
-- Load the data file from the client's local filesystem. No OVERWRITE is given, so
-- running this statement again adds the file's rows a second time.
load data local inpath '/home/hadoop/data/sparksql/user_visit_action.txt' into table sparkpractice.user_visit_action;
-- Product lookup table, stored as tab-delimited text.
-- Created in `sparkpractice` explicitly so that it matches the database named in the
-- LOAD statement below, regardless of the session's current database. The
-- `sparkpractice` database must already exist.
CREATE TABLE `sparkpractice`.`product_info`(
  `product_id` bigint,
  `product_name` string,
  `extend_info` string)
row format delimited fields terminated by '\t';
-- Load the data file from the client's local filesystem. No OVERWRITE is given, so
-- running this statement again adds the file's rows a second time.
load data local inpath '/home/hadoop/data/sparksql/product_info.txt' into table sparkpractice.product_info;
-- City lookup table (city id, city name, area), stored as tab-delimited text.
-- Created in `sparkpractice` explicitly so that it matches the database named in the
-- LOAD statement below, regardless of the session's current database. The
-- `sparkpractice` database must already exist.
CREATE TABLE `sparkpractice`.`city_info`(
  `city_id` bigint,
  `city_name` string,
  `area` string)
row format delimited fields terminated by '\t';
-- Load the data file from the client's local filesystem. No OVERWRITE is given, so
-- running this statement again adds the file's rows a second time.
load data local inpath '/home/hadoop/data/sparksql/city_info.txt' into table sparkpractice.city_info;
/**
 * Prints, for every area, the three products with the highest click count, together
 * with the per-city click share string produced by the `cityCount` aggregate function.
 *
 * The job reads the Hive tables `sparkpractice.city_info`,
 * `sparkpractice.user_visit_action` and `sparkpractice.product_info`.
 */
object Spark06_Project {

  /**
   * Runs the report on a local Spark session with Hive support and shows the result
   * on standard output.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // The runtime class of a Scala object ends with '$'; strip it for a clean application name.
    val appName: String = this.getClass.getSimpleName.stripSuffix("$")
    val conf: SparkConf = new SparkConf().setAppName(appName).setMaster("local[*]")
    val spark: SparkSession = SparkSession.builder().enableHiveSupport().config(conf).getOrCreate()

    // Stop the session even when one of the queries fails, so it is never leaked.
    try {
      // Register the custom aggregate function under the name used in the SQL below.
      spark.udf.register("cityCount", new CityClickUDAF)

      // 1. Join the three tables to obtain area, city name and product name per click.
      //    The inner join on click_product_id keeps only actions that match a product.
      spark.sql(
        """
          |SELECT
          |area,city_name,product_name
          |FROM
          |sparkpractice.city_info c
          |JOIN sparkpractice.user_visit_action u
          |ON c.city_id = u.city_id
          |JOIN sparkpractice.product_info p
          |ON u.click_product_id = p.product_id
        """.stripMargin).createOrReplaceTempView("t1")

      // 2. Group by area and product: total clicks plus the per-city share string.
      spark.sql(
        """
          |SELECT
          |area,product_name,count(*) click_count,cityCount(city_name) city_count
          |FROM t1
          |GROUP BY area,product_name
        """.stripMargin).createOrReplaceTempView("t2")

      // 3. Rank the products inside each area by click count, highest first.
      //    ROW_NUMBER gives distinct ranks, so products with equal counts are ordered arbitrarily.
      spark.sql(
        """
          |SELECT *,
          |ROW_NUMBER() OVER(PARTITION BY area ORDER BY click_count DESC) rk
          |FROM t2
        """.stripMargin).createOrReplaceTempView("t3")

      // 4. Keep the top three products of every area and print them untruncated.
      spark.sql(
        """
          |SELECT *
          |FROM t3
          |WHERE t3.rk<=3
        """.stripMargin).show(false)
    } finally {
      spark.stop()
    }
  }
}
//4.自定义UDAF函数,传入城市名,返回每个城市的点击次数占比
/**
 * Aggregate function that receives city names and returns a string describing the
 * click share of the two most-clicked cities, followed by a "其他" (others) entry when
 * more than two distinct cities were seen.
 *
 * Each entry is rendered through `CityRemark`, which is defined outside this excerpt;
 * its `toString` presumably controls the exact output format (confirm there).
 *
 * NOTE(review): `UserDefinedAggregateFunction` is deprecated from Spark 3.0 onwards in
 * favour of `Aggregator`; consider migrating if the project targets Spark 3.
 */
class CityClickUDAF extends UserDefinedAggregateFunction {

  /** Input: a single string column holding the city name. */
  override def inputSchema: StructType =
    StructType(Array(StructField("cityName", StringType)))

  /** Buffer: clicks per city (map) and the total number of clicks counted so far. */
  override def bufferSchema: StructType =
    StructType(Array(
      StructField("city_count", MapType(StringType, LongType)),
      StructField("totalCount", LongType)
    ))

  /** Output: the formatted share string. */
  override def dataType: DataType = StringType

  /** The same input always yields the same output (ties are broken by city name). */
  override def deterministic: Boolean = true

  /** Starts with an empty per-city map and a total of zero. */
  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = Map[String, Long]()
    buffer(1) = 0L
  }

  /**
   * Folds one input row into the buffer: increments the city's counter and the total.
   *
   * Rows whose city name is NULL are skipped, because a NULL cannot be used as a key
   * of the MapType buffer; they therefore do not contribute to the total either.
   */
  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    if (!input.isNullAt(0)) {
      val cityName: String = input.getString(0)
      val counts = buffer.getMap[String, Long](0)
      // The map read from the buffer is not modified in place; a new map is stored back.
      buffer(0) = counts + (cityName -> (counts.getOrElse(cityName, 0L) + 1L))
      buffer(1) = buffer.getLong(1) + 1L
    }
  }

  /** Merges two partial buffers: per-city counters are summed key by key, totals are added. */
  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    val map1 = buffer1.getMap[String, Long](0)
    val map2 = buffer2.getMap[String, Long](0)
    buffer1(0) = map1.foldLeft(map2)(
      (merged, cityCount) => {
        merged + (cityCount._1 -> (merged.getOrElse(cityCount._1, 0L) + cityCount._2))
      }
    )
    buffer1(1) = buffer1.getLong(1) + buffer2.getLong(1)
  }

  /**
   * Builds the final string: the two cities with the most clicks and their share of
   * the total, plus a "其他" entry holding the remaining share when more than two
   * cities exist. An empty buffer yields an empty string.
   */
  override def evaluate(buffer: Row): Any = {
    val cityCounts = buffer.getMap[String, Long](0)
    val totalCount: Long = buffer.getLong(1)

    // Sort by clicks descending; the city name is a tie-breaker so that the result
    // does not depend on the iteration order of the map.
    val topCities: List[CityRemark] = cityCounts.toList
      .sortBy { case (city, count) => (-count, city) }
      .take(2)
      .map { case (city, count) => CityRemark(city, count.toDouble / totalCount) }

    val remarks: List[CityRemark] =
      if (cityCounts.size > 2) {
        // Whatever share is not covered by the top cities belongs to "others".
        val otherRatio: Double = topCities.foldLeft(1D)(_ - _.cityRatio)
        topCities :+ CityRemark("其他", otherRatio)
      } else {
        topCities
      }

    remarks.mkString(",")
  }
}