fastspark | Per-Province Ad TopN Statistics with SparkCore and SparkSQL

Overview

This post shows how to find the three most-clicked ad IDs for each province, implemented first with SparkCore and then with SparkSQL. The test data is as follows:

province id	ad id
1	100
1	100
1	100
1	112
1	101
1	112
1	102
1	102
1	103
1	112
1	112
1	101
1	112
2	100
2	121
2	101
2	121
2	104
2	121
2	111
2	104
2	103
2	111
2	121
2	104
3	121
3	112
3	112
3	121
3	100
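Tallying the sample by hand, province 1's top three are ad 112 (5 clicks), ad 100 (3), and then a tie at 2 clicks between ads 101 and 102; province 2's are ad 121 (4), ad 104 (3), and ad 111 (2); province 3 has only three distinct ads: 121 (2), 112 (2), and 100 (1). Both implementations below should reproduce these counts, although the order of tied ads may vary between runs.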

SparkCore


import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}

import scala.collection.mutable.ArrayBuffer

/**
  * Program: fastspark
  * Description: Created by felahong on 2020/4/15 12:03
  * Computes the top 3 clicked ads for each province.
  */

case class AdClick(province: Int, ad: Int)

object ProvinceAdTopThree {

  def main(args: Array[String]): Unit = {

    val conf = new SparkConf().setAppName("province-ad-count").setMaster("local[*]")
    val sc = SparkContext.getOrCreate(conf)
    sc.setLogLevel("WARN")

    // Read the data file
    val logRdd = sc.textFile("hdfs://felahong:9000/test/pro_ad_tmp.txt", 2)
//    logRdd.foreach(println)

    // Wrap each line in an AdClick record
    val adClickRdd: RDD[AdClick] = logRdd.map(line => {
      val arr: Array[String] = line.split("\t").filter(_.length > 0)
//      println(arr.length)
      AdClick(arr(0).toInt, arr(1).toInt)
    })

    val proAd2CountRdd: RDD[(String, Int)] = adClickRdd
      .map(adClick => (adClick.province + "_" + adClick.ad, 1))
      .reduceByKey(_ + _) // (pro_ad, sum)

    // Re-key by province id and collect every (adid, sum) pair of that province
    val pro2AdsRdd = proAd2CountRdd.map { case (proAd, sum) =>
      val arr = proAd.split("_")
      (arr(0), (arr(1), sum))
    }.groupByKey() // (proid, [(adid, sum), (adid, sum), ...])
    pro2AdsRdd.foreach(println)

    // flatMap must return a collection-like (TraversableOnce) type
    val res: RDD[String] = pro2AdsRdd.flatMap { case (pro, items) =>
      val topThree = items.toList
        .sortWith(_._2 > _._2)
        .take(3)
      for ((ad, count) <- topThree) yield s"$pro $ad $count"
    }
//    // Equivalent alternative using a mutable ArrayBuffer:
//    val res = pro2AdsRdd.flatMap{ case(pro, items)=>
//      val filterItems = items.toList.sortWith(_._2 > _._2).take(3).toArray
//      val result = new ArrayBuffer[String]()
//      for(item <- filterItems){
//        result += (pro + " " + item._1 + " " + item._2)
//      }
//      result
//    }

    // Print the per-province top 3 results
    println()
    res.foreach(println)

    sc.stop()
  }

}
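groupByKey materializes every (adid, count) pair of a province on a single executor before sorting, which is fine for this sample but can be memory-heavy for skewed data. As a rough alternative sketch (not part of the original program; it assumes the proAd2CountRdd value and the imports from the code above), aggregateByKey can keep only a running top 3 per partition:

// Alternative sketch: keep only the current top 3 per province while aggregating,
// instead of collecting all (adid, count) pairs first. Assumes proAd2CountRdd from above.
val topThreeRdd: RDD[(String, List[(String, Int)])] = proAd2CountRdd
  .map { case (proAd, sum) =>
    val Array(pro, ad) = proAd.split("_")
    (pro, (ad, sum))
  }
  .aggregateByKey(List.empty[(String, Int)])(
    (acc, item) => (item :: acc).sortWith(_._2 > _._2).take(3),  // fold one value into a partition-local top 3
    (acc1, acc2) => (acc1 ++ acc2).sortWith(_._2 > _._2).take(3) // merge partial top-3 lists across partitions
  )

topThreeRdd.foreach(println)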

SparkSQL

import org.apache.spark.sql.SparkSession
import org.apache.spark.SparkConf

/**
  * Program: fastspark
  * Description: Created by felahong on 2020/4/15 16:49
  * Computes the top 3 clicked ads for each province with SparkSQL.
  */

object ProvinceAdTopThreeSQL extends App {

  private val conf: SparkConf = new SparkConf().setMaster("local[*]").setAppName("province-ad-count-sql")
  val ss = SparkSession.builder().config(conf).getOrCreate()
  val sc = ss.sparkContext

  sc.setLogLevel("WARN")

  import ss.implicits._

  val inpath = "hdfs://felahong:9000/test/pro_ad_tmp.txt"

  // Parse each tab-separated line into (province id, ad id) and convert it to a DataFrame
  val logDf = sc.textFile(inpath).map { line =>
    val arr = line.split("\t")
    (arr(0), arr(1))
  }.toDF("proid", "adid")
    .cache()

  logDf.createOrReplaceTempView("ad_log")

  val hql =
    """select proid, adid, clickCount from (
      |  select proid, adid, clickCount,
      |         row_number() over (partition by proid order by clickCount desc) as rnk
      |  from (
      |    select proid, adid, count(*) as clickCount from ad_log group by proid, adid
      |  ) t1
      |) t
      |where rnk <= 3
    """.stripMargin

  val res = ss.sql(hql).cache()
  res.show()

  // Stopping the SparkSession also stops the underlying SparkContext
  ss.stop()

}
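The same ranking can also be expressed with the DataFrame API instead of a SQL string. A minimal sketch, assuming the logDf DataFrame and the imports of the object above are in scope (variable names here are only illustrative):

import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{col, row_number}

// Sketch of a DataFrame-API equivalent of the SQL query above.
val win = Window.partitionBy("proid").orderBy(col("clickCount").desc)
val topThreeDf = logDf
  .groupBy("proid", "adid")
  .count()                                  // adds a "count" column
  .withColumnRenamed("count", "clickCount")
  .withColumn("rnk", row_number().over(win))
  .where(col("rnk") <= 3)
  .drop("rnk")

topThreeDf.show()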
