电商用户行为分析__热门商品实时统计

需求:

统计一小时的热门商品,每5分钟更新一次
热门度用浏览次数(pv)来衡量 

解题思路:

1、创建执行环境,读取数据源,并筛选出浏览行为(pv)的数据;

2、开一个长度为1小时、每5分钟滑动(输出)一次的滑动窗口;

3、分组,增量聚合各个商品ID的浏览次数;

4、按窗口结束时间重新分组,将同一窗口内各商品的统计结果存入集合,按浏览次数从大到小排序后取前N名输出。

实现代码:

》》》》》 

package Data_text

import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.api.common.state.{ListState, ListStateDescriptor}
import org.apache.flink.api.java.tuple.{Tuple, Tuple1}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

import java.sql.Timestamp
import scala.collection.mutable.ListBuffer

/**
 * One input record; the fields mirror, in order, the columns of the source CSV.
 *
 * @param userId     id of the acting user (CSV column 0)
 * @param itemId     id of the item acted upon (CSV column 1)
 * @param categoryId id of the item's category (CSV column 2)
 * @param behavior   behavior tag (CSV column 3); the job keeps only records tagged "pv"
 * @param timestamp  event time (CSV column 4); the job multiplies it by 1000 before using it
 *                   as a Flink timestamp, so it is presumably epoch seconds — confirm against the data
 */
case class UserBehavior1(
  userId: Long,
  itemId: Long,
  categoryId: Int,
  behavior: String,
  timestamp: Long
)
/**
 * Result of the per-item window aggregation.
 *
 * @param itemId    id of the item that was counted
 * @param windowEnd end timestamp of the window the count belongs to (taken from `TimeWindow.getEnd`)
 * @param count     number of records counted for the item in that window
 */
case class ItemViewCount1(
  itemId: Long,
  windowEnd: Long,
  count: Long
)

/**
 * Entry point of the hot-items job.
 *
 * Pipeline: read CSV lines -> parse into [[UserBehavior1]] -> keep "pv" records ->
 * count per item over a 1-hour window sliding every 5 minutes -> regroup the counts by
 * window end -> print the top 10 items of each window.
 */
object pv_2 {

  // Input file used when no path is supplied on the command line
  // (this is the location the job was originally hard-wired to).
  private val DefaultInputPath = "E:\\idea项目\\untitled4\\src\\main\\resources\\UserBehavior.csv"

  /**
   * Builds and runs the job.
   *
   * @param args optional; `args(0)`, when present, is the path of the CSV file to read.
   *             Without arguments the job falls back to [[DefaultInputPath]].
   */
  def main(args: Array[String]): Unit = {
    // Set up the execution environment.
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // Single parallel task; the author's stated intent is to avoid out-of-order records.
    env.setParallelism(1)
    // Windows and timers are driven by event time, not processing time.
    env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)

    // Read the raw lines; the path is configurable so the job is not tied to one machine.
    val inputPath = args.headOption.getOrElse(DefaultInputPath)
    val value = env.readTextFile(inputPath)

    // Parse each comma-separated line into a UserBehavior1 and assign event-time timestamps.
    val stream = value
      .map(data => {
        val arr = data.split(",")
        UserBehavior1(
          userId = arr(0).toLong,
          itemId = arr(1).toLong,
          categoryId = arr(2).toInt,
          behavior = arr(3),
          timestamp = arr(4).toLong)
      })
      // The record timestamp is scaled by 1000 to obtain the millisecond value Flink expects.
      .assignAscendingTimestamps(_.timestamp * 1000L)

    // Per item: count "pv" records in a 1-hour window that slides every 5 minutes.
    val stream2 = stream
      .filter(_.behavior == "pv")
      .keyBy(_.itemId)
      .timeWindow(Time.hours(1), Time.minutes(5))
      .aggregate(new CountAgg1, new ItemViewCountWindowResult1())

    // Regroup the per-item counts by window end and rank them (top 10 per window).
    val outputStream = stream2
      .keyBy(_.windowEnd)
      .process(new TopNHostItem1(10))

    outputStream.print("")
    env.execute("")

  }

}
/**
 * Incremental counter used as the window pre-aggregator: the accumulator is simply the
 * number of records seen so far, and the result is that number.
 */
class CountAgg1() extends AggregateFunction[UserBehavior1, Long, Long] {

  /** Counting starts from zero. */
  override def createAccumulator(): Long = {
    0L
  }

  /** Each record adds one to the running count; the record's content is not inspected. */
  override def add(in: UserBehavior1, acc: Long): Long = {
    acc + 1
  }

  /** The accumulator already is the final count. */
  override def getResult(acc: Long): Long = {
    acc
  }

  /** Two partial counts combine by addition. */
  override def merge(acc: Long, acc1: Long): Long = {
    acc + acc1
  }
}

/**
 * Window function that wraps the pre-aggregated count of one item into an
 * [[ItemViewCount1]], attaching the item id (the key) and the window's end timestamp.
 */
class ItemViewCountWindowResult1() extends WindowFunction[Long, ItemViewCount1, Long, TimeWindow] {

  /**
   * @param key    the item id the window is keyed by
   * @param window the window being emitted; only its end timestamp is used
   * @param input  the aggregated values for the window; only the first element is read
   * @param out    collector receiving the single result record
   */
  override def apply(key: Long, window: TimeWindow, input: Iterable[Long], out: Collector[ItemViewCount1]): Unit = {
    val viewCount = input.iterator.next()
    out.collect(ItemViewCount1(itemId = key, windowEnd = window.getEnd, count = viewCount))
  }
}

/**
 * Ranks the per-item counts of one window and emits a formatted top-N report.
 *
 * The stream is keyed by window end. Every incoming [[ItemViewCount1]] is buffered in
 * keyed list state, and an event-time timer is registered for `windowEnd + 1` ms. When the
 * timer fires, the buffered counts are sorted by count in descending order, the first
 * `i` entries are formatted into one string, and the state is cleared.
 *
 * @param i number of top entries to include in each report
 */
class TopNHostItem1(i: Int) extends KeyedProcessFunction[Long, ItemViewCount1, String] {

  // Capture the constructor argument under a descriptive name: `processElement` has a
  // parameter that is also called `i` and would otherwise shadow it.
  private val topSize: Int = i

  // Buffers every count record of the current key until that key's timer fires; all
  // records must be kept because the ranking can only be computed once they are all in.
  var listState: ListState[ItemViewCount1] = _

  /** Obtains the list state handle (registered under the name "list") from the runtime context. */
  override def open(parameters: Configuration): Unit = {
    listState = getRuntimeContext.getListState(
      new ListStateDescriptor[ItemViewCount1]("list", classOf[ItemViewCount1]))
  }

  /**
   * Buffers the record and schedules the ranking for its window.
   *
   * @param i         the per-item count record to buffer
   * @param context   gives access to the timer service
   * @param collector unused here; output is produced in `onTimer`
   */
  override def processElement(i: ItemViewCount1, context: KeyedProcessFunction[Long, ItemViewCount1, String]#Context, collector: Collector[String]): Unit = {
    // Append the record to the buffered state.
    listState.add(i)
    // Register an event-time timer 1 millisecond after the window end (timestamps are in ms).
    context.timerService().registerEventTimeTimer(i.windowEnd + 1)
  }

  /**
   * Builds and emits the top-N report for the window whose timer fired.
   *
   * @param timestamp the timer's timestamp, i.e. window end + 1 ms
   * @param ctx       timer context (unused)
   * @param out       collector receiving the formatted report
   */
  override def onTimer(timestamp: Long, ctx: KeyedProcessFunction[Long, ItemViewCount1, String]#OnTimerContext, out: Collector[String]): Unit = {
    import scala.collection.JavaConverters._

    // Copy the buffered records out of state, then clear the state so it does not
    // linger after the report has been produced.
    val buffered = listState.get().asScala.toList
    listState.clear()

    // Sort ascending by count, reverse to get descending order, keep the first N.
    val counts = buffered.sortBy(_.count).reverse.take(topSize)

    // Format the report; the header shows the window end (timer timestamp minus the 1 ms offset).
    val result = new StringBuilder
    result.append("窗口结束时间:").append(new Timestamp(timestamp - 1)).append("\n")

    // One line per ranked item, ranks starting at 1.
    counts.zipWithIndex.foreach { case (entry, idx) =>
      result.append("NO:").append(idx + 1).append("\t")
        .append("商品id = ").append(entry.itemId).append("\t")
        .append("访问量: ").append(entry.count).append("\n")
    }

    // Separator followed by a blank line between reports.
    result.append("---------------------------------------\n\n")

    // NOTE: the report is emitted immediately. Sleeping here would block the operator
    // thread on every timer firing and stall the rest of the pipeline.
    out.collect(result.toString())
  }

}

运行结果:

》》》》》

电商用户行为分析__热门商品实时统计_第1张图片

你可能感兴趣的:(scala,开发语言,flink)