Flink求TopN

一、数据字段如下:

 

字段名

数据类型

说明

userId

Long

加密后的用户ID

itemId

Long

加密后的商品ID

categoryId

Int

加密后的商品所属类别ID

behavior

String

用户行为类型,包括('pv', 'buy', 'cart', 'fav')

timestamp

Long

行为发生的时间戳,单位秒

 

数据样本如下:

82170,3588374,2465336,pv,1511658004
587599,2067643,4818107,cart,1511658004
367451,15775,4756105,pv,1511658004
428316,2478780,4284875,pv,1511658004
284910,3680091,3829657,pv,1511658004
345119,737662,4357323,pv,1511658004
551442,1762997,1879194,pv,1511658004
550384,3908776,1029459,pv,1511658004
677500,4534693,2640118,pv,1511658004
398626,2791489,1467750,pv,1511658004
118053,3545571,2433095,pv,1511658005
457401,4063698,4801426,pv,1511658005
45105,3234847,3141941,fav,1511658005
604760,2661651,3738615,pv,1511658005
905383,2064903,2939262,cart,1511658005
740788,3657484,4936889,pv,1511658005
456838,1242724,4756105,fav,1511658005
585217,215764,2640118,pv,1511658006
658185,4025021,4048584,fav,1511658006
210431,2035568,2328673,pv,1511658006

 

二、需求:每隔5分钟输出最近一小时内点击量最多的前N个商品。

三、需求分析:

  1. 抽取出业务时间戳,告诉Flink框架基于业务时间做窗口
  2. 过滤出点击行为数据
  3. 按一小时的窗口大小,每5分钟统计一次,做滑动窗口聚合(Sliding Window)
  4. 按每个窗口聚合,输出每个窗口中点击量前N名的商品

四、代码实现

数据从kafka到Flink

import java.sql.Timestamp
import java.util.Properties

import org.apache.flink.api.common.functions.AggregateFunction
import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.api.common.state.{ListState, ListStateDescriptor}
import org.apache.flink.api.java.tuple.{Tuple, Tuple1}
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.TimeCharacteristic
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.scala.function.WindowFunction
import org.apache.flink.streaming.api.windowing.time.Time
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer
import org.apache.flink.util.Collector

import scala.collection.mutable.ListBuffer
/**
  * @author fczheng 
  *  2019-11-08 15:39
  */
object HotItems {
    def main(args: Array[String]): Unit = {

        // Event-time streaming environment; parallelism 1 so printed results
        // come out in a single, deterministic order.
        val env = StreamExecutionEnvironment.getExecutionEnvironment
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime)
        env.setParallelism(1)

        // Alternative: read the sample data from a local CSV file instead of Kafka.
        //val data = env.readTextFile("E:\\develop\\workspace\\myidea\\UserBehaviorAnalysis\\HotItemsAnalysis\\src\\main\\resources\\UserBehavior.csv")

        // Kafka consumer configuration.
        val kafkaProps = new Properties()
        kafkaProps.setProperty("bootstrap.servers", "hadoop102:9092,hadoop103:9092")
        kafkaProps.setProperty("group.id", "consumer-group")
        kafkaProps.setProperty("key.deserializer",
            "org.apache.kafka.common.serialization.StringDeserializer")
        kafkaProps.setProperty("value.deserializer",
            "org.apache.kafka.common.serialization.StringDeserializer")
        kafkaProps.setProperty("auto.offset.reset", "latest")

        val source = env.addSource(
            new FlinkKafkaConsumer[String]("hotitems", new SimpleStringSchema(), kafkaProps))

        // Parse each CSV line into a UserBehavior record:
        // userId,itemId,categoryId,behavior,timestamp(seconds)
        val behaviors: DataStream[UserBehavior] = source.map { line =>
            val fields = line.split(",")
            UserBehavior(
                fields(0).toLong,
                fields(1).toLong,
                fields(2).toInt,
                fields(3),
                fields(4).toLong)
        }

        // Source timestamps are ascending, so the simple ascending extractor is
        // enough to produce watermarks (seconds -> milliseconds).
        val topN = behaviors
            .assignAscendingTimestamps(_.timeStamp * 1000)
            .filter(_.behavior == "pv")                        // keep click events only
            .keyBy(_.itemId)
            .timeWindow(Time.minutes(60), Time.minutes(5))     // 1h window, sliding every 5min
            .aggregate(new CountAgg(), new WindowResultFunction())
            .keyBy(_.windowEnd)                                // regroup counts by window
            .process(new TopNHostItems(3))

        topN.print()

        env.execute()
    }
}
// Custom KeyedProcessFunction that ranks the top-N hot items per window.
// The stream is keyed by windowEnd (the window's end timestamp); once every
// count for a window has arrived, a formatted ranking string is emitted.
class TopNHostItems(topSize: Int) extends
        KeyedProcessFunction[Long,ItemViewCount,String]{

    // ListState buffering every ItemViewCount seen for the current key (windowEnd).
    private var itemState : ListState[ItemViewCount] = _

    override def open(parameters: Configuration): Unit = {

        super.open(parameters)

        // Declare the managed state by name and element type.
        val itemsStateDesc = new ListStateDescriptor[ItemViewCount]("itemState-state",classOf[ItemViewCount])

        // Obtain the state handle from the runtime context.
        itemState = getRuntimeContext.getListState(itemsStateDesc)

    }

    override def processElement(input: ItemViewCount,
                                context: KeyedProcessFunction[Long, ItemViewCount, String]#Context,
                                collector: Collector[String]): Unit = {

        // Buffer each incoming per-item count in state.
        itemState.add(input)

        // Register an event-time timer at windowEnd + 1: when the watermark
        // passes windowEnd, all ItemViewCount records for this window have
        // arrived, and onTimer fires to sort and emit. Registering the same
        // timestamp repeatedly is de-duplicated by Flink.
        context.timerService().registerEventTimeTimer(input.windowEnd + 1)
    }

    override def onTimer(timestamp: Long,
                         ctx: KeyedProcessFunction[Long, ItemViewCount, String]#OnTimerContext,
                         out: Collector[String]): Unit = {
        // The timer fired, so all data for the window is in; copy it out of state.
        // Use JavaConverters with an explicit .asScala instead of the deprecated
        // implicit scala.collection.JavaConversions.
        import scala.collection.JavaConverters._
        val allItems: ListBuffer[ItemViewCount] = new ListBuffer()

        for(item <- itemState.get.asScala){
            allItems += item
        }

        // Clear the state early to release space.
        itemState.clear()

        // Sort by click count, descending, and keep only the top N.
        val sortedItems: ListBuffer[ItemViewCount] =
            allItems.sortBy(_.count)(Ordering.Long.reverse).take(topSize)

        // Format the ranking as a printable string.
        val result: StringBuilder = new StringBuilder
        result.append("====================================\n")
        result.append("时间: ").append(new Timestamp(timestamp - 1)).append("\n")

        for(i <- sortedItems.indices){
            val currentItem: ItemViewCount = sortedItems(i)
            // e.g.  No1:  商品ID=12224  浏览量=2413
            result.append("No").append(i+1).append(":")
                    .append("  商品ID=").append(currentItem.itemId)
                    .append("  浏览量=").append(currentItem.count).append("\n")
        }
        result.append("====================================\n\n")

        // Deliberate throttle so the rolling results are readable when printed.
        // NOTE(review): Thread.sleep blocks the task thread — remove for production use.
        Thread.sleep(1000)
        out.collect(result.toString())

    }
}

// Incremental pre-aggregation: adds one per element, so each window keeps a
// single Long accumulator instead of buffering every record.
class CountAgg extends AggregateFunction[UserBehavior, Long, Long] {
    // Start counting from zero.
    override def createAccumulator(): Long = 0L

    // Each incoming record bumps the count by one; the record content is ignored.
    override def add(in: UserBehavior, acc: Long): Long = acc + 1L

    // The accumulator itself is the final count.
    override def getResult(acc: Long): Long = acc

    // Merging two partial counts is plain addition.
    override def merge(acc1: Long, acc2: Long): Long = acc1 + acc2
}

// Wraps each window's aggregated count into an ItemViewCount, attaching the
// item id (the key) and the window end timestamp.
class WindowResultFunction extends WindowFunction[Long, ItemViewCount, Long, TimeWindow] {

    override def apply(
                      key: Long,
                      window: TimeWindow,
                      aggregateResult: Iterable[Long],
                      collector: Collector[ItemViewCount]) : Unit = {

        // The pre-aggregator (CountAgg) emits exactly one count per (key, window),
        // so the iterable holds a single element.
        val clickCount: Long = aggregateResult.iterator.next()
        collector.collect(ItemViewCount(key, window.getEnd, clickCount))
    }
}

// Input record: one user-behavior event parsed from a CSV line.
case class UserBehavior(userId: Long,       // encrypted user id
                        itemId: Long,       // encrypted item id
                        categoryId: Int,    // encrypted category id of the item
                        behavior: String,   // one of "pv", "buy", "cart", "fav"
                        timeStamp: Long)    // event time in seconds since epoch

// Intermediate result: click count of one item within one window.
case class ItemViewCount(itemId: Long,      // encrypted item id
                         windowEnd: Long,   // end timestamp (ms) of the window
                         count: Long)       // number of "pv" events in the window

五、结果展示

====================================
时间: 2017-11-26 10:20:00.0
No1:  商品ID=2218996  浏览量=1
No2:  商品ID=1994111  浏览量=1
No3:  商品ID=1469184  浏览量=1
====================================


====================================
时间: 2017-11-26 10:25:00.0
No1:  商品ID=2338453  浏览量=5
No2:  商品ID=3784551  浏览量=4
No3:  商品ID=3919770  浏览量=3
====================================


====================================
时间: 2017-11-26 10:30:00.0
No1:  商品ID=2338453  浏览量=7
No2:  商品ID=3034696  浏览量=6
No3:  商品ID=3784551  浏览量=5
====================================


====================================
时间: 2017-11-26 10:35:00.0
No1:  商品ID=2338453  浏览量=12
No2:  商品ID=3034696  浏览量=7
No3:  商品ID=291932  浏览量=6
====================================


====================================
时间: 2017-11-26 10:40:00.0
No1:  商品ID=2338453  浏览量=13
No2:  商品ID=291932  浏览量=9
No3:  商品ID=279675  浏览量=8
====================================


====================================
时间: 2017-11-26 10:45:00.0
No1:  商品ID=2338453  浏览量=14
No2:  商品ID=279675  浏览量=11
No3:  商品ID=291932  浏览量=11
====================================

六、pom依赖


    <properties>
        <flink.version>1.7.2</flink.version>
        <scala.binary.version>2.11</scala.binary.version>
        <kafka.version>2.2.0</kafka.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-scala_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka_${scala.binary.version}</artifactId>
            <version>${kafka.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
    </dependencies>

 

七、插件

 
    <build>
        <plugins>
            <!-- compiles the Scala sources -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.4.6</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>

            <!-- builds a fat jar including all dependencies -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.0.0</version>
                <configuration>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

 

你可能感兴趣的:(Flink)