Flink 侧输出流拆分流应用

业务场景:使用 Flink 消费 Kafka 数据并近实时写入 MySQL,需要将登录数据拆分为登录、日活、新增三类,分别写入三张 MySQL 表。这里采用侧输出流将一条流拆分为多条流,再分别进行处理。

主类

/**
  * Streams CP events from Kafka into MySQL in near real time.
  *
  * Pipeline, as wired in `main`:
  *  1. Consume raw JSON strings from Kafka, starting from the consumer group's offsets.
  *  2. Parse each record into `CpData`; records that cannot be parsed are logged and dropped.
  *  3. Keep only `role_create`, `cp_role_login` and `role_pay` events with a non-zero game id.
  *  4. Key by event type and run `CpProcess`, which forwards every record to the main
  *     output and additionally feeds the `role_active` / `role_new_user` side outputs.
  *  5. Batch each of the three streams in 1-second windows (`CpWindow`) and hand every
  *     batch to a `CpMySQLSink`.
  *
  * create by LiuJinHe 2020/5/26
  */
object CpDataKafkaToMySQL {
  private val logger = LoggerFactory.getLogger(this.getClass)

  /**
    * Builds the job graph and submits it.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {

    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // For local testing, run with a single thread:
    //    env.setParallelism(1)

    // Restart on failure with a fixed delay: at most 10 attempts, 3 seconds apart.
    // The (Int, Long) overload takes the delay in MILLISECONDS, so 3 seconds must be
    // written as 3000L; a bare `3` would burn through all attempts within ~30 ms.
    env.setRestartStrategy(RestartStrategies.fixedDelayRestart(10, 3000L))

    // Windows below are driven by the machine's wall clock (processing time).
    env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime)

    // Checkpoint every 60 seconds.
    env.enableCheckpointing(60000)
    val config = env.getCheckpointConfig
    // Checkpointing mode: EXACTLY_ONCE.
    config.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE)
    // Minimum pause between the end of one checkpoint and the start of the next (ms).
    config.setMinPauseBetweenCheckpoints(1000)
    // A checkpoint that takes longer than 60 seconds is discarded.
    config.setCheckpointTimeout(60000)
    // Never run more than one checkpoint at a time.
    config.setMaxConcurrentCheckpoints(1)

    val kafkaProp = ConfigManager.load(Constants.CP_KAFKA_PROP)
    val kafkaConsumer = KafkaUtils.getConsumer(kafkaProp).setStartFromGroupOffsets()

    val dataKafkaStream = env.addSource(kafkaConsumer).name("cp kafka source")

    val dataStream = dataKafkaStream
      .map { json =>
        // The parser may hand back null instead of throwing (e.g. for an empty or
        // literal-null payload), so the result is wrapped in Option: a null record
        // would otherwise blow up in the filter below and fail the whole job.
        val parsed =
          try Option(JSON.parseObject(json, classOf[CpData]))
          catch {
            // NonFatal lets OutOfMemoryError, InterruptedException etc. propagate
            // instead of being swallowed as if they were bad input.
            case scala.util.control.NonFatal(ex) =>
              val str =
                s"""
                   |${DateUtils.timestampToTime(System.currentTimeMillis())} Exception: ${ex.getMessage}
                   |============ illegal json:
                   |$json
                """.stripMargin
              logger.warn(str)
              None
          }
        // Sentinel for "not a usable record"; it is removed by the filter right below.
        parsed.getOrElse(CpData("none", 0, 0L, ""))
      }
      .filter(line =>
        line.key != "none" && line.cp_game_id != 0 &&
          (line.key == "role_create" || line.key == "cp_role_login" || line.key == "role_pay")
      )

    // Side-output tags.
    val roleActiveSide = new OutputTag[CpData]("role_active")
    val roleNewUserSide = new OutputTag[CpData]("role_new_user")

    val mainDataStream = dataStream
      .keyBy(data => data.key)
      .process(new CpProcess(roleActiveSide, roleNewUserSide))

    // Side output: daily active roles.
    val roleActiveSideStream = mainDataStream.getSideOutput(roleActiveSide)
    roleActiveSideStream
      .timeWindowAll(Time.seconds(1))
      .apply(new CpWindow)
      .addSink(new CpMySQLSink).name("active side stream mysql sink")

    // Side output: new roles.
    val roleNewUserSideStream = mainDataStream.getSideOutput(roleNewUserSide)
    roleNewUserSideStream
      .timeWindowAll(Time.seconds(1))
      .apply(new CpWindow)
      .addSink(new CpMySQLSink).name("new user side stream mysql sink")

    // Main output: every record that passed the filter above (CpProcess forwards all
    // of them unchanged), i.e. role_create, cp_role_login and role_pay events.
    mainDataStream
      .timeWindowAll(Time.seconds(1))
      .apply(new CpWindow)
      .addSink(new CpMySQLSink).name("cp main stream mysql sink")

    env.execute("cp K to M stream job")
  }
}

Kafka 数据为 json 格式,定义样例类。Flink 读取 Kafka 之前有记录过。

/**
  * One event record as deserialized from the Kafka JSON payload.
  *
  * @param key        event-type name; the job keeps `role_create`, `cp_role_login`
  *                   and `role_pay`, and uses `none` as its "unparseable" marker
  * @param cp_game_id game identifier; the job discards records where this is 0
  * @param time       event time as a number (presumably an epoch timestamp; the
  *                   unit is not shown here, confirm against the producer)
  * @param data       event payload carried as a string (format not shown here)
  */
case class CpData(
    key: String,
    cp_game_id: Int,
    time: Long,
    data: String
)

因为流中除登录外还有其他事件,所以先按事件类型(key)做 keyBy,keyBy 之后的 process 需要使用 KeyedProcessFunction。

/**
  * Keyed process function that fans one stream out into a main output plus two
  * side outputs.
  *
  * Every incoming record is forwarded unchanged on the main output. When the
  * current key is `cp_role_login`, two extra copies of the record are emitted,
  * one per side output, each with `key` rewritten to the name of its target
  * stream (`role_active` / `role_new_user`) and all other fields untouched.
  *
  * create by LiuJinHe 2020/5/26
  *
  * @param roleActiveSide  tag of the side output that receives the `role_active` copy
  * @param roleNewUserSide tag of the side output that receives the `role_new_user` copy
  */
class CpProcess(roleActiveSide: OutputTag[CpData], roleNewUserSide: OutputTag[CpData])
  extends KeyedProcessFunction[String, CpData, CpData] {

  override def processElement(
      value: CpData,
      ctx: KeyedProcessFunction[String, CpData, CpData]#Context,
      out: Collector[CpData]): Unit = {
    // The main output always carries the full, unmodified stream.
    out.collect(value)

    ctx.getCurrentKey match {
      case "cp_role_login" =>
        // Login events additionally feed both side outputs.
        ctx.output(roleActiveSide, value.copy(key = "role_active"))
        ctx.output(roleNewUserSide, value.copy(key = "role_new_user"))
      case _ =>
        // Any other event type goes to the main output only.
        ()
    }
  }
}

 

WindowFunction

/**
  * All-window function that turns the contents of one time window into a single
  * batch element for the downstream sink.
  *
  * Empty windows emit nothing. For a non-empty window the record count is printed
  * to stdout and the whole batch is emitted as one `Iterable[CpData]`.
  *
  * create by LiuJinHe 2020/5/26
  */
class CpWindow extends AllWindowFunction[CpData, Iterable[CpData], TimeWindow] {

  /**
    * @param window the window being evaluated (not inspected)
    * @param input  records collected in the window
    * @param out    collector that receives at most one batch per window
    */
  override def apply(window: TimeWindow, input: Iterable[CpData], out: Collector[Iterable[CpData]]): Unit = {
    // Materialize once: `input` is a view supplied by the runtime for this call, so it
    // is traversed a single time here and an immutable snapshot is emitted downstream
    // rather than the runtime-owned view itself.
    val batch = input.toVector
    if (batch.nonEmpty) {
      println("1 秒数内据条数: " + batch.size)
      out.collect(batch)
    }
  }
}

 

sink 为自定义MySQL Sink,之前有写过,逻辑基本不变。

 

你可能感兴趣的:(Flink)