Business scenario: Flink consumes Kafka data and writes it to MySQL in near real time. Login events have to be split into login, daily-active, and new-user records that land in three separate MySQL tables, so side outputs are used to split the stream into multiple streams, each processed on its own.
Main class

import com.alibaba.fastjson.JSON
import org.apache.flink.api.common.restartstrategy.RestartStrategies
import org.apache.flink.streaming.api.{CheckpointingMode, TimeCharacteristic}
import org.apache.flink.streaming.api.scala._
import org.apache.flink.streaming.api.windowing.time.Time
import org.slf4j.LoggerFactory
// Project-internal helpers (ConfigManager, Constants, KafkaUtils, DateUtils, CpMySQLSink, ...)
// come from their own packages and are omitted here.

/**
 * Flink reads from Kafka, aggregates the data once per second and batch-writes it to MySQL
 *
 * create by LiuJinHe 2020/5/26
 */
object CpDataKafkaToMySQL {
  private val logger = LoggerFactory.getLogger(this.getClass)

  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    // Single thread for local testing
    // env.setParallelism(1)
    // Restart strategy: fixed delay, at most 10 attempts, 3 seconds (3000 ms) between attempts
    env.setRestartStrategy(RestartStrategies.fixedDelayRestart(10, 3000))
    // Processing time: event handling time is determined by the system clock
    env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime)
    // Checkpoint every 60 seconds
    env.enableCheckpointing(60000)
    val config = env.getCheckpointConfig
    // Checkpointing mode: EXACTLY_ONCE
    config.setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE)
    // Minimum pause between checkpoints
    config.setMinPauseBetweenCheckpoints(1000)
    // Checkpoint timeout: 60 seconds, after which the checkpoint is discarded
    config.setCheckpointTimeout(60000)
    // Allow only one checkpoint at a time
    config.setMaxConcurrentCheckpoints(1)

    val kafkaProp = ConfigManager.load(Constants.CP_KAFKA_PROP)
    val kafkaConsumer = KafkaUtils.getConsumer(kafkaProp).setStartFromGroupOffsets()
    val dataKafkaStream = env.addSource(kafkaConsumer).name("cp kafka source")

    val dataStream = dataKafkaStream.map(
      json => {
        val sdkData =
          try
            JSON.parseObject(json, classOf[CpData])
          catch {
            case ex: Throwable =>
              val str =
                s"""
                   |${DateUtils.timestampToTime(System.currentTimeMillis())} Exception: ${ex.getMessage}
                   |============ illegal json:
                   |$json
                 """.stripMargin
              logger.info(str)
              CpData("none", 0, 0, "")
          }
        sdkData
      })
      .filter(line =>
        line.key != "none" && line.cp_game_id != 0 &&
          (line.key == "role_create" || line.key == "cp_role_login" || line.key == "role_pay")
      )

    // Define the side output tags
    val roleActiveSide = new OutputTag[CpData]("role_active")
    val roleNewUserSide = new OutputTag[CpData]("role_new_user")

    val mainDataStream = dataStream
      .keyBy(data => data.key)
      .process(new CpProcess(roleActiveSide, roleNewUserSide))

    // Side output stream: daily active
    val roleActiveSideStream = mainDataStream.getSideOutput(roleActiveSide)
    roleActiveSideStream
      .timeWindowAll(Time.seconds(1))
      .apply(new CpWindow)
      .addSink(new CpMySQLSink).name("active side stream mysql sink")

    // Side output stream: new users
    val roleNewUserSideStream = mainDataStream.getSideOutput(roleNewUserSide)
    roleNewUserSideStream
      .timeWindowAll(Time.seconds(1))
      .apply(new CpWindow)
      .addSink(new CpMySQLSink).name("new user side stream mysql sink")

    // Main stream: logins
    mainDataStream
      .timeWindowAll(Time.seconds(1))
      .apply(new CpWindow)
      .addSink(new CpMySQLSink).name("cp main stream mysql sink")

    env.execute("cp K to M stream job")
  }
}
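For reference, KafkaUtils.getConsumer used above was covered in the earlier post and is not shown here; the following is only a minimal sketch of what such a helper might look like, assuming ConfigManager.load returns a java.util.Properties and a "topic" property key (both are assumptions):

import java.util.Properties

import org.apache.flink.api.common.serialization.SimpleStringSchema
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer

object KafkaUtils {
  // Sketch only: build a plain string consumer from the loaded properties
  // (the "topic" key is a placeholder; the real helper may differ)
  def getConsumer(prop: Properties): FlinkKafkaConsumer[String] = {
    val topic = prop.getProperty("topic")
    new FlinkKafkaConsumer[String](topic, new SimpleStringSchema(), prop)
  }
}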
The Kafka messages are JSON, so a case class is defined for them. Reading Kafka with Flink was covered in an earlier post.
/**
 * Input case class
 */
case class CpData(key: String, cp_game_id: Int, time: Long, data: String)
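As an illustration of the expected payload (the field values below are made up), a raw message of this shape is what the map function above hands to fastjson:

import com.alibaba.fastjson.JSON

object CpDataParseExample {
  def main(args: Array[String]): Unit = {
    // Hypothetical sample message; the real schema is defined by the upstream SDK
    val json =
      """{"key":"cp_role_login","cp_game_id":1001,"time":1590465600000,"data":"{}"}"""
    // Same call as in the map function of the main class
    val cpData = JSON.parseObject(json, classOf[CpData])
    println(cpData)
  }
}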
Since the topic also carries other events, the function applied after keyBy is a KeyedProcessFunction.
import org.apache.flink.streaming.api.functions.KeyedProcessFunction
import org.apache.flink.streaming.api.scala.OutputTag
import org.apache.flink.util.Collector

/**
 * ProcessFunction that splits the stream by sending records to different side outputs
 *
 * create by LiuJinHe 2020/5/26
 */
class CpProcess(roleActiveSide: OutputTag[CpData], roleNewUserSide: OutputTag[CpData])
  extends KeyedProcessFunction[String, CpData, CpData] {

  override def processElement(value: CpData, ctx: KeyedProcessFunction[String, CpData, CpData]#Context, out: Collector[CpData]): Unit = {
    // Emit every record to the main stream
    out.collect(value)
    val key = ctx.getCurrentKey
    if (key == "cp_role_login") {
      // Re-emit login records to the side outputs as daily-active and new-user events
      ctx.output(roleActiveSide, value.copy(key = "role_active"))
      ctx.output(roleNewUserSide, value.copy(key = "role_new_user"))
    }
  }
}
WindowFunction
import org.apache.flink.streaming.api.scala.function.AllWindowFunction
import org.apache.flink.streaming.api.windowing.windows.TimeWindow
import org.apache.flink.util.Collector

/**
 * Custom cp window function: emits the records of each 1-second window as one batch
 *
 * create by LiuJinHe 2020/5/26
 */
class CpWindow extends AllWindowFunction[CpData, Iterable[CpData], TimeWindow] {
  override def apply(window: TimeWindow, input: Iterable[CpData], out: Collector[Iterable[CpData]]): Unit = {
    if (input.nonEmpty) {
      println("records in the 1-second window: " + input.size)
      out.collect(input)
    }
  }
}
The sink is a custom MySQL sink that was covered in an earlier post; its logic is basically unchanged.
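That sink is not reproduced here; as a rough sketch, it can be a RichSinkFunction that turns each 1-second batch into JDBC inserts. Everything below (connection settings, table and column names, the key-to-table mapping) is a placeholder, and the invoke signature matches the Flink 1.10-era API:

import java.sql.{Connection, DriverManager}

import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.functions.sink.{RichSinkFunction, SinkFunction}

class CpMySQLSink extends RichSinkFunction[Iterable[CpData]] {
  private var conn: Connection = _

  // Hypothetical mapping from event key to target table
  private val tableByKey = Map(
    "cp_role_login" -> "cp_login",
    "role_active"   -> "cp_active",
    "role_new_user" -> "cp_new_user"
  )

  override def open(parameters: Configuration): Unit = {
    // Connection parameters are placeholders; load them from config in real code
    conn = DriverManager.getConnection("jdbc:mysql://localhost:3306/cp_db", "user", "password")
    conn.setAutoCommit(false)
  }

  override def invoke(value: Iterable[CpData], context: SinkFunction.Context[_]): Unit = {
    // Group the 1-second batch by event key and write each group to its table in one JDBC batch
    value.groupBy(_.key).foreach { case (key, records) =>
      val table = tableByKey.getOrElse(key, "cp_other")
      val stmt = conn.prepareStatement(
        s"INSERT INTO $table (cp_game_id, event_key, event_time, data) VALUES (?, ?, ?, ?)")
      records.foreach { d =>
        stmt.setInt(1, d.cp_game_id)
        stmt.setString(2, d.key)
        stmt.setLong(3, d.time)
        stmt.setString(4, d.data)
        stmt.addBatch()
      }
      stmt.executeBatch()
      stmt.close()
    }
    conn.commit()
  }

  override def close(): Unit = {
    if (conn != null) conn.close()
  }
}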