Setting a TTL (expiration time) on Flink keyed state, recovering from checkpoints on HDFS, and configuring flink-conf.yaml

Dependencies


        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-core</artifactId>
            <version>1.8.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-clients_2.11</artifactId>
            <version>1.8.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-scala_2.11</artifactId>
            <version>1.8.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-scala_2.11</artifactId>
            <version>1.8.1</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.26</version>
        </dependency>

        <!-- HDFS client, needed to write checkpoints to hdfs:// -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>2.9.2</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>2.9.2</version>
        </dependency>

        <!-- connectors -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_2.11</artifactId>
            <version>1.8.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-json</artifactId>
            <version>1.8.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-filesystem_2.11</artifactId>
            <version>1.8.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.bahir</groupId>
            <artifactId>flink-connector-redis_2.11</artifactId>
            <version>1.0</version>
        </dependency>

    </dependencies>
    <build>
        <plugins>
            <!-- compile the Scala sources -->
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>4.0.1</version>
                <executions>
                    <execution>
                        <id>scala-compile-first</id>
                        <phase>process-resources</phase>
                        <goals>
                            <goal>add-source</goal>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>

Code

package com.baizhi.demo04

import org.apache.flink.streaming.api.CheckpointingMode
import org.apache.flink.streaming.api.environment.CheckpointConfig.ExternalizedCheckpointCleanup
import org.apache.flink.streaming.api.scala.{DataStream, StreamExecutionEnvironment, _}

object Flink {
  def main(args: Array[String]): Unit = {

    val env = StreamExecutionEnvironment.getExecutionEnvironment
    //enable checkpointing: one checkpoint every 7s, with exactly-once semantics
    env.enableCheckpointing(7000, CheckpointingMode.EXACTLY_ONCE)
    //a checkpoint must complete within 4s, otherwise it is aborted
    env.getCheckpointConfig.setCheckpointTimeout(4000)
    //wait at least 5s after a checkpoint completes before starting the next one
    env.getCheckpointConfig.setMinPauseBetweenCheckpoints(5000)
    env.getCheckpointConfig.setMaxConcurrentCheckpoints(1)//allow only one checkpoint in flight at a time
    //keep the checkpoint data when the job is cancelled, so the job can later be restored from it
    env.getCheckpointConfig.enableExternalizedCheckpoints(ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION)
    //fail the task if a checkpoint cannot be taken, so recoverability is guaranteed
    env.getCheckpointConfig.setFailOnCheckpointingErrors(true)

    val data: DataStream[String] = env.socketTextStream("Flink",9999)

    data.flatMap(line=> line.split("\\s+"))
      .map((_,1))
      .keyBy(0)
      .map(new CountMapFunction)
      .print()

    env.execute()
  }
}
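Because externalized checkpoints are retained on cancellation, the job can be resubmitted from the last completed checkpoint with the -s flag of the Flink CLI. A minimal sketch, where <job-id>, chk-<n>, and flink-demo.jar are placeholders for your job ID, the retained checkpoint directory under hdfs:///flink-checkpoints, and the job jar:

bin/flink run -s hdfs:///flink-checkpoints/<job-id>/chk-<n> -c com.baizhi.demo04.Flink flink-demo.jar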

package com.baizhi.demo04

import org.apache.flink.api.common.functions.RichMapFunction
import org.apache.flink.api.common.state.{StateTtlConfig, ValueState, ValueStateDescriptor}
import org.apache.flink.api.common.time.Time
import org.apache.flink.configuration.Configuration
import org.apache.flink.streaming.api.scala._

class CountMapFunction extends RichMapFunction[(String, Int), (String, Int)] {
  var state: ValueState[Int] = _

  override def map(value: (String, Int)): (String, Int) = {
    //value() returns null until something has been written for this key;
    //with Scala's Int the null unboxes to 0, but we default it explicitly for clarity
    var history = state.value()
    if (history == null) {
      history = 0
    }
    state.update(history + value._2)
    (value._1, history + value._2)
  }

  override def open(parameters: Configuration): Unit = {
    val dec = new ValueStateDescriptor[Int]("count", createTypeInformation[Int])
    //1. build the TTL config
    val ttlConfig = StateTtlConfig
      .newBuilder(Time.seconds(80)) //the state lives for 80s
      .setUpdateType(StateTtlConfig.UpdateType.OnCreateAndWrite)//the TTL timer resets on create and on every write
      .setStateVisibility(StateTtlConfig.StateVisibility.NeverReturnExpired)//never return expired state
      //.cleanupInRocksdbCompactFilter(1000)//run the compaction filter after roughly 1000 state entries are processed
      .build

    //2. enable TTL on the state descriptor
    dec.enableTimeToLive(ttlConfig)

    state = getRuntimeContext.getState(dec)
  }
}
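The next section selects the RocksDB state backend through flink-conf.yaml; the same can be done in code. A minimal sketch, assuming the flink-statebackend-rocksdb_2.11 (1.8.1) artifact is added to the POM above:

package com.baizhi.demo04

import org.apache.flink.contrib.streaming.state.RocksDBStateBackend
import org.apache.flink.streaming.api.scala._

object RocksDbBackendSketch {
  def main(args: Array[String]): Unit = {
    val env = StreamExecutionEnvironment.getExecutionEnvironment
    //the second argument enables incremental checkpoints,
    //mirroring state.backend.incremental: true in flink-conf.yaml below
    env.setStateBackend(new RocksDBStateBackend("hdfs:///flink-checkpoints", true))
    //...build the same pipeline as above, then env.execute()
  }
}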

Configure the Flink environment

Configure flink-conf.yaml
#==============================================================================
# Fault tolerance and checkpointing
#==============================================================================

# The backend that will be used to store operator state checkpoints if
# checkpointing is enabled.
#
# Supported backends are 'jobmanager', 'filesystem', 'rocksdb', or the
# <class-name-of-factory>.
#
 state.backend: rocksdb

# Directory for checkpoints filesystem, when using any of the default bundled
# state backends.
#
 state.checkpoints.dir: hdfs:///flink-checkpoints

# Default target directory for savepoints, optional.
#
 state.savepoints.dir: hdfs:///flink-savepoints

# Flag to enable/disable incremental checkpoints for backends that
# support incremental checkpoints (like the RocksDB state backend).
#
 state.backend.incremental: true
 state.backend.rocksdb.ttl.compaction.filter.enabled: true
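The state.backend.rocksdb.ttl.compaction.filter.enabled switch is what activates the .cleanupInRocksdbCompactFilter cleanup strategy commented out in CountMapFunction. A sketch of the TTL builder with that strategy turned on; note that in Flink 1.8 the builder method takes no arguments (the entry-count variant like the commented-out (1000) call arrived in later releases), so verify the signature against your Flink version:

    val ttlConfig = StateTtlConfig
      .newBuilder(Time.seconds(80))
      .setUpdateType(StateTtlConfig.UpdateType.OnCreateAndWrite)
      .setStateVisibility(StateTtlConfig.StateVisibility.NeverReturnExpired)
      //only takes effect when state.backend.rocksdb.ttl.compaction.filter.enabled is true
      .cleanupInRocksdbCompactFilter()
      .build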

#==============================================================================
# HistoryServer
#==============================================================================

# The HistoryServer is started and stopped via bin/historyserver.sh (start|stop)

# Directory to upload completed jobs to. Add this directory to the list of
# monitored directories of the HistoryServer as well (see below).
jobmanager.archive.fs.dir: hdfs:///completed-jobs/

# The address under which the web-based HistoryServer listens.
historyserver.web.address: CentOS

# The port under which the web-based HistoryServer listens.
historyserver.web.port: 8082

# Comma separated list of directories to monitor for completed jobs.
historyserver.archive.fs.dir: hdfs:///completed-jobs/

# Interval in milliseconds for refreshing the monitored directories.
historyserver.archive.fs.refresh-interval: 10000
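With the directories above in place, the HistoryServer is started via the script mentioned in the comments and served at the configured address and port:

bin/historyserver.sh start
# web UI: http://CentOS:8082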


Add the Hadoop classpath to the environment variables

export HADOOP_CLASSPATH=`hadoop classpath`

The backticks matter: they substitute the output of the hadoop classpath command, which Flink needs in order to find the HDFS classes.
