Template code for quickly standing up a Flink job that consumes from Kafka, deduplicates the messages, and writes the result back to Kafka. First, the Maven pom.xml:
<properties>
    <maven.compiler.source>8</maven.compiler.source>
    <maven.compiler.target>8</maven.compiler.target>
    <flink.version>1.13.6</flink.version>
    <java.version>1.8</java.version>
    <scala.binary.version>2.12</scala.binary.version>
    <slf4j.version>1.7.30</slf4j.version>
    <hadoop.version>3.1.2</hadoop.version>
</properties>

<dependencies>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-java</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-base</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-clients_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-api</artifactId>
        <version>${slf4j.version}</version>
    </dependency>
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-log4j12</artifactId>
        <version>${slf4j.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-api-java-bridge_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-table-planner-blink_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.62</version>
    </dependency>
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <version>1.18.24</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-json</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-statebackend-rocksdb_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>com.clearspring.analytics</groupId>
        <artifactId>stream</artifactId>
        <version>2.7.0</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-runtime-web_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
</dependencies>

<build>
    <plugins>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-compiler-plugin</artifactId>
            <configuration>
                <source>8</source>
                <target>8</target>
            </configuration>
        </plugin>
        <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-assembly-plugin</artifactId>
            <version>3.0.0</version>
            <configuration>
                <descriptorRefs>
                    <descriptorRef>jar-with-dependencies</descriptorRef>
                </descriptorRefs>
            </configuration>
            <executions>
                <execution>
                    <id>make-assembly</id>
                    <phase>package</phase>
                    <goals>
                        <goal>single</goal>
                    </goals>
                </execution>
            </executions>
        </plugin>
    </plugins>
</build>

<repositories>
    <repository>
        <id>nexus-aliyun</id>
        <name>nexus-aliyun</name>
        <url>http://maven.aliyun.com/nexus/content/groups/public/</url>
        <releases>
            <enabled>true</enabled>
        </releases>
        <snapshots>
            <enabled>false</enabled>
        </snapshots>
    </repository>
</repositories>
When the job is submitted to the cluster, the Flink runtime and logging dependencies can be re-scoped to provided in a build profile (here named bigdata) so they are not packaged into the fat jar:

<profiles>
    <profile>
        <id>bigdata</id>
        <dependencies>
            <dependency>
                <groupId>org.apache.flink</groupId>
                <artifactId>flink-java</artifactId>
                <version>${flink.version}</version>
                <scope>provided</scope>
            </dependency>
            <dependency>
                <groupId>org.apache.flink</groupId>
                <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
                <version>${flink.version}</version>
                <scope>provided</scope>
            </dependency>
            <dependency>
                <groupId>org.apache.flink</groupId>
                <artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
                <version>${flink.version}</version>
                <scope>provided</scope>
            </dependency>
            <dependency>
                <groupId>org.apache.flink</groupId>
                <artifactId>flink-connector-base</artifactId>
                <version>${flink.version}</version>
                <scope>provided</scope>
            </dependency>
            <dependency>
                <groupId>org.apache.flink</groupId>
                <artifactId>flink-statebackend-rocksdb_${scala.binary.version}</artifactId>
                <version>${flink.version}</version>
                <scope>provided</scope>
            </dependency>
            <dependency>
                <groupId>org.apache.flink</groupId>
                <artifactId>flink-runtime-web_${scala.binary.version}</artifactId>
                <version>${flink.version}</version>
                <scope>provided</scope>
            </dependency>
            <dependency>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-api</artifactId>
                <version>${slf4j.version}</version>
                <scope>provided</scope>
            </dependency>
            <dependency>
                <groupId>org.slf4j</groupId>
                <artifactId>slf4j-log4j12</artifactId>
                <version>${slf4j.version}</version>
                <scope>provided</scope>
            </dependency>
            <dependency>
                <groupId>com.clearspring.analytics</groupId>
                <artifactId>stream</artifactId>
                <version>2.7.0</version>
                <scope>provided</scope>
            </dependency>
            <dependency>
                <groupId>com.alibaba</groupId>
                <artifactId>fastjson</artifactId>
                <version>1.2.62</version>
                <scope>provided</scope>
            </dependency>
        </dependencies>
    </profile>
</profiles>
log4j.properties:
log4j.rootLogger=ERROR, stdout
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d %p [%c] - %m%n
log4j.appender.logfile=org.apache.log4j.FileAppender
log4j.appender.logfile.File=target/spring.log
log4j.appender.logfile.layout=org.apache.log4j.PatternLayout
log4j.appender.logfile.layout.ConversionPattern=%d %p [%c] - %m%n
public class CommonConfig {

    /**
     * Kafka bootstrap servers
     */
    public final static String BOOTSTRAP_SERVERS = "";

    /**
     * Source and target topics
     */
    public final static String SOURCE_TOPIC = "";
    public final static String TARGET_TOPIC = "";

    /**
     * Application name
     */
    public final static String APP_NAME = "";

    /**
     * Checkpoint / state backend path
     */
    public final static String STATE_BACKEND_PATH = "hdfs://master1:8020/checkpoint";
}
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;
import org.apache.flink.streaming.connectors.kafka.KafkaDeserializationSchema;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;

import java.nio.charset.StandardCharsets;
import java.util.Properties;

public class KafkaConsumerUtil {

    static String BOOTSTRAP_SERVERS = CommonConfig.BOOTSTRAP_SERVERS;

    public static FlinkKafkaConsumer<String> getKafkaConsumer(String topic, String groupId) {
        Properties prop = new Properties();
        prop.setProperty(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, BOOTSTRAP_SERVERS);
        prop.setProperty(ConsumerConfig.GROUP_ID_CONFIG, groupId);
        prop.setProperty(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
        // The default string deserializer fails on records whose value is null (e.g. tombstones),
        // so use a custom deserialization schema that maps them to an empty string.
        FlinkKafkaConsumer<String> consumer = new FlinkKafkaConsumer<>(topic,
                new KafkaDeserializationSchema<String>() {
                    @Override
                    public boolean isEndOfStream(String nextElement) {
                        return false;
                    }

                    @Override
                    public String deserialize(ConsumerRecord<byte[], byte[]> record) throws Exception {
                        if (record == null || record.value() == null) {
                            return "";
                        }
                        return new String(record.value(), StandardCharsets.UTF_8);
                    }

                    @Override
                    public TypeInformation<String> getProducedType() {
                        return BasicTypeInfo.STRING_TYPE_INFO;
                    }
                }, prop);
        return consumer;
    }
}
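FlinkKafkaConsumer is the older DataStream connector API; Flink 1.13 also ships the newer KafkaSource. Below is a minimal sketch of an equivalent source built with it (the class name KafkaSourceExample and method buildSource are made up for this example). Note that SimpleStringSchema would still fail on null record values, so for topics with tombstone records the custom schema above remains the safer choice.

import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.connector.kafka.source.KafkaSource;
import org.apache.flink.connector.kafka.source.enumerator.initializer.OffsetsInitializer;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class KafkaSourceExample {

    public static DataStreamSource<String> buildSource(StreamExecutionEnvironment env,
                                                       String topic, String groupId) {
        KafkaSource<String> source = KafkaSource.<String>builder()
                .setBootstrapServers(CommonConfig.BOOTSTRAP_SERVERS)
                .setTopics(topic)
                .setGroupId(groupId)
                .setStartingOffsets(OffsetsInitializer.earliest())
                .setValueOnlyDeserializer(new SimpleStringSchema())
                .build();
        // No event-time watermarks are needed for this simple pass-through job.
        return env.fromSource(source, WatermarkStrategy.noWatermarks(), "kafka-source");
    }
}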
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer;
import org.apache.flink.streaming.connectors.kafka.KafkaSerializationSchema;
import org.apache.kafka.clients.producer.ProducerConfig;
import org.apache.kafka.clients.producer.ProducerRecord;

import javax.annotation.Nullable;
import java.nio.charset.StandardCharsets;
import java.util.Properties;

public class KafkaProductUtil {

    public static FlinkKafkaProducer<String> getKafkaProduct(String targetTopic) {
        Properties properties = new Properties();
        properties.setProperty(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, CommonConfig.BOOTSTRAP_SERVERS);
        // With EXACTLY_ONCE semantics the Kafka transaction must stay open until the checkpoint
        // that covers it completes. The transaction timeout therefore has to be larger than the
        // checkpoint interval (plus the checkpoint timeout), while staying within the broker's
        // transaction.max.timeout.ms limit (15 minutes by default) -- hence 15 minutes here.
        properties.setProperty(ProducerConfig.TRANSACTION_TIMEOUT_CONFIG, 60 * 15 * 1000 + "");

        KafkaSerializationSchema<String> serializationSchema = new KafkaSerializationSchema<String>() {
            @Override
            public ProducerRecord<byte[], byte[]> serialize(String element, @Nullable Long timestamp) {
                return new ProducerRecord<>(
                        targetTopic,                               // target topic
                        element.getBytes(StandardCharsets.UTF_8)); // record value
            }
        };

        FlinkKafkaProducer<String> myProducer = new FlinkKafkaProducer<>(
                targetTopic,                                // default target topic
                serializationSchema,                        // serialization schema
                properties,                                 // producer config
                FlinkKafkaProducer.Semantic.EXACTLY_ONCE);  // fault-tolerance guarantee
        return myProducer;
    }
}
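If end-to-end exactly-once is not required, the transactional overhead can be avoided with at-least-once semantics. A minimal sketch of that variant (the class and method names below are made up for this example; the simple constructor used here defaults to at-least-once, flushing pending records on each checkpoint, so no transaction timeout tuning is needed):

import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer;
import org.apache.kafka.clients.producer.ProducerConfig;

import java.util.Properties;

public class KafkaAtLeastOnceProducerUtil {

    public static FlinkKafkaProducer<String> getAtLeastOnceProducer(String targetTopic) {
        Properties properties = new Properties();
        properties.setProperty(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, CommonConfig.BOOTSTRAP_SERVERS);
        // No Kafka transactions: records are flushed on checkpoints, which may
        // produce duplicates after a failure but never loses acknowledged data.
        return new FlinkKafkaProducer<>(
                targetTopic,
                new SimpleStringSchema(),
                properties);
    }
}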
import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.time.Time;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.contrib.streaming.state.RocksDBStateBackend;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;

import java.util.concurrent.TimeUnit;

public class FlinkMessageUnique {

    private final static String GROUP_ID = FlinkMessageUnique.class.getSimpleName();

    public static void main(String[] args) throws Exception {
        // TODO To get a local Flink web UI, create the environment with a Configuration instead:
        // Configuration configuration = new Configuration();
        // configuration.setInteger(RestOptions.PORT, 8082);
        // StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment(configuration);
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // Keep the parallelism consistent with the number of Kafka partitions.
        env.setParallelism(1);

        // TODO 1. Checkpoint and state backend settings
        env.enableCheckpointing(3000L, CheckpointingMode.EXACTLY_ONCE);
        // Checkpoint timeout
        env.getCheckpointConfig().setCheckpointTimeout(60 * 1000L);
        // Minimum pause between checkpoints: the next checkpoint starts at least 3s
        // after the previous one has completed.
        env.getCheckpointConfig().setMinPauseBetweenCheckpoints(3000L);
        env.getCheckpointConfig().enableExternalizedCheckpoints(
                // ExternalizedCheckpointCleanup controls how externalized checkpoints are cleaned up
                // when the job is cancelled:
                // DELETE_ON_CANCELLATION deletes the externalized checkpoint state on cancellation,
                // RETAIN_ON_CANCELLATION keeps it so the job can be restored later.
                CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION
        );
        env.setRestartStrategy(RestartStrategies.failureRateRestart(
                // Maximum number of failures allowed within the measurement interval: 10
                10,
                // Measurement interval over which the failure rate is evaluated
                Time.of(1L, TimeUnit.MINUTES),
                // Delay between two consecutive restart attempts
                Time.of(3L, TimeUnit.MINUTES)
        ));
        // Set the state backend. The path can be a local filesystem path for local testing,
        // or an HDFS path as configured in CommonConfig.
        env.setStateBackend(new RocksDBStateBackend(CommonConfig.STATE_BACKEND_PATH, true));
        // env.setStateBackend(new RocksDBStateBackend("hdfs://master1:8020/fink-checkpoints", true));
        // env.getCheckpointConfig().setCheckpointStorage("hdfs://master1:8020/bigdata/ck");
        System.setProperty("HADOOP_USER_NAME", "bigdata");

        DataStreamSource<String> data = env.addSource(
                KafkaConsumerUtil.getKafkaConsumer(CommonConfig.SOURCE_TOPIC, GROUP_ID));

        // TODO Deduplicate: key by the full message so identical messages land on the same key,
        // then keep a "seen" flag in keyed ValueState (an approximate Bloom-filter variant is
        // sketched after this class).
        SingleOutputStreamOperator<String> deduplicated = data.keyBy(new KeySelector<String, String>() {
            @Override
            public String getKey(String s) throws Exception {
                return s;
            }
        }).process(new KeyedProcessFunction<String, String, String>() {

            private transient ValueState<Integer> uniqueCount;

            @Override
            public void open(Configuration parameters) throws Exception {
                ValueStateDescriptor<Integer> descriptor =
                        new ValueStateDescriptor<>(
                                "uniqueCount",                                   // state name
                                TypeInformation.of(new TypeHint<Integer>() {}),  // type information
                                0);                                              // default value if nothing was set
                uniqueCount = getRuntimeContext().getState(descriptor);
            }

            @Override
            public void processElement(String s, Context context, Collector<String> collector) throws Exception {
                Integer count = uniqueCount.value();
                // Emit the message only the first time this key (i.e. this message) is seen.
                if (count == 0) {
                    collector.collect(s);
                    uniqueCount.update(1);
                }
            }
            // A stable operator uid keeps the state restorable from checkpoints/savepoints
            // even after the job graph changes.
        }).uid("process_unique_count");

        // Write the deduplicated stream back to the target Kafka topic.
        deduplicated.addSink(KafkaProductUtil.getKafkaProduct(CommonConfig.TARGET_TOPIC));

        env.execute(CommonConfig.APP_NAME);
    }
}
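The pom already pulls in com.clearspring.analytics:stream, so an approximate, memory-bounded variant of the dedup step could use a Bloom filter instead of one ValueState entry per distinct message. A rough sketch, assuming stream-lib's BloomFilter with add/isPresent; note that the filter below lives only in operator memory and is not checkpointed, so dedup guarantees are lost after a restart, and false positives may drop a small fraction of genuinely new messages:

import com.clearspring.analytics.stream.membership.BloomFilter;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;

public class BloomFilterDedupFunction extends KeyedProcessFunction<String, String, String> {

    // In-memory Bloom filter, rebuilt empty in open(); not part of Flink state.
    private transient BloomFilter seen;

    @Override
    public void open(Configuration parameters) {
        // Assumed stream-lib constructor: expected element count and target false-positive rate.
        seen = new BloomFilter(1_000_000, 0.01);
    }

    @Override
    public void processElement(String value, Context ctx, Collector<String> out) {
        if (!seen.isPresent(value)) {
            seen.add(value);
            out.collect(value);
        }
    }
}

It could replace the anonymous KeyedProcessFunction above via .process(new BloomFilterDedupFunction()).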
Build the jar-with-dependencies and submit it to a YARN per-job cluster:

/home/bigdata/module/flink-1.13.6/bin/flink run \
-d \
-m yarn-cluster \
-yqu default \
-ynm <yarn-application-name> \
-c com.bigdata.FlinkMessageUnique \
SNAPSHOT-jar-with-dependencies.jar
To resume after a restart, pass the retained checkpoint path with -s. The job continues from chk-1560 under a newly generated random checkpoint directory, and subsequent checkpoints keep counting from there (e.g. chk-1561):
/home/bigdata/module/flink-1.13.6/bin/flink run \
-d \
-s hdfs://master1:8020/checkpoint/bigdata/ck/d5e28ce894fbd7ea9d25a52c1972892d/chk-1560 \
-m yarn-cluster \
-yqu default \
-ynm <yarn-application-name> \
-c com.bigdata.FlinkMessageUnique \
SNAPSHOT-jar-with-dependencies.jar