pom.xml

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>org.myorg.quickstart</groupId>
    <artifactId>quickstart</artifactId>
    <version>0.1</version>
    <packaging>jar</packaging>

    <name>Flink Quickstart Job</name>
    <url>http://www.myorganization.org</url>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <flink.version>1.9.1</flink.version>
        <java.version>1.8</java.version>
        <scala.binary.version>2.11</scala.binary.version>
        <maven.compiler.source>${java.version}</maven.compiler.source>
        <maven.compiler.target>${java.version}</maven.compiler.target>
    </properties>

    <repositories>
        <repository>
            <id>apache.snapshots</id>
            <name>Apache Development Snapshot Repository</name>
            <url>https://repository.apache.org/content/repositories/snapshots/</url>
            <releases><enabled>false</enabled></releases>
            <snapshots><enabled>true</enabled></snapshots>
        </repository>
    </repositories>

    <dependencies>
        <!-- Flink core (provided: already on the cluster classpath) -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-java</artifactId>
            <version>${flink.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
            <scope>provided</scope>
        </dependency>
        <!-- Logging -->
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>1.7.7</version>
            <scope>runtime</scope>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>1.2.17</version>
            <scope>runtime</scope>
        </dependency>
        <!-- HTTP client and JSON -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpasyncclient</artifactId>
            <version>4.1.4</version>
        </dependency>
        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.51</version>
        </dependency>
        <!-- Kafka and Redis connectors -->
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
            <version>${flink.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.flink</groupId>
            <artifactId>flink-connector-redis_${scala.binary.version}</artifactId>
            <version>1.1.5</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <configuration>
                    <source>${java.version}</source>
                    <target>${java.version}</target>
                </configuration>
            </plugin>
            <!-- Build a fat jar, excluding the artifacts the cluster already provides -->
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>3.0.0</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals><goal>shade</goal></goals>
                        <configuration>
                            <artifactSet>
                                <excludes>
                                    <exclude>org.apache.flink:force-shading</exclude>
                                    <exclude>com.google.code.findbugs:jsr305</exclude>
                                    <exclude>org.slf4j:*</exclude>
                                    <exclude>log4j:*</exclude>
                                </excludes>
                            </artifactSet>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                            <transformers>
                                <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer">
                                    <mainClass>org.myorg.quickstart.StreamingJob</mainClass>
                                </transformer>
                            </transformers>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>

        <pluginManagement>
            <plugins>
                <!-- Keeps Eclipse m2e from flagging the shade/compiler executions; no effect on the Maven build -->
                <plugin>
                    <groupId>org.eclipse.m2e</groupId>
                    <artifactId>lifecycle-mapping</artifactId>
                    <version>1.0.0</version>
                    <configuration>
                        <lifecycleMappingMetadata>
                            <pluginExecutions>
                                <pluginExecution>
                                    <pluginExecutionFilter>
                                        <groupId>org.apache.maven.plugins</groupId>
                                        <artifactId>maven-shade-plugin</artifactId>
                                        <versionRange>[3.0.0,)</versionRange>
                                        <goals><goal>shade</goal></goals>
                                    </pluginExecutionFilter>
                                    <action><ignore/></action>
                                </pluginExecution>
                                <pluginExecution>
                                    <pluginExecutionFilter>
                                        <groupId>org.apache.maven.plugins</groupId>
                                        <artifactId>maven-compiler-plugin</artifactId>
                                        <versionRange>[3.1,)</versionRange>
                                        <goals><goal>testCompile</goal><goal>compile</goal></goals>
                                    </pluginExecutionFilter>
                                    <action><ignore/></action>
                                </pluginExecution>
                            </pluginExecutions>
                        </lifecycleMappingMetadata>
                    </configuration>
                </plugin>
            </plugins>
        </pluginManagement>
    </build>

    <!-- Adds the provided Flink dependencies back with compile scope so the job can run inside IntelliJ IDEA -->
    <profiles>
        <profile>
            <id>add-dependencies-for-IDEA</id>
            <activation>
                <property><name>idea.version</name></property>
            </activation>
            <dependencies>
                <dependency>
                    <groupId>org.apache.flink</groupId>
                    <artifactId>flink-java</artifactId>
                    <version>${flink.version}</version>
                    <scope>compile</scope>
                </dependency>
                <dependency>
                    <groupId>org.apache.flink</groupId>
                    <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
                    <version>${flink.version}</version>
                    <scope>compile</scope>
                </dependency>
            </dependencies>
        </profile>
    </profiles>
</project>
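Because the pom pulls in slf4j-log4j12 and log4j at runtime, the job also expects a log4j.properties on the classpath. A minimal sketch for src/main/resources, assuming console output is enough (the quickstart archetype ships a very similar file):

# Log everything at INFO to the console
log4j.rootLogger=INFO, console
log4j.appender.console=org.apache.log4j.ConsoleAppender
log4j.appender.console.layout=org.apache.log4j.PatternLayout
log4j.appender.console.layout.ConversionPattern=%d{HH:mm:ss,SSS} %-5p %-60c %x - %m%n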
flink-config.properties (best kept outside the project, for example in a directory on the server, so it can be edited without rebuilding the jar)
topics=test
group.id=lzc
bootstrap.servers=bigdata1:9092,bigdata2:9092
auto.offset.reset=earliest
enable.auto.commit=false
checkpoint.interval=10000
redis.host=localhost
redis.pwd=123456
redis.db=0
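The path of this file is passed to the job as its first program argument; FlinkKafkaToRedis below loads it with ParameterTool.fromPropertiesFile(args[0]), so the same jar can be pointed at a different environment simply by swapping the file.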
FlinkKafkaToRedis
package org.myorg.quickstart.kafka;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.util.Collector;

public class FlinkKafkaToRedis {

    public static void main(String[] args) throws Exception {
        // The path of the config file is passed as the first program argument
        ParameterTool parameters = ParameterTool.fromPropertiesFile(args[0]);
        DataStream<String> lines = FlinkUtil.createKafkaStream(parameters, SimpleStringSchema.class);

        // Split every line into words
        SingleOutputStreamOperator<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public void flatMap(String line, Collector<String> collector) throws Exception {
                for (String word : line.split(" ")) {
                    collector.collect(word);
                }
            }
        });

        // Map every word to (word, 1)
        SingleOutputStreamOperator<Tuple2<String, Integer>> wordAndOne = words.map(new MapFunction<String, Tuple2<String, Integer>>() {
            @Override
            public Tuple2<String, Integer> map(String word) throws Exception {
                return new Tuple2<>(word, 1);
            }
        });

        // The keyed sum is backed by checkpointed state, so after a failure the job keeps accumulating
        SingleOutputStreamOperator<Tuple2<String, Integer>> sum = wordAndOne.keyBy(0).sum(1);

        // Write each count to Redis as HSET word_count <word> <count>
        sum.map(new MapFunction<Tuple2<String, Integer>, Tuple3<String, String, String>>() {
            @Override
            public Tuple3<String, String, String> map(Tuple2<String, Integer> tp) throws Exception {
                return Tuple3.of("word_count", tp.f0, tp.f1.toString());
            }
        }).addSink(new MyRedisSink());

        FlinkUtil.getEnv().execute("kafkaSource");
    }
}
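The job above uses anonymous classes, which lets Flink extract the Tuple element types on its own. The same pipeline can be written with Java lambdas, but then the element types have to be supplied explicitly through returns(...) because erasure hides them. A minimal sketch of that variant, assuming the same FlinkUtil and MyRedisSink (the class name FlinkKafkaToRedisLambda is only illustrative):

package org.myorg.quickstart.kafka;

import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.util.Collector;

public class FlinkKafkaToRedisLambda {

    public static void main(String[] args) throws Exception {
        ParameterTool parameters = ParameterTool.fromPropertiesFile(args[0]);
        DataStream<String> lines = FlinkUtil.createKafkaStream(parameters, SimpleStringSchema.class);

        lines.flatMap((String line, Collector<String> out) -> {
                    for (String w : line.split(" ")) {
                        out.collect(w);
                    }
                })
                // Lambdas lose generic type information, so declare the element types explicitly
                .returns(Types.STRING)
                .map(w -> Tuple2.of(w, 1))
                .returns(Types.TUPLE(Types.STRING, Types.INT))
                .keyBy(0)
                .sum(1)
                .map(tp -> Tuple3.of("word_count", tp.f0, tp.f1.toString()))
                .returns(Types.TUPLE(Types.STRING, Types.STRING, Types.STRING))
                .addSink(new MyRedisSink());

        FlinkUtil.getEnv().execute("kafkaSource-lambda");
    }
}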
Custom utility class FlinkUtil
package org.myorg.quickstart.kafka;

import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.api.common.serialization.DeserializationSchema;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer;

import java.util.Arrays;
import java.util.List;
import java.util.Properties;

public class FlinkUtil {

    private static StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();

    /**
     * Creates a Kafka-backed DataStream from the given job parameters.
     *
     * @param parameters job parameters loaded from the properties file
     * @param clazz      the DeserializationSchema class used to decode Kafka records
     * @param <T>        the element type produced by the deserialization schema
     * @return a DataStream reading from the configured Kafka topics
     * @throws Exception if the schema cannot be instantiated
     */
    public static <T> DataStream<T> createKafkaStream(ParameterTool parameters,
                                                      Class<? extends DeserializationSchema<T>> clazz) throws Exception {
        env.getConfig().setGlobalJobParameters(parameters);

        // Only needed when reading HDFS from a local environment; not required on the cluster
        //System.setProperty("HADOOP_USER_NAME", "root");

        // Checkpointing is disabled by default; enable it here
        env.enableCheckpointing(parameters.getLong("checkpoint.interval", 5000L), CheckpointingMode.EXACTLY_ONCE);

        // Restart strategy (with checkpointing enabled, the default restarts endlessly)
        env.getConfig().setRestartStrategy(RestartStrategies.fixedDelayRestart(3, 2000));

        // State backend (better configured in the Flink configuration file)
        //env.setStateBackend(new FsStateBackend("hdfs://namenode:40010/flink/checkpoints"));

        // Keep checkpoint data when the job fails or is cancelled manually (it is deleted by default)
        env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);

        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", parameters.getRequired("bootstrap.servers"));
        properties.setProperty("group.id", parameters.getRequired("group.id"));
        // If no offset has been committed yet, start consuming from the earliest offset
        properties.setProperty("auto.offset.reset", parameters.get("auto.offset.reset", "earliest"));
        // The Kafka consumer does not auto-commit offsets; Flink manages them through checkpoints
        properties.setProperty("enable.auto.commit", parameters.get("enable.auto.commit", "false"));

        String topics = parameters.getRequired("topics");
        List<String> topicList = Arrays.asList(topics.split(","));

        // Source: read data from Kafka
        FlinkKafkaConsumer<T> kafkaConsumer = new FlinkKafkaConsumer<>(
                topicList,
                clazz.newInstance(),
                properties);

        // After a successful checkpoint, also write the offsets back to Kafka's offsets topic (default is true)
        kafkaConsumer.setCommitOffsetsOnCheckpoints(true);

        return env.addSource(kafkaConsumer);
    }

    /**
     * Returns the shared execution environment.
     */
    public static StreamExecutionEnvironment getEnv() {
        return env;
    }
}
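createKafkaStream instantiates the schema reflectively with clazz.newInstance(), so any DeserializationSchema with a public no-argument constructor can be plugged in the same way. A minimal sketch of a JSON schema built on the fastjson dependency declared in the pom (the class name JsonObjectSchema is only illustrative):

package org.myorg.quickstart.kafka;

import com.alibaba.fastjson.JSON;
import com.alibaba.fastjson.JSONObject;
import org.apache.flink.api.common.serialization.DeserializationSchema;
import org.apache.flink.api.common.typeinfo.TypeInformation;

import java.io.IOException;
import java.nio.charset.StandardCharsets;

public class JsonObjectSchema implements DeserializationSchema<JSONObject> {

    @Override
    public JSONObject deserialize(byte[] message) throws IOException {
        // Parse the raw Kafka record bytes into a fastjson JSONObject
        return JSON.parseObject(new String(message, StandardCharsets.UTF_8));
    }

    @Override
    public boolean isEndOfStream(JSONObject nextElement) {
        // Kafka topics are unbounded, so the stream never ends
        return false;
    }

    @Override
    public TypeInformation<JSONObject> getProducedType() {
        return TypeInformation.of(JSONObject.class);
    }
}

With that in place, DataStream<JSONObject> events = FlinkUtil.createKafkaStream(parameters, JsonObjectSchema.class); yields parsed JSON objects instead of raw strings.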
Custom Redis sink (MyRedisSink)
package org.myorg.quickstart.kafka;

import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.functions.sink.RichSinkFunction;
import redis.clients.jedis.Jedis;

public class MyRedisSink extends RichSinkFunction<Tuple3<String, String, String>> {

    private transient Jedis jedis;

    @Override
    public void open(Configuration parameters) throws Exception {
        super.open(parameters);
        // Fetch the global job parameters registered in FlinkUtil
        ParameterTool params = (ParameterTool) getRuntimeContext().getExecutionConfig().getGlobalJobParameters();
        String host = params.getRequired("redis.host");
        String pwd = params.getRequired("redis.pwd");
        int db = params.getInt("redis.db", 0);
        jedis = new Jedis(host, 6379, 5000);
        jedis.auth(pwd);
        jedis.select(db);
    }

    @Override
    public void invoke(Tuple3<String, String, String> value, Context context) throws Exception {
        if (!jedis.isConnected()) {
            jedis.connect();
        }
        // HSET <hash name> <field> <value>, e.g. HSET word_count hello 3
        jedis.hset(value.f0, value.f1, value.f2);
    }

    @Override
    public void close() throws Exception {
        super.close();
        jedis.close();
    }
}
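Since the sink issues HSET word_count <word> <count>, the result can be read back with the same Jedis client the sink uses. A small standalone check, assuming the localhost/123456/db 0 values from flink-config.properties above (the class name CheckRedisOutput is only illustrative):

package org.myorg.quickstart.kafka;

import java.util.Map;
import redis.clients.jedis.Jedis;

public class CheckRedisOutput {

    public static void main(String[] args) {
        Jedis jedis = new Jedis("localhost", 6379);
        jedis.auth("123456");
        jedis.select(0);
        // The job writes one hash field per word under the "word_count" hash
        Map<String, String> counts = jedis.hgetAll("word_count");
        counts.forEach((word, count) -> System.out.println(word + " -> " + count));
        jedis.close();
    }
}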