Step 1: Upload flink-1.10.1-bin-scala_2.12.tgz to the server and extract it.
Step 2: Edit the conf/flink-conf.yaml file.
# Set jobmanager.rpc.address to the host that runs the JobManager
jobmanager.rpc.address: hadoop151
Step 3: Edit the conf/slaves file.
# slave (TaskManager) hosts
hadoop152
hadoop153
Step 4: Distribute the entire flink directory to the other machines.
Commands
# Start the cluster
bin/start-cluster.sh
# Stop the cluster
bin/stop-cluster.sh
Access the web UI (served on the JobManager's REST port, 8081 by default).
# =================== Submit a job ===================
bin/flink run -c <fully-qualified-class-name> -p <parallelism> <jar-file>
# Example
bin/flink run -c com.itfzk.flink.wordcount.KafkaStreamWordCount -p 3 FlinkStudyDemo-1.0-SNAPSHOT-jar-with-dependencies.jar
# =================== Cancel a job ===================
bin/flink cancel <JobId>
# Example
bin/flink cancel f69fbd0650ae4202b2a46b3ad2089606
Commands
# =================== Start a yarn-session ===================
# -n (--container): number of TaskManagers
# -s (--slots): number of slots per TaskManager; by default one slot per core and one slot per TaskManager. Running a few extra TaskManagers can be useful for redundancy.
# -jm: JobManager memory (MB)
# -tm: memory per TaskManager (MB)
# -nm: YARN application name (shown in the YARN UI)
# -d: run in detached (background) mode
bin/yarn-session.sh -n 2 -s 2 -jm 1024 -tm 1024 -nm test -d
# =================== Stop the yarn-session ===================
yarn application -kill <Application-Id>
# Example
yarn application -kill application_1633171918776_0003
Access the web UI.
# =================== Submit a job (to the yarn-session) ===================
bin/flink run -c <fully-qualified-class-name> -p <parallelism> <jar-file>
# Example
bin/flink run -c com.itfzk.flink.wordcount.KafkaStreamWordCount -p 3 FlinkStudyDemo-1.0-SNAPSHOT-jar-with-dependencies.jar
# =================== Cancel a job ===================
bin/flink cancel <JobId>
# Example
bin/flink cancel f69fbd0650ae4202b2a46b3ad2089606
# =================== Submit a job (per-job cluster on YARN) ===================
bin/flink run -m yarn-cluster -c <fully-qualified-class-name> -p <parallelism> <jar-file>
# Example
bin/flink run -m yarn-cluster -c com.itfzk.flink.wordcount.KafkaStreamWordCount -p 3 FlinkStudyDemo-1.0-SNAPSHOT-jar-with-dependencies.jar
# =================== Cancel a job ===================
bin/flink cancel <JobId>
# Example
bin/flink cancel f69fbd0650ae4202b2a46b3ad2089606
Create an execution environment, which represents the context in which the current program runs. getExecutionEnvironment decides, based on how the program is launched, what kind of environment to return; it is the most common way to create an execution environment.
// Batch execution environment
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
// Streaming execution environment (most common)
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
Returns a local execution environment; the default parallelism is specified at call time.
LocalStreamEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(1);
Returns a cluster execution environment and submits the Jar to a remote server. The JobManager's hostname/IP and port must be specified, together with the Jar file(s) to run on the cluster.
StreamExecutionEnvironment env = StreamExecutionEnvironment.createRemoteEnvironment("jobmanage-hostname", 6123, "YOURPATH//WordCount.jar");
// map: one-to-one transformation, e.g. parse each String into an Integer
DataStream<Integer> mapStream = dataStream.map(new MapFunction<String, Integer>() {
public Integer map(String value) throws Exception {
return Integer.valueOf(value);
}
});
// flatMap: one input element produces zero or more output elements
DataStream<String> flatMapStream = dataStream.flatMap(new FlatMapFunction<String, String>() {
public void flatMap(String value, Collector<String> out) throws Exception {
for (String word : value.split(" ")) {
out.collect(word);
}
}
});
// filter: keep only the elements for which the predicate returns true
DataStream<String> filterStream = dataStream.filter(new FilterFunction<String>() {
public boolean filter(String value) throws Exception {
return value != null && !value.isEmpty();
}
});
Split
Select
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
String filePath = "E:\\~fzk\\java\\IDEA\\bigdata\\FlinkStudyDemo\\test\\test1";
DataStream<String> inputDataStream = env.readTextFile(filePath);
DataStream<SensorsData> map = inputDataStream.map(new MapFunction<String, SensorsData>() {
public SensorsData map(String value) throws Exception {
String[] splits = value.split(" ");
return new SensorsData(splits[0], new Long(splits[1]), new Double(splits[2]));
}
});
KeyedStream<SensorsData, Tuple> keyedStream = map.keyBy("id");
// split: split the stream by a tag
SplitStream<SensorsData> splitStream = keyedStream.split(new OutputSelector<SensorsData>() {
public Iterable<String> select(SensorsData value) {
return value.getWendu() > 37 ? Collections.singletonList("h") : Collections.singletonList("d");
}
});
// select: choose one or more of the split streams
DataStream<SensorsData> resultDataStream = splitStream.select("d");
env.execute();
}
Connect
CoMap
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
String filePath = "E:\\~fzk\\java\\IDEA\\bigdata\\FlinkStudyDemo\\test\\test1";
DataStream<String> inputDataStream = env.readTextFile(filePath);
DataStream<SensorsData> map = inputDataStream.map(new MapFunction<String, SensorsData>() {
public SensorsData map(String value) throws Exception {
String[] splits = value.split(" ");
return new SensorsData(splits[0], new Long(splits[1]), new Double(splits[2]));
}
});
KeyedStream<SensorsData, Tuple> keyedStream = map.keyBy("id");
SplitStream<SensorsData> splitStream = keyedStream.split(new OutputSelector<SensorsData>() {
public Iterable<String> select(SensorsData value) {
return value.getWendu() > 37 ? Collections.singletonList("high") : Collections.singletonList("low");
}
});
DataStream<SensorsData> highDataStream = splitStream.select("high");
DataStream<SensorsData> lowDataStream = splitStream.select("low");
// connect & CoMapFunction: merge the two streams
ConnectedStreams<SensorsData, SensorsData> connectedStreams = highDataStream.connect(lowDataStream);
/*
new CoMapFunction
First type parameter: element type of the first input stream
Second type parameter: element type of the second input stream
Third type parameter: result type of the merged stream
*/
DataStream<Object> resultDataStream = connectedStreams.map(new CoMapFunction<SensorsData, SensorsData, Object>() {
public Object map1(SensorsData value) throws Exception {
return value;
}
public Object map2(SensorsData value) throws Exception {
return value;
}
});
env.execute();
}
DataStream → DataStream: broadcast each element to every partition
dataStream.broadcast();
stream.join(otherStream)
.where(<KeySelector>)
.equalTo(<KeySelector>)
.window(<WindowAssigner>)
.apply(<JoinFunction>)
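A minimal concrete instance of this template, assuming two Tuple2<String, Long> streams named stream and otherStream (the names and key fields are illustrative, not from the original notes):
// Window join: pair up elements of the two streams that share a key and fall into the same 10 s window
DataStream<String> joined = stream
    .join(otherStream)
    .where(new KeySelector<Tuple2<String, Long>, String>() {
        @Override
        public String getKey(Tuple2<String, Long> value) {
            return value.f0; // key of the first stream
        }
    })
    .equalTo(new KeySelector<Tuple2<String, Long>, String>() {
        @Override
        public String getKey(Tuple2<String, Long> value) {
            return value.f0; // key of the second stream
        }
    })
    .window(TumblingEventTimeWindows.of(Time.seconds(10)))
    .apply(new JoinFunction<Tuple2<String, Long>, Tuple2<String, Long>, String>() {
        @Override
        public String join(Tuple2<String, Long> first, Tuple2<String, Long> second) {
            // One output record per joined pair within the window
            return first.f0 + ": " + (first.f1 + second.f1);
        }
    });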
orangeStream
.keyBy(<KeySelector>)
.intervalJoin(greenStream.keyBy(<KeySelector>))
.between(Time.milliseconds(-2), Time.milliseconds(1))
.process(new ProcessJoinFunction<Integer, Integer, String>(){
@Override
public void processElement(Integer left, Integer right, Context ctx, Collector<String> out) {
out.collect(left + "," + right);
}
});
Interface: AssignerWithPeriodicWatermarks
Set the time characteristic before use.
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// Set the time characteristic
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
// Watermarks are generated periodically; the default interval is 200 ms (set to 5000 ms here)
env.getConfig().setAutoWatermarkInterval(5000);
String filePath = "E:\\~fzk\\java\\IDEA\\bigdata\\FlinkStudyDemo\\test\\test1";
DataStream<String> inputDataStream = env.readTextFile(filePath);
DataStream<SensorsData> map = inputDataStream.map(new MapFunction<String, SensorsData>() {
public SensorsData map(String value) throws Exception {
String[] splits = value.split(" ");
return new SensorsData(splits[0], new Long(splits[1]), new Double(splits[2]));
}
});
// Watermarks for out-of-order events
// Time.milliseconds(1000): the maximum out-of-orderness (allowed delay), 1000 ms
DataStream<SensorsData> eventTimeDataStream = map.assignTimestampsAndWatermarks(new BoundedOutOfOrdernessTimestampExtractor<SensorsData>(Time.milliseconds(1000)) {
@Override
public long extractTimestamp(SensorsData element) {
return element.getTimestamp();
}
});
env.execute();
}
// POJO (getters/setters and constructors omitted)
public class SensorsData {
private String id;
private Long timestamp;
private double wendu;
}
Interface: AssignerWithPunctuatedWatermarks
Set the time characteristic before use.
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// Set the time characteristic
env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);
String filePath = "E:\\~fzk\\java\\IDEA\\bigdata\\FlinkStudyDemo\\test\\test1";
DataStream<String> inputDataStream = env.readTextFile(filePath);
DataStream<SensorsData> map = inputDataStream.map(new MapFunction<String, SensorsData>() {
public SensorsData map(String value) throws Exception {
String[] splits = value.split(" ");
return new SensorsData(splits[0], new Long(splits[1]), new Double(splits[2]));
}
});
// Watermarks for in-order (ascending) event times
DataStream<SensorsData> eventTimeDataStream = map.assignTimestampsAndWatermarks(new AscendingTimestampExtractor<SensorsData>() {
@Override
public long extractAscendingTimestamp(SensorsData element) {
return element.getTimestamp();
}
});
env.execute();
}
// POJO (getters/setters and constructors omitted)
public class SensorsData {
private String id;
private Long timestamp;
private double wendu;
}
// Assign timestamps and watermarks to the data (SensorsData: a custom POJO)
SingleOutputStreamOperator<SensorsData> watermarksData = map.assignTimestampsAndWatermarks(
// Watermark strategy
new WatermarkStrategy<SensorsData>() {
// Watermark generator: generates watermarks from timestamps according to the chosen policy
@Override
public WatermarkGenerator<SensorsData> createWatermarkGenerator(WatermarkGeneratorSupplier.Context context) {
return new WatermarkGenerator<SensorsData>() {
// Highest timestamp seen so far, used when emitting watermarks
private long maxTimestamp = Long.MIN_VALUE;
// Called for every incoming event (element).
// Parameters: the current event, its timestamp, and a WatermarkOutput that may emit watermarks
// A watermark can be emitted here via watermarkOutput.emitWatermark(new Watermark(timestamp))
@Override
public void onEvent(SensorsData sensorsData, long l, WatermarkOutput watermarkOutput) {
maxTimestamp = Math.max(maxTimestamp, l);
watermarkOutput.emitWatermark(new Watermark(maxTimestamp));
}
// Called periodically; may emit a watermark via the WatermarkOutput
// A watermark can be emitted here via watermarkOutput.emitWatermark(new Watermark(timestamp))
// The period is processing time; it can be set with env.getConfig().setAutoWatermarkInterval() and defaults to 200 ms
@Override
public void onPeriodicEmit(WatermarkOutput watermarkOutput) {
watermarkOutput.emitWatermark(new Watermark(maxTimestamp));
}
};
}
// Assign timestamps: extract the timestamp from a field of each stream element and attach it to the element. Timestamp assignment is the basis for watermark generation
@Override
public TimestampAssigner<SensorsData> createTimestampAssigner(TimestampAssignerSupplier.Context context) {
return new TimestampAssigner<SensorsData>() {
@Override
public long extractTimestamp(SensorsData sensorsData, long l) {
return sensorsData.getTimestamp();
}
};
}
}
);
WatermarkStrategy.forBoundedOutOfOrderness
// Watermarks for out-of-order event times (WatermarkStrategy.forBoundedOutOfOrderness)
SingleOutputStreamOperator<SensorsData> watermarksData = map.assignTimestampsAndWatermarks(
WatermarkStrategy.<SensorsData>forBoundedOutOfOrderness(Duration.ofSeconds(10)).withTimestampAssigner(
new SerializableTimestampAssigner<SensorsData>() {
@Override
public long extractTimestamp(SensorsData sensorsData, long l) {
return sensorsData.getTimestamp();
}
}
)
);
The watermark strategy implemented internally by forBoundedOutOfOrderness:
// forBoundedOutOfOrderness
public interface WatermarkStrategy<T> extends TimestampAssignerSupplier<T>, WatermarkGeneratorSupplier<T> {
static <T> WatermarkStrategy<T> forBoundedOutOfOrderness(Duration maxOutOfOrderness) {
return (ctx) -> {
return new BoundedOutOfOrdernessWatermarks(maxOutOfOrderness);
};
}
}
// BoundedOutOfOrdernessWatermarks
public class BoundedOutOfOrdernessWatermarks<T> implements WatermarkGenerator<T> {
private long maxTimestamp;
private final long outOfOrdernessMillis;
public BoundedOutOfOrdernessWatermarks(Duration maxOutOfOrderness) {
Preconditions.checkNotNull(maxOutOfOrderness, "maxOutOfOrderness");
Preconditions.checkArgument(!maxOutOfOrderness.isNegative(), "maxOutOfOrderness cannot be negative");
this.outOfOrdernessMillis = maxOutOfOrderness.toMillis();
this.maxTimestamp = -9223372036854775808L + this.outOfOrdernessMillis + 1L;
}
public void onEvent(T event, long eventTimestamp, WatermarkOutput output) {
this.maxTimestamp = Math.max(this.maxTimestamp, eventTimestamp);
}
public void onPeriodicEmit(WatermarkOutput output) {
output.emitWatermark(new Watermark(this.maxTimestamp - this.outOfOrdernessMillis - 1L));
}
}
WatermarkStrategy.forMonotonousTimestamps
// Watermarks for in-order event times (WatermarkStrategy.forMonotonousTimestamps)
SingleOutputStreamOperator<SensorsData> watermarksData = map.assignTimestampsAndWatermarks(
WatermarkStrategy.<SensorsData>forMonotonousTimestamps().withTimestampAssigner(
new SerializableTimestampAssigner<SensorsData>() {
// Assign timestamps: extract the timestamp from a field of each stream element. Timestamp assignment is the basis for watermark generation
@Override
public long extractTimestamp(SensorsData sensorsData, long l) {
return sensorsData.getTimestamp();
}
}
)
);
The watermark strategy implemented internally by forMonotonousTimestamps:
// forMonotonousTimestamps
public interface WatermarkStrategy<T> extends TimestampAssignerSupplier<T>, WatermarkGeneratorSupplier<T> {
static <T> WatermarkStrategy<T> forMonotonousTimestamps() {
return (ctx) -> {
return new AscendingTimestampsWatermarks();
};
}
}
// AscendingTimestampsWatermarks
public class AscendingTimestampsWatermarks<T> extends BoundedOutOfOrdernessWatermarks<T> {
public AscendingTimestampsWatermarks() {
super(Duration.ofMillis(0L));
}
}
// BoundedOutOfOrdernessWatermarks
public class BoundedOutOfOrdernessWatermarks<T> implements WatermarkGenerator<T> {
private long maxTimestamp;
private final long outOfOrdernessMillis;
public BoundedOutOfOrdernessWatermarks(Duration maxOutOfOrderness) {
Preconditions.checkNotNull(maxOutOfOrderness, "maxOutOfOrderness");
Preconditions.checkArgument(!maxOutOfOrderness.isNegative(), "maxOutOfOrderness cannot be negative");
this.outOfOrdernessMillis = maxOutOfOrderness.toMillis();
this.maxTimestamp = -9223372036854775808L + this.outOfOrdernessMillis + 1L;
}
public void onEvent(T event, long eventTimestamp, WatermarkOutput output) {
this.maxTimestamp = Math.max(this.maxTimestamp, eventTimestamp);
}
public void onPeriodicEmit(WatermarkOutput output) {
output.emitWatermark(new Watermark(this.maxTimestamp - this.outOfOrdernessMillis - 1L));
}
}
collectWithTimestamp(element, timestamp) - emitting timestamps (and watermarks) from a custom source
⚠️ Note: once a custom source emits timestamps/watermarks itself, do not also call assignTimestampsAndWatermarks on the resulting stream. Generating watermarks inside the custom source and generating them with assignTimestampsAndWatermarks are mutually exclusive; use one or the other.
DataStreamSource<String> sourceData = env.addSource(new SourceFunction<String>() {
private boolean flag = true;
@Override
public void run(SourceContext<String> sourceContext) throws Exception {
while (flag) {
// Emit an element together with its event timestamp (placeholder values here)
sourceContext.collectWithTimestamp("data", System.currentTimeMillis());
}
}
@Override
public void cancel() {
flag = false;
}
});
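For completeness, a watermark itself can be emitted from the same run() loop via SourceContext.emitWatermark. A minimal sketch, where the element value, sleep interval, and timestamps are placeholders:
DataStreamSource<String> sourceWithWatermarks = env.addSource(new SourceFunction<String>() {
    private boolean flag = true;
    @Override
    public void run(SourceContext<String> ctx) throws Exception {
        while (flag) {
            long ts = System.currentTimeMillis();
            // Emit the element with its timestamp ...
            ctx.collectWithTimestamp("data", ts);
            // ... and emit a watermark so downstream event-time operators can make progress
            ctx.emitWatermark(new Watermark(ts));
            Thread.sleep(1000);
        }
    }
    @Override
    public void cancel() {
        flag = false;
    }
});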
// Tumbling processing-time window of 10 s
// TumblingProcessingTimeWindows.of()
map.keyBy(...)
.window(TumblingProcessingTimeWindows.of(Time.seconds(10)))
.reduce(...)
// Sliding processing-time window: size 10 s, slide 2 s
// SlidingProcessingTimeWindows.of()
map.keyBy(...)
.window(SlidingProcessingTimeWindows.of(Time.seconds(10), Time.seconds(2)))
.reduce(...)
// Processing-time session window with a 4 s gap: 4 s without data ends the session
// ProcessingTimeSessionWindows.withGap()
map.keyBy(...)
.window(ProcessingTimeSessionWindows.withGap(Time.seconds(4)))
.reduce(...)
// Event timestamps and watermarks must be assigned first: assignTimestampsAndWatermarks
// Tumbling event-time window of 10 s
// TumblingEventTimeWindows.of()
map.keyBy(...)
.window(TumblingEventTimeWindows.of(Time.seconds(10)))
.reduce(...)
// Event timestamps and watermarks must be assigned first: assignTimestampsAndWatermarks
// Sliding event-time window: size 10 s, slide 2 s
// SlidingEventTimeWindows.of()
map.keyBy(...)
.window(SlidingEventTimeWindows.of(Time.seconds(10), Time.seconds(2)))
.reduce(...)
// Event timestamps and watermarks must be assigned first: assignTimestampsAndWatermarks
// Event-time session window with a 4 s gap: 4 s without data ends the session
// EventTimeSessionWindows.withGap()
map.keyBy(...)
.window(EventTimeSessionWindows.withGap(Time.seconds(4)))
.reduce(...)
// Tumbling count window: countWindow with one argument
// Window of 5 elements
map.keyBy(...)
.countWindow(5)
.reduce(...)
// Sliding count window: countWindow with two arguments
// Window size 5 elements, slide 2 elements
map.keyBy(...)
.countWindow(5, 2)
.reduce(...)
sourceData.keyBy(data -> data.f0)
.window(TumblingProcessingTimeWindows.of(Time.seconds(10)))
.reduce(new ReduceFunction<Tuple2<String, Long>>() {
@Override
public Tuple2<String, Long> reduce(Tuple2<String, Long> data, Tuple2<String, Long> t1) throws Exception {
return new Tuple2<>(data.f0, data.f1 + t1.f1);
}
}).print("out");
AggregateFunction interface
// AggregateFunction: input type (IN), accumulator type (ACC), and output type (OUT)
public interface AggregateFunction<IN, ACC, OUT> extends Function, Serializable {
// Create an accumulator, i.e. the initial state of the aggregation; called exactly once per aggregation task
ACC createAccumulator();
// Add an input element to the accumulator, i.e. fold the new record into the current aggregation state. Parameters: the newly arrived value and the current accumulator; returns the updated accumulator. Called for every incoming record
ACC add(IN var1, ACC var2);
// Extract the final result from the accumulator. Several partial states can be kept in the accumulator and combined only here; for example, for an average, keep sum and count in the accumulator and divide them in this method. Called only when the window emits its result
OUT getResult(ACC var1);
// Merge two accumulators and return the merged state as a new accumulator. Called only when windows need to be merged; the most common merging-window case is session windows
ACC merge(ACC var1, ACC var2);
}
Example:
sourceData.keyBy(data -> true)
.window(TumblingProcessingTimeWindows.of(Time.seconds(4)))
.aggregate(new AggregateFunction<Tuple2<String, Long>, Tuple2<Long, HashSet<String>>, Double>() {
@Override
public Tuple2<Long, HashSet<String>> createAccumulator() {
return Tuple2.of(0L, new HashSet<>());
}
@Override
public Tuple2<Long, HashSet<String>> add(Tuple2<String, Long> inData, Tuple2<Long, HashSet<String>> accData) {
accData.f0 += inData.f1;
accData.f1.add(inData.f0);
return Tuple2.of(accData.f0, accData.f1);
}
@Override
public Double getResult(Tuple2<Long, HashSet<String>> accData) {
return (double) accData.f0 / accData.f1.size();
}
@Override
public Tuple2<Long, HashSet<String>> merge(Tuple2<Long, HashSet<String>> mergeData1, Tuple2<Long, HashSet<String>> mergeData2) {
mergeData1.f1.addAll(mergeData2.f1);
return Tuple2.of(mergeData1.f0 + mergeData2.f0, mergeData1.f1);
}
}).print("out");
sourceData.keyBy(data -> true)
.window(TumblingProcessingTimeWindows.of(Time.seconds(4)))
.process(new ProcessWindowFunction<Tuple2<String, Long>, String, Boolean, TimeWindow>() {
/**
* @param aBoolean the key produced by keyBy
* @param context window context information
* @param iterable the elements buffered in the window
* @param collector collector for emitting results
* @throws Exception
*/
@Override
public void process(Boolean aBoolean, ProcessWindowFunction<Tuple2<String, Long>, String, Boolean, TimeWindow>.Context context, Iterable<Tuple2<String, Long>> iterable, Collector<String> collector) throws Exception {
// TODO
}
});
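For reference, a possible body for process() above, just to illustrate the Iterable of buffered elements and the window Context (the aggregation itself is only an example, not from the original notes):
@Override
public void process(Boolean aBoolean, ProcessWindowFunction<Tuple2<String, Long>, String, Boolean, TimeWindow>.Context context, Iterable<Tuple2<String, Long>> iterable, Collector<String> collector) throws Exception {
    // Count the elements buffered for this window
    long count = 0L;
    for (Tuple2<String, Long> element : iterable) {
        count++;
    }
    // The Context exposes window metadata such as its start and end timestamps
    long windowEnd = context.window().getEnd();
    collector.collect("window ending at " + windowEnd + " contained " + count + " elements");
}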
Keyed State can be used to implement the following requirement: monitor sensor temperature readings and raise an alert whenever two consecutive readings differ by more than 10 degrees.
public class Test {
public static void main(String[] args) throws Exception {
// Create the execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
DataStream<String> inputDataStream = env.socketTextStream("localhost", 9999);
SingleOutputStreamOperator<MyBean> myBeanDataStream = inputDataStream.map(new MapFunction<String, MyBean>() {
@Override
public MyBean map(String s) throws Exception {
String[] split = s.split(" ");
return new MyBean(split[0], Double.valueOf(split[1]));
}
});
SingleOutputStreamOperator<Tuple3<String, Double, Double>> resultDataStream = myBeanDataStream
.keyBy((KeySelector<MyBean, String>) data -> data.getId())
.flatMap(new MyRichFlatMapFunction(10.0));
resultDataStream.print();
env.execute();
}
}
// Rich function that keeps the previous temperature as keyed state
public class MyRichFlatMapFunction extends RichFlatMapFunction<MyBean, Tuple3<String, Double, Double>> {
private ValueState<Double> myValueState;
private Double abs;
public MyRichFlatMapFunction(Double abs) {
this.abs = abs;
}
@Override
public void open(Configuration parameters) throws Exception {
// Create the state handle
myValueState = getRuntimeContext().getState(new ValueStateDescriptor<Double>("my-flatmap", Double.class));
}
@Override
public void flatMap(MyBean myBean, Collector<Tuple3<String, Double, Double>> collector) throws Exception {
// Read the previous temperature from state
Double lastWendu = myValueState.value();
if(lastWendu != null){
double absWendu = Math.abs(myBean.getWendu() - lastWendu);
if (absWendu > abs){
collector.collect(new Tuple3<>(myBean.getId(), lastWendu, myBean.getWendu()));
}
}
// Update the state value
myValueState.update(myBean.getWendu());
}
@Override
public void close() throws Exception {
// Clear the state value
myValueState.clear();
}
}
// POJO (getters/setters and constructors omitted)
public class MyBean {
private String id;
private Double wendu;
}
private ValueState<Long> valueState;
private ListState<Long> listState;
private MapState<Long, Long> mapState;
private ReducingState<Long> reducingState;
private AggregatingState<Long, Long> aggregatingState;
@Override
public void open(Configuration parameters) throws Exception {
valueState = getRuntimeContext().getState(
new ValueStateDescriptor<Long>(
"value-state",
Long.class
)
);
listState = getRuntimeContext().getListState(
new ListStateDescriptor<Long>(
"list-state",
Long.class
)
);
mapState = getRuntimeContext().getMapState(
new MapStateDescriptor<Long, Long>(
"map-state",
Long.class,
Long.class
)
);
reducingState = getRuntimeContext().getReducingState(
new ReducingStateDescriptor<Long>(
"reduce-state",
new ReduceFunction<Long>() {
@Override
public Long reduce(Long aLong, Long t1) throws Exception {
return aLong + t1;
}
},
Long.class
)
);
aggregatingState = getRuntimeContext().getAggregatingState(
new AggregatingStateDescriptor<Long, Long, Long>(
"agg-state",
new AggregateFunction<Long, Long, Long>() {
@Override
public Long createAccumulator() {
return 0L;
}
@Override
public Long add(Long aLong, Long aLong2) {
return aLong + aLong2;
}
@Override
public Long getResult(Long aLong) {
return aLong;
}
@Override
public Long merge(Long aLong, Long acc1) {
return null;
}
},
Long.class
)
);
}
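For reference, a sketch of how these handles are typically read and updated inside a processing method of the same rich function (the element type Long and the method shown are only illustrative):
public void processValue(Long in, Collector<Long> out) throws Exception {
    // ValueState: read the previous value and overwrite it
    Long previous = valueState.value();
    if (previous != null) {
        out.collect(previous);
    }
    valueState.update(in);
    // ListState: append a value and iterate over all stored values
    listState.add(in);
    for (Long v : listState.get()) {
        out.collect(v);
    }
    // MapState: point lookups and updates within the keyed state
    Long seen = mapState.get(in);
    mapState.put(in, seen == null ? 1L : seen + 1L);
    // ReducingState / AggregatingState: add() folds the new value into the stored aggregate
    reducingState.add(in);
    aggregatingState.add(in);
    out.collect(reducingState.get());
    out.collect(aggregatingState.get());
}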
In practice, many kinds of state keep growing over time; without a limit they will eventually exhaust the available storage.
Notes:
To configure a TTL for state, create a StateTtlConfig object and pass it to the state descriptor's .enableTimeToLive() method.
Default TTL configuration
public Builder(@Nonnull Time ttl) {
this.updateType = StateTtlConfig.UpdateType.OnCreateAndWrite;
this.stateVisibility = StateTtlConfig.StateVisibility.NeverReturnExpired;
this.ttlTimeCharacteristic = StateTtlConfig.TtlTimeCharacteristic.ProcessingTime;
this.isCleanupInBackground = true;
this.strategies = new EnumMap(StateTtlConfig.CleanupStrategies.Strategies.class);
this.ttl = ttl;
}
Usage example:
public static class TtlStateProcess extends ProcessFunction<Tuple2<String, Long>, String> {
private ValueState<String> valueState;
@Override
public void open(Configuration parameters) throws Exception {
// TTL configuration
// Expiration time: 1 h
// Update type: OnCreateAndWrite
// State visibility: NeverReturnExpired
StateTtlConfig stateTtlConfig = StateTtlConfig
.newBuilder(Time.hours(1L))
.setUpdateType(StateTtlConfig.UpdateType.OnCreateAndWrite)
.setStateVisibility(StateTtlConfig.StateVisibility.NeverReturnExpired)
.build();
// Create the state descriptor
ValueStateDescriptor<String> valueStateDescriptor = new ValueStateDescriptor<String>(
"value-state",
String.class
);
// Attach the TTL configuration to the state descriptor
valueStateDescriptor.enableTimeToLive(stateTtlConfig);
// Register the state with the runtime context
valueState = getRuntimeContext().getState(valueStateDescriptor);
}
@Override
public void processElement(Tuple2<String, Long> data, ProcessFunction<Tuple2<String, Long>, String>.Context context, Collector<String> collector) throws Exception {
}
}
Operator State is state defined on a single parallel instance of an operator; its scope is limited to that operator task. Operator state is independent of the data's key, so records with different keys access the same Operator State as long as they are processed by the same parallel subtask.
Broadcast state is easy to understand: the state is broadcast, so every parallel subtask holds an identical copy; when the parallelism changes, the state can simply be copied to the new subtasks.
public class BroadcastStateProcessFunction {
private static MapStateDescriptor<String, String> mapStateDescriptor = new MapStateDescriptor<>("rule-state", Types.STRING, Types.STRING);
public SingleOutputStreamOperator<String> processFunction(
SingleOutputStreamOperator<Tuple2<String, Long>> sourceData,
SingleOutputStreamOperator<String> ruleData){
// 1. Connect the main stream to the broadcast stream ( .connect(BroadcastStream broadcastStream) )
return sourceData
.connect(ruleData.broadcast(mapStateDescriptor))
.process(new BroadcastStateProcessFunction.BroadcastStateProcess());
}
// 2. Extend BroadcastProcessFunction and override processElement / processBroadcastElement
private static class BroadcastStateProcess extends BroadcastProcessFunction<Tuple2<String, Long>, String, String> {
@Override
public void processElement(Tuple2<String, Long> stringLongTuple2, BroadcastProcessFunction<Tuple2<String, Long>, String, String>.ReadOnlyContext readOnlyContext, Collector<String> collector) throws Exception {
collector.collect(stringLongTuple2.f0);
}
// 3. Implement processBroadcastElement and use the broadcast state
@Override
public void processBroadcastElement(String s, BroadcastProcessFunction<Tuple2<String, Long>, String, String>.Context context, Collector<String> collector) throws Exception {
// Write to the broadcast state
context.getBroadcastState(mapStateDescriptor).put("fzk", s);
// Read from the broadcast state
String data = context.getBroadcastState(mapStateDescriptor).get("fzk");
}
}
}
KeyedProcessFunction operates on a KeyedStream. It processes every element of the stream and can emit zero, one, or more elements. All process functions extend the RichFunction interface, so they provide open(), close(), getRuntimeContext(), and so on.
KeyedProcessFunction
class MyKeyedProcessFunction extends KeyedProcessFunction<Tuple, MyBean, MyBean> {
@Override
public void open(Configuration parameters) throws Exception {
}
@Override
public void processElement(MyBean myBean, KeyedProcessFunction<Tuple, MyBean, MyBean>.Context context, Collector<MyBean> collector) throws Exception {
collector.collect(myBean);
}
@Override
public void onTimer(long timestamp, KeyedProcessFunction<Tuple, MyBean, MyBean>.OnTimerContext ctx, Collector<MyBean> out) throws Exception {
}
@Override
public void close() throws Exception {
}
}
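A minimal sketch of applying this function to a keyed stream (the stream name myBeanDataStream and the key field "id" are illustrative):
// Key the stream of MyBean elements and apply the KeyedProcessFunction
SingleOutputStreamOperator<MyBean> processed = myBeanDataStream
    .keyBy("id")
    .process(new MyKeyedProcessFunction());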
The TimerService held by Context and OnTimerContext provides the following methods:
class MyKeyedProcessFunction extends KeyedProcessFunction<Tuple, MyBean, MyBean> {
@Override
public void processElement(MyBean myBean, KeyedProcessFunction<Tuple, MyBean, MyBean>.Context context, Collector<MyBean> collector) throws Exception {
long currentProcessingTime = context.timerService().currentProcessingTime();
long currentWatermark = context.timerService().currentWatermark();
context.timerService().registerProcessingTimeTimer(10000L);
context.timerService().registerEventTimeTimer(10000L);
context.timerService().deleteProcessingTimeTimer(10000L);
context.timerService().deleteEventTimeTimer(10000L);
}
}
The side-output feature of process functions can produce additional streams, and those streams may have different data types from the main stream. A side output is declared as an OutputTag[X] object, where X is the type of the side-output stream. A process function emits records to one or more side outputs through its Context object.
Example: monitor sensor temperature readings and route readings above 30 degrees to a side output (readings at or below 30 stay on the main stream).
public class Test {
public static void main(String[] args) throws Exception {
// Create the execution environment
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
DataStream<String> inputDataStream = env.socketTextStream("localhost", 9999);
SingleOutputStreamOperator<Tuple2<String, Double>> myBeanDataStream = inputDataStream.map(new MapFunction<String, Tuple2<String, Double>>() {
@Override
public Tuple2<String, Double> map(String s) throws Exception {
String[] split = s.split(" ");
return new Tuple2<String, Double>(split[0], Double.valueOf(split[1]));
}
});
// Define the side-output tag
OutputTag<Tuple2<String, Double>> outputTag = new OutputTag<Tuple2<String, Double>>("high-output") {};
// Apply the custom ProcessFunction
SingleOutputStreamOperator<Tuple2<String, Double>> resultDataStream = myBeanDataStream
.process(new MyProcessFunction(30.0, outputTag));
resultDataStream.print("low-wendu");
// Get the side-output stream and print it
resultDataStream.getSideOutput(outputTag).print("high-wendu");
env.execute();
}
private static class MyProcessFunction extends ProcessFunction<Tuple2<String, Double>, Tuple2<String, Double>> {
private Double wenduLimit;
private OutputTag<Tuple2<String, Double>> outputTag;
// Constructor
public MyProcessFunction(Double wenduLimit, OutputTag<Tuple2<String, Double>> outputTag) {
this.wenduLimit = wenduLimit;
this.outputTag = outputTag;
}
@Override
public void processElement(Tuple2<String, Double> myBean, ProcessFunction<Tuple2<String, Double>, Tuple2<String, Double>>.Context context, Collector<Tuple2<String, Double>> collector) throws Exception {
// If the temperature exceeds the limit, send the record to the side output; otherwise emit it on the main stream
if(myBean.f1 > wenduLimit){
context.output(outputTag, myBean);
}else {
collector.collect(myBean);
}
}
}
}
Checkpointing mode (CheckpointingMode)
Checkpoint timeout (checkpointTimeout)
Minimum pause between checkpoints (minPauseBetweenCheckpoints)
Maximum number of concurrent checkpoints (maxConcurrentCheckpoints)
Externalized checkpoint retention (enableExternalizedCheckpoints)
Whether a checkpoint failure fails the whole job (failOnCheckpointingErrors)
Unaligned checkpoints (enableUnalignedCheckpoints)
public class CheckpointConfig implements Serializable {
private static final long serialVersionUID = -750378776078908147L;
private static final Logger LOG = LoggerFactory.getLogger(CheckpointConfig.class);
public static final CheckpointingMode DEFAULT_MODE;
public static final long DEFAULT_TIMEOUT = 600000L;
public static final long DEFAULT_MIN_PAUSE_BETWEEN_CHECKPOINTS = 0L;
public static final int DEFAULT_MAX_CONCURRENT_CHECKPOINTS = 1;
public static final int UNDEFINED_TOLERABLE_CHECKPOINT_NUMBER = -1;
private CheckpointingMode checkpointingMode;
private long checkpointInterval;
private long checkpointTimeout;
private long minPauseBetweenCheckpoints;
private int maxConcurrentCheckpoints;
private boolean forceCheckpointing;
private boolean forceUnalignedCheckpoints;
private boolean unalignedCheckpointsEnabled;
private Duration alignmentTimeout;
private boolean approximateLocalRecovery;
private CheckpointConfig.ExternalizedCheckpointCleanup externalizedCheckpointCleanup;
/** @deprecated */
@Deprecated
private boolean failOnCheckpointingErrors;
private boolean preferCheckpointForRecovery;
private int tolerableCheckpointFailureNumber;
private transient CheckpointStorage storage;
}
Keeps state in memory.
Concretely, the hashmap state backend keeps state internally as plain objects on the TaskManager's JVM heap.
HashMapStateBackend keeps all local state in memory, which gives the fastest reads and writes and the best processing performance; the cost is memory consumption.
env.setStateBackend(new HashMapStateBackend());
RocksDB is an embedded key-value store that persists data to the local disk.
With EmbeddedRocksDBStateBackend configured, all in-flight state is stored in a RocksDB database, which by default lives in the TaskManager's local data directory.
Data is stored as serialized byte arrays, so reads and writes require serialization/deserialization, which makes state access somewhat slower.
env.setStateBackend(new EmbeddedRocksDBStateBackend());
Enabling checkpoints
// Enable checkpointing with a 60 s interval
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.enableCheckpointing(60000);
Checkpoint storage
// Store checkpoints in the JobManager heap
env.getCheckpointConfig().setCheckpointStorage(new JobManagerCheckpointStorage());
// Store checkpoints in a file system
env.getCheckpointConfig().setCheckpointStorage(new FileSystemCheckpointStorage("hdfs://namenode:8020/flink/checkpoints"));
Checkpoint configuration: see the checkpoint options described above for what can be tuned; a minimal sketch of setting them follows.
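A sketch of applying the options listed earlier on the CheckpointConfig (the values are illustrative only):
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// Enable checkpointing with a 60 s interval and exactly-once mode
env.enableCheckpointing(60000, CheckpointingMode.EXACTLY_ONCE);
CheckpointConfig config = env.getCheckpointConfig();
// Fail a checkpoint that takes longer than 10 minutes
config.setCheckpointTimeout(600000);
// Leave at least 500 ms between the end of one checkpoint and the start of the next
config.setMinPauseBetweenCheckpoints(500);
// Allow only one checkpoint in flight at a time
config.setMaxConcurrentCheckpoints(1);
// Keep externalized checkpoints when the job is cancelled
config.enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
// Tolerate no checkpoint failures (a failed checkpoint fails the job)
config.setTolerableCheckpointFailureNumber(0);
// Use unaligned checkpoints to reduce alignment time under backpressure
config.enableUnalignedCheckpoints();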
The snapshot mechanism that persists state is called a checkpoint (Checkpoint). When using operator state, you therefore define how the state takes part in checkpoints by implementing the CheckpointedFunction interface and its two methods:
initializeState()
Method: defines the initialization logic as well as the recovery logic.
snapshotState()
Method: defines how the state is snapshotted when a checkpoint is taken.
public interface CheckpointedFunction {
void snapshotState(FunctionSnapshotContext var1) throws Exception;
void initializeState(FunctionInitializationContext var1) throws Exception;
}
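A minimal sketch of a CheckpointedFunction that buffers elements in operator list state (the class and variable names are illustrative, not from the original notes):
// Sketch: a sink-like function that buffers elements and checkpoints them as operator ListState
public class BufferingFunction implements SinkFunction<Tuple2<String, Long>>, CheckpointedFunction {
    // Operator state handle, created/restored in initializeState()
    private transient ListState<Tuple2<String, Long>> checkpointedState;
    // Local working buffer
    private final List<Tuple2<String, Long>> buffer = new ArrayList<>();
    @Override
    public void invoke(Tuple2<String, Long> value, Context context) {
        buffer.add(value);
    }
    @Override
    public void snapshotState(FunctionSnapshotContext context) throws Exception {
        // Copy the current buffer into the operator state for this checkpoint
        checkpointedState.clear();
        for (Tuple2<String, Long> element : buffer) {
            checkpointedState.add(element);
        }
    }
    @Override
    public void initializeState(FunctionInitializationContext context) throws Exception {
        // Create (or restore) the operator list state
        checkpointedState = context.getOperatorStateStore().getListState(
                new ListStateDescriptor<>("buffered-elements",
                        TypeInformation.of(new TypeHint<Tuple2<String, Long>>() {})));
        // On recovery, reload the buffer from the restored state
        if (context.isRestored()) {
            for (Tuple2<String, Long> element : checkpointedState.get()) {
                buffer.add(element);
            }
        }
    }
}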
${jobId}: the ID of the job for which to take the savepoint
targetDirectory: optional, the directory where the savepoint is stored. The default savepoint path can be set in flink-conf.yaml via state.savepoints.dir, e.g. state.savepoints.dir: hdfs:///flink/savepoints
Take a savepoint without stopping the job
bin/flink savepoint ${jobId} [:targetDirectory]
Stop the job with a savepoint
bin/flink stop --savepointPath [:targetDirectory] :jobId
${savepointPath}: the path of the savepoint to restore from
runArgs: optional, arguments for the Flink job
bin/flink run -s ${savepointPath} [:runArgs]
<properties>
    <flink.version>1.13.0</flink.version>
    <java.version>1.8</java.version>
    <scala.binary.version>2.12</scala.binary.version>
</properties>
<dependencies>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-clients_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-sql-connector-kafka_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-connector-kafka_${scala.binary.version}</artifactId>
        <version>${flink.version}</version>
        <exclusions>
            <exclusion>
                <artifactId>kafka-clients</artifactId>
                <groupId>org.apache.kafka</groupId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-streaming-java_${scala.binary.version}</artifactId>
        <exclusions>
            <exclusion>
                <artifactId>slf4j-api</artifactId>
                <groupId>org.slf4j</groupId>
            </exclusion>
            <exclusion>
                <artifactId>commons-collections</artifactId>
                <groupId>commons-collections</groupId>
            </exclusion>
        </exclusions>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-api</artifactId>
        <version>1.7.25</version>
    </dependency>
    <dependency>
        <groupId>org.slf4j</groupId>
        <artifactId>slf4j-simple</artifactId>
        <version>1.7.25</version>
    </dependency>
    <dependency>
        <groupId>mysql</groupId>
        <artifactId>mysql-connector-java</artifactId>
        <version>8.0.13</version>
    </dependency>
    <dependency>
        <groupId>com.google.guava</groupId>
        <artifactId>guava</artifactId>
        <version>18.0</version>
    </dependency>
    <dependency>
        <groupId>com.google.code.gson</groupId>
        <artifactId>gson</artifactId>
        <version>2.8.5</version>
    </dependency>
    <dependency>
        <groupId>com.fasterxml.jackson.core</groupId>
        <artifactId>jackson-databind</artifactId>
        <version>2.9.4</version>
    </dependency>
    <dependency>
        <groupId>org.apache.httpcomponents</groupId>
        <artifactId>httpclient</artifactId>
        <version>4.5.2</version>
        <exclusions>
            <exclusion>
                <artifactId>commons-logging</artifactId>
                <groupId>commons-logging</groupId>
            </exclusion>
        </exclusions>
    </dependency>
    <dependency>
        <groupId>org.projectlombok</groupId>
        <artifactId>lombok</artifactId>
        <version>1.18.4</version>
    </dependency>
    <dependency>
        <groupId>com.jayway.jsonpath</groupId>
        <artifactId>json-path</artifactId>
        <version>2.4.0</version>
        <scope>compile</scope>
    </dependency>
    <dependency>
        <groupId>joda-time</groupId>
        <artifactId>joda-time</artifactId>
        <version>2.9.9</version>
    </dependency>
    <dependency>
        <groupId>junit</groupId>
        <artifactId>junit</artifactId>
        <version>4.12</version>
        <scope>test</scope>
    </dependency>
    <dependency>
        <groupId>asm</groupId>
        <artifactId>asm</artifactId>
        <version>3.3.1</version>
    </dependency>
    <dependency>
        <groupId>asm</groupId>
        <artifactId>asm-commons</artifactId>
        <version>3.3.1</version>
    </dependency>
    <dependency>
        <groupId>asm</groupId>
        <artifactId>asm-util</artifactId>
        <version>3.3.1</version>
    </dependency>
    <dependency>
        <groupId>cglib</groupId>
        <artifactId>cglib-nodep</artifactId>
        <version>2.2.2</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flink</groupId>
        <artifactId>flink-statebackend-rocksdb_${scala.binary.version}</artifactId>
        <scope>compile</scope>
        <exclusions>
            <exclusion>
                <artifactId>slf4j-api</artifactId>
                <groupId>org.slf4j</groupId>
            </exclusion>
        </exclusions>
        <version>${flink.version}</version>
    </dependency>
    <dependency>
        <groupId>com.beust</groupId>
        <artifactId>jcommander</artifactId>
        <version>1.72</version>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>druid</artifactId>
        <version>1.1.21</version>
    </dependency>
    <dependency>
        <groupId>commons-dbutils</groupId>
        <artifactId>commons-dbutils</artifactId>
        <version>1.7</version>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.72</version>
    </dependency>
    <dependency>
        <groupId>org.apache.kafka</groupId>
        <artifactId>kafka_${scala.binary.version}</artifactId>
        <version>${kafka.version}</version>
        <scope>compile</scope>
    </dependency>
    <dependency>
        <groupId>org.apache.commons</groupId>
        <artifactId>commons-collections4</artifactId>
        <version>4.1</version>
    </dependency>
    <dependency>
        <groupId>com.github.oshi</groupId>
        <artifactId>oshi-core</artifactId>
        <version>3.5.0</version>
    </dependency>
</dependencies>