上篇文章简单讲了有状态算子的使用。本文将着重说明广播状态(Broadcast State)的使用。广播状态本质上还是上篇文章中介绍的基本状态,只是将其进行了广播。
flink的join算子也是基于广播状态来实现的。
这里使用Flink官网提供的需求:
为了完成上述需求,我们将Rule流广播出去,然后使用connect算子将两个流关联起来,然后在process算子里面指定匹配检测逻辑。
// Descriptor of the broadcast state: maps a rule name (String) to its Rule.
MapStateDescriptor<String, Rule> ruleStateDescriptor = new MapStateDescriptor<>(
        "RulesBroadcastState",
        BasicTypeInfo.STRING_TYPE_INFO,
        TypeInformation.of(new TypeHint<Rule>() {}));

// broadcast the rules stream and create the broadcast state
BroadcastStream<Rule> ruleBroadcastStream = ruleStream
        .broadcast(ruleStateDescriptor);

DataStream<String> output = colorPartitionedStream
        .connect(ruleBroadcastStream)
        .process(
                // type arguments in our KeyedBroadcastProcessFunction represent:
                //   1. the key of the keyed stream
                //   2. the type of elements in the non-broadcast side
                //   3. the type of elements in the broadcast side
                //   4. the type of the result, here a string
                new KeyedBroadcastProcessFunction<Color, Item, Rule, String>() {
                    // my matching logic
                }
        );
/**
 * Element of the non-broadcast stream: a {@link Color} paired with a {@link Shape}.
 *
 * <p>Both fields are {@code null} until the corresponding setter has been called.
 */
class Item {
    Color color;
    Shape shape;

    /** @return the color of this item, or {@code null} if none was set */
    public Color getColor() {
        return color;
    }

    /** @param color the color to assign to this item */
    public void setColor(Color color) {
        this.color = color;
    }

    /** @return the shape of this item, or {@code null} if none was set */
    public Shape getShape() {
        return shape;
    }

    /** @param shape the shape to assign to this item */
    public void setShape(Shape shape) {
        this.shape = shape;
    }

    /**
     * Renders the item as {@code Item{color=<col>, shape=<shp>}}.
     *
     * <p>An unset color or shape is rendered as {@code null} instead of throwing
     * a {@link NullPointerException}, so the item can always be logged or printed.
     */
    @Override
    public String toString() {
        final String col = (color == null) ? null : color.getCol();
        final String shp = (shape == null) ? null : shape.getShp();
        return "Item{" +
                "color=" + col +
                ", shape=" + shp +
                '}';
    }
}
class Color {
String col;
public String getCol() {
return col;
}
public void setCol(String col) {
this.col = col;
}
@Override
public String toString() {
return "Color{" +
"col='" + col + '\'' +
'}';
}
}
class Shape {
String shp;
public Shape(String shp) {
this.shp = shp;
}
public String getShp() {
return shp;
}
public void setShp(String shp) {
this.shp = shp;
}
@Override
public String toString() {
return "Shape{" +
"shp='" + shp + '\'' +
'}';
}
}
/**
 * A named rule made of two shapes: the {@code first} shape and the {@code second} shape.
 *
 * <p>Plain mutable bean; all fields are {@code null} until set.
 */
class Rule {
    /** Name of the rule. */
    String name;
    /** First shape of the pair. */
    Shape first;
    /** Second shape of the pair. */
    Shape second;

    // ---- name ----

    public void setName(final String name) {
        this.name = name;
    }

    public String getName() {
        return this.name;
    }

    // ---- first shape ----

    public void setFirst(final Shape first) {
        this.first = first;
    }

    public Shape getFirst() {
        return this.first;
    }

    // ---- second shape ----

    public void setSecond(final Shape second) {
        this.second = second;
    }

    public Shape getSecond() {
        return this.second;
    }
}
一个数据源生成Item对象的流,一个数据源生成Rule对象的流
package it.kenn.state;
import org.apache.flink.api.common.state.MapState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.typeutils.ListTypeInfo;
import org.apache.flink.streaming.api.datastream.BroadcastStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.KeyedBroadcastProcessFunction;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.util.Collector;
import java.util.*;
/**
* 广播流与广播状态
*/
public class BroadcastStateDemo {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
DataStreamSource- itemStream = env.addSource(new ItemsSource());
KeyedStream
- colorPartitionedStream = itemStream.keyBy(new KeySelector
- () {
@Override
public Color getKey(Item item) throws Exception {
return item.getColor();
}
});
DataStreamSource
ruleStream = env.addSource(new RuleSource());
//定义MapState,将来将其广播出去
MapStateDescriptor ruleStateDescriptor = new MapStateDescriptor<>(
"RulesBroadcastState",
BasicTypeInfo.STRING_TYPE_INFO,
TypeInformation.of(new TypeHint() {
}));
//这里是将ruleStream广播出去,并将MapState传入ruleStream中
BroadcastStream ruleBroadcastStream = ruleStream.broadcast(ruleStateDescriptor);
SingleOutputStreamOperator resStream = colorPartitionedStream
//连接两个流
.connect(ruleBroadcastStream)
//四个泛型分别是key,in1,in2,out,可以看源码能看出来
.process(new KeyedBroadcastProcessFunction() {
// 再定义一个MapState,存储规则中 的第一个元素并等待第二个元素的到来
//todo 这里为什么存了一个列表还是没有太明白
//we keep a list as we may have many first elements waiting
//想明白了。因为rule也是一个流
private final MapStateDescriptor> mapStateDesc =
new MapStateDescriptor<>(
"items",
BasicTypeInfo.STRING_TYPE_INFO,
new ListTypeInfo<>(Item.class));
// 和上面定义的ruleStateDescriptor一模一样
private final MapStateDescriptor ruleStateDescriptor =
new MapStateDescriptor<>(
"RulesBroadcastState",
BasicTypeInfo.STRING_TYPE_INFO,
TypeInformation.of(new TypeHint() {}));
@Override
public void processElement(Item value, ReadOnlyContext ctx, Collector out) throws Exception {
final MapState> state = getRuntimeContext().getMapState(mapStateDesc);
final Shape shape = value.getShape();
for (Map.Entry entry : ctx.getBroadcastState(ruleStateDescriptor).immutableEntries()) {
final String ruleName = entry.getKey();
final Rule rule = entry.getValue();
List- stored = state.get(ruleName);
if (stored == null) {
stored = new ArrayList<>();
}
if (shape.getShp().equals(rule.second .getShp()) && !stored.isEmpty()) {
for (Item i : stored) {
out.collect("MATCH: " + i + " - " + value);
}
stored.clear();
}
// there is no else{} to cover if rule.first == rule.second
if (shape.getShp().equals(rule.first.getShp())) {
stored.add(value);
}
if (stored.isEmpty()) {
state.remove(ruleName);
System.out.println("hell?");
} else {
state.put(ruleName, stored);
}
}
}
/**
* 注意到这个方法就干了一件事,也就是把广播流中的数据全部塞到了broadcast map state状态中去了,而不是将其输出了
* 这样做是为了在processElement中获取Rule流中的规则
* @param rule
* @param context
* @param collector
* @throws Exception
*/
@Override
public void processBroadcastElement(Rule rule, Context context, Collector
collector) throws Exception {
context.getBroadcastState(ruleStateDescriptor).put(rule.name, rule);
}
});
resStream.print();
env.execute();
}
}
/**
 * Element of the non-broadcast stream: a {@link Color} paired with a {@link Shape}.
 *
 * <p>Both fields are {@code null} until the corresponding setter has been called.
 */
class Item {
    Color color;
    Shape shape;

    /** @return the color of this item, or {@code null} if none was set */
    public Color getColor() {
        return color;
    }

    /** @param color the color to assign to this item */
    public void setColor(Color color) {
        this.color = color;
    }

    /** @return the shape of this item, or {@code null} if none was set */
    public Shape getShape() {
        return shape;
    }

    /** @param shape the shape to assign to this item */
    public void setShape(Shape shape) {
        this.shape = shape;
    }

    /**
     * Renders the item as {@code Item{color=<col>, shape=<shp>}}.
     *
     * <p>An unset color or shape is rendered as {@code null} instead of throwing
     * a {@link NullPointerException}, so the item can always be logged or printed.
     */
    @Override
    public String toString() {
        final String col = (color == null) ? null : color.getCol();
        final String shp = (shape == null) ? null : shape.getShp();
        return "Item{" +
                "color=" + col +
                ", shape=" + shp +
                '}';
    }
}
class Color {
String col;
public String getCol() {
return col;
}
public void setCol(String col) {
this.col = col;
}
@Override
public String toString() {
return "Color{" +
"col='" + col + '\'' +
'}';
}
}
class Shape {
String shp;
public Shape(String shp) {
this.shp = shp;
}
public String getShp() {
return shp;
}
public void setShp(String shp) {
this.shp = shp;
}
@Override
public String toString() {
return "Shape{" +
"shp='" + shp + '\'' +
'}';
}
}
/**
 * A named rule made of two shapes: the {@code first} shape and the {@code second} shape.
 *
 * <p>Plain mutable bean; all fields are {@code null} until set.
 */
class Rule {
    /** Name of the rule. */
    String name;
    /** First shape of the pair. */
    Shape first;
    /** Second shape of the pair. */
    Shape second;

    // ---- name ----

    public void setName(final String name) {
        this.name = name;
    }

    public String getName() {
        return this.name;
    }

    // ---- first shape ----

    public void setFirst(final Shape first) {
        this.first = first;
    }

    public Shape getFirst() {
        return this.first;
    }

    // ---- second shape ----

    public void setSecond(final Shape second) {
        this.second = second;
    }

    public Shape getSecond() {
        return this.second;
    }
}
class ItemsSource implements SourceFunction- {
boolean flag = true;
@Override
public void run(SourceContext
- sourceContext) throws Exception {
while (flag) {
String[] colors = new String[]{"blue", "yellow", "gray", "black", "red", "orange", "green", "white", "gold"};
String[] shapes = new String[]{"triangle", "rectangle", "circle", "unknown"};
Random random = new Random();
int colorIndex = random.nextInt(8);
int shapeIndex = random.nextInt(4);
Item item = new Item();
Shape shape = new Shape(shapes[shapeIndex]);
Color color = new Color();
color.setCol(colors[colorIndex]);
item.setColor(color);
item.setShape(shape);
sourceContext.collect(item);
}
}
@Override
public void cancel() {
flag = false;
}
}
/**
 * Source that keeps emitting rules with a random UUID name and two randomly
 * chosen shapes until {@link #cancel()} is called.
 */
class RuleSource implements SourceFunction<Rule> {
    /** Candidate shape names for both sides of a rule. */
    private static final String[] SHAPES = {"unknown", "circle", "rectangle", "triangle"};

    // volatile so that the write in cancel() is visible to the loop in run();
    // NOTE(review): cancel() is expected to be called from a different thread than run()
    volatile boolean flag = true;

    /**
     * Emits random rules in a loop while the source has not been cancelled.
     *
     * @param sourceContext context used to emit the generated rules
     */
    @Override
    public void run(SourceContext<Rule> sourceContext) throws Exception {
        final Random random = new Random();
        while (flag) {
            final int index1 = random.nextInt(SHAPES.length);
            final int index2 = random.nextInt(SHAPES.length);

            final Rule rule = new Rule();
            // every rule gets a fresh, unique name
            rule.setName(UUID.randomUUID().toString());
            rule.setFirst(new Shape(SHAPES[index1]));
            rule.setSecond(new Shape(SHAPES[index2]));
            sourceContext.collect(rule);
        }
    }

    /** Makes the loop in {@link #run} terminate. */
    @Override
    public void cancel() {
        flag = false;
    }
}
package it.kenn.state;
import org.apache.flink.api.common.state.MapState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.typeinfo.BasicTypeInfo;
import org.apache.flink.api.common.typeinfo.TypeHint;
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.typeutils.ListTypeInfo;
import org.apache.flink.streaming.api.datastream.BroadcastStream;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.KeyedBroadcastProcessFunction;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.util.Collector;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Random;
/**
* 广播流与广播状态
*/
public class BroadcastStateDemo {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
DataStreamSource- itemStream = env.addSource(new ItemsSource());
KeyedStream
- colorPartitionedStream = itemStream.keyBy(new KeySelector
- () {
@Override
public Color getKey(Item item) throws Exception {
return item.getColor();
}
});
DataStreamSource
ruleStream = env.addSource(new RuleSource());
MapStateDescriptor ruleStateDescriptor = new MapStateDescriptor<>(
"RulesBroadcastState",
BasicTypeInfo.STRING_TYPE_INFO,
TypeInformation.of(new TypeHint() {
}));
BroadcastStream ruleBroadcastStream = ruleStream
.broadcast(ruleStateDescriptor);
SingleOutputStreamOperator resStream = colorPartitionedStream.connect(ruleBroadcastStream)
//四个泛型分别是key,in1,in2,out,可以看源码能看出来
.process(new KeyedBroadcastProcessFunction() {
// store partial matches, i.e. first elements of the pair waiting for their second element
// we keep a list as we may have many first elements waiting
private final MapStateDescriptor> mapStateDesc =
new MapStateDescriptor<>(
"items",
BasicTypeInfo.STRING_TYPE_INFO,
new ListTypeInfo<>(Item.class));
// identical to our ruleStateDescriptor above
private final MapStateDescriptor ruleStateDescriptor =
new MapStateDescriptor<>(
"RulesBroadcastState",
BasicTypeInfo.STRING_TYPE_INFO,
TypeInformation.of(new TypeHint() {}));
@Override
public void processElement(Item value, ReadOnlyContext ctx, Collector out) throws Exception {
final MapState> state = getRuntimeContext().getMapState(mapStateDesc);
final Shape shape = value.getShape();
for (Map.Entry entry : ctx.getBroadcastState(ruleStateDescriptor).immutableEntries()) {
final String ruleName = entry.getKey();
final Rule rule = entry.getValue();
List- stored = state.get(ruleName);
if (stored == null) {
stored = new ArrayList<>();
}
System.out.println("shape: "+shape.getShp() + " 2: "+rule.getSecond().getShp() + " "+stored.isEmpty());
if (shape == rule.getSecond() && !stored.isEmpty()) {
for (Item i : stored) {
out.collect("MATCH: " + i + " - " + value);
}
stored.clear();
}
// there is no else{} to cover if rule.first == rule.second
if (shape.equals(rule.first)) {
stored.add(value);
}
if (stored.isEmpty()) {
state.remove(ruleName);
} else {
state.put(ruleName, stored);
}
}
}
@Override
public void processBroadcastElement(Rule rule, Context context, Collector
collector) throws Exception {
context.getBroadcastState(ruleStateDescriptor).put(rule.name, rule);
}
});
resStream.print();
env.execute();
}
}
上面代码还有空指针异常的错误,暂时没有找到原因,领会精神就好。
上面代码虽然有点问题(有点晚了,改天再搞一下),但是不妨碍我们总结一下它的套路:
顺带一提,processElement和processBroadcastElement两个函数中的上下文对象可以进行如下操作。下面五个操作都挺重要的
- 访问广播状态:ctx.getBroadcastState(MapStateDescriptor<K, V> stateDescriptor)
- 获取当前元素的时间戳:ctx.timestamp()
- 获取当前水位线:ctx.currentWatermark()
- 获取当前处理时间:ctx.currentProcessingTime()
- 向旁路输出(side output)发送数据:ctx.output(OutputTag<X> outputTag, X value)