Flink源码分析系列文档目录
请点击:Flink 源码分析系列文档目录
Map和Filter
DataStream的map方法
/**
 * Applies {@code mapper} to the elements of this stream.
 *
 * @param mapper user function that is wrapped in a {@code StreamMap} operator
 * @param <R> element type of the resulting stream
 * @return the stream returned by {@code transform} for the "Map" operator
 */
public <R> SingleOutputStreamOperator<R> map(MapFunction<T, R> mapper) {
    // Resolve the output type from the cleaned function and this stream's element type.
    TypeInformation<R> outType = TypeExtractor.getMapReturnTypes(clean(mapper), getType(),
            Utils.getCallLocationName(), true);
    // Wrap the function in a StreamMap operator and hand it to transform().
    return transform("Map", outType, new StreamMap<>(clean(mapper)));
}
StreamMap.class
@Internal
public class StreamMap
extends AbstractUdfStreamOperator>
implements OneInputStreamOperator {
private static final long serialVersionUID = 1L;
public StreamMap(MapFunction mapper) {
super(mapper);
chainingStrategy = ChainingStrategy.ALWAYS;
}
@Override
public void processElement(StreamRecord element) throws Exception {
// 此处userFunction为mapper function。执行mapper方法,使用执行结果替换原先的元素
output.collect(element.replace(userFunction.map(element.getValue())));
}
}
filter方法
/**
 * Applies {@code filter} to this stream.
 *
 * <p>The output type is this stream's own type ({@code getType()}), since
 * filtering does not change the element type.
 *
 * @param filter user predicate that is wrapped in a {@code StreamFilter} operator
 * @return the stream returned by {@code transform} for the "Filter" operator
 */
public SingleOutputStreamOperator<T> filter(FilterFunction<T> filter) {
    return transform("Filter", getType(), new StreamFilter<>(clean(filter)));
}
StreamFilter.class
@Internal
public class StreamFilter extends AbstractUdfStreamOperator> implements OneInputStreamOperator {
private static final long serialVersionUID = 1L;
public StreamFilter(FilterFunction filterFunction) {
super(filterFunction);
chainingStrategy = ChainingStrategy.ALWAYS;
}
@Override
public void processElement(StreamRecord element) throws Exception {
// 如果filter方法执行结果为true,则collect该元素,否则忽略该元素。
if (userFunction.filter(element.getValue())) {
output.collect(element);
}
}
}
transform方法
/**
 * Builds a {@code OneInputTransformation} for {@code operator}, with this stream's
 * transformation as its input, and registers it with the execution environment.
 *
 * @param operatorName name given to the new transformation
 * @param outTypeInfo type information of the produced elements
 * @param operator the operator the transformation will run
 * @param <R> element type of the resulting stream
 * @return a stream wrapping the newly created transformation
 */
public <R> SingleOutputStreamOperator<R> transform(
        String operatorName,
        TypeInformation<R> outTypeInfo,
        OneInputStreamOperator<T, R> operator) {

    // read the output type of the input Transform to coax out errors about MissingTypeInfo
    transformation.getOutputType();

    // Create the OneInputTransformation; its input is this stream's transformation (the upstream).
    OneInputTransformation<T, R> resultTransform = new OneInputTransformation<>(
            this.transformation,
            operatorName,
            operator,
            outTypeInfo,
            environment.getParallelism());

    @SuppressWarnings({ "unchecked", "rawtypes" })
    SingleOutputStreamOperator<R> returnStream = new SingleOutputStreamOperator(environment, resultTransform);

    getExecutionEnvironment().addOperator(resultTransform);

    return returnStream;
}
我们这里关注OneInputTransformation两个成员变量
- input 作为该transformation数据输入的transformation。即上游的transformation。
- operator 此处transformation需要进行的数据转换操作。
多个级联的map和filter操作会被transform成为一连串的OneInputTransformation。后一个transformation的input指向前一个transformation
Split
Split算子已被废弃,建议使用sideOutput
/**
 * Wraps this stream and the cleaned {@code outputSelector} in a {@code SplitStream}.
 *
 * @param outputSelector selector handed to the {@code SplitStream}
 * @return the resulting {@code SplitStream}
 */
public SplitStream<T> split(OutputSelector<T> outputSelector) {
    return new SplitStream<>(this, clean(outputSelector));
}
SplitStream.java
/**
 * Selects the outputs with the given names; delegates to {@code selectOutput}.
 *
 * @param outputNames names of the outputs to select
 * @return the stream built by {@code selectOutput}
 */
public DataStream<OUT> select(String... outputNames) {
    return selectOutput(outputNames);
}
/**
 * Validates the output names and builds a stream backed by a {@code SelectTransformation}.
 *
 * @param outputNames names of the outputs to select; no entry may be {@code null}
 * @return a new stream over the select transformation
 * @throws RuntimeException if any name is {@code null}
 */
private DataStream<OUT> selectOutput(String[] outputNames) {
    for (String outName : outputNames) {
        if (outName == null) {
            throw new RuntimeException("Selected names must not be null");
        }
    }

    SelectTransformation<OUT> selectTransform =
            new SelectTransformation<OUT>(this.getTransformation(), Lists.newArrayList(outputNames));
    return new DataStream<OUT>(this.getExecutionEnvironment(), selectTransform);
}
Union
/**
 * Merges this stream with the given streams of the same type.
 *
 * @param streams the streams to union with this one
 * @return a new stream backed by a {@code UnionTransformation} over all inputs
 * @throws IllegalArgumentException if any stream's type differs from this stream's type
 */
@SafeVarargs
public final DataStream<T> union(DataStream<T>... streams) {
    // NOTE(review): element type assumed to be StreamTransformation<T>, the pre-1.9 name;
    // later Flink releases call it Transformation<T>. Confirm against the version described.
    List<StreamTransformation<T>> unionedTransforms = new ArrayList<>();
    unionedTransforms.add(this.transformation);

    // Verify that every stream being unioned has the same type as this one.
    for (DataStream<T> newStream : streams) {
        if (!getType().equals(newStream.getType())) {
            throw new IllegalArgumentException("Cannot union streams of different types: "
                    + getType() + " and " + newStream.getType());
        }

        unionedTransforms.add(newStream.getTransformation());
    }
    return new DataStream<>(this.environment, new UnionTransformation<>(unionedTransforms));
}
KeyBy
/**
 * Creates a {@code KeyedStream} from this stream using the given key selector.
 *
 * @param key selector that extracts the key from an element; must not be {@code null}
 * @param <K> type of the key
 * @return the keyed stream
 */
public <K> KeyedStream<T, K> keyBy(KeySelector<T, K> key) {
    Preconditions.checkNotNull(key);
    return new KeyedStream<>(this, clean(key));
}
KeyedStream
/**
 * Creates a keyed stream, obtaining the key type from
 * {@code TypeExtractor.getKeySelectorTypes} and delegating to the three-argument constructor.
 *
 * @param dataStream the stream to key
 * @param keySelector selector that extracts the key from an element
 */
public KeyedStream(DataStream<T> dataStream, KeySelector<T, KEY> keySelector) {
    this(dataStream, keySelector, TypeExtractor.getKeySelectorTypes(keySelector, dataStream.getType()));
}
/**
 * Creates a keyed stream with an explicitly supplied key type.
 *
 * <p>Builds a {@code PartitionTransformation} whose input is the stream's transformation
 * and whose partitioner is a {@code KeyGroupStreamPartitioner} for {@code keySelector}.
 *
 * @param dataStream the stream to key
 * @param keySelector selector that extracts the key from an element
 * @param keyType type information of the key
 */
public KeyedStream(DataStream<T> dataStream, KeySelector<T, KEY> keySelector, TypeInformation<KEY> keyType) {
    this(
        dataStream,
        new PartitionTransformation<>(
            dataStream.getTransformation(),
            new KeyGroupStreamPartitioner<>(keySelector, StreamGraphGenerator.DEFAULT_LOWER_BOUND_MAX_PARALLELISM)),
        keySelector,
        keyType);
}
/**
 * Package-private constructor that the public constructors delegate to.
 *
 * @param stream the stream whose execution environment is reused
 * @param partitionTransformation transformation passed to the super constructor
 * @param keySelector selector that extracts the key; stored after {@code clean}
 * @param keyType type information of the key; stored after {@code validateKeyType}
 */
KeyedStream(
    DataStream<T> stream,
    PartitionTransformation<T> partitionTransformation,
    KeySelector<T, KEY> keySelector,
    TypeInformation<KEY> keyType) {

    super(stream.getExecutionEnvironment(), partitionTransformation);
    this.keySelector = clean(keySelector);
    this.keyType = validateKeyType(keyType);
}
PartitionTransformation 分区变换。该变换在生成StreamGraph的时候会被处理为VirtualPartitionNode
包含了上游input和StreamPartitioner
此处的StreamPartitioner传入的是KeyGroupStreamPartitioner
KeyGroupStreamPartitioner
通过selectChannels方法来决定元素所属的分区
public int[] selectChannels(SerializationDelegate> record) {
K key;
try {
key = keySelector.getKey(record.getInstance().getValue());
} catch (Exception e) {
throw new RuntimeException("Could not extract key from " + record.getInstance().getValue(), e);
}
returnArray[0] = KeyGroupRangeAssignment.assignKeyToParallelOperator(key, maxParallelism, numberOfChannels);
return returnArray;
}
KeyGroupRangeAssignment的assignKeyToParallelOperator方法
/**
 * Maps a key to the index of the parallel operator instance responsible for it.
 *
 * @param key the key to assign
 * @param maxParallelism the maximum parallelism, passed to both helper computations
 * @param parallelism the current parallelism
 * @return the operator index computed from the key's key group
 */
public static int assignKeyToParallelOperator(Object key, int maxParallelism, int parallelism) {
    // Step 1: key -> key group.
    final int keyGroup = assignToKeyGroup(key, maxParallelism);
    // Step 2: key group -> operator index.
    return computeOperatorIndexForKeyGroup(maxParallelism, parallelism, keyGroup);
}
computeOperatorIndexForKeyGroup方法
/**
 * Computes the operator index that owns a key group.
 *
 * <p>The result is {@code keyGroupId * parallelism / maxParallelism} using integer
 * arithmetic, i.e. the product is formed first and then divided (truncating).
 *
 * @param maxParallelism the maximum parallelism (divisor)
 * @param parallelism the current parallelism
 * @param keyGroupId the id of the key group
 * @return the operator index for the key group
 */
public static int computeOperatorIndexForKeyGroup(int maxParallelism, int parallelism, int keyGroupId) {
    final int scaled = keyGroupId * parallelism;
    return scaled / maxParallelism;
}
assignToKeyGroup方法
/**
 * Assigns a key to a key group based on the key's {@code hashCode()}.
 *
 * @param key the key to assign; its {@code hashCode()} is the hashing input
 * @param maxParallelism the maximum parallelism, passed to {@code computeKeyGroupForKeyHash}
 * @return the key group computed for the key's hash
 */
public static int assignToKeyGroup(Object key, int maxParallelism) {
    final int keyHash = key.hashCode();
    return computeKeyGroupForKeyHash(keyHash, maxParallelism);
}
computeKeyGroupForKeyHash方法
/**
 * Computes the key group for a key hash.
 *
 * @param keyHash the hash code of the key
 * @param maxParallelism the modulus applied to the re-hashed value
 * @return {@code MathUtils.murmurHash(keyHash)} modulo {@code maxParallelism}
 */
public static int computeKeyGroupForKeyHash(int keyHash, int maxParallelism) {
    // NOTE(review): presumably murmurHash returns a non-negative value, so the
    // remainder is a valid group index — confirm against MathUtils.
    final int rehashed = MathUtils.murmurHash(keyHash);
    return rehashed % maxParallelism;
}
Rebalance
Rebalance的执行流程和keyBy相同,只不过使用的是RebalancePartitioner
RebalancePartitioner的setUp和selectChannels方法
setup时随机选择一个起始channel,之后selectChannels以轮询(round-robin)的方式把元素依次分配到各个channel
/**
 * Initializes the partitioner for the given number of channels.
 *
 * <p>After the superclass setup, slot 0 of {@code returnArray} is seeded with a random
 * channel index in {@code [0, numberOfChannels)}; {@code selectChannels} advances from there.
 *
 * @param numberOfChannels the number of output channels
 */
public void setup(int numberOfChannels) {
    super.setup(numberOfChannels);

    final int startChannel = ThreadLocalRandom.current().nextInt(numberOfChannels);
    returnArray[0] = startChannel;
}
public int[] selectChannels(SerializationDelegate> record) {
returnArray[0] = (returnArray[0] + 1) % numberOfChannels;
return returnArray;
}
Cogroup
DataStream的coGroup方法如下
/**
 * Starts a co-group operation between this stream and {@code otherStream}.
 *
 * @param otherStream the second input of the co-group
 * @param <T2> element type of the other stream
 * @return a {@code CoGroupedStreams} holding both inputs
 */
public <T2> CoGroupedStreams<T, T2> coGroup(DataStream<T2> otherStream) {
    return new CoGroupedStreams<>(this, otherStream);
}
方法返回了CoGroupedStreams,其构造方法如下
/**
 * Stores the two input streams of the co-group.
 *
 * @param input1 the first input; must not be {@code null}
 * @param input2 the second input; must not be {@code null}
 */
public CoGroupedStreams(DataStream<T1> input1, DataStream<T2> input2) {
    this.input1 = requireNonNull(input1);
    this.input2 = requireNonNull(input2);
}
CoGroupedStreams包含了两个stream。
where方法设置了keySelector1,equalTo方法设置了keySelector2。
重点是apply方法
public DataStream apply(CoGroupFunction function, TypeInformation resultType) {
//clean the closure
function = input1.getExecutionEnvironment().clean(function);
UnionTypeInfo unionType = new UnionTypeInfo<>(input1.getType(), input2.getType());
UnionKeySelector unionKeySelector = new UnionKeySelector<>(keySelector1, keySelector2);
DataStream> taggedInput1 = input1
// 生成TaggedUnion,输入值map到one变量
.map(new Input1Tagger())
.setParallelism(input1.getParallelism())
.returns(unionType);
DataStream> taggedInput2 = input2
// 生成TaggedUnion,输入值map到two变量
.map(new Input2Tagger())
.setParallelism(input2.getParallelism())
.returns(unionType);
// 两个stream合并
DataStream> unionStream = taggedInput1.union(taggedInput2);
// stream里one或two值相同的TaggedUnion元素,会被分到同一个分区中
// 此处keyby问题为如果两个one值相同的话,也会被分入同一个分区中,也就是说同一个stream的元素会自己join
// we explicitly create the keyed stream to manually pass the key type information in
windowedStream =
new KeyedStream, KEY>(unionStream, unionKeySelector, keyType)
.window(windowAssigner);
if (trigger != null) {
windowedStream.trigger(trigger);
}
if (evictor != null) {
windowedStream.evictor(evictor);
}
if (allowedLateness != null) {
windowedStream.allowedLateness(allowedLateness);
}
return windowedStream.apply(new CoGroupWindowFunction(function), resultType);
}
UnionKeySelector的getKey方法
/**
 * Extracts the key from a tagged element using the selector that matches its tag.
 *
 * @param value the tagged element
 * @return the key from {@code keySelector1} when the element is tagged as "one",
 *         otherwise the key from {@code keySelector2}
 * @throws Exception if the underlying key selector throws
 */
public KEY getKey(TaggedUnion<T1, T2> value) throws Exception {
    if (value.isOne()) {
        return keySelector1.getKey(value.getOne());
    } else {
        return keySelector2.getKey(value.getTwo());
    }
}
如果TaggedUnion的one有值,则使用keySelector1从one中提取key,否则使用keySelector2从two中提取key。
public void apply(KEY key,
W window,
Iterable> values,
Collector out) throws Exception {
List oneValues = new ArrayList<>();
List twoValues = new ArrayList<>();
for (TaggedUnion val: values) {
if (val.isOne()) {
oneValues.add(val.getOne());
} else {
twoValues.add(val.getTwo());
}
}
wrappedFunction.coGroup(oneValues, twoValues, out);
}
CoGroupWindowFunction.apply
@Override
public void apply(KEY key,
W window,
Iterable> values,
Collector out) throws Exception {
List oneValues = new ArrayList<>();
List twoValues = new ArrayList<>();
for (TaggedUnion val: values) {
if (val.isOne()) {
oneValues.add(val.getOne());
} else {
twoValues.add(val.getTwo());
}
}
wrappedFunction.coGroup(oneValues, twoValues, out);
}
两组数据分别以集合形式提供
Join
/**
 * Starts a join operation between this stream and {@code otherStream}.
 *
 * @param otherStream the second input of the join
 * @param <T2> element type of the other stream
 * @return a {@code JoinedStreams} holding both inputs
 */
public <T2> JoinedStreams<T, T2> join(DataStream<T2> otherStream) {
    return new JoinedStreams<>(this, otherStream);
}
/**
 * Completes the join operation with the user function that is executed
 * for each combination of elements with the same key in a window.
 *
 * <p>The join is expressed as a co-group: both inputs are co-grouped with the configured
 * key selectors, window assigner, trigger, evictor and allowed lateness, and the user
 * function is wrapped in a {@code JoinCoGroupFunction}.
 *
 * <p>Note: This method's return type does not support setting an operator-specific
 * parallelism. Due to binary backwards compatibility, this cannot be altered. Use the
 * {@link #with(JoinFunction, TypeInformation)} method to set an operator-specific parallelism.
 *
 * @param function the user join function
 * @param resultType type information of the result elements
 * @param <T> type of the result elements
 * @return the stream produced by the underlying co-group
 */
public <T> DataStream<T> apply(JoinFunction<T1, T2, T> function, TypeInformation<T> resultType) {
    //clean the closure
    function = input1.getExecutionEnvironment().clean(function);

    coGroupedWindowedStream = input1.coGroup(input2)
        .where(keySelector1)
        .equalTo(keySelector2)
        .window(windowAssigner)
        .trigger(trigger)
        .evictor(evictor)
        .allowedLateness(allowedLateness);

    return coGroupedWindowedStream
            .apply(new JoinCoGroupFunction<>(function), resultType);
}
可见join内部使用cogroup实现的
JoinCoGroupFunction.coGroup
/**
 * Emits the result of the wrapped join function for every pair formed from one
 * element of {@code first} and one element of {@code second}.
 *
 * <p>If either side is empty, nothing is emitted.
 *
 * @param first the elements of the first input for this key and window
 * @param second the elements of the second input for this key and window
 * @param out collector receiving one joined result per pair
 * @throws Exception if the wrapped join function throws
 */
@Override
public void coGroup(Iterable<T1> first, Iterable<T2> second, Collector<T> out) throws Exception {
    for (T1 val1 : first) {
        for (T2 val2 : second) {
            out.collect(wrappedFunction.join(val1, val2));
        }
    }
}
两组数据中的元素两两组合(笛卡尔积),每一对元素调用一次用户的join方法;如果其中一组为空,则不会输出任何结果