package org.apache.flink.streaming.examples.wordcount;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.MultipleParameterTool;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.examples.wordcount.util.WordCountData;
import org.apache.flink.util.Collector;
import org.apache.flink.util.Preconditions;
public class WordCount {
// *************************************************************************
// A rough overview first. One mindset shift is required: like Spark, Flink executes nothing
// until execute() is called. Everything before that is effectively configuration, and execute() is the final "start".
// *************************************************************************
public static void main(String[] args) throws Exception {
// Checking input parameters. MultipleParameterTool is a Flink utility class; it is simple and handy, so we will not dwell on it.
final MultipleParameterTool params = MultipleParameterTool.fromArgs(args);
// Set up the execution environment: the global application context, comparable to SparkContext or a Spring Boot ApplicationContext.
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
// Make the parsed parameters available in the job configuration
env.getConfig().setGlobalJobParameters(params);
// Add a Source. A Flink pipeline consists of three building blocks: Source, Transform, and Sink.
DataStream<String> text = null;
if (params.has("input")) {
// union all the inputs from text files
for (String input : params.getMultiParameterRequired("input")) {
if (text == null) {
text = env.readTextFile(input);
} else {
text = text.union(env.readTextFile(input));
}
}
Preconditions.checkNotNull(text, "Input DataStream should not be null.");
} else {
System.out.println("Executing WordCount example with default input data set.");
System.out.println("Use --input to specify file input.");
// get default test text data
text = env.fromElements(WordCountData.WORDS);
}
// Add a Transform; this is straightforward and very similar to Spark
DataStream<Tuple2<String, Integer>> counts =
// split up the lines in pairs (2-tuples) containing: (word,1)
text.flatMap(new Tokenizer())
// group by the tuple field "0" and sum up tuple field "1"
.keyBy(value -> value.f0)
.sum(1);
// Add a Sink
if (params.has("output")) {
counts.writeAsText(params.get("output"));
} else {
System.out.println("Printing result to stdout. Use --output to specify output path.");
counts.print();
}
// Print the execution plan before executing: once execute() builds the StreamGraph,
// the registered transformations may be cleared and the plan can no longer be printed.
System.out.println(env.getExecutionPlan());
// execute program; the argument is the Flink job name, shown in the Web UI
env.execute("Streaming WordCount");
}
// *************************************************************************
// USER FUNCTIONS: Flink lets users define functions by implementing the function interfaces as classes
// *************************************************************************
public static final class Tokenizer implements FlatMapFunction<String, Tuple2<String, Integer>> {
@Override
public void flatMap(String value, Collector<Tuple2<String, Integer>> out) {
// normalize and split the line
String[] tokens = value.toLowerCase().split("\\W+");
// emit the pairs
for (String token : tokens) {
if (token.length() > 0) {
out.collect(new Tuple2<>(token, 1));
}
}
}
}
}
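Before diving into the sources, here is an illustrative sketch (not part of the original example, reusing the Tokenizer class above) that makes the "configuration first, start last" point concrete: the pipeline is fully declared, yet no data flows until env.execute() is called.
// Illustrative fragment; assumes the same imports as the WordCount class above.
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.fromElements("to be or not to be")
        .flatMap(new Tokenizer())        // Tokenizer as defined above
        .keyBy(value -> value.f0)
        .sum(1)
        .print();
// Nothing has run yet: this only prints the JSON plan of the registered transformations.
System.out.println(env.getExecutionPlan());
// Data starts flowing only here.
env.execute("Lazy execution demo");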
Take text = env.fromElements(WordCountData.WORDS) as an example.
The Javadoc says it creates a finite (bounded) data stream whose elements must all be of the same type, for example all Integer or all String; Flink tries to derive the type from the data.
@SafeVarargs
public final <OUT> DataStreamSource<OUT> fromElements(OUT... data) {
// Check that at least one element was passed; nothing more to it
if (data.length == 0) {
throw new IllegalArgumentException(
"fromElements needs at least one element as argument");
}
// Create the TypeInformation
TypeInformation<OUT> typeInfo;
try {
// Derive the type from the first element (a String in our case); the type can also be specified manually
typeInfo = TypeExtractor.getForObject(data[0]);
} catch (Exception e) {
throw new RuntimeException(
"Could not create TypeInformation for type "
+ data[0].getClass().getName()
+ "; please specify the TypeInformation manually via "
+ "StreamExecutionEnvironment#fromElements(Collection, TypeInformation)",
e);
}
// Delegates to fromCollection, passing the data and its type information
return fromCollection(Arrays.asList(data), typeInfo);
}
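The exception message above already hints at the escape hatch: if type extraction fails, or you simply want to be explicit, you can hand Flink the TypeInformation yourself. A minimal sketch, assuming the env from the example plus imports of java.util.Arrays and org.apache.flink.api.common.typeinfo.Types:
// Explicitly providing TypeInformation instead of relying on TypeExtractor.
DataStream<Tuple2<String, Integer>> pairs =
        env.fromCollection(
                Arrays.asList(new Tuple2<>("hello", 1), new Tuple2<>("flink", 1)),
                Types.TUPLE(Types.STRING, Types.INT));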
This method creates a data stream from the given non-empty collection.
public <OUT> DataStreamSource<OUT> fromCollection(Collection<OUT> data, TypeInformation<OUT> typeInfo) {
Preconditions.checkNotNull(data, "Collection must not be null"); // throw if the collection is null
// must not contain null elements or elements of mixed types
FromElementsFunction.checkCollection(data, typeInfo.getTypeClass());
// Create a SourceFunction: the first of Flink's three building blocks; without it nothing downstream matters
SourceFunction<OUT> function;
try {
// Construct the function; we will not go into its internals here, have a look if you are interested
function = new FromElementsFunction<>(typeInfo.createSerializer(getConfig()), data);
} catch (IOException e) {
throw new RuntimeException(e.getMessage(), e);
}
// The function is a SourceFunction; register it as a source. Note that
// a source created this way always has a parallelism of 1.
return addSource(function, "Collection Source", typeInfo, Boundedness.BOUNDED)
.setParallelism(1);
}
private <OUT> DataStreamSource<OUT> addSource(
final SourceFunction<OUT> function,
final String sourceName,
@Nullable final TypeInformation<OUT> typeInfo,
final Boundedness boundedness) {
checkNotNull(function);
checkNotNull(sourceName);
checkNotNull(boundedness);
// Resolve the output type information
TypeInformation<OUT> resolvedTypeInfo =
getTypeInfo(function, sourceName, SourceFunction.class, typeInfo);
// Determine whether this source can run in parallel
boolean isParallel = function instanceof ParallelSourceFunction;
// Clean the closure to strip unnecessary references; Spark has the same operation
clean(function);
// Wrap the function in a StreamSource operator
final StreamSource<OUT, ?> sourceOperator = new StreamSource<>(function);
// Create the DataStreamSource object; let's keep going
return new DataStreamSource<>(
this, resolvedTypeInfo, sourceOperator, isParallel, sourceName, boundedness);
}
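The isParallel check above is what decides whether the resulting source may run with a parallelism greater than 1. As a hedged sketch (a hypothetical user-defined source, not part of the WordCount example), a function implementing ParallelSourceFunction passes that check:
// imports: org.apache.flink.streaming.api.functions.source.ParallelSourceFunction,
//          org.apache.flink.streaming.api.functions.source.SourceFunction
// Hypothetical parallel source: because it implements ParallelSourceFunction,
// addSource(...) marks it as parallel, so a parallelism > 1 is allowed.
public static class NumberSource implements ParallelSourceFunction<Long> {
    private volatile boolean running = true;

    @Override
    public void run(SourceFunction.SourceContext<Long> ctx) throws Exception {
        long i = 0;
        while (running && i < 100) {
            ctx.collect(i++);
        }
    }

    @Override
    public void cancel() {
        running = false;
    }
}
// Usage (inside main): env.addSource(new NumberSource()).setParallelism(4);
// The FromElementsFunction used by fromElements/fromCollection is not parallel,
// which is why those sources stay pinned to parallelism 1.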
public DataStreamSource(
StreamExecutionEnvironment environment,
TypeInformation<T> outTypeInfo,
StreamSource<T, ?> operator,
boolean isParallel,
String sourceName,
Boundedness boundedness) {
super( // call the parent constructor, passing the env and a LegacySourceTransformation
// at this point our DataStreamSource is fully created, which concludes the Source part
// keep an eye on the transformation object, it is used later
environment,
new LegacySourceTransformation<>(
sourceName,
operator,
outTypeInfo,
environment.getParallelism(),
boundedness));
this.isParallel = isParallel;
if (!isParallel) {
setParallelism(1);
}
}
public <R> SingleOutputStreamOperator<R> flatMap(FlatMapFunction<T, R> flatMapper) {
// Extract the output type, same idea as before
TypeInformation<R> outType = TypeExtractor.getFlatMapReturnTypes(clean(flatMapper), getType(), Utils.getCallLocationName(), true);
// Delegate to the two-argument flatMap; let's go deeper
return flatMap(flatMapper, outType);
}
public <R> SingleOutputStreamOperator<R> flatMap(
FlatMapFunction<T, R> flatMapper, TypeInformation<R> outputType) {
// Delegate to transform(), which builds and returns the result stream
return transform("Flat Map", outputType, new StreamFlatMap<>(clean(flatMapper)));
}
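The TypeExtractor call above works effortlessly for a class like Tokenizer, but for a lambda the generic output type is erased. A small illustrative sketch (assuming text is the DataStream<String> from the example) of supplying the output type explicitly with the standard returns(...) hint:
// With a lambda, TypeExtractor cannot recover Tuple2<String, Integer> from the erased
// Collector parameter, so the output type is supplied via returns(...).
// needs import org.apache.flink.api.common.typeinfo.Types
DataStream<Tuple2<String, Integer>> tokens =
        text.flatMap((String line, Collector<Tuple2<String, Integer>> out) -> {
                    for (String word : line.toLowerCase().split("\\W+")) {
                        if (word.length() > 0) {
                            out.collect(new Tuple2<>(word, 1));
                        }
                    }
                })
                .returns(Types.TUPLE(Types.STRING, Types.INT));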
protected <R> SingleOutputStreamOperator<R> doTransform(
String operatorName,
TypeInformation<R> outTypeInfo,
StreamOperatorFactory<R> operatorFactory) {
// transform() itself is skipped here because it only forwards to doTransform()
// read the output type of the input Transform to coax out errors about MissingTypeInfo
transformation.getOutputType(); // forces type resolution of the upstream transformation
// create a transformation object describing this one-input transformation
OneInputTransformation<T, R> resultTransform =
new OneInputTransformation<>(
this.transformation,
operatorName,
operatorFactory,
outTypeInfo,
environment.getParallelism());
// Create the result stream
@SuppressWarnings({"unchecked", "rawtypes"})
SingleOutputStreamOperator<R> returnStream =
new SingleOutputStreamOperator(environment, resultTransform);
// Register the transformation with the environment, building up the operator chain
getExecutionEnvironment().addOperator(resultTransform);
// Return the stream
return returnStream;
}
One thing to note: not every operation registers a transformation with the environment. keyBy, the grouping operator, is one example; let's see what it does.
KeyedStream(
DataStream<T> stream,
PartitionTransformation<T> partitionTransformation,
KeySelector<T, KEY> keySelector,
TypeInformation<KEY> keyType) {
// You can see that keyBy is not registered as a transformation here; so why is that?
super(stream.getExecutionEnvironment(), partitionTransformation);
this.keySelector = clean(keySelector);
this.keyType = validateKeyType(keyType);
}
public SingleOutputStreamOperator<T> reduce(ReduceFunction<T> reducer) {
// The reason is that the keyBy partitioning is folded into the downstream ReduceTransformation;
// this is the simple chain optimization Flink is often said to do, a nice little tip.
// (Chaining can also be broken by forcing a different parallelism, though that rarely helps.)
// This concludes the Transform part.
ReduceTransformation<T, KEY> reduce =
new ReduceTransformation<>(
"Keyed Reduce",
environment.getParallelism(),
transformation,
clean(reducer),
keySelector,
getKeyType());
getExecutionEnvironment().addOperator(reduce);
return new SingleOutputStreamOperator<>(getExecutionEnvironment(), reduce);
}
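To make this concrete, the keyBy + sum(1) from WordCount could equally be written with an explicit reduce; the snippet below is only an illustrative equivalent, assuming text and Tokenizer from the example above. The hash partitioning from keyBy travels inside the resulting ReduceTransformation rather than being a separate registered operator.
// Equivalent of .keyBy(value -> value.f0).sum(1) expressed as a reduce
DataStream<Tuple2<String, Integer>> wordCounts =
        text.flatMap(new Tokenizer())
                .keyBy(value -> value.f0)
                .reduce((a, b) -> new Tuple2<>(a.f0, a.f1 + b.f1));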
@PublicEvolving
public DataStreamSink<T> print() {
// As you can see, this is not very different from the Source side.
// Let's dig in a bit.
PrintSinkFunction<T> printFunction = new PrintSinkFunction<>();
return addSink(printFunction).name("Print to Std. Out");
}
public DataStreamSink<T> addSink(SinkFunction<T> sinkFunction) {
// read the output type of the input Transform to coax out errors about MissingTypeInfo
transformation.getOutputType();
// configure the type if needed; we skip the details, they do not matter much here
if (sinkFunction instanceof InputTypeConfigurable) {
((InputTypeConfigurable) sinkFunction).setInputType(getType(), getExecutionConfig());
}
// Create the sink operator
StreamSink<T> sinkOperator = new StreamSink<>(clean(sinkFunction));
// Wrap the StreamSink into a DataStreamSink
DataStreamSink<T> sink = new DataStreamSink<>(this, sinkOperator);
// Register the sink transformation with the environment, so it becomes part of the graph that execute() later turns into the StreamGraph/JobGraph
getExecutionEnvironment().addOperator(sink.getTransformation());
return sink;
}
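print() is just one prepackaged SinkFunction; user code can take the same addSink path. A minimal hedged sketch (an anonymous sink invented for illustration, assuming counts is the DataStream<Tuple2<String, Integer>> from WordCount and org.apache.flink.streaming.api.functions.sink.SinkFunction is imported):
// A custom sink registered through the same addSink(...) path as print().
counts.addSink(new SinkFunction<Tuple2<String, Integer>>() {
    @Override
    public void invoke(Tuple2<String, Integer> value, Context context) {
        // Runs only after env.execute(); until then this is just another transformation in the graph.
        System.err.println("word=" + value.f0 + " count=" + value.f1);
    }
});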
To be honest, the past stretch was pretty dark for me and I was drifting; only recently have I been able to settle down, read source code carefully, and study properly. The flow above is how Flink builds the
Source
Transformation
Sink
chain. Notice that nothing has been executed and no data flows yet, so you can regard these pieces as plug-ins, essentially a configuration file; Spark works the same way. Once you have learned Flink, take the time to summarize and then start digging into the internals. Honestly, having explored maybe ten percent of Spark before giving up, and by now having completely forgotten how to write Scala, taught me that I need to study and review every day. Technology is like rowing upstream: if you do not push forward, you drift back. So keep studying hard.