package operator;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.MapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.common.restartstrategy.RestartStrategies;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.*;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.*;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.source.SourceFunction;
import org.apache.flink.streaming.api.functions.timestamps.BoundedOutOfOrdernessTimestampExtractor;
import org.apache.flink.streaming.api.functions.windowing.WindowFunction;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.Window;
import org.apache.flink.util.Collector;
import org.omg.PortableInterceptor.INACTIVE;
import org.apache.flink.api.java.*;
import java.text.SimpleDateFormat;
import java.util.*;
import java.sql.Timestamp;
import org.apache.flink.streaming.connectors.wikiedits.WikipediaEditEvent;
import org.apache.flink.streaming.connectors.wikiedits.WikipediaEditsSource;
import org.apache.flink.api.common.state.MapState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.java.functions.KeySelector;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.runtime.state.StateBackend;
import org.apache.flink.runtime.state.filesystem.FsStateBackend;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.source.RichParallelSourceFunction;
import org.apache.flink.util.Collector;
// Tuple4
public class AggregateFunction_2 {
public static void main(String[] args)throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.setParallelism(1);
DataStream> dataStream = env.addSource(new SourceFunction>() {
boolean runing = true;
@Override
public void run(SourceContext> ctx) throws Exception {
//ctx.collect(new Tuple3("user" , 2, new Timestamp(new Date().getTime())));
int i = 1;
while (runing) {
Tuple3 t3;
Thread.sleep(1000);
if (i % 2 == 1) { //判断
t3 = new Tuple3("user" + 1, 1, new Timestamp(new Date().getTime()));
} else {
t3 = new Tuple3("user" + i, i, new Timestamp(new Date().getTime()));
}
//System.out.println("=======");
//System.out.println(t3);
i = i + 1;
ctx.collect(t3);
/* 返回
user1 1
user2 2
user1 1
user4 4
user1 1
user6 6
*/
}
}
@Override
public void cancel() {
runing = false;
}
});
/* DataStream dataStream = env.fromElements(
Tuple3.of("1",333,new Timestamp(new Date().getTime())),
Tuple3.of("2", 111,new Timestamp(new Date().getTime())),
Tuple3.of("1",222,new Timestamp(new Date().getTime())),
Tuple3.of("2",444,new Timestamp(new Date().getTime())),
Tuple3.of("9",444,new Timestamp(new Date().getTime())),
Tuple3.of("6", 555,new Timestamp(new Date().getTime())),
Tuple3.of("1", 555,new Timestamp(new Date().getTime()))
)
;
*/
//dataStream.print();
// 输入类型IN 累加器类型ACC 输出 out
DataStream data_aggregate =dataStream
// .timeWindowAll(Time.seconds(2))\
.keyBy(0) //分组
.countWindow(2) //2个
// .sum(1);
.aggregate(new AggregateFunction, Tuple3, Tuple3>() {
@Override
// 初始化列累加器 .创建一个新的累加器,启动一个新的聚合,负责迭代状态的初始化
//来一条数据.相应组内只有一条数据时候执行一次
//如果原先有一条,那么新进来一条时候,就不执行了。直接执行add getresult
//累加器有点像是中间传递的东西
//user1+user1 通过累加器就是 acc_1=acc(初始化)+第一个user, acc=acc_1+第一个user1
//相加的结果都保留在累加器中。相当于一个寄存的地方
public Tuple3 createAccumulator() {
System.out.println("------createAccumulator--------"+new Timestamp(new Date().getTime()));
return new Tuple3<>("",0,new Timestamp(new Date().getTime()));
}
//累加器的累加方法 来一条数据执行一次 对于数据的每条数据,和迭代数据的聚合的具体实现
@Override
public Tuple3 add(Tuple3 value, Tuple3 accumulator) {
System.out.println("------add--------"+value);
accumulator.f0=value.f0; //类加器的第一个值等于第一个数的fo
accumulator.f1+=value.f1; //第二个值累加
return accumulator;
}
// 返回值 在窗口内满足2个,计算结束的时候执行一次 从累加器获取聚合的结果
@Override
public Tuple3 getResult(Tuple3 accumulator) {
System.out.println("------getResult--------"+accumulator);
return accumulator;
}
//合并两个累加器,返回一个具有合并状态的累加器 一般不触发这个
@Override
public Tuple3 merge(Tuple3 a, Tuple3 b) {
System.out.println("------merge--------"+a);
return null;
}
}
);
data_aggregate.print();
env.execute("execute");
}
}
/*
Sample output (annotations translated to English):
------createAccumulator--------2020-10-20 20:52:43.177
------add--------(user1,1,2020-10-20 20:52:43.095)   -- user1 arrives; its group holds only this element: createAccumulator --> add (add folds it into the freshly initialized accumulator)
------createAccumulator--------2020-10-20 20:52:44.179
------add--------(user2,2,2020-10-20 20:52:44.103)   -- user2 arrives; its group holds only this element: createAccumulator --> add
------add--------(user1,1,2020-10-20 20:52:45.103)   -- another user1; the group now holds two elements, satisfying the count window: add --> getResult. Because AggregateFunction aggregates incrementally, the window state is cleared afterwards
------getResult--------(user1,2,2020-10-20 20:52:43.178)
(user1,2,2020-10-20 20:52:43.178)
------createAccumulator--------2020-10-20 20:52:46.189 -- user4 arrives; its group holds only this element: createAccumulator --> add
------add--------(user4,4,2020-10-20 20:52:46.103)
------createAccumulator--------2020-10-20 20:52:47.195 -- NOTE: the two earlier user1 elements were already emitted and their window state cleared, so this user1 starts a brand-new window
------add--------(user1,1,2020-10-20 20:52:47.104)
------createAccumulator--------2020-10-20 20:52:48.2   -- user6 arrives; its group holds only this element: createAccumulator --> add
------add--------(user6,6,2020-10-20 20:52:48.104)
------add--------(user1,1,2020-10-20 20:52:49.104)   -- another user1; two elements in the window: add --> getResult, then the window state is cleared again
------getResult--------(user1,2,2020-10-20 20:52:47.195)
(user1,2,2020-10-20 20:52:47.195)
------createAccumulator--------2020-10-20 20:52:50.109
------add--------(user8,8,2020-10-20 20:52:50.104)
------createAccumulator--------2020-10-20 20:52:51.114
------add--------(user1,1,2020-10-20 20:52:51.105)
------createAccumulator--------2020-10-20 20:52:52.119
------add--------(user10,10,2020-10-20 20:52:52.105)
------add--------(user1,1,2020-10-20 20:52:53.106)
------getResult--------(user1,2,2020-10-20 20:52:51.114)
(user1,2,2020-10-20 20:52:51.114)
*/