Flink之处理函数

Flink之处理函数

  • 一、准备
    • 1.1、Event类
    • 1.2、EventWithWatermarkSource类
  • 二、处理函数分类
    • 2.1、ProcessFunction
      • 2.1.1、函数详解
      • 2.1.2、基本使用
    • 2.2、KeyedProcessFunction
      • 2.2.1、函数详解
      • 2.2.2、基本使用
    • 2.3、ProcessWindowFunction
      • 2.3.1、函数详解
      • 2.3.2、基本使用
    • 2.4、ProcessAllWindowFunction
      • 2.4.1、函数详解
      • 2.4.2、基本使用
    • 2.5、CoProcessFunction
      • 2.5.1、函数详解
      • 2.5.2、基本使用
    • 2.6、ProcessJoinFunction
      • 2.6.1、函数详解
      • 2.6.2、基本使用
    • 2.7、BroadcastProcessFunction
      • 2.7.1、函数详解
      • 2.7.2、基本使用
    • 2.8、KeyedBroadcastProcessFunction
      • 2.8.1、函数详解
      • 2.8.2、基本使用
  • 三、热门url案例
    • 3.1、需求
    • 3.2、实现代码
      • 3.2.1、Event类
      • 3.2.2、UrlViewCount类
      • 3.2.3、TopNExample类
    • 3.3、代码改进
      • 3.3.1、TopNExample类不足
      • 3.3.2、改进代码


一、准备

1.1、Event类

package com.hpsk.flink.beans;

import java.sql.Timestamp;

public class Event {
    public String user; // the user who generated the event
    public String url; // the URL that was visited
    public Long timestamp; // visit time, epoch milliseconds

    /** No-arg constructor required for Flink POJO serialization. */
    public Event() {

    }

    public Event(String user, String url, Long timestamp) {
        this.user = user;
        this.url = url;
        this.timestamp = timestamp;
    }

    @Override
    public String toString() {
        // Render the millisecond timestamp through java.sql.Timestamp for readability.
        StringBuilder sb = new StringBuilder("Event{");
        sb.append("user='").append(user).append('\'');
        sb.append(", url='").append(url).append('\'');
        sb.append(", timestamp=").append(new Timestamp(timestamp));
        sb.append('}');
        return sb.toString();
    }
}

1.2、EventWithWatermarkSource类

package com.hpsk.flink.source;

import com.hpsk.flink.beans.Event;
import org.apache.flink.streaming.api.functions.source.ParallelSourceFunction;
import org.apache.flink.streaming.api.watermark.Watermark;
import java.util.Calendar;
import java.util.Random;

public class EventWithWatermarkSource implements ParallelSourceFunction<Event> {
    // FIX: cancel() is invoked from a different thread than run(), so the stop
    // flag must be volatile for the write to be visible to the emit loop
    // (standard Flink SourceFunction pattern).
    private volatile boolean isRunning = true;
    String[] users = new String[]{"Alice", "Bob", "Mary", "Tom"};
    String[] urls = new String[]{"./home", "./cart", "./prod?id=1", "./prod?id=10"};

    /**
     * Emits one random Event per second. Timestamps and watermarks are assigned
     * directly in the source, so downstream operators need no WatermarkStrategy.
     */
    @Override
    public void run(SourceContext<Event> ctx) throws Exception {
        Random random = new Random();
        while (isRunning) {
            String user = users[random.nextInt(users.length)];
            String url = urls[random.nextInt(urls.length)];
            long currTs = Calendar.getInstance().getTimeInMillis(); // current epoch millis
            Event event = new Event(user, url, currTs);
            // Emit the record together with its event-time timestamp.
            ctx.collectWithTimestamp(event, currTs);
            // Advance the watermark to just below the record's timestamp.
            ctx.emitWatermark(new Watermark(event.timestamp - 1L));
            // Throttle to roughly one event per second.
            Thread.sleep(1000L);
        }
    }

    @Override
    public void cancel() {
        isRunning = false;
    }
}

二、处理函数分类

2.1、ProcessFunction

2.1.1、函数详解

// Excerpt of Flink's ProcessFunction: two type parameters — input type I, output type O.
// It extends AbstractRichFunction, so the open()/close() lifecycle methods are available.
public abstract class ProcessFunction<I, O> extends AbstractRichFunction {
	// Must be implemented; invoked once for every input record.
    public abstract void processElement(I value, Context ctx, Collector<O> out) throws Exception;
	// Timer callback. Timers may only be registered on a keyed stream, otherwise Flink throws:
	// "Setting timers is only supported on a keyed streams."
    public void onTimer(long timestamp, OnTimerContext ctx, Collector<O> out) throws Exception {}

    public abstract class Context {.....
    }
}

2.1.2、基本使用

package com.hpsk.flink.function;

import com.hpsk.flink.beans.Event;
import com.hpsk.flink.source.EventWithWatermarkSource;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;
import java.sql.Timestamp;

public class ProcessFunctionDS {
    /**
     * Demo of the generic ProcessFunction: events from user "Bob" are diverted
     * to a side output while all other events stay on the main stream.
     */
    public static void main(String[] args) throws Exception {
        // Streaming environment with a single parallel task.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        // Custom source that already assigns timestamps and watermarks.
        DataStream<Event> eventStream = env.addSource(new EventWithWatermarkSource());
        // Echo the raw input for comparison.
        eventStream.print("input ");
        // Tag identifying the side output carrying Bob's events.
        OutputTag<Event> bobOutput = new OutputTag<Event>("event"){};
        // Split the stream inside a ProcessFunction.
        SingleOutputStreamOperator<Event> mainStream = eventStream.process(new ProcessFunction<Event, Event>() {
            @Override
            public void processElement(Event value, Context ctx, Collector<Event> out) throws Exception {
                long arrivalTime = ctx.timerService().currentProcessingTime();
                // Timers need a keyed stream: "Setting timers is only supported on a keyed streams."
                // ctx.timerService().registerProcessingTimeTimer(arrivalTime + 10 * 1000L);
                if (value.user.equals("Bob")) {
                    ctx.output(bobOutput, value); // divert Bob to the side output
                } else {
                    out.collect(value); // everyone else stays on the main stream
                }
                System.out.println(new Timestamp(arrivalTime) + " 数据到达时间:" + value);
            }
        });
        // Print the main stream.
        mainStream.print("output ");
        // Print the side-output stream.
        mainStream.getSideOutput(bobOutput).print("outputTag ");
        // Launch the job.
        env.execute();
    }
}


2.2、KeyedProcessFunction

2.2.1、函数详解

// Excerpt of Flink's KeyedProcessFunction: three type parameters — key type K, input type I, output type O.
// It extends AbstractRichFunction, so the open()/close() lifecycle methods are available.
public abstract class KeyedProcessFunction<K, I, O> extends AbstractRichFunction {
    // Must be implemented; invoked once for every input record.
    public abstract void processElement(I value, Context ctx, Collector<O> out) throws Exception;
    // Timer callback; timers may only be registered after keyBy, otherwise an exception is thrown.
    public void onTimer(long timestamp, OnTimerContext ctx, Collector<O> out) throws Exception {}

    public abstract class Context {.....
    }
}

2.2.2、基本使用

package com.hpsk.flink.function;

import com.hpsk.flink.beans.Event;
import com.hpsk.flink.source.EventWithWatermarkSource;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;

public class KeyedProcessFunctionDS {
    // Demo of KeyedProcessFunction: prints each record's timestamp plus the current
    // watermark, and registers an event-time timer 10 s past each record.
    public static void main(String[] args) throws Exception {
        // 1. Create the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        // 2. Read from the custom source (it assigns timestamps and watermarks itself)
        DataStream<Event> inputDS = env.addSource(new EventWithWatermarkSource());

        // 3. Print the raw input
        inputDS.print("input ");
        // 4. Key the stream by user and apply a KeyedProcessFunction
        SingleOutputStreamOperator<String> result = inputDS
                .keyBy(t -> t.user)
                .process(new KeyedProcessFunction<String, Event, String>() {
                    @Override
                    public void processElement(Event value, Context ctx, Collector<String> out) throws Exception {
                        out.collect("数据到达,时间戳为:" + new Timestamp(ctx.timestamp()));
                        // The watermark shown here is the one in effect BEFORE this
                        // element was processed (watermarks trail the newest record).
                        out.collect(" 数据到达,水位线为: " + new Timestamp(ctx.timerService().currentWatermark()) + "\n -------分割线-------");
                        // Register an event-time timer 10 seconds after this record's timestamp
                        ctx.timerService().registerEventTimeTimer(ctx.timestamp() + 10 * 1000L);
                    }

                    @Override
                    public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
                        // Fires once the watermark passes the registered timer time
                        out.collect("定时器触发,触发时间:" + new Timestamp(timestamp));
                    }
                });
        // 5. Print the results
        result.print("output ");
        // 6. Run the job
        env.execute();
    }
}

2.3、ProcessWindowFunction

2.3.1、函数详解

// keyBy后作用于window之上,处理window数据
// ProcessWindowFunction需要4个参数:输入数据类型、输出数据类型、keyBy的key的类型、Window的子类
// 由于继承AbstractRichFunction可以实现open和close生命周期方法
// 也可以获取窗口的相关信息
// 还可以定义状态
public abstract class ProcessWindowFunction<IN, OUT, KEY, W extends Window>
        extends AbstractRichFunction {
    // 必须实现的方法,用来处理每条数据
    public abstract void process(
            KEY key, Context context, Iterable<IN> elements, Collector<OUT> out) throws Exception;

2.3.2、基本使用

package com.hpsk.flink.function;

import com.hpsk.flink.beans.Event;
import com.hpsk.flink.source.EventWithWatermarkSource;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
import java.util.HashSet;

public class ProcessWindowFunctionDS {
    /**
     * Demo of ProcessWindowFunction: for each user, counts the distinct pages
     * visited within 10-second event-time tumbling windows.
     */
    public static void main(String[] args) throws Exception {
        // Streaming environment with two parallel subtasks.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(2);
        // Custom source that already assigns timestamps and watermarks.
        DataStream<Event> events = env.addSource(new EventWithWatermarkSource());
        // Echo the raw input.
        events.print("input ");
        // Key by user, open tumbling windows, summarize with a ProcessWindowFunction.
        SingleOutputStreamOperator<String> summaries = events
                .keyBy(t -> t.user)
                .window(TumblingEventTimeWindows.of(Time.seconds(10)))
                .process(new ProcessWindowFunction<Event, String, String, TimeWindow>() {
                    @Override
                    public void process(String key, Context ctx, Iterable<Event> elements, Collector<String> out) throws Exception {
                        // Distinct URLs seen by this user in this window.
                        HashSet<String> pages = new HashSet<>();
                        elements.forEach(e -> pages.add(e.url));
                        Timestamp windowStart = new Timestamp(ctx.window().getStart());
                        Timestamp windowEnd = new Timestamp(ctx.window().getEnd());
                        out.collect("窗口:[ " + windowStart + "~" + windowEnd + " ] ->的用户" + key + "总计访问" + pages.size() + "个页面。");
                    }
                });
        // Print the per-window summaries.
        summaries.print("output ");
        // Launch the job.
        env.execute();
    }
}

2.4、ProcessAllWindowFunction

2.4.1、函数详解

// ProcessAllWindowFunction需要3个参数:输入数据类型、输出数据类型、Window的子类
// 由于继承AbstractRichFunction可以实现open和close生命周期方法
// 也可以获取窗口的相关信息
// 可以进行状态编程
public abstract class ProcessAllWindowFunction<IN, OUT, W extends Window>
        extends AbstractRichFunction {
    // 必须实现的方法,用来处理每条数据
    public abstract void process(Context context, Iterable<IN> elements, Collector<OUT> out)
            throws Exception;

2.4.2、基本使用

package com.hpsk.flink.function;

import com.hpsk.flink.beans.Event;
import com.hpsk.flink.source.EventWithWatermarkSource;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessAllWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.TumblingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
import java.sql.Timestamp;
import java.util.HashSet;

public class ProcessAllWindowFunctionDS {
    // Demo of ProcessAllWindowFunction: over the whole (non-keyed) stream, each
    // 10 s tumbling window reports its distinct-user and distinct-page counts.
    public static void main(String[] args) throws Exception {
        // 1. Create the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        // 2. Read from the custom source (it assigns timestamps and watermarks itself)
        DataStream<Event> inputDS = env.addSource(new EventWithWatermarkSource());
        // 3. Print the raw input
        inputDS.print("input ");
        // 4. Apply a ProcessAllWindowFunction over non-keyed tumbling windows
        SingleOutputStreamOperator<String> result = inputDS
                .windowAll(TumblingEventTimeWindows.of(Time.seconds(10)))
                .process(new ProcessAllWindowFunction<Event, String, TimeWindow>() {
                    @Override
                    public void process(Context ctx, Iterable<Event> elements, Collector<String> out) throws Exception {
                        // Collect the distinct users and distinct URLs of this window
                        HashSet<String> users = new HashSet<>();
                        HashSet<String> urls = new HashSet<>();
                        for (Event element : elements) {
                            urls.add(element.url);
                            users.add(element.user);
                        }
                        Timestamp start = new Timestamp(ctx.window().getStart());
                        Timestamp end = new Timestamp(ctx.window().getEnd());
                        out.collect("窗口:[ " + start + "~" + end + " ] -> 总计用户" + users.size() + ",总计访问" + urls.size() + "个独立页面。");
                    }
                });
        // 5. Print the results
        result.print("output ");
        // 6. Run the job
        env.execute();
    }
}

2.5、CoProcessFunction

2.5.1、函数详解

// CoProcessFunction需要3个参数:输入流1数据类型、输入流2数据类型、输出数据类型
// 由于继承AbstractRichFunction可以实现open和close生命周期方法
// 可以进行状态编程
public abstract class CoProcessFunction<IN1, IN2, OUT> extends AbstractRichFunction {
    // 处理第一条流的数据
    public abstract void processElement1(IN1 value, Context ctx, Collector<OUT> out)
            throws Exception;
    // 处理第二条流的数据
    public abstract void processElement2(IN2 value, Context ctx, Collector<OUT> out)
            throws Exception;
	// 定时器触发时调用方法
    public void onTimer(long timestamp, OnTimerContext ctx, Collector<OUT> out) throws Exception {}

2.5.2、基本使用

package com.hpsk.flink.function;

import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.common.state.ValueState;
import org.apache.flink.api.common.state.ValueStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.api.java.tuple.Tuple4;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.CoProcessFunction;
import org.apache.flink.util.Collector;

public class CoProcessFunctionDS {
    // Demo of CoProcessFunction: reconciles app-side payment events against
    // third-party payment events with the same order id; unmatched events time
    // out after 5 s of event time.
    public static void main(String[] args) throws Exception {
        // 1. Create the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        // 2. Build the two input streams
        // Payment log from the app: (orderId, channel, timestamp)
        SingleOutputStreamOperator<Tuple3<String, String, Long>> appStream = env.fromElements(
                Tuple3.of("order-1", "app", 1000L),
                Tuple3.of("order-2", "app", 2000L)
        ).assignTimestampsAndWatermarks(WatermarkStrategy.<Tuple3<String,
                String, Long>>forMonotonousTimestamps()
                .withTimestampAssigner(new SerializableTimestampAssigner<Tuple3<String, String, Long>>() {
                    @Override
                    public long extractTimestamp(Tuple3<String, String, Long> element, long recordTimestamp) {
                        return element.f2;
                    }
                })
        );
        
        // Payment log from the third-party platform: (orderId, channel, status, timestamp)
        SingleOutputStreamOperator<Tuple4<String, String, String, Long>> thirdpartStream =  env.fromElements(
                Tuple4.of("order-1", "third-party", "success", 3000L),
                Tuple4.of("order-3", "third-party", "success", 4000L)
        ).assignTimestampsAndWatermarks(WatermarkStrategy.<Tuple4<String,
                String, String, Long>>forMonotonousTimestamps()
                .withTimestampAssigner(new SerializableTimestampAssigner<Tuple4<String, String, String, Long>>() {
                   @Override
                   public long extractTimestamp(Tuple4<String, String, String, Long>
                                                        element, long recordTimestamp) {
                       return element.f3;
                   }
               })
        );
        
        // 3. Connect the two streams keyed by order id and reconcile with a CoProcessFunction
        SingleOutputStreamOperator<String> result = appStream
                .connect(thirdpartStream)
                .keyBy(t -> t.f0, t -> t.f0)
                .process(new CoProcessFunction<Tuple3<String, String, Long>, Tuple4<String, String, String, Long>, String>() {
                    // Last unmatched app-side event for the current key
                    private ValueState<Tuple3<String, String, Long>> appEventState;
                    // Last unmatched third-party event for the current key
                    private ValueState<Tuple4<String, String, String, Long>> thirdPartyEventState;

                    @Override
                    public void open(Configuration parameters) throws Exception {
                        appEventState = getRuntimeContext().getState(new ValueStateDescriptor<Tuple3<String, String, Long>>("appEventState", Types.TUPLE(Types.STRING, Types.STRING, Types.LONG)));
                        thirdPartyEventState = getRuntimeContext().getState(new ValueStateDescriptor<Tuple4<String, String, String, Long>>("thirdPartyEventState", Types.TUPLE(Types.STRING, Types.STRING, Types.STRING, Types.LONG)));
                    }

                    @Override
                    public void processElement1(Tuple3<String, String, Long> value, Context ctx, Collector<String> out) throws Exception {
                        // Has the matching third-party event already arrived?
                        if (thirdPartyEventState.value() != null) {
                            out.collect("对账成功 : " + value + " " + thirdPartyEventState.value());
                            // Matched: clear the stored third-party event
                            thirdPartyEventState.clear();
                        } else {
                            // Not matched yet: remember this event
                            appEventState.update(value);
                            // Register a timer 5 s ahead and wait for the other stream
                            ctx.timerService().registerEventTimeTimer(value.f2 + 5000L);
                        }
                    }


                    @Override
                    public void processElement2(Tuple4<String, String, String, Long> value, Context ctx, Collector<String> out) throws Exception {
                        // Has the matching app-side event already arrived?
                        if (appEventState.value() != null) {
                            out.collect("对账成功:" + appEventState.value() + " " + value);
                            // Matched: clear the stored app event
                            appEventState.clear();
                        } else {
                            // Not matched yet: remember this event
                            thirdPartyEventState.update(value);
                            // Register a timer 5 s ahead and wait for the other stream
                            ctx.timerService().registerEventTimeTimer(value.f3 + 5000L);
                        }
                    }

                    @Override
                    public void onTimer(long timestamp, OnTimerContext ctx, Collector<String> out) throws Exception {
                        // Timer fired: any state still present means the other side never arrived
                        if (appEventState.value() != null) {
                            out.collect("对账失败:" + appEventState.value() + " " + "第三方支付平台信息未到");
                        }

                        // Same timeout check for the third-party side
                        if (thirdPartyEventState.value() != null) {
                            out.collect("对账失败:" + thirdPartyEventState.value() + " " + "app信息未到");
                        }
                        appEventState.clear();
                        thirdPartyEventState.clear();
                    }
                });
        // 4. Print the reconciliation results
        result.print("output ");
        // 5. Run the job
        env.execute();
    }
}

2.6、ProcessJoinFunction

2.6.1、函数详解

// ProcessJoinFunction需要3个参数:输入流1数据类型、输入流2数据类型、输出数据类型
// 由于继承AbstractRichFunction可以实现open和close生命周期方法
// 可以进行状态编程
public abstract class ProcessJoinFunction<IN1, IN2, OUT> extends AbstractRichFunction {
    public abstract void processElement(IN1 left, IN2 right, Context ctx, Collector<OUT> out)
            throws Exception;

2.6.2、基本使用

package com.hpsk.flink.function;

import com.hpsk.flink.beans.Event;
import org.apache.flink.api.common.eventtime.SerializableTimestampAssigner;
import org.apache.flink.api.common.eventtime.WatermarkStrategy;
import org.apache.flink.api.java.tuple.Tuple3;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.ProcessJoinFunction;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.util.Collector;

import java.sql.Timestamp;
// Interval-join demo: each order is joined with clicks by the same user whose
// timestamp lies in [order.ts - 5 s, order.ts + 10 s].
public class ProcessJoinFunctionDS {
    public static void main(String[] args) throws Exception {
        // 1. Create the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        // 2. Stream one: orders (user, orderId, timestamp)
        SingleOutputStreamOperator<Tuple3<String, String, Long>> orderStream = env.fromElements(
                Tuple3.of("Mary", "order-1", 5000L),
                Tuple3.of("Alice", "order-2", 5000L),
                Tuple3.of("Bob", "order-3", 20000L),
                Tuple3.of("Alice", "order-4", 20000L),
                Tuple3.of("Cary", "order-5", 51000L)
        ).assignTimestampsAndWatermarks(WatermarkStrategy.<Tuple3<String,
                String, Long>>forMonotonousTimestamps()
                .withTimestampAssigner(new SerializableTimestampAssigner<Tuple3<String, String, Long>>() {
                    @Override
                    public long extractTimestamp(Tuple3<String, String, Long> element, long recordTimestamp) {
                        return element.f2;
                    }
                })
        );
        // 3. Stream two: page clicks
        SingleOutputStreamOperator<Event> clickStream = env.fromElements(
                new Event("Bob", "./cart", 2000L),
                new Event("Alice", "./prod?id=100", 3000L),
                new Event("Alice", "./prod?id=200", 3500L),
                new Event("Bob", "./prod?id=2", 2500L),
                new Event("Alice", "./prod?id=300", 36000L),
                new Event("Bob", "./home", 30000L),
                new Event("Bob", "./prod?id=1", 23000L),
                new Event("Bob", "./prod?id=3", 33000L)
        ).assignTimestampsAndWatermarks(WatermarkStrategy.<Event>forMonotonousTimestamps()
                .withTimestampAssigner(new SerializableTimestampAssigner<Event>() {
                    @Override
                    public long extractTimestamp(Event element, long recordTimestamp) {
                        return element.timestamp;
                    }
                })
        );
        // 4. Interval-join stream one with stream two, both keyed by user
        SingleOutputStreamOperator<String> result = orderStream
                .keyBy(t -> t.f0)
                .intervalJoin(clickStream.keyBy(t -> t.user))
                // a.timestamp + lowerBound <= b.timestamp <= a.timestamp + upperBound
                .between(Time.seconds(-5), Time.seconds(10))
                .process(new ProcessJoinFunction<Tuple3<String, String, Long>, Event, String>() {
                    @Override
                    public void processElement(Tuple3<String, String, Long> left, Event right, Context ctx, Collector<String> out) throws Exception {
                        // Emit every matched (click => order) pair
                        out.collect(right + " => {" + left.f0 + ", " + left.f1 + ", " + new Timestamp(left.f2) + "}");
                    }
                });
        // 5. Print the join results
        result.print("output ");
        // 6. Run the job
        env.execute();
    }
}

2.7、BroadcastProcessFunction

2.7.1、函数详解

// BroadcastProcessFunction需要3个参数:输入流1数据类型、输入流2数据类型、输出数据类型
// 由于继承AbstractRichFunction可以实现open和close生命周期方法
// 可以进行状态编程
public abstract class BroadcastProcessFunction<IN1, IN2, OUT> extends BaseBroadcastProcessFunction {
    // 主流数据处理方法:获取广播数据,对主流进行处理
    public abstract void processElement(
            final IN1 value, final ReadOnlyContext ctx, final Collector<OUT> out) throws Exception;
    // 广播流数据处理方法:广播流进行广播数据
    public abstract void processBroadcastElement(
            final IN2 value, final Context ctx, final Collector<OUT> out) throws Exception;

2.7.2、基本使用

package com.hpsk.flink.function;

import org.apache.flink.api.common.state.BroadcastState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.state.ReadOnlyBroadcastState;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.BroadcastStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.BroadcastProcessFunction;
import org.apache.flink.util.Collector;

// Simulates dynamic configuration for creating dimension tables in a real-time warehouse
public class BroadcastProcessFunctionDS {
    public static void main(String[] args) throws Exception {
        // 1. Create the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        // 2. Config stream: which tables are dimension tables ("name,action")
        SingleOutputStreamOperator<String> tableConfigStream = env.fromElements(
                "table1,createTable",
                "table2,createTable",
                "table3,createTable");

        // 3. Main stream: change records from the business database (table, data)
        SingleOutputStreamOperator<Tuple2<String, String>> MySqlTableStream = env.fromElements(
                Tuple2.of("table1", "data"),
                Tuple2.of("table2", "data"),
                Tuple2.of("table4", "data")
        );
        // Turn the config stream into a broadcast stream backed by map state
        MapStateDescriptor<String, String> mapStateDescriptor = new MapStateDescriptor<>("map-state", String.class, String.class);
        BroadcastStream<String> broadcast = tableConfigStream.broadcast(mapStateDescriptor);
        // 4. Connect main and broadcast streams; route each record using the config
        SingleOutputStreamOperator<String> result = MySqlTableStream
                .connect(broadcast)
                .process(new MyBroadcastProcessFunction(mapStateDescriptor));
        // 5. Print the results
        result.print("output ");
        // 6. Run the job
        env.execute();
    }

    public static class MyBroadcastProcessFunction extends BroadcastProcessFunction<Tuple2<String, String>, String, String>{
        // Descriptor shared with main() so both sides address the same broadcast state
        private MapStateDescriptor<String, String> mapStateDescriptor;

        public MyBroadcastProcessFunction(MapStateDescriptor<String, String> mapStateDescriptor) {
            this.mapStateDescriptor = mapStateDescriptor;
        }

        @Override
        public void processBroadcastElement(String value, Context ctx, Collector<String> out) throws Exception {
            // Parse "table,action" and store it in the broadcast state
            BroadcastState<String, String> configBroadcast = ctx.getBroadcastState(mapStateDescriptor);
            String[] split = value.split(",");
            configBroadcast.put(split[0].trim(), split[1].trim());
        }

        @Override
        public void processElement(Tuple2<String, String> value, ReadOnlyContext ctx, Collector<String> out) throws Exception {
            // Look this record's table up in the broadcast config to decide how to treat it
            ReadOnlyBroadcastState<String, String> broadcastState = ctx.getBroadcastState(mapStateDescriptor);
            String table = value.f0;
            String create = broadcastState.get(table);
            if (create != null) {
                out.collect(value.f0 + "为配置表,需要在phoenix中建表 -> 建表语句:" + create + ", 数据为:" + value.f1);
            } else {
                out.collect(value.f0 + "业务表, 跳过建表");
            }
        }

    }
}

2.8、KeyedBroadcastProcessFunction

2.8.1、函数详解

// KeyedBroadcastProcessFunction需要4个参数:keyBy的数据类型、输入流1数据类型、输入流2数据类型、输出数据类型
// 由于继承AbstractRichFunction可以实现open和close生命周期方法
// 可以进行状态编程
public abstract class KeyedBroadcastProcessFunction<KS, IN1, IN2, OUT>
        extends BaseBroadcastProcessFunction {
    // 处理主流数据的方法:获取广播流广播的数据处理主流
    public abstract void processElement(
            final IN1 value, final ReadOnlyContext ctx, final Collector<OUT> out) throws Exception;
    // 处理广播流的方法:将数据广播出去
    public abstract void processBroadcastElement(
            final IN2 value, final Context ctx, final Collector<OUT> out) throws Exception;
    // 定时器触发调用的方法
    public void onTimer(final long timestamp, final OnTimerContext ctx, final Collector<OUT> out)
            throws Exception {}

2.8.2、基本使用

package com.hpsk.flink.function;

import com.hpsk.flink.beans.Event;
import com.hpsk.flink.source.EventWithWatermarkSource;
import org.apache.flink.api.common.state.BroadcastState;
import org.apache.flink.api.common.state.MapState;
import org.apache.flink.api.common.state.MapStateDescriptor;
import org.apache.flink.api.common.state.ReadOnlyBroadcastState;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.BroadcastStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.co.KeyedBroadcastProcessFunction;
import org.apache.flink.util.Collector;

public class KeyedBroadcastProcessFunctionDS {
    // Demo of KeyedBroadcastProcessFunction: the broadcast stream carries a user
    // whitelist; the keyed main stream counts visits per user and reports counts
    // only for whitelisted users.
    public static void main(String[] args) throws Exception {
        // 1. Create the execution environment
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        // 2. Config stream: the users to report on
        SingleOutputStreamOperator<String> tableConfigStream = env.fromElements("Tom");

        // 3. Main stream: events from the custom source
        SingleOutputStreamOperator<Event> MySqlTableStream = env.addSource(new EventWithWatermarkSource());
        // Turn the config stream into a broadcast stream backed by map state
        MapStateDescriptor<String, String> mapStateDescriptor = new MapStateDescriptor<>("map-state", String.class, String.class);
        BroadcastStream<String> broadcast = tableConfigStream.broadcast(mapStateDescriptor);
        // 4. Key the main stream by user, connect with the broadcast stream, process
        SingleOutputStreamOperator<String> result = MySqlTableStream
                .keyBy(t -> t.user)
                .connect(broadcast)
                .process(new MyBroadcastProcessFunction(mapStateDescriptor));
        // 5. Print the results
        result.print("output ");
        // 6. Run the job
        env.execute();
    }

    public static class MyBroadcastProcessFunction extends KeyedBroadcastProcessFunction<String, Event, String, String>{
        // Descriptor shared with main() so both sides address the same broadcast state
        private MapStateDescriptor<String, String> mapStateDescriptor;
        // Keyed map state: visit count per user
        private MapState<String, Long> eventMapState;

        @Override
        public void open(Configuration parameters) throws Exception {
            eventMapState = getRuntimeContext().getMapState(new MapStateDescriptor<String, Long>("event-map-state", String.class, Long.class));
        }

        public MyBroadcastProcessFunction(MapStateDescriptor<String, String> mapStateDescriptor) {
            this.mapStateDescriptor = mapStateDescriptor;
        }

        @Override
        public void processElement(Event value, ReadOnlyContext ctx, Collector<String> out) throws Exception {
            // Increment this user's visit count in keyed state
            if (eventMapState.contains(value.user)) {
                Long num = eventMapState.get(value.user);
                eventMapState.put(value.user, num + 1);
            } else {
                eventMapState.put(value.user, 1L);
            }

            // Only report counts for users present in the broadcast state
            ReadOnlyBroadcastState<String, String> broadcastState = ctx.getBroadcastState(mapStateDescriptor);
            String user = value.user;
            String userConfig = broadcastState.get(user);
            if (userConfig != null) {
                Long aLong = eventMapState.get(value.user);
                out.collect("用户 "+ value.user + " 访问次数 -> " + aLong);
            }
        }

        @Override
        public void processBroadcastElement(String value, Context ctx, Collector<String> out) throws Exception {
            // Store each broadcast element in the broadcast state (key == value)
            BroadcastState<String, String> configBroadcast = ctx.getBroadcastState(mapStateDescriptor);
            configBroadcast.put(value, value);
        }
    }
}

三、热门url案例

3.1、需求

统计最近10秒内最热门的url链接,并且每5秒钟更新一次

3.2、实现代码

3.2.1、Event类

package com.hpsk.flink.beans;

import java.sql.Timestamp;

/**
 * POJO representing one user visit event.
 * Public fields and the no-arg constructor keep it usable as a Flink POJO type.
 */
public class Event {
    public String user;     // user name
    public String url;      // visited url
    public Long timestamp;  // visit time, epoch millis

    public Event(){

    }

    public Event(String user, String url, Long timestamp) {
        this.user = user;
        this.url = url;
        this.timestamp = timestamp;
    }

    @Override
    public String toString() {
        // Guard against NPE: the no-arg constructor leaves timestamp null,
        // and new Timestamp(null) would unbox and throw.
        return "Event{" +
                "user='" + user + '\'' +
                ", url='" + url + '\'' +
                ", timestamp=" + (timestamp == null ? "null" : new Timestamp(timestamp)) +
                '}';
    }
}

3.2.2、UrlViewCount类

package com.hpsk.flink.beans;

import java.sql.Timestamp;

/**
 * POJO carrying the per-window visit count of one url.
 * Public fields and the no-arg constructor keep it usable as a Flink POJO type.
 */
public class UrlViewCount {
    public Long windowStart; // window start, epoch millis
    public Long windowEnd;   // window end, epoch millis
    public String url;       // url
    public Long count;       // number of visits in the window

    public UrlViewCount() {
    }

    public UrlViewCount(Long windowStart, Long windowEnd, String url, Long count) {
        this.windowStart = windowStart;
        this.windowEnd = windowEnd;
        this.url = url;
        this.count = count;
    }

    @Override
    public String toString() {
        // Null-guard the timestamps (no-arg constructor leaves them null) and
        // close the quote after url, matching Event.toString's style.
        return "UrlViewCount{" +
                "windowStart=" + (windowStart == null ? "null" : new Timestamp(windowStart)) +
                ", windowEnd=" + (windowEnd == null ? "null" : new Timestamp(windowEnd)) +
                ", url='" + url + '\'' +
                ", count=" + count +
                '}';
    }
}

3.2.3、TopNExample类

package com.hpsk.flink.demand;

import com.hpsk.flink.beans.Event;
import com.hpsk.flink.beans.UrlViewCount;
import com.hpsk.flink.source.EventWithWatermarkSource;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.windowing.ProcessAllWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;

import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;

/**
 * 统计最近10秒内最热门的url链接,并且每5秒钟更新一次
 */
/**
 * Computes the top-N (hard-coded top-2) most visited urls over a 10s sliding
 * window with a 5s slide. All events flow through one windowAll operator, so
 * this version does not parallelize; see the improved variant below.
 */
public class TopNExample {
    public static void main(String[] args) throws Exception {
        // 1. Create the execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        // 2. Read from the custom source (it emits event-time watermarks itself).
        DataStream<Event> inputDS = env.addSource(new EventWithWatermarkSource());

        // 3. Print the raw input.
        inputDS.print("input ");

        // 4. Compute top-N. The original keyBy(t -> t.url) before windowAll was
        // removed: windowAll ignores keying, and the HashMap accumulator already
        // counts every url, so the extra shuffle served no purpose.
        SingleOutputStreamOperator<UrlViewCount> result = inputDS
                .windowAll(SlidingEventTimeWindows.of(Time.seconds(10), Time.seconds(5)))
                .aggregate(new UrlHashViewCountAgg(), new UrlHashViewCountResult());

        // 5. Print the result.
        result.print("output ");

        // 6. Execute the job.
        env.execute();
    }

    /**
     * Incremental aggregate: counts visits per url in a HashMap accumulator and
     * returns the urls sorted by count descending.
     */
    public static class UrlHashViewCountAgg implements AggregateFunction<Event, HashMap<String, Long>, ArrayList<Tuple2<String, Long>>>{

        @Override
        public HashMap<String, Long> createAccumulator() {
            return new HashMap<>();
        }

        @Override
        public HashMap<String, Long> add(Event value, HashMap<String, Long> accumulator) {
            // Increment the url's count, starting at 1 on first sight.
            Long count = accumulator.get(value.url);
            accumulator.put(value.url, count == null ? 1L : count + 1);
            return accumulator;
        }

        @Override
        public ArrayList<Tuple2<String, Long>> getResult(HashMap<String, Long> accumulator) {
            // Copy the HashMap entries into a list ...
            ArrayList<Tuple2<String, Long>> result = new ArrayList<>();
            for (String key : accumulator.keySet()) {
                result.add(Tuple2.of(key, accumulator.get(key)));
            }
            // ... and sort by count, descending.
            result.sort(new Comparator<Tuple2<String, Long>>() {
                @Override
                public int compare(Tuple2<String, Long> o1, Tuple2<String, Long> o2) {
                    return o2.f1.compareTo(o1.f1);
                }
            });
            return result;
        }

        @Override
        public HashMap<String, Long> merge(HashMap<String, Long> a, HashMap<String, Long> b) {
            // Sum the per-url counts of both partial accumulators (only used by
            // merging window assigners such as session windows; the original
            // returned null here).
            HashMap<String, Long> merged = new HashMap<>(a);
            b.forEach((url, cnt) -> merged.merge(url, cnt, Long::sum));
            return merged;
        }
    }

    /**
     * ProcessAllWindowFunction: attaches window start/end to each entry and
     * emits the top 2 urls of the window.
     */
    private static class UrlHashViewCountResult extends ProcessAllWindowFunction<ArrayList<Tuple2<String, Long>>, UrlViewCount, TimeWindow> {

        @Override
        public void process(Context ctx, Iterable<ArrayList<Tuple2<String, Long>>> elements, Collector<UrlViewCount> out) throws Exception {
            long start = ctx.window().getStart();
            long end = ctx.window().getEnd();
            // The aggregate delivers exactly one pre-sorted list per window.
            ArrayList<Tuple2<String, Long>> list = elements.iterator().next();
            // Hard-coded top-2; Math.min guards windows that saw fewer than 2 urls
            // (e.g. the very first window after startup).
            for (int i = 0; i < Math.min(list.size(), 2); i++) {
                out.collect(new UrlViewCount(start, end, list.get(i).f0, list.get(i).f1));
            }
        }
    }
}

3.3、代码改进

3.3.1、TopNExample类不足

利用windowAll方法将数据放入一个窗口进行统计,失去了并行计算的意义。

3.3.2、改进代码

package com.hpsk.flink.demand;

import com.hpsk.flink.beans.Event;
import com.hpsk.flink.beans.UrlViewCount;
import com.hpsk.flink.source.EventWithWatermarkSource;
import org.apache.flink.api.common.functions.AggregateFunction;
import org.apache.flink.api.common.state.ListState;
import org.apache.flink.api.common.state.ListStateDescriptor;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.configuration.Configuration;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.KeyedProcessFunction;
import org.apache.flink.streaming.api.functions.windowing.ProcessWindowFunction;
import org.apache.flink.streaming.api.windowing.assigners.SlidingEventTimeWindows;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;

import java.util.ArrayList;
import java.util.Comparator;

/**
 * 统计最近10秒内最热门的url链接,并且每5秒钟更新一次
 */
/**
 * Parallel-friendly top-N: counts per url with a keyed sliding window, then
 * re-keys the per-window results by windowEnd and ranks them in a
 * KeyedProcessFunction using an event-time timer.
 */
public class UrlTopNViewCount {
    public static void main(String[] args) throws Exception {
        // 1. Create the execution environment.
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setParallelism(1);
        // 2. Read from the custom source (it emits event-time watermarks itself).
        DataStream<Event> inputDS = env.addSource(new EventWithWatermarkSource());

        // 3. Print the raw input.
        inputDS.print("input ");

        // 4. Count visits per url in a 10s window sliding every 5s.
        SingleOutputStreamOperator<UrlViewCount> result = inputDS
                .keyBy(t -> t.url)
                .window(SlidingEventTimeWindows.of(Time.seconds(10), Time.seconds(5)))
                .aggregate(new UrlViewCountAgg(), new UrlViewCountResult());

        // 5. Rank the counts of each window (keyed by window end) to get the top N.
        SingleOutputStreamOperator<UrlViewCount> topNResult = result
                .keyBy(t -> t.windowEnd)
                .process(new TopNProcessResult(2));

        // 6. Print the result.
        topNResult.print("output ");

        // 7. Execute the job.
        env.execute();
    }

    /**
     * Incremental AggregateFunction: a plain counter per key (url).
     */
    public static class UrlViewCountAgg implements AggregateFunction<Event, Long, Long>{

        @Override
        public Long createAccumulator() {
            return 0L;
        }

        @Override
        public Long add(Event value, Long accumulator) {
            return accumulator + 1;
        }

        @Override
        public Long getResult(Long accumulator) {
            return accumulator;
        }

        @Override
        public Long merge(Long a, Long b) {
            // Sum the partial counts (only used by merging window assigners;
            // the original returned null here).
            return a + b;
        }
    }

    /**
     * ProcessWindowFunction: wraps the single aggregated count with the
     * window's start/end and its url key.
     */
    private static class UrlViewCountResult extends ProcessWindowFunction<Long, UrlViewCount, String, TimeWindow> {
        @Override
        public void process(String url, Context ctx, Iterable<Long> elements, Collector<UrlViewCount> out) throws Exception {
            long start = ctx.window().getStart();
            long end = ctx.window().getEnd();
            // Exactly one element per window: the aggregated count.
            long count = elements.iterator().next();
            out.collect(new UrlViewCount(start, end, url, count));
        }
    }

    /**
     * KeyedProcessFunction: collects all UrlViewCounts of one window in
     * ListState and, when the event-time timer fires just after the window
     * closes, sorts them and emits the top N.
     */
    private static class TopNProcessResult extends KeyedProcessFunction<Long, UrlViewCount, UrlViewCount> {
        // How many top entries to emit per window.
        private Integer n;
        // Keyed (by windowEnd) list of the window's per-url counts.
        private ListState<UrlViewCount> urlViewCountListState;

        public TopNProcessResult(Integer n) {
            this.n = n;
        }

        @Override
        public void open(Configuration parameters) throws Exception {
            // Obtain the ListState handle from the runtime context.
            urlViewCountListState = getRuntimeContext().getListState(new ListStateDescriptor<UrlViewCount>("url-list-state", Types.POJO(UrlViewCount.class)));
        }

        @Override
        public void processElement(UrlViewCount value, Context ctx, Collector<UrlViewCount> out) throws Exception {
            // Buffer the count for this window.
            urlViewCountListState.add(value);
            // Fire right after the window end; registering the same timer
            // repeatedly is a no-op, so every element may do it.
            ctx.timerService().registerEventTimeTimer(value.windowEnd + 1);
        }

        @Override
        public void onTimer(long timestamp, OnTimerContext ctx, Collector<UrlViewCount> out) throws Exception {
            // Copy the buffered counts into a list for sorting.
            ArrayList<UrlViewCount> urlViewCountArrayList = new ArrayList<>();
            for (UrlViewCount urlViewCount : urlViewCountListState.get()) {
                urlViewCountArrayList.add(urlViewCount);
            }
            // Sort by count, descending.
            urlViewCountArrayList.sort(new Comparator<UrlViewCount>() {
                @Override
                public int compare(UrlViewCount o1, UrlViewCount o2) {
                    return o2.count.compareTo(o1.count);
                }
            });

            // Emit the top N. Bound by the list size: the original looped to n
            // unconditionally and threw IndexOutOfBoundsException whenever a
            // window contained fewer than n distinct urls.
            for (int i = 0; i < Math.min(n, urlViewCountArrayList.size()); i++) {
                out.collect(urlViewCountArrayList.get(i));
            }
            // Release the state for this window.
            urlViewCountListState.clear();
        }
    }
}

你可能感兴趣的:(Flink,flink,java,servlet)