A small Flink example: side output, Table, SQL, and multiple sinks

Preface

This small example demonstrates the use of Flink side output, Table, SQL, and multiple sinks. It consumes a stream from a single source, writes the main stream to HDFS, and branches a side output off the main stream. The side-output data is processed in one-second windows to compute PV (page views), average response time, error rate (the share of records whose status is not 200), and so on, and the results are written to a local CSV file.

Custom source

import org.apache.flink.streaming.api.functions.source.SourceFunction;

import java.util.UUID;

public class GeneData implements SourceFunction<String> {

    private volatile boolean isRunning = true;

    static int[] status = {200, 404, 500, 501, 301};


    @Override
    public void run(SourceContext<String> ctx) throws Exception {

        while (isRunning) {
            Thread.sleep((int) (Math.random() * 5));
            // traceid,userid,timestamp,status,response time


            StringBuffer ss = new StringBuffer();
            ss.append(UUID.randomUUID().toString());
            ss.append(",");
            ss.append((int) (Math.random() * 100));
            ss.append(",");
            ss.append(System.currentTimeMillis());
            ss.append(",");
            ss.append(status[(int) (Math.random() * status.length)]); // pick one of the five status codes
            ss.append(",");
            ss.append((int) (Math.random() * 200));

            ctx.collect(ss.toString());
        }

    }

    @Override
    public void cancel() {
        // stop the generating loop in run()
        isRunning = false;
    }
}
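
To sanity-check the records this source produces before wiring up the full job below, a minimal, hypothetical GeneDataSmokeTest like the following (using the same Flink streaming dependency as above) simply prints them to the console:

import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

public class GeneDataSmokeTest {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        // each printed line looks like: traceid,userid,timestamp,status,responseTime
        env.addSource(new GeneData()).print();
        env.execute("GeneData smoke test");
    }
}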

// Imports assumed for the Flink 1.7.x-era APIs used in this example (legacy Table API, BucketingSink).
import org.apache.flink.api.common.typeinfo.TypeInformation;
import org.apache.flink.api.common.typeinfo.Types;
import org.apache.flink.api.java.tuple.Tuple5;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.functions.AssignerWithPunctuatedWatermarks;
import org.apache.flink.streaming.api.functions.ProcessFunction;
import org.apache.flink.streaming.api.watermark.Watermark;
import org.apache.flink.streaming.connectors.fs.StringWriter;
import org.apache.flink.streaming.connectors.fs.bucketing.BucketingSink;
import org.apache.flink.streaming.connectors.fs.bucketing.DateTimeBucketer;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.java.StreamTableEnvironment;
import org.apache.flink.table.sinks.CsvTableSink;
import org.apache.flink.table.sinks.TableSink;
import org.apache.flink.util.Collector;
import org.apache.flink.util.OutputTag;

import javax.annotation.Nullable;
import java.time.ZoneId;

/**
 * Example of side output, Table, SQL, and multiple sinks.
 *
 * What it does:
 * 1. The source receives data from a stream and a side output is branched off the main stream
 *    for the calculations, so the same stream is consumed only once instead of being read
 *    multiple times, saving network traffic.
 * 2. PV, average response time and error rate (share of records with status != 200) are
 *    computed with the Table API and SQL.
 * 3. The raw data is written to HDFS.
 * 4. The side-output result data is sent to a CSV sink.
 */
public class Dashboard {
    public static void main(String[] args) throws Exception {


        // traceid,userid,timestamp,status,response time
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.setStreamTimeCharacteristic(TimeCharacteristic.EventTime);

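        // Parse each raw line "traceid,userid,timestamp,status,responseTime" into a Tuple5 and use
        // the timestamp field as the event time. The punctuated assigner below emits a watermark
        // for every element, which is fine for a demo but chatty for a high-volume stream.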
        DataStream<Tuple5<String, Integer, Long, Integer, Integer>> ds = env.addSource(new GeneData()).
                flatMap((String value, Collector<Tuple5<String, Integer, Long, Integer, Integer>> out) -> {

                    String ss[] = value.split(",");

                    out.collect(Tuple5.of(ss[0], Integer.parseInt(ss[1]), Long.parseLong(ss[2]), Integer.parseInt(ss[3]), Integer.parseInt(ss[4])));
                }).returns(Types.TUPLE(Types.STRING, Types.INT, Types.LONG, Types.INT, Types.INT))
                .assignTimestampsAndWatermarks(new AssignerWithPunctuatedWatermarks<Tuple5<String, Integer, Long, Integer, Integer>>() {
                    @Nullable
                    @Override
                    public Watermark checkAndGetNextWatermark(Tuple5<String, Integer, Long, Integer, Integer> lastElement, long extractedTimestamp) {
                        return new Watermark(lastElement.f2);
                    }

                    @Override
                    public long extractTimestamp(Tuple5<String, Integer, Long, Integer, Integer> element, long previousElementTimestamp) {
                        return element.f2;
                    }
                });


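        // The OutputTag is declared as an anonymous subclass so Flink can capture the Tuple5 type
        // information. The ProcessFunction duplicates each record: ctx.output sends it to the side
        // output, out.collect keeps it on the main stream.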
        final OutputTag<Tuple5<String, Integer, Long, Integer, Integer>> outputTag = new OutputTag<Tuple5<String, Integer, Long, Integer, Integer>>("side-output"){};


        SingleOutputStreamOperator<Tuple5<String, Integer, Long, Integer, Integer>> mainDataStream =  ds.process(new ProcessFunction<Tuple5<String, Integer, Long, Integer, Integer>, Tuple5<String, Integer, Long, Integer, Integer>>() {
            @Override
            public void processElement(Tuple5<String, Integer, Long, Integer, Integer> value, Context ctx, Collector<Tuple5<String, Integer, Long, Integer, Integer>> out) throws Exception {
                ctx.output(outputTag,value);
                out.collect(value);
            }
        });




        DataStream<Tuple5<String, Integer, Long, Integer, Integer>>  dataStream = mainDataStream.getSideOutput(outputTag);


        StreamTableEnvironment tenv = TableEnvironment.getTableEnvironment(env);
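        // Register the side-output stream as table "log". The first five names map to the Tuple5
        // fields; "proctime.proctime" and "rowtime.rowtime" append processing-time and event-time
        // attributes so the SQL below can use a TUMBLE window on rowtime.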
        tenv.registerDataStream("log", dataStream, "traceid,userid,timestamp,status,restime,proctime.proctime,rowtime.rowtime");


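        // The query drops status 404 records, then per 1-second tumbling event-time window computes
        // pv = count(*), avg_res_time = AVG(restime) and errorRate = non-200 share (2 decimals);
        // TUMBLE_START is shifted by +8 hours so the window start shows up in local UTC+8 time.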
        String sql = "select pv,avg_res_time,round(CAST(errorcount AS DOUBLE)/pv,2) as errorRate,(starttime + interval '8' hour ) as stime from (select count(*) as pv,AVG(restime) as avg_res_time  ," +
                "sum(case when status = 200 then 0 else 1 end) as errorcount, " +
                "TUMBLE_START(rowtime,INTERVAL '1' SECOND)  as starttime " +
                "from log where status <> 404 group by TUMBLE(rowtime,INTERVAL '1' SECOND) )";

        Table result1 = tenv.sqlQuery(sql);


        //write to csv sink
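        // Columns a, b, c, d receive pv, avg_res_time, errorRate and stime positionally from the
        // query result; each window result is appended to the file as one '|'-delimited line.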

        TableSink csvSink = new CsvTableSink("/Users/user/work/flink_data/log", "|");
        String[] fieldNames = {"a", "b", "c", "d"};
        TypeInformation[] fieldTypes = {Types.LONG, Types.INT, Types.DOUBLE, Types.SQL_TIMESTAMP};
        tenv.registerTableSink("CsvSinkTable", fieldNames, fieldTypes, csvSink);
        result1.insertInto("CsvSinkTable");





        //write to hdfs sink
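        // The raw Tuple5 records of the main stream are written to one HDFS bucket per minute
        // ("yyyy-MM-dd--HHmm" in UTC+8); part files roll over at 10 MB or after 60 seconds.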
        BucketingSink<Tuple5<String, Integer, Long, Integer, Integer>> sink = new BucketingSink<>("hdfs://localhost/logs/");
        sink.setUseTruncate(false);
//        sink.setBucketer(new Bucketer<Tuple5<String, Integer, Long, Integer, Integer>>() {
//            @Override
//            public Path getBucketPath(Clock clock, Path basePath, Tuple5<String, Integer, Long, Integer, Integer> element) {
//                String newDateTimeString = dateTimeFormatter.format(Instant.ofEpochMilli(element.f2));
//                return new Path(basePath + "/" + newDateTimeString);
//            }
//        });
        sink.setBucketer(new DateTimeBucketer<Tuple5<String, Integer, Long, Integer, Integer>>("yyyy-MM-dd--HHmm", ZoneId.of("UTC+8")));
        sink.setWriter(new StringWriter<Tuple5<String, Integer, Long, Integer, Integer>>());
        sink.setBatchSize(1024 * 1024 * 10); // this is 10 MB,
        sink.setBatchRolloverInterval(60 * 1000); // this is 1 min
        ds.addSink(sink);



//        tenv.toAppendStream(result1, Result.class).addSink(sink);

        //print the result to the console
//        tenv.toAppendStream(result1, Row.class).print();




        env.execute();

    }


}
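
The commented-out tenv.toAppendStream(result1, Result.class) line refers to a Result class that the post never shows. If you want to enable it, a minimal sketch of such a POJO could look like the following; the field names are assumptions taken from the SQL aliases, and the types match the CSV sink schema:

import java.sql.Timestamp;

// Hypothetical result POJO: Flink maps the query columns onto the public fields by name.
public class Result {
    public Long pv;               // count(*) per 1-second window
    public Integer avg_res_time;  // AVG(restime)
    public Double errorRate;      // share of non-200 records, rounded to 2 decimals
    public Timestamp stime;       // window start shifted to UTC+8

    public Result() {
    }
}

To actually run the job you will likely also need the Flink table and filesystem-connector dependencies on the classpath (CsvTableSink and BucketingSink come from flink-table and flink-connector-filesystem in the 1.x line), an HDFS reachable at hdfs://localhost, and a writable local path in place of /Users/user/work/flink_data/log.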
