12. Flink in Practice: DataStream, DataSet, Table API, and SQL Implementations

Flink API implementations

    • 1. DataStream implementation
    • 2. DataSet implementation
    • 3. Table API & SQL implementation

Today we simulate a real production scenario: consume data from Kafka and implement a word count with each of Flink's APIs.

1. DataStream implementation

package com.bigdata.flink.datasteam;

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010;
import org.apache.flink.util.Collector;

import java.util.Properties;

public class KafkaWindowWordCount {
    public static void main(String[] args) throws Exception {
        final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        env.enableCheckpointing(5000); // checkpoint every 5 seconds
        env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
        env.getCheckpointConfig().setMinPauseBetweenCheckpoints(500);
        env.getCheckpointConfig().setCheckpointTimeout(60000);
        env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);
        // keep externalized checkpoints when the job is cancelled
        env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
        env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime);

        Properties properties = new Properties();
        properties.setProperty("bootstrap.servers", "localhost:9092"); // Kafka broker address
        properties.setProperty("group.id", "flink-group-name"); // Kafka consumer group

        FlinkKafkaConsumer010<String> consumer = new FlinkKafkaConsumer010<>("test", new SimpleStringSchema(),
                properties); // Kafka topic

        DataStream<WordWithCount> counts = env
                .addSource(consumer)
                .flatMap(new FlatMapFunction<String, WordWithCount>() {
                    @Override
                    public void flatMap(String value, Collector<WordWithCount> collector) throws Exception {
                        // each line is expected to contain tab-separated words
                        for (String word : value.split("\t")) {
                            collector.collect(new WordWithCount(word, 1L));
                        }
                    }
                })
                .keyBy("word")
                .timeWindow(Time.seconds(5))
                .reduce(new ReduceFunction<WordWithCount>() {
                    @Override
                    public WordWithCount reduce(WordWithCount a, WordWithCount b) throws Exception {
                        return new WordWithCount(a.word, a.count + b.count);
                    }
                });

        counts.print().setParallelism(1);
        env.execute("flink-kafka-wordcount");
    }

    public static class WordWithCount {
        private String word;
        private long count;

        public WordWithCount() {
        }

        public WordWithCount(String word, long count) {
            this.word = word;
            this.count = count;
        }

        public String getWord() {
            return word;
        }

        public void setWord(String word) {
            this.word = word;
        }

        public long getCount() {
            return count;
        }

        public void setCount(long count) {
            this.count = count;
        }

        @Override
        public String toString() {
            return word + " : " + count;
        }
    }
}

Submit the job with:

bin/flink run -c com.bigdata.flink.datasteam.KafkaWindowWordCount demo-1.0-SNAPSHOT.jar
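
To feed the job test data, you can write lines to the test topic with Kafka's console producer; note that the job splits each line on tab characters. A sketch assuming a local Kafka installation (adjust the path and broker address to your setup):

bin/kafka-console-producer.sh --broker-list localhost:9092 --topic test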

2. DataSet implementation

package com.bigdata.flink.dataset;

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

public class DataSetWordCountExample {
    public static void main(String[] args) throws Exception{
        final ExecutionEnvironment environment = ExecutionEnvironment.getExecutionEnvironment();

        DataSet<String> text = environment.fromElements("Who's there?",
                "I think I hear them. Stand, ho! Who's there?");
        DataSet<Tuple2<String, Integer>> wordCounts = text
                .flatMap(new LineSplitter())
                .groupBy(0)
                .sum(1);

        wordCounts.print();
    }

    public static class LineSplitter implements org.apache.flink.api.common.functions.FlatMapFunction<String, Tuple2<String, Integer>> {

        @Override
        public void flatMap(String s, Collector<Tuple2<String, Integer>> collector) {
            for (String word : s.split(" ")) {
                collector.collect(new Tuple2<>(word, 1));
            }
        }
    }
}

The output is as follows:

Starting execution of program
(I,2)
(Stand,,1)
(Who's,2)
(hear,1)
(ho!,1)
(them.,1)
(there?,2)
(think,1)
Program execution finished

3. Table API & SQL implementation

The first example converts DataStreams to Tables and unions the two streams:

package com.bigdata.flink.table;

import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.java.StreamTableEnvironment;

import java.util.Arrays;

public class DataStreamTable {
    public static void main(String[] args) throws Exception {
        StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
        StreamTableEnvironment tEnv = TableEnvironment.getTableEnvironment(env);

        DataStream<Order> orderA = env.fromCollection(Arrays.asList(
                new Order(1L, "啤酒", 21L),
                new Order(1L, "雪碧", 2L),
                new Order(2L, "可乐", 8L)
        ));

        DataStream<Order> orderB = env.fromCollection(Arrays.asList(
                new Order(3L, "王老吉", 2L),
                new Order(4L, "红牛", 3L),
                new Order(4L, "哇哈哈", 2L)
        ));

        Table tableA = tEnv.fromDataStream(orderA, "userId,product,amount");

        tEnv.registerDataStream("orderB", orderB, "userId,product,amount");

        Table result = tEnv.sqlQuery("select * from " + tableA + " where userId=1 union all select * from orderB " +
                "where amount > 2");

        DataStream<Order> resultStream = tEnv.toAppendStream(result, Order.class);

        resultStream.print();

        env.execute("table");
    }

    public static class Order {
        public Long userId;
        public String product;
        public Long amount;

        public Order() {
        }

        public Order(Long userId, String product, Long amount) {
            this.userId = userId;
            this.product = product;
            this.amount = amount;
        }

        @Override
        public String toString() {
            return "Order{" + "userId=" + userId + ", product='" + product + '\'' + ", amount=" + amount + '}';
        }
    }
}
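
The same query can also be expressed with Table API operators instead of SQL. A minimal sketch using the string-expression syntax of the old (pre-Blink) planner, where equality is written as ===:

Table unioned = tableA
        .where("userId === 1")
        .unionAll(tEnv.scan("orderB").where("amount > 2"));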

The second example implements word count on a DataSet with the Table API:

package com.bigdata.flink.table;

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.java.BatchTableEnvironment;

public class DataSetTable {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.createCollectionsEnvironment();

        BatchTableEnvironment tEnv = TableEnvironment.getTableEnvironment(env);
        DataSet<WC> input = env.fromElements(
                new WC("hello", 1),
                new WC("word", 1),
                new WC("hello", 1)
        );
        Table table = tEnv.fromDataSet(input);
        Table filtered = table.groupBy("word").select("word, frequency.sum as frequency");
        DataSet<WC> result = tEnv.toDataSet(filtered, WC.class);
        result.print();
    }

    public static class WC {
        public String word;
        public long frequency;

        public WC() {
        }

        public WC(String word, long frequency) {
            this.word = word;
            this.frequency = frequency;
        }

        @Override
        public String toString() {
            return "WC{" +
                    "word='" + word + '\'' +
                    ", frequency=" + frequency +
                    '}';
        }
    }
}    
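
Running it locally should print the aggregated counts (row order may vary):

WC{word='hello', frequency=2}
WC{word='word', frequency=1}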

The third example combines a DataSet with SQL:

package com.bigdata.flink.table;

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.java.BatchTableEnvironment;

public class DataSetTable {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.createCollectionsEnvironment();

        BatchTableEnvironment tEnv = TableEnvironment.getTableEnvironment(env);
        DataSet<WC> input = env.fromElements(
                new WC("hello", 1),
                new WC("word", 1),
                new WC("hello", 1)
        );
        tEnv.registerDataSet("WordCount", input);
        Table filtered = tEnv.sqlQuery("select word,sum(frequency) as frequency from WordCount group by word");
        DataSet<WC> result = tEnv.toDataSet(filtered, WC.class);
        result.print();
    }

    public static class WC {
        public String word;
        public long frequency;

        public WC() {
        }

        public WC(String word, long frequency) {
            this.word = word;
            this.frequency = frequency;
        }

        @Override
        public String toString() {
            return "WC{" +
                    "word='" + word + '\'' +
                    ", frequency=" + frequency +
                    '}';
        }
    }
}    

The fourth example uses a DataSet together with a custom UDF:

package com.bigdata.flink.table;

import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.java.BatchTableEnvironment;

public class DataSetTable {
    public static void main(String[] args) throws Exception {
        ExecutionEnvironment env = ExecutionEnvironment.createCollectionsEnvironment();

        BatchTableEnvironment tEnv = TableEnvironment.getTableEnvironment(env);

        tEnv.registerFunction("hashcode", new HashCodeUDF(10)); // register the UDF (defined below)

        DataSet<WC> input = env.fromElements(
                new WC("hello", 1),
                new WC("word", 1),
                new WC("hello", 1)
        );

        // exercise the UDF in a query
        tEnv.registerDataSet("wordCount", input);
        Table filtered = tEnv.sqlQuery("select word, hashcode(word) hashcode from wordCount");
        DataSet<UDF> result = tEnv.toDataSet(filtered, UDF.class);
        result.print();
    }
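
    // The original post does not show HashCodeUDF, so here is a minimal
    // ScalarFunction sketch that makes the example compile. Assumption: the
    // constructor argument is a factor applied to the string's hash code.
    public static class HashCodeUDF extends org.apache.flink.table.functions.ScalarFunction {
        private final int factor;

        public HashCodeUDF(int factor) {
            this.factor = factor;
        }

        public int eval(String s) {
            return s.hashCode() * factor;
        }
    }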

    public static class WC {
        public String word;
        public long frequency;

        public WC() {
        }

        public WC(String word, long frequency) {
            this.word = word;
            this.frequency = frequency;
        }

        @Override
        public String toString() {
            return "WC{" +
                    "word='" + word + '\'' +
                    ", frequency=" + frequency +
                    '}';
        }
    }

    public static class UDF {
        public String word;
        public Integer hashcode;

        public UDF() {
        }

        public UDF(String word, Integer hashcode) {
            this.word = word;
            this.hashcode = hashcode;
        }

        @Override
        public String toString() {
            return "UDF{" +
                    "word='" + word + '\'' +
                    ", hashcode=" + hashcode +
                    '}';
        }
    }
}

All of the Table API and SQL examples can be run locally to inspect their results.
