package com.bigdata.flink.datastream;
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.common.functions.ReduceFunction;
import org.apache.flink.api.common.serialization.SimpleStringSchema;
import org.apache.flink.streaming.api.CheckpointingMode;
import org.apache.flink.streaming.api.TimeCharacteristic;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.CheckpointConfig;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.connectors.kafka.FlinkKafkaConsumer010;
import org.apache.flink.util.Collector;
import java.util.Properties;
public class KafkaWindowWordCount {
public static void main(String[] args) throws Exception {
final StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
env.enableCheckpointing(5000);
env.getCheckpointConfig().setCheckpointingMode(CheckpointingMode.EXACTLY_ONCE);
env.getCheckpointConfig().setMinPauseBetweenCheckpoints(500);
env.getCheckpointConfig().setCheckpointTimeout(60000);
env.getCheckpointConfig().setMaxConcurrentCheckpoints(1);
env.getCheckpointConfig().enableExternalizedCheckpoints(CheckpointConfig.ExternalizedCheckpointCleanup.RETAIN_ON_CANCELLATION);
env.setStreamTimeCharacteristic(TimeCharacteristic.ProcessingTime);
Properties properties = new Properties();
properties.setProperty("bootstrap.servers", "localhost:9092");//kafka地址
properties.setProperty("group.id", "flink-group-name");//kafka消费组
FlinkKafkaConsumer010 consumer = new FlinkKafkaConsumer010<>("test", new SimpleStringSchema(),
properties);//kafka topic
DataStream<WordWithCount> counts = env
.addSource(consumer)
.flatMap(new FlatMapFunction<String, WordWithCount>() {
@Override
public void flatMap(String value, Collector<WordWithCount> collector) throws Exception {
for (String word : value.split("\t")) {
collector.collect(new WordWithCount(word, 1L));
}
}
})
.keyBy("word")
.timeWindow(Time.seconds(5))
.reduce(new ReduceFunction<WordWithCount>() {
@Override
public WordWithCount reduce(WordWithCount a, WordWithCount b) throws Exception {
return new WordWithCount(a.word, a.count + b.count);
}
});
counts.print().setParallelism(1);
env.execute("flink-kafka-wordcount");
}
public static class WordWithCount {
private String word;
private long count;
public WordWithCount() {
}
public WordWithCount(String word, long count) {
this.word = word;
this.count = count;
}
public String getWord() {
return word;
}
public void setWord(String word) {
this.word = word;
}
public long getCount() {
return count;
}
public void setCount(long count) {
this.count = count;
}
@Override
public String toString() {
return word + " : " + count;
}
}
}
Submit the job with:
bin/flink run -c com.bigdata.flink.datastream.KafkaWindowWordCount demo-1.0-SNAPSHOT.jar
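To give the job something to read, write tab-separated words to the test topic (the flatMap above splits on \t). A minimal producer sketch, assuming a local broker on localhost:9092 and the standard kafka-clients dependency:
package com.bigdata.flink.datastream;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import java.util.Properties;
public class TestDataProducer {
public static void main(String[] args) {
Properties props = new Properties();
props.setProperty("bootstrap.servers", "localhost:9092");
props.setProperty("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
props.setProperty("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
// Words are tab-separated because the job's flatMap splits on "\t".
try (KafkaProducer<String, String> producer = new KafkaProducer<>(props)) {
producer.send(new ProducerRecord<>("test", "hello\tflink\thello"));
}
}
}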
package com.bigdata.flink.dataset;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
public class DataSetWordCountExample {
public static void main(String[] args) throws Exception{
final ExecutionEnvironment environment = ExecutionEnvironment.getExecutionEnvironment();
DataSet<String> text = environment.fromElements("Who's there?",
"I think I hear them. Stand, ho! Who's there?");
DataSet<Tuple2<String, Integer>> wordCounts = text
.flatMap(new LineSplitter())
.groupBy(0)
.sum(1);
wordCounts.print();
}
public static class LineSplitter implements org.apache.flink.api.common.functions.FlatMapFunction<String, Tuple2<String, Integer>> {
@Override
public void flatMap(String s, Collector<Tuple2<String, Integer>> collector) {
for (String word : s.split(" ")) {
collector.collect(new Tuple2<>(word,1));
}
}
}
}
The output is as follows:
Starting execution of program
(I,2)
(Stand,,1)
(Who's,2)
(hear,1)
(ho!,1)
(them.,1)
(there?,2)
(think,1)
Program execution finished
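Note that print() on a DataSet triggers execution by itself, which is why there is no environment.execute() call above. To write the counts to a file instead, a sketch (hypothetical output path) continuing the example:
// Hypothetical output path; unlike print(), file sinks only run on execute().
wordCounts.writeAsCsv("file:///tmp/wordcounts.csv");
environment.execute("dataset-wordcount");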
The first example converts DataStreams into Tables and performs a union of two streams:
package com.bigdata.flink.table;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.java.StreamTableEnvironment;
import java.util.Arrays;
public class DataStreamTable {
public static void main(String[] args) throws Exception {
StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
StreamTableEnvironment tEnv = TableEnvironment.getTableEnvironment(env);
DataStream<Order> orderA = env.fromCollection(Arrays.asList(
new Order(1L, "beer", 21L),
new Order(1L, "Sprite", 2L),
new Order(2L, "Coke", 8L)
));
DataStream<Order> orderB = env.fromCollection(Arrays.asList(
new Order(3L, "Wong Lo Kat", 2L),
new Order(4L, "Red Bull", 3L),
new Order(4L, "Wahaha", 2L)
));
Table tableA = tEnv.fromDataStream(orderA, "userId,product,amount");
tEnv.registerDataStream("orderB", orderB, "userId,product,amount");
Table result = tEnv.sqlQuery("select * from " + tableA + " where userId=1 union all select * from orderB " +
"where amount > 2");
DataStream<Order> resultStream = tEnv.toAppendStream(result, Order.class);
resultStream.print();
env.execute("table");
}
public static class Order {
public Long userId;
public String product;
public Long amount;
public Order() {
}
public Order(Long userId, String product, Long amount) {
this.userId = userId;
this.product = product;
this.amount = amount;
}
@Override
public String toString() {
return "Order{" + "userId=" + userId + ", product='" + product + '\'' + ", amount=" + amount + '}';
}
}
}
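toAppendStream works here because the union query only ever inserts rows. A query with an aggregation updates earlier results and must be converted with toRetractStream instead. A sketch, assuming a hypothetical aggregate over the same orderB table (with org.apache.flink.types.Row and org.apache.flink.api.java.tuple.Tuple2 imported):
// Aggregations emit updates: the Boolean flag is true for an insert and false for a retraction.
Table agg = tEnv.sqlQuery("select userId, sum(amount) as amount from orderB group by userId");
DataStream<Tuple2<Boolean, Row>> retractStream = tEnv.toRetractStream(agg, Row.class);
retractStream.print();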
The second example implements word counting on a DataSet with the Table API:
package com.bigdata.flink.table;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.java.BatchTableEnvironment;
public class DataSetTable {
public static void main(String[] args) throws Exception {
ExecutionEnvironment env = ExecutionEnvironment.createCollectionsEnvironment();
BatchTableEnvironment tEnv = TableEnvironment.getTableEnvironment(env);
DataSet<WC> input = env.fromElements(
new WC("hello", 1),
new WC("word", 1),
new WC("hello", 1)
);
Table table = tEnv.fromDataSet(input);
Table filtered = table.groupBy("word").select("word, frequency.sum as frequency");
DataSet<WC> result = tEnv.toDataSet(filtered, WC.class);
result.print();
}
public static class WC {
public String word;
public long frequency;
public WC() {
}
public WC(String word, long frequency) {
this.word = word;
this.frequency = frequency;
}
@Override
public String toString() {
return "WC{" +
"word='" + word + '\'' +
", frequency=" + frequency +
'}';
}
}
}
The third example does the same with a DataSet and SQL:
package com.bigdata.flink.table;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.java.BatchTableEnvironment;
public class DataSetSqlTable {
public static void main(String[] args) throws Exception {
ExecutionEnvironment env = ExecutionEnvironment.createCollectionsEnvironment();
BatchTableEnvironment tEnv = TableEnvironment.getTableEnvironment(env);
DataSet<WC> input = env.fromElements(
new WC("hello", 1),
new WC("word", 1),
new WC("hello", 1)
);
tEnv.registerDataSet("WordCount", input);
Table filtered = tEnv.sqlQuery("select word,sum(frequency) as frequency from WordCount group by word");
DataSet<WC> result = tEnv.toDataSet(filtered, WC.class);
result.print();
}
public static class WC {
public String word;
public long frequency;
public WC() {
}
public WC(String word, long frequency) {
this.word = word;
this.frequency = frequency;
}
@Override
public String toString() {
return "WC{" +
"word='" + word + '\'' +
", frequency=" + frequency +
'}';
}
}
}
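toDataSet maps the result columns onto the WC POJO by field name. If the names did not line up, the old Java API also accepts an explicit field list when registering a DataSet, e.g.:
// Name the table's fields explicitly instead of relying on the POJO's field names.
tEnv.registerDataSet("WordCount", input, "word, frequency");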
The fourth example uses a DataSet together with a custom UDF (user-defined function):
package com.bigdata.flink.table;
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.TableEnvironment;
import org.apache.flink.table.api.java.BatchTableEnvironment;
import org.apache.flink.table.functions.ScalarFunction;
public class DataSetUdfTable {
public static void main(String[] args) throws Exception {
ExecutionEnvironment env = ExecutionEnvironment.createCollectionsEnvironment();
BatchTableEnvironment tEnv = TableEnvironment.getTableEnvironment(env);
tEnv.registerFunction("hashcode", new HashCodeUDF(10)); // register the UDF (implementation sketched below)
DataSet<WC> input = env.fromElements(
new WC("hello", 1),
new WC("word", 1),
new WC("hello", 1)
);
// exercise the UDF
tEnv.registerDataSet("wordCount", input);
Table filtered = tEnv.sqlQuery("select word, hashcode(word) hashcode from wordCount");
DataSet<UDF> result = tEnv.toDataSet(filtered, UDF.class);
result.print();
}
public static class WC {
public String word;
public long frequency;
public WC() {
}
public WC(String word, long frequency) {
this.word = word;
this.frequency = frequency;
}
@Override
public String toString() {
return "WC{" +
"word='" + word + '\'' +
", frequency=" + frequency +
'}';
}
}
public static class UDF {
public String word;
public Integer hashcode;
public UDF() {
}
public UDF(String word, Integer hashcode) {
this.word = word;
this.hashcode = hashcode;
}
@Override
public String toString() {
return "UDF{" +
"word='" + word + '\'' +
", hashcode=" + hashcode + '\'' +
'}';
}
}
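// The HashCodeUDF referenced above is not shown in the original post; this is a
// minimal sketch of a plausible implementation: a ScalarFunction whose eval()
// multiplies the input string's hashCode() by the constructor factor (10 above).
public static class HashCodeUDF extends ScalarFunction {
private final int factor;
public HashCodeUDF(int factor) {
this.factor = factor;
}
public int eval(String s) {
return s.hashCode() * factor;
}
}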
}
All of the Table API and SQL examples can be run locally to inspect the results.