Flink supports stream processing as well as batch processing. The examples below implement WordCount in both modes, in Java and in Scala, starting with a streaming version in Java that reads from a socket and aggregates over a sliding window:
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.datastream.WindowedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;
/**
 * WordCount implemented with a window operation.
 * Every 1 second, aggregate the data received within the last 2 seconds.
 */
public class WordCount {
    public static void main(String[] args) throws Exception {
        // server port number
        int port;
        try {
            ParameterTool parameterTool = ParameterTool.fromArgs(args);
            port = parameterTool.getInt("port");
        } catch (Exception e) {
            System.err.println("No port set. Using default port 9999");
            port = 9999;
        }
        String hostname = "192.168.1.102";
        // initialize the execution environment
        StreamExecutionEnvironment executionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment();
        // read the data
        DataStreamSource<String> data = executionEnvironment.socketTextStream(hostname, port);
        // split each comma-separated line into (word, 1) pairs
        SingleOutputStreamOperator<WordWithCount> pairWords = data.flatMap(new FlatMapFunction<String, WordWithCount>() {
            @Override
            public void flatMap(String s, Collector<WordWithCount> collector) throws Exception {
                String[] split = s.split(",");
                for (String word : split) {
                    collector.collect(new WordWithCount(word, 1L));
                }
            }
        });
        // group the records by key
        KeyedStream<WordWithCount, Tuple> grouped = pairWords.keyBy("word");
        // window operation; the parameters are the window size and the slide interval
        WindowedStream<WordWithCount, Tuple, TimeWindow> window = grouped.timeWindow(Time.seconds(2), Time.seconds(1));
        SingleOutputStreamOperator<WordWithCount> counts = window.sum("count");
        // print the results
        counts.print().setParallelism(10);
        executionEnvironment.execute("wordCount");
    }

    // POJO holding a word and its count; the fields must be public (or have
    // getters/setters) so that keyBy("word") and sum("count") can access them
    public static class WordWithCount {
        public String word;
        public Long count;

        public WordWithCount() {
        }

        public WordWithCount(String word, Long count) {
            this.word = word;
            this.count = count;
        }

        @Override
        public String toString() {
            return "WordWithCount{" +
                    "word='" + word + '\'' +
                    ", count=" + count +
                    '}';
        }
    }
}
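A note on the window call: timeWindow(windowSize, slideInterval) is shorthand; with the default processing-time characteristic it is equivalent to supplying an explicit sliding-window assigner. A minimal sketch of the explicit form, using the same stream as above:

import org.apache.flink.streaming.api.windowing.assigners.SlidingProcessingTimeWindows;

// Equivalent to grouped.timeWindow(Time.seconds(2), Time.seconds(1)):
// a 2-second window that slides forward every 1 second.
WindowedStream<WordWithCount, Tuple, TimeWindow> window =
        grouped.window(SlidingProcessingTimeWindows.of(Time.seconds(2), Time.seconds(1)));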
One thing to be very careful about is importing the wrong packages: the Scala version below must use the Scala StreamExecutionEnvironment and the implicit conversions from org.apache.flink.api.scala._, not their Java counterparts.
import org.apache.flink.api.java.utils.ParameterTool
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.windowing.time.Time
object WordCount_Scala {
  def main(args: Array[String]): Unit = {
    val port: Int = try {
      ParameterTool.fromArgs(args).getInt("port")
    } catch {
      case _: Exception =>
        System.err.println("No port set, using default port 9999")
        9999
    }
    // get the execution environment
    val environment = StreamExecutionEnvironment.getExecutionEnvironment
    // read the data
    val data = environment.socketTextStream("192.168.1.102", port)
    // brings in the implicit TypeInformation the Scala API needs
    import org.apache.flink.api.scala._
    val window = data.flatMap(line => line.split("\\s"))
      .map(w => WordWithCount(w, 1))
      .keyBy("word")
      .timeWindow(Time.seconds(2), Time.seconds(1))
    window.sum("count").print().setParallelism(10)
    environment.execute("WordCount")
  }

  case class WordWithCount(word: String, count: Int)
}
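If no socket server is handy for a quick local run, one option (a sketch of an assumed test setup, not part of the original program; the sample lines are made up) is to swap the socket source for a bounded in-memory source:

// Hypothetical local test: replace socketTextStream with an in-memory source.
// fromElements yields a bounded stream, so the job terminates on its own.
DataStreamSource<String> data = executionEnvironment.fromElements(
        "hello,flink",
        "hello,world");

The batch versions use the DataSet API. First the Java implementation, which reads a text file and writes the counts out as CSV: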
import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.AggregateOperator;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;
public class BatchWordCount {
    public static void main(String[] args) throws Exception {
        // initialize the environment
        ExecutionEnvironment environment = ExecutionEnvironment.getExecutionEnvironment();
        // read the data
        DataSource<String> dataSource = environment.readTextFile("/Users/xiaotongxin/IdeaProjects/FlinkPractise/src/main/resources/zxc.txt");
        // aggregate: group by the word (field 0) and sum the counts (field 1)
        AggregateOperator<Tuple2<String, Integer>> operator = dataSource.flatMap(new SplitFunction()).groupBy(0).sum(1);
        operator.writeAsCsv("/Users/xiaotongxin/IdeaProjects/FlinkPractise/src/main/resources/result").setParallelism(1);
        environment.execute("BatchWordCount");
    }

    private static class SplitFunction implements FlatMapFunction<String, Tuple2<String, Integer>> {
        @Override
        public void flatMap(String value, Collector<Tuple2<String, Integer>> out) throws Exception {
            String[] strings = value.split(" ");
            for (String word : strings) {
                if (word.length() > 0) {
                    out.collect(new Tuple2<>(word, 1));
                }
            }
        }
    }
}
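A DataSet API detail worth noting: writeAsCsv is a lazy sink, which is why the execute() call above is required, whereas print() triggers execution immediately. A minimal alternative for just inspecting the result (in which case the execute() call should be removed, since no unexecuted sink would remain):

// Eagerly runs the job and prints the result to stdout; do not call
// environment.execute() afterwards, as no new sink would be defined.
operator.print();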
The results, written to a file as CSV, are as follows:
HDFS,1
hadoop,1
HBase,1
love,1
storm,1
zxc,1
Flink,1
spark,1
The same batch job in Scala:
import org.apache.flink.api.scala.{DataSet, ExecutionEnvironment}
object BatchWordCount_Scala {
  def main(args: Array[String]): Unit = {
    // get the execution environment
    val environment: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment
    // load the data
    val data: DataSet[String] = environment
      .readTextFile("/Users/xiaotongxin/IdeaProjects/FlinkPractise/src/main/resources/zxc.txt")
    // brings in the implicit TypeInformation the Scala API needs
    import org.apache.flink.api.scala._
    // compute
    data.flatMap(_.split(" "))
      .filter(_.nonEmpty)
      .map((_, 1))
      .groupBy(0)
      .sum(1)
      .writeAsCsv("/Users/xiaotongxin/IdeaProjects/FlinkPractise/src/main/resources/result")
      .setParallelism(1)
    environment.execute("BatchWordCountScala")
  }
}
The output is as follows:
HDFS,1
hadoop,1
Flink,1
spark,1
HBase,1
love,1
storm,1
zxc,1
As these examples show, the Java implementations require noticeably more code than the Scala ones; Scala's functional style makes the same logic much more concise.
Stream processing: Streaming
Batch processing: Batch