Flink入门demo

文章目录

    • 1、流处理实现WordCount-Java
    • 2、流处理实现WordCount-Scala
    • 3、批处理实现WordCount-Java
    • 4、批处理实现WordCount-Scala
    • 5、Streaming和Batch的区别

专栏的上一篇文章已经简单介绍了Flink的一些基础知识和概念,现在我们来使用Flink实现一个大数据入门普遍使用的案例——WordCount。

Flink支持流处理同时也支持批处理。

1、流处理实现WordCount-Java

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.tuple.Tuple;
import org.apache.flink.api.java.utils.ParameterTool;
import org.apache.flink.streaming.api.datastream.DataStreamSource;
import org.apache.flink.streaming.api.datastream.KeyedStream;
import org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator;
import org.apache.flink.streaming.api.datastream.WindowedStream;
import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;
import org.apache.flink.streaming.api.windowing.time.Time;
import org.apache.flink.streaming.api.windowing.windows.TimeWindow;
import org.apache.flink.util.Collector;

/**
 * 使用窗口操作实现WordCount
 * 每隔1秒对最近2秒内的数据进行聚合操作
 */
/**
 * Streaming WordCount using a sliding window:
 * every 1 second, aggregate the data received during the last 2 seconds.
 */
public class WordCount {

    public static void main(String[] args) throws Exception {

        // Port of the socket text source; configurable via --port, falls back to 8888.
        int port;

        try {
            ParameterTool parameterTool = ParameterTool.fromArgs(args);
            port = parameterTool.getInt("port");
        } catch (Exception e) {
            // Fixed: the message used to say "9999" while the code actually falls back to 8888.
            System.err.println("No port set. Please use default port 8888");
            port = 8888;
        }

        String hostname = "192.168.1.102";

        // Set up the streaming execution environment.
        StreamExecutionEnvironment executionEnvironment = StreamExecutionEnvironment.getExecutionEnvironment();

        // Read text lines from the socket source.
        DataStreamSource<String> data = executionEnvironment.socketTextStream(hostname, port);

        // Split each comma-separated line into (word, 1) records.
        SingleOutputStreamOperator<WordWithCount> pairWords = data.flatMap(new FlatMapFunction<String, WordWithCount>() {

            @Override
            public void flatMap(String s, Collector<WordWithCount> collector) throws Exception {

                String[] split = s.split(",");
                for (String word : split) {

                    collector.collect(new WordWithCount(word, 1L));
                }
            }
        });

        // Group the records by the "word" field.
        KeyedStream<WordWithCount, Tuple> grouped = pairWords.keyBy("word");

        // Sliding window: window size 2 seconds, slide interval 1 second.
        WindowedStream<WordWithCount, Tuple, TimeWindow> window = grouped.timeWindow(Time.seconds(2), Time.seconds(1));
        SingleOutputStreamOperator<WordWithCount> counts = window.sum("count");

        // Print the windowed counts; the print sink runs with parallelism 10.
        counts.print().setParallelism(10);
        executionEnvironment.execute("wordCount");
    }

    /**
     * POJO carrying a word and its running count.
     * Flink's POJO serialization requires the public no-arg constructor
     * and accessible fields.
     */
    public static class WordWithCount {

        String word;
        Long count;

        public WordWithCount() {

        }

        public WordWithCount(String word, Long count) {

            this.word = word;
            this.count = count;
        }

        @Override
        public String toString() {

            // Fixed typo: was "WordWithCOunt".
            return "WordWithCount{" +
                    "word='" + word + '\'' +
                    ", count=" + count +
                    '}';
        }
    }
}

需要我们注意的是千万不要导错包:流处理相关的类都位于 org.apache.flink.streaming 包下。

2、流处理实现WordCount-Scala

import org.apache.flink.api.java.utils.ParameterTool
import org.apache.flink.streaming.api.scala.StreamExecutionEnvironment
import org.apache.flink.streaming.api.windowing.time.Time

object WordCount_Scala {

  def main(args: Array[String]): Unit = {
    import scala.util.control.NonFatal

    // Read --port from the arguments; fall back to 8888 when absent or invalid.
    // NonFatal keeps fatal errors (OOM, interrupts) propagating — the original
    // bare `case e =>` silently swallowed every Throwable.
    val port: Int = try {
      ParameterTool.fromArgs(args).getInt("port")
    } catch {
      case NonFatal(e) =>
        // Fixed: the message used to say "9999" while the fallback is 8888.
        System.err.println("No port set, use default port 8888")
        8888
    }

    // Streaming execution environment.
    val environment = StreamExecutionEnvironment.getExecutionEnvironment

    // Read text lines from the socket source.
    val data = environment.socketTextStream("192.168.1.102", port)

    import org.apache.flink.api.scala._

    // Split on whitespace, map to (word, 1), key by word, and apply
    // a sliding window of 2 seconds with a 1-second slide.
    val window = data.flatMap(line => line.split("\\s"))
      .map(w => WordWithCount(w, 1))
      .keyBy("word")
      .timeWindow(Time.seconds(2), Time.seconds(1))

    // Sum the counts per window and print with sink parallelism 10.
    window.sum("count").print().setParallelism(10)

    environment.execute("WordCount")

  }

  /** Word/count pair used as the keyed record type. */
  case class WordWithCount(word: String, count: Int)
}

运行结果如下:
Flink入门demo_第1张图片

3、批处理实现WordCount-Java

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.AggregateOperator;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.tuple.Tuple2;
import org.apache.flink.util.Collector;

/**
 * Batch WordCount: reads a text file, counts space-separated words,
 * and writes the (word, count) pairs out as a single CSV file.
 */
public class BatchWordCount {

    public static void main(String[] args) throws Exception {

        // Batch execution environment.
        ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

        // Read the input text file line by line.
        DataSource<String> lines = env.readTextFile("/Users/xiaotongxin/IdeaProjects/FlinkPractise/src/main/resources/zxc.txt");

        // Tokenize -> group by word (tuple field 0) -> sum the counts (field 1).
        AggregateOperator<Tuple2<String, Integer>> wordCounts =
                lines.flatMap(new SplitFunction())
                     .groupBy(0)
                     .sum(1);

        // Parallelism 1 so the result lands in a single output file.
        wordCounts.writeAsCsv("/Users/xiaotongxin/IdeaProjects/FlinkPractise/src/main/resources/result").setParallelism(1);

        env.execute("BatchWordCount");
    }

    /** Splits a line on single spaces and emits a (word, 1) tuple per non-empty token. */
    private static class SplitFunction implements FlatMapFunction<String, Tuple2<String, Integer>> {

        @Override
        public void flatMap(String value, Collector<Tuple2<String, Integer>> out) throws Exception {

            for (String token : value.split(" ")) {

                // Skip empty tokens produced by consecutive spaces.
                if (!token.isEmpty()) {

                    out.collect(new Tuple2<>(token, 1));
                }
            }
        }
    }
}

将运行的结果以csv写到一个文件中结果如下:

HDFS,1
hadoop,1
HBase,1
love,1
storm,1
zxc,1
Flink,1
spark,1

4、批处理实现WordCount-Scala

import org.apache.flink.api.scala.{
     DataSet, ExecutionEnvironment}

object BatchWordCount_Scala {

  def main(args: Array[String]): Unit = {

    // Batch execution environment.
    val env: ExecutionEnvironment = ExecutionEnvironment.getExecutionEnvironment

    // Read the input text file line by line.
    val lines: DataSet[String] = env
      .readTextFile("/Users/xiaotongxin/IdeaProjects/FlinkPractise/src/main/resources/zxc.txt")

    import org.apache.flink.api.scala._

    // Tokenize -> drop empty tokens -> (word, 1) -> group by word -> sum counts.
    val wordCounts = lines
      .flatMap(_.split(" "))
      .filter(_.nonEmpty)
      .map(word => (word, 1))
      .groupBy(0)
      .sum(1)

    // Parallelism 1 so the result lands in a single CSV file.
    wordCounts
      .writeAsCsv("/Users/xiaotongxin/IdeaProjects/FlinkPractise/src/main/resources/result")
      .setParallelism(1)

    env.execute("BatchWordCountScala")
  }
}

运行结果如下:

HDFS,1
hadoop,1
Flink,1
spark,1
HBase,1
love,1
storm,1
zxc,1

从上面的例子中可以看出,Java实现的代码量比Scala要多;Scala作为函数式编程语言,实现起来比Java简洁很多。

5、Streaming和Batch的区别

流式处理:Streaming

  • 初始化对象:StreamExecutionEnvironment
  • 返回类型:DataStream

批处理:Batch

  • 初始化对象:ExecutionEnvironment
  • 返回类型:DataSet

你可能感兴趣的:(Flink,Flink)