sparkstreaming API 操作实例 java

/**
 * Entry point: builds a local streaming context over a socket text source
 * and runs one of the demo transformations below until the job is stopped.
 */
public static void main(String[] args) throws InterruptedException {
        SparkSession session = SparkSession.builder().appName("test streaming").master("local[2]").getOrCreate();
        JavaSparkContext sparkContext = new JavaSparkContext(session.sparkContext());
        // Micro-batch interval: one batch every 5 seconds.
        JavaStreamingContext streamingContext = new JavaStreamingContext(sparkContext, Durations.seconds(5));
        streamingContext.sparkContext().setLogLevel("WARN");
        // Checkpointing is required by the stateful demos (updateStateByKey / windowing).
        streamingContext.checkpoint("hdfs://hdp01:8020/user/zhangruichao/streaming/checkpoint");
        JavaReceiverInputDStream<String> lines = streamingContext.socketTextStream("localhost", 9999);
        // Enable exactly one of the demos below:
//        wordCountByFlatMap(lines);
//        countByValue(lines);
//        updateStateByKey(lines);
//        reduceByKey(lines);
//        transform(lines, streamingContext);
        reduceByKeyAndWindow(lines);
        streamingContext.start();
        streamingContext.awaitTermination();
        streamingContext.close();
    }
    /**
     * Windowed word count: window length and slide interval demo.
     * A window length smaller than the slide interval means some data is never counted;
     * both parameters must be integer multiples of the batch duration.
     */
    private static void reduceByKeyAndWindow(JavaReceiverInputDStream<String> stream) {
        JavaPairDStream<String, Integer> pairs = streamToPair(stream);
        // 10-second window sliding every 10 seconds, i.e. a tumbling window.
        JavaPairDStream<String, Integer> windowedCounts = pairs.reduceByKeyAndWindow(
                (Integer left, Integer right) -> left + right,
                Durations.seconds(10), Durations.seconds(10));
        windowedCounts.print();
    }
    /**
     * Converts one DStream into another; typically used to filter the source
     * stream against a static lookup RDD via a per-batch join.
     */
    private static void transform(JavaReceiverInputDStream<String> stream, JavaStreamingContext jssc) {
        JavaPairDStream<String, Integer> pairs = streamToPair(stream);
        List<Tuple2<String, Boolean>> whitelist
                = Arrays.asList(new Tuple2<>("java", true), new Tuple2<>("python", true), new Tuple2<>("spark", false));
        JavaPairRDD<String, Boolean> whitelistRdd = jssc.sparkContext().parallelizePairs(whitelist);
        JavaDStream<String> surviving = pairs.transform(batchRdd -> {
            // Left outer join on the word so every batch record survives the join.
            JavaPairRDD<String, Tuple2<Integer, Optional<Boolean>>> joined = batchRdd.leftOuterJoin(whitelistRdd);
            // Drop records whose flag is present and true; keep everything else.
            JavaPairRDD<String, Tuple2<Integer, Optional<Boolean>>> kept =
                    joined.filter(record -> !(record._2._2.isPresent() && record._2._2.get()));
            return kept.keys();
        });
        surviving.print();
    }
    /**
     * Splits each input line on single spaces and emits a (word, 1) pair per token.
     * Shared tokenization step for the word-count style demos.
     *
     * @param stream raw lines from the socket source
     * @return pair stream of (word, 1)
     */
    private static JavaPairDStream<String, Integer> streamToPair(JavaReceiverInputDStream<String> stream) {
        // flatMapToPair replaces the original flatMap + identity mapToPair,
        // which rebuilt every tuple a second time without changing it.
        return stream.flatMapToPair(line -> {
            List<Tuple2<String, Integer>> pairs = new ArrayList<>();
            for (String word : line.split(" ")) {
                pairs.add(new Tuple2<>(word, 1));
            }
            return pairs.iterator();
        });
    }
    /**
     * Reduces all lines received in one batch into a single comma-joined record.
     * (Despite the method name, this uses DStream.reduce, not reduceByKey.)
     */
    private static void reduceByKey(JavaReceiverInputDStream<String> stream) {
        JavaDStream<String> joined = stream.reduce((left, right) -> left.concat(",").concat(right));
        joined.print();
    }

    /**
     * Accumulates counts per key across the entire stream history.
     * Requires a checkpoint directory to be configured on the context.
     */
    private static void updateStateByKey(JavaReceiverInputDStream<String> stream) {
        JavaPairDStream<String, Integer> pairs = streamToPair(stream);
        JavaPairDStream<String, Integer> runningCounts = pairs.updateStateByKey(
                (List<Integer> batchValues, Optional<Integer> state) -> {
                    // Start from the previous total (0 the first time the key is seen)
                    // and add this batch's contributions.
                    int total = state.isPresent() ? state.get() : 0;
                    for (Integer value : batchValues) {
                        total += value;
                    }
                    return Optional.of(total);
                });
        runningCounts.print();
    }

    /**
     * Counts how many times each distinct line occurs within a batch interval.
     *
     * @param stream raw lines from the socket source
     */
    private static void countByValue(JavaReceiverInputDStream<String> stream) {
        stream.countByValue().print();
    }

    /**
     * Classic word count over each batch interval.
     *
     * @param stream raw lines from the socket source
     */
    private static void wordCountByFlatMap(JavaReceiverInputDStream<String> stream) {
        JavaPairDStream<String, Integer> counts = streamToPair(stream).reduceByKey((a, b) -> a + b);
        counts.print();
    }

你可能感兴趣的:(sparkstreaming API 操作实例 java)