使用滑动窗口进行实时的热词统计


开发环境:

  • kafka 1.1.1
  • spark 2.3

Java代码:

package cn.spark.streaming;

import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Set;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaPairInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;

import kafka.serializer.StringDecoder;
import scala.Tuple2;

/**
 * 
 * Hot Word Count
 * 
 */
public class WindowWordCount {

    public static void main(String[] args) throws Exception{
        
        SparkConf conf = new SparkConf().setAppName("WindowWordCount");
        
        // create context
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(10));
    
        // kafka properties map
        Map KafkaParams = new HashMap();
        KafkaParams.put("bootstrap.servers", "hserver-1:9092,hserver-2:9092,hserver-3:9092");
        KafkaParams.put("group.id", "WindowWordCount");
        KafkaParams.put("auto.offset.reset", "smallest");
        
        // kafka topic set
        Set topics = new HashSet();
        topics.add(args[0]);
        
        // access data DStream
        JavaPairInputDStream SearchLogDStream = 
                KafkaUtils.createDirectStream(
                        jssc, 
                        String.class, 
                        String.class, 
                        StringDecoder.class, 
                        StringDecoder.class, 
                        KafkaParams, 
                        topics
                        );
        
        // flatMap
        JavaDStream WordDSteram = 
                SearchLogDStream.flatMap(
                        
                        new FlatMapFunction, String>() {

                            private static final long serialVersionUID = 4034522628037914742L;

                            @Override
                            public Iterator call(Tuple2 tuple) throws Exception {

                                return Arrays.asList(tuple._2.split(" ")).iterator();
                            }
                        });
        
        // mapToPair
        JavaPairDStream WordPairDStream = 
                WordDSteram.mapToPair(
                        
                        new PairFunction() {

                            private static final long serialVersionUID = 2101884706537316002L;

                            @Override
                            public Tuple2 call(String word) throws Exception {

                                return new Tuple2(word, 1);
                            }
                        });
        
        // reduceByKeyAndWindow
        JavaPairDStream WindowWordDStream = 
                WordPairDStream.reduceByKeyAndWindow(
                        new Function2() {
                    
                            private static final long serialVersionUID = -358144101893232390L;

                            @Override
                            public Integer call(Integer v1, Integer v2) throws Exception {

                                return v1 + v2;
                            }
                        }, 
                        Durations.seconds(10), 
                        Durations.seconds(60)
                        );
        
        // sort
        JavaPairDStream ResultSortDStream = 
                WindowWordDStream.transformToPair(
                        
                    new Function, JavaPairRDD>() {

                        private static final long serialVersionUID = 1441798634812792342L;

                        @Override
                        public JavaPairRDD call(JavaPairRDD unSortRDD)
                                throws Exception {
                            
                            JavaPairRDD sortRDD = unSortRDD
                            .mapToPair(
                                    
                                new PairFunction, Integer, String>() {

                                    private static final long serialVersionUID = -3715362497048144520L;

                                    @Override
                                    public Tuple2 call(Tuple2 tuple) throws Exception {

                                        return new Tuple2(tuple._2, tuple._1);
                                    }
                                })
                            .sortByKey(false)
                            .mapToPair(
                                    
                                new PairFunction, String, Integer>() {

                                    private static final long serialVersionUID = 7017380215451671038L;

                                    @Override
                                    public Tuple2 call(Tuple2 tuple)
                                            throws Exception {

                                        return new Tuple2(tuple._2, tuple._1);
                                    }
                                });

                            return sortRDD;
                        }
                    });
        
        // print result
        ResultSortDStream.print();
        
        jssc.start();
        
        jssc.awaitTermination();
        
        jssc.close();
        
    }
}

你可能感兴趣的:(使用滑动窗口进行实时的热词统计)