Java: Spark Streaming reading data from Kafka, with cumulative word counts

Most articles you find online never achieve true cumulative word counting; after working through the problem, I finally got a working implementation.

A simple Kafka producer

package com.zwj.utils;

import kafka.javaapi.producer.Producer;
import kafka.producer.KeyedMessage;
import kafka.producer.ProducerConfig;

import java.util.ArrayList;
import java.util.List;
import java.util.Properties;

public class Product {
    public static void KafkaProduct(String message) {
        String topics = "TuiJian3";
        Properties properties = new Properties();
        // Encode message values as plain strings.
        properties.put("serializer.class", "kafka.serializer.StringEncoder");
        properties.put("metadata.broker.list", "192.168.42.132:9092,192.168.42.134:9092,192.168.42.135:9092");
        // Consider a write successful once the partition leader has acknowledged it.
        properties.put("request.required.acks", "1");
        ProducerConfig config = new ProducerConfig(properties);
        Producer<String, String> producer = new Producer<String, String>(config);
        List<KeyedMessage<String, String>> messageList = new ArrayList<>();
        KeyedMessage<String, String> message1 = new KeyedMessage<String, String>(topics, message);
        messageList.add(message1);
        producer.send(messageList);
    }

    public static void main(String[] args) {
        KafkaProduct("hello world");
    }
}
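If you want the streaming job to receive more than a single message, KafkaProduct can simply be called in a loop. A minimal sketch of such a test driver; the sample sentences and the one-second pacing are made up for illustration:

    // Hypothetical test driver: sends a few sentences, one per second,
    // so that several micro-batches receive data.
    public static void main(String[] args) throws InterruptedException {
        String[] samples = {"hello world", "hello spark", "hello kafka"};
        for (String s : samples) {
            KafkaProduct(s);
            Thread.sleep(1000); // spread the messages across batch intervals
        }
    }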


Spark Streaming reading data from Kafka

Unlike the usual per-batch examples, reduceByKey alone is not enough here: to keep a running total of each word across batches you have to use updateStateByKey.
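For contrast, the usual per-batch version looks like the sketch below; `pairs` is a hypothetical stand-in for the mapToPair output in the full program that follows, and its counts start from zero in every batch:

// Per-batch counting only: each 5-second batch is reduced independently,
// so no running total survives from one batch to the next.
JavaPairDStream<String, Integer> perBatchCounts = pairs.reduceByKey(
        new Function2<Integer, Integer, Integer>() {
            @Override
            public Integer call(Integer a, Integer b) throws Exception {
                return a + b;
            }
        });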

package com.zwj.utils;

import com.google.common.base.Optional;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.storage.StorageLevel;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaPairReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;
import scala.Tuple2;

import java.util.Arrays;
import java.util.HashMap;
import java.util.List;


public class SparkStreamingRead {
    public static void SparkStreamingReadProduct() {
        SparkConf conf = new SparkConf().setAppName("ReadKafka").setMaster("local[2]");
        JavaSparkContext sc = new JavaSparkContext(conf);
        JavaStreamingContext ssc = new JavaStreamingContext(sc, Durations.seconds(5));
        String zkQuorum = "hadoop01:2181,hadoop02:2181,hadoop03:2181";
        String group = "zwj-consumer-group";
        String topics = "TuiJian3";
        HashMap<String, Integer> map = new HashMap<>();
        // Number of receiver threads to use for the topic.
        map.put(topics, 2);
        // updateStateByKey needs a checkpoint directory to persist the running state.
        ssc.checkpoint("hdfs://hadoop02:9000/ck-2018-24-004");
        JavaPairReceiverInputDStream<String, String> data = KafkaUtils.createStream(ssc, zkQuorum, group, map, StorageLevel.MEMORY_AND_DISK_SER());
        // Keep only the message body of each (key, message) pair.
        JavaDStream<String> lines = data.map(new Function<Tuple2<String, String>, String>() {
            @Override
            public String call(Tuple2<String, String> tuple2) {
                return tuple2._2();
            }
        });
        JavaDStream<String> words = lines.flatMap(new FlatMapFunction<String, String>() {
            @Override
            public Iterable<String> call(String lines) {
                return Arrays.asList(lines.split(" "));
            }
        });
        JavaPairDStream<String, Integer> wordCounts = words.mapToPair(new PairFunction<String, String, Integer>() {
            @Override
            public Tuple2<String, Integer> call(String s) throws Exception {
                return new Tuple2<>(s, 1);
            }
        }).updateStateByKey(new Function2<List<Integer>, Optional<Integer>, Optional<Integer>>() {
            @Override
            public Optional<Integer> call(List<Integer> v1, Optional<Integer> v2) throws Exception {
                // v1 holds this batch's new counts for the key;
                // v2 holds the running total from earlier batches (absent for new keys).
                Integer sum = v2.or(0);
                for (Integer temp : v1) {
                    sum += temp;
                }
                return Optional.of(sum);
            }
        });
        wordCounts.print();
        ssc.start();
        ssc.awaitTermination();
        ssc.close();

    }

    public static void main(String[] args) {
        SparkStreamingReadProduct();
    }
}
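The accumulation logic itself can be sanity-checked outside Spark. Below is a minimal, self-contained sketch using the same Guava Optional that Spark 1.x exposes; the class name UpdateFunctionDemo and the sample batches are made up for illustration:

import com.google.common.base.Optional;

import java.util.Arrays;
import java.util.List;

public class UpdateFunctionDemo {
    // Same logic as the updateStateByKey function above:
    // add this batch's counts to the previous running total.
    static Optional<Integer> update(List<Integer> newValues, Optional<Integer> state) {
        Integer sum = state.or(0);
        for (Integer v : newValues) {
            sum += v;
        }
        return Optional.of(sum);
    }

    public static void main(String[] args) {
        Optional<Integer> state = Optional.absent();
        // Batch 1: the word appears twice -> running total 2.
        state = update(Arrays.asList(1, 1), state);
        System.out.println(state.get()); // prints 2
        // Batch 2: the word appears once more -> running total 3.
        state = update(Arrays.asList(1), state);
        System.out.println(state.get()); // prints 3
    }
}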
