Task had a not serializable result: org.apache.kafka.clients.consumer.ConsumerRecord

Spark Streaming integration with Kafka throws this error.
Spark version: 2.4.0, Kafka version: 0.10.2.2

First, look at the code that simulates the access logs:

import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Properties;
import java.util.Random;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;

public class AccessProducer extends Thread {

    private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
    private static Random random = new Random();
    private static String[] sections = new String[] {"country", "international", "sport", "entertainment", "movie", "carton", "tv-show", "technology", "internet", "car"};
    private static int[] arr = new int[]{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
    private static String date;

    private Producer<Integer, String> producer;
    private String topic;

    public AccessProducer(String topic) {
        this.topic = topic;
        producer = new KafkaProducer<>(createProducerProperties());
        date = sdf.format(new Date());
    }

    private Properties createProducerProperties() {
        Properties props = new Properties();
        props.put("key.serializer", "org.apache.kafka.common.serialization.IntegerSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("bootstrap.servers", "192.168.114.200:9092,192.168.114.201:9092,192.168.114.202:9092");
        return props;
    }

    @Override
    public void run() {
        int counter = 0;

        while(true) {
            for(int i = 0; i < 100; i++) {
                String log = null;

                if(arr[random.nextInt(10)] == 1) {
                    log = getRegisterLog();
                } else {
                    log = getAccessLog();
                }

                producer.send(new ProducerRecord<Integer, String>(topic, i, log));

                counter++;
                if(counter == 100) {
                    counter = 0;
                    try {
                        Thread.sleep(1000);
                    } catch (InterruptedException e) {
                        e.printStackTrace();
                    }
                }
            }
        }
    }

    private static String getAccessLog() {
        StringBuilder buffer = new StringBuilder();

        // Generate the timestamp
        long timestamp = System.currentTimeMillis();

        // Generate a random userid (assume 1000 registered users; 1/10 of daily visitors are unregistered)
        Long userid = 0L;

        int newOldUser = arr[random.nextInt(10)];
        if(newOldUser == 1) {
            userid = null;
        } else {
            userid = (long) random.nextInt(1000);
        }

        // Generate a random pageid (1,000 pages in total)
        Long pageid = (long) random.nextInt(1000);

        // Pick a random section (10 sections in total)
        String section = sections[random.nextInt(10)];

        // Fixed action: view
        String action = "view";

        return buffer.append(date).append(" ")
                .append(timestamp).append(" ")
                .append(userid).append(" ")
                .append(pageid).append(" ")
                .append(section).append(" ")
                .append(action).toString();
    }

    private static String getRegisterLog() {
        StringBuilder buffer = new StringBuilder();

        // Generate the timestamp
        long timestamp = System.currentTimeMillis();

        // New users always have a null userid
        Long userid = null;

        // pageid is always null for a register event
        Long pageid = null;

        // section is always null for a register event
        String section = null;

        // Fixed action: register
        String action = "register";

        return buffer.append(date).append(" ")
                .append(timestamp).append(" ")
                .append(userid).append(" ")
                .append(pageid).append(" ")
                .append(section).append(" ")
                .append(action).toString();
    }

    public static void main(String[] args) {
        AccessProducer producer = new AccessProducer("news");
        producer.start();
    }
}

Create the Kafka topic:

kafka-topics.sh --zookeeper 192.168.114.200:2181,192.168.114.201:2181,192.168.114.202:2181 --topic news --replication-factor 2 --partitions 1 --create
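
Optionally, verify that the topic was created, using the same ZooKeeper quorum as above:

kafka-topics.sh --zookeeper 192.168.114.200:2181,192.168.114.201:2181,192.168.114.202:2181 --topic news --describe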

The Spark Streaming job that consumes from Kafka:

import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.IntegerDeserializer;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;

public class NewsRealtimeStatSpark {
    public static void main(String[] args) {
        // Create the streaming context with 5-second batches
        SparkConf conf = new SparkConf()
                .setMaster("local[2]")
                .setAppName("NewsRealtimeStatSpark");
        JavaStreamingContext jssc = new JavaStreamingContext(
                conf, Durations.seconds(5));

        // Create the Kafka input DStream
        Map<String, Object> kafkaParams = new HashMap<String, Object>();
        kafkaParams.put("bootstrap.servers", "192.168.114.200:9092,192.168.114.201:9092,192.168.114.202:9092");
        kafkaParams.put("key.deserializer", IntegerDeserializer.class);
        kafkaParams.put("value.deserializer", StringDeserializer.class);
        kafkaParams.put("group.id", "111");
        kafkaParams.put("auto.offset.reset", "latest");
        kafkaParams.put("enable.auto.commit", false);
        Collection<String> topics = Arrays.asList("news");
        JavaInputDStream<ConsumerRecord<Integer, String>> lines = KafkaUtils.createDirectStream(jssc,
                LocationStrategies.PreferConsistent(),
                ConsumerStrategies.Subscribe(topics, kafkaParams));

        lines.print();

        jssc.start();
        try {
            jssc.awaitTermination();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        jssc.close();

    }
}

The error output is as follows:

19/01/26 15:48:45 INFO AppInfoParser: Kafka version : 0.10.2.2
19/01/26 15:48:45 INFO AppInfoParser: Kafka commitId : cd80bc412b9b9701
19/01/26 15:48:45 INFO InternalKafkaConsumer: Initial fetch for spark-executor-111 news-0 115900
19/01/26 15:48:45 ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
java.io.NotSerializableException: org.apache.kafka.clients.consumer.ConsumerRecord
Serialization stack:
	- object not serializable (class: org.apache.kafka.clients.consumer.ConsumerRecord, value: ConsumerRecord(topic = news, partition = 0, offset = 115900, CreateTime = 1548486965892, checksum = 3320474937, serialized key size = -1, serialized value size = 51, key = null, value = 2019-01-26 1548486965891 911 550 entertainment view))
	- element of array (index: 0)
	- array (class [Lorg.apache.kafka.clients.consumer.ConsumerRecord;, size 11)
	at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
	at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
	at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:450)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
19/01/26 15:48:45 ERROR TaskSetManager: Task 0.0 in stage 0.0 (TID 0) had a not serializable result: org.apache.kafka.clients.consumer.ConsumerRecord
Serialization stack:
	- object not serializable (class: org.apache.kafka.clients.consumer.ConsumerRecord, value: ConsumerRecord(topic = news, partition = 0, offset = 115900, CreateTime = 1548486965892, checksum = 3320474937, serialized key size = -1, serialized value size = 51, key = null, value = 2019-01-26 1548486965891 911 550 entertainment view))
	- element of array (index: 0)
	- array (class [Lorg.apache.kafka.clients.consumer.ConsumerRecord;, size 11); not retrying
19/01/26 15:48:45 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool 
19/01/26 15:48:45 INFO TaskSchedulerImpl: Cancelling stage 0
19/01/26 15:48:45 INFO TaskSchedulerImpl: Killing all running tasks in stage 0: Stage cancelled
19/01/26 15:48:45 INFO DAGScheduler: ResultStage 0 (print at NewsRealtimeStatSpark.java:42) failed in 0.532 s due to Job aborted due to stage failure: Task 0.0 in stage 0.0 (TID 0) had a not serializable result: org.apache.kafka.clients.consumer.ConsumerRecord
Serialization stack:
	- object not serializable (class: org.apache.kafka.clients.consumer.ConsumerRecord, value: ConsumerRecord(topic = news, partition = 0, offset = 115900, CreateTime = 1548486965892, checksum = 3320474937, serialized key size = -1, serialized value size = 51, key = null, value = 2019-01-26 1548486965891 911 550 entertainment view))
	- element of array (index: 0)
	- array (class [Lorg.apache.kafka.clients.consumer.ConsumerRecord;, size 11)
19/01/26 15:48:45 INFO DAGScheduler: Job 0 failed: print at NewsRealtimeStatSpark.java:42, took 0.599568 s
19/01/26 15:48:45 INFO JobScheduler: Finished job streaming job 1548488925000 ms.0 from job set of time 1548488925000 ms
19/01/26 15:48:45 ERROR JobScheduler: Error running job streaming job 1548488925000 ms.0
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0.0 in stage 0.0 (TID 0) had a not serializable result: org.apache.kafka.clients.consumer.ConsumerRecord
Serialization stack:
	- object not serializable (class: org.apache.kafka.clients.consumer.ConsumerRecord, value: ConsumerRecord(topic = news, partition = 0, offset = 115900, CreateTime = 1548486965892, checksum = 3320474937, serialized key size = -1, serialized value size = 51, key = null, value = 2019-01-26 1548486965891 911 550 entertainment view))
	- element of array (index: 0)
	- array (class [Lorg.apache.kafka.clients.consumer.ConsumerRecord;, size 11)
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1887)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1875)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1874)
	at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1874)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
	at scala.Option.foreach(Option.scala:257)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2108)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2057)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2046)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
	at org.apache.spark.streaming.kafka010.KafkaRDD.take(KafkaRDD.scala:139)
	at org.apache.spark.streaming.kafka010.KafkaRDD.take(KafkaRDD.scala:48)
	at org.apache.spark.streaming.dstream.DStream$$anonfun$print$2$$anonfun$foreachFunc$3$1.apply(DStream.scala:735)
	at org.apache.spark.streaming.dstream.DStream$$anonfun$print$2$$anonfun$foreachFunc$3$1.apply(DStream.scala:734)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:51)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
	at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:416)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:50)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
	at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
	at scala.util.Try$.apply(Try.scala:192)
	at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
	at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:257)
	at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
	at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
	at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
	at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:256)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
	at java.lang.Thread.run(Thread.java:748)
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0.0 in stage 0.0 (TID 0) had a not serializable result: org.apache.kafka.clients.consumer.ConsumerRecord
19/01/26 15:48:45 INFO StreamingContext: Invoking stop(stopGracefully=false) from shutdown hook

Solution

Why this happens: lines.print() internally calls take() on the KafkaRDD (see KafkaRDD.take in the stack trace), which ships the first few ConsumerRecord objects from the executors back to the driver as a task result. Task results go through the default Java serializer, and ConsumerRecord does not implement java.io.Serializable, hence the NotSerializableException. Kryo does not require classes to implement Serializable, so switching the serializer resolves the error.

Set the following property when creating the Spark context:

set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

// Create the Spark context
SparkConf conf = new SparkConf()
        .setMaster("local[2]")
        .setAppName("NewsRealtimeStatSpark")
        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
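
Alternatively, you can sidestep the problem entirely by never shipping ConsumerRecord objects to the driver: map each record to its String value before calling print(). A minimal sketch against the lines stream defined above (it needs one extra import, org.apache.spark.streaming.api.java.JavaDStream, and works with the default Java serializer, since only plain strings cross the wire):

// Extract only the serializable String value from each record,
// so print()/take() never needs to serialize ConsumerRecord itself.
JavaDStream<String> values = lines.map(record -> record.value());
values.print();

If you stay with Kryo instead, registering the class via conf.registerKryoClasses(new Class[]{ConsumerRecord.class}) is an optional extra that makes its serialized form more compact.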
