Spark Streaming integration with Kafka throws the error below.
Spark version: 2.4.0, Kafka version: 0.10.2.2
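The producer: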
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Properties;
import java.util.Random;

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.Producer;
import org.apache.kafka.clients.producer.ProducerRecord;

public class AccessProducer extends Thread {

    private static SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd");
    private static Random random = new Random();
    private static String[] sections = new String[] {"country", "international", "sport", "entertainment", "movie", "carton", "tv-show", "technology", "internet", "car"};
    private static int[] arr = new int[]{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
    private static String date;

    private Producer<Integer, String> producer;
    private String topic;

    public AccessProducer(String topic) {
        this.topic = topic;
        // producer = new Producer(createProducerConfig());
        producer = new KafkaProducer<>(createProducerProperties());
        date = sdf.format(new Date());
    }

    private Properties createProducerProperties() {
        Properties props = new Properties();
        props.put("key.serializer", "org.apache.kafka.common.serialization.IntegerSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("bootstrap.servers", "192.168.114.200:9092,192.168.114.201:9092,192.168.114.202:9092");
        return props;
    }

    @Override
    public void run() {
        int counter = 0;
        while (true) {
            for (int i = 0; i < 100; i++) {
                String log = null;
                if (arr[random.nextInt(10)] == 1) {
                    log = getRegisterLog();
                } else {
                    log = getAccessLog();
                }
                producer.send(new ProducerRecord<Integer, String>(topic, i, log));
                counter++;
                if (counter == 100) {
                    counter = 0;
                    try {
                        Thread.sleep(1000);
                    } catch (InterruptedException e) {
                        e.printStackTrace();
                    }
                }
            }
        }
    }

    private static String getAccessLog() {
        StringBuffer buffer = new StringBuffer("");
        // Generate a timestamp
        long timestamp = System.currentTimeMillis();
        // Generate a random userid (1,000 registered users by default; 1/10 of daily visitors are unregistered)
        Long userid = 0L;
        int newOldUser = arr[random.nextInt(10)];
        if (newOldUser == 1) {
            userid = null;
        } else {
            userid = (long) random.nextInt(1000);
        }
        // Generate a random pageid (1,000 pages in total)
        Long pageid = (long) random.nextInt(1000);
        // Generate a random section (10 sections in total)
        String section = sections[random.nextInt(10)];
        // Fixed action: view
        String action = "view";
        return buffer.append(date).append(" ")
                .append(timestamp).append(" ")
                .append(userid).append(" ")
                .append(pageid).append(" ")
                .append(section).append(" ")
                .append(action).toString();
    }

    private static String getRegisterLog() {
        StringBuffer buffer = new StringBuffer("");
        // Generate a timestamp
        long timestamp = System.currentTimeMillis();
        // New users always have a null userid
        Long userid = null;
        // pageid is always null
        Long pageid = null;
        // section is always null
        String section = null;
        // Fixed action: register
        String action = "register";
        return buffer.append(date).append(" ")
                .append(timestamp).append(" ")
                .append(userid).append(" ")
                .append(pageid).append(" ")
                .append(section).append(" ")
                .append(action).toString();
    }

    public static void main(String[] args) {
        AccessProducer producer = new AccessProducer("news");
        producer.start();
    }
}
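The topic was created with: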
kafka-topics.sh --zookeeper 192.168.114.200:2181,192.168.114.201:2181,192.168.114.202:2181 --topic news --replication-factor 2 --partitions 1 --create
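The consumer: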
import java.util.Arrays;
import java.util.Collection;
import java.util.HashMap;
import java.util.Map;

import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.IntegerDeserializer;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;

public class NewsRealtimeStatSpark {

    public static void main(String[] args) {
        // Create the Spark streaming context with a 5-second batch interval
        SparkConf conf = new SparkConf()
                .setMaster("local[2]")
                .setAppName("NewsRealtimeStatSpark");
        JavaStreamingContext jssc = new JavaStreamingContext(
                conf, Durations.seconds(5));

        // Create the Kafka input DStream
        Map<String, Object> kafkaParams = new HashMap<String, Object>();
        kafkaParams.put("bootstrap.servers", "192.168.114.200:9092,192.168.114.201:9092,192.168.114.202:9092");
        kafkaParams.put("key.deserializer", IntegerDeserializer.class);
        kafkaParams.put("value.deserializer", StringDeserializer.class);
        kafkaParams.put("group.id", "111");
        kafkaParams.put("auto.offset.reset", "latest");
        kafkaParams.put("enable.auto.commit", false);

        Collection<String> topics = Arrays.asList("news");
        JavaInputDStream<ConsumerRecord<Integer, String>> lines = KafkaUtils.createDirectStream(jssc,
                LocationStrategies.PreferConsistent(),
                ConsumerStrategies.Subscribe(topics, kafkaParams));
        lines.print();

        jssc.start();
        try {
            jssc.awaitTermination();
        } catch (InterruptedException e) {
            e.printStackTrace();
        }
        jssc.close();
    }
}
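Running the job fails as soon as the first batch is printed: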
19/01/26 15:48:45 INFO AppInfoParser: Kafka version : 0.10.2.2
19/01/26 15:48:45 INFO AppInfoParser: Kafka commitId : cd80bc412b9b9701
19/01/26 15:48:45 INFO InternalKafkaConsumer: Initial fetch for spark-executor-111 news-0 115900
19/01/26 15:48:45 ERROR Executor: Exception in task 0.0 in stage 0.0 (TID 0)
java.io.NotSerializableException: org.apache.kafka.clients.consumer.ConsumerRecord
Serialization stack:
- object not serializable (class: org.apache.kafka.clients.consumer.ConsumerRecord, value: ConsumerRecord(topic = news, partition = 0, offset = 115900, CreateTime = 1548486965892, checksum = 3320474937, serialized key size = -1, serialized value size = 51, key = null, value = 2019-01-26 1548486965891 911 550 entertainment view))
- element of array (index: 0)
- array (class [Lorg.apache.kafka.clients.consumer.ConsumerRecord;, size 11)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:450)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
19/01/26 15:48:45 ERROR TaskSetManager: Task 0.0 in stage 0.0 (TID 0) had a not serializable result: org.apache.kafka.clients.consumer.ConsumerRecord
Serialization stack:
- object not serializable (class: org.apache.kafka.clients.consumer.ConsumerRecord, value: ConsumerRecord(topic = news, partition = 0, offset = 115900, CreateTime = 1548486965892, checksum = 3320474937, serialized key size = -1, serialized value size = 51, key = null, value = 2019-01-26 1548486965891 911 550 entertainment view))
- element of array (index: 0)
- array (class [Lorg.apache.kafka.clients.consumer.ConsumerRecord;, size 11); not retrying
19/01/26 15:48:45 INFO TaskSchedulerImpl: Removed TaskSet 0.0, whose tasks have all completed, from pool
19/01/26 15:48:45 INFO TaskSchedulerImpl: Cancelling stage 0
19/01/26 15:48:45 INFO TaskSchedulerImpl: Killing all running tasks in stage 0: Stage cancelled
19/01/26 15:48:45 INFO DAGScheduler: ResultStage 0 (print at NewsRealtimeStatSpark.java:42) failed in 0.532 s due to Job aborted due to stage failure: Task 0.0 in stage 0.0 (TID 0) had a not serializable result: org.apache.kafka.clients.consumer.ConsumerRecord
Serialization stack:
- object not serializable (class: org.apache.kafka.clients.consumer.ConsumerRecord, value: ConsumerRecord(topic = news, partition = 0, offset = 115900, CreateTime = 1548486965892, checksum = 3320474937, serialized key size = -1, serialized value size = 51, key = null, value = 2019-01-26 1548486965891 911 550 entertainment view))
- element of array (index: 0)
- array (class [Lorg.apache.kafka.clients.consumer.ConsumerRecord;, size 11)
19/01/26 15:48:45 INFO DAGScheduler: Job 0 failed: print at NewsRealtimeStatSpark.java:42, took 0.599568 s
19/01/26 15:48:45 INFO JobScheduler: Finished job streaming job 1548488925000 ms.0 from job set of time 1548488925000 ms
19/01/26 15:48:45 ERROR JobScheduler: Error running job streaming job 1548488925000 ms.0
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0.0 in stage 0.0 (TID 0) had a not serializable result: org.apache.kafka.clients.consumer.ConsumerRecord
Serialization stack:
- object not serializable (class: org.apache.kafka.clients.consumer.ConsumerRecord, value: ConsumerRecord(topic = news, partition = 0, offset = 115900, CreateTime = 1548486965892, checksum = 3320474937, serialized key size = -1, serialized value size = 51, key = null, value = 2019-01-26 1548486965891 911 550 entertainment view))
- element of array (index: 0)
- array (class [Lorg.apache.kafka.clients.consumer.ConsumerRecord;, size 11)
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1887)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1875)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1874)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1874)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2108)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2057)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2046)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
at org.apache.spark.streaming.kafka010.KafkaRDD.take(KafkaRDD.scala:139)
at org.apache.spark.streaming.kafka010.KafkaRDD.take(KafkaRDD.scala:48)
at org.apache.spark.streaming.dstream.DStream$$anonfun$print$2$$anonfun$foreachFunc$3$1.apply(DStream.scala:735)
at org.apache.spark.streaming.dstream.DStream$$anonfun$print$2$$anonfun$foreachFunc$3$1.apply(DStream.scala:734)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:416)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:257)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:256)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0.0 in stage 0.0 (TID 0) had a not serializable result: org.apache.kafka.clients.consumer.ConsumerRecord
Serialization stack:
- object not serializable (class: org.apache.kafka.clients.consumer.ConsumerRecord, value: ConsumerRecord(topic = news, partition = 0, offset = 115900, CreateTime = 1548486965892, checksum = 3320474937, serialized key size = -1, serialized value size = 51, key = null, value = 2019-01-26 1548486965891 911 550 entertainment view))
- element of array (index: 0)
- array (class [Lorg.apache.kafka.clients.consumer.ConsumerRecord;, size 11)
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1887)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1875)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1874)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1874)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:926)
at scala.Option.foreach(Option.scala:257)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:926)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2108)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2057)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2046)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:737)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2061)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
at org.apache.spark.streaming.kafka010.KafkaRDD.take(KafkaRDD.scala:139)
at org.apache.spark.streaming.kafka010.KafkaRDD.take(KafkaRDD.scala:48)
at org.apache.spark.streaming.dstream.DStream$$anonfun$print$2$$anonfun$foreachFunc$3$1.apply(DStream.scala:735)
at org.apache.spark.streaming.dstream.DStream$$anonfun$print$2$$anonfun$foreachFunc$3$1.apply(DStream.scala:734)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply$mcV$sp(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1$$anonfun$apply$mcV$sp$1.apply(ForEachDStream.scala:51)
at org.apache.spark.streaming.dstream.DStream.createRDDWithLocalProperties(DStream.scala:416)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply$mcV$sp(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at org.apache.spark.streaming.dstream.ForEachDStream$$anonfun$1.apply(ForEachDStream.scala:50)
at scala.util.Try$.apply(Try.scala:192)
at org.apache.spark.streaming.scheduler.Job.run(Job.scala:39)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply$mcV$sp(JobScheduler.scala:257)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler$$anonfun$run$1.apply(JobScheduler.scala:257)
at scala.util.DynamicVariable.withValue(DynamicVariable.scala:58)
at org.apache.spark.streaming.scheduler.JobScheduler$JobHandler.run(JobScheduler.scala:256)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
19/01/26 15:48:45 INFO StreamingContext: Invoking stop(stopGracefully=false) from shutdown hook
Answer:
ConsumerRecord does not implement java.io.Serializable, and print() internally calls take(), which serializes the first records of each batch back to the driver; the default JavaSerializer rejects non-serializable classes. Kryo does not require classes to implement Serializable, so set this property when creating the Spark context:
set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
// Create the Spark context with Kryo serialization enabled
SparkConf conf = new SparkConf()
        .setMaster("local[2]")
        .setAppName("NewsRealtimeStatSpark")
        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");