Goal: feed the data that Kafka outputs directly into Spark Streaming.
Code implementation:
package com.test;

import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;

import scala.Tuple2;

import com.google.common.collect.Lists;

import org.apache.spark.api.java.function.FlatMapFunction;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.Function2;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaPairDStream;
import org.apache.spark.streaming.api.java.JavaPairReceiverInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.KafkaUtils;

public final class JavaKafkaWordCount {
    // Input records are comma-separated, so split on ","
    private static final Pattern SPACE = Pattern.compile(",");

    private JavaKafkaWordCount() {
    }

    public static void main(String[] args) {
        String zkQuorum = "10.10.16.252";      // ZooKeeper quorum used by Kafka
        String group = "test-consumer-group";  // Kafka consumer group
        String topicss = "flume1";             // comma-separated list of topics
        String numThread = "1";                // receiver threads per topic

        // 20-second batch interval; the jar path is shipped to the cluster
        // so the executors can load this class
        JavaStreamingContext jssc = new JavaStreamingContext(
                "spark://10.10.16.251:7077", "JavaKafkaWordCount",
                new Duration(20000), System.getenv("SPARK_HOME"),
                "/home/hadoop1/JavaKafkaWordCount.jar");

        int numThreads = Integer.parseInt(numThread);
        Map<String, Integer> topicMap = new HashMap<String, Integer>();
        String[] topics = topicss.split(",");
        for (String topic : topics) {
            topicMap.put(topic, numThreads);
        }

        // Receiver-based Kafka stream of (key, message) pairs
        JavaPairReceiverInputDStream<String, String> messages =
                KafkaUtils.createStream(jssc, zkQuorum, group, topicMap);

        // Keep only the message payload
        JavaDStream<String> lines = messages.map(
                new Function<Tuple2<String, String>, String>() {
                    @Override
                    public String call(Tuple2<String, String> tuple2) {
                        return tuple2._2();
                    }
                });

        // Print the number of records in each batch
        JavaDStream<Long> javaDScount = lines.count();
        javaDScount.print();

        // Split each record into its comma-separated fields
        JavaDStream<String> words = lines.flatMap(
                new FlatMapFunction<String, String>() {
                    @Override
                    public Iterable<String> call(String x) {
                        return Lists.newArrayList(SPACE.split(x));
                    }
                });

        // Classic word count: map each field to (field, 1), then sum by key
        JavaPairDStream<String, Integer> wordCounts = words.mapToPair(
                new PairFunction<String, String, Integer>() {
                    @Override
                    public Tuple2<String, Integer> call(String s) {
                        return new Tuple2<String, Integer>(s, 1);
                    }
                }).reduceByKey(new Function2<Integer, Integer, Integer>() {
                    @Override
                    public Integer call(Integer i1, Integer i2) {
                        return i1 + i2;
                    }
                });

        wordCounts.print();
        jssc.start();
        jssc.awaitTermination();
    }
}

Note: /home/hadoop1/JavaKafkaWordCount.jar is the runnable jar exported from this project.
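One way to launch the job is a sketch like the following, assuming a standard Spark 1.x installation; the master URL and jar path are the ones hard-coded in the program above, so spark-submit only needs the main class and the jar:

    bin/spark-submit --class com.test.JavaKafkaWordCount --master spark://10.10.16.251:7077 /home/hadoop1/JavaKafkaWordCount.jar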
Input data:
Use Kafka's console producer to write the record 1.1.1.1,25,2.2.2.2,36 to the flume1 topic, as sketched below.
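A minimal sketch of that producer command, assuming a Kafka broker listening on 10.10.16.252:9092 (the broker port is an assumption; only the ZooKeeper host appears in the code above). Type the record on stdin after the prompt appears:

    bin/kafka-console-producer.sh --broker-list 10.10.16.252:9092 --topic flume1
    1.1.1.1,25,2.2.2.2,36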
Output:
15/09/06 20:25:00 INFO DAGScheduler: Stage 130 (print at JavaKafkaWordCount.java:85) finished in 0.059 s
15/09/06 20:25:00 INFO TaskSchedulerImpl: Removed TaskSet 130.0, whose tasks have all completed, from pool
15/09/06 20:25:00 INFO DAGScheduler: Job 65 finished: print at JavaKafkaWordCount.java:85, took 0.072866 s
-------------------------------------------
15/09/06 20:25:00 INFO JobScheduler: Finished job streaming job 1441542300000 ms.0 from job set of time 1441542300000 ms
15/09/06 20:25:00 INFO JobScheduler: Total delay: 0.259 s for time 1441542300000 ms (execution: 0.245 s)
Time: 1441542300000 ms
-------------------------------------------
(1.1.1.1,1)
(2.2.2.2,1)
(36,1)
(25,1)
15/09/06 20:25:00 INFO ShuffledRDD: Removing RDD 158 from persistence list
15/09/06 20:25:00 INFO BlockManager: Removing RDD 158
15/09/06 20:25:00 INFO MapPartitionsRDD: Removing RDD 157 from persistence list
15/09/06 20:25:00 INFO BlockManager: Removing RDD 157
15/09/06 20:25:00 INFO MapPartitionsRDD: Removing RDD 156 from persistence list
15/09/06 20:25:00 INFO BlockManager: Removing RDD 156
15/09/06 20:25:00 INFO MapPartitionsRDD: Removing RDD 155 from persistence list
15/09/06 20:25:00 INFO BlockManager: Removing RDD 155
15/09/06 20:25:00 INFO BlockRDD: Removing RDD 154 from persistence list
15/09/06 20:25:00 INFO BlockManager: Removing RDD 154
15/09/06 20:25:00 INFO KafkaInputDStream: Removing blocks of RDD BlockRDD[154] at createStream at JavaKafkaWordCount.java:52 of time 1441542300000 ms
15/09/06 20:25:00 INFO ReceivedBlockTracker: Deleting batches ArrayBuffer(1441542260000 ms)