As the title says: a code memo (Spark Streaming reading Kafka through the direct API, with consumer offsets saved back through KafkaCluster).
1.Constant.java
package com.sparktest.util;

public class Constant {
    public static String master = "yarn-client";
    public static String topic = "pj";
    public static String appName = "sparktest";
    public static long duration = 10000;
    public static String zookeeper = "10.67.2.20:2181,10.67.2.21:2181";
    public static String brokerlist = "10.67.2.20:9092,10.67.2.21:9092";
    public static String groupId = "com.sparktest";
    public static int partitions = 10;
}
2.App.java
package com.sparktest.app;

import java.io.Serializable;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.atomic.AtomicReference;

import kafka.common.TopicAndPartition;
import kafka.message.MessageAndMetadata;
import kafka.serializer.DefaultDecoder;
import kafka.serializer.StringDecoder;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.streaming.Duration;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka.HasOffsetRanges;
import org.apache.spark.streaming.kafka.KafkaCluster;
import org.apache.spark.streaming.kafka.KafkaUtils;
import org.apache.spark.streaming.kafka.OffsetRange;

import scala.Predef;
import scala.Tuple2;
import scala.collection.JavaConversions;

import com.sparktest.util.Constant;

public class App implements Serializable {

    private KafkaCluster kafkaCluster = null;
    private Map<String, String> kafkaParams = new HashMap<String, String>();
    private Set<String> topics = new HashSet<String>();
    private Duration duration = new Duration(Constant.duration);
    private java.util.Map<kafka.common.TopicAndPartition, Long> fromOffsets =
            new java.util.HashMap<kafka.common.TopicAndPartition, Long>();
    private static final AtomicReference<OffsetRange[]> offsetRanges =
            new AtomicReference<OffsetRange[]>();

    public App() {
        kafkaParams.put("metadata.broker.list", Constant.brokerlist);
        kafkaParams.put("group.id", Constant.groupId);

        // KafkaCluster expects a scala.collection.immutable.Map, so convert the Java map.
        scala.collection.mutable.Map<String, String> mutableKafkaParam =
                JavaConversions.mapAsScalaMap(kafkaParams);
        scala.collection.immutable.Map<String, String> immutableKafkaParam =
                mutableKafkaParam.toMap(
                        new Predef.$less$colon$less<Tuple2<String, String>, Tuple2<String, String>>() {
                            public Tuple2<String, String> apply(Tuple2<String, String> v1) {
                                return v1;
                            }
                        });

        this.kafkaCluster = new KafkaCluster(immutableKafkaParam);
        this.topics.add(Constant.topic);
    }

    public void startApp() {
        JavaSparkContext ctx = new JavaSparkContext(Constant.master, Constant.appName);
        JavaStreamingContext jsctx = new JavaStreamingContext(ctx, duration);

        scala.collection.mutable.Set<String> mutableTopics =
                JavaConversions.asScalaSet(this.topics);
        scala.collection.immutable.Set<String> immutableTopics = mutableTopics.toSet();
        scala.collection.immutable.Set<TopicAndPartition> scalaTopicAndPartitionSet =
                kafkaCluster.getPartitions(immutableTopics).right().get();

        // First consumption for this group: no offsets stored yet, so start every partition at 0.
        if (kafkaCluster.getConsumerOffsets(kafkaParams.get("group.id"),
                scalaTopicAndPartitionSet).isLeft()) {
            Set<TopicAndPartition> javaTopicAndPartitionSet =
                    JavaConversions.setAsJavaSet(scalaTopicAndPartitionSet);
            for (TopicAndPartition topicAndPartition : javaTopicAndPartitionSet) {
                this.fromOffsets.put(topicAndPartition, 0L);
            }
        } else {
            // Offsets already stored: resume each partition from its saved offset.
            scala.collection.immutable.Map<TopicAndPartition, Object> consumerOffsetsTemp =
                    kafkaCluster.getConsumerOffsets(kafkaParams.get("group.id"),
                            scalaTopicAndPartitionSet).right().get();
            Map<TopicAndPartition, Object> consumerOffsets =
                    JavaConversions.mapAsJavaMap(consumerOffsetsTemp);
            Set<TopicAndPartition> javaTopicAndPartitionSet =
                    JavaConversions.setAsJavaSet(scalaTopicAndPartitionSet);
            for (TopicAndPartition topicAndPartition : javaTopicAndPartitionSet) {
                Long offset = (Long) consumerOffsets.get(topicAndPartition);
                this.fromOffsets.put(topicAndPartition, offset);
            }
        }

        JavaInputDStream<byte[]> stream = KafkaUtils.createDirectStream(
                jsctx,
                String.class, byte[].class,
                StringDecoder.class, DefaultDecoder.class,
                byte[].class,
                kafkaParams, this.fromOffsets,
                new Function<MessageAndMetadata<String, byte[]>, byte[]>() {
                    public byte[] call(MessageAndMetadata<String, byte[]> v1) throws Exception {
                        return v1.message();
                    }
                });

        stream.foreachRDD(new Function<JavaRDD<byte[]>, Void>() {
            public Void call(JavaRDD<byte[]> arg0) throws Exception {
                OffsetRange[] offsets = ((HasOffsetRanges) arg0.rdd()).offsetRanges();
                for (OffsetRange o : offsets) {
                    // Build a Java map from this topic/partition to its untilOffset.
                    TopicAndPartition topicAndPartition =
                            new TopicAndPartition(o.topic(), o.partition());
                    Map<TopicAndPartition, Object> topicAndPartitionObjectMap =
                            new HashMap<TopicAndPartition, Object>();
                    topicAndPartitionObjectMap.put(topicAndPartition, o.untilOffset());

                    // Convert the Java map to a scala.collection.immutable.Map.
                    scala.collection.mutable.Map<TopicAndPartition, Object> map =
                            JavaConversions.mapAsScalaMap(topicAndPartitionObjectMap);
                    scala.collection.immutable.Map<TopicAndPartition, Object> scalaTopicAndPartitionObjectMap =
                            map.toMap(new Predef.$less$colon$less<Tuple2<TopicAndPartition, Object>, Tuple2<TopicAndPartition, Object>>() {
                                public Tuple2<TopicAndPartition, Object> apply(
                                        Tuple2<TopicAndPartition, Object> v1) {
                                    return v1;
                                }
                            });

                    // Write the updated offset back through kafkaCluster.
                    kafkaCluster.setConsumerOffsets(Constant.groupId, scalaTopicAndPartitionObjectMap);
                }
                System.out.println("==========================" + arg0.count()
                        + "==================================");
                return null;
            }
        });

        jsctx.start();
        jsctx.awaitTermination();
    }

    public static void main(String[] args) {
        App app = new App();
        app.startApp();
    }
}
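The Java-to-Scala immutable Map conversion appears twice in App.java (once for the Kafka params in the constructor, once per OffsetRange inside foreachRDD). Below is a minimal sketch of factoring that boilerplate into a reusable helper; the class name ScalaMaps and method toImmutable are hypothetical, not part of the memo, and the sketch assumes only the same JavaConversions / Predef APIs the code above already uses.

package com.sparktest.util;

import java.util.Map;

import scala.Predef;
import scala.Tuple2;
import scala.collection.JavaConversions;

public class ScalaMaps {

    // Converts a java.util.Map into a scala.collection.immutable.Map using the
    // identity evidence, mirroring the inline conversions in App.java.
    public static <K, V> scala.collection.immutable.Map<K, V> toImmutable(Map<K, V> javaMap) {
        scala.collection.mutable.Map<K, V> mutableMap = JavaConversions.mapAsScalaMap(javaMap);
        return mutableMap.toMap(
                new Predef.$less$colon$less<Tuple2<K, V>, Tuple2<K, V>>() {
                    public Tuple2<K, V> apply(Tuple2<K, V> v1) {
                        return v1;
                    }
                });
    }
}

With such a helper, the constructor body could shrink to this.kafkaCluster = new KafkaCluster(ScalaMaps.toImmutable(kafkaParams)); and the per-partition offset commit to kafkaCluster.setConsumerOffsets(Constant.groupId, ScalaMaps.toImmutable(topicAndPartitionObjectMap));.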
3.pom.xml
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>sparktest</groupId>
  <artifactId>sparktest</artifactId>
  <version>0.0.1-SNAPSHOT</version>
  <packaging>jar</packaging>

  <name>sparktest</name>
  <url>http://maven.apache.org</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
  </properties>

  <build>
    <plugins>
      <plugin>
        <artifactId>maven-assembly-plugin</artifactId>
        <configuration>
          <archive>
            <manifest>
              <mainClass>com.sparktest.app.App</mainClass>
            </manifest>
          </archive>
          <descriptorRefs>
            <descriptorRef>jar-with-dependencies</descriptorRef>
          </descriptorRefs>
        </configuration>
      </plugin>
    </plugins>
  </build>

  <dependencies>
    <dependency>
      <groupId>jdk.tools</groupId>
      <artifactId>jdk.tools</artifactId>
      <version>1.7</version>
      <scope>system</scope>
      <systemPath>${JAVA_HOME}/lib/tools.jar</systemPath>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_2.10</artifactId>
      <version>1.3.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming-kafka_2.10</artifactId>
      <version>1.3.0</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-yarn_2.10</artifactId>
      <version>1.3.0</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>3.8.1</version>
      <scope>test</scope>
    </dependency>
  </dependencies>
</project>