A Java Example of Real-Time Stream Processing with Spark Streaming and Kafka

The listing below uses the spark-streaming-kafka-0-10 direct stream to consume a Kafka topic, splits each message into words, and maintains a running word count across batches with updateStateByKey.

package com.test;

import java.util.*;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.*;
import org.apache.spark.api.java.function.*;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.*;
import org.apache.spark.streaming.kafka010.*;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.TopicPartition;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.api.java.Optional;
import scala.Tuple2;

public class Test5 {

	public static void main(String[] args) throws InterruptedException {

		SparkConf conf = new SparkConf().setMaster("local").setAppName(
				"streamingTest");
		JavaSparkContext sc = new JavaSparkContext(conf);
		sc.setLogLevel("ERROR");
		// updateStateByKey below requires a checkpoint directory
		sc.setCheckpointDir("./checkpoint");
		JavaStreamingContext ssc = new JavaStreamingContext(sc,
				Durations.seconds(10));

		// Kafka consumer parameters -- required; leaving them out causes errors
		Map<String, Object> kafkaParams = new HashMap<>();
		kafkaParams.put("bootstrap.servers", "192.168.174.200:9092");
		kafkaParams.put("key.deserializer", StringDeserializer.class);
		kafkaParams.put("value.deserializer", StringDeserializer.class);
		kafkaParams.put("group.id", "newgroup2");
		kafkaParams.put("auto.offset.reset", "latest");
		kafkaParams.put("enable.auto.commit", false);

		Collection<String> topics = Arrays.asList("test");

		JavaInputDStream<ConsumerRecord<String, String>> stream = KafkaUtils
				.createDirectStream(ssc, LocationStrategies.PreferConsistent(),
						ConsumerStrategies.<String, String>Subscribe(topics,
								kafkaParams));
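		// PreferConsistent spreads Kafka partitions evenly across the available executors;
		// Subscribe fixes the topic list and lets Kafka handle partition assignment for the consumer group.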

		// Note: each element of the stream is a ConsumerRecord<String, String>
		JavaPairDStream<String, Integer> counts = stream
				.flatMap(
						x -> Arrays.asList(x.value().split(" "))
								.iterator())
				.mapToPair(x -> new Tuple2<>(x, 1))
				.reduceByKey((x, y) -> x + y);
		//counts.print();

		JavaPairDStream<String, Integer> result = counts
				.updateStateByKey(new Function2<List<Integer>, Optional<Integer>, Optional<Integer>>() {

					private static final long serialVersionUID = 1L;

					@Override
					public Optional<Integer> call(List<Integer> values,
							Optional<Integer> state) throws Exception {
						/**
						 * values: the values collected for this key in the current batch, e.g. [1,1,1,1,1]
						 * state: the accumulated state of this key from previous batches
						 */
						Integer updateValue = 0;
						if (state.isPresent()) {
							updateValue = state.get();
						}

						for (Integer value : values) {
							updateValue += value;
						}
						return Optional.of(updateValue);
					}
				});

		result.print();

		ssc.start();
		ssc.awaitTermination();
		ssc.close();
	}
}
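Because enable.auto.commit is set to false, the example never persists its consumer offsets, so a restart resumes wherever auto.offset.reset points. A minimal follow-up, based on the offset-commit pattern from the spark-streaming-kafka-0-10 integration guide, is to commit each batch's offset ranges back to Kafka after its output has been written; the sketch below reuses the stream variable and the kafka010 wildcard import from the listing and would sit before ssc.start():

		// Sketch: commit the consumed offset ranges back to Kafka once the batch's output is done
		stream.foreachRDD(rdd -> {
			// RDDs produced by the direct stream carry their Kafka offset ranges
			OffsetRange[] offsetRanges = ((HasOffsetRanges) rdd.rdd()).offsetRanges();

			// ... write the batch's results to the external store here ...

			// asynchronously commit this batch's offsets to Kafka
			((CanCommitOffsets) stream.inputDStream()).commitAsync(offsetRanges);
		});

To try the pipeline end to end, produce a few lines into the topic with Kafka's console producer (kafka-console-producer.sh --broker-list 192.168.174.200:9092 --topic test) and watch the accumulated word counts that result.print() writes to the console every 10 seconds.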

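updateStateByKey recomputes and rewrites the entire state on every batch, which gets expensive as the key space grows; mapWithState only touches keys that appear in the current batch and is the usual alternative. The sketch below is one way to swap it in for the counts stream above (it assumes the same checkpoint directory; State, StateSpec and JavaMapWithStateDStream come from org.apache.spark.streaming and its api.java package, Function3 from the function wildcard import already in the listing):

		// Sketch: stateful word count with mapWithState instead of updateStateByKey
		Function3<String, Optional<Integer>, State<Integer>, Tuple2<String, Integer>> mappingFunc =
				(word, one, state) -> {
					int sum = (one.isPresent() ? one.get() : 0)
							+ (state.exists() ? state.get() : 0);
					state.update(sum);              // persist the new running total for this word
					return new Tuple2<>(word, sum); // emit (word, runningTotal) for this batch
				};

		JavaMapWithStateDStream<String, Integer, Integer, Tuple2<String, Integer>> stateDstream =
				counts.mapWithState(StateSpec.function(mappingFunc));
		stateDstream.print();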