This article covers Spark's real-time component: Spark Streaming. Spark Streaming provides near-real-time processing by running a continuous series of small batches 24/7, so it is effectively micro-batching and carries some ongoing maintenance cost. The demo in this article is written in Java and uses Kafka as the Spark Streaming input source and Elasticsearch as the output sink, forming a simple real-time processing architecture.
As usual, the code comes first.
maven
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
    <version>2.3.1</version>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-core_2.11</artifactId>
    <version>2.3.1</version>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming_2.11</artifactId>
    <version>2.3.1</version>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-sql_2.11</artifactId>
    <version>2.3.1</version>
    <scope>provided</scope>
</dependency>
<dependency>
    <groupId>org.elasticsearch</groupId>
    <artifactId>elasticsearch-hadoop</artifactId>
    <version>6.3.0</version>
</dependency>
Code
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.SparkConf;
import org.apache.spark.streaming.Durations;
import org.elasticsearch.spark.rdd.api.java.JavaEsSpark;
import org.elasticsearch.spark.streaming.api.java.JavaEsSparkStreaming;
public class StreamingTest {
public static void main(String[] args) throws Exception {
// create a StreamingContext from a SparkConf with a 1-second batch interval
String master = "spark://192.168.7.51:7077";
// String master = "local[2]";
SparkConf conf = new SparkConf().setMaster(master).setAppName("StreamingTest")
// .set("spark.executor.memory", "1g")
// .set("spark.dynamicAllocation.enabled", "false")
.set("spark.es.nodes", "192.168.7.51")//指定es地址
.set("spark.es.port", "9200")//指定es端口号
.setJars(new String[] {//如果是集群模式要指定jar包,整个项目打包
"/Users/username/eclipse-workspace/spark-streaming/target/spark-streaming-0.0.1-SNAPSHOT.jar" });
//指定1秒获取一次kafka数据
JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(1));
String brokers = "kafka1:9092";//Kafka broker list
String groupId = "kafka";//consumer group id
String topics = "test1";//topic(s) to subscribe to, comma-separated
Set<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(",")));
Map<String, Object> kafkaParams = new HashMap<>();
kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers);
kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, groupId);
kafkaParams.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
kafkaParams.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
//turn each 1-second window of Kafka data into a stream of RDDs
JavaInputDStream<ConsumerRecord<String, String>> messages = KafkaUtils.createDirectStream(jssc,
LocationStrategies.PreferConsistent(), ConsumerStrategies.<String, String>Subscribe(topicsSet, kafkaParams));
//extract the value from each message
JavaDStream<String> lines = messages.map(record -> record.value());
//wrap each value in a JSON document Elasticsearch can ingest
JavaDStream<String> out = lines.map(str -> "{\"test\":\""+str+"\"}");
//print the first elements of each batch
out.print();
//write each batch to Elasticsearch
JavaEsSparkStreaming.saveJsonToEs(out, "/spark/doc");
//start the streaming job
jssc.start();
//block until the job terminates
jssc.awaitTermination();
jssc.stop();
}
}
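To exercise the demo you need messages flowing into topic test1. Below is a minimal producer sketch using the plain kafka-clients API; the broker address and topic name are taken from the demo above, and it assumes the kafka-clients dependency is on the classpath (the class name TestProducer is just for illustration):
import java.util.Properties;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.StringSerializer;
public class TestProducer {
    public static void main(String[] args) {
        Properties props = new Properties();
        props.put("bootstrap.servers", "kafka1:9092");//same broker as the demo
        props.put("key.serializer", StringSerializer.class.getName());
        props.put("value.serializer", StringSerializer.class.getName());
        try (KafkaProducer<String, String> producer = new KafkaProducer<>(props)) {
            for (int i = 0; i < 10; i++) {
                //each value sent here becomes a {"test":"message-i"} document in Elasticsearch
                producer.send(new ProducerRecord<>("test1", "message-" + i));
            }
        }
    }
}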
Spark Streaming consumes Kafka data through the spark-streaming-kafka-0-10_2.11 package.
See the official guide, from which the snippets below are taken: http://spark.apache.org/docs/latest/streaming-kafka-0-10-integration.html
maven
<dependency>
    <groupId>org.apache.spark</groupId>
    <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
    <version>2.3.1</version>
</dependency>
scala
val kafkaParams = Map[String, Object](
"bootstrap.servers" -> "localhost:9092,anotherhost:9092",
"key.deserializer" -> classOf[StringDeserializer],
"value.deserializer" -> classOf[StringDeserializer],
"group.id" -> "use_a_separate_group_id_for_each_stream",
"auto.offset.reset" -> "latest",
"enable.auto.commit" -> (false: java.lang.Boolean)
)
val topics = Array("topicA", "topicB")
val stream = KafkaUtils.createDirectStream[String, String](
streamingContext,
PreferConsistent,
Subscribe[String, String](topics, kafkaParams)
)
stream.map(record => (record.key, record.value))
java
Map<String, Object> kafkaParams = new HashMap<>();
kafkaParams.put("bootstrap.servers", "localhost:9092,anotherhost:9092");
kafkaParams.put("key.deserializer", StringDeserializer.class);
kafkaParams.put("value.deserializer", StringDeserializer.class);
kafkaParams.put("group.id", "use_a_separate_group_id_for_each_stream");
kafkaParams.put("auto.offset.reset", "latest");
kafkaParams.put("enable.auto.commit", false);
Collection<String> topics = Arrays.asList("topicA", "topicB");
JavaInputDStream<ConsumerRecord<String, String>> stream =
  KafkaUtils.createDirectStream(
    streamingContext,
    LocationStrategies.PreferConsistent(),
    ConsumerStrategies.<String, String>Subscribe(topics, kafkaParams)
  );
stream.mapToPair(record -> new Tuple2<>(record.key(), record.value()));
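Note that enable.auto.commit is false in these examples. The same official guide shows how to commit offsets back to Kafka yourself once a batch's output has completed; a sketch based on that page, continuing from the stream defined above (OffsetRange, HasOffsetRanges, and CanCommitOffsets come from the spark-streaming-kafka-0-10 package):
import org.apache.spark.streaming.kafka010.CanCommitOffsets;
import org.apache.spark.streaming.kafka010.HasOffsetRanges;
import org.apache.spark.streaming.kafka010.OffsetRange;
stream.foreachRDD(rdd -> {
  //the RDDs produced by the direct stream carry their Kafka offset ranges
  OffsetRange[] offsetRanges = ((HasOffsetRanges) rdd.rdd()).offsetRanges();
  //... write the batch out here ...
  //then commit the offsets asynchronously, after the output has succeeded
  ((CanCommitOffsets) stream.inputDStream()).commitAsync(offsetRanges);
});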
Writing the results to Elasticsearch uses the elasticsearch-hadoop package.
maven
<dependency>
    <groupId>org.elasticsearch</groupId>
    <artifactId>elasticsearch-hadoop</artifactId>
    <version>6.3.0</version>
</dependency>
scala
import org.elasticsearch.spark.streaming.EsSparkStreaming
java
import org.elasticsearch.spark.streaming.api.java.JavaEsSparkStreaming;
The Elasticsearch connection must be specified; it can go in the Spark configuration file or be set programmatically.
spark.es.nodes eshosts
spark.es.port 9200
SparkConf conf = new SparkConf().setMaster(master).setAppName("StreamingTest")
// .set("spark.executor.memory", "1g")
// .set("spark.dynamicAllocation.enabled", "false")
.set("spark.es.nodes", "192.168.7.51")
.set("spark.es.port", "9200");
Write to ES
JavaEsSparkStreaming.saveJsonToEs(out, "/spark/doc");
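saveJsonToEs expects each element to already be a JSON string. If the stream holds Java Maps (or JavaBeans) instead, elasticsearch-hadoop can serialize them for you via JavaEsSparkStreaming.saveToEs. A minimal sketch reusing the lines stream from the demo above (Map and HashMap are already imported there):
JavaDStream<Map<String, Object>> docs = lines.map(str -> {
    Map<String, Object> doc = new HashMap<>();
    doc.put("test", str);//same one-field document as the JSON-string version
    return doc;
});
JavaEsSparkStreaming.saveToEs(docs, "spark/doc");//index/type resource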
More: the Spark column
——————————————————————————————————
Author: 桃花惜春风
Please credit the source when reposting. Original article:
https://blog.csdn.net/xiaoyu_BD/article/details/82777869
If this article helped you, your support is my greatest motivation to keep writing. Thank you!