Development environment: Spark 2.3.1, Elasticsearch 6.6.1, JDK 1.8
Maven dependencies: see the pom.xml at the end of this post
Full code:
package spark;
import com.google.gson.Gson;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.log4j.Logger;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaSparkContext;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaDStream;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;
import org.elasticsearch.spark.streaming.api.java.JavaEsSparkStreaming;
import java.util.*;
/**
* ZhangYaotong
* 2019-02-26
*/
public class KafkaSpark2Es
{
public static void main(String[] args) throws InterruptedException {
SparkConf conf = new SparkConf()
.setMaster("spark://master:7077")
.setAppName("KafkaSpark2Es")
.set("spark.executor.memory", "2g")
.set("spark.dynamicAllocation.enabled", "false");
JavaSparkContext sc = new JavaSparkContext(conf);
JavaStreamingContext ssc = new JavaStreamingContext(sc, Durations.seconds(10));
String brokers = "master:9092"; // Kafka broker address
String groupId = "kafka-01";    // consumer group id
String topics = "topic1";       // comma-separated list of topics
Set<String> topicsSet = new HashSet<>(Arrays.asList(topics.split(",")));
Map<String, Object> kafkaParams = new HashMap<>();
kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, brokers);
kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, groupId);
kafkaParams.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
kafkaParams.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
kafkaParams.put("auto.offset.reset", "latest");
kafkaParams.put("enable.auto.commit", false);
JavaInputDStream> messages = KafkaUtils.createDirectStream(ssc,
LocationStrategies.PreferConsistent(), ConsumerStrategies.Subscribe(topicsSet, kafkaParams));
JavaDStream<String> lines = messages.map(new Function<ConsumerRecord<String, String>, String>() {
    @Override
    public String call(ConsumerRecord<String, String> record) throws Exception {
        // Build a fresh map per record; a shared field would carry keys from
        // earlier records into later ones.
        Map<String, String> map = new HashMap<>();
        // Records look like "k1=v1||k2=v2||..."; split on the "||" separator.
        String[] splits = record.value().split("\\|\\|");
        for (String field : splits) {
            int idx = field.indexOf("=");
            // skip fields with no "=" or with nothing after it
            if (idx != -1 && idx != field.length() - 1) {
                map.put(field.substring(0, idx), field.substring(idx + 1));
            }
        }
        return new Gson().toJson(map);
    }
});
lines.print();
// write each JSON line as a document to index "log", type "doc1"
JavaEsSparkStreaming.saveJsonToEs(lines, "log/doc1");
ssc.start();
ssc.awaitTermination();
}
}
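One thing the listing leaves implicit: by default the elasticsearch-hadoop connector looks for Elasticsearch on localhost:9200. If ES does not run on the Spark nodes themselves, the connection has to be configured on the SparkConf before the streaming context is created. A minimal sketch, assuming an ES node reachable under the placeholder host name es-master:

SparkConf conf = new SparkConf()
        .setMaster("spark://master:7077")
        .setAppName("KafkaSpark2Es")
        // elasticsearch-hadoop connection settings; "es-master" is a
        // placeholder -- point it at your own ES node(s)
        .set("es.nodes", "es-master")
        .set("es.port", "9200")
        // let the connector create the "log" index on first write
        .set("es.index.auto.create", "true");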
pom.xml:

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
  <modelVersion>4.0.0</modelVersion>

  <groupId>spark</groupId>
  <artifactId>spark</artifactId>
  <version>1.0-SNAPSHOT</version>

  <name>spark</name>
  <url>http://www.example.com</url>

  <properties>
    <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
    <java.version>1.8</java.version>
    <maven.compiler.source>1.8</maven.compiler.source>
    <maven.compiler.target>1.8</maven.compiler.target>
    <spark.version>2.2.3</spark.version>
    <scala.version>2.11.8</scala.version>
    <elasticsearch.version>6.6.1</elasticsearch.version>
  </properties>

  <dependencies>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
      <version>${spark.version}</version>
    </dependency>
    <dependency>
      <groupId>org.elasticsearch</groupId>
      <artifactId>elasticsearch-spark-20_2.11</artifactId>
      <version>${elasticsearch.version}</version>
    </dependency>
    <!-- JSON serialization (com.google.gson.Gson), needed to compile KafkaSpark2Es -->
    <dependency>
      <groupId>com.google.code.gson</groupId>
      <artifactId>gson</artifactId>
      <version>2.8.5</version>
    </dependency>
    <dependency>
      <groupId>log4j</groupId>
      <artifactId>log4j</artifactId>
      <version>1.2.17</version>
    </dependency>
    <dependency>
      <groupId>junit</groupId>
      <artifactId>junit</artifactId>
      <version>4.11</version>
      <scope>test</scope>
    </dependency>
  </dependencies>

  <build>
    <pluginManagement>
      <plugins>
        <plugin>
          <artifactId>maven-clean-plugin</artifactId>
          <version>3.1.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-resources-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.8.0</version>
        </plugin>
        <plugin>
          <artifactId>maven-surefire-plugin</artifactId>
          <version>2.22.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-jar-plugin</artifactId>
          <version>3.0.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-install-plugin</artifactId>
          <version>2.5.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-deploy-plugin</artifactId>
          <version>2.8.2</version>
        </plugin>
        <plugin>
          <artifactId>maven-site-plugin</artifactId>
          <version>3.7.1</version>
        </plugin>
        <plugin>
          <artifactId>maven-project-info-reports-plugin</artifactId>
          <version>3.0.0</version>
        </plugin>
      </plugins>
    </pluginManagement>
  </build>
</project>
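Since the field parsing is the only real transformation in the job, it is worth checking on its own, without Kafka, Spark, or ES in the loop. Below is a standalone sketch of the same split-and-serialize logic; the sample record value is made up for illustration:

package spark;

import com.google.gson.Gson;
import java.util.HashMap;
import java.util.Map;

// Standalone check of the "k1=v1||k2=v2" parsing used in KafkaSpark2Es.
public class ParseCheck {
    public static void main(String[] args) {
        String value = "host=master||level=INFO||msg=started"; // sample record value
        Map<String, String> map = new HashMap<>();
        for (String field : value.split("\\|\\|")) {
            int idx = field.indexOf("=");
            if (idx != -1 && idx != field.length() - 1) {
                map.put(field.substring(0, idx), field.substring(idx + 1));
            }
        }
        // prints {"host":"master","level":"INFO","msg":"started"} (key order may vary)
        System.out.println(new Gson().toJson(map));
    }
}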