Spark Streaming: reading Kafka data and saving it as a single file

The requirement: data from Kafka has to be stored on AFS. Every record carries a start_time field in timestamp format, and each record should be written to the directory for the specific day and hour that start_time falls in, along the lines of xxx/2020-01-01 (date)/16 (hour)/xxx.
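The core of the requirement is turning start_time into a date and an hour for the output path. Below is a minimal, standalone sketch of just that conversion, assuming start_time is an epoch value in seconds interpreted in UTC+8; the class and method names here are only for illustration.

import java.time.LocalDateTime;
import java.time.ZoneOffset;

public class PathDemo {
    // Hypothetical helper: epoch-second start_time -> "yyyy-MM-dd/H" sub-path
    static String dateHourPath(long startTimeSeconds) {
        LocalDateTime time = LocalDateTime.ofEpochSecond(startTimeSeconds, 0, ZoneOffset.ofHours(8));
        return time.toLocalDate() + "/" + time.getHour();
    }

    public static void main(String[] args) {
        // 1577865600 is 2020-01-01 16:00:00 in UTC+8, so this prints "2020-01-01/16"
        System.out.println(dateHourPath(1577865600L));
    }
}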

Let's get started:
pom.xml


    <artifactId>log-processor</artifactId>
    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <scala.version>2.11.0</scala.version>
        <hadoop.version>2.7.4</hadoop.version>
        <!-- additional properties: 2.8.2, 3.3.6, 4.3.10.RELEASE -->
    </properties>

    
        
    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>${hadoop.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>io.netty</groupId>
                    <artifactId>netty</artifactId>
                </exclusion>
            </exclusions>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>io.netty</groupId>
            <artifactId>netty-all</artifactId>
            <version>4.1.18.Final</version>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>2.4.0</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>2.4.0</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>com.google.code.gson</groupId>
            <artifactId>gson</artifactId>
            <version>2.2.4</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-yarn_2.11</artifactId>
            <version>2.4.0</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-common</artifactId>
            <version>${hadoop.version}</version>
            <scope>provided</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
            <version>2.3.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka-clients</artifactId>
            <version>0.10.1.1</version>
            <exclusions>
                <exclusion>
                    <groupId>net.jpountz.lz4</groupId>
                    <artifactId>lz4</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
    </dependencies>
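With the dependencies in place, here is the streaming job. It parses each Kafka record as JSON, derives a date/hour key from start_time, and on every micro-batch writes each key's records into their own directory: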
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import org.apache.kafka.clients.consumer.ConsumerConfig;
import org.apache.kafka.clients.consumer.ConsumerRecord;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.function.Function;
import org.apache.spark.api.java.function.PairFunction;
import org.apache.spark.api.java.function.VoidFunction;
import org.apache.spark.streaming.Durations;
import org.apache.spark.streaming.api.java.JavaInputDStream;
import org.apache.spark.streaming.api.java.JavaStreamingContext;
import org.apache.spark.streaming.kafka010.ConsumerStrategies;
import org.apache.spark.streaming.kafka010.KafkaUtils;
import org.apache.spark.streaming.kafka010.LocationStrategies;
import scala.Tuple2;

import java.sql.Timestamp;
import java.time.LocalDateTime;
import java.time.ZoneOffset;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;

public class SparkStreaming {
    public static void main(String[] args) throws InterruptedException {
        Map<String, Object> kafkaParams = new HashMap<>();
        // Kafka broker list and consumer group (placeholders)
        kafkaParams.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, "brokers");
        kafkaParams.put(ConsumerConfig.GROUP_ID_CONFIG, "kafkaGroupId");
        kafkaParams.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        kafkaParams.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
        kafkaParams.put(ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG, "true");

        SparkConf conf = new SparkConf().setAppName("sparkTask");
        // 10-second micro-batches
        JavaStreamingContext jssc = new JavaStreamingContext(conf, Durations.seconds(10));
        jssc.sparkContext().setLogLevel("WARN");

        // Kafka topic to subscribe to (placeholder)
        String topic = "topicName";
        final JavaInputDStream<ConsumerRecord<String, String>> lines = KafkaUtils.createDirectStream(
                jssc,
                LocationStrategies.PreferConsistent(),
                ConsumerStrategies.<String, String>Subscribe(new HashSet<>(Arrays.asList(topic)), kafkaParams));

        lines.mapToPair(new PairFunction<ConsumerRecord<String, String>, String, String>() {
            public Tuple2<String, String> call(ConsumerRecord<String, String> record) {
                JsonObject obj = null;
                String date = null;
                try {
                    obj = new JsonParser().parse(record.value()).getAsJsonObject();
                    // start_time is an epoch-second timestamp; turn it into "yyyy-MM-dd/H" (UTC+8)
                    Long startTime = Long.parseLong(obj.get("start_time").getAsString());
                    LocalDateTime time = LocalDateTime.ofEpochSecond(startTime, 0, ZoneOffset.ofHours(8));
                    date = time.toLocalDate() + "/" + time.getHour();
                } catch (Exception e) {
                    e.printStackTrace();
                    return null;
                }
                return new Tuple2<>(date, record.value());
            }
        }).filter(new Function<Tuple2<String, String>, Boolean>() {
            @Override
            public Boolean call(Tuple2<String, String> v1) {
                // drop records whose start_time could not be parsed
                return v1 != null;
            }
        }).foreachRDD(new VoidFunction<JavaPairRDD<String, String>>() {
            @Override
            public void call(JavaPairRDD<String, String> dateStringJavaPairRDD) {
                try {
                    // Take the date/hour part of each (date, line) pair and deduplicate
                    List<String> dateKeys = dateStringJavaPairRDD.map(new Function<Tuple2<String, String>, String>() {
                        @Override
                        public String call(Tuple2<String, String> v1) throws Exception {
                            return v1._1;
                        }
                    }).distinct().collect();
                    // Filter the batch by each date/hour key and write each key to its own directory
                    for (final String dateKey : dateKeys) {
                        String savePath = String.join("/",
                                new String[]{"feature", dateKey,
                                        String.valueOf(Timestamp.valueOf(LocalDateTime.now()).getTime())});
                        JavaRDD<String> resultRdd = dateStringJavaPairRDD.filter(new Function<Tuple2<String, String>, Boolean>() {
                            @Override
                            public Boolean call(Tuple2<String, String> v1) throws Exception {
                                return v1._1.equals(dateKey);
                            }
                        }).map(new Function<Tuple2<String, String>, String>() {
                            @Override
                            public String call(Tuple2<String, String> v1) throws Exception {
                                return v1._2;
                            }
                        });
                        // repartition(1) so each directory ends up with a single output file
                        resultRdd.repartition(1).saveAsTextFile(savePath);
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        });
        jssc.start();
        jssc.awaitTermination();
    }
}
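Because of repartition(1), each batch produces a single part-00000 file per date/hour key, so the output ends up under feature/yyyy-MM-dd/H/<batch timestamp in ms>/part-00000. Including the per-batch timestamp in the path also keeps saveAsTextFile from failing on an already-existing directory when later batches write to the same date and hour.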
