Writing large volumes of data to Kafka in batches with Java, consuming Kafka in real time with Spark Streaming, and sinking the data (Hive-to-Hive data migration/synchronization)

I recently ran into the following scenario:

There are two Hadoop clusters, and Hive data from one cluster needs to be transferred into Hive on the other. The source Hive belongs to another company, so a certain degree of security and confidentiality is involved.

The rough approach is:

Java reads the source Hive -> our Kafka cluster -> Spark Streaming reads from Kafka -> target Hive

Code examples:

Java reads the Hive table data from the other company:
package com.zhbr.dataImport.test;

import com.alibaba.fastjson.JSON;
import com.zhbr.dataImport.rdbms.ImportRDBMSData;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

import java.sql.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;

/**
 * @ClassName GW_to_Kafka_test2
 * @Description Reads the source Hive table over JDBC and writes the rows to Kafka in batches
 * @Author yanni
 * @Date 2020/3/25 9:07
 * @Version 1.0
 **/
public class GW_to_Kafka_test2 {

    private static String brokerList = "192.168.72.141:9092,192.168.72.142:9092,192.168.72.143:9092";

    // public static final String topic="topic-demo";
    private static String topic = "hive2kafka2";

    public static void main(String[] args) throws SQLException {
        //read the source table through a custom JDBC helper
        Connection conn  = ImportRDBMSData.getConn();
        Statement stmt  = ImportRDBMSData.getStatement(conn);
        String querySQL = "select * from lsb_copy";

        //run the query
        ResultSet res = stmt.executeQuery(querySQL);

        //create the list that buffers one batch of rows
        ArrayList<Map<String, Object>> list = new ArrayList<Map<String, Object>>();

        //get the result set structure information (metadata)
        ResultSetMetaData metaData = res.getMetaData();

        //number of columns in the ResultSet
        int columnCount = metaData.getColumnCount();

        //configure the producer client parameters
        //key and value serializers
        Properties properties = new Properties();
        properties.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        properties.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        //producer buffer memory
        properties.put("buffer.memory", 67108864);
        //batch size
        properties.put("batch.size", 131072);
        //linger time before sending
        properties.put("linger.ms", 100);
        //maximum message (request) size
        properties.put("max.request.size", 10485760);
        //retries on failure
        properties.put("retries", 3);
        properties.put("retry.backoff.ms", 20000);
        //ack level (1 means the leader must have received the message)
        properties.put("acks", "1");
        properties.put("bootstrap.servers", brokerList);
        //compression
        properties.put("compression.type", "gzip");
        //create the KafkaProducer instance
        KafkaProducer<String, String> kafkaProducer = new KafkaProducer<>(properties);

        try {
        // convert the ResultSet into a List<Map> structure
        // next() moves the cursor to the next row of the ResultSet, making it the current row
        while (res.next()) {

            //create a map for the current row
            HashMap<String, Object> map = new HashMap<String, Object>();

            // iterate over every column of the current row and put each key/value pair into the map
            for (int i = 1;i<=columnCount;i++) {
                // get the name of the current column
                String allColumnName = metaData.getColumnName(i).toLowerCase();

                // res.getObject(i) returns the value of the current column
                Object columnValue = res.getObject(i);
                map.put(allColumnName,columnValue);
            }

            //add the row to the list
            list.add(map);

            //send one message whenever the list holds 5000 rows
            if(list.size()==5000){
                String str = JSON.toJSONString(list);

                //build the message to send
                ProducerRecord<String,String> record=new ProducerRecord<String, String>(topic,str);

                //send the message
                kafkaProducer.send(record);
                //log the send
                System.out.println("batchSize 5000 send success from producer");

                //clear the list
                list.clear();
            }


        }

        //send the remaining rows (fewer than 5000)
        if(list.size()>0){
            String str = JSON.toJSONString(list);

            //build the message to send
            ProducerRecord<String,String> record=new ProducerRecord<String, String>(topic,str);

            //send the message
            kafkaProducer.send(record);
            //log the send
            System.out.println("batchSize "+list.size()+" send success from producer");

            //clear the list
            list.clear();
        }

        } catch (Exception e) {
            e.printStackTrace();
        }finally {
            //close the producer client instance
            kafkaProducer.close();
            ImportRDBMSData.closeAllConn(stmt,conn);
        }
    }
}

Writing in batches like this avoids the data loss and server crashes that performance problems can cause, and is basically enough to handle loading large Hive tables.
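
One caveat: kafkaProducer.send(record) as used above is fire-and-forget, so the "send success" log only means the batch was handed to the producer's buffer, not that the broker accepted it. Below is a minimal sketch of how delivery could be confirmed per batch with the Kafka producer's Callback interface; the helper class name BatchSendHelper is made up for illustration, and the producer, topic and payload are assumed to be the ones from the listing above.

import org.apache.kafka.clients.producer.Callback;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.clients.producer.RecordMetadata;

public class BatchSendHelper {

    //sends one JSON batch and logs the broker's response instead of assuming success
    public static void sendBatch(KafkaProducer<String, String> producer, String topic, String payload) {
        ProducerRecord<String, String> record = new ProducerRecord<String, String>(topic, payload);
        producer.send(record, new Callback() {
            @Override
            public void onCompletion(RecordMetadata metadata, Exception exception) {
                if (exception != null) {
                    //delivery failed even after the configured retries; log it so the batch is not silently lost
                    System.err.println("batch send failed: " + exception.getMessage());
                } else {
                    System.out.println("batch sent to partition " + metadata.partition() + ", offset " + metadata.offset());
                }
            }
        });
    }
}

The two send sites in the producer above could then call BatchSendHelper.sendBatch(kafkaProducer, topic, str) instead of the bare send(record).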


Spark Streaming consumes Kafka in real time and saves the data to Hive:
package com.zhbr.dataImport.test

import kafka.serializer.StringDecoder
import org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object Kafka_to_Hive {

    def main(args: Array[String]): Unit = {

      //get the SparkSession
      val spark = SparkSession.builder().appName(this.getClass.getSimpleName.filter(!_.equals('$')))
        .master("local[4]").config("spark.streaming.receiver.writeAheadLog.enable","true").getOrCreate()

      //get the SparkContext
      val sc = spark.sparkContext

      //set the log level
      sc.setLogLevel("WARN")

      val ssc: StreamingContext = new StreamingContext(sc,Seconds(5))

      //set the checkpoint directory; in production this usually points to HDFS, whose fault tolerance and replication keep the data from being lost
      ssc.checkpoint("./kafka-chk2")

      //set the Kafka parameters
      val kafkaParams=Map("metadata.broker.list"->"node01:9092,node02:9092,node03:9092","group.id"->"group1")

      //set the topics
      val topics=Set("hive2kafka2")

      //get the data from Kafka
      val data: InputDStream[(String, String)] = KafkaUtils.createDirectStream[String,String,StringDecoder,StringDecoder](ssc,kafkaParams,topics)

      //extract the actual payload, which is the second element of the tuple
      val realData: DStream[String] = data.map(x=>x._2)

      realData.map(record => record.toString).foreachRDD(rdd => {
        import spark.implicits._
        val df = spark.read.json(spark.createDataset(rdd))

        //write to MySQL
        df.write.mode(SaveMode.Append).format("jdbc")
                  .option(JDBCOptions.JDBC_URL,"jdbc:mysql://localhost:3306/test11")
                  .option("user","root")
                  .option("password","123")
                  .option(JDBCOptions.JDBC_TABLE_NAME,"lsb_copy")
                  .save()

        //write to Hive
        //df.createTempView("df_tmp")
        //spark.sql("insert into table df_copy select * from df_tmp")
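        //Note: createTempView throws an error if the view already exists in a later micro-batch; createOrReplaceTempView is safer here.
        //A hedged alternative sketch, assuming the SparkSession above is built with .enableHiveSupport()
        //and the target Hive table df_copy already exists (insertInto matches columns by position):
        //df.write.mode(SaveMode.Append).insertInto("df_copy")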
      })

      //start the streaming computation
      ssc.start()
      ssc.awaitTermination()
  }
}




Finally:

Suggestions for simpler and faster approaches are very welcome.
