There are two Hadoop clusters, and the Hive data in one cluster needs to be transferred into Hive on the other. The source Hive belongs to another company's data source, so a certain amount of security and confidentiality is involved.
Java reads the source Hive -> our Kafka -> Spark Streaming reads Kafka -> target Hive
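The producer below relies on a small JDBC helper class, com.zhbr.dataImport.rdbms.ImportRDBMSData, whose source is not included in this post. A minimal sketch of what such a helper might look like, assuming the source table is reachable through HiveServer2 with the standard Hive JDBC driver; the URL, user and password here are placeholders:

package com.zhbr.dataImport.rdbms;

import java.sql.Connection;
import java.sql.DriverManager;
import java.sql.SQLException;
import java.sql.Statement;

/**
 * Hypothetical sketch of the JDBC helper used by the producer;
 * the real implementation is not shown in this post.
 **/
public class ImportRDBMSData {

    // HiveServer2 JDBC URL of the source cluster (placeholder values)
    private static final String URL = "jdbc:hive2://source-host:10000/default";
    private static final String USER = "hive";
    private static final String PASSWORD = "";

    public static Connection getConn() {
        try {
            // driver class provided by the hive-jdbc dependency
            Class.forName("org.apache.hive.jdbc.HiveDriver");
            return DriverManager.getConnection(URL, USER, PASSWORD);
        } catch (ClassNotFoundException | SQLException e) {
            throw new RuntimeException("failed to open the source Hive connection", e);
        }
    }

    public static Statement getStatement(Connection conn) {
        try {
            return conn.createStatement();
        } catch (SQLException e) {
            throw new RuntimeException("failed to create the statement", e);
        }
    }

    public static void closeAllConn(Statement stmt, Connection conn) {
        try {
            if (stmt != null) stmt.close();
        } catch (SQLException ignored) {
        }
        try {
            if (conn != null) conn.close();
        } catch (SQLException ignored) {
        }
    }
}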
package com.zhbr.dataImport.test;

import com.alibaba.fastjson.JSON;
import com.zhbr.dataImport.rdbms.ImportRDBMSData;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

import java.sql.*;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.Properties;

/**
 * @ClassName GW_to_Kafka_test2
 * @Description Reads the source Hive table via JDBC and sends the rows to Kafka in batches
 * @Author yanni
 * @Date 2020/3/25 9:07
 * @Version 1.0
 **/
public class GW_to_Kafka_test2 {

    private static String brokerList = "192.168.72.141:9092,192.168.72.142:9092,192.168.72.143:9092";

    // public static final String topic="topic-demo";
    private static String topic = "hive2kafka2";

    public static void main(String[] args) throws SQLException {

        // read through the custom JDBC helper
        Connection conn = ImportRDBMSData.getConn();
        Statement stmt = ImportRDBMSData.getStatement(conn);
        String querySQL = "select * from lsb_copy";

        // run the query
        ResultSet res = stmt.executeQuery(querySQL);

        // buffer that collects rows before each send
        ArrayList<Map<String, Object>> list = new ArrayList<Map<String, Object>>();

        // result set metadata (column structure)
        ResultSetMetaData metaData = res.getMetaData();

        // number of columns in the ResultSet
        int columnCount = metaData.getColumnCount();

        // producer client configuration
        // serializers
        Properties properties = new Properties();
        properties.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        properties.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        // buffer memory
        properties.put("buffer.memory", 67108864);
        // batch size
        properties.put("batch.size", 131072);
        // linger time before a batch is sent
        properties.put("linger.ms", 100);
        // maximum size of a single request
        properties.put("max.request.size", 10485760);
        // retries on failure
        properties.put("retries", 3);
        properties.put("retry.backoff.ms", 20000);
        // ack level (1 means the leader must acknowledge the write)
        properties.put("acks", "1");
        properties.put("bootstrap.servers", brokerList);
        // compression
        properties.put("compression.type", "gzip");

        // create the KafkaProducer instance
        KafkaProducer<String, String> kafkaProducer = new KafkaProducer<>(properties);

        try {
            // convert the ResultSet into a List<Map> structure
            // next() moves the cursor to the next row, making it the current row
            while (res.next()) {

                // map holding one row
                HashMap<String, Object> map = new HashMap<String, Object>();

                // iterate over the columns of the current row and put each name/value pair into the map
                for (int i = 1; i <= columnCount; i++) {
                    // column name of the i-th column
                    String allColumnName = metaData.getColumnName(i).toLowerCase();
                    // res.getObject(i) returns the value of the i-th column of the current row
                    Object columnValue = res.getObject(i);
                    map.put(allColumnName, columnValue);
                }

                // add the row to the buffer
                list.add(map);

                // send once the buffer holds 5000 rows
                if (list.size() == 5000) {
                    String str = JSON.toJSONString(list);
                    // build the message to send
                    ProducerRecord<String, String> record = new ProducerRecord<String, String>(topic, str);
                    // send the message
                    kafkaProducer.send(record);
                    // log the send
                    System.out.println("batchSize 5000 send success from producer");
                    // clear the buffer
                    list.clear();
                }
            }

            // send the remaining rows (fewer than 5000)
            if (list.size() > 0) {
                String str = JSON.toJSONString(list);
                // build the message to send
                ProducerRecord<String, String> record = new ProducerRecord<String, String>(topic, str);
                // send the message
                kafkaProducer.send(record);
                // log the send
                System.out.println("batchSize " + list.size() + " send success from producer");
                // clear the buffer
                list.clear();
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // close the producer client and the JDBC resources
            kafkaProducer.close();
            ImportRDBMSData.closeAllConn(stmt, conn);
        }
    }
}
Writing in batches avoids the data loss and server crashes that performance problems can cause, and is basically enough to handle writing large volumes of Hive table data.
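One caveat: kafkaProducer.send(record) above is fire-and-forget, so with acks=1 a batch that still fails after the configured retries is never reported back to the caller. A small sketch, assuming the same producer and record types as above, of how a callback could surface failed batches (BatchSender is a hypothetical helper, not part of the original code):

import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.clients.producer.RecordMetadata;

// Hypothetical helper: send one batch and report the outcome through a callback,
// so a failed batch is logged (and could be re-queued) instead of dropped silently.
public class BatchSender {

    public static void sendBatch(KafkaProducer<String, String> producer,
                                 ProducerRecord<String, String> record) {
        producer.send(record, (RecordMetadata metadata, Exception exception) -> {
            if (exception != null) {
                // the batch could not be written after all retries
                System.err.println("batch send failed: " + exception.getMessage());
            } else {
                System.out.println("batch sent to " + metadata.topic()
                        + "-" + metadata.partition() + " at offset " + metadata.offset());
            }
        });
    }
}

On the consuming side, the Spark Streaming job below reads these JSON batches from Kafka and writes them out.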
package com.zhbr.dataImport.test

import kafka.serializer.StringDecoder
import org.apache.spark.sql.execution.datasources.jdbc.JDBCOptions
import org.apache.spark.sql.{SaveMode, SparkSession}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

object Kafka_to_Hive {

  def main(args: Array[String]): Unit = {

    // build the SparkSession
    val spark = SparkSession.builder().appName(this.getClass.getSimpleName.filter(!_.equals('$')))
      .master("local[4]").config("spark.streaming.receiver.writeAheadLog.enable", "true").getOrCreate()

    // get the SparkContext
    val sc = spark.sparkContext

    // set the log level
    sc.setLogLevel("WARN")

    val ssc: StreamingContext = new StreamingContext(sc, Seconds(5))

    // set the checkpoint directory; in production it usually points to HDFS,
    // whose fault tolerance and replication help guard against data loss
    ssc.checkpoint("./kafka-chk2")

    // Kafka parameters
    val kafkaParams = Map("metadata.broker.list" -> "node01:9092,node02:9092,node03:9092", "group.id" -> "group1")

    // topics to subscribe to
    val topics = Set("hive2kafka2")

    // create the direct stream
    val data: InputDStream[(String, String)] = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)

    // the payload is the second element of each (key, value) tuple
    val realData: DStream[String] = data.map(x => x._2)

    realData.map(record => record.toString).foreachRDD(rdd => {
      import spark.implicits._
      val df = spark.read.json(spark.createDataset(rdd))

      // write to MySQL
      df.write.mode(SaveMode.Append).format("jdbc")
        .option(JDBCOptions.JDBC_URL, "jdbc:mysql://localhost:3306/test11")
        .option("user", "root")
        .option("password", "123")
        .option(JDBCOptions.JDBC_TABLE_NAME, "lsb_copy")
        .save()

      // write to Hive
      //df.createTempView("df_tmp")
      //spark.sql("insert into table df_copy select * from df_tmp")
    })

    // start the streaming computation
    ssc.start()
    ssc.awaitTermination()
  }
}
Suggestions for simpler and faster approaches are very welcome.