1. Batch write with saveAsNewAPIHadoopDataset (for datasets below the tens-of-millions scale)
import org.apache.hadoop.hbase.client._
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object ReadHive2Hbase3 {

  /**
   * Normalize null/empty fields to the literal string "null".
   *
   * @param str raw column value
   * @return a non-null string
   */
  def nullHandle(str: String): String = {
    if (str == null || "".equals(str)) "null" else str
  }

  def main(args: Array[String]): Unit = {
    val dateTime = args(0)

    // val conf = new SparkConf().setMaster("local[*]")
    val conf = new SparkConf()
    // Hive metastore (MySQL) connection settings
    conf.set("javax.jdo.option.ConnectionURL",
      "jdbc:mysql://mysql.hadoop:5480/hive?createDatabaseIfNotExist=true&characterEncoding=UTF-8")
    conf.set("javax.jdo.option.ConnectionDriverName", "com.mysql.jdbc.Driver")
    conf.set("javax.jdo.option.ConnectionUserName", "root")
    conf.set("javax.jdo.option.ConnectionPassword", "D5u8SS+qCbT8")

    val spark = SparkSession
      .builder()
      .config(conf)
      .enableHiveSupport()
      .config("spark.sql.warehouse.dir", "spark-warehouse")
      .getOrCreate()
    val sc = spark.sparkContext

    spark.sql("use aijiami")
    // val dateTime = "20190718"

    // Read from Hive. The data sits on HDFS; the table here is external,
    // but a managed (internal) table works exactly the same way.
    val hiveData = spark.sql("SELECT " +
      "deviceId," +
      "shortappkey," +
      "province," +
      "city," +
      "factory," +
      "phoneOS," +
      "networkType," +
      "deviceScreenStr," +
      "operateTypeStr," +
      "appKey," +
      "user_name," +
      "user_sex," +
      "user_age," +
      "user_education," +
      "user_occupation," +
      "source_channel," +
      "urlTime " +
      // "from ods_event_detail where dt = " + dateTime + " limit 10")
      "from ods_event_detail where dt = " + dateTime)
    // hiveData.show(10)

    // HBase output configuration; the client locates the master through ZooKeeper,
    // so hbase.master does not need to be set here.
    val hconf = sc.hadoopConfiguration
    // hconf.set("hbase.master", "hdfs://node1.hadoop")
    hconf.set("hbase.zookeeper.quorum", "172.10.4.xx,172.10.4.xx,172.10.4.xx")
    hconf.set("hbase.zookeeper.property.clientPort", "2181")
    hconf.set(TableOutputFormat.OUTPUT_TABLE, "wxgz_user_data")

    val job = Job.getInstance(hconf)
    job.setOutputKeyClass(classOf[ImmutableBytesWritable])
    // TableOutputFormat writes Mutations, so the value class is Put (not Result)
    job.setOutputValueClass(classOf[Put])
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
    job.setJobName("hive2hbase")

    hiveData.rdd.map(row => {
      val shortappkey = nullHandle(row.getAs[String]("shortappkey"))
      val deviceId = nullHandle(row.getAs[String]("deviceId"))
      val rowkey = HbaseRowKeyUtil.getRowKey(shortappkey, deviceId)
      val put = new Put(Bytes.toBytes(rowkey)) // the Put is keyed by the rowkey
      put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("deviceId"), Bytes.toBytes(deviceId))
      (new ImmutableBytesWritable, put) // return (key, Put) tuples
    }).saveAsNewAPIHadoopDataset(job.getConfiguration) // write into HBase

    spark.close()
  }
}
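Both listings in this post call HbaseRowKeyUtil.getRowKey(shortappkey, deviceId), a helper that is not shown in the original code. A minimal sketch of what such a helper might look like, assuming the intent is a salted rowkey that spreads devices of one app across regions (the hashing scheme below is an assumption, not the original implementation):

import org.apache.hadoop.hbase.util.MD5Hash

object HbaseRowKeyUtil {
  /**
   * Hypothetical rowkey builder: prefix with a short hash of the deviceId so that
   * rows belonging to a single app do not all land in the same region.
   */
  def getRowKey(shortappkey: String, deviceId: String): String = {
    val salt = MD5Hash.getMD5AsHex(deviceId.getBytes("UTF-8")).substring(0, 4)
    s"$salt|$shortappkey|$deviceId"
  }
}

Whatever scheme is actually used, it must be deterministic and identical in both jobs, because the bulk-load variant below sorts the data by the generated rowkey.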
2. Write via bulk load (for data at the hundreds-of-millions scale):
package com.dianyou

import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{HFileOutputFormat2, LoadIncrementalHFiles}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, KeyValue, TableName}
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession

object ReadHive2Hbase {

  /**
   * Normalize null/empty fields to the literal string "null".
   *
   * @param str raw column value
   * @return a non-null string
   */
  def nullHandle(str: String): String = {
    if (str == null || "".equals(str)) "null" else str
  }

  def main(args: Array[String]): Unit = {
    val conf2 = new SparkConf().setMaster("local[*]")
    // Hive metastore (MySQL) connection settings
    conf2.set("javax.jdo.option.ConnectionURL",
      "jdbc:mysql://mysql.hadoop:5480/hive?createDatabaseIfNotExist=true&characterEncoding=UTF-8")
    conf2.set("javax.jdo.option.ConnectionDriverName", "com.mysql.jdbc.Driver")
    conf2.set("javax.jdo.option.ConnectionUserName", "root")
    conf2.set("javax.jdo.option.ConnectionPassword", "D5u8SS+qCbT8")

    val spark = SparkSession
      .builder()
      .config(conf2)
      .enableHiveSupport()
      .config("spark.sql.warehouse.dir", "spark-warehouse")
      .getOrCreate()

    spark.sql("use aijiami")
    val dateTime = "20190718"

    // Read from Hive. The data sits on HDFS; the table here is external,
    // but a managed (internal) table works exactly the same way.
    val hiveData = spark.sql("SELECT " +
      "deviceId," +
      "shortappkey," +
      "province," +
      "city," +
      "factory," +
      "phoneOS," +
      "networkType," +
      "deviceScreenStr," +
      "operateTypeStr," +
      "appKey," +
      "user_name," +
      "user_sex," +
      "user_age," +
      "user_education," +
      "user_occupation," +
      "source_channel," +
      "urlTime " +
      "from ods_event_detail where dt = " + dateTime + " limit 10")
    hiveData.show(10)

    // Explode each row into (rowkey, (family, qualifier, value)) tuples.
    // Rows without a deviceId cannot form a rowkey and are skipped by emitting an
    // empty array (returning null from flatMap would fail at runtime).
    val dataRdd: RDD[(String, (String, String, String))] = hiveData.rdd.flatMap(row => {
      val shortappkey = nullHandle(row.getAs[String]("shortappkey"))
      val deviceId = nullHandle(row.getAs[String]("deviceId"))
      if (!"null".equals(deviceId)) {
        val rowkey = HbaseRowKeyUtil.getRowKey(shortappkey, deviceId)
        Array(
          (rowkey, ("cf", "deviceId", deviceId)),
          (rowkey, ("cf", "shortappkey", shortappkey)),
          (rowkey, ("cf", "province", nullHandle(row.getAs[String]("province")))),
          (rowkey, ("cf", "city", nullHandle(row.getAs[String]("city")))),
          (rowkey, ("cf", "factory", nullHandle(row.getAs[String]("factory")))),
          (rowkey, ("cf", "phoneOS", nullHandle(row.getAs[String]("phoneOS")))),
          (rowkey, ("cf", "networkType", nullHandle(row.getAs[String]("networkType")))),
          (rowkey, ("cf", "deviceScreenStr", nullHandle(row.getAs[String]("deviceScreenStr")))),
          (rowkey, ("cf", "operateTypeStr", nullHandle(row.getAs[String]("operateTypeStr")))),
          (rowkey, ("cf", "appKey", nullHandle(row.getAs[String]("appKey")))),
          (rowkey, ("cf", "user_name", nullHandle(row.getAs[String]("user_name")))),
          (rowkey, ("cf", "user_sex", nullHandle(row.getAs[String]("user_sex")))),
          (rowkey, ("cf", "user_age", nullHandle(row.getAs[String]("user_age")))),
          (rowkey, ("cf", "user_education", nullHandle(row.getAs[String]("user_education")))),
          (rowkey, ("cf", "user_occupation", nullHandle(row.getAs[String]("user_occupation")))),
          (rowkey, ("cf", "source_channel", nullHandle(row.getAs[String]("source_channel")))),
          (rowkey, ("cf", "urlTime", nullHandle(row.getAs[String]("urlTime"))))
        )
      } else {
        Array.empty[(String, (String, String, String))]
      }
    })

    // HFiles require cells in strict (rowkey, family, qualifier) byte order, so sort
    // before converting. HFileOutputFormat2 expects the RDD key to be an
    // ImmutableBytesWritable and the value to be a KeyValue.
    val rdds = dataRdd
      .sortBy(x => (x._1, x._2._1, x._2._2))
      .map(x => {
        val rowKey = Bytes.toBytes(x._1)
        val family = Bytes.toBytes(x._2._1)
        val column = Bytes.toBytes(x._2._2)
        val value = Bytes.toBytes(x._2._3)
        (new ImmutableBytesWritable(rowKey), new KeyValue(rowKey, family, column, value))
      })

    // Temporary HFile output directory on HDFS
    val tmpdir = "/tmp/test_hbase"

    // HBase client configuration; the master is discovered through ZooKeeper
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "172.10.4.95,172.10.4.96,172.10.4.97")
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    // Raise this limit if the load fails because too many HFiles were generated
    // per region and column family:
    // conf.setInt("hbase.mapreduce.bulkload.max.hfiles.perRegion.perFamily", 1000)

    // Target HBase table
    val tableName = "wxgz_user_data"
    val conn = ConnectionFactory.createConnection(conf)
    val table = conn.getTable(TableName.valueOf(tableName))
    try {
      // Region layout of the target table, used to partition the HFiles
      val regionLocator = conn.getRegionLocator(TableName.valueOf(tableName))

      // The MapReduce Job only serves as a carrier for the HFile output configuration
      val job = Job.getInstance(conf)
      job.setJobName("ReadHive2Hbase")
      // Crucial: the generated HFiles are keyed by ImmutableBytesWritable ...
      job.setMapOutputKeyClass(classOf[ImmutableBytesWritable])
      // ... and hold KeyValue cells
      job.setMapOutputValueClass(classOf[KeyValue])
      // Pull partitioning, compression and bloom-filter settings from the target table
      HFileOutputFormat2.configureIncrementalLoad(job, table, regionLocator)

      // Generate the HFiles under tmpdir using the configuration prepared above
      rdds.saveAsNewAPIHadoopFile(
        tmpdir,
        classOf[ImmutableBytesWritable],
        classOf[KeyValue],
        classOf[HFileOutputFormat2],
        job.getConfiguration)

      // Bulk-load the generated HFiles into HBase; this part is plain HBase API usage
      val load = new LoadIncrementalHFiles(conf)
      load.doBulkLoad(new Path(tmpdir), conn.getAdmin, table, regionLocator)
    } finally {
      table.close()
      conn.close()
    }
    spark.close()
  }
}
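Bulk load works best against a pre-split target table, since configureIncrementalLoad builds one partition per region. The original post does not show how wxgz_user_data was created; a hedged sketch using the HBase Admin API, with split keys chosen to match the hex-salt rowkey scheme assumed earlier:

import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}

object CreatePresplitTable {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "172.10.4.95,172.10.4.96,172.10.4.97")
    conf.set("hbase.zookeeper.property.clientPort", "2181")

    val conn = ConnectionFactory.createConnection(conf)
    val admin = conn.getAdmin
    try {
      val desc = new HTableDescriptor(TableName.valueOf("wxgz_user_data"))
      desc.addFamily(new HColumnDescriptor("cf"))
      // 16 regions split on the first hex character of the salted rowkey
      // (assumed scheme; adjust to the real rowkey layout)
      val splits = "123456789abcdef".map(c => Bytes.toBytes(c.toString)).toArray
      if (!admin.tableExists(desc.getTableName)) {
        admin.createTable(desc, splits)
      }
    } finally {
      admin.close()
      conn.close()
    }
  }
}

If the table already exists with a different split layout, doBulkLoad still works (HFiles that span regions are split during the load); pre-splitting mainly avoids hammering a single region on the first import.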
YARN submission script
/zywa/spark/spark-2.1.1-bin-hadoop2.7/bin/spark-submit \
--class com.dianyou.offline.utils.ReadHive2Hbase \
--master yarn \
--deploy-mode cluster \
--num-executors 16 \
--executor-memory 2g \
--executor-cores 2 \
--driver-cores 2 \
--driver-memory 4g \
--name aaa3 \
--conf spark.default.parallelism=100 \
--conf spark.memory.storageFraction=0.4 \
--conf spark.streaming.unpersist=true \
--conf spark.streaming.backpressure.enabled=true \
--conf spark.streaming.kafka.maxRatePerPartition=1500 \
--conf spark.network.timeout=300 \
--conf spark.streaming.kafka.consumer.poll.ms=30000 \
--conf spark.driver.extraJavaOptions="-Dlog4j.configuration=file:log4j.properties" \
--conf spark.executor.extraJavaOptions="-XX:+UseParNewGC -XX:+UseConcMarkSweepGC -XX:+CMSParallelRemarkEnabled -XX:+ParallelRefProcEnabled -XX:+CMSClassUnloadingEnabled -XX:MaxTenuringThreshold=15 -XX:SurvivorRatio=5 -XX:MaxDirectMemorySize=1g -Dlog4j.configuration=file:log4j.properties" \
--conf spark.yarn.submit.waitAppCompletion=false \
--conf spark.yarn.maxAppAttempts=4 \
--conf spark.yarn.am.attemptFailuresValidityInterval=1h \
--conf spark.yarn.max.executor.failures=16 \
--conf spark.yarn.executor.failuresValidityInterval=1h \
--conf spark.task.maxFailures=8 \
--files /zywa/job/sparkstreaming/config/log4j.properties \
/zywa/job/sparkstreaming/jars/weixieganzhi.jar 20190803