Using Spark Bulk Load to Import Large Amounts of Data into HBase: Problems Encountered and Solutions

1. Jar dependencies involved


<properties>
    <spark.version>2.3.2</spark.version>
    <java.version>1.8</java.version>
    <scala.version>2.11.8</scala.version>
    <scala.main.version>2.11</scala.main.version>
    <zookeeper.version>3.5.4</zookeeper.version> <!-- property name inferred -->
    <slf4j.version>1.7.5</slf4j.version>         <!-- property name inferred -->
    <hbase.version>1.4.11</hbase.version>
</properties>

<dependencies>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_${scala.main.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase</artifactId>
        <version>${hbase.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-client</artifactId>
        <version>${hbase.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-server</artifactId>
        <version>${hbase.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-common</artifactId>
        <version>${hbase.version}</version>
    </dependency>

    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.47</version>
    </dependency>
</dependencies>

2. Reading Hive data with Spark and writing it to HBase via Bulk Load


import com.bj58.bic.fx.utils.MD5Util
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hbase.{HBaseConfiguration, HConstants, KeyValue, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, HTable, Table}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{HFileOutputFormat2, LoadIncrementalHFiles, TableOutputFormat}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.slf4j.LoggerFactory

/**
  * @author hao
  * @create 2020-05-26 11:51
  */
object TestConfig {

  val logger = LoggerFactory.getLogger(this.getClass)

  def main(args: Array[String]): Unit = {

    val tableName = ""
    val prdtLine = ""
    val execDate = ""
    val hdfsRootPath = "hdfs://xxx.xxx.xxx.:9000/home/hbase"
    val zookeeperQuorum = "10.162.xxx.201:2181,10.162.xxx.111:2181,10.162.xxx.213:2181"
    val hFilePath = "/home/hbase_bulkimport//"
    val hbasetableName = "hdp_bic_bd:hbase_table"
    val familyName = "cf1"

    logger.info("beginning ....")

    // 1. Basic configuration
    val hadoopConf = new Configuration()
    hadoopConf.set("fs.defaultFS", hdfsRootPath)
    val fileSystem = FileSystem.get(hadoopConf)
    val hbaseConf = HBaseConfiguration.create(hadoopConf)
    hbaseConf.set(HConstants.ZOOKEEPER_QUORUM, zookeeperQuorum)
    hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, hbasetableName)
    val hbaseConn = ConnectionFactory.createConnection(hbaseConf)
    val admin = hbaseConn.getAdmin

    // 2. Delete the output directory left over from a previous run
    if (fileSystem.exists(new Path(hFilePath))) {
      fileSystem.delete(new Path(hFilePath), true)
    }

    // 3. Initialize the SparkSession
    val sparkSession: SparkSession = SparkSession.builder()
      .appName("HiveTableDetail2Hbase")
      .enableHiveSupport()
      .getOrCreate()

    // 4. Read the Hive data
    val detailsDF: DataFrame = sparkSession.sql("select * from hiveTable ")

    // 5. Requirement: sync every column of the Hive table to HBase
    val columns: Array[String] = detailsDF.columns
    val data: RDD[(ImmutableBytesWritable, KeyValue)] = detailsDF.rdd
      .map(row => {
        // Use an MD5 hash of user_id to disperse the rowkeys
        val rowkey = MD5Util.getMD5(row.getAs[String]("user_id"), false, 16)
        columns.map(col => {
          var value: String = ""
          val index: Int = row.fieldIndex(col)
          if (!row.isNullAt(index)) {
              value = String.valueOf(row.getAs[Object](index))
          }
          (rowkey, (col, value))
        })
      })
      .flatMap(arr => arr)
      // Newer versions require sorting not only by rowkey but also by column qualifier
      .sortBy(x => (x._1, x._2._1), true, 4)
      .map(e => {
        val kv = new KeyValue(Bytes.toBytes(e._1), Bytes.toBytes(familyName), Bytes.toBytes(e._2._1), System.currentTimeMillis(), Bytes.toBytes(e._2._2))
        (new ImmutableBytesWritable(Bytes.toBytes(e._1)), kv)
      })


    // 6. Save the HFiles to HDFS
    val job = Job.getInstance(hbaseConf)
    val table: Table = hbaseConn.getTable(TableName.valueOf(hbasetableName))
    job.setMapOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setMapOutputValueClass(classOf[KeyValue])
    HFileOutputFormat2.configureIncrementalLoadMap(job, table)

    job.getConfiguration.set("mapred.output.dir", hFilePath)

    data.saveAsNewAPIHadoopDataset(job.getConfiguration)
    // The following API can also be used:
    //    data.saveAsNewAPIHadoopFile(hFilePath,classOf[ImmutableBytesWritable],classOf[KeyValue],classOf[HFileOutputFormat2],hbaseConf)

    // 7. Bulk load the HFiles into HBase
    val bulkLoader = new LoadIncrementalHFiles(hbaseConf)
    val regionLocator = hbaseConn.getRegionLocator(TableName.valueOf(hbasetableName))
    val hTable = new HTable(hbaseConf, TableName.valueOf(hbasetableName))
    bulkLoader.doBulkLoad(new Path(hFilePath), hTable)
    // The following API can also be used:
    //    bulkLoader.doBulkLoad(new Path(hFilePath), admin, table, regionLocator)

    // Delete the HFile path
    if (fileSystem.exists(new Path(hFilePath))) {
      fileSystem.delete(new Path(hFilePath), true)
    }
    hbaseConn.close()
    fileSystem.close()
    sparkSession.stop()
  }
}
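A note on MD5Util: com.bj58.bic.fx.utils.MD5Util is an internal utility class and is not shown in this post. A rough stand-in, assuming getMD5(input, upperCase, length) returns the hex MD5 digest truncated to length characters (the signature is an assumption based on the call above), could look like this:

import java.security.MessageDigest

// Hypothetical stand-in for the internal MD5Util used above; the real class and
// its exact semantics are not included in the post.
object MD5Util {
  def getMD5(input: String, upperCase: Boolean, length: Int): String = {
    val digest = MessageDigest.getInstance("MD5").digest(input.getBytes("UTF-8"))
    val hex = digest.map("%02x".format(_)).mkString // 32-character hex digest
    val truncated = hex.take(length)                // keep only the leading `length` characters
    if (upperCase) truncated.toUpperCase else truncated
  }
}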

3. Problems encountered

1) Exception

java.io.IOException: Added a key not lexically larger than previous. Current cell = 152520177093/cf:categroy_id/1542428767383/Put/vlen=6/seqid=0, lastCell = 152520177093/cf:product_name/1542428767383/Put/vlen=72/seqid=0

Cause: cells must be sorted before HFiles are written. In newer versions the cells must be sorted not only by rowkey but also by column qualifier, so a plain sortByKey on the rowkey alone does not fix this. See the sortBy on the composite key in step 5 of the code above, and the sketch below.
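As a minimal, stand-alone sketch of that fix, assuming a pair RDD of (rowkey, (qualifier, value)) as built in step 5 above (the helper name toSortedKeyValues is made up for illustration):

import org.apache.hadoop.hbase.KeyValue
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.rdd.RDD

// Sort by the composite key (rowkey, qualifier) so that HFileOutputFormat2
// never receives a cell that is lexically smaller than the previous one.
def toSortedKeyValues(cells: RDD[(String, (String, String))],
                      familyName: String): RDD[(ImmutableBytesWritable, KeyValue)] = {
  cells
    .sortBy({ case (rowkey, (qualifier, _)) => (rowkey, qualifier) }, ascending = true)
    .map { case (rowkey, (qualifier, value)) =>
      val kv = new KeyValue(Bytes.toBytes(rowkey), Bytes.toBytes(familyName),
        Bytes.toBytes(qualifier), Bytes.toBytes(value))
      (new ImmutableBytesWritable(Bytes.toBytes(rowkey)), kv)
    }
}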

2) Exception while the Spark job was running

Cause of the retry failures:

[Screenshot: bulk load retry failure log]

HBase splits the generated HFiles against the existing region boundaries. When the data is dispersed widely enough, one split is performed per region, and each split counts as one retry. HBase's default retry count is 10, so once the number of splits exceeds 10 the job fails with the exception above.

Solution: set the retry count to 0, which tells HBase to retry without limit.

hbaseConf.set("hbase.bulkload.retries.number", "0")
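In the program above, this setting goes with the other hbaseConf.set calls, before the connection is created:

// Configure bulk-load retries together with the rest of the HBase settings,
// before ConnectionFactory.createConnection(hbaseConf) is called.
val hbaseConf = HBaseConfiguration.create(hadoopConf)
hbaseConf.set(HConstants.ZOOKEEPER_QUORUM, zookeeperQuorum)
hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, hbasetableName)
hbaseConf.set("hbase.bulkload.retries.number", "0") // 0 = retry without limit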

That wraps up the pitfalls I ran into. Whoops, a certain little fairy is looking for me, gotta run.
