Spark Read/Write HBase: Using Spark's Built-in API and Bulk Load to Import Large Amounts of Data into HBase

Reference blog: https://www.jianshu.com/p/b6c5a5ba30af

Problems encountered:

1. java.lang.ClassCastException: scala.collection.immutable.$colon$colon cannot be cast to org.apache.hadoop.hbase.Cell

20/08/06 10:31:20 ERROR Utils: Aborting task
java.lang.ClassCastException: scala.collection.immutable.$colon$colon cannot be cast to org.apache.hadoop.hbase.Cell
	at org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2$1.write(HFileOutputFormat2.java:152)
	at org.apache.spark.internal.io.HadoopMapReduceWriteConfigUtil.write(SparkHadoopWriter.scala:352)
	at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$4.apply(SparkHadoopWriter.scala:126)
	at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$4.apply(SparkHadoopWriter.scala:123)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1411)
	at org.apache.spark.internal.io.SparkHadoopWriter$.org$apache$spark$internal$io$SparkHadoopWriter$$executeTask(SparkHadoopWriter.scala:135)
	at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:79)
	at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:78)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)
20/08/06 10:31:20 WARN FileOutputCommitter: Could not delete hdfs://hadoop001:8020/data/tmp/t_personas_user_tags_collect/_temporary/0/_temporary/attempt_20200806103103_0020_r_000000_0
20/08/06 10:31:20 ERROR SparkHadoopWriter: Task attempt_20200806103103_0020_r_000000_0 aborted.
20/08/06 10:31:20 ERROR Executor: Exception in task 0.0 in stage 3.0 (TID 9)
org.apache.spark.SparkException: Task failed while writing rows
	at org.apache.spark.internal.io.SparkHadoopWriter$.org$apache$spark$internal$io$SparkHadoopWriter$$executeTask(SparkHadoopWriter.scala:151)
	at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:79)
	at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:78)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.ClassCastException: scala.collection.immutable.$colon$colon cannot be cast to org.apache.hadoop.hbase.Cell
	at org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2$1.write(HFileOutputFormat2.java:152)
	at org.apache.spark.internal.io.HadoopMapReduceWriteConfigUtil.write(SparkHadoopWriter.scala:352)
	at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$4.apply(SparkHadoopWriter.scala:126)
	at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$4.apply(SparkHadoopWriter.scala:123)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1411)
	at org.apache.spark.internal.io.SparkHadoopWriter$.org$apache$spark$internal$io$SparkHadoopWriter$$executeTask(SparkHadoopWriter.scala:135)
	... 8 more
20/08/06 10:31:20 INFO TaskSetManager: Starting task 1.0 in stage 3.0 (TID 10, localhost, executor driver, partition 1, ANY, 7754 bytes)
20/08/06 10:31:20 INFO Executor: Running task 1.0 in stage 3.0 (TID 10)
20/08/06 10:31:20 WARN TaskSetManager: Lost task 0.0 in stage 3.0 (TID 9, localhost, executor driver): org.apache.spark.SparkException: Task failed while writing rows
	at org.apache.spark.internal.io.SparkHadoopWriter$.org$apache$spark$internal$io$SparkHadoopWriter$$executeTask(SparkHadoopWriter.scala:151)
	at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:79)
	at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:78)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:109)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.ClassCastException: scala.collection.immutable.$colon$colon cannot be cast to org.apache.hadoop.hbase.Cell
	at org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2$1.write(HFileOutputFormat2.java:152)
	at org.apache.spark.internal.io.HadoopMapReduceWriteConfigUtil.write(SparkHadoopWriter.scala:352)
	at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$4.apply(SparkHadoopWriter.scala:126)
	at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$4.apply(SparkHadoopWriter.scala:123)
	at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1411)
	at org.apache.spark.internal.io.SparkHadoopWriter$.org$apache$spark$internal$io$SparkHadoopWriter$$executeTask(SparkHadoopWriter.scala:135)
	... 8 more

Solution:

/**
  * Core code
  */
// Use Spark to write the data into HBase
val dataDF: DataFrame = SparkETL2HBase(spark)
dataDF.show(false)
// Get the column names; the first one is the rowkey column "key"
var columnsName: Array[String] = dataDF.columns
// Drop the key column and sort the remaining names (qualifiers must be written in sorted order)
columnsName = columnsName.drop(1).sorted

val data: RDD[(ImmutableBytesWritable, Seq[KeyValue])] = dataDF.rdd.map(x => {
  var kvlist: Seq[KeyValue] = List()
  var rowkey: Array[Byte] = null
  // column name
  var cn: Array[Byte] = null
  // column value
  var v: Array[Byte] = null
  var kv: KeyValue = null
  /* get the rowkey and the column family */
  rowkey = Bytes.toBytes(x.getAs[String]("key")) // rowkey
  val cf: Array[Byte] = Bytes.toBytes("cf") // column family

  // Build one KeyValue per data column (columnsName already has the key dropped and is sorted)
  for (i <- columnsName.indices) {
    cn = columnsName(i).getBytes() // column name
    v = Bytes.toBytes(x.getAs[String](columnsName(i))) // column value
    // Convert the row into the format the HFile needs: the HFile key is an ImmutableBytesWritable,
    // so the RDD must also be keyed by ImmutableBytesWritable instances
    kv = new KeyValue(rowkey, cf, cn, v) // wrap rowkey, cf, columnName, value
    kvlist = kvlist :+ kv // append the new kv at the end (order matters: the list must stay sorted)
  }

  //      // Values to insert into HBase (superseded by the loop above; iterating the columns is less error-prone)
  //      val cn1 = Bytes.toBytes("create_date") //val cn1 = Bytes.toBytes("columnName1")
  //      val v1 = Bytes.toBytes(x.getAs("create_date").toString) //val value1 = Bytes.toBytes(x.getString(1))
  //      val kv1: KeyValue = new KeyValue(rowkey, cf, cn1, v1)
  //      kvlist = kvlist :+ kv1
  //
  //      val cn2 = Bytes.toBytes("user_id") //val cn2 = Bytes.toBytes("columnName2")
  //      val v2 = Bytes.toBytes(x.getAs("user_id").toString) //val value2 = Bytes.toBytes(x.getString(2))
  //      val kv2: KeyValue = new KeyValue(rowkey, cf, cn2, v2)
  //      kvlist = kvlist :+ kv2
  //
  //      val cn3 = Bytes.toBytes("user_tag_ids") //val cn3 = Bytes.toBytes("columnName3")
  //      val v3 = Bytes.toBytes(x.getAs("user_tag_ids").toString) //val value3 = Bytes.toBytes(x.getString(3))
  //      val kv3: KeyValue = new KeyValue(rowkey, cf, cn3, v3)
  //      kvlist = kvlist :+ kv3
  //
  //      val cn4 = Bytes.toBytes("user_tag_names") //val cn4 = Bytes.toBytes("columnName4")
  //      val v4 = Bytes.toBytes(x.getAs("user_tag_names").toString) //val value4 = Bytes.toBytes(x.getString(4))
  //      val kv4: KeyValue = new KeyValue(rowkey, cf, cn4, v4)
  //      kvlist = kvlist :+ kv4
  //
  //      val cn5 = Bytes.toBytes("business_type") //val cn5 = Bytes.toBytes("columnName5")
  //      val v5 = Bytes.toBytes(x.getAs("business_type").toString) //val value5 = Bytes.toBytes(x.getString(5))
  //      val kv5: KeyValue = new KeyValue(rowkey, cf, cn5, v5)
  //      kvlist = kvlist :+ kv5
  //
  //      val cn6 = Bytes.toBytes("data_source_group") //val cn6 = Bytes.toBytes("columnName6")
  //      val v6 = Bytes.toBytes(x.getAs("data_source_group").toString) //val value6 = Bytes.toBytes(x.getString(6))
  //      val kv6: KeyValue = new KeyValue(rowkey, cf, cn6, v6)
  //      kvlist = kvlist :+ kv6

  // Return the (rowkey, KeyValue list) tuple
  (new ImmutableBytesWritable(rowkey), kvlist)
})

// Here the Seq[KeyValue] must be flattened into individual KeyValues,
// and the data going into the HFile must be sorted by rowkey, otherwise:
// java.io.IOException: Added a key not lexically larger than previous.
val dataRDD: RDD[(ImmutableBytesWritable, KeyValue)] = data
  .flatMapValues(_.iterator)
  .sortBy(x => x._1, ascending = true)

// The downstream bulk-load step expects RDD[(ImmutableBytesWritable, KeyValue)]
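
For completeness, here is a sketch of the driver steps that follow (saving dataRDD as HFiles and loading them into HBase); they are not part of the snippet above. It assumes the HBase 1.x/2.x Connection API; hbaseConf is assumed to be a Configuration built with HBaseConfiguration.create(), and the table name and staging path are taken from the log above, so adjust them to your environment. LoadIncrementalHFiles is deprecated in newer HBase 2.x releases in favor of BulkLoadHFiles, but it still works:

import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.{KeyValue, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{HFileOutputFormat2, LoadIncrementalHFiles}
import org.apache.hadoop.mapreduce.Job

val tableName = TableName.valueOf("t_personas_user_tags_collect") // table name as seen in the log
val stagingDir = "/data/tmp/t_personas_user_tags_collect"         // HDFS staging path as seen in the log

val conn = ConnectionFactory.createConnection(hbaseConf)
val table = conn.getTable(tableName)
val regionLocator = conn.getRegionLocator(tableName)
val admin = conn.getAdmin

// configureIncrementalLoad wires up the HFile output format, partitioner and comparators
val job = Job.getInstance(hbaseConf)
job.setMapOutputKeyClass(classOf[ImmutableBytesWritable])
job.setMapOutputValueClass(classOf[KeyValue])
HFileOutputFormat2.configureIncrementalLoad(job, table, regionLocator)

// Write the sorted RDD as HFiles, then hand them over to HBase
dataRDD.saveAsNewAPIHadoopFile(
  stagingDir,
  classOf[ImmutableBytesWritable],
  classOf[KeyValue],
  classOf[HFileOutputFormat2],
  job.getConfiguration)

new LoadIncrementalHFiles(job.getConfiguration).doBulkLoad(new Path(stagingDir), admin, table, regionLocator)
conn.close()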

2. java.io.IOException: Added a key not lexically larger than previous. Current cell = 851109b18cfa028bfc6cc09164d15eec/cf:business_type/1596682686231/Put/vlen=1/seqid=0, lastCell = 851109b18cfa028bfc6cc09164d15eec/cf:user_tag_names/1596682686231/Put/vlen=26/seqid=0

(1) Reference blog: https://blog.csdn.net/haibucuoba/article/details/106414867

  • Cause: the data written into an HFile must be sorted by rowkey, otherwise you get "java.io.IOException: Added a key not lexically larger than previous." Newer HBase versions require not only the rowkeys but also the columns (qualifiers) to be sorted, so sortByKey alone does not fix it; see the code in problem 1 above (and the sketch below) for the concrete fix.
  • HBase splits the generated HFiles along the existing region boundaries. If the data is spread out enough, each region triggers a split, and every split counts as one retry. HBase's default retry count is 10, so once more than 10 splits are needed the job fails with the exception above.

Solution: set the retry count to 0, which means unlimited retries.

hbaseConf.set("hbase.bulkload.retries.number", "0")

(2) Reference blogs:

  • Writing HFiles with MapReduce and bulk loading into HBase via doBulkLoad (using HFileOutputFormat2.configureIncrementalLoad, not recommended)

https://blog.csdn.net/ITwangnengjie/article/details/103175909

  • Writing HFiles with MapReduce and bulk loading into HBase via doBulkLoad (using HFileOutputFormat2.configureIncrementalLoadMap, recommended)

https://blog.csdn.net/ITwangnengjie/article/details/103194518

When using configureIncrementalLoadMap, the output value type is KeyValue and the reducer is set to org.apache.hadoop.hbase.mapreduce.KeyValueSortReducer, the KeyValue-sorting reducer that ships with HBase. We had planned to write our own, but while using HFileOutputFormat2.configureIncrementalLoad we noticed it already wires in this reducer, so we used it directly. Note that with configureIncrementalLoadMap, not only the rowkeys but also the KeyValues must be sorted (in practice, the column qualifiers within a column family must be sorted, i.e. the data has to be written in qualifier order), otherwise you get the following error:

Error: java.io.IOException: Added a key not lexically larger than previous. Current cell = xxxxxxxxxx_574747147/common:00000002/1572736500000/Put/vlen=16/seqid=0, lastCell = xxxxxxxxxx_574747147/common:000000001/1572736500000/Put/vlen=16/seqid=0
	at org.apache.hadoop.hbase.io.hfile.AbstractHFileWriter.checkKey(AbstractHFileWriter.java:204)
	at org.apache.hadoop.hbase.io.hfile.HFileWriterV2.append(HFileWriterV2.java:265)
	at org.apache.hadoop.hbase.regionserver.StoreFile$Writer.append(StoreFile.java:994)
	at org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2$1.write(HFileOutputFormat2.java:199)
	at org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2$1.write(HFileOutputFormat2.java:152)
	at org.apache.hadoop.mapred.ReduceTask$NewTrackingRecordWriter.write(ReduceTask.java:558)
	at org.apache.hadoop.mapreduce.task.TaskInputOutputContextImpl.write(TaskInputOutputContextImpl.java:89)
	at org.apache.hadoop.mapreduce.lib.reduce.WrappedReducer$Context.write(WrappedReducer.java:105)
	at com.xxx.xxx.xxx.xxx.usepartition.five_min_qu_to_hbase.FiveMinQuMRT2$ToHFileReducer.reduce(FiveMinQuMRT2.java:220)
	at com.xxx.xxx.xxx.xxx.usepartition.five_min_qu_to_hbase.FiveMinQuMRT2$ToHFileReducer.reduce(FiveMinQuMRT2.java:211)
	at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:171)
	at org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:627)
	at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:389)
	at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
	at java.security.AccessController.doPrivileged(Native Method)
	at javax.security.auth.Subject.doAs(Subject.java:422)
	at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1917)
	at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)

This happens because the KeyValue of the Cell currently being written sorts lower than a Cell that has already been written. We also set the number of reducers with job.setNumReduceTasks(200). In particular, the following serialization settings have to be added to the configuration manually:

// Spark 2.x style (on the SparkSession builder)
.config("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," +
  "org.apache.hadoop.io.serializer.WritableSerialization," +
  "org.apache.hadoop.hbase.mapreduce.KeyValueSerialization," +
  "org.apache.hadoop.hbase.mapreduce.MutationSerialization," +
  "org.apache.hadoop.hbase.mapreduce.ResultSerialization")

// Spark 1.x style (on the Hadoop Configuration)
conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," +
        "org.apache.hadoop.io.serializer.WritableSerialization," +
        "org.apache.hadoop.hbase.mapreduce.KeyValueSerialization," +
        "org.apache.hadoop.hbase.mapreduce.MutationSerialization," +
        "org.apache.hadoop.hbase.mapreduce.ResultSerialization")

Without these settings the job fails. We also rely on an overwrite-on-write strategy so that each column keeps only the last value of a five-minute window: the data time in the rowkey is the timestamp rounded to a five-minute boundary, while each Cell's timestamp is the real event time, so when the same rowkey receives multiple values for a Cell, only the newest one is kept.

3. java.io.NotSerializableException: org.apache.hadoop.hbase.io.ImmutableBytesWritable

Reference blog: https://www.iteblog.com/archives/2081.html

Solution:

// Specify the serializer; the default is Java serialization
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
//sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
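
For reference, a minimal sketch of building the SparkSession with Kryo enabled and the HBase classes registered explicitly (registration is optional but keeps the serialized output compact; the app name is illustrative):

import org.apache.hadoop.hbase.KeyValue
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

val sparkConf = new SparkConf()
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  // Register the HBase classes that travel through Spark's shuffle
  .registerKryoClasses(Array(classOf[ImmutableBytesWritable], classOf[KeyValue]))

val spark = SparkSession.builder()
  .appName("Spark2HBaseBulkLoad") // illustrative name
  .config(sparkConf)
  .getOrCreate()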

Error message:

20/08/06 14:54:13 ERROR Executor: Exception in task 0.0 in stage 3.0 (TID 9)
java.io.NotSerializableException: org.apache.hadoop.hbase.io.ImmutableBytesWritable
Serialization stack:
	- object not serializable (class: org.apache.hadoop.hbase.io.ImmutableBytesWritable, value: 32 34 64 39 38 35 32 64 30 37 35 63 37 30 31 31 64 38 37 30 38 65 63 31 39 61 36 65 63 38 39 38)
	- element of array (index: 0)
	- array (class [Lorg.apache.hadoop.hbase.io.ImmutableBytesWritable;, size 60)
	- field (class: scala.Tuple3, name: _3, type: class java.lang.Object)
	- object (class scala.Tuple3, (0,62325,[Lorg.apache.hadoop.hbase.io.ImmutableBytesWritable;@5bbe0f7b))
	- element of array (index: 0)
	- array (class [Lscala.Tuple3;, size 1)
	at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
	at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
	at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:393)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

4. Trying to load more than 32 hfiles to family cf of region with start key

Cause: during a bulk load, HBase allows at most 32 HFiles per column family per region by default. When more HFiles than that are produced, the following error is thrown:

ERROR LoadIncrementalHFiles: Trying to load more than 32 hfiles to family cf of region with start key 
Exception in thread "main" java.io.IOException: Trying to load more than 32 hfiles to one family of one region

Solutions:

  • In production you normally cannot change the configuration files at will, so the options can be set in code instead:
hbaseConf.setInt("hbase.mapreduce.bulkload.max.hfiles.perRegion.perFamily", 3200)
hbaseConf.set("hbase.hregion.max.filesize", "10737418240")

For a permanent change, add the following to hbase-site.xml:


<property>
    <name>hbase.mapreduce.bulkload.max.hfiles.perRegion.perFamily</name>
    <value>3200</value>
</property>
<property>
    <name>hbase.hregion.max.filesize</name>
    <value>10737418240</value>
</property>
