Reference blog: https://www.jianshu.com/p/b6c5a5ba30af
Problem encountered: the Spark job that writes HFiles aborts with a ClassCastException, because the RDD's value side is a Seq[KeyValue] (a Scala List) while HFileOutputFormat2 casts every value to a single Cell; the fix is to flatten the list with flatMapValues, as shown in the core code below.
20/08/06 10:31:20 ERROR Utils: Aborting task
java.lang.ClassCastException: scala.collection.immutable.$colon$colon cannot be cast to org.apache.hadoop.hbase.Cell
at org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2$1.write(HFileOutputFormat2.java:152)
at org.apache.spark.internal.io.HadoopMapReduceWriteConfigUtil.write(SparkHadoopWriter.scala:352)
at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$4.apply(SparkHadoopWriter.scala:126)
at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$4.apply(SparkHadoopWriter.scala:123)
at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1411)
at org.apache.spark.internal.io.SparkHadoopWriter$.org$apache$spark$internal$io$SparkHadoopWriter$$executeTask(SparkHadoopWriter.scala:135)
at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:79)
at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:78)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
20/08/06 10:31:20 WARN FileOutputCommitter: Could not delete hdfs://hadoop001:8020/data/tmp/t_personas_user_tags_collect/_temporary/0/_temporary/attempt_20200806103103_0020_r_000000_0
20/08/06 10:31:20 ERROR SparkHadoopWriter: Task attempt_20200806103103_0020_r_000000_0 aborted.
20/08/06 10:31:20 ERROR Executor: Exception in task 0.0 in stage 3.0 (TID 9)
org.apache.spark.SparkException: Task failed while writing rows
at org.apache.spark.internal.io.SparkHadoopWriter$.org$apache$spark$internal$io$SparkHadoopWriter$$executeTask(SparkHadoopWriter.scala:151)
at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:79)
at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:78)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.ClassCastException: scala.collection.immutable.$colon$colon cannot be cast to org.apache.hadoop.hbase.Cell
at org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2$1.write(HFileOutputFormat2.java:152)
at org.apache.spark.internal.io.HadoopMapReduceWriteConfigUtil.write(SparkHadoopWriter.scala:352)
at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$4.apply(SparkHadoopWriter.scala:126)
at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$4.apply(SparkHadoopWriter.scala:123)
at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1411)
at org.apache.spark.internal.io.SparkHadoopWriter$.org$apache$spark$internal$io$SparkHadoopWriter$$executeTask(SparkHadoopWriter.scala:135)
... 8 more
20/08/06 10:31:20 INFO TaskSetManager: Starting task 1.0 in stage 3.0 (TID 10, localhost, executor driver, partition 1, ANY, 7754 bytes)
20/08/06 10:31:20 INFO Executor: Running task 1.0 in stage 3.0 (TID 10)
20/08/06 10:31:20 WARN TaskSetManager: Lost task 0.0 in stage 3.0 (TID 9, localhost, executor driver): org.apache.spark.SparkException: Task failed while writing rows
at org.apache.spark.internal.io.SparkHadoopWriter$.org$apache$spark$internal$io$SparkHadoopWriter$$executeTask(SparkHadoopWriter.scala:151)
at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:79)
at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$3.apply(SparkHadoopWriter.scala:78)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:345)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
Caused by: java.lang.ClassCastException: scala.collection.immutable.$colon$colon cannot be cast to org.apache.hadoop.hbase.Cell
at org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2$1.write(HFileOutputFormat2.java:152)
at org.apache.spark.internal.io.HadoopMapReduceWriteConfigUtil.write(SparkHadoopWriter.scala:352)
at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$4.apply(SparkHadoopWriter.scala:126)
at org.apache.spark.internal.io.SparkHadoopWriter$$anonfun$4.apply(SparkHadoopWriter.scala:123)
at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1411)
at org.apache.spark.internal.io.SparkHadoopWriter$.org$apache$spark$internal$io$SparkHadoopWriter$$executeTask(SparkHadoopWriter.scala:135)
... 8 more
/**
* Core code
*/
//Write the data into HBase with Spark (bulk load via HFiles)
val dataDF: DataFrame = SparkETL2HBase(spark)
dataDF.show(false)
//Get the column names; the first one is the rowkey column "key"
var columnsName: Array[String] = dataDF.columns
//Drop the key column, then sort the remaining column names (qualifiers must be written in sorted order)
columnsName = columnsName.drop(1).sorted
val data: RDD[(ImmutableBytesWritable, Seq[KeyValue])] = dataDF.rdd.map(x => {
var kvlist: Seq[KeyValue] = List()
var rowkey: Array[Byte] = null
//columnName: the column qualifier
var cn: Array[Byte] = null
//value: the column value
var v: Array[Byte] = null
var kv: KeyValue = null
/* Get the rowkey and the column family name */
rowkey = Bytes.toBytes(x.getAs[String]("key")) //key
val cf: Array[Byte] = Bytes.toBytes("cf") //column family
//Build the values that need to be inserted into HBase
for (i <- columnsName.indices) { //the key column was already dropped above, so iterate over every remaining column
cn = columnsName(i).getBytes() //qualifier name
v = Bytes.toBytes(x.getAs[String](columnsName(i))) //column value
//Convert this row into the format the HFile needs: the HFile key was defined as ImmutableBytesWritable, so the RDD key must also be an ImmutableBytesWritable instance
kv = new KeyValue(rowkey, cf, cn, v) //wrap rowkey, cf, columnName, value
kvlist = kvlist :+ kv //append the new kv to the end of kvlist (do not prepend; the list must stay in qualifier order)
}
// //Values to insert into HBase (replaced by the loop above; iterating over the columns is less error-prone)
// val cn1 = Bytes.toBytes("create_date") //val cn1 = Bytes.toBytes("columnName1")
// val v1 = Bytes.toBytes(x.getAs("create_date").toString) //val value1 = Bytes.toBytes(x.getString(1))
// val kv1: KeyValue = new KeyValue(rowkey, cf, cn1, v1)
// kvlist = kvlist :+ kv1
//
// val cn2 = Bytes.toBytes("user_id") //val cn2 = Bytes.toBytes("columnName2")
// val v2 = Bytes.toBytes(x.getAs("user_id").toString) //val value2 = Bytes.toBytes(x.getString(2))
// val kv2: KeyValue = new KeyValue(rowkey, cf, cn2, v2)
// kvlist = kvlist :+ kv2
//
// val cn3 = Bytes.toBytes("user_tag_ids") //val cn3 = Bytes.toBytes("columnName3")
// val v3 = Bytes.toBytes(x.getAs("user_tag_ids").toString) //val value3 = Bytes.toBytes(x.getString(3))
// val kv3: KeyValue = new KeyValue(rowkey, cf, cn3, v3)
// kvlist = kvlist :+ kv3
//
// val cn4 = Bytes.toBytes("user_tag_names") //val cn4 = Bytes.toBytes("columnName4")
// val v4 = Bytes.toBytes(x.getAs("user_tag_names").toString) //val value4 = Bytes.toBytes(x.getString(4))
// val kv4: KeyValue = new KeyValue(rowkey, cf, cn4, v4)
// kvlist = kvlist :+ kv4
//
// val cn5 = Bytes.toBytes("business_type") //val cn5 = Bytes.toBytes("columnName5")
// val v5 = Bytes.toBytes(x.getAs("business_type").toString) //val value5 = Bytes.toBytes(x.getString(5))
// val kv5: KeyValue = new KeyValue(rowkey, cf, cn5, v5)
// kvlist = kvlist :+ kv5
//
// val cn6 = Bytes.toBytes("data_source_group") //val cn6 = Bytes.toBytes("columnName6")
// val v6 = Bytes.toBytes(x.getAs("data_source_group").toString) //val value6 = Bytes.toBytes(x.getString(6))
// val kv6: KeyValue = new KeyValue(rowkey, cf, cn6, v6)
// kvlist = kvlist :+ kv6
//Return the tuple (rowkey, list of KeyValues)
(new ImmutableBytesWritable(rowkey), kvlist)
})
//Crucial fix: flatten Seq[KeyValue] into individual KeyValue values here; HFileOutputFormat2 casts every value to Cell, and a List is not a Cell, which is exactly the ClassCastException above
val dataRDD: RDD[(ImmutableBytesWritable, KeyValue)] = data.flatMapValues(s => {
s.iterator
})
//The data written into the HFile must be sorted by rowkey, otherwise the write fails with:
//java.io.IOException: Added a key not lexically larger than previous.
.sortBy(x => x._1, true)
// The downstream write step needs the format RDD[(ImmutableBytesWritable, KeyValue)]
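The write and bulk-load step itself is not shown above, so here is a minimal sketch of what typically follows, reusing dataRDD from the core code; it assumes HBase 1.x client APIs, and the table name and HFile staging directory are placeholders taken from the log at the top.
import org.apache.hadoop.fs.Path
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.KeyValue
import org.apache.hadoop.hbase.mapreduce.{HFileOutputFormat2, LoadIncrementalHFiles}
import org.apache.hadoop.mapreduce.Job

val hbaseConf = HBaseConfiguration.create()
val tableName = TableName.valueOf("t_personas_user_tags_collect") //assumed table name
val hfilePath = "hdfs://hadoop001:8020/data/tmp/t_personas_user_tags_collect" //assumed staging dir (from the log)

val connection = ConnectionFactory.createConnection(hbaseConf)
val table = connection.getTable(tableName)
val regionLocator = connection.getRegionLocator(tableName)

//Let HFileOutputFormat2 pick up the target table's region boundaries, compression, etc.
val job = Job.getInstance(hbaseConf)
job.setMapOutputKeyClass(classOf[ImmutableBytesWritable])
job.setMapOutputValueClass(classOf[KeyValue])
HFileOutputFormat2.configureIncrementalLoad(job, table, regionLocator)

//Write the sorted RDD[(ImmutableBytesWritable, KeyValue)] out as HFiles ...
dataRDD.saveAsNewAPIHadoopFile(
  hfilePath,
  classOf[ImmutableBytesWritable],
  classOf[KeyValue],
  classOf[HFileOutputFormat2],
  job.getConfiguration
)
//... and bulk-load them into the target table.
new LoadIncrementalHFiles(hbaseConf)
  .doBulkLoad(new Path(hfilePath), connection.getAdmin, table, regionLocator)
connection.close()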
(1) Reference blog: https://blog.csdn.net/haibucuoba/article/details/106414867
// The data written into the HFile must be sorted by rowkey, otherwise:
// java.io.IOException: Added a key not lexically larger than previous.
hbaseConf.set("hbase.bulkload.retries.number", "0")
(2) Reference blogs:
https://blog.csdn.net/ITwangnengjie/article/details/103175909
https://blog.csdn.net/ITwangnengjie/article/details/103194518
When configureIncrementalLoadMap is used, the output value type is KeyValue and the reducer is org.apache.hadoop.hbase.mapreduce.KeyValueSortReducer, the KeyValue-sorting reducer that ships with HBase. We had planned to write our own, but HFileOutputFormat2.configureIncrementalLoad already offers a way to set this reducer, so we used it directly. Note that with configureIncrementalLoadMap not only the rowkeys have to be sorted, the KeyValues have to be sorted as well (in practice, sorted by column family and qualifier, i.e. the columns must be written in qualifier order; a Spark-side sketch of this follows after this section), otherwise you get:
Error: java.io.IOException: Added a key not lexically larger than previous. Current cell = xxxxxxxxxx_574747147/common:00000002/1572736500000/Put/vlen=16/seqid=0, lastCell = xxxxxxxxxx_574747147/common:000000001/1572736500000/Put/vlen=16/seqid=0
at org.apache.hadoop.hbase.io.hfile.AbstractHFileWriter.checkKey(AbstractHFileWriter.java:204)
at org.apache.hadoop.hbase.io.hfile.HFileWriterV2.append(HFileWriterV2.java:265)
at org.apache.hadoop.hbase.regionserver.StoreFile$Writer.append(StoreFile.java:994)
at org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2$1.write(HFileOutputFormat2.java:199)
at org.apache.hadoop.hbase.mapreduce.HFileOutputFormat2$1.write(HFileOutputFormat2.java:152)
at org.apache.hadoop.mapred.ReduceTask$NewTrackingRecordWriter.write(ReduceTask.java:558)
at org.apache.hadoop.mapreduce.task.TaskInputOutputContextImpl.write(TaskInputOutputContextImpl.java:89)
at org.apache.hadoop.mapreduce.lib.reduce.WrappedReducer$Context.write(WrappedReducer.java:105)
at com.xxx.xxx.xxx.xxx.usepartition.five_min_qu_to_hbase.FiveMinQuMRT2$ToHFileReducer.reduce(FiveMinQuMRT2.java:220)
at com.xxx.xxx.xxx.xxx.usepartition.five_min_qu_to_hbase.FiveMinQuMRT2$ToHFileReducer.reduce(FiveMinQuMRT2.java:211)
at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:171)
at org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:627)
at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:389)
at org.apache.hadoop.mapred.YarnChild$2.run(YarnChild.java:164)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1917)
at org.apache.hadoop.mapred.YarnChild.main(YarnChild.java:158)
(the error means the KeyValue currently being written is smaller than the one already written). We also set the number of reducers with job.setNumReduceTasks(200). One more thing to watch: some serialization classes have to be added to the configuration by hand,
//Spark 2.x style
.config("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," +
"org.apache.hadoop.io.serializer.WritableSerialization," +
"org.apache.hadoop.hbase.mapreduce.KeyValueSerialization," +
"org.apache.hadoop.hbase.mapreduce.MutationSerialization," +
"org.apache.hadoop.hbase.mapreduce.ResultSerialization")
//Spark 1.x style
conf.set("io.serializations", "org.apache.hadoop.io.serializer.JavaSerialization," +
"org.apache.hadoop.io.serializer.WritableSerialization," +
"org.apache.hadoop.hbase.mapreduce.KeyValueSerialization," +
"org.apache.hadoop.hbase.mapreduce.MutationSerialization," +
"org.apache.hadoop.hbase.mapreduce.ResultSerialization")
Without these serialization entries the job fails. We also rely on an overwrite-on-write strategy so that each column keeps only the last value of its five-minute window: the time encoded in the rowkey is the timestamp rounded to a five-minute boundary, while each Cell's timestamp is the real event time, so when the same rowkey receives multiple values for a Cell, the newest one is kept.
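Coming back to the ordering requirement above: in the core Spark code it is met by writing the columns in columnsName.sorted order, but it can also be enforced explicitly on the flattened RDD. A hedged sketch, reusing dataRDD from the core code; comparing qualifiers as strings assumes plain ASCII column names:
import org.apache.hadoop.hbase.CellUtil
import org.apache.hadoop.hbase.util.Bytes

//Sort by rowkey first and then by column qualifier, so that HFileOutputFormat2 never sees
//a key that is not lexically larger than the previous one.
val fullySortedRDD = dataRDD.sortBy { case (rowKey, kv) =>
  (Bytes.toString(rowKey.get()), Bytes.toString(CellUtil.cloneQualifier(kv)))
}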
(3) Reference blog: https://www.iteblog.com/archives/2081.html
Solution:
//Specify the serializer; the default is Java serialization
.config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
//sparkConf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
The error message this fixes:
20/08/06 14:54:13 ERROR Executor: Exception in task 0.0 in stage 3.0 (TID 9)
java.io.NotSerializableException: org.apache.hadoop.hbase.io.ImmutableBytesWritable
Serialization stack:
- object not serializable (class: org.apache.hadoop.hbase.io.ImmutableBytesWritable, value: 32 34 64 39 38 35 32 64 30 37 35 63 37 30 31 31 64 38 37 30 38 65 63 31 39 61 36 65 63 38 39 38)
- element of array (index: 0)
- array (class [Lorg.apache.hadoop.hbase.io.ImmutableBytesWritable;, size 60)
- field (class: scala.Tuple3, name: _3, type: class java.lang.Object)
- object (class scala.Tuple3, (0,62325,[Lorg.apache.hadoop.hbase.io.ImmutableBytesWritable;@5bbe0f7b))
- element of array (index: 0)
- array (class [Lscala.Tuple3;, size 1)
at org.apache.spark.serializer.SerializationDebugger$.improveException(SerializationDebugger.scala:40)
at org.apache.spark.serializer.JavaSerializationStream.writeObject(JavaSerializer.scala:46)
at org.apache.spark.serializer.JavaSerializerInstance.serialize(JavaSerializer.scala:100)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:393)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
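For completeness, a minimal sketch of how the Kryo setting above is usually applied to a SparkConf; registering the HBase classes with Kryo is an extra assumption on my part and is not shown in the referenced blog:
import org.apache.hadoop.hbase.KeyValue
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.spark.SparkConf

val sparkConf = new SparkConf()
  .setAppName("spark-bulkload-hbase") //placeholder app name
  //Replace the default Java serializer with Kryo, as in the fix above
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  //Optional (assumption): register the HBase classes that travel through shuffles
  .registerKryoClasses(Array(classOf[ImmutableBytesWritable], classOf[KeyValue]))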
Cause of another error: during a bulk load HBase allows at most 32 HFiles per column family per region by default; once the number of HFiles exceeds 32 the load fails with:
ERROR LoadIncrementalHFiles: Trying to load more than 32 hfiles to family cf of region with start key
Exception in thread "main" java.io.IOException: Trying to load more than 32 hfiles to one family of one region
Solution:
hbaseConf.setInt("hbase.mapreduce.bulkload.max.hfiles.perRegion.perFamily", 3200)
hbaseConf.set("hbase.hregion.max.filesize", "10737418240")
For a permanent change, add the following to hbase-site.xml:
<property>
  <name>hbase.mapreduce.bulkload.max.hfiles.perRegion.perFamily</name>
  <value>3200</value>
</property>
<property>
  <name>hbase.hregion.max.filesize</name>
  <value>10737418240</value>
</property>