<!-- Version properties. The tag names are reconstructed from the ${...} placeholders
     used in the dependencies below; the jdk/zookeeper/slf4j names are assumptions. -->
<properties>
    <spark.version>2.3.2</spark.version>
    <jdk.version>1.8</jdk.version>
    <scala.version>2.11.8</scala.version>
    <scala.main.version>2.11</scala.main.version>
    <zookeeper.version>3.5.4</zookeeper.version>
    <slf4j.version>1.7.5</slf4j.version>
    <hbase.version>1.4.11</hbase.version>
</properties>
<dependencies>
    <dependency>
        <groupId>org.apache.spark</groupId>
        <artifactId>spark-core_${scala.main.version}</artifactId>
        <version>${spark.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase</artifactId>
        <version>${hbase.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-client</artifactId>
        <version>${hbase.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-server</artifactId>
        <version>${hbase.version}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.hbase</groupId>
        <artifactId>hbase-common</artifactId>
        <version>${hbase.version}</version>
    </dependency>
    <dependency>
        <groupId>com.alibaba</groupId>
        <artifactId>fastjson</artifactId>
        <version>1.2.47</version>
    </dependency>
</dependencies>
import com.bj58.bic.fx.utils.MD5Util
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hbase.{HBaseConfiguration, HConstants, KeyValue, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, HTable, Table}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{HFileOutputFormat2, LoadIncrementalHFiles, TableOutputFormat}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, SparkSession}
import org.slf4j.LoggerFactory
/**
* @author hao
* @create 2020-05-26 11:51
*/
object TestConfig {

  val logger = LoggerFactory.getLogger(this.getClass)

  def main(args: Array[String]): Unit = {
    val tableName = ""
    val prdtLine = ""
    val execDate = ""
    val hdfsRootPath = "hdfs://xxx.xxx.xxx.:9000/home/hbase"
    val zookeeperQuorum = "10.162.xxx.201:2181,10.162.xxx.111:2181,10.162.xxx.213:2181"
    val hFilePath = "/home/hbase_bulkimport/"
    val hbasetableName = "hdp_bic_bd:hbase_table"
    val familyName = "cf1"

    logger.info("beginning ....")

    // 1. Basic configuration
    val hadoopConf = new Configuration()
    hadoopConf.set("fs.defaultFS", hdfsRootPath)
    val fileSystem = FileSystem.get(hadoopConf)
    val hbaseConf = HBaseConfiguration.create(hadoopConf)
    hbaseConf.set(HConstants.ZOOKEEPER_QUORUM, zookeeperQuorum)
    hbaseConf.set(TableOutputFormat.OUTPUT_TABLE, hbasetableName)
    val hbaseConn = ConnectionFactory.createConnection(hbaseConf)
    val admin = hbaseConn.getAdmin

    // 2. Remove the output directory left over from a previous run
    if (fileSystem.exists(new Path(hFilePath))) {
      fileSystem.delete(new Path(hFilePath), true)
    }
    // 3. Initialize the SparkSession
    val sparkSession: SparkSession = SparkSession.builder()
      .appName("HiveTableDetail2Hbase")
      .enableHiveSupport()
      .getOrCreate()

    // 4. Read the Hive data
    val detailsDF: DataFrame = sparkSession.sql("select * from hiveTable")

    // 5. Requirement: sync every column of the Hive table into HBase
    val columns: Array[String] = detailsDF.columns
    val data: RDD[(ImmutableBytesWritable, KeyValue)] = detailsDF.rdd
      .map(row => {
        // Hash the rowkey with MD5 so keys are spread evenly across regions
        val rowkey = MD5Util.getMD5(row.getAs[String]("user_id"), false, 16)
        columns.map(col => {
          var value: String = ""
          val index: Int = row.fieldIndex(col)
          if (!row.isNullAt(index)) {
            value = String.valueOf(row.getAs[Object](index))
          }
          (rowkey, (col, value))
        })
      })
      .flatMap(arr => arr)
      // Newer HBase versions require sorting not only by rowkey but also by column qualifier
      .sortBy(x => (x._1, x._2._1), true, 4)
      .map(e => {
        val kv = new KeyValue(Bytes.toBytes(e._1), Bytes.toBytes(familyName), Bytes.toBytes(e._2._1), System.currentTimeMillis(), Bytes.toBytes(e._2._2))
        (new ImmutableBytesWritable(Bytes.toBytes(e._1)), kv)
      })
    // 6. Write the HFiles to HDFS
    val job = Job.getInstance(hbaseConf)
    val table: Table = hbaseConn.getTable(TableName.valueOf(hbasetableName))
    job.setMapOutputKeyClass(classOf[ImmutableBytesWritable])
    job.setMapOutputValueClass(classOf[KeyValue])
    HFileOutputFormat2.configureIncrementalLoadMap(job, table)
    job.getConfiguration.set("mapred.output.dir", hFilePath)
    data.saveAsNewAPIHadoopDataset(job.getConfiguration)
    // The following API can be used instead:
    // data.saveAsNewAPIHadoopFile(hFilePath, classOf[ImmutableBytesWritable], classOf[KeyValue], classOf[HFileOutputFormat2], hbaseConf)

    // 7. Bulk load the HFiles into HBase
    val bulkLoader = new LoadIncrementalHFiles(hbaseConf)
    val regionLocator = hbaseConn.getRegionLocator(TableName.valueOf(hbasetableName))
    val hTable = new HTable(hbaseConf, TableName.valueOf(hbasetableName))
    bulkLoader.doBulkLoad(new Path(hFilePath), hTable)
    // The following API can be used instead:
    // bulkLoader.doBulkLoad(new Path(hFilePath), admin, table, regionLocator)

    // Clean up the HFile directory
    if (fileSystem.exists(new Path(hFilePath))) {
      fileSystem.delete(new Path(hFilePath), true)
    }

    hbaseConn.close()
    fileSystem.close()
    sparkSession.stop()
  }
}
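A note on the rowkey hashing: MD5Util above comes from an internal package (com.bj58.bic.fx.utils), so it won't be on most readers' classpath. Below is a minimal stand-in built on the JDK's MessageDigest; the object name Md5RowkeyUtil is made up here, and it assumes (based on the call MD5Util.getMD5(userId, false, 16)) that the original simply returns a 16-character hex MD5 of the key.

import java.security.MessageDigest

// Hypothetical replacement for the internal MD5Util: MD5-hash the business key
// and keep a fixed-length hex prefix so rowkeys are spread evenly across regions.
object Md5RowkeyUtil {
  def md5Rowkey(key: String, length: Int = 16): String = {
    val digest = MessageDigest.getInstance("MD5").digest(key.getBytes("UTF-8"))
    digest.map("%02x".format(_)).mkString.take(length)
  }
}

The mapping step would then call something like Md5RowkeyUtil.md5Rowkey(row.getAs[String]("user_id")); if the real MD5Util salts, truncates, or uppercases differently, adjust accordingly.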
While running the job, the following exception came up:

java.io.IOException: Added a key not lexically larger than previous. Current cell = 152520177093/cf:categroy_id/1542428767383/Put/vlen=6/seqid=0, lastCell = 152520177093/cf:product_name/1542428767383/Put/vlen=72/seqid=0
HBase splits the generated HFiles according to the table's existing regions. If the data is spread out widely enough, every region triggers a split, and each split counts as one retry. HBase's default retry count is 10, so once the number of splits exceeds 10 the job fails with the exception above.
Workaround: set the retry count to 0, which means it can retry without limit.
hbaseConf.set("hbase.bulkload.retries.number", "0")
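For context, the property belongs on the configuration that LoadIncrementalHFiles is built from, i.e. alongside the other hbaseConf settings in the listing above (a minimal sketch reusing the names from that listing):

// 0 removes the retry cap ("never give up"), so the loader can keep re-splitting
// HFiles that span region boundaries until every file has been loaded.
hbaseConf.set("hbase.bulkload.retries.number", "0")
val bulkLoader = new LoadIncrementalHFiles(hbaseConf)
bulkLoader.doBulkLoad(new Path(hFilePath), hTable)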
That's it for my pitfall stories. Whoops... a little fairy is looking for me, gotta run!