Tested and verified to work.
spark-shell --jars /home/wsy/jars/hbase-spark-1.2.0-cdh5.7.1.jar
// Read an HBase table into an RDD of (mobile_no, user_id, is_validated) tuples
// using the MapReduce TableInputFormat via newAPIHadoopRDD.
def readHbase(sc: org.apache.spark.SparkContext, readTableName: String = "USER") = {
  val hbaseConf = org.apache.hadoop.hbase.HBaseConfiguration.create()
  hbaseConf.set("hbase.zookeeper.quorum", "s1sl11,s1ma11,s1sl22")
  hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
  hbaseConf.set(org.apache.hadoop.hbase.mapreduce.TableInputFormat.INPUT_TABLE, readTableName)
  val hbaseRDD = sc.newAPIHadoopRDD(hbaseConf,
    classOf[org.apache.hadoop.hbase.mapreduce.TableInputFormat],
    classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
    classOf[org.apache.hadoop.hbase.client.Result])
  import org.apache.hadoop.hbase.util.Bytes
  val m2cRDD = hbaseRDD.map { r =>
    val mobile_no: String = Bytes.toString(r._2.getValue(Bytes.toBytes("cf"), Bytes.toBytes("mobile_no")))
    val user_id: String = Bytes.toString(r._2.getValue(Bytes.toBytes("cf"), Bytes.toBytes("user_id")))
    val is_validated: String = Bytes.toString(r._2.getValue(Bytes.toBytes("cf"), Bytes.toBytes("is_validated")))
    // Keep only validated rows that have both key fields; mark everything else with "-1".
    if (is_validated == "true" && mobile_no != null && user_id != null) {
      (mobile_no, user_id, is_validated)
    } else {
      ("-1", "-1", "-1")
    }
  }
  m2cRDD
}
// The put-based write below is extremely slow: 140 million rows took 9.1 hours.
// Prefer the DataFrame approach to reading/writing HBase from spark-shell instead; see the link below.
// https://www.jianshu.com/p/af29f76d4f91
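As a rough pointer to that DataFrame route, here is a minimal sketch using the hbase-spark data source that this post already loads via --jars. The option names ("hbase.table", "hbase.columns.mapping") follow the Cloudera hbase-spark examples for this CDH line and may differ in other versions; treat it as an illustration, not the exact code from the linked article.
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.spark.HBaseContext

// Register an HBaseContext on the driver so the data source can pick it up.
val dfConf = HBaseConfiguration.create()
dfConf.set("hbase.zookeeper.quorum", "s1sl11,s1ma11,s1sl22")
new HBaseContext(sc, dfConf)

// Column mapping format: "<dataframe column> <type> <cf:qualifier>", where ":key" maps the row key.
val userDF = sqlContext.read
  .format("org.apache.hadoop.hbase.spark")
  .options(Map(
    "hbase.table" -> "USER",
    "hbase.columns.mapping" ->
      "rowkey STRING :key, mobile_no STRING cf:mobile_no, user_id STRING cf:user_id, is_validated STRING cf:is_validated"))
  .load()

userDF.filter("is_validated = 'true'").select("mobile_no", "user_id").show(5)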
// Write the RDD back to HBase with one Put per row. Each partition opens its own
// connection so that nothing non-serializable is captured in the closure.
def putWriteHbase(sc: org.apache.spark.SparkContext,
                  m2cRDD: org.apache.spark.rdd.RDD[(String, String, String)],
                  writeTableName: String = "MOBILE2CMPAYID") = {
  m2cRDD.foreachPartition { iter =>
    val hbaseConf = org.apache.hadoop.hbase.HBaseConfiguration.create()
    hbaseConf.set("hbase.zookeeper.quorum", "s1sl11,s1ma11,s1sl22")
    hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
    val cn = org.apache.hadoop.hbase.client.ConnectionFactory.createConnection(hbaseConf)
    val hbaseTable = cn.getTable(org.apache.hadoop.hbase.TableName.valueOf(writeTableName))
    import org.apache.hadoop.hbase.util.Bytes
    iter.foreach { row =>
      val mobile_no: String = row._1
      val user_id: String = row._2
      val is_validated: String = row._3
      // Skip the "-1" placeholder rows produced by readHbase.
      if (mobile_no != "-1") {
        val put = new org.apache.hadoop.hbase.client.Put(Bytes.toBytes(mobile_no))
        put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("mobile_no"), Bytes.toBytes(mobile_no))
        put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("user_id"), Bytes.toBytes(user_id))
        put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("is_validated"), Bytes.toBytes(is_validated))
        hbaseTable.put(put)
      }
    }
    hbaseTable.close()
    cn.close() // also close the connection, not just the table
  }
}
val m2cRDD = readHbase(sc, "USER")
putWriteHbase(sc, m2cRDD, "MOBILE2CMPAYID")
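Much of the 9.1-hour run time above comes from issuing one RPC per Put. A batched variant (not from the original post; the batch size of 1000 is an arbitrary assumption) cuts the number of round-trips by using the standard Table.put(java.util.List[Put]) call of the HBase 1.x client:
// Sketch: buffer Puts per partition and flush them in batches.
def putWriteHbaseBatched(m2cRDD: org.apache.spark.rdd.RDD[(String, String, String)],
                         writeTableName: String = "MOBILE2CMPAYID",
                         batchSize: Int = 1000) = {
  m2cRDD.foreachPartition { iter =>
    import org.apache.hadoop.hbase.util.Bytes
    val hbaseConf = org.apache.hadoop.hbase.HBaseConfiguration.create()
    hbaseConf.set("hbase.zookeeper.quorum", "s1sl11,s1ma11,s1sl22")
    hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
    val cn = org.apache.hadoop.hbase.client.ConnectionFactory.createConnection(hbaseConf)
    val table = cn.getTable(org.apache.hadoop.hbase.TableName.valueOf(writeTableName))
    val buffer = new java.util.ArrayList[org.apache.hadoop.hbase.client.Put](batchSize)
    iter.foreach { case (mobile_no, user_id, is_validated) =>
      if (mobile_no != "-1") {
        val put = new org.apache.hadoop.hbase.client.Put(Bytes.toBytes(mobile_no))
        put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("mobile_no"), Bytes.toBytes(mobile_no))
        put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("user_id"), Bytes.toBytes(user_id))
        put.addColumn(Bytes.toBytes("cf"), Bytes.toBytes("is_validated"), Bytes.toBytes(is_validated))
        buffer.add(put)
        // Flush a full batch in one RPC.
        if (buffer.size() >= batchSize) { table.put(buffer); buffer.clear() }
      }
    }
    if (!buffer.isEmpty()) table.put(buffer)
    table.close()
    cn.close()
  }
}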
// Alternative: write HFiles and bulk load them, which is far faster than per-row Puts.
def HFileWriteHbase(sc: org.apache.spark.SparkContext,
                    m2cRDD: org.apache.spark.rdd.RDD[(String, String)],
                    writeTableName: String = "tmp_mobile2cmpayidHFile") = {
  import org.apache.hadoop.hbase.util.Bytes
  // Convert each (mobile_no, user_id) pair into a row key plus a list of (qualifier, value) cells.
  val toByteArrays = m2cRDD.map { row =>
    val mobile_no: String = row._1
    val user_id: String = row._2
    val rowkeyBytes = Bytes.toBytes(mobile_no)
    val kvs = List(
      (Bytes.toBytes("mobile_no"), Bytes.toBytes(mobile_no)),
      (Bytes.toBytes("user_id"), Bytes.toBytes(user_id))
    )
    (rowkeyBytes, kvs)
  }
  val hbaseConf = org.apache.hadoop.hbase.HBaseConfiguration.create()
  hbaseConf.set("hbase.zookeeper.quorum", "s1sl11,s1ma11,s1sl22")
  val hbaseContext = new org.apache.hadoop.hbase.spark.HBaseContext(sc, hbaseConf)
  val tableName = org.apache.hadoop.hbase.TableName.valueOf(writeTableName)
  val stagingFolder = "hdfs:///tmp/wsy/hfile/test"
  import org.apache.hadoop.hbase.spark.HBaseRDDFunctions._
  // Step 1: write HFiles to the staging folder.
  toByteArrays.hbaseBulkLoad(hbaseContext, tableName,
    t => {
      val rowKey = t._1
      val seq = scala.collection.mutable.ListBuffer[(org.apache.hadoop.hbase.spark.KeyFamilyQualifier, Array[Byte])]()
      for (kv <- t._2) {
        val qualifier = kv._1
        val value = kv._2
        // Skip the "-1" placeholder values.
        if (Bytes.toString(value) != "-1") {
          val keyFamilyQualifier = new org.apache.hadoop.hbase.spark.KeyFamilyQualifier(rowKey, Bytes.toBytes("cf"), qualifier)
          seq.append((keyFamilyQualifier, value))
        }
      }
      seq.iterator
    },
    stagingFolder)
  // Step 2: move the generated HFiles into the table's regions.
  val load = new org.apache.hadoop.hbase.mapreduce.LoadIncrementalHFiles(hbaseConf)
  load.run(Array(stagingFolder, writeTableName))
}
// Example invocations. readHbase returns (mobile_no, user_id, is_validated) triples,
// so drop the third field before handing the RDD to HFileWriteHbase, which expects pairs.
putWriteHbase(sc, readHbase(sc))
HFileWriteHbase(sc, readHbase(sc).map(t => (t._1, t._2)))
Recreate the target table in the hbase shell before (re)running the load:
disable 'tmp_mobile2cmpayid'
drop 'tmp_mobile2cmpayid'
create 'tmp_mobile2cmpayid','cf'
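If you would rather stay inside the spark-shell session, the same disable/drop/create can be done through the HBase Admin API; a minimal sketch against the HBase 1.2 client (table and family names as above):
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory

val adminConf = HBaseConfiguration.create()
adminConf.set("hbase.zookeeper.quorum", "s1sl11,s1ma11,s1sl22")
val adminConn = ConnectionFactory.createConnection(adminConf)
val admin = adminConn.getAdmin
val tn = TableName.valueOf("tmp_mobile2cmpayid")
// Drop the table if it already exists, then recreate it with the single 'cf' family.
if (admin.tableExists(tn)) {
  admin.disableTable(tn)
  admin.deleteTable(tn)
}
val desc = new HTableDescriptor(tn)
desc.addFamily(new HColumnDescriptor("cf"))
admin.createTable(desc)
admin.close()
adminConn.close()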
A few points to note:
1. hbaseConf.set("hbase.zookeeper.quorum","s1sl11,s1ma11,s1sl22")
   If this line is omitted, the job fails at runtime with an error.
2. spark-shell --jars /home/wsy/jars/hbase-spark-1.2.0-cdh5.7.1.jar
   If this jar is not passed, the classes under the org.apache.hadoop.hbase.spark package cannot be found.