Environment
scala -> 2.11.12
spark -> 2.2.0
hbase -> 1.3.0   Note: with the HBase 2.0 client jars the writes silently fail: no data lands in the table, but no error is thrown either.
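A minimal sbt dependency sketch matching the versions above (the exact artifact split is an assumption; in HBase 1.x the TableInputFormat / TableOutputFormat classes ship in hbase-server):

// build.sbt - a sketch, assuming sbt and the versions listed above
scalaVersion := "2.11.12"

libraryDependencies ++= Seq(
  "org.apache.spark" %% "spark-sql"    % "2.2.0",
  "org.apache.spark" %% "spark-hive"   % "2.2.0",   // needed because the session enables Hive support
  "org.apache.hbase" %  "hbase-client" % "1.3.0",
  "org.apache.hbase" %  "hbase-common" % "1.3.0",
  "org.apache.hbase" %  "hbase-server" % "1.3.0"    // TableInputFormat / TableOutputFormat live here in 1.x
)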
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.{Put, Result}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapreduce.{TableInputFormat, TableOutputFormat}
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapreduce.Job
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{DataFrame, Row, SparkSession}

/**
  * Spark reading and writing HBase directly; tested.
  * @Author: stsahana
  * @Date: 2019-8-21 18:27
  **/
object HbaseDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .enableHiveSupport()
      .appName("hbaseDemo")
      .master("local[2]")
      .config("spark.executor.memory", "2G")
      .config("spark.cores.max", "2")
      // skip output-spec validation; TableOutputFormat.checkOutputSpecs can NPE here (see the stack trace at the end)
      .config("spark.hadoop.validateOutputSpecs", false)
      .getOrCreate()

    read(spark)  // scan the table before the write
    write(spark) // insert three test rows
    read(spark)  // scan again to confirm the new rows are visible
  }
  def read(spark: SparkSession): DataFrame = {
    val sc = spark.sparkContext

    val hbaseConf = HBaseConfiguration.create()
    // ZooKeeper quorum; this could also come from an hbase-site.xml on the classpath,
    // but setting it in code keeps the demo self-contained
    hbaseConf.set("hbase.zookeeper.quorum", "localhost")
    // ZooKeeper client port, 2181 by default
    hbaseConf.set("hbase.zookeeper.property.clientPort", "2181")
    hbaseConf.set(TableInputFormat.INPUT_TABLE, "Contacts")

    // read the table as an RDD; TableInputFormat comes from org.apache.hadoop.hbase.mapreduce
    val hBaseRDD = sc.newAPIHadoopRDD(hbaseConf, classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result])

    import spark.implicits._
    val results = hBaseRDD.map(r => (
      Bytes.toString(r._2.getRow),
      Bytes.toString(r._2.getValue("Personal".getBytes, "Name".getBytes)),
      Bytes.toString(r._2.getValue("Office".getBytes, "Address".getBytes)),
      Bytes.toString(r._2.getValue("Personal".getBytes, "a".getBytes)),
      Bytes.toString(r._2.getValue("Personal".getBytes, "b".getBytes)),
      Bytes.toString(r._2.getValue("Personal".getBytes, "c".getBytes))
    )).toDF("row", "Name", "Address", "a", "b", "c")
    results.show(20, false)
    results
  }
  def write(spark: SparkSession) = {
    val sc = spark.sparkContext
    import spark.implicits._

    val tableName = "Contacts"

    // create the HBase configuration object
    val conf: Configuration = HBaseConfiguration.create()
    // set the ZooKeeper connection information
    conf.set("hbase.zookeeper.quorum", "localhost")
    conf.set("hbase.zookeeper.property.clientPort", "2181")

    // set up a Job object carrying the output format and the target table name
    val job: Job = Job.getInstance(conf)
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
    job.getConfiguration().set(TableOutputFormat.OUTPUT_TABLE, tableName)

    // build a small test DataFrame: rowkey plus two columns
    val indataRDD = sc.makeRDD(Array("3,jack,15", "4,Lily,16", "5,mike,16"))
    val df = indataRDD.map(_.split(",")).map(a => (a(0), a(1), a(2))).toDF("a", "b", "c")
    df.show()

    // convert each Row into a (rowkey, Put) pair so TableOutputFormat can write it
    val prepareHBaseToLoad: RDD[(ImmutableBytesWritable, Put)] =
      df.rdd.map(row => rowToPut(row))

    try {
      prepareHBaseToLoad.saveAsNewAPIHadoopDataset(job.getConfiguration())
    } catch {
      // handle the null-string exception that inserting into HBase can throw
      case e: Exception =>
        if (e.getMessage().equals("Can not create a Path from a null string")) {
          println("saveAsNewAPIHadoopDataset - exception caused by a bug in Spark 2.2 - the data is saved in HBase but an exception is still thrown: java.lang.IllegalArgumentException: Can not create a Path from a null string at org.apache.hadoop.fs.Path.checkPathArg")
        } else {
          throw e
        }
    }
  }
  def rowToPut(row: Row): (ImmutableBytesWritable, Put) = {
    // the first field is the rowkey; the remaining fields become columns in the "Personal" family
    val fieldNames = row.schema.fieldNames
    val rowkey: String = row.getAs[String](fieldNames(0))
    val put = new Put(Bytes.toBytes(rowkey))
    for (field <- 1 until fieldNames.length) {
      // add each remaining column to the Put, using the DataFrame column name as the qualifier
      put.addColumn(Bytes.toBytes("Personal"), Bytes.toBytes(fieldNames(field)), Bytes.toBytes(row.getAs[String](fieldNames(field))))
    }
    // return the assembled (rowkey, Put) pair
    (new ImmutableBytesWritable(Bytes.toBytes(rowkey)), put)
  }
}
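The read calls above already show the table contents, but if you suspect a silent write failure (see the HBase 2.0 jar note at the top), a quick check with the plain HBase client API, independent of Spark, might look like this (a sketch, assuming the same localhost setup):

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ConnectionFactory, Scan}
import org.apache.hadoop.hbase.util.Bytes
import scala.collection.JavaConverters._

object VerifyContacts {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "localhost")
    conf.set("hbase.zookeeper.property.clientPort", "2181")

    val conn = ConnectionFactory.createConnection(conf)
    try {
      val table = conn.getTable(TableName.valueOf("Contacts"))
      // scan the whole table and print each rowkey with its cells
      val scanner = table.getScanner(new Scan())
      scanner.asScala.foreach { result =>
        println(Bytes.toString(result.getRow) + " -> " + result) // Result.toString lists the cells
      }
      scanner.close()
      table.close()
    } finally {
      conn.close()
    }
  }
}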
/** Test table (HBase shell)
 * create 'Contacts', {NAME => 'Personal', VERSIONS => 5}, {NAME => 'Office', VERSIONS => 2}
* put 'Contacts', '1', 'Personal:Name', 'John Dole'
* put 'Contacts', '1', 'Personal:Phone', '1-234-000-0001'
* put 'Contacts', '1', 'Office:Phone', '1-234-000-0002'
* put 'Contacts', '1', 'Office:Address', '1111 San Gabriel Dr.'
* put 'Contacts', '2', 'Personal:Name', 'Calvin Raji'
* put 'Contacts', '2', 'Personal:Phone', '123-555-0191'
* put 'Contacts', '2', 'Office:Phone', '123-555-0191'
* put 'Contacts', '2', 'Office:Address', '5415 San Gabriel Dr.'
*
*/
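If you prefer to create the test table from code rather than the shell, a rough equivalent using the HBase 1.x admin API (HTableDescriptor / HColumnDescriptor are deprecated in 2.x) could be:

import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory

object CreateContactsTable {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "localhost")
    conf.set("hbase.zookeeper.property.clientPort", "2181")

    val conn = ConnectionFactory.createConnection(conf)
    val admin = conn.getAdmin
    try {
      val tableName = TableName.valueOf("Contacts")
      if (!admin.tableExists(tableName)) {
        // same column families and version settings as the shell command above
        val desc = new HTableDescriptor(tableName)
        desc.addFamily(new HColumnDescriptor("Personal").setMaxVersions(5))
        desc.addFamily(new HColumnDescriptor("Office").setMaxVersions(2))
        admin.createTable(desc)
      }
    } finally {
      admin.close()
      conn.close()
    }
  }
}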
The NullPointerException below is what the write throws when Spark's output-spec validation calls TableOutputFormat.checkOutputSpecs; the spark.hadoop.validateOutputSpecs=false setting in the session config above skips that check:
Exception in thread "main" java.lang.NullPointerException
at org.apache.hadoop.hbase.security.UserProvider.instantiate(UserProvider.java:122)
at org.apache.hadoop.hbase.client.ConnectionFactory.createConnection(ConnectionFactory.java:214)
at org.apache.hadoop.hbase.client.ConnectionFactory.createConnection(ConnectionFactory.java:119)
at org.apache.hadoop.hbase.mapreduce.TableOutputFormat.checkOutputSpecs(TableOutputFormat.java:177)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1099)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply(PairRDDFunctions.scala:1085)
at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply(PairRDDFunctions.scala:1085)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)