Spark 读写 HBase

环境配置
Scala -> 2.11.12
Spark -> 2.2.0
HBase -> 1.3.0(注意:使用 HBase 2.0 的 jar 包时数据写不进去,而且也不会报错)

/**
  * Demo that reads from and writes to an HBase table directly from Spark
  * (marked as tested by the original author).
  *
  * The snippet relies on these classes being imported by the enclosing file:
  *   org.apache.hadoop.conf.Configuration
  *   org.apache.hadoop.hbase.HBaseConfiguration
  *   org.apache.hadoop.hbase.client.{Put, Result}
  *   org.apache.hadoop.hbase.io.ImmutableBytesWritable
  *   org.apache.hadoop.hbase.mapreduce.{TableInputFormat, TableOutputFormat}
  *   org.apache.hadoop.hbase.util.Bytes
  *   org.apache.hadoop.mapreduce.Job
  *   org.apache.spark.rdd.RDD
  *   org.apache.spark.sql.{DataFrame, Row, SparkSession}
  *
  * Author: stsahana
  * Date: 2019-8-21 18:27
  **/
object HbaseDemo {

  // Connection settings and table layout shared by `read` and `write`.
  // They can also be supplied by putting hbase-site.xml on the classpath, but the
  // original author recommends setting them in code as done here.
  private val ZookeeperQuorum = "localhost"
  private val ZookeeperClientPort = "2181" // ZooKeeper client port, 2181 by default
  private val TableName = "Contacts"
  private val PersonalFamily = "Personal"
  private val OfficeFamily = "Office"

  // Message of the spurious exception tolerated in `write`.
  private val NullPathMessage = "Can not create a Path from a null string"

  /**
    * Entry point: prints the table, writes three sample rows, then prints the table again.
    *
    * @param args command-line arguments (unused)
    */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .enableHiveSupport()
      .appName("habseDemo")
      .master("local[2]")
      // Spark only recognises properties with the "spark." prefix; the original keys
      // ("executor.memory", "total.executor.cores") were silently ignored.
      .config("spark.executor.memory", "2G")
      .config("spark.cores.max", "2")
      // Skips OutputFormat.checkOutputSpecs before saving, which is where the
      // TableOutputFormat NullPointerException (HBASE-20295) is raised.
      .config("spark.hadoop.validateOutputSpecs", false)
      .getOrCreate()

    try {
      read(spark)
      write(spark)
      read(spark)
    } finally {
      // Release the session even when one of the steps fails.
      spark.stop()
    }
  }

  /**
    * Scans the `Contacts` table and returns selected cells as a DataFrame of strings.
    * The first 20 rows are also printed, untruncated, as a side effect.
    *
    * @param spark active Spark session
    * @return DataFrame with columns "row", "B", "C", "d", "e", "f" holding the row key,
    *         Personal:Name, Office:Address, Personal:a, Personal:b and Personal:c;
    *         a cell that does not exist yields null
    */
  def read(spark: SparkSession): DataFrame = {
    import spark.implicits._

    val hbaseConf = newHBaseConf()
    hbaseConf.set(TableInputFormat.INPUT_TABLE, TableName)

    // Load the table as an RDD of (row key, Result) pairs.
    // TableInputFormat is the one from org.apache.hadoop.hbase.mapreduce.
    val hBaseRDD = spark.sparkContext.newAPIHadoopRDD(
      hbaseConf,
      classOf[TableInputFormat],
      classOf[ImmutableBytesWritable],
      classOf[Result])

    // Encode the family names once on the driver instead of once per cell.
    val personal = Bytes.toBytes(PersonalFamily)
    val office = Bytes.toBytes(OfficeFamily)

    val results = hBaseRDD.map { case (_, result) =>
      (
        Bytes.toString(result.getRow),
        cellAsString(result, personal, "Name"),
        cellAsString(result, office, "Address"),
        cellAsString(result, personal, "a"),
        cellAsString(result, personal, "b"),
        cellAsString(result, personal, "c")
      )
    }.toDF("row", "B", "C", "d", "e", "f")

    results.show(20, truncate = false)
    results
  }

  /**
    * Writes three hard-coded sample rows into the `Contacts` table through
    * `TableOutputFormat`. The first CSV field becomes the row key, the remaining
    * fields become columns "b" and "c" of the `Personal` family.
    *
    * @param spark active Spark session
    */
  def write(spark: SparkSession): Unit = {
    import spark.implicits._

    // The Job is used only as a carrier for the output-format configuration.
    val job: Job = Job.getInstance(newHBaseConf())
    job.setOutputFormatClass(classOf[TableOutputFormat[ImmutableBytesWritable]])
    job.getConfiguration.set(TableOutputFormat.OUTPUT_TABLE, TableName)

    val df = spark.sparkContext
      .makeRDD(Seq("3,jack,15", "4,Lily,16", "5,mike,16"))
      .map(_.split(","))
      .map(fields => (fields(0), fields(1), fields(2)))
      .toDF("a", "b", "c")
    df.show()

    val prepareHBaseToLoad: RDD[(ImmutableBytesWritable, Put)] =
      df.rdd.map(row => rowToPut(row))

    try {
      prepareHBaseToLoad.saveAsNewAPIHadoopDataset(job.getConfiguration)
    } catch {
      // Spark 2.2 throws this after the data has already been stored (this appears to be
      // SPARK-21549 - confirm). `==` is null-safe, so an exception without a message no
      // longer triggers a secondary NullPointerException; every other exception
      // propagates unchanged because the guard does not match.
      case e: Exception if e.getMessage == NullPathMessage =>
        println(" saveAsNewAPIHadoopDataset - Exception caused due to a bug in spark 2.2 - Data is saved in HBASE but still excepton is thrown - java.lang.IllegalArgumentException: Can not create a Path from a null string at org.apache.hadoop.fs.Path.checkPathArg ")
    }
  }

  /**
    * Converts a Row of string columns into an HBase `Put`.
    *
    * The first column is the row key; every following column is stored in the
    * `Personal` family under a qualifier equal to its field name. Columns whose value
    * is null are skipped, since `Bytes.toBytes` cannot encode a null string.
    *
    * @param row row carrying a schema whose columns are all strings
    * @return the row key wrapped for `TableOutputFormat`, paired with the assembled Put
    * @throws IllegalArgumentException if the row has no columns or its row key is null
    */
  def rowToPut(row: Row): (ImmutableBytesWritable, Put) = {
    require(row.length > 0 && !row.isNullAt(0), "row key (first column) must not be null")

    val fieldNames = row.schema.fieldNames
    val rowKey = Bytes.toBytes(row.getString(0))
    val family = Bytes.toBytes(PersonalFamily)

    val put = new Put(rowKey)
    for (i <- 1 until fieldNames.length if !row.isNullAt(i)) {
      put.addColumn(family, Bytes.toBytes(fieldNames(i)), Bytes.toBytes(row.getString(i)))
    }
    (new ImmutableBytesWritable(rowKey), put)
  }

  /** Creates an HBase configuration pointing at the ZooKeeper quorum used by this demo. */
  private def newHBaseConf(): Configuration = {
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", ZookeeperQuorum)
    conf.set("hbase.zookeeper.property.clientPort", ZookeeperClientPort)
    conf
  }

  /** Returns the cell `family:qualifier` of `result` decoded as a string. */
  private def cellAsString(result: Result, family: Array[Byte], qualifier: String): String =
    Bytes.toString(result.getValue(family, Bytes.toBytes(qualifier)))
}

/** Test table: run the following commands in the HBase shell to create and populate "Contacts".
  * create "Contacts", { NAME => "Personal", VERSIONS=>5},{ NAME =>"Office",VERSIONS=>2}
  * put 'Contacts', '1', 'Personal:Name', 'John Dole'
  * put 'Contacts', '1', 'Personal:Phone', '1-234-000-0001'
  * put 'Contacts', '1', 'Office:Phone', '1-234-000-0002'
  * put 'Contacts', '1', 'Office:Address', '1111 San Gabriel Dr.'
  * put 'Contacts', '2', 'Personal:Name', 'Calvin Raji'
  * put 'Contacts', '2', 'Personal:Phone', '123-555-0191'
  * put 'Contacts', '2', 'Office:Phone', '123-555-0191'
  * put 'Contacts', '2', 'Office:Address', '5415 San Gabriel Dr.'
  *
  */

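  1. 报错一:写入 HBase 时抛出 NullPointerException
    链接:https://issues.apache.org/jira/browse/HBASE-20295
    说明:从下面的堆栈可以看到,异常是在 TableOutputFormat.checkOutputSpecs 中抛出的;上面的代码把 spark.hadoop.validateOutputSpecs 设置为 false,跳过这一步检查,即可规避该问题。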
Exception in thread "main" java.lang.NullPointerException
	at org.apache.hadoop.hbase.security.UserProvider.instantiate(UserProvider.java:122)
	at org.apache.hadoop.hbase.client.ConnectionFactory.createConnection(ConnectionFactory.java:214)
	at org.apache.hadoop.hbase.client.ConnectionFactory.createConnection(ConnectionFactory.java:119)
	at org.apache.hadoop.hbase.mapreduce.TableOutputFormat.checkOutputSpecs(TableOutputFormat.java:177)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1099)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply(PairRDDFunctions.scala:1085)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsNewAPIHadoopDataset$1.apply(PairRDDFunctions.scala:1085)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:362)

你可能感兴趣的:(大数据)