黑猴子的家: Spark RDD and HBase Input/Output (one of the main ways to read and save data)

HBase database
Thanks to the implementation of the org.apache.hadoop.hbase.mapreduce.TableInputFormat class, Spark can access HBase through the Hadoop input format API. This input format returns key-value pairs, where the key has type org.apache.hadoop.hbase.io.ImmutableBytesWritable and the value has type org.apache.hadoop.hbase.client.Result.

1. Reading from HBase

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.log4j.Logger

object HbaseRedTest extends java.io.Serializable{

  val logger = Logger.getLogger(HbaseRedTest.getClass)

  def main(args: Array[String]): Unit = {

    val sparkConf = new SparkConf().setAppName("HBaseApp")//.setMaster("local[*]")
    val sc = new SparkContext(sparkConf)

    val conf = HBaseConfiguration.create()
    // Name of the HBase table to read from
    conf.set(TableInputFormat.INPUT_TABLE, "fruit")

    val hBaseRDD = sc.newAPIHadoopRDD(conf, classOf[TableInputFormat],
      classOf[org.apache.hadoop.hbase.io.ImmutableBytesWritable],
      classOf[org.apache.hadoop.hbase.client.Result])

    // Cache before the first action so the table is scanned only once for count and foreach
    hBaseRDD.cache()
    val count = hBaseRDD.count()
    println("hBaseRDD RDD Count:" + count)
    hBaseRDD.foreach {
      case (_, result) =>
        val key = Bytes.toString(result.getRow)
        val name = Bytes.toString(result.getValue("info".getBytes, "name".getBytes))
        val color = Bytes.toString(result.getValue("info".getBytes, "color".getBytes))
        println("Row key:" + key + " Name:" + name + " Color:" + color)
    }
    sc.stop()
  }
}
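A caveat about the example above: the println inside foreach runs on the executors, so on a cluster the output lands in the executor logs rather than on the driver console. Below is a minimal sketch of collecting the rows back to the driver first; it assumes the same fruit table with an info family holding name and color columns, and would slot into main() before sc.stop().

    // Turn each HBase Result into a plain Scala tuple, then collect to the driver
    val rows = hBaseRDD.map { case (_, result) =>
      val key   = Bytes.toString(result.getRow)
      val name  = Bytes.toString(result.getValue(Bytes.toBytes("info"), Bytes.toBytes("name")))
      val color = Bytes.toString(result.getValue(Bytes.toBytes("info"), Bytes.toBytes("color")))
      (key, name, color)
    }.collect()

    // Now the println happens on the driver, where spark-submit shows it
    rows.foreach { case (key, name, color) =>
      println(s"Row key:$key Name:$name Color:$color")
    }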

2. Writing to HBase

import org.apache.hadoop.hbase.client.{HBaseAdmin, Put}
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapred.JobConf
import org.apache.spark.{SparkConf, SparkContext}


object HbaseWriTest extends java.io.Serializable{
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("HBaseApp")//.setMaster("local[*]")
    val sc = new SparkContext(sparkConf)

    val conf = HBaseConfiguration.create()
    val jobConf = new JobConf(conf)
    jobConf.setOutputFormat(classOf[TableOutputFormat])
    jobConf.set(TableOutputFormat.OUTPUT_TABLE, "fruit_spark")

    val fruitTable = TableName.valueOf("fruit_spark")
    val tableDescr = new HTableDescriptor(fruitTable)
    tableDescr.addFamily(new HColumnDescriptor("info".getBytes))

    // Recreate the target table so each run starts from an empty table
    val admin = new HBaseAdmin(conf)
    if (admin.tableExists(fruitTable)) {
      admin.disableTable(fruitTable)
      admin.deleteTable(fruitTable)
    }
    admin.createTable(tableDescr)

    // Build a (rowkey, Put) pair from each (id, name, price) triple
    def convert(triple: (Int, String, Int)) = {
      val put = new Put(Bytes.toBytes(triple._1))
      put.addImmutable(Bytes.toBytes("info"), Bytes.toBytes("name"), Bytes.toBytes(triple._2))
      put.addImmutable(Bytes.toBytes("info"), Bytes.toBytes("price"), Bytes.toBytes(triple._3))
      (new ImmutableBytesWritable, put)
    }
    val initialRDD = sc.parallelize(List((1,"apple",11), (2,"banana",12), (3,"pear",13)))
    val localData = initialRDD.map(convert)
    localData.saveAsHadoopDataset(jobConf)

    admin.close()
    sc.stop()
  }
}
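One detail worth noting about the write example: triple._1 and triple._3 are Ints, so Bytes.toBytes stores the row key and the price as 4-byte binary values rather than readable strings, and they must be decoded with Bytes.toInt when read back. The snippet below is a minimal read-back sketch, not part of the original program; it assumes the fruit_spark table written above and could be appended in main() after saveAsHadoopDataset.

    // Read back one row written above; the row key and price are binary Ints
    import org.apache.hadoop.hbase.client.{ConnectionFactory, Get}
    val connection = ConnectionFactory.createConnection(conf)
    val table = connection.getTable(TableName.valueOf("fruit_spark"))
    val result = table.get(new Get(Bytes.toBytes(1)))
    val name  = Bytes.toString(result.getValue(Bytes.toBytes("info"), Bytes.toBytes("name")))
    val price = Bytes.toInt(result.getValue(Bytes.toBytes("info"), Bytes.toBytes("price")))
    println(s"name=$name price=$price")
    table.close()
    connection.close()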

3. HBase project pom.xml



<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.heihouzi.sparkhbase</groupId>
    <artifactId>sparkhbase</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <log4j.version>1.2.17</log4j.version>
        <slf4j.version>1.7.22</slf4j.version>
        <spark.version>2.1.1</spark.version>
        <scala.version>2.11.8</scala.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>jcl-over-slf4j</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-api</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
        <dependency>
            <groupId>org.slf4j</groupId>
            <artifactId>slf4j-log4j12</artifactId>
            <version>${slf4j.version}</version>
        </dependency>
        <dependency>
            <groupId>log4j</groupId>
            <artifactId>log4j</artifactId>
            <version>${log4j.version}</version>
        </dependency>

        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>3.8.1</version>
            <scope>test</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.3.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.3.1</version>
        </dependency>

        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
    </dependencies>

    <build>
        <finalName>SparkHbase</finalName>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.6.1</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>

            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>3.2.2</version>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>

            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-assembly-plugin</artifactId>
                <version>3.0.0</version>
                <configuration>
                    <archive>
                        <manifest>
                            <mainClass>com.yinggu.spark.WordCount</mainClass>
                        </manifest>
                    </archive>
                    <descriptorRefs>
                        <descriptorRef>jar-with-dependencies</descriptorRef>
                    </descriptorRefs>
                </configuration>
                <executions>
                    <execution>
                        <id>make-assembly</id>
                        <phase>package</phase>
                        <goals>
                            <goal>single</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>

4. log4j.properties

log4j.rootLogger=info,stdout,R
log4j.appender.stdout=org.apache.log4j.ConsoleAppender
log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS}  %5p --- [%t]  %-c(line:%L) : %m%n

log4j.appender.R=org.apache.log4j.RollingFileAppender
log4j.appender.R.File=spark.log
log4j.appender.R.MaxFileSize=1024KB
log4j.appender.R.MaxBackupIndex=1

log4j.appender.R.layout=org.apache.log4j.PatternLayout
log4j.appender.R.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss,SSS}  %5p --- [%t]  %-c(line:%L) : %m%n

5. spark-submit
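Build the assembly jar first. Since the assembly plugin above is bound to the package phase, a plain Maven build is enough; the jar path in the comment follows from finalName plus the jar-with-dependencies descriptor.

mvn clean package
# produces target/SparkHbase-jar-with-dependencies.jar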

[heihouzi@hadoop102 spark]$ bin/spark-submit \
--master spark://hadoop102:7077 \
--class unit.HbaseRedTest \
SparkHbase-jar-with-dependencies.jar
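
A closing note, describing a typical setup rather than anything stated above: HBaseConfiguration.create() locates the cluster through an hbase-site.xml on the classpath. If that file is not packaged with the job, the ZooKeeper quorum can be set explicitly in both programs, for example:

    // Hypothetical hosts; replace with your own ZooKeeper ensemble
    conf.set("hbase.zookeeper.quorum", "hadoop102,hadoop103,hadoop104")
    conf.set("hbase.zookeeper.property.clientPort", "2181")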
