Pulling streaming data from Kafka with Spark and inserting it into HBase

Sample pom.xml:

<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.yys.spark</groupId>
    <artifactId>spark</artifactId>
    <version>1.0</version>
    <inceptionYear>2008</inceptionYear>
    <properties>
        <scala.version>2.11.12</scala.version>
        <kafka.version>0.9.0.1</kafka.version>
        <spark.version>2.2.0</spark.version>
        <hadoop.version>2.7.5</hadoop.version>
        <hbase.version>1.4.0</hbase.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>

        <!-- Kafka -->
        <dependency>
            <groupId>org.apache.kafka</groupId>
            <artifactId>kafka_2.11</artifactId>
            <version>${kafka.version}</version>
        </dependency>

        <!-- Hadoop -->
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <!-- HBase -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>${hbase.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>${hbase.version}</version>
        </dependency>

        <!-- Spark Streaming -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <!-- Spark Streaming + Flume integration -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-flume_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-flume-sink_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <!-- Spark Streaming + Kafka integration -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.commons</groupId>
            <artifactId>commons-lang3</artifactId>
            <version>3.5</version>
        </dependency>

        <!-- Spark SQL -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>com.fasterxml.jackson.module</groupId>
            <artifactId>jackson-module-scala_2.11</artifactId>
            <version>2.6.5</version>
        </dependency>

        <dependency>
            <groupId>net.jpountz.lz4</groupId>
            <artifactId>lz4</artifactId>
            <version>1.3.0</version>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>5.1.44</version>
        </dependency>

        <dependency>
            <groupId>org.apache.flume.flume-ng-clients</groupId>
            <artifactId>flume-ng-log4jappender</artifactId>
            <version>1.8.0</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <executions>
                    <execution>
                        <goals>
                            <goal>compile</goal>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
                <configuration>
                    <scalaVersion>${scala.version}</scalaVersion>
                    <args>
                        <arg>-target:jvm-1.5</arg>
                    </args>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-eclipse-plugin</artifactId>
                <configuration>
                    <downloadSources>true</downloadSources>
                    <buildcommands>
                        <buildcommand>ch.epfl.lamp.sdt.core.scalabuilder</buildcommand>
                    </buildcommands>
                    <additionalProjectnatures>
                        <projectnature>ch.epfl.lamp.sdt.core.scalanature</projectnature>
                    </additionalProjectnatures>
                    <classpathContainers>
                        <classpathContainer>org.eclipse.jdt.launching.JRE_CONTAINER</classpathContainer>
                        <classpathContainer>ch.epfl.lamp.sdt.launching.SCALA_CONTAINER</classpathContainer>
                    </classpathContainers>
                </configuration>
            </plugin>
        </plugins>
    </build>
    <reporting>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <configuration>
                    <scalaVersion>${scala.version}</scalaVersion>
                </configuration>
            </plugin>
        </plugins>
    </reporting>
</project>

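The program below writes into an HBase table named create_table_at_first with a single column family myfamily; TableOutputFormat does not create the table, so both must exist before saveAsHadoopDataset is called (for example via the HBase shell: create 'create_table_at_first', 'myfamily'). The following is a minimal sketch of creating the table programmatically with the HBase 1.4 client API; the table name, column family, and ZooKeeper settings simply mirror the ones used in the main program and should be adjusted to your cluster.

import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.ConnectionFactory

object CreateHbaseTable {
  def main(args: Array[String]): Unit = {
    val conf = HBaseConfiguration.create()
    // Same ZooKeeper quorum as in the main program below
    conf.set("hbase.zookeeper.quorum", "master,Slave1,Slave2")
    conf.set("hbase.zookeeper.property.clientPort", "2181")

    val connection = ConnectionFactory.createConnection(conf)
    val admin = connection.getAdmin
    val tableName = TableName.valueOf("create_table_at_first")

    // Create the table with the single column family "myfamily" if it does not exist yet
    if (!admin.tableExists(tableName)) {
      val desc = new HTableDescriptor(tableName)
      desc.addFamily(new HColumnDescriptor("myfamily"))
      admin.createTable(desc)
    }

    admin.close()
    connection.close()
  }
}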
Scala code:

import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.mapreduce.TableInputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapred.JobConf
import org.apache.spark._

// Write data into HBase (here the write path is demonstrated with a hard-coded sample RDD;
// a Kafka streaming sketch follows the listing)
object Kafka2Hbase {

  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local").setAppName("HBaseTest")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    val sc = new SparkContext(sparkConf)

    // Name of the target HBase table (it must already exist)
    val table_name = "create_table_at_first"
    val conf = HBaseConfiguration.create()

    // The HBase connection settings can be copied from /home/hadoop/HBase/hbase/conf/hbase-site.xml
    conf.set("hbase.rootdir", "hdfs://master:9000/hbase_db")
    conf.set("hbase.zookeeper.quorum", "master,Slave1,Slave2")
    conf.set("hbase.zookeeper.property.clientPort", "2181")
    conf.set("hbase.master", "master:60000")
    conf.set(TableInputFormat.INPUT_TABLE, table_name)

    // Initialize the JobConf; TableOutputFormat must be the one from the org.apache.hadoop.hbase.mapred package!
    val jobConf = new JobConf(conf)
    jobConf.setOutputFormat(classOf[TableOutputFormat])
    jobConf.set(TableOutputFormat.OUTPUT_TABLE, table_name)

    val indataRDD = sc.makeRDD(Array("1,jack15,15", "2,mike16,16"))
    val rdd = indataRDD.map(_.split(',')).map { arr =>
      /* A Put object represents one row; the row key is passed to the constructor.
       * Every inserted value must be converted with org.apache.hadoop.hbase.util.Bytes.toBytes.
       * Put.addColumn takes three arguments: column family, column qualifier, value.
       * "myfamily" is the column family name.
       */
      val put = new Put(Bytes.toBytes(arr(0).toInt))
      put.addColumn(Bytes.toBytes("myfamily"), Bytes.toBytes("name"), Bytes.toBytes(arr(1)))
      put.addColumn(Bytes.toBytes("myfamily"), Bytes.toBytes("age"), Bytes.toBytes(arr(2).toInt))
      // The RDD must have type RDD[(ImmutableBytesWritable, Put)] before saveAsHadoopDataset can be called
      (new ImmutableBytesWritable, put)
    }

    rdd.saveAsHadoopDataset(jobConf)

    sc.stop()
    // Afterwards the rows can be checked in the HBase shell, e.g. with
    //   scan 'create_table_at_first'
    // (the row keys are the 4-byte integers 1 and 2, so scan is easier to use than get here)
  }
}
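The listing above exercises the HBase write path with a hard-coded RDD. To actually pull the data from Kafka, as the title describes, the same (ImmutableBytesWritable, Put) conversion can be applied inside foreachRDD of a direct Kafka stream; the spark-streaming-kafka-0-8 artifact needed for this is already in the pom. Below is a minimal sketch, assuming a broker at master:9092, a topic named user_topic (both are placeholder values, not from the original setup), and record values shaped like "1,jack15,15".

import kafka.serializer.StringDecoder
import org.apache.hadoop.hbase.HBaseConfiguration
import org.apache.hadoop.hbase.client.Put
import org.apache.hadoop.hbase.io.ImmutableBytesWritable
import org.apache.hadoop.hbase.mapred.TableOutputFormat
import org.apache.hadoop.hbase.util.Bytes
import org.apache.hadoop.mapred.JobConf
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka.KafkaUtils

object KafkaStreaming2Hbase {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("KafkaStreaming2Hbase")
    val ssc = new StreamingContext(sparkConf, Seconds(5))

    // Broker list and topic are assumptions; replace them with your own cluster settings
    val kafkaParams = Map[String, String]("metadata.broker.list" -> "master:9092")
    val topics = Set("user_topic")

    val stream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
      ssc, kafkaParams, topics)

    stream.foreachRDD { rdd =>
      // Build the HBase job configuration per batch on the driver
      val conf = HBaseConfiguration.create()
      conf.set("hbase.zookeeper.quorum", "master,Slave1,Slave2")
      conf.set("hbase.zookeeper.property.clientPort", "2181")
      val jobConf = new JobConf(conf)
      jobConf.setOutputFormat(classOf[TableOutputFormat])
      jobConf.set(TableOutputFormat.OUTPUT_TABLE, "create_table_at_first")

      // Each Kafka record is a (key, value) pair; the value is expected to look like "1,jack15,15"
      rdd.map(_._2.split(',')).map { arr =>
        val put = new Put(Bytes.toBytes(arr(0).toInt))
        put.addColumn(Bytes.toBytes("myfamily"), Bytes.toBytes("name"), Bytes.toBytes(arr(1)))
        put.addColumn(Bytes.toBytes("myfamily"), Bytes.toBytes("age"), Bytes.toBytes(arr(2).toInt))
        (new ImmutableBytesWritable, put)
      }.saveAsHadoopDataset(jobConf)
    }

    ssc.start()
    ssc.awaitTermination()
  }
}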
