Spark Streaming: Reading Data from Kafka and Writing It to HBase

1. Write the HBase utility class

package HBaseDao;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.hbase.client.HBaseAdmin;
import org.apache.hadoop.hbase.client.HTable;
import org.apache.hadoop.hbase.client.Put;
import java.io.IOException;

/*
 *  Utility class for operating on HBase
 * */
public class HBaseUtils {
    HBaseAdmin admin = null;
    Configuration configuration = null;
    /*
     *  Private constructor (singleton)
     * */
    private HBaseUtils() {
        // HBaseConfiguration.create() loads hbase-default.xml/hbase-site.xml before applying the overrides below
        configuration = HBaseConfiguration.create();
        configuration.set("hbase.zookeeper.quorum","qyl01,qyl02,qyl03");
        configuration.set("hbase.rootdir","hdfs:///hbase");
        try{
            admin = new HBaseAdmin(configuration);
        }
        catch(IOException e){
            e.printStackTrace();
        }
    }
    private static HBaseUtils instance = null;

    public static HBaseUtils getInstance() {
        if (null == instance){
            instance = new HBaseUtils();
        }
        return instance;
    }
    /*
    *   Get an HTable instance for the given table name
    *   @param tableName
    *   @return
    * */
    public HTable getTable(String tableName){
        HTable table  = null ;
        try {
            table  = new HTable(configuration, tableName);
        }catch (IOException e){
            e.printStackTrace();
        }
        return table;
    }

    /**
     * Put a single cell into an HBase table
     * @param tableName table name
     * @param rowkey row key
     * @param cf column family
     * @param column column qualifier
     * @param value value to write
     */
    public void put(String tableName,String rowkey,String cf,String column,String value){
        HTable table = getTable(tableName);
        Put put = new Put(rowkey.getBytes());
        put.add(cf.getBytes(),column.getBytes(),value.getBytes());

        try{
            table.put(put);
        }catch (IOException e){
            e.printStackTrace();
        }
    }
    /*
    * Quick test: insert one record into HBase
    * */
/*
    public static void main(String[] args) {
        String tableName = "course_clickcount";
        String rowkey = "20191111_188";
        String cf = "info";
        String column = "click_count";
        String value = "2";
        HBaseUtils.getInstance().put(tableName,rowkey,cf,column,value);

      }
*/
}
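
Both the `put` path above and the counter increments in the next section assume the `course_clickcount` table with column family `info` already exists. Below is a minimal one-off sketch for pre-creating it, written in Scala against the same hbase-client 1.2.6 API used above; the object name `CreateClickCountTable` is illustrative and not part of the original project.

package HBaseDao

import org.apache.hadoop.hbase.{HBaseConfiguration, HColumnDescriptor, HTableDescriptor, TableName}
import org.apache.hadoop.hbase.client.HBaseAdmin

object CreateClickCountTable {
  def main(args: Array[String]): Unit = {
    // Same ZooKeeper quorum as HBaseUtils; adjust to your cluster
    val conf = HBaseConfiguration.create()
    conf.set("hbase.zookeeper.quorum", "qyl01,qyl02,qyl03")

    val admin = new HBaseAdmin(conf)
    val table = TableName.valueOf("course_clickcount")
    if (!admin.tableExists(table)) {
      // One column family "info", as used by ClickCourseCountDao
      val descriptor = new HTableDescriptor(table)
      descriptor.addFamily(new HColumnDescriptor("info"))
      admin.createTable(descriptor)
    }
    admin.close()
  }
}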

2. Write the HBase DAO class

package HBaseDao

import java.io.IOException

import org.apache.hadoop.hbase.client.Get
import org.apache.hadoop.hbase.util.Bytes

import scala.collection.mutable.ListBuffer

/*
* DAO for writing and reading course click counts in HBase
* */
object ClickCourseCountDao {
  /*
  * HBase table parameters; adjust to your own setup
  * */
  val tableName = "course_clickcount"
  val cf = "info"
  val column = "clickcount"


  /*
  * Write a batch of result records to HBase
  * */
  def save(list: ListBuffer[ClickCoursCount]): Unit = {
       val htable = HBaseUtils.getInstance().getTable(tableName)
       for(clk <- list){
         htable.incrementColumnValue(
           clk.dayCourse.getBytes(),
           cf.getBytes(),
           column.getBytes(),
           clk.clickCount
         )
       }
  }
  /*
  * Read back the value stored under `column` for the given row key
  * */
  def count(dayCourse:String):Long = {
    val htable = HBaseUtils.getInstance().getTable(tableName)
    val get = new Get(dayCourse.getBytes())
    val value = htable.get(get).getValue(cf.getBytes(),column.getBytes())
    if(null == value){
      0L
    }else{
      Bytes.toLong(value)
    }
  }




  def main(args: Array[String]): Unit = {
      val listbuffer = new ListBuffer[ClickCoursCount]
     /*
     * Insert test data
     * */
      listbuffer.append(ClickCoursCount("20191111_88",1L))
      listbuffer.append(ClickCoursCount("20191111_88",2L))
      listbuffer.append(ClickCoursCount("20191111_88",2L))
      save(listbuffer)

     println(count("20191111_88")+"------"+count("20191111_88"))

  }
}
/*
* Case class holding a day_course row key and its click count
* */
case class ClickCoursCount(dayCourse:String,clickCount:Long)
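
To check what `save` has accumulated, a full-table scan can dump every row and its counter. The sketch below is illustrative only (the object and method names are not part of the original project); it reuses `HBaseUtils` and the field names defined in `ClickCourseCountDao`.

package HBaseDao

import org.apache.hadoop.hbase.client.Scan
import org.apache.hadoop.hbase.util.Bytes

object ClickCourseCountDump {
  // Illustrative helper: print every rowkey and its accumulated counter
  def dumpAll(): Unit = {
    val htable = HBaseUtils.getInstance().getTable(ClickCourseCountDao.tableName)
    val scanner = htable.getScanner(new Scan())
    try {
      var result = scanner.next()
      while (result != null) {
        val rowkey = Bytes.toString(result.getRow)
        val value = result.getValue(ClickCourseCountDao.cf.getBytes(), ClickCourseCountDao.column.getBytes())
        // incrementColumnValue stores the counter as an 8-byte long
        val count = if (value == null) 0L else Bytes.toLong(value)
        println(s"$rowkey -> $count")
        result = scanner.next()
      }
    } finally {
      scanner.close()
    }
  }

  def main(args: Array[String]): Unit = dumpAll()
}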

3. Write the Spark Streaming job that reads from Kafka and writes to HBase

package com.bonc.qyl.Spark

import HBaseDao.{ClickCoursCount, ClickCourseCountDao}
import kafka.serializer.StringDecoder
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}

import scala.collection.mutable.ListBuffer

/*
*  Flume + Kafka + Spark Streaming + HBase pipeline
* */
object ProjectStreaming {
  def main(args: Array[String]): Unit = {
    /*
    * In a real project these values would be passed in as command-line arguments; they are hard-coded below for this demo.
    * if(args.length != 2){
    *   System.err.println("Usage ProjectStreaming:  ")
    *   System.exit(1)
    * }
    */

    /*
    * Set up the StreamingContext
    * */
    System.setProperty("HADOOP_USER_NAME","qyl")
    val conf = new SparkConf().setMaster("local[2]").setAppName("ProjectStreaming")
    val ssc = new StreamingContext(conf,Seconds(5))
    ssc.checkpoint("hdfs:///flume-kafka-direct")

    /*
    *  Read data from Kafka (direct stream)
    * */
    val kafkaParams = Map[String,String]("metadata.broker.list" -> "qyl01:9092,qyl02:9092,qyl03:9092","auto.offset.reset" -> "smallest")
    val topics = Set("flume-kafka-sparkStreaming-HBase1")
    val kafkaDStream: DStream[String] = KafkaUtils.createDirectStream
      [String, String, StringDecoder, StringDecoder](ssc,kafkaParams,topics).map(_._2)

    /*
    * Clean and parse the raw log lines.
    * Input format (tab-separated):
    * 132.168.89.224    2018-07-13 05:53:02 "GET /class/145.html HTTP/1.1"  200 https://search.yahoo.com/search?p=Flink实战
    * */
    val   cleanData : DStream[ClickLog]  = kafkaDStream.map { x =>
      val strArr = x.split("\t")
        val ip = strArr(0)
        val time = strArr(1).substring(0,10).trim()
        val refer = strArr(2).split(" ")(1)
        val status = strArr(3).toInt
        val searchArr = strArr(4).replaceAll("//", "/").split("/")
        var searchUrl = ""
        if (searchArr.length > 2) {
          searchUrl = searchArr(1)
        } else {
          searchUrl = searchArr(0)
        }
        (ip, time, refer, status, searchUrl)
    }.filter(_._3.startsWith("/class")).map { x =>
      // 145.html
      val referStr = x._3.split("/")(2)
      val refer = referStr.substring(0, referStr.lastIndexOf("."))
      ClickLog(x._1, x._2, refer, x._4, x._5)
    }
    /*
    * Requirement: cumulative click count per course per day (row key: date_courseId)
    */

    cleanData.map(x =>(x.time +"_"+x.refer,1)).reduceByKey(_+_).foreachRDD{rdd =>{
      rdd.foreachPartition{rddPartition =>
        val list = new ListBuffer[ClickCoursCount]
        rddPartition.foreach{ pair =>
          list.append(ClickCoursCount(pair._1,pair._2))
        }
        /*
        * Write this partition's counts to HBase
        * */
        ClickCourseCountDao.save(list)
      }
    }}


    ssc.start()
    ssc.awaitTermination()

  }
}

case class ClickLog(ip:String,time:String,refer:String,status:Int,searchUrl:String)
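
To exercise the job end to end without Flume, a small producer can push tab-separated test lines into the `flume-kafka-sparkStreaming-HBase1` topic. This is a minimal sketch, assuming the `org.apache.kafka:kafka-clients` producer API is available on the classpath (the pom below only declares spark-streaming-kafka-0-8, so that dependency may need to be added); the sample line follows the format handled in the cleaning step.

package com.bonc.qyl.Spark

import java.util.Properties

import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord}

object TestLogProducer {
  def main(args: Array[String]): Unit = {
    val props = new Properties()
    // Same brokers and topic as the streaming job
    props.put("bootstrap.servers", "qyl01:9092,qyl02:9092,qyl03:9092")
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer")
    props.put("value.serializer", "org.apache.kafka.common.serialization.StringSerializer")

    val producer = new KafkaProducer[String, String](props)

    // Fields are tab-separated: ip, time, request, status code, referer
    val line = Seq(
      "132.168.89.224",
      "2018-07-13 05:53:02",
      "\"GET /class/145.html HTTP/1.1\"",
      "200",
      "https://search.yahoo.com/search?p=Flink"
    ).mkString("\t")

    producer.send(new ProducerRecord[String, String]("flume-kafka-sparkStreaming-HBase1", line))
    producer.flush()
    producer.close()
  }
}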

4. pom.xml


<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.bonc.qyl.Spark</groupId>
    <artifactId>Kafka_SparkStreaming_Hbase</artifactId>
    <version>1.0-SNAPSHOT</version>
    <inceptionYear>2008</inceptionYear>

    <properties>
        <project.build.sourceEncoding>UTF8</project.build.sourceEncoding>
        <maven.compiler.source>1.8</maven.compiler.source>
        <maven.compiler.target>1.8</maven.compiler.target>
        <encoding>UTF-8</encoding>
        <scala.version>2.11.8</scala.version>
        <spark.version>2.3.2</spark.version>
        <hadoop.version>2.7.7</hadoop.version>
        <mysql.version>5.1.46</mysql.version>
        <kafka.version>1.1.0</kafka.version>
        <junit.version>4.12</junit.version>
        <streaming.kafka.version>2.3.2</streaming.kafka.version>
        <scala.binary.version>2.11</scala.binary.version>
    </properties>

    <dependencies>

        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-graphx_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-mllib_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>${hadoop.version}</version>
        </dependency>

        <!-- Spark Streaming + Kafka (0.8 direct stream) -->
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-8_2.11</artifactId>
            <version>${streaming.kafka.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-flume_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>mysql</groupId>
            <artifactId>mysql-connector-java</artifactId>
            <version>${mysql.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>${junit.version}</version>
            <scope>test</scope>
        </dependency>

        <!-- HBase -->
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.2.6</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-server</artifactId>
            <version>1.2.6</version>
        </dependency>

        <dependency>
            <groupId>com.101tec</groupId>
            <artifactId>zkclient</artifactId>
            <version>0.3</version>
        </dependency>

        <dependency>
            <groupId>org.apache.zookeeper</groupId>
            <artifactId>zookeeper</artifactId>
            <version>3.4.12</version>
            <type>pom</type>
        </dependency>

    </dependencies>

    <build>
        <pluginManagement>
            <plugins>
                <plugin>
                    <groupId>net.alchim31.maven</groupId>
                    <artifactId>scala-maven-plugin</artifactId>
                    <version>3.2.2</version>
                </plugin>
                <plugin>
                    <groupId>org.apache.maven.plugins</groupId>
                    <artifactId>maven-compiler-plugin</artifactId>
                    <version>3.5.1</version>
                </plugin>
            </plugins>
        </pluginManagement>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <executions>
                    <execution>
                        <id>scala-compile-first</id>
                        <phase>process-resources</phase>
                        <goals>
                            <goal>add-source</goal>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                    <execution>
                        <id>scala-test-compile</id>
                        <phase>process-test-resources</phase>
                        <goals>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>

            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <executions>
                    <execution>
                        <phase>compile</phase>
                        <goals>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>

            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-shade-plugin</artifactId>
                <version>2.3</version>
                <executions>
                    <execution>
                        <phase>package</phase>
                        <goals>
                            <goal>shade</goal>
                        </goals>
                        <configuration>
                            <filters>
                                <filter>
                                    <artifact>*:*</artifact>
                                    <excludes>
                                        <exclude>META-INF/*.SF</exclude>
                                        <exclude>META-INF/*.DSA</exclude>
                                        <exclude>META-INF/*.RSA</exclude>
                                    </excludes>
                                </filter>
                            </filters>
                        </configuration>
                    </execution>
                </executions>
            </plugin>
        </plugins>
    </build>
</project>
