Using Spark Streaming to consume JSON data from Kafka, analyze it, and save the results to HBase

This post connects Spark Streaming to Kafka, analyzes the incoming stream, and writes the results to HBase.

  1. The records sent to Kafka are JSON, one record per message, for example (a minimal producer sketch follows the samples):
{"userId":20400,"day":"2017-03-01","begintime":1488326400000,"endtime":1488327000000,"data":[{"package":"com.browser1","activetime":60000},{"package":"com.browser","activetime":1207000}]}
{"userId":2000,"day":"2017-03-05","begintime":1488326400000,"endtime":1488327000000,"data":[{"package":"com.browser","activetime":120000}]}
  2. The project's Maven configuration (pom.xml):
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/maven-v4_0_0.xsd">
  <modelVersion>4.0.0</modelVersion>
  <groupId>edu.zhku</groupId>
  <artifactId>SparkStreaming</artifactId>
  <packaging>jar</packaging>
  <version>1.0-SNAPSHOT</version>

  <properties>
    <spark.version>2.2.1</spark.version>
    <scala.version>2.11.6</scala.version>
  </properties>

  <repositories>
    <repository>
      <id>nexus-aliyun</id>
      <name>Nexus aliyun</name>
      <url>http://maven.aliyun.com/nexus/content/groups/public</url>
    </repository>
  </repositories>

  <dependencies>
    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-library</artifactId>
      <version>${scala.version}</version>
    </dependency>

    <dependency>
      <groupId>org.scala-lang</groupId>
      <artifactId>scala-xml</artifactId>
      <version>2.11.0-M4</version>
    </dependency>

    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-core_2.11</artifactId>
      <version>2.3.0</version>
    </dependency>

    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming_2.11</artifactId>
      <version>2.3.0</version>
    </dependency>

    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
      <version>2.3.0</version>
    </dependency>

    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql_2.11</artifactId>
      <version>2.3.0</version>
    </dependency>

    <dependency>
      <groupId>org.apache.spark</groupId>
      <artifactId>spark-sql-kafka-0-10_2.11</artifactId>
      <version>2.3.0</version>
    </dependency>

    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-client</artifactId>
      <version>2.7.1</version>
    </dependency>

    <dependency>
      <groupId>org.apache.hadoop</groupId>
      <artifactId>hadoop-hdfs</artifactId>
      <version>2.7.1</version>
    </dependency>

    <dependency>
      <groupId>org.apache.hbase</groupId>
      <artifactId>hbase-client</artifactId>
      <version>2.0.0-beta-2</version>
    </dependency>
  </dependencies>

  <build>
    <pluginManagement>
      <plugins>
        <plugin>
          <groupId>net.alchim31.maven</groupId>
          <artifactId>scala-maven-plugin</artifactId>
          <version>3.2.2</version>
        </plugin>
        <plugin>
          <groupId>org.apache.maven.plugins</groupId>
          <artifactId>maven-compiler-plugin</artifactId>
          <version>3.5.1</version>
        </plugin>
      </plugins>
    </pluginManagement>
    <plugins>
      <plugin>
        <groupId>net.alchim31.maven</groupId>
        <artifactId>scala-maven-plugin</artifactId>
        <executions>
          <execution>
            <id>scala-compile-first</id>
            <phase>process-resources</phase>
            <goals>
              <goal>add-source</goal>
              <goal>compile</goal>
            </goals>
          </execution>
          <execution>
            <id>scala-test-compile</id>
            <phase>process-test-resources</phase>
            <goals>
              <goal>testCompile</goal>
            </goals>
          </execution>
        </executions>
      </plugin>

      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-compiler-plugin</artifactId>
        <executions>
          <execution>
            <phase>compile</phase>
            <goals>
              <goal>compile</goal>
            </goals>
          </execution>
        </executions>
      </plugin>

      <plugin>
        <groupId>org.apache.maven.plugins</groupId>
        <artifactId>maven-shade-plugin</artifactId>
        <version>2.4.3</version>
        <executions>
          <execution>
            <phase>package</phase>
            <goals>
              <goal>shade</goal>
            </goals>
            <configuration>
              <filters>
                <filter>
                  <artifact>*:*</artifact>
                  <excludes>
                    <exclude>META-INF/*.SF</exclude>
                    <exclude>META-INF/*.DSA</exclude>
                    <exclude>META-INF/*.RSA</exclude>
                  </excludes>
                </filter>
              </filters>
            </configuration>
          </execution>
        </executions>
      </plugin>
    </plugins>
  </build>

  <reporting>
    <plugins>
      <plugin>
        <groupId>org.scala-tools</groupId>
        <artifactId>maven-scala-plugin</artifactId>
        <configuration>
          <scalaVersion>${scala.version}</scalaVersion>
        </configuration>
      </plugin>
    </plugins>
  </reporting>
</project>
  3. Code implementation: the JSON is parsed with Scala case classes and the json4s package (available transitively through the Spark dependencies):
package edu.zhku


import java.util.Date

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{Connection, ConnectionFactory, Put, Table}
import org.apache.hadoop.hbase.util.Bytes
import org.json4s._
import org.json4s.jackson.JsonMethods._
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent
import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe

/*
 *  @author : 钱伟健 [email protected]
 *  @version : 2018/4/12 21:32.
 *  Notes:
 */
object BehaviorHourly {
    System.setProperty("hadoop.home.dir", "/data/install/apache/hadoop-2.9.0")
    //System.setProperty("hadoop.home.dir", "D:\\program\\hadoop")

    val zookeeperservers = "master:2181,slave1:2181,slave2:2181"
    val tablename = "userHourly"

    //  HBase configuration
    val hbaseconf: Configuration = HBaseConfiguration.create()
    hbaseconf.set("hbase.zookeeper.quorum", zookeeperservers)
    hbaseconf.set("hbase.zookeeper.property.clientPort", "2181")

    // Case classes used to deserialize the JSON records
    case class apptimes(activetime:String, `package`:String)
    case class UserHourly(userId:String, endtime:Long, data: List[(String,Long)])
    case class log(userId:String, day:String, begintime:String,  endtime:Long, data: List[apptimes])


    def main(args: Array[String]): Unit = {
    //def BehaviorHourly() {
        val conf = new SparkConf().setMaster("local[2]").setAppName("behavior")
        // 3-second batch interval
        val ssc = new StreamingContext(conf, Seconds(3))

        val kafkaParams = Map[String, Object](
            "bootstrap.servers" -> "master:9092,slave1:9092,slave2:9092",
            "key.deserializer" -> classOf[StringDeserializer],
            "value.deserializer" -> classOf[StringDeserializer],
            "group.id" -> "use_a_separate_group_id_for_each_stream",
            "auto.offset.reset" -> "latest",
            "enable.auto.commit" -> (false: java.lang.Boolean)
        )

        val topics = Array("behavior")
        val stream = KafkaUtils.createDirectStream[String, String](
            ssc,
            PreferConsistent,
            Subscribe[String, String](topics, kafkaParams)
        )


        stream.map(record => record.value)
                .map(value => {
                    // Implicit formats: use json4s's default converters
                    implicit val formats: DefaultFormats.type = DefaultFormats
                    val json = parse(value)
                    // Extract the JSON object into the log case class
                    json.extract[log]
                }).window(Seconds(3600), Seconds(60))  // Sliding window: every minute, analyze the last hour of data
                .foreachRDD(    // Operate on each RDD produced by the windowed stream
                    rdd => {
                        rdd.foreachPartition(partitionOfRecords => {    // Iterate over the records of each partition
                            // Create one HBase connection per partition; a partition stays on one node, so nothing has to be serialized
                            val connection: Connection = ConnectionFactory.createConnection(hbaseconf)
                            val table: Table = connection.getTable(TableName.valueOf(tablename))
                            partitionOfRecords.foreach(logData => {
                                val theput = new Put(Bytes.toBytes(String.valueOf(new Date().getTime) + "_" + logData.endtime))
                                theput.addColumn(Bytes.toBytes("info"), Bytes.toBytes("userId"), Bytes.toBytes(logData.userId.toString))
                                logData.data.foreach(
                                    appTime => {
                                        theput.addColumn(Bytes.toBytes("info"), Bytes.toBytes(appTime.`package`.toString), Bytes.toBytes(appTime.activetime.toString))
                                    }
                                )
                                table.put(theput)
                            })
                            // Close the table and the connection only after the whole partition has been written
                            table.close()
                            connection.close()
                        })
                    }
                )


        ssc.start()             // Start the computation
        ssc.awaitTermination()  // Wait for the computation to terminate
    }

}
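The job assumes the target table `userHourly` already exists in HBase with an `info` column family. Below is a minimal sketch for creating it with the same hbase-client 2.x API used above; the object name `CreateUserHourlyTable` is just for illustration, and the equivalent HBase shell command would be create 'userHourly', 'info'.

package edu.zhku

import org.apache.hadoop.hbase.{HBaseConfiguration, TableName}
import org.apache.hadoop.hbase.client.{ColumnFamilyDescriptorBuilder, ConnectionFactory, TableDescriptorBuilder}

object CreateUserHourlyTable {
    def main(args: Array[String]): Unit = {
        val conf = HBaseConfiguration.create()
        conf.set("hbase.zookeeper.quorum", "master:2181,slave1:2181,slave2:2181")
        conf.set("hbase.zookeeper.property.clientPort", "2181")

        val connection = ConnectionFactory.createConnection(conf)
        val admin = connection.getAdmin
        val tableName = TableName.valueOf("userHourly")

        // Create the table with a single "info" column family if it does not exist yet
        if (!admin.tableExists(tableName)) {
            val descriptor = TableDescriptorBuilder.newBuilder(tableName)
                    .setColumnFamily(ColumnFamilyDescriptorBuilder.of("info"))
                    .build()
            admin.createTable(descriptor)
        }

        admin.close()
        connection.close()
    }
}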
