Spark Streaming: consuming Kafka data and writing it to HDFS and directly into a Hive table (Scala version)

  1. The Kafka messages being consumed are JSON; a sample message is sketched after the code below.
  2. Without further ado, here is the code.
  3. pom.xml
    
    <project xmlns="http://maven.apache.org/POM/4.0.0"
             xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
             xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
      <modelVersion>4.0.0</modelVersion>
      <groupId>hive_sql</groupId>
      <artifactId>hive_sql</artifactId>
      <version>1.0-SNAPSHOT</version>
      <inceptionYear>2008</inceptionYear>
      <properties>
        <scala.version>2.12.12</scala.version>
      </properties>

      <repositories>
        <repository>
          <id>scala-tools.org</id>
          <name>Scala-Tools Maven2 Repository</name>
          <url>http://scala-tools.org/repo-releases</url>
        </repository>
      </repositories>

      <pluginRepositories>
        <pluginRepository>
          <id>scala-tools.org</id>
          <name>Scala-Tools Maven2 Repository</name>
          <url>http://scala-tools.org/repo-releases</url>
        </pluginRepository>
      </pluginRepositories>

      <dependencies>
        <dependency>
          <groupId>org.scala-lang</groupId>
          <artifactId>scala-library</artifactId>
          <version>2.12.12</version>
        </dependency>

        <dependency>
          <groupId>com.alibaba</groupId>
          <artifactId>fastjson</artifactId>
          <version>1.2.47</version>
        </dependency>

        <dependency>
          <groupId>org.apache.spark</groupId>
          <artifactId>spark-sql_2.12</artifactId>
          <version>2.4.3</version>
        </dependency>

        <dependency>
          <groupId>org.apache.spark</groupId>
          <artifactId>spark-hive_2.12</artifactId>
          <version>2.4.3</version>
        </dependency>

        <dependency>
          <groupId>org.apache.spark</groupId>
          <artifactId>spark-core_2.12</artifactId>
          <version>2.4.3</version>
        </dependency>

        <dependency>
          <groupId>org.apache.spark</groupId>
          <artifactId>spark-streaming_2.12</artifactId>
          <version>2.4.3</version>
        </dependency>
        <dependency>
          <groupId>org.apache.spark</groupId>
          <artifactId>spark-streaming-kafka-0-10_2.12</artifactId>
          <version>2.4.3</version>
        </dependency>
      </dependencies>

      <build>
        <sourceDirectory>src/main/scala</sourceDirectory>
        <testSourceDirectory>src/test/scala</testSourceDirectory>
        <plugins>
          <plugin>
            <groupId>org.scala-tools</groupId>
            <artifactId>maven-scala-plugin</artifactId>
            <executions>
              <execution>
                <goals>
                  <goal>compile</goal>
                  <goal>testCompile</goal>
                </goals>
              </execution>
            </executions>
            <configuration>
              <scalaVersion>${scala.version}</scalaVersion>
              <args>
                <arg>-target:jvm-1.8</arg>
              </args>
            </configuration>
          </plugin>
          <plugin>
            <groupId>org.apache.maven.plugins</groupId>
            <artifactId>maven-eclipse-plugin</artifactId>
            <configuration>
              <downloadSources>true</downloadSources>
              <buildcommands>
                <buildcommand>ch.epfl.lamp.sdt.core.scalabuilder</buildcommand>
              </buildcommands>
              <additionalProjectnatures>
                <projectnature>ch.epfl.lamp.sdt.core.scalanature</projectnature>
              </additionalProjectnatures>
              <classpathContainers>
                <classpathContainer>org.eclipse.jdt.launching.JRE_CONTAINER</classpathContainer>
                <classpathContainer>ch.epfl.lamp.sdt.launching.SCALA_CONTAINER</classpathContainer>
              </classpathContainers>
            </configuration>
          </plugin>
        </plugins>
      </build>
      <reporting>
        <plugins>
          <plugin>
            <groupId>org.scala-tools</groupId>
            <artifactId>maven-scala-plugin</artifactId>
            <configuration>
              <scalaVersion>${scala.version}</scalaVersion>
            </configuration>
          </plugin>
        </plugins>
      </reporting>
    </project>
    
    
    package hive_sql
    import org.apache.spark.SparkConf
    import org.apache.spark.streaming.{Seconds, StreamingContext}
    import org.apache.spark.sql._
    import org.apache.spark.sql.SparkSession
    // streaming
    import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType, DoubleType, FloatType}
    import com.alibaba.fastjson.{JSON,JSONObject,JSONArray}
    import org.apache.spark.streaming.kafka010._
    import org.apache.kafka.common.serialization.StringDeserializer
    import scala.collection.mutable.ArrayBuffer
    import scala.util.Try
    
    object hiv_sql {
      def main(args: Array[String]): Unit = {
        // spark config
        val conf = new SparkConf().setAppName("oe streaming")
        conf.setMaster("local")
        conf.set("spark.testing.memory","2147480000")
        conf.set("spark.executor.instances", "4")
        conf.set("spark.executor.memory", "2g")
        conf.set("spark.driver.allowMultipleContexts", "true")
        conf.set("hive.metastore.uris", "thrift://IP:9083")
        conf.set("spark.sql.warehouse.dir", "hdfs://IP:8020/user/hive/warehouse")
    
        // spark sql
        val spark = SparkSession
          .builder()
          .config(conf)
          .enableHiveSupport()
          .getOrCreate()
        val sql = "show databases"
        val r = spark.sql(sql)
        r.show()
        spark.sql("use report")
    
        // streaming connect
        val ssc = new StreamingContext(spark.sparkContext, Seconds(5))
        ssc.sparkContext.setLogLevel("ERROR")
        val bootstrapServers = "IP:9092,IP:9092,IP:9092"
        val groupId = "test-group-id7"
        val topicName = "event_track"
        val kafkaParams = Map[String, Object](
          "bootstrap.servers" -> bootstrapServers,
          "key.deserializer" -> classOf[StringDeserializer],
          "value.deserializer" -> classOf[StringDeserializer],
          "group.id" -> groupId,//消费者组名
          "auto.offset.reset" -> "earliest", //latest自动重置偏移量为最新的偏移量
          "enable.auto.commit" -> (false: java.lang.Boolean) //如果是true,则这个消费者的偏移量会在后台自动提交
        )
        val kafkaTopicDS = KafkaUtils.createDirectStream(
          ssc,
          LocationStrategies.PreferConsistent,
          ConsumerStrategies.Subscribe[String, String](Set(topicName), kafkaParams)
        )
    
        // df schema
        val schema = StructType(
          List(
            StructField("brandCode", IntegerType, true),
            StructField("brandName", StringType, true),
            StructField("categoryName", StringType, true),
            StructField("cityName", StringType, true),
            StructField("date", StringType, true),
            StructField("endTime", StringType, true),
            StructField("nlogoCode", IntegerType, true),
            StructField("nlogoName", StringType, true),
            StructField("oe", StringType, true),
            StructField("proviceName", StringType, true),
            StructField("startTime", StringType, true),
            StructField("status", StringType, true),
            StructField("userID", StringType, true),
            StructField("userPurchaseId", IntegerType, true),
            StructField("vinCode", StringType, true)
          )
        )
        // kafkaTopicDS.map(_.value).print() // uncomment to inspect the raw messages
        // transform: each message is a JSON object whose "value" field is an array of records;
        // flatten that array and turn every record into a Row matching the schema above
        kafkaTopicDS.map(_.value).foreachRDD(rdd => {

          val trdd = rdd.map(x => JSON.parseObject(x).getJSONArray("value"))
            .map(x => format_value(x))
            .flatMap(x => x)
            .map(x => Row(
              tryToInt(x.getString("brandCode")).getOrElse(null),
              x.getString("brandName"),
              x.getString("categoryName"),
              x.getString("cityName"),
              x.getString("date"),
              x.getString("endTime"),
              tryToInt(x.getString("nlogoCode")).getOrElse(null),
              x.getString("nlogoName"),
              x.getString("oe"),
              x.getString("proviceName"),
              x.getString("startTime"),
              x.getString("status"),
              x.getString("userID"),
              tryToInt(x.getString("userPurchaseId")).getOrElse(null),
              x.getString("vinCode")
            ))
    
          val rddDF = spark.createDataFrame(trdd, schema)
          if (rddDF.count() > 0) {
            // sink 1: append the batch to HDFS as parquet (the matching external-table DDL is sketched after the code)
            val value = rddDF.coalesce(1)
            value.write.format("parquet").mode("append").save("hdfs://IP:8020/user/hive/warehouse/report.db/oe_kafka")
            // sink 2: write directly into the Hive table through a temp view
            rddDF.createOrReplaceTempView("v_table")
            spark.sql("select * from v_table").write.mode("append").saveAsTable("oe_kafka")
          }
        })
    
        // safely convert a string to Int; returns None for null or malformed values
        def tryToInt(s: String) = Try(s.toInt).toOption

        // unpack a fastjson JSONArray into a buffer of JSONObjects
        def format_value(t: JSONArray): ArrayBuffer[JSONObject] = {
          val result = ArrayBuffer[JSONObject]()
          for (i <- 0 until t.size) result.append(t.getJSONObject(i))
          result
        }
    
        // start the streaming job and block until it is terminated
        ssc.start()
        ssc.awaitTermination()
      }
    }
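A few notes on the code above. First, the message format: each Kafka message is a JSON object whose "value" field holds an array of records, and the record fields correspond to the StructType defined in the job. The message in the snippet below is purely illustrative (every value is made up); it only shows the same parseObject/getJSONArray sequence the job performs per message:

    import com.alibaba.fastjson.JSON

    object MessageFormatDemo {
      def main(args: Array[String]): Unit = {
        // Illustrative payload only: the values are invented, the field names match the job's schema.
        val sample =
          """{"value":[{"brandCode":"12","brandName":"BrandA","categoryName":"CategoryA",
            |"cityName":"CityA","date":"2021-01-01","endTime":"2021-01-01 10:00:05",
            |"nlogoCode":"34","nlogoName":"LogoA","oe":"OE123","proviceName":"ProvinceA",
            |"startTime":"2021-01-01 10:00:00","status":"1","userID":"u001",
            |"userPurchaseId":"56","vinCode":"VIN0001"}]}""".stripMargin

        // same calls the streaming job uses before building Rows
        val records = JSON.parseObject(sample).getJSONArray("value")
        val first = records.getJSONObject(0)
        println(first.getString("brandName") + " / " + first.getString("oe"))
      }
    }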
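Second, the two sinks. Each batch is appended as parquet directly under hdfs://IP:8020/user/hive/warehouse/report.db/oe_kafka, and it is also written through the metastore with saveAsTable("oe_kafka") into the report database (selected earlier with use report). For the plain HDFS files to be queryable from Hive, a table has to exist over that path. The job does not include that DDL, so the statement below is only a sketch that assumes the column names follow the StructType above; adjust the types and the location for your cluster:

    // Hypothetical DDL, not part of the original job; run once before starting the stream.
    spark.sql(
      """CREATE EXTERNAL TABLE IF NOT EXISTS report.oe_kafka (
        |  brandCode INT, brandName STRING, categoryName STRING, cityName STRING,
        |  `date` STRING, endTime STRING, nlogoCode INT, nlogoName STRING,
        |  oe STRING, proviceName STRING, startTime STRING, status STRING,
        |  userID STRING, userPurchaseId INT, vinCode STRING)
        |STORED AS PARQUET
        |LOCATION 'hdfs://IP:8020/user/hive/warehouse/report.db/oe_kafka'""".stripMargin)

Note that with the default warehouse layout, saveAsTable on the report database can resolve to the same report.db/oe_kafka directory, so keeping both writes may duplicate rows; in practice one of the two sinks is usually enough. A quick check after a few batches: spark.sql("select count(*) from report.oe_kafka").show().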
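Finally, offsets. With enable.auto.commit set to false, the job never commits consumer offsets back to Kafka, so a restarted run falls back to auto.offset.reset. If offsets should be committed only after a batch has been processed, spark-streaming-kafka-0-10 provides HasOffsetRanges and CanCommitOffsets (both covered by the existing kafka010._ import). The sketch below shows the pattern; it must run on the stream returned by createDirectStream, because the .map(_.value) transformation drops the offset information:

    // Sketch only: process the batch, then commit its offsets asynchronously.
    kafkaTopicDS.foreachRDD { rdd =>
      // capture the offset ranges before any transformation
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges

      // ... build the DataFrame from rdd.map(_.value()) and write it, as in the job body above ...

      // commit the consumed offsets back to Kafka once the batch has been handled
      kafkaTopicDS.asInstanceOf[CanCommitOffsets].commitAsync(offsetRanges)
    }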
    

     
