Spark Streaming 消费 Kafka

Spark Streaming 消费 Kafka 有两种方式:基于 Receiver 的方式,以及不使用 Receiver 的直连(Direct,即 no receivers)方式。这里使用 no receivers 方式:

使用no receivers方式消费kafka

Spark 版本:2.4.0-cdh6.2.0
使用 Spark Streaming 消费 Kafka 数据并写入 Hive 中:

  package com.pica.bi.zhuque.stream

  import java.sql.Timestamp
  import java.util.Date

  import com.alibaba.fastjson.{JSON, JSONArray, JSONObject}
  import org.apache.hadoop.hive.ql.exec.UDF
  import org.apache.kafka.clients.consumer.ConsumerRecord
  import org.apache.spark.SparkConf
  import org.apache.spark.sql.{DataFrame, Row, SparkSession, types}
  import org.apache.spark.streaming.{Seconds, StreamingContext}
  import org.apache.kafka.common.serialization.StringDeserializer
  import org.apache.spark.rdd.RDD
  import org.apache.spark.sql.types.{IntegerType, StringType, StructField, TimestampType}
  import org.apache.spark.streaming.dstream.InputDStream
  import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}

  import scala.collection.mutable.{ArrayBuffer, ListBuffer}

  /**
   * Spark Streaming job that reads log lines from Kafka through the direct
   * (receiver-less) API, extracts the JSON payload that follows the "json:"
   * marker, and appends one row per element of the payload's "datas" array
   * to a Hive table.
   */
  object ParseLog {
    /** Column names of the produced rows, in schema order. */
    val traceFields = List(
      "id",
      "remark1",
      "remark2",
      "remark3",
      "remark4",
      "remark5",
      "created_time")

    /**
     * Entry point: builds the Hive-enabled session, subscribes to Kafka and
     * writes every micro-batch to Hive until the streaming context terminates.
     *
     * @param args command-line arguments (unused)
     */
    def main(args: Array[String]): Unit = {
      // spark.serializer is read when the SparkContext is created, so it has to be
      // supplied through the builder; setting it on the live session has no effect.
      val spark = SparkSession
        .builder()
        // .master("local[*]")
        .appName("Spark SQL To Hive")
        .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
        .enableHiveSupport()
        .getOrCreate()

      // One micro-batch every 120 seconds.
      val ssc = new StreamingContext(spark.sparkContext, Seconds(120))
      val kafkaParams = Map[String, Object](
        "bootstrap.servers" -> "broker1:9092",
        "key.deserializer" -> classOf[StringDeserializer],
        "value.deserializer" -> classOf[StringDeserializer],
        "group.id" -> "gp1101",
        // When the group has no committed offset, start from the earliest available record.
        "auto.offset.reset" -> "earliest",
        // Offsets can be maintained automatically: before Kafka 0.10 they were kept in
        // ZooKeeper, from 0.10 on they are kept in the __consumer_offsets topic.
        // Automatic offset commits are enabled here.
        // NOTE(review): the Spark Kafka 0.10 integration guide recommends disabling auto
        // commit and committing offsets only after the output has been stored; with auto
        // commit a failed batch may be lost — confirm this is acceptable for this job.
        "enable.auto.commit" -> (true: java.lang.Boolean)
      )
      val topics = Array("test1")

      // Direct (receiver-less) stream.
      val stream: InputDStream[ConsumerRecord[String, String]] =
        KafkaUtils.createDirectStream[String, String](
          ssc,
          LocationStrategies.PreferConsistent,
          ConsumerStrategies.Subscribe[String, String](topics, kafkaParams))

      stream
        .map(_.value())
        // Kafka records may carry a null value; guard before looking for the marker.
        .filter(value => value != null && value.contains("json:"))
        .foreachRDD { batch =>
          val rows = batch.flatMap(line => parseLine(line))
          writeToHive(spark, createDf(spark, rows))
        }

      ssc.start()
      ssc.awaitTermination()
    }

    /**
     * Converts one log line into rows, one per element of the "datas" array found in
     * the JSON text that follows the first "json:" marker.
     *
     * Lines without a payload, without a "datas" array, or whose payload cannot be
     * parsed yield no rows instead of failing the batch.
     *
     * @param line raw log line containing the "json:" marker
     * @return the rows extracted from the line, possibly empty
     */
    private def parseLine(line: String): List[Row] = {
      import scala.util.{Failure, Success, Try}

      Try {
        // Drop a single trailing ']' or ')' that wraps the log line.
        val trimmed =
          if (line.endsWith("]") || line.endsWith(")")) line.dropRight(1) else line
        // split drops trailing empty segments, so a line ending in the marker has no payload.
        val parts = trimmed.split("json:")
        if (parts.length < 2) {
          List.empty[Row]
        } else {
          Option(JSON.parseObject(parts(1)))
            .flatMap(payload => Option(payload.getJSONArray("datas")))
            .map { datas =>
              (0 until datas.size())
                .flatMap(i => Option(datas.getJSONObject(i)))
                .map(toRow)
                .toList
            }
            .getOrElse(List.empty[Row])
        }
      } match {
        case Success(rows) => rows
        case Failure(error) =>
          // A single malformed record must not bring down the streaming job.
          System.err.println(s"ParseLog: skipping unparsable record: ${error.getMessage}")
          List.empty[Row]
      }
    }

    /**
     * Builds a row for one element of "datas", with exactly one value per entry of
     * [[traceFields]]: `id` is always 0, `created_time` is the current time, and
     * every other column is the element's string value or "" when absent or null.
     */
    private def toRow(record: JSONObject): Row = {
      val createdTime = new Timestamp(System.currentTimeMillis())
      Row.fromSeq(traceFields.map {
        case "id" => 0 // id defaults to 0
        case "created_time" => createdTime
        case field => Option(record.getString(field)).getOrElse("")
      })
    }

    /**
     * Wraps the rows in a DataFrame whose schema is derived from [[traceFields]]:
     * `created_time` is a timestamp, `id` an integer, everything else a string;
     * all columns are declared non-nullable. The schema is printed to stdout.
     *
     * @param spark active session
     * @param rdd   rows holding one value per entry of [[traceFields]], in order
     * @return the DataFrame over `rdd`
     */
    def createDf(spark: SparkSession, rdd: RDD[Row]): DataFrame = {
      val fields = traceFields.map {
        case name @ "created_time" => StructField(name, TimestampType, nullable = false)
        case name @ "id" => StructField(name, IntegerType, nullable = false)
        case name => StructField(name, StringType, nullable = false)
      }
      val schema = types.StructType(fields)
      println(schema)
      spark.createDataFrame(rdd, schema)
    }

    /**
     * Appends the DataFrame to the `created_day` partition of `pica_temp.table`
     * by registering it as the temp view `temp_table` and running an INSERT.
     *
     * @param spark active Hive-enabled session
     * @param df    data to append; its column order must match the target table
     * @param tday  partition day in `yyyy-MM-dd` form; defaults to the previously
     *              hard-coded value so existing callers are unaffected
     * @throws IllegalArgumentException if `tday` is not in `yyyy-MM-dd` form
     */
    def writeToHive(spark: SparkSession, df: DataFrame, tday: String = "2019-11-08"): Unit = {
      // The day is spliced into the SQL text, so reject anything that is not a plain date.
      require(tday.matches("""\d{4}-\d{2}-\d{2}"""), s"invalid partition day: $tday")
      df.createOrReplaceTempView("temp_table")
      val sql = s"insert into pica_temp.table partition(created_day='${tday}') select * from temp_table"
      println(s"准备执行:${sql}")
      spark.sql(sql)
    }
  }

spark streaming编程指南
  • 编程指南:http://spark.apache.org/docs/latest/streaming-programming-guide.html

你可能感兴趣的:(spark)