Industrial IoT: Spark Streaming + Kafka + MQTT

1. Overview

Industrial IoT devices publish their data over MQTT to an EMQ (emqtt) broker; the messages are forwarded from EMQ into Kafka, and Spark Streaming consumes the Kafka topic and joins each micro-batch with reference tables kept in Oracle.
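For reference, the producing side can be as simple as publishing a JSON payload to the EMQ broker; how EMQ forwards those messages into Kafka depends on the broker setup (typically a Kafka bridge plugin) and is outside the scope of this demo. Below is a minimal sketch assuming the Eclipse Paho MQTT client (not part of this project's pom); the broker address, topic, and payload values are placeholders, and the JSON fields mirror the CrashAlert case class used in the demo below.

// Publisher-side sketch (assumption: Eclipse Paho MQTT client, org.eclipse.paho.client.mqttv3,
// which is NOT listed in the pom below; broker address, topic and payload values are placeholders)
import org.eclipse.paho.client.mqttv3.{MqttClient, MqttConnectOptions, MqttMessage}
import org.eclipse.paho.client.mqttv3.persist.MemoryPersistence

object MqttPublishSketch {
  def main(args: Array[String]): Unit = {
    val client = new MqttClient("tcp://xxx:1883", "gps-demo-publisher", new MemoryPersistence())
    val options = new MqttConnectOptions()
    options.setCleanSession(true)
    client.connect(options)

    // Fields mirror the CrashAlert case class; the alertTime format must match what Gson
    // expects on the consuming side (Gson's default date format is assumed here)
    val payload =
      """{"num1":"E001","num2":"E002","tp":1,"status":1,
        |"jd1":113.92,"wd1":22.53,"jd2":113.93,"wd2":22.54,
        |"alertTime":"Jan 1, 2019 8:00:00 AM"}""".stripMargin
    val msg = new MqttMessage(payload.getBytes("UTF-8"))
    msg.setQos(1)
    client.publish("pocGPS01", msg) // EMQ topic; a bridge forwards it into the Kafka topic pocGPS01
    client.disconnect()
  }
}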

2. Demo

package streamTest

import java.util.concurrent.Future
import java.util.{Date, Properties}

import com.google.gson.Gson
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord, RecordMetadata}
import org.apache.kafka.common.serialization.{StringDeserializer, StringSerializer}
import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.{Durations, Seconds, StreamingContext}
import org.apache.spark.streaming.kafka010._

object KafkaStreamTest {
  def main(args: Array[String]): Unit = {
    // JDBC connection details for the Oracle reference tables
    val property = new Properties()
    val url = "jdbc:oracle:thin:@//xxx:1634/GPS"
    property.put("user", "GPS2")
    property.put("password", "123456")
    // Create one SparkSession and build the StreamingContext from its SparkContext,
    // so only a single SparkContext exists (no need for spark.driver.allowMultipleContexts)
    val sparkSession = SparkSession.builder().appName("kafkaStreamTest").enableHiveSupport().getOrCreate()
    val ssc = new StreamingContext(sparkSession.sparkContext, Durations.seconds(1))
    // countByWindow below keeps state across batches and requires a checkpoint directory (placeholder path)
    ssc.checkpoint("/tmp/kafkaStreamTest-checkpoint")
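    // Reference data from Oracle: equipment/shift table (t_rfrun) and driver table (t_driver),
    // joined once on WORKNUM and cached so every micro-batch can reuse it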
    val rfrunDF=sparkSession.read.jdbc(url,"t_rfrun",property)
    val driverDF=sparkSession.read.jdbc(url,"t_driver",property)
    val msgDF=rfrunDF.join(driverDF,rfrunDF("WORKNUM")===driverDF("WORKNUM"),"inner")
      .select(rfrunDF("equipnum"),rfrunDF("worknum"),rfrunDF("worktime"),rfrunDF("unworktime"),driverDF("name"),driverDF("remark2"))
    msgDF.persist()
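    // Kafka source configuration: subscribe to the GPS topic; auto-commit is disabled,
    // so offsets are not persisted anywhere in this demo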
    val topics = Array("pocGPS01")
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "xxx:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "stream",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )
    val  kafkaStream: InputDStream[ConsumerRecord[String, String]] = KafkaUtils.createDirectStream[String, String](
      ssc,
      LocationStrategies.PreferConsistent,
      ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
    )

    // Parse each Kafka record into a CrashAlert and compute the distance between the two devices
    val receiveStream = kafkaStream
      .filter(_.value().nonEmpty)
      .map(x => handleJson2CaseClass(x.value()))
      .mapPartitions(iter => {
        iter.map(x => {
          val distance = getDistance(x.jd1, x.wd1, x.jd2, x.wd2)
          // Convert java.util.Date to java.sql.Timestamp so the value can be placed in a DataFrame row
          (x.num1, x.num2, distance, new java.sql.Timestamp(x.alertTime.getTime))
        })
      })
    val crashSchema = StructType(List(
      StructField("num1", StringType, false),
      StructField("num2", StringType, false),
      StructField("distance", DoubleType, false),
      StructField("alertTime", TimestampType, false)))

    // For each micro-batch, keep pairs closer than 40 m (0.04 km), join them with the Oracle
    // reference data, and keep only alerts raised during the driver's working hours
    val resultWorker = receiveStream.transform(rdd => {
      val crash = rdd.map(x => Row(x._1, x._2, x._3, x._4))
      val crashDF = sparkSession.createDataFrame(crash, crashSchema)
      crashDF.filter("distance <= 0.04")
        .join(msgDF, crashDF("num1") === msgDF("equipnum"), "left_outer")
        .filter("alertTime <= unworktime and alertTime >= worktime")
        .select(msgDF("worknum"), msgDF("name"), msgDF("remark2"), msgDF("worktime"), msgDF("unworktime"), crashDF("alertTime"))
        .rdd
    })

    // Sliding count of alerts within a 3-minute window, evaluated every 30 seconds
    val streamAlertWindowRdd1 = receiveStream.filter(_._3 < 0.04).countByWindow(Seconds(180), Seconds(30))

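    // Broadcast a serializable KafkaSink so each executor lazily creates a single Kafka producer
    // (see the KafkaSink helper below)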
    val kafkaProducer: Broadcast[KafkaSink[String, String]] = {
      val kafkaProducerConfig = {
        val p = new Properties()
        p.setProperty("bootstrap.servers", "100.69.149.210:9092")
        p.setProperty("key.serializer", classOf[StringSerializer].getName)
        p.setProperty("value.serializer", classOf[StringSerializer].getName)
        p.setProperty("acks","0")
        p.setProperty("buffer.memory","102400")
        p.setProperty("batch.size","1000")
        p
      }
      ssc.sparkContext.broadcast(KafkaSink[String, String](kafkaProducerConfig))
    }
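    // Write every matched collision/proximity alert back to Kafka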
    resultWorker.foreachRDD(rdd => {
      if (!rdd.isEmpty) {
        rdd.foreach(record => {
          kafkaProducer.value.send("ssc_test_1",record.toString())
          // do something else
        })
      }
    })
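    // Write the sliding-window alert count back to Kafka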
    streamAlertWindowRdd1.foreachRDD(rdd => {
      if (!rdd.isEmpty) {
        rdd.foreach(record => {
          kafkaProducer.value.send("ssc_test_window_1", "alert count in the last 3 minutes: " + record.toString())
          // do something else
        })
      }
    })

    ssc.start()
    ssc.awaitTermination()

  }


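  // JSON payload sent by the devices: two equipment numbers, type/status fields,
  // two coordinate pairs, and the alert time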
  case class CrashAlert(num1 :String,num2 :String,tp: Int,status: Int,jd1: Double,wd1: Double,jd2: Double,wd2: Double,alertTime: Date)
  def handleJson2CaseClass(jsonStr: String): CrashAlert = {
    val gson = new Gson()
    gson.fromJson(jsonStr, classOf[CrashAlert])
  }
  // Great-circle distance between two GPS points in kilometres (haversine formula)
  def getDistance(jd1: Double, wd1: Double, jd2: Double, wd2: Double): Double = {
    if (jd1 != 0 && wd1 != 0 && jd2 != 0 && wd2 != 0) {
      val R = 6378.137 // earth radius in km
      val radLat1 = jd1 * Math.PI / 180
      val radLat2 = jd2 * Math.PI / 180
      val a = radLat1 - radLat2
      val b = wd1 * Math.PI / 180 - wd2 * Math.PI / 180
      val s = 2 * Math.asin(Math.sqrt(Math.pow(Math.sin(a / 2), 2) + Math.cos(radLat1) * Math.cos(radLat2) * Math.pow(Math.sin(b / 2), 2)))
      s * R
    } else {
      // Treat missing coordinates as zero distance
      0.0
    }
  }

  class KafkaSink[K, V](createProducer: () => KafkaProducer[K, V]) extends Serializable {
    /* This is the key idea that allows us to work around running into
       NotSerializableExceptions. */
    lazy val producer = createProducer()
    def send(topic: String, key: K, value: V): Future[RecordMetadata] =
      producer.send(new ProducerRecord[K, V](topic, key, value))
    def send(topic: String, value: V): Future[RecordMetadata] =
      producer.send(new ProducerRecord[K, V](topic, value))
  }

  object KafkaSink {

    import scala.collection.JavaConversions._

    def apply[K, V](config: Map[String, Object]): KafkaSink[K, V] = {
      val createProducerFunc = () => {
        val producer = new KafkaProducer[K, V](config)
        sys.addShutdownHook {
          // Ensure that, on executor JVM shutdown, the Kafka producer sends
          // any buffered messages to Kafka before shutting down.
          producer.close()
        }
        producer
      }
      new KafkaSink(createProducerFunc)
    }

    def apply[K, V](config: java.util.Properties): KafkaSink[K, V] = apply(config.toMap)
  }

}

Maven dependencies
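Note: the Oracle JDBC driver needed by spark.read.jdbc is not listed in this pom; it has to be supplied separately (for example via spark-submit --jars).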



<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.cmft</groupId>
    <artifactId>testSpark</artifactId>
    <version>1.0-SNAPSHOT</version>

    <repositories>
        <repository>
            <id>cloudera</id>
            <url>https://repository.cloudera.com/artifactory/cloudera-repos/</url>
        </repository>
        <repository>
            <id>cmhk.mirror</id>
            <name>cmhk mirror.</name>
            <url>xxx</url>
        </repository>
        <repository>
            <id>nexus-cmft</id>
            <name>cmft repository</name>
            <url>http://xxx/</url>
        </repository>
    </repositories>

    <pluginRepositories>
        <pluginRepository>
            <id>nexus-cmft</id>
            <name>cmft repository</name>
            <url>xxx</url>
        </pluginRepository>
    </pluginRepositories>

    <properties>
        <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
        <spark.version>2.3.2</spark.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-client</artifactId>
            <version>2.6.0-cdh5.12.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hbase</groupId>
            <artifactId>hbase-client</artifactId>
            <version>1.2.0-cdh5.12.1</version>
        </dependency>

        <dependency>
            <groupId>com.alibaba</groupId>
            <artifactId>fastjson</artifactId>
            <version>1.2.28</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_2.11</artifactId>
            <version>${spark.version}</version>
            <exclusions>
                <exclusion>
                    <groupId>org.tachyonproject</groupId>
                    <artifactId>tachyon-client</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>org.apache.curator</groupId>
                    <artifactId>curator-recipes</artifactId>
                </exclusion>
                <exclusion>
                    <groupId>commons-codec</groupId>
                    <artifactId>commons-codec</artifactId>
                </exclusion>
            </exclusions>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-streaming-kafka-0-10_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>net.sf.json-lib</groupId>
            <artifactId>json-lib</artifactId>
            <version>2.3</version>
            <classifier>jdk15</classifier>
        </dependency>
        <dependency>
            <groupId>org.json4s</groupId>
            <artifactId>json4s-core_2.10</artifactId>
            <version>3.2.10</version>
        </dependency>
        <dependency>
            <groupId>org.json4s</groupId>
            <artifactId>json4s-jackson_2.10</artifactId>
            <version>3.2.10</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-hive_2.11</artifactId>
            <version>${spark.version}</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.scala-tools</groupId>
                <artifactId>maven-scala-plugin</artifactId>
                <version>2.15.2</version>
                <configuration>
                    <recompileMode>modified-only</recompileMode>
                </configuration>
                <executions>
                    <execution>
                        <id>main-scalac</id>
                        <phase>process-resources</phase>
                        <goals>
                            <goal>add-source</goal>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                    <execution>
                        <id>scala-test-compile</id>
                        <phase>process-test-resources</phase>
                        <goals>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.6.0</version>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <version>3.1</version>
                <executions>
                    <execution>
                        <phase>compile</phase>
                        <goals>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
        </plugins>
        <directory>target</directory>
        <outputDirectory>target/classes</outputDirectory>
        <testOutputDirectory>target/test-classes</testOutputDirectory>
        <sourceDirectory>src</sourceDirectory>
    </build>
</project>

