Spark Streaming near-real-time computation demo

This demo consumes tab-separated ad access logs from Kafka via a direct stream, computes per-location PV and UV in 60-second batches with Spark SQL, and writes the results to a MySQL table.
package com.chexun.statistic

import java.sql.{Connection, DriverManager}
import java.util.Date

import com.chexun.statistic.RealTimeAdv._
import kafka.serializer.StringDecoder
import org.apache.commons.lang.time.DateFormatUtils
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Aggregates the most recent 1 minute of data.
 * Created by hanyiting on 2015/08/13.
 */
object RealtimeCount {

  case class AdvLoging(vtime: Long, userIp: Long, muid: String, uid: String, ucp: String, adurl: String)

  case class Adv(userIp: Long, muid: String, ucp: String, adurl: String, location: String)

  def main(args: Array[String]) {

    val url = "jdbc:mysql://10.0.0.198:3306/test"
    val usr = "test"
    val pwd = "test"

    val sparkConf = new SparkConf().set("spark.streaming.unpersist", "true").set("spark.cleaner.ttl", "43200")
      .setExecutorEnv("SPARK_JAVA_OPTS", "-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps").setAppName("RealtimeCount")
    val sc = new SparkContext(sparkConf)
    // one batch every 60 seconds, matching the minute-level statistics
    val ssc = new StreamingContext(sc, Seconds(60))
    // define the Kafka parameters; the broker list must be specified
    val kafkaParams = Map("metadata.broker.list" -> "10.0.0.37:9092,10.0.0.30:9092,10.0.0.35:9092,10.0.0.26:9092,10.0.0.27:9092")

    // define which topics to read from
    val topics = Set("chexun1", "chexun2")

    // direct (receiver-less) Kafka stream; keep only the message value, i.e. the raw log line
    val lines = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics).map(_._2)

    // Drop records with an empty adurl, extract the bare URL and the ad location
    // from adurl, then drop records whose location is empty. Field indices
    // (0 = uid, 1 = muid, 3 = ucp, 8 = user IP, 9 = timestamp, 24 = adurl)
    // follow the tab-separated log layout.
    val tmpdf = lines.map(_.split("\t"))
      .map(x => AdvLoging(x(9).toLong, x(8).toLong, x(1), x(0), x(3), x(24)))
      .filter(y => y.adurl != null && !y.adurl.equals("null"))
      .map(x => Adv(x.userIp, x.muid, x.ucp, getUrl(x.adurl), getLocation(x.adurl)))
      .filter(z => z.location != null && !"".equals(z.location))
    tmpdf.foreachRDD { rdd =>
      val sqlContext = new org.apache.spark.sql.SQLContext(sc)
      import sqlContext.implicits._
      // registerTempTable returns Unit, so there is nothing to assign here
      rdd.toDF().registerTempTable("adv")
      // current time, truncated to the minute
      val stattime = DateFormatUtils.format(new Date, "yyyy-MM-dd HH:mm:00")
      // group by ad location and compute PV (row count) and UV (distinct muid)
      sqlContext.sql("select location, count(*), count(distinct muid) from adv group by location").foreachPartition(
        datas => {
          val conn: Connection = DriverManager.getConnection(url, usr, pwd)
          val pstat = conn.prepareStatement("insert into loging_adv_realtime(stat_time,location,pv,uv) values (?,?,?,?)")
          try {
            for (data <- datas) {
              pstat.setString(1, stattime)
              pstat.setString(2, data(0).toString)
              pstat.setString(3, data(1).toString)
              pstat.setString(4, data(2).toString)
              pstat.executeUpdate()
            }
          } finally {
            // always release JDBC resources so executors do not leak connections
            pstat.close()
            conn.close()
          }
        }
      )
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
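
The helpers getUrl and getLocation are imported from com.chexun.statistic.RealTimeAdv, which is not shown in this post. A minimal sketch of what they might look like, assuming the ad URL carries its page position in a "location" query parameter (both the parameter name and the parsing are assumptions, not the original implementation):

package com.chexun.statistic

object RealTimeAdv {

  // Strip the query string, keeping only the bare ad URL (assumed behavior).
  def getUrl(adurl: String): String = {
    val idx = adurl.indexOf('?')
    if (idx >= 0) adurl.substring(0, idx) else adurl
  }

  // Pull the assumed "location" parameter out of the query string;
  // returns "" when it is absent, which the streaming job then filters out.
  def getLocation(adurl: String): String = {
    val idx = adurl.indexOf('?')
    if (idx < 0) return ""
    adurl.substring(idx + 1).split("&")
      .map(_.split("=", 2))
      .collectFirst { case Array("location", v) => v }
      .getOrElse("")
  }
}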

 

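The job assumes the target table loging_adv_realtime already exists. A sketch of one possible schema, created over the same JDBC settings; the column types are assumptions (the job binds every value with setString, so MySQL coerces pv and uv on insert):

import java.sql.DriverManager

object CreateStatTable {
  def main(args: Array[String]): Unit = {
    val conn = DriverManager.getConnection("jdbc:mysql://10.0.0.198:3306/test", "test", "test")
    try {
      // assumed schema: one row per (minute, ad location) with its PV and UV
      conn.createStatement().execute(
        """CREATE TABLE IF NOT EXISTS loging_adv_realtime (
          |  stat_time DATETIME,
          |  location  VARCHAR(64),
          |  pv        BIGINT,
          |  uv        BIGINT
          |)""".stripMargin)
    } finally {
      conn.close()
    }
  }
}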