package com.chexun.statistic

import java.sql.{Connection, DriverManager}
import java.util.Date

import com.chexun.statistic.RealTimeAdv._
import kafka.serializer.StringDecoder
import org.apache.commons.lang.time.DateFormatUtils
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}

/**
 * Records per-location ad PV/UV for the most recent 1-minute window.
 * Created by hanyiting on 2015/08/13.
 */
object RealtimeCount {

  // Raw ad log record parsed from a tab-separated Kafka message
  case class AdvLoging(vtime: Long, userIp: Long, muid: String, uid: String, ucp: String, adurl: String)

  // Cleaned record with the ad location extracted from adurl
  case class Adv(userIp: Long, muid: String, ucp: String, adurl: String, location: String)

  def main(args: Array[String]) {
    val url = "jdbc:mysql://10.0.0.198:3306/test"
    val usr = "test"
    val pwd = "test"

    val sparkConf = new SparkConf()
      .set("spark.streaming.unpersist", "true")
      .set("spark.cleaner.ttl", "43200")
      .setExecutorEnv("SPARK_JAVA_OPTS", "-verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps")
      .setAppName("RealtimeCount")
    val sc = new SparkContext(sparkConf)
    // One batch per minute, matching the 1-minute stat granularity
    val ssc = new StreamingContext(sc, Seconds(60))

    // Kafka parameters: the broker list must be specified for the direct stream
    val kafkaParams = Map("metadata.broker.list" -> "10.0.0.37:9092,10.0.0.30:9092,10.0.0.35:9092,10.0.0.26:9092,10.0.0.27:9092")
    // Topics to read from
    val topics = Set("chexun1", "chexun2")
    val lines = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
      .map(_._2)

    // Drop records whose adurl is empty, extract the ad location from adurl
    // (getUrl / getLocation come from RealTimeAdv, imported above),
    // then drop records whose location is empty
    val tmpdf = lines.map(_.split("\t"))
      .map(x => AdvLoging(x(9).toLong, x(8).toLong, x(1), x(0), x(3), x(24)))
      .filter(y => y.adurl != null && !y.adurl.equals("null"))
      .map(x => Adv(x.userIp, x.muid, x.ucp, getUrl(x.adurl), getLocation(x.adurl)))
      .filter(z => z.location != null && !z.location.isEmpty)

    tmpdf.foreachRDD { rdd =>
      val sqlContext = new org.apache.spark.sql.SQLContext(sc)
      import sqlContext.implicits._
      rdd.toDF().registerTempTable("adv")

      // Current time, truncated to the minute, used as the stat window key
      val stattime = DateFormatUtils.format(new Date, "yyyy-MM-dd HH:mm:00")

      // Group by ad location and compute PV (row count) and UV (distinct muid),
      // then write one row per location into MySQL
      sqlContext.sql("select location, count(*), count(distinct muid) from adv group by location")
        .foreachPartition { datas =>
          val conn: Connection = DriverManager.getConnection(url, usr, pwd)
          val pstat = conn.prepareStatement(
            "insert into loging_adv_realtime(stat_time, location, pv, uv) values (?, ?, ?, ?)")
          try {
            for (data <- datas) {
              pstat.setString(1, stattime)
              pstat.setString(2, data(0).toString)
              pstat.setString(3, data(1).toString)
              pstat.setString(4, data(2).toString)
              pstat.executeUpdate()
            }
          } finally {
            // Release JDBC resources even if a write fails
            pstat.close()
            conn.close()
          }
        }
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
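
/*
 * Sketch of the MySQL sink table assumed by the INSERT above. The column
 * names follow from the prepared statement; the column types and sizes are
 * assumptions, not taken from the source, and may differ from the real schema.
 *
 *   CREATE TABLE loging_adv_realtime (
 *     stat_time VARCHAR(19) NOT NULL,  -- minute-truncated, "yyyy-MM-dd HH:mm:00"
 *     location  VARCHAR(64) NOT NULL,  -- ad slot parsed from adurl
 *     pv        BIGINT      NOT NULL,  -- row count per location per minute
 *     uv        BIGINT      NOT NULL   -- distinct muid count per location per minute
 *   );
 */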