日志的分析

package hadoop

import java.security.MessageDigest
import java.text.SimpleDateFormat

import IPInfo.IP
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, SQLContext, SparkSession}
import org.apache.spark.{SparkConf, SparkContext}
import java.util.Properties

import com.sun.deploy.net.URLEncoder

/**
  * Created by sicong on 2017/1/13.
  */
object YiTianYiKeAnalyser {
       val prop = {
         // JDBC connection properties shared by the MySQL writers in this object:
         // user "root" with an empty password.
         val credentials = new Properties()
         credentials.put("user", "root")
         credentials.put("password", "")
         credentials
       }
  /**
    * One parsed access-log line, as produced by `parses`.
    *
    * `user` is the hash of ip and user agent, `restime`/`code`/`size` come from the
    * 3rd/9th/10th space-separated fields of the line, `firm`/`device`/`rom` come from
    * the user agent and `ke` is the resource key extracted from the request URL.
    */
  case class Record(
      user: String,
      ip: String,
      country: String,
      province: String,
      city: String,
      restime: Int,
      time: Long,
      code: Int,
      size: Long,
      firm: String,
      device: String,
      rom: String,
      ke: String)

  /** Number of records (`num`) seen for a `device` in a `city`. */
  case class Devices(
      city: String,
      num: Int,
      device: String)

  /** IP-to-region mapping row; not referenced by the visible code. */
  case class Ipmap(
      ip: String,
      provinceCode: Int,
      cityCode: Int,
      province: String,
      city: String)

  /** Total traffic (`flow`) attributed to a `city`. */
  case class CityFlow(
      city: String,
      flow: Long)

  /** Number of requests (`num`) that returned HTTP status `code`. */
  case class StatusCode(
      code: Int,
      num: Int)

  /** Per-thread helpers: neither SimpleDateFormat nor MessageDigest is thread-safe. */
  case class ThreadCache(
      dateParser: SimpleDateFormat,
      sha1Digester: MessageDigest)

  // Holds one ThreadCache per thread; stays empty (null) until setThreadCache() runs on that thread.
  val threadSafeCache: ThreadLocal[ThreadCache] = new ThreadLocal[ThreadCache]()
  // IP lookup helper used by getIpInfohga.
  val Iphelpk: IP = new IP()

  /**
    * Looks up `Str` with the shared `Iphelpk` helper.
    *
    * @param Str the value passed to `IP.mains` (presumably an IPv4 address - confirm against IP)
    * @return whatever `IP.mains` returns for it
    */
  def getIpInfohga(Str: String): String =
    Iphelpk.mains(Str)

  /**
    * Entry point of the analysis: creates (or reuses) a local 4-thread SparkSession,
    * runs `parseLog` on the hard-coded log file and always stops the session afterwards,
    * so the Spark context and its threads are released even when parsing fails.
    */
  def logbegin(): Unit = {
    val spark = SparkSession
      .builder()
      .appName("Spark SQL Example").master("local[4]")
      .config("spark.some.config.option", "some-value")
      .getOrCreate()
    try {
//    readsp(spark)
      parseLog("/Users/sicong/Downloads/yitianyike.txt", spark)
    } finally {
      spark.stop()
    }
  }

  /**
    * Lists the log files that still have to be parsed.
    *
    * @return currently a single hard-coded path; see the TODO below
    */
  def getNeedParseLog(): Array[String] = {
    // TODO
    // 1. current time minus 7 hours; 2. current time minus 12 hours;
    // use 1 and 2 as the time range to query the list of logs.
    // Compare that list with the recent processing records: if a log has not been
    // processed yet, parse it and mark it as processed once parsing succeeds.
    val pendingLog = "/Users/Simon/Downloads/7xna64.com2.z0.glb.qiniucdn.com_2017-01-03-23_0602"
    Array(pendingLog)
  }

  /**
    * Appends a Dataset of [[Devices]] to a MySQL table over JDBC.
    *
    * @param kk the rows to store
    * @param s  the target table name
    */
  def logdevicesMysql(kk: Dataset[Devices], s: String): Unit = {
    val jdbcUrl = "jdbc:mysql://localhost:3306/test1?useUnicode=true&characterEncoding=utf8"
    val credentials = new Properties()
    credentials.put("user", "root")
    credentials.put("password", "")
    val writer = kk.write.mode("append")
    writer.jdbc(jdbcUrl, s, credentials)
  }

  /**
    * Sums the response sizes per city and appends the totals to the MySQL table
    * `test1.CityFlow`. Each total is divided by 1024 and rounded to the nearest Long.
    *
    * The sum is computed with `reduceByKey`, which combines values on the map side,
    * instead of `groupByKey`, which would shuffle every single size across the cluster.
    *
    * @param logrdd       the parsed log records
    * @param sparkSession session providing the Dataset encoders
    */
  def CizeFlowStatus(logrdd: RDD[Record],sparkSession: SparkSession): Unit = {
    import sparkSession.implicits._
    val flowByCity = logrdd
      .map(record => (record.city, record.size))
      .reduceByKey(_ + _)
      .map { case (city, bytes) => CityFlow(city, (bytes.toDouble / 1024).round) }
      .toDS()
    flowByCity.write.mode("append").jdbc("jdbc:mysql://localhost:3306/test1?useUnicode=true&characterEncoding=utf8", "test1.CityFlow",prop)
  }
  /**
    * Counts the records per HTTP status code and appends the counts to the MySQL
    * table `test1.StatusCode`.
    *
    * Counting is done by reducing 1s per key, so only small integers are shuffled;
    * the previous `groupByKey` moved every full Record just to take the group size.
    *
    * @param logrdd       the parsed log records
    * @param sparkSession session providing the Dataset encoders
    */
  def HttpStatusCode(logrdd:RDD[Record],sparkSession: SparkSession): Unit ={
    import sparkSession.implicits._
    val statusCounts = logrdd
      .map(record => (record.code, 1))
      .reduceByKey(_ + _)
      .map { case (code, hits) => StatusCode(code, hits) }
      .toDS()
    statusCounts.write.mode("append").jdbc("jdbc:mysql://localhost:3306/test1?useUnicode=true&characterEncoding=utf8", "test1.StatusCode",prop)

  }
  /**
    * Prints `(hits, key + city)` pairs, most requested first, where the key is the
    * resource key concatenated with the city name.
    *
    * Hits are counted with `reduceByKey` rather than by grouping whole records, and
    * the result is sorted into a single partition so the output order is the sort order.
    *
    * @param logrdd       the parsed log records
    * @param sparkSession unused here; kept so the signature matches the other statistics
    */
  def cityTopURL(logrdd:RDD[Record],sparkSession: SparkSession): Unit ={
    logrdd
      .map(record => (record.ke + record.city, 1))
      .reduceByKey(_ + _)
      .map { case (keyAndCity, hits) => (hits, keyAndCity) }
      .sortBy(pair => pair, ascending = false, numPartitions = 1)
      .foreach(pair => println(pair))
  }
  /**
    * Loads the access log at `url`, parses every line into a [[Record]] and caches
    * the resulting RDD for the statistics below.
    *
    * `parses` throws on malformed lines (missing fields, non-numeric values), so each
    * line is parsed inside a `Try`: a bad line is skipped instead of failing the job.
    * All statistics are currently commented out, so no Spark action is triggered
    * and nothing is actually read until one of them is enabled.
    *
    * @param url   path of the log file
    * @param spark the active session
    */
  def parseLog(url: String, spark: SparkSession): Unit = {

    import spark.implicits._
    val rawLines = spark.sparkContext
      .textFile(url)
    val logrdd = rawLines.flatMap { line =>
      // Drop lines that fail to parse, and any null result, rather than aborting the job.
      scala.util.Try(parses(line)).toOption.filter(_ != null).toList
    }
    // Cache the parsed records in memory: without it every action below would
    // recompute the RDD from the raw file.
    logrdd.cache()
    // Traffic per city.
    // CizeFlowStatus(logrdd,spark)
    // Share of each HTTP status code.
    // HttpStatusCode(logrdd,spark)
    //cityTopURL(logrdd,spark)

    // Disabled experiment: de-duplicate by ip+device, count devices and store them in MySQL.
//      val logrdds=logrdd.map(x=>(x.ip+x.device,x))
//      .groupByKey().map(x=>(x._2.head))
//    val datacount=logrdds.count()
//    val deviceOfNum=logrdds.map(x=>(x.device,x)).groupByKey().map(x=>(x._2.head.device,x._2.head.city,x._2.size))
//    val kk= deviceOfNum.flatMap(x=>{
//       Seq(Devices(x._2,x._3,x._1))
//     }).toDS()
//    logdevicesMysql(kk,"test1.Devices")

    // Disabled experiment: read a table back over JDBC.
//    spark.sql("set names utf8")
//    val jdbcDF = spark.read
//      .format("jdbc")
//      .option("url", "jdbc:mysql://localhost:3306/test1?useUnicode=true&characterEncoding=utf8")
//      .option("dbtable", "test1.Devices")
//      .option("user", "root")
//      .option("password", "")
//      .load()

    // Disabled experiment: query the records through Spark SQL.
//        logrdd.createOrReplaceTempView("fusion")
//        spark.sql("select * from fusion").foreach(x => println(x))
//        spark.sql("select count(*) from fusion").foreach(x => print(x))
  }
  /**
    * Looks up `str` with a fresh `IP` helper and splits the answer into its parts.
    *
    * The first and last characters of the lookup result are dropped and the rest is
    * split on commas (presumably the result is a bracketed, comma-separated list -
    * confirm against `IP.mains`).
    *
    * @param str the value to look up
    * @return the comma-separated parts of the lookup result
    */
  def logprovincecity(str: String): Array[String] = {
    val raw = new IP().mains(str)
    val inner = raw.substring(1, raw.length - 1)
    inner.split(",")
  }


  /**
    * Parses one access-log line into a [[Record]].
    *
    * The line is split on single spaces: field 0 is the client IP, field 2 the
    * response time, field 6 the request URL, field 8 the status code and field 9 the
    * response size. The user agent is the last double-quoted section of the line.
    * Throws (index or number-format exceptions) when the line does not have that shape.
    *
    * NOTE(review): `time` is a fixed placeholder value, not the timestamp of the line.
    *
    * @param line a raw log line
    * @return the parsed record
    */
  def parses(line: String): Record = {
    setThreadCache()
    val fields = line.split(" ")

    val clientIp = fields(0)
    val responseTime = fields(2).toInt
    val timestamp = "1403931367000".toLong

    val statusCode = fields(8).toInt
    val bodySize = fields(9).toLong
    val uaStart = line.lastIndexOf(" \"") + 2
    val userAgent = line.substring(uaStart, line.lastIndexOf("\""))

    // Region parts in the order returned by the IP lookup.
    val region = logprovincecity(clientIp)
    val province = changeEncodeing(region(0))
    val city = changeEncodeing(region(1))
    val country = changeEncodeing(region(2))

    val (firm, device, rom) = parseUa(userAgent)
    val user = mixtureUser(clientIp, userAgent)
    val resourceKey = parseToKey(fields(6))

    Record(
      user = user, ip = clientIp, country = country, province = province, city = city,
      restime = responseTime, time = timestamp, code = statusCode, size = bodySize,
      firm = firm, device = device, rom = rom, ke = resourceKey)
  }
  /**
    * Encoding hook for region names; currently returns its argument unchanged.
    *
    * @param string the text to convert
    * @return the same text
    */
  def changeEncodeing(string: String): String = string

  // 106.18.21.156 - 282 [03/Jan/2017:23:30:14 +0800] "GET http://7xna64.com2.z0.glb.qiniucdn.com/Fjm_mLtcPN3DbTtLpywOmX5gq9cl.jpg?imageView2/2/w/1080/h/1920&e=1483545599&token=Q-hCY0VbL4F6NTX3TgRvE_T3vcpNEo2Gr3S9RA-b:ffDUURujc65VJLj1mKdGDMOrhIg= HTTP/1.1" 200 478114 "-" "AndroidDownloadManager/5.1+(Linux;+U;+Android+5.1;+OPPO+R9m+Build/LMY47I)"
  // 139.148.121.96 - 248 [03/Jan/2017:23:30:11 +0800] "GET http://7xna64.com2.z0.glb.qiniucdn.com/FiU3bxGjI6PutwVphDQQihBgP0uw.jpg?imageView2/2/w/1080/h/1920&e=1483545599&token=Q-hCY0VbL4F6NTX3TgRvE_T3vcpNEo2Gr3S9RA-b:1wKdyBO_iYMQh7_MBqGcifYQX50= HTTP/1.1" 200 552867 "-" "AndroidDownloadManager/5.1.1+(Linux;+U;+Android+5.1.1;+OPPO+R9+Plusm+A+Build/LMY47V)"
  // 220.178.4.219 - 1 [03/Jan/2017:23:30:35 +0800] "GET http://7xna64.com2.z0.glb.qiniucdn.com/FiwmSuSIuu981zLWENSCOJvIoj2P.jpg?imageView2/2/w/1080/h/1920&e=1483592399&token=Q-hCY0VbL4F6NTX3TgRvE_T3vcpNEo2Gr3S9RA-b:vsvEgQcb8-cU3BDLNp6sLCG72DI= HTTP/1.1" 200 456693 "-" "Dalvik/2.1.0+(Linux;+U;+Android+5.1.1;+NX529J+Build/LMY47V)"
//  def parse(line: String): Record = {
//    println(line)
//    try {
//      setThreadCache()
//
//      val as = line.split(" ")
//
//      val ip = as(0)
//      val restime = as(2).toInt
//      val time = parseToDate(line)
//
//      val code = as(8).toInt
//      val size = as(9).toLong
//      val ua = line.substring(line.lastIndexOf(" \"") + 2, line.lastIndexOf("\""))
//      val region = parseRegion(ip)
//      val province = region._1
//      val city = region._2
//
//      val driver = parseUa(ua)
//      val firm = driver._1
//      val device = driver._2
//      val rom = driver._3
//
//      val user = mixtureUser(ip, ua)
//      val country = ""
//      val key = parseToKey(as(6))
//
//      val obj = Record(user, ip, country, province, city,
//        restime, time, code, size,
//        firm, device, rom, key)
//      obj
//    } catch {
//      case e: Exception => {
//        println(s"wrong line: ${line}")
//        return null
//      }
//    }
//  }

  /**
    * Makes sure the current thread has its own [[ThreadCache]] (date parser and SHA-1
    * digester); does nothing when one is already set.
    *
    * The date parser is pinned to `Locale.ENGLISH` because the log uses English month
    * abbreviations ("03/Jan/2017"). With the JVM default locale, parsing fails on
    * machines whose locale is not English.
    */
  def setThreadCache(): Unit = {
    if (threadSafeCache.get() == null) {
      val dateParser = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss ZZZZ", java.util.Locale.ENGLISH)
      val sha1 = MessageDigest.getInstance("SHA1")
      threadSafeCache.set(ThreadCache(dateParser, sha1))
    }
  }


  /**
    * Extracts the timestamp between the first "[" and "]" of a log line.
    *
    * The thread-local cache is initialised first, so the method no longer throws a
    * NullPointerException when it is the first helper called on a thread.
    *
    * @param line a raw log line containing a "[dd/MMM/yyyy:HH:mm:ss Z]" section
    * @return the timestamp in seconds since the epoch
    */
  def parseToDate(line: String): Long = {
    setThreadCache()
    val stamp = line.substring(line.indexOf("[") + 1, line.indexOf("]"))
    threadSafeCache.get().dateParser.parse(stamp).getTime() / 1000
  }

  // IP lookup helper used by getIpInfo.
  private val Iphelp: IP = new IP()

  /**
    * Looks up `Str` with the private `Iphelp` helper.
    *
    * @param Str the value passed to `IP.mains`
    * @return whatever `IP.mains` returns for it
    */
  def getIpInfo(Str: String): String =
    Iphelp.mains(Str)

  /**
    * Placeholder region lookup: ignores `ip` and returns a pair of empty strings.
    *
    * @param ip the address to resolve (not used yet)
    * @return `("", "")`
    */
  def parseRegion(ip: String): (String, String) = {
    // TODO
    //  IP.load()
    val unknown = ""
    Tuple2(unknown, unknown)
  }

  // AndroidDownloadManager/5.1.1+(Linux;+U;+Android+5.1.1;+OPPO+R9+Plusm+A+Build/LMY47V)
  // Dalvik/2.1.0+(Linux;+U;+Android+5.1.1;+NX529J+Build/LMY47V)
  // Dalvik/2.1.0+(Linux;+U;+Android+5.1.1;+NX523J_V1+Build/LMY47V)
  // Dalvik/2.1.0+(Linux;+U;+Android+6.0.1;+vivo+Y55A+Build/MMB29M)
  // AndroidDownloadManager/5.1+(Linux;+U;+Android+5.1;+OPPO+R9m+Build/LMY47I)
  // ua 也包含其它字符
  // -
  // Java/1.7.0_09
  // Go-http-client/1.1
  // VAYXXLWZIKRFDGFHPOXDNHJTDLTNBTV
  // ("Android 6.0.1", "vivo Y55A", "Build/MMB29M")
  /**
    * Splits an Android-style user agent into `(os, device, build)`, for example
    * `("Android 6.0.1", "vivo Y55A", "Build/MMB29M")`.
    *
    * The agent is split on ";": the second-to-last segment is the OS and the last
    * segment holds the "+"-separated device name followed by the build token. The
    * ")" that closes the agent's parenthesised group is removed from the build token.
    *
    * @param ua the raw user agent; may be null or an arbitrary string
    * @return the three parts, or `("Error", "Error", "Error")` when the agent does not
    *         have at least two ";"-separated segments
    */
  def parseUa(ua: String): (String, String, String) = {
    val failed = ("Error", "Error", "Error")
    val segments = Option(ua).map(_.split(";")).getOrElse(Array.empty[String])
    if (segments.length < 2) {
      failed
    } else {
      val os = segments(segments.length - 2).replaceAll("\\+", " ").trim
      val tokens = segments.last.split("\\+")
      if (tokens.isEmpty) failed
      else (os, tokens.init.mkString(" ").trim, tokens.last.stripSuffix(")"))
    }
  }


  /**
    * Builds a pseudo user id by hashing the client IP and user agent joined with ":".
    *
    * @param ip the client address
    * @param ua the raw user agent
    * @return the hex digest produced by `hash`
    */
  def mixtureUser(ip: String, ua: String): String = {
    val identity = s"$ip:$ua"
    hash(identity)
  }

  /**
    * Returns the SHA-1 digest of `s` as a lowercase hex string.
    *
    * The thread-local digester is initialised on demand, so calling this before
    * `setThreadCache` no longer throws a NullPointerException. The input is encoded
    * as UTF-8 rather than the platform default charset, so the same text yields the
    * same id on every machine.
    *
    * @param s the text to hash
    * @return 40 hex characters
    */
  def hash(s: String): String = {
    setThreadCache()
    val bytes = s.getBytes(java.nio.charset.StandardCharsets.UTF_8)
    threadSafeCache.get().sha1Digester.digest(bytes).map("%02x".format(_)).mkString
  }


  /**
    * Extracts the resource key from an absolute URL: the text between the first "/"
    * after the host and the start of the query string.
    *
    * Both searches start at index 9 to skip the slashes of the scheme ("https://a"
    * is 9 characters long). When no path slash is found before the query string, the
    * URL up to the query is returned. A slash that only occurs inside the query is
    * ignored; it used to cause a StringIndexOutOfBoundsException.
    *
    * @param url the request URL
    * @return the key, without the leading "/" and without the query string
    */
  def parseToKey(url: String): String = {
    val queryStart = url.indexOf("?", 9)
    val end = if (queryStart > 0) queryStart else url.length
    val slash = url.indexOf("/", 9)
    val start = if (slash >= 0 && slash < end) slash + 1 else 0
    url.substring(start, end)
  }


  /**
    * Stub: the real lookup is commented out, so this always returns an empty array.
    *
    * @param Str the value that would be looked up (ignored)
    * @return an empty array
    */
  def getIpInfos(Str: String): Array[String] = {
//    val hell = new hello();
//    hell.getipdata(Str).split(";")
    Array.empty[String]
  }
   /**
     * Reads the MySQL table `test1.tutorials_tbl` over JDBC and prints every row.
     *
     * @param spark the active session
     */
   def readsp(spark: SparkSession): Unit = {
     import spark.implicits._
     val credentials = new Properties()
     credentials.put("user", "root")
     credentials.put("password", "")
     val reader = spark.read
     val tutorials = reader.jdbc("jdbc:mysql://localhost:3306", "test1.tutorials_tbl", credentials)
     tutorials.foreach(row => println(row))
   }
  /**
    * Program entry point: loads the IP database from its hard-coded location, then
    * starts the log analysis.
    *
    * @param args command-line arguments (not used)
    */
  def main(args: Array[String]): Unit = {
    val ipDatabase = "/Users/sicong/Downloads/17monipdb/17monipdb.dat"
    IP.load(ipDatabase)
    logbegin()
  }

}

你可能感兴趣的:(scala,spark)