Spark log parsing and formatting

The IP database can be downloaded here: http://www.ipip.net/download.html

 182.146.100.97 - 3 [03/Jan/2017:23:30:01 +0800] "GET http://7xna64.com2.z0.glb.qiniucdn.com/Fq9M_Gn0RRWy9eprb0T0CAdrybv3.jpg?imageView2/2/w/1080/h/1920&e=1483592399&token=Q-hCY0VbL4F6NTX3TgRvE_T3vcpNEo2Gr3S9RA-b:HJPKZifauy-LOmjJgA5F1uG9ibs= HTTP/1.1" 200 219736 "-" "Dalvik/2.1.0+(Linux;+U;+Android+6.0.1;+NX549J+Build/MMB29M)"
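For reference, the parser below simply splits each line on spaces and picks fields by index. Here is a minimal sketch of how the sample line above maps to those indexes (the variable names are only illustrative and are not part of the job):

// Illustrative sketch only: how the whitespace-split fields of the sample line
// map to the values that parses() extracts further down.
val logLine = "182.146.100.97 - 3 [03/Jan/2017:23:30:01 +0800] \"GET http://7xna64.com2.z0.glb.qiniucdn.com/Fq9M_Gn0RRWy9eprb0T0CAdrybv3.jpg?imageView2/2/w/1080/h/1920&e=1483592399&token=Q-hCY0VbL4F6NTX3TgRvE_T3vcpNEo2Gr3S9RA-b:HJPKZifauy-LOmjJgA5F1uG9ibs= HTTP/1.1\" 200 219736 \"-\" \"Dalvik/2.1.0+(Linux;+U;+Android+6.0.1;+NX549J+Build/MMB29M)\""
val fields  = logLine.split(" ")
val ip      = fields(0)          // client IP: 182.146.100.97
val restime = fields(2).toInt    // response time: 3
val time    = fields(3)          // "[03/Jan/2017:23:30:01" (the "+0800]" zone is fields(4))
val url     = fields(6)          // requested URL, later reduced to a key by parseToKey
val code    = fields(8).toInt    // HTTP status code: 200
val size    = fields(9).toLong   // response size in bytes: 219736
// The user agent is everything between the last ' "' and the closing '"'
val ua      = logLine.substring(logLine.lastIndexOf(" \"") + 2, logLine.lastIndexOf("\""))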

Code example

scala

import java.security.MessageDigest
import java.text.SimpleDateFormat
import java.util.{Locale, Properties}

import IPInfo.IP
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, SparkSession}

/**
  * Created by sicong on 2017/4/19.
  * 182.146.100.97 - 3 [03/Jan/2017:23:30:01 +0800] "GET http://7xna64.com2.z0.glb.qiniucdn.com/Fq9M_Gn0RRWy9eprb0T0CAdrybv3.jpg?imageView2/2/w/1080/h/1920&e=1483592399&token=Q-hCY0VbL4F6NTX3TgRvE_T3vcpNEo2Gr3S9RA-b:HJPKZifauy-LOmjJgA5F1uG9ibs= HTTP/1.1" 200 219736 "-" "Dalvik/2.1.0+(Linux;+U;+Android+6.0.1;+NX549J+Build/MMB29M)"
  *
  */
object paseLogData {
  val prop = new Properties()
  prop.put("user", "root")
  prop.put("password", "")
  case class Record(user: String, ip: String, country: String, province: String, city: String,
                    restime: Int, time: Long, code: Int, size: Long,
                    firm: String, device: String, rom: String, ke: String)
  case class Devices(city: String,num:Int,device: String)
  case class Ipmap(ip: String, provinceCode: Int, cityCode: Int, province: String, city: String)
  case class CityFlow(city:String,flow:Long)
  case class StatusCode(code:Int,num:Int)

  case class ThreadCache(dateParser: SimpleDateFormat, sha1Digester: MessageDigest)

  val threadSafeCache = new ThreadLocal[ThreadCache]();
  val Iphelpk = new IP()

  def getIpInfohga(Str: String): String = {
    Iphelpk.mains(Str)
  }
  // Main entry point
  def logbegin(): Unit = {
    val spark = SparkSession
      .builder()
      .appName("Spark SQL Example").master("local[4]")
      .config("spark.some.config.option", "some-value")
      .getOrCreate()
    //    readsp(spark)
    parseLog("/Users/sicong/Downloads/yitianyike.txt", spark)
  }

  def getNeedParseLog(): Array[String] = {
    // TODO
    // 1. Current time minus 7 hours; 2. current time minus 12 hours;
    // use 1 and 2 as the time range to query the list of logs.
    // Compare the list against recent processing records; parse any log that has not
    // been processed yet and mark it as processed on success.
    Array("/Users/Simon/Downloads/7xna64.com2.z0.glb.qiniucdn.com_2017-01-03-23_0602")
  }

  // Append a Dataset of per-city device statistics to the given MySQL table
  def logdevicesMysql(kk: Dataset[Devices], s: String): Unit = {
    val prop = new Properties()
    prop.put("user", "root")
    prop.put("password", "")
    kk.write.mode("append").jdbc("jdbc:mysql://localhost:3306/test1?useUnicode=true&characterEncoding=utf8", s, prop)
  }

  // Aggregate the total response size per city (rounded to KB) and append to test1.CityFlow
  def CizeFlowStatus(logrdd: RDD[Record], sparkSession: SparkSession) = {
    import sparkSession.implicits._
    val dataOfFlow = logrdd.map(x => (x.city, x.size)).groupByKey().map(x => (x._1, (x._2.sum.toDouble / 1024).round))
    val logMysqldata = dataOfFlow.flatMap(x => Seq(CityFlow(x._1, x._2))).toDS()
    logMysqldata.write.mode("append").jdbc("jdbc:mysql://localhost:3306/test1?useUnicode=true&characterEncoding=utf8", "test1.CityFlow", prop)
  }
  // Count requests per HTTP status code and append to test1.StatusCode
  def HttpStatusCode(logrdd: RDD[Record], sparkSession: SparkSession): Unit = {
    import sparkSession.implicits._
    val logMysqldata = logrdd.map(x => (x.code, x)).groupByKey().flatMap(x => Seq(StatusCode(x._1, x._2.size))).toDS()
    logMysqldata.write.mode("append").jdbc("jdbc:mysql://localhost:3306/test1?useUnicode=true&characterEncoding=utf8", "test1.StatusCode", prop)
  }
  // Rank URL keys per city by request count and print the result
  def cityTopURL(logrdd: RDD[Record], sparkSession: SparkSession): Unit = {
    import sparkSession.implicits._
    logrdd.map(x => (x.ke + x.city, x)).groupByKey().map(x => (x._2.size, x._1)).sortBy(x => x, false, 1).foreach(x => println(x))
  }
  // Spark parsing entry point
  def parseLog(url: String, spark: SparkSession): Unit = {

    import spark.implicits._
    val peopleDF = spark.sparkContext
      .textFile(url)
    val logrdd = peopleDF.flatMap(line => {
      val record = parses(line)
      if (record != null) {
        Seq(record)
      } else {
        Seq()
      }
    })
    // Cache logrdd in memory: each of the actions below would otherwise recompute the RDD from scratch
    logrdd.cache()
    // Total traffic per city
    CizeFlowStatus(logrdd, spark)
    // Share of requests per HTTP status code
    HttpStatusCode(logrdd, spark)
    cityTopURL(logrdd, spark)
  }
  // Look up the geolocation info for an IP address
  def logprovincecity(str: String): Array[String] = {
    val Iphelp = new IP();
    val data = Iphelp.mains(str)
    data.substring(1, data.length - 1).split(",")
  }


  def parses(line: String): Record = {
    setThreadCache()
    val as = line.split(" ")

    val ip = as(0)
    val restime = as(2).toInt
    val time = parseVisitTime(as(3))
    val code = as(8).toInt
    val size = as(9).toLong
    // Extract the UA string, e.g. Dalvik/2.1.0+(Linux;+U;+Android+6.0.1;+vivo+Y55A+Build/MMB29M)
    val ua = line.substring(line.lastIndexOf(" \"") + 2, line.lastIndexOf("\""))
    val region = logprovincecity(ip)
    val province = changeEncodeing(region(0))
    val city = changeEncodeing(region(1))
    val country = changeEncodeing(region(2))
    val driver = parseUa(ua)
    val firm = driver._1
    val device = driver._2
    val rom = driver._3
    val user = mixtureUser(ip, ua)
    val ke = parseToKey(as(6))

    val obj = Record(user, ip, country, province, city,
      restime, time, code, size,
      firm, device, rom, ke)
    obj
  }
  // Placeholder for re-encoding region names; currently returns the input unchanged
  def changeEncodeing(string: String): String = {
    string
  }
  def parseVisitTime(string: String): Long = {
    // Strip the leading '[' from a field like "[03/Jan/2017:23:30:01"
    val timeData = string.substring(1)
    val loc = new Locale("en")
    val fm = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss", loc)
    val dt2 = fm.parse(timeData)
    // Convert milliseconds to a Unix timestamp in seconds
    dt2.getTime / 1000
  }

  // Lazily initialize a per-thread SimpleDateFormat and SHA-1 digester (neither is thread-safe)
  def setThreadCache(): Unit = {
    val cache = threadSafeCache.get()
    if (cache == null) {
      val dateParser = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss ZZZZ")
      val sha1 = MessageDigest.getInstance("SHA1")
      threadSafeCache.set(ThreadCache(dateParser, sha1))
    }
  }

  private val Iphelp = new IP();

  def getIpInfo(Str: String): String = {
    Iphelp.mains(Str)
  }

  // AndroidDownloadManager/5.1.1+(Linux;+U;+Android+5.1.1;+OPPO+R9+Plusm+A+Build/LMY47V)
  // Dalvik/2.1.0+(Linux;+U;+Android+5.1.1;+NX529J+Build/LMY47V)
  // Dalvik/2.1.0+(Linux;+U;+Android+5.1.1;+NX523J_V1+Build/LMY47V)
  // Dalvik/2.1.0+(Linux;+U;+Android+6.0.1;+vivo+Y55A+Build/MMB29M)
  // AndroidDownloadManager/5.1+(Linux;+U;+Android+5.1;+OPPO+R9m+Build/LMY47I)
  // The UA field may also contain other values, e.g.:
  // -
  // Java/1.7.0_09
  // Go-http-client/1.1
  // VAYXXLWZIKRFDGFHPOXDNHJTDLTNBTV
  // ("Android 6.0.1", "vivo Y55A", "Build/MMB29M")
  // Parse a UA string into (OS version, device model, build); falls back to ("Error", "Error", "Error")
  def parseUa(ua: String): (String, String, String) = {
    try {
      val t1 = ua.split(";").reverse
      val t2 = t1(0).split("\\+")
      (t1(1).replaceAll("\\+", " ").trim,
        t2.slice(0, t2.length - 1).mkString(" ").trim,
        t2(t2.length - 1).stripSuffix(")"))
    } catch {
      case e: Exception => ("Error", "Error", "Error")
    }
  }


  // Derive an anonymous user id from the client IP and UA
  def mixtureUser(ip: String, ua: String) = {
    hash(ip + ":" + ua)
  }

  // SHA-1 hex digest; relies on setThreadCache() having been called on this thread
  def hash(s: String): String = {
    threadSafeCache.get().sha1Digester.digest(s.getBytes).map("%02x".format(_)).mkString
  }


  def parseToKey(url: String) = {
    // "https://a" is at least 9 characters, so searching from index 9 skips the scheme's "//"
    val l = url.indexOf("?", 9);
    val end = if (l > 0) l else url.length()
    url.substring(url.indexOf("/", 9) + 1, end)
  }


  // Stub: previously backed by a helper class; currently returns an empty array
  def getIpInfos(Str: String): Array[String] = {
    //    val hell = new hello();
    //    hell.getipdata(Str).split(";")
    return Array()
  }
  // Test helper: read a MySQL table via JDBC and print each row
  def readsp(spark: SparkSession): Unit = {
    import spark.implicits._
    val prop = new Properties()
    prop.put("user", "root")
    prop.put("password", "")
    val jdbcDF2 = spark.read
      .jdbc("jdbc:mysql://localhost:3306", "test1.tutorials_tbl",prop)
    jdbcDF2.foreach(x=>println(x))
  }
  def main(args: Array[String]): Unit = {
    // Load the ipip.net IP database
    IP.load("/Users/sicong/scalas/17monipdb.dat");
    logbegin()

  }

}
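
As a quick, hypothetical sanity check of the pure helpers (assuming the IPInfo.IP fields of the object can be constructed before the database is loaded), the expected outputs follow the expectations documented in the comments above:

// Hypothetical demo object, not part of the original job
object paseLogDataDemo {
  def main(args: Array[String]): Unit = {
    println(paseLogData.parseUa("Dalvik/2.1.0+(Linux;+U;+Android+6.0.1;+vivo+Y55A+Build/MMB29M)"))
    // expected: (Android 6.0.1,vivo Y55A,Build/MMB29M)
    println(paseLogData.parseUa("Go-http-client/1.1"))
    // expected: (Error,Error,Error) -- UAs without the ';'-separated structure fall back to Error
    println(paseLogData.parseToKey("http://7xna64.com2.z0.glb.qiniucdn.com/Fq9M_Gn0RRWy9eprb0T0CAdrybv3.jpg?imageView2/2/w/1080"))
    // expected: Fq9M_Gn0RRWy9eprb0T0CAdrybv3.jpg
  }
}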
