Simple Apache Log Processing with Spark

Apache log processing

    • Apache log processing
        • Parse the Apache log to extract the useful fields
        • Apply custom processing to the fields extracted in the previous step

Parse the Apache log to extract the useful fields

import scala.util.matching.Regex
case class ApacheAcessLog(
                          IpAdress:String,
                          ClientId:String,
                          UserId:String,
                          ServerDate:String,
                          Response:String,
                          ResponseCode:String,
                          ResponseDataSize:String,
                          Referer:String,
                          UserAgent:String
                          )
object ApacheAcessLog {
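// The nine capture groups below map, in order, to the fields of ApacheAcessLog:
// (1) client IP, (2) client id, (3) user id, (4) bracketed timestamp,
// (5) request line "METHOD URI PROTOCOL", (6) HTTP status code,
// (7) response size, (8) referer, (9) user agent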
val Parttern:Regex="""^(\S+)\s(\S+)\s(\S+)\s(\[[^\[\]]+\])\s"([A-Z]+\s\S+\s\S+)"\s(\d{3})\s(\d+|-)\s"(\S+)"\s(".+")$""".r
  def CheckLogData(line: String): Boolean = { // check whether a line matches the log regex
    if (line.length > 500) {
      false
    } else {
      Parttern.findFirstMatchIn(line).isDefined
    }
  }

  /**
    * Parse a single input log line into an ApacheAcessLog
    */
  def ParseApacheLog(line:String):ApacheAcessLog={
    val options=Parttern.findFirstMatchIn(line)
    val matcher=options.get
    ApacheAcessLog(
      matcher.group(1),
      matcher.group(2),
      matcher.group(3),
      matcher.group(4),
      matcher.group(5),
      matcher.group(6),
      matcher.group(7),
      matcher.group(8),
      matcher.group(9)
    )
  }
}
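As a quick sanity check, the regex and the parser can be exercised on a single sample line. The line below is made up for illustration (hypothetical IP, URL and user agent), but it matches the pattern above:

object ApacheAcessLogTest {
  def main(args: Array[String]): Unit = {
    // A made-up combined-format log line, for illustration only
    val sample = "127.0.0.1 - - [19/Sep/2017:16:08:01 +0800] " +
      "\"GET /index.html HTTP/1.1\" 200 1024 \"http://example.com/\" \"Mozilla/5.0\""
    if (ApacheAcessLog.CheckLogData(sample)) {
      val log = ApacheAcessLog.ParseApacheLog(sample)
      println(log.IpAdress)    // 127.0.0.1
      println(log.ServerDate)  // [19/Sep/2017:16:08:01 +0800]
      println(log.Response)    // GET /index.html HTTP/1.1
    }
  }
}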
import org.apache.spark.rdd.RDD
import org.apache.spark.sql
import org.apache.spark.{SparkConf, SparkContext}
object SparkLogAnalyze {
  def main(args: Array[String]): Unit = {
    val conf=new SparkConf()
      .setAppName("ApacheLogAnalyze")
      .setMaster("local[4]")
    val sc=SparkContext.getOrCreate(conf)

    val path="H:\\ApacheLogALL.log"
    val rdd=sc.textFile(path)
    // Transform the RDD
    val apacheAcessLog:RDD[ApacheAcessLog]=rdd
      // Keep only the lines that match the log pattern
      .filter(line=> ApacheAcessLog.CheckLogData(line))
      // Parse each remaining line into an ApacheAcessLog
      .map(line=>{
        ApacheAcessLog.ParseApacheLog(line)
      })
    apacheAcessLog.cache() // At this point we have the log fields required by the regular expression
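    // Optional sanity check (a sketch): count how many lines passed the filter.
    // Note: count() triggers a Spark job, and the number depends on your log file.
    println(s"Parsed ${apacheAcessLog.count()} Apache log records")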

Apply custom processing to the fields extracted in the previous step

  • Build an RDD containing only the fields we need to process
val df:RDD[Array[String]]=apacheAcessLog
  .map(log=> Array(log.ServerDate,log.Referer,log.UserAgent))
df.cache()
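// Optional: peek at a few of the extracted rows (a sketch; the output depends on your data)
df.take(3).foreach(fields => println(fields.mkString(" | ")))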
  • Process the time field in the Apache log, converting the Apache time format into the yyyy-MM-dd HH:mm:ss format
def ApacheTimeALZ(time:String):String={ // convert the Apache log timestamp
  // The incoming time looks like: [19/Sep/2017:16:08:01 +0800]
  val A=time.split("\\[")
  val A1=A.apply(1)
  val B=A1.split("]")
  val viewtime=B.apply(0) // the timestamp with the surrounding brackets stripped
  import java.text.SimpleDateFormat
  import java.util.Locale
  val sdf = new SimpleDateFormat("dd/MMM/yyyy:HH:mm:ss Z", Locale.US)
  val sdf2 = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
  val UStime=sdf.parse(viewtime)
  val CNtime=sdf2.format(UStime)
  CNtime
}
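A quick usage sketch, using the sample timestamp from the comment above (the formatted result also depends on the JVM's default time zone, since sdf2 formats in that zone):

println(ApacheTimeALZ("[19/Sep/2017:16:08:01 +0800]")) // 2017-09-19 16:08:01 when the default zone is GMT+8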
  • Using the converted time format, classify the log data by date and hour. The approach is to apply a regex to the standardized timestamps produced above to extract the date and hour fields, and then group by those fields to obtain the classification.
def DealDate(serverdate:String):String={
  // Extract the date and the hour (e.g. "2017-09-19 16") from a normalized timestamp
  val part:Regex="""(\d{4}-\d{1,2}-\d{1,2})\s(\d{2})""".r
  val matcher=part.findFirstMatchIn(serverdate)
  if(matcher.isEmpty){
    return "0000-00-00 00" // placeholder in the same "date hour" shape as a real match
  }
  matcher.get.group(0)
}
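For example, DealDate("2017-09-19 16:08:01") returns "2017-09-19 16".

The logrdd used in the grouping below is not defined in the snippets shown in this post; a minimal sketch of how it could be built from apacheAcessLog with the two helper functions above (the name logrdd and the use of copy are assumptions):

val logrdd: RDD[ApacheAcessLog] = apacheAcessLog.map { log =>
  // Normalize ServerDate to "yyyy-MM-dd HH" so that grouping by it classifies records by date and hour
  log.copy(ServerDate = DealDate(ApacheTimeALZ(log.ServerDate)))
}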
// Group by the normalized date-and-hour and hand each group to the output helper
// (IoOperation.LogShow is not defined in this post; see the previous IIS log post)
logrdd.groupBy(line=>line.ServerDate).foreach{category=>
        IoOperation.LogShow(category._1,category._2)
}
  • For the remaining operations, you can refer to my previous post on IIS log processing
