Spark编程案例:分析tomcat访问日志求访问量最高的前两个网页

需求如题,tomcat 访问日志如下:

192.168.88.1 - - [30/Jul/2017:12:53:43 +0800] "GET /MyDemoWeb/ HTTP/1.1" 200 259
192.168.88.1 - - [30/Jul/2017:12:53:43 +0800] "GET /MyDemoWeb/head.jsp HTTP/1.1" 200 713
192.168.88.1 - - [30/Jul/2017:12:53:43 +0800] "GET /MyDemoWeb/body.jsp HTTP/1.1" 200 240
192.168.88.1 - - [30/Jul/2017:12:54:37 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:38 +0800] "GET /MyDemoWeb/hadoop.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:38 +0800] "GET /MyDemoWeb/java.jsp HTTP/1.1" 200 240
192.168.88.1 - - [30/Jul/2017:12:54:40 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:40 +0800] "GET /MyDemoWeb/hadoop.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:41 +0800] "GET /MyDemoWeb/mysql.jsp HTTP/1.1" 200 241
192.168.88.1 - - [30/Jul/2017:12:54:41 +0800] "GET /MyDemoWeb/hadoop.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:42 +0800] "GET /MyDemoWeb/web.jsp HTTP/1.1" 200 239
192.168.88.1 - - [30/Jul/2017:12:54:42 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:52 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:52 +0800] "GET /MyDemoWeb/hadoop.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:53 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:54 +0800] "GET /MyDemoWeb/mysql.jsp HTTP/1.1" 200 241
192.168.88.1 - - [30/Jul/2017:12:54:54 +0800] "GET /MyDemoWeb/hadoop.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:54 +0800] "GET /MyDemoWeb/hadoop.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:56 +0800] "GET /MyDemoWeb/web.jsp HTTP/1.1" 200 239
192.168.88.1 - - [30/Jul/2017:12:54:56 +0800] "GET /MyDemoWeb/java.jsp HTTP/1.1" 200 240
192.168.88.1 - - [30/Jul/2017:12:54:57 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:57 +0800] "GET /MyDemoWeb/java.jsp HTTP/1.1" 200 240
192.168.88.1 - - [30/Jul/2017:12:54:58 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:58 +0800] "GET /MyDemoWeb/hadoop.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:59 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:54:59 +0800] "GET /MyDemoWeb/hadoop.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:55:00 +0800] "GET /MyDemoWeb/mysql.jsp HTTP/1.1" 200 241
192.168.88.1 - - [30/Jul/2017:12:55:00 +0800] "GET /MyDemoWeb/oracle.jsp HTTP/1.1" 200 242
192.168.88.1 - - [30/Jul/2017:12:55:02 +0800] "GET /MyDemoWeb/web.jsp HTTP/1.1" 200 239
192.168.88.1 - - [30/Jul/2017:12:55:02 +0800] "GET /MyDemoWeb/hadoop.jsp HTTP/1.1" 200 242

思路:

  1. 根据日志对每个页面的访问量求和
  2. 降序排序
  3. 取前两条记录

scala 代码如下:

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import java.util.regex.Pattern
import java.util.regex.Matcher

/**
 * Finds the two most requested pages in a Tomcat access log.
 *
 * Steps: extract the page name from every log line, sum the hits per page,
 * sort by hit count in descending order and print the first two entries.
 */
object TomcatLogDemo {

  /** Log file that is read when no path is passed on the command line. */
  private val DefaultLogPath = "F:\\localhost_access_log.2017-07-30.txt"

  /**
   * Matches the quoted request field of an access-log line, e.g.
   * `"GET /MyDemoWeb/head.jsp HTTP/1.1"`, and captures the URI (the token
   * between the first two spaces). Compiled once instead of once per line.
   */
  private val RequestUri: Pattern = Pattern.compile("\"\\S+ (\\S+) [^\"]*\"")

  /**
   * Extracts the requested page name from one access-log line.
   *
   * Only the first quoted `METHOD URI PROTOCOL` field is inspected, so any
   * later quoted fields on the same line cannot overwrite the result.
   *
   * @param line one raw line of the access log
   * @return the part of the URI after its last `/` (an empty string when the
   *         URI ends in `/`), or `None` when the line has no request field
   */
  def extractPage(line: String): Option[String] = {
    val matcher = RequestUri.matcher(line)
    if (matcher.find()) {
      val uri = matcher.group(1)
      // lastIndexOf returns -1 when there is no '/', which keeps the whole URI.
      Some(uri.substring(uri.lastIndexOf('/') + 1))
    } else {
      None
    }
  }

  /**
   * Runs the job on a local Spark master and prints the top two pages.
   *
   * @param args optional; `args(0)` is the path of the access log to analyse.
   *             When absent, the original hard-coded path is used.
   */
  def main(args: Array[String]): Unit = {
    // Set up the SparkContext.
    val conf = new SparkConf()
      .setAppName("MyWebLogDemo")
      .setMaster("local")
    val sc = new SparkContext(conf)

    try {
      // Read the Tomcat access log.
      val logPath = args.headOption.getOrElse(DefaultLogPath)
      val lines = sc.textFile(logPath)

      // Emit (page, 1) per request; lines without a request field are dropped
      // instead of being counted under an artificial ("", 0) entry.
      val hits = lines.flatMap(line => extractPage(line).iterator.map(page => (page, 1)))

      // Sum the hits of each page.
      val hitsPerPage = hits.reduceByKey(_ + _)

      // Sort by hit count, highest first, and keep the two most visited pages.
      val topTwo = hitsPerPage.sortBy(_._2, ascending = false).take(2)

      topTwo.foreach(println)
    } finally {
      // Release Spark resources even when the job fails.
      sc.stop()
    }
  }
}

结果(oracle.jsp 与 hadoop.jsp 的访问量同为 9,因此二者的先后顺序不保证固定):

(oracle.jsp,9)
(hadoop.jsp,9)

关注我的微信公众号(曲健磊的个人随笔),观看更多精彩内容:
Spark编程案例:分析tomcat访问日志求访问量最高的前两个网页_第1张图片

你可能感兴趣的:(【Spark】)