Study Notes: A Simple Example of Log Collection and Real-Time Analysis

Pipeline overview: access logs generated by Nginx are collected by Filebeat and shipped into Kafka; from Kafka the data flows into Logstash, which parses and filters it. One output writes directly to Elasticsearch, where Kibana provides log analysis and visualization; one output writes to HDFS for later offline statistical analysis; and one output writes back into Kafka, from which Spark Streaming performs near-real-time log statistics. The concrete configuration steps are recorded below.

The Nginx log format is as follows:

log_format main '$http_x_forwarded_for##$remote_user##$remote_addr##$time_local##'
    '$request##$status##$body_bytes_sent##$http_referer##'
    '$http_user_agent##$request_time##$http_accessToken';
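
With this log_format, a single line in sysapp-access.log looks roughly like the following (all field values are invented for illustration; the last field is the custom accessToken request header):

-##-##203.0.113.10##10/Mar/2018:10:20:30 +0800##GET /api/user/info?id=1001 HTTP/1.1##200##512##-##Mozilla/5.0 (Windows NT 10.0; Win64; x64)##0.012##3f2a1c9e...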

Filebeat version: 6.2.2

Edit the filebeat.yml configuration file:

filebeat.prospectors:
- type: log
  enabled: true
  paths:
    - /usr/local/nginx/logs/sysapp-access.log
    
filebeat.config.modules:
  path: ${path.config}/modules.d/*.yml
  reload.enabled: false

setup.template.settings:
  index.number_of_shards: 3

fields:
  sysplat: sysapp

fields_under_root: true

setup.kibana:

output.kafka:
  hosts: ["192.168.0.1:9092"]
  topic: "logcollect"
  partition.round_robin:
    reachable_only: false

  version: 0.10.2.0
  required_acks: 1
  compression: none
  max_message_bytes: 1000000

Start Filebeat (the Kafka topic has already been created):

~/filebeat-6.2.2$ ./filebeat -e -c filebeat.yml
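
If the logcollect topic (and the logstat topic used further down) has not been created yet, it can be created with the standard Kafka CLI, and a console consumer can be used to verify that Filebeat is actually producing. A sketch, assuming ZooKeeper runs at 192.168.0.1:2181; the partition and replication-factor values are only illustrative:

~/kafka$ bin/kafka-topics.sh --create --zookeeper 192.168.0.1:2181 --topic logcollect --partitions 3 --replication-factor 1
~/kafka$ bin/kafka-topics.sh --create --zookeeper 192.168.0.1:2181 --topic logstat --partitions 3 --replication-factor 1
~/kafka$ bin/kafka-console-consumer.sh --bootstrap-server 192.168.0.1:9092 --topic logcollect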

Logstash version: 6.2.2

Edit the logstash.conf configuration file:

input {
    kafka {
        group_id => "test-consumer-group"
        auto_offset_reset => "latest" 
        consumer_threads => 1
        decorate_events => false
        topics => ["logcollect"]
        bootstrap_servers => ["192.168.0.1:9092"]
    }
}
filter {
    json {
        source => "message"
        remove_field => ["@metadata", "prospector", "beat", "source", "offset"]
    }
    csv {
        columns => ["http_x_forwarded_for", "remote_user", "remote_addr", "time_local", "request", "status", "body_bytes_sent", "http_referer", "http_user_agent", "request_time", "http_accessToken"]
        separator => "##"
        remove_field => ["host", "path", "message", "@version"]
    }
    ruby {
        path => "/home/dataplat/logstash-6.2.2/handle.rb"
    }
    urldecode {
        all_fields => false
        field => "request_params"
    }
    mutate {
        convert => {
            "request_time" => "float"
            "status" => "integer"
            "userId" => "integer"
        }
        remove_field => ["request", "http_accessToken", "sysplat"]
    }
    if [remote_addr] == "127.0.0.1" {
        drop {}
    }
}
output {
    stdout {
        codec => json {
            charset => "UTF-8"
        }
    }
    elasticsearch {
        hosts => "192.168.0.1:9200"
        index => "accesslog-%{+YYYYMM}"
        document_type=>"sysapp"
        manage_template => true
        template_name => "accesslog"
        template_overwrite => true
        template => "/home/dataplat/logstash-6.2.2/accesslog.json" 
    }
    webhdfs {
        codec => json
        host => "192.168.0.1"
        port => 50070
        path => "/user/test/logstash/%{+YYYYMMdd}/logstash-%{+HH}.log"
        user => "test"
    }
    kafka {
        codec => json
        acks => "1"
        bootstrap_servers => "192.168.0.1:9092"
        topic_id => "logstat"
        compression_type => "none"
    } 
}
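
After this filter chain, an event written to the three outputs looks roughly like the JSON below (pretty-printed here for readability and with invented values; the real output is one compact JSON object per event):

{
  "@timestamp": "2018-03-10T02:20:30.123Z",
  "time_local": "2018-03-10 10:20:30",
  "http_x_forwarded_for": "-",
  "remote_user": "-",
  "remote_addr": "203.0.113.10",
  "request_method": "GET",
  "request_url": "/api/user/info",
  "request_params": "id=1001",
  "status": 200,
  "body_bytes_sent": "512",
  "http_referer": "-",
  "http_user_agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
  "request_time": 0.012,
  "userId": 10001,
  "type": "sysapp"
}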

The handle.rb code is as follows:

#!/usr/bin/ruby 
#-*- coding: UTF-8 -*-

require "date"
require "json"
require "base64"

def register(params)
end

def filter(event)
    time_local = event.get("time_local")
    ttime = DateTime.strptime(time_local, "%d/%b/%Y:%H:%M:%S")
    event.set("time_local", ttime.strftime("%Y-%m-%d %H:%M:%S"))
    requestTxt = event.get("request")
    requestContent = requestTxt.split(" ")
    if requestContent[0] == "OPTIONS"
        return []
    end
    event.set("request_method", requestContent[0])
    if requestContent[1].include?("?")
        requestUrlAndParams = requestContent[1].split("?")
        event.set("request_url", requestUrlAndParams[0])
        requestUrlAndParamsLen = requestUrlAndParams.size
        if requestUrlAndParamsLen == 2
            event.set("request_params", requestUrlAndParams[1])
        end
    else
        event.set("request_url", requestContent[1])
    end 
    if event.include? "http_accessToken"
        accessToken = event.get("http_accessToken")
        if !accessToken.nil? && accessToken.length > 32
            jsonTxt = Base64.decode64(accessToken[32, accessToken.length])
            jsonObj = JSON.parse(jsonTxt)
            event.set("userId", jsonObj["userId"])
        end
    end
    if event.include? "sysplat"
        sysplat = event.get("sysplat")
        if !sysplat.nil? && sysplat.length > 0
            event.set("type", sysplat)
        end
    end
    return [event]
end
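
The accessToken handling above assumes a token laid out as a 32-character prefix (for example an MD5 signature) followed by Base64-encoded JSON that contains userId. A minimal standalone Ruby sketch of that round trip, with made-up values, decoding the token the same way filter(event) does:

#!/usr/bin/ruby
# Illustration only: build a token in the layout handle.rb expects and decode it.
require "json"
require "base64"
require "digest"

payload = { "userId" => 10001 }.to_json
# 32-character hex prefix + Base64-encoded JSON payload (values are made up)
token = Digest::MD5.hexdigest("demo") + Base64.strict_encode64(payload)

# Same extraction as in filter(event): skip the first 32 characters,
# Base64-decode the rest, parse the JSON and read userId.
json_txt = Base64.decode64(token[32, token.length])
puts JSON.parse(json_txt)["userId"]   # => 10001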

The accesslog template file is as follows (Elasticsearch version: 6.3.0):

{
	"template": "accesslog*",
	"settings": {
		"index.number_of_shards": 5,
		"number_of_replicas": 0,
		"index.refresh_interval": "60s"
	},
	"mappings": {
		"mirrorworld": {
			"dynamic": "strict",
			"_source": {
				"excludes": ["type"]
			},
			"properties": {
				"remote_addr": {
					"type": "ip"
				},
				"body_bytes_sent": {
					"type": "keyword"
				},
				"time_local": {
					"type": "date",
					"format": "yyyy-MM-dd HH:mm:ss"
				},
				"request_method": {
					"type": "keyword"
				},
				"type": {
					"type": "keyword"
				},
				"request_url": {
					"type": "keyword"
				},
				"request_params": {
					"analyzer": "ik_smart",
					"type": "text"
				},
				"userId": {
					"type": "long"
				},
				"http_user_agent": {
					"analyzer": "ik_smart",
					"type": "text"
				},
				"remote_user": {
					"analyzer": "ik_smart",
					"type": "text"
				},
				"@timestamp": {
					"type": "date",
					"format": "strict_date_optional_time||epoch_millis"
				},
				"request_time": {
					"type": "double"
				},
				"http_referer": {
					"type": "keyword"
				},
				"http_x_forwarded_for": {
					"type": "keyword"
				},
				"status": {
					"type": "integer"
				}
			}
		}
	}
}

Start Logstash (the Elasticsearch service is already running and the Kafka topic has already been created):

~/logstash-6.2.2$ bin/logstash -f logstash.conf
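
Once Logstash is running, the installed index template and the incoming documents can be checked against Elasticsearch (assuming it is reachable at 192.168.0.1:9200):

curl -s 'http://192.168.0.1:9200/_template/accesslog?pretty'
curl -s 'http://192.168.0.1:9200/accesslog-*/_count?pretty'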

The simple Spark Streaming statistics code is as follows:

import java.util

import com.google.gson.reflect.TypeToken

import scala.collection.JavaConverters._
import com.google.gson.{Gson, GsonBuilder}
import kafka.api.OffsetRequest
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import kafka.utils.ZKStringSerializer
import org.I0Itec.zkclient.ZkClient
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.platform.utils.{KafkaOffsetUtils, RedisClusterUtils}
import org.slf4j.LoggerFactory

object RlTimeStatisticsFromLogV1 {

  lazy val LOG = LoggerFactory.getLogger(RlTimeStatisticsFromLogV1.getClass());

  val isLocal = true
  val readLatestInFirst = true

  val gson:Gson = new GsonBuilder().serializeSpecialFloatingPointValues()
    .setDateFormat("yyyy-MM-dd HH:mm:ss").create();

  def main(args: Array[String]): Unit = {
    val sparkConf = initSparkConf()
    val ssc = new StreamingContext(sparkConf, Seconds(10))

    var kafkaParams = Map[String,String]("bootstrap.servers" -> "192.168.0.1:9092")
    if (readLatestInFirst) {
      kafkaParams += ("auto.offset.reset" -> OffsetRequest.LargestTimeString)
    }

    val zkClient = new ZkClient("192.168.0.1:2181", 30000, 30000, ZKStringSerializer)

    val zkOffsetPath = "/spark/streaming/kafka/logstat"

    val topicsSet = "logstat".split(",").toSet

    val rdds:InputDStream[(String, String)] = createKafkaStream(ssc, kafkaParams, zkClient, zkOffsetPath, topicsSet)

    val remoteAddrRankCacheKey = "rank_remote_addr"
    val requestUrlRankCacheKey = "rank_request_url";

    rdds.foreachRDD(rdd => {
      if(!rdd.isEmpty()){
        rdd.foreachPartition(partitions=>{
          val redisClusterUtils = RedisClusterUtils.getInstance()
          partitions.foreach(data => {
            LOG.info("read data:" + data)
            val msgJson = data._2
            val mapType = new TypeToken[java.util.HashMap[String, Object]] {}.getType
            val msg = gson.fromJson[java.util.Map[String, Object]](msgJson, mapType).asScala
            val remoteAddr:String = msg.get("remote_addr").get.toString
            redisClusterUtils.zincrby(remoteAddrRankCacheKey, 1.0, remoteAddr)
            val requestUrl:String = msg.get("request_url").get.toString
            redisClusterUtils.zincrby(requestUrlRankCacheKey, 1.0, requestUrl)
            val remoteAddrRankLen = redisClusterUtils.zcard(remoteAddrRankCacheKey)
            if (remoteAddrRankLen > 100) {
              val remoteAddrs = redisClusterUtils.zrevrange(remoteAddrRankCacheKey, 100, remoteAddrRankLen)
              for (remoteAddr <- remoteAddrs.asScala) {
                redisClusterUtils.zrem(remoteAddrRankCacheKey, remoteAddr)
              }
            }
            val requestUrlRankLen = redisClusterUtils.zcard(requestUrlRankCacheKey)
            if (requestUrlRankLen > 100) {
              val requestUrls = redisClusterUtils.zrevrange(requestUrlRankCacheKey, 100, requestUrlRankLen)
              for (requestUrl <- requestUrls.asScala) {
                redisClusterUtils.zrem(requestUrlRankCacheKey, requestUrl)
              }
            }
            val requestUrls = redisClusterUtils.zrevrange(requestUrlRankCacheKey, 0, 100)
            for (requestUrl <- requestUrls.asScala) {
              println(requestUrl)
            }

          })
        })
        KafkaOffsetUtils.saveOffsets(zkClient, zkOffsetPath, rdd)
      }
    })

    ssc.start()
    ssc.awaitTermination()
  }

  def initSparkConf(): SparkConf = {
    val sparkConf = new SparkConf().setAppName("Real Time Statistics")
    sparkConf.setMaster(if(isLocal) "local[*]" else "spark://192.168.0.1:7077")
    sparkConf.set("spark.streaming.stopGracefullyOnShutdown", "true")
    sparkConf.set("spark.streaming.backpressure.enabled", "true") 
    sparkConf.set("spark.streaming.backpressure.initialRate", "5000") 
    sparkConf.set("spark.streaming.kafka.maxRatePerPartition", "2000") 
    sparkConf
  }

  def createKafkaStream(ssc: StreamingContext,
                        kafkaParams: Map[String, String],
                        zkClient: ZkClient,
                        zkOffsetPath: String,
                        topics: Set[String]): InputDStream[(String, String)] = {
    
    val zkOffsetData = KafkaOffsetUtils.readOffsets(zkClient, zkOffsetPath, topics.last)

    val kafkaStream = zkOffsetData match {
      case None => 
        KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
      case Some(lastStopOffset) =>
        val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message)
        KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, lastStopOffset, messageHandler)
    }
    kafkaStream
  }

  var global_addr:util.Map[String, Int] = new util.HashMap[String, Int]()
  var global_url:util.Map[String, Int] = new util.HashMap[String, Int]()

  def processMessage(msgJson:String): Unit = {
    val mapType = new TypeToken[java.util.HashMap[String, Object]] {}.getType
    val msg = gson.fromJson[java.util.Map[String, Object]](msgJson, mapType).asScala
    val remote_addr:String = msg.get("remote_addr").get.toString
    val remote_addr_count:Int = global_addr.getOrDefault(remote_addr, 0) + 1
    global_addr.put(remote_addr, remote_addr_count)
    val request_url:String = msg.get("request_url").get.toString
    val request_url_count:Int = global_url.getOrDefault(request_url, 0) + 1
    global_url.put(request_url, request_url_count)
  }

}
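
With isLocal set to true the code pins the master to local[*]; to run it on the standalone cluster instead, set isLocal to false and submit the packaged job with spark-submit. A sketch, assuming the job and its dependencies are bundled into logstat-assembly.jar (the jar name and path are illustrative):

~/spark$ bin/spark-submit \
    --class RlTimeStatisticsFromLogV1 \
    --master spark://192.168.0.1:7077 \
    /home/dataplat/logstat-assembly.jar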
