Pipeline overview: the access log files generated by Nginx are collected by Filebeat and shipped into Kafka; the data in Kafka flows into Logstash, which filters and transforms it. One output writes directly to Elasticsearch, where Kibana provides log analysis and visualization; one output writes directly to HDFS for later offline statistical analysis; and one output writes back into Kafka, feeding Spark Streaming for near-real-time log statistics. The concrete configuration steps are recorded below.
The Nginx log format is as follows:
log_format main '$http_x_forwarded_for##$remote_user##$remote_addr##$time_local##'
'$request##$status##$body_bytes_sent##$http_referer##'
'$http_user_agent##$request_time##$http_accessToken';
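For reference, a log line produced by this format looks roughly like the following; all values are made up purely for illustration, the point being that the eleven fields are joined by the ## separator in the order declared above:
-##-##203.0.113.10##21/Aug/2018:10:15:30 +0800##GET /api/user/info?userId=1001 HTTP/1.1##200##532##-##Mozilla/5.0 (Windows NT 10.0; Win64; x64)##0.012##<accessToken>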
Filebeat version: 6.2.2
Edit the filebeat.yml configuration file:
filebeat.prospectors:
- type: log
  enabled: true
  paths:
    - /usr/local/nginx/logs/sysapp-access.log
filebeat.config.modules:
  path: ${path.config}/modules.d/*.yml
  reload.enabled: false
setup.template.settings:
  index.number_of_shards: 3
fields:
  sysplat: sysapp
fields_under_root: true
setup.kibana:
output.kafka:
  hosts: ["192.168.0.1:9092"]
  topic: "logcollect"
  partition.round_robin:
    reachable_only: false
  version: 0.10.2.0
  required_acks: 1
  compression: none
  max_message_bytes: 1000000
Start Filebeat (the Kafka topic has already been created):
~/filebeat-6.2.2$ ./filebeat -e -c filebeat.yml
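For completeness, the logcollect topic referenced above can be created and spot-checked with the standard Kafka command-line tools, run from the Kafka installation directory; the partition and replication counts below are only illustrative and should match the actual cluster:
bin/kafka-topics.sh --create --zookeeper 192.168.0.1:2181 --replication-factor 1 --partitions 3 --topic logcollect
bin/kafka-console-consumer.sh --bootstrap-server 192.168.0.1:9092 --topic logcollect
The console consumer should print the JSON events published by Filebeat, each carrying the raw ##-separated log line in its message field.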
Logstash version: 6.2.2
Edit the logstash.conf configuration file:
input {
  kafka {
    group_id => "test-consumer-group"
    auto_offset_reset => "latest"
    consumer_threads => 1
    decorate_events => false
    topics => ["logcollect"]
    bootstrap_servers => ["192.168.0.1:9092"]
  }
}
filter {
  json {
    source => "message"
    remove_field => ["@metadata", "prospector", "beat", "source", "offset"]
  }
  csv {
    columns => ["http_x_forwarded_for", "remote_user", "remote_addr", "time_local", "request", "status", "body_bytes_sent", "http_referer", "http_user_agent", "request_time", "http_accessToken"]
    separator => "##"
    remove_field => ["host", "path", "message", "@version"]
  }
  ruby {
    path => "/home/dataplat/logstash-6.2.2/handle.rb"
  }
  urldecode {
    all_fields => false
    field => "request_params"
  }
  mutate {
    convert => {
      "request_time" => "float"
      "status" => "integer"
      "userId" => "integer"
    }
    remove_field => ["request", "http_accessToken", "sysplat"]
  }
  if [remote_addr] == "127.0.0.1" {
    drop {}
  }
}
output {
  stdout {
    codec => json {
      charset => "UTF-8"
    }
  }
  elasticsearch {
    hosts => "192.168.0.1:9200"
    index => "accesslog-%{+YYYYMM}"
    document_type => "sysapp"
    manage_template => true
    template_name => "accesslog"
    template_overwrite => true
    template => "/home/dataplat/logstash-6.2.2/accesslog.json"
  }
  webhdfs {
    codec => json
    host => "192.168.0.1"
    port => 50070
    path => "/user/test/logstash/%{+YYYYMMdd}/logstash-%{+HH}.log"
    user => "test"
  }
  kafka {
    codec => json
    acks => "1"
    bootstrap_servers => "192.168.0.1:9092"
    topic_id => "logstat"
    compression_type => "none"
  }
}
The handle.rb script is as follows:
#!/usr/bin/ruby
# -*- coding: UTF-8 -*-
require "date"
require "json"
require "base64"

def register(params)
end

def filter(event)
  # Convert Nginx $time_local (e.g. 21/Aug/2018:10:15:30 +0800) to yyyy-MM-dd HH:mm:ss
  time_local = event.get("time_local")
  ttime = DateTime.strptime(time_local, "%d/%b/%Y:%H:%M:%S")
  event.set("time_local", ttime.strftime("%Y-%m-%d %H:%M:%S"))
  # Split $request ("METHOD /path?query HTTP/x.x") into method, url and params
  requestTxt = event.get("request")
  requestContent = requestTxt.split(" ")
  if requestContent[0] == "OPTIONS"
    # Returning an empty array cancels the event, dropping CORS preflight requests
    return []
  end
  event.set("request_method", requestContent[0])
  if requestContent[1].include?("?")
    requestUrlAndParams = requestContent[1].split("?")
    event.set("request_url", requestUrlAndParams[0])
    requestUrlAndParamsLen = requestUrlAndParams.size
    if requestUrlAndParamsLen == 2
      event.set("request_params", requestUrlAndParams[1])
    end
  else
    event.set("request_url", requestContent[1])
  end
  # The accessToken header carries Base64-encoded JSON (containing userId) after the first 32 characters
  if event.include? "http_accessToken"
    accessToken = event.get("http_accessToken")
    if !accessToken.nil? && accessToken.length > 32
      jsonTxt = Base64.decode64(accessToken[32, accessToken.length])
      jsonObj = JSON.parse(jsonTxt)
      event.set("userId", jsonObj["userId"])
    end
  end
  # Copy the sysplat field (added by Filebeat) into "type"
  if event.include? "sysplat"
    sysplat = event.get("sysplat")
    if !sysplat.nil? && !sysplat.empty?
      event.set("type", sysplat)
    end
  end
  return [event]
end
The accesslog template file is shown below (Elasticsearch version: 6.3.0):
{
  "template": "accesslog*",
  "settings": {
    "index.number_of_shards": 5,
    "number_of_replicas": 0,
    "index.refresh_interval": "60s"
  },
  "mappings": {
    "sysapp": {
      "dynamic": "strict",
      "_source": {
        "excludes": ["type"]
      },
      "properties": {
        "remote_addr": {
          "type": "ip"
        },
        "body_bytes_sent": {
          "type": "keyword"
        },
        "time_local": {
          "type": "date",
          "format": "yyyy-MM-dd HH:mm:ss"
        },
        "request_method": {
          "type": "keyword"
        },
        "type": {
          "type": "keyword"
        },
        "request_url": {
          "type": "keyword"
        },
        "request_params": {
          "analyzer": "ik_smart",
          "type": "text"
        },
        "userId": {
          "type": "long"
        },
        "http_user_agent": {
          "analyzer": "ik_smart",
          "type": "text"
        },
        "remote_user": {
          "analyzer": "ik_smart",
          "type": "text"
        },
        "@timestamp": {
          "type": "date",
          "format": "strict_date_optional_time||epoch_millis"
        },
        "request_time": {
          "type": "double"
        },
        "http_referer": {
          "type": "keyword"
        },
        "http_x_forwarded_for": {
          "type": "keyword"
        },
        "status": {
          "type": "integer"
        }
      }
    }
  }
}
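Logstash installs this template on startup because manage_template, template_name and template are set in the elasticsearch output above; note that the ik_smart analyzer additionally requires the elasticsearch-analysis-ik plugin on the Elasticsearch nodes. If you want to install or inspect the template by hand, the template API can be used, for example (assuming Elasticsearch at 192.168.0.1:9200 and the file saved as accesslog.json in the current directory):
curl -XPUT -H 'Content-Type: application/json' http://192.168.0.1:9200/_template/accesslog --data-binary @accesslog.json
curl 'http://192.168.0.1:9200/_template/accesslog?pretty'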
Start Logstash (the Elasticsearch service is up and the Kafka topic has already been created):
~/logstash-6.2.2$ bin/logstash -f logstash.conf
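Once Logstash has been running for a while, each of the three outputs can be spot-checked. The commands below are only a sketch and assume the hosts and paths configured above:
curl 'http://192.168.0.1:9200/_cat/indices/accesslog-*?v'
hdfs dfs -ls /user/test/logstash/
bin/kafka-console-consumer.sh --bootstrap-server 192.168.0.1:9092 --topic logstat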
A simple Spark Streaming statistics job is shown below:
import java.util

import com.google.gson.reflect.TypeToken

import scala.collection.JavaConverters._
import com.google.gson.{Gson, GsonBuilder}
import kafka.api.OffsetRequest
import kafka.message.MessageAndMetadata
import kafka.serializer.StringDecoder
import kafka.utils.ZKStringSerializer
import org.I0Itec.zkclient.ZkClient
import org.apache.spark.SparkConf
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, State, StateSpec, StreamingContext}
import org.platform.utils.{KafkaOffsetUtils, RedisClusterUtils}
import org.slf4j.LoggerFactory

object RlTimeStatisticsFromLogV1 {

  lazy val LOG = LoggerFactory.getLogger(RlTimeStatisticsFromLogV1.getClass())

  val isLocal = true
  // When no saved offsets exist, start consuming from the latest Kafka offsets
  val readLastestInFirst = true

  val gson: Gson = new GsonBuilder().serializeSpecialFloatingPointValues()
    .setDateFormat("yyyy-MM-dd HH:mm:ss").create()

  def main(args: Array[String]): Unit = {
    val sparkConf = initSparkConf()
    val ssc = new StreamingContext(sparkConf, Seconds(10))
    var kafkaParams = Map[String, String]("bootstrap.servers" -> "192.168.0.1:9092")
    if (readLastestInFirst) {
      kafkaParams += ("auto.offset.reset" -> OffsetRequest.LargestTimeString)
    }
    val zkClient = new ZkClient("192.168.0.1:2181", 30000, 30000, ZKStringSerializer)
    val zkOffsetPath = "/spark/streaming/kafka/logstat"
    val topicsSet = "logstat".split(",").toSet
    val rdds: InputDStream[(String, String)] = createKafkaStream(ssc, kafkaParams, zkClient, zkOffsetPath, topicsSet)
    val remoteAddrRankCacheKey = "rank_remote_addr"
    val requestUrlRankCacheKey = "rank_request_url"
    // For every micro-batch, bump per-IP and per-URL counters in Redis sorted sets and trim them to the top 100
    rdds.foreachRDD(rdd => {
      if (!rdd.isEmpty()) {
        rdd.foreachPartition(partitions => {
          val redisClusterUtils = RedisClusterUtils.getInstance()
          partitions.foreach(data => {
            LOG.info("read data:" + data)
            val msgJson = data._2
            val mapType = new TypeToken[java.util.HashMap[String, Object]] {}.getType
            val msg = gson.fromJson[java.util.Map[String, Object]](msgJson, mapType).asScala
            val remoteAddr: String = msg.get("remote_addr").get.toString
            redisClusterUtils.zincrby(remoteAddrRankCacheKey, 1.0, remoteAddr)
            val requestUrl: String = msg.get("request_url").get.toString
            redisClusterUtils.zincrby(requestUrlRankCacheKey, 1.0, requestUrl)
            val remoteAddrRankLen = redisClusterUtils.zcard(remoteAddrRankCacheKey)
            if (remoteAddrRankLen > 100) {
              val remoteAddrs = redisClusterUtils.zrevrange(remoteAddrRankCacheKey, 100, remoteAddrRankLen)
              for (remoteAddr <- remoteAddrs.asScala) {
                redisClusterUtils.zrem(remoteAddrRankCacheKey, remoteAddr)
              }
            }
            val requestUrlRankLen = redisClusterUtils.zcard(requestUrlRankCacheKey)
            if (requestUrlRankLen > 100) {
              val requestUrls = redisClusterUtils.zrevrange(requestUrlRankCacheKey, 100, requestUrlRankLen)
              for (requestUrl <- requestUrls.asScala) {
                redisClusterUtils.zrem(requestUrlRankCacheKey, requestUrl)
              }
            }
            val requestUrls = redisClusterUtils.zrevrange(requestUrlRankCacheKey, 0, 100)
            for (requestUrl <- requestUrls.asScala) {
              println(requestUrl)
            }
          })
        })
        // Persist the processed offsets to ZooKeeper so the next run resumes from here
        KafkaOffsetUtils.saveOffsets(zkClient, zkOffsetPath, rdd)
      }
    })
    ssc.start()
    ssc.awaitTermination()
  }

  def initSparkConf(): SparkConf = {
    val sparkConf = new SparkConf().setAppName("Real Time Statistics")
    sparkConf.setMaster(if (isLocal) "local[*]" else "spark://192.168.0.1:7077")
    sparkConf.set("spark.streaming.stopGracefullyOnShutdown", "true")
    sparkConf.set("spark.streaming.backpressure.enabled", "true")
    sparkConf.set("spark.streaming.backpressure.initialRate", "5000")
    sparkConf.set("spark.streaming.kafka.maxRatePerPartition", "2000")
    sparkConf
  }

  // Build a direct Kafka stream, resuming from offsets previously stored in ZooKeeper when available
  def createKafkaStream(ssc: StreamingContext,
                        kafkaParams: Map[String, String],
                        zkClient: ZkClient,
                        zkOffsetPath: String,
                        topics: Set[String]): InputDStream[(String, String)] = {
    val zkOffsetData = KafkaOffsetUtils.readOffsets(zkClient, zkOffsetPath, topics.last)
    val kafkaStream = zkOffsetData match {
      case None =>
        KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](ssc, kafkaParams, topics)
      case Some(lastStopOffset) =>
        val messageHandler = (mmd: MessageAndMetadata[String, String]) => (mmd.key, mmd.message)
        KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder, (String, String)](ssc, kafkaParams, lastStopOffset, messageHandler)
    }
    kafkaStream
  }

  // Unused alternative that accumulates counts in driver-local maps instead of Redis
  var global_addr: util.Map[String, Int] = new util.HashMap[String, Int]()
  var global_url: util.Map[String, Int] = new util.HashMap[String, Int]()

  def processMessage(msgJson: String): Unit = {
    val mapType = new TypeToken[java.util.HashMap[String, Object]] {}.getType
    val msg = gson.fromJson[java.util.Map[String, Object]](msgJson, mapType).asScala
    val remote_addr: String = msg.get("remote_addr").get.toString
    val remote_addr_count: Int = global_addr.getOrDefault(remote_addr, 0) + 1
    global_addr.put(remote_addr, remote_addr_count)
    val request_url: String = msg.get("request_url").get.toString
    val request_url_count: Int = global_url.getOrDefault(request_url, 0) + 1
    global_url.put(request_url, request_url_count)
  }
}
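The rankings maintained by the job are ordinary Redis sorted sets (rank_remote_addr and rank_request_url), so the current top entries can also be read outside of Spark, for example with redis-cli; the host and port below are placeholders for an actual node of the Redis cluster:
redis-cli -c -h 192.168.0.1 -p 6379 ZREVRANGE rank_remote_addr 0 9 WITHSCORES
redis-cli -c -h 192.168.0.1 -p 6379 ZREVRANGE rank_request_url 0 9 WITHSCORES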