Nginx access log format: $remote_addr - $remote_user [$time_local] "$request" $status $body_bytes_sent "$http_referer" "$http_user_agent"
For example: 192.168.241.1 - - [02/Mar/2017:15:22:57 +0800] "GET /favicon.ico HTTP/1.1" 404 209 "http://192.168.241.10/" "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36"
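This is nginx's built-in combined format. If it needs to be declared explicitly, the nginx.conf directives would look roughly like the sketch below (the format name main is an assumption; the log path matches the one tailed by the Flume agent later on):

log_format  main  '$remote_addr - $remote_user [$time_local] "$request" '
                  '$status $body_bytes_sent "$http_referer" "$http_user_agent"';
access_log  /etc/nginx/log/access.log  main;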
Since there may be more than one Nginx server, a two-tier Agent topology is used to collect the logs.
The Interceptor is a custom interceptor that uses a configured regular expression to drop useless records and convert each text record into a map object.
HBaseSink is a custom HBase sink that writes the map object into HBase; a time field can be configured as the row-key prefix so that rows sort in ascending time order.
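Putting the pieces together, the flow looks like this (a rough sketch of the two tiers described above; the first tier ships events to the second over Avro RPC):

Tier 1 (each Nginx host): exec source (tail -F access.log) -> LogFormatInterceptor -> memory channel -> avro sink
Tier 2 (collector host):  avro source -> memory channel -> HBase sink (MapBodyHbaseSink serializer) -> HBase table access_logs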
package com.mine.flume.interceptors;

import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;
import org.apache.log4j.Logger;

import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class LogFormatInterceptor implements Interceptor {

    private static Logger logger = Logger.getLogger(LogFormatInterceptor.class);

    private Pattern regex;   // compiled regular expression
    private String[] keys;   // keys corresponding to the capture groups

    public LogFormatInterceptor(Pattern regex, String[] keys) {
        this.regex = regex;
        this.keys = keys;
    }

    public void initialize() {
    }

    public Event intercept(Event event) {
        byte[] body = event.getBody();
        String record = new String(body);
        Matcher matcher = regex.matcher(record);
        if (matcher.find()) {
            int groupCount = matcher.groupCount();
            if (groupCount != keys.length) {
                logger.error("number of regex groups does not match number of keys");
                event = null;
            } else {
                // build the map from the capture groups
                Map<String, String> map = Util.matcherToMap(matcher, keys);
                // serialize the map and replace the event body with it
                event.setBody(Util.mapToByteArray(map));
            }
        } else {
            // the record does not match the log format, drop it
            event = null;
        }
        return event;
    }

    public List<Event> intercept(List<Event> list) {
        List<Event> events = new ArrayList<Event>();
        for (Event event : list) {
            Event nEvent = intercept(event);
            if (nEvent != null) { // keep only the valid events
                events.add(nEvent);
            }
        }
        return events;
    }

    public void close() {
    }

    public static class Builder implements Interceptor.Builder {

        private Pattern regex;   // compiled regular expression
        private String[] keys;   // keys corresponding to the capture groups

        public Builder() {
        }

        public Interceptor build() {
            return new LogFormatInterceptor(regex, keys);
        }

        public void configure(Context context) {
            String regexString = context.getString("regex", ".*");
            regex = Pattern.compile(regexString);
            String keyString = context.getString("keys");
            keys = keyString.split(",");
        }
    }
}
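Both the interceptor above and the HBase sink shown later call into a small Util helper class that is not included here. The sketch below shows one way it could be written, assuming the map is serialized with Java object streams and $time_local is parsed with SimpleDateFormat; the package name and date formats are assumptions, so place the class (or add imports) so that both the interceptor and the sink can see it.

package com.mine.flume.util; // package name is an assumption

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.regex.Matcher;

public class Util {

    // Nginx $time_local format, e.g. "02/Mar/2017:15:22:57 +0800"
    private static final String NGINX_TIME_FORMAT = "dd/MMM/yyyy:HH:mm:ss Z";
    // human-readable format written back into the map before storage
    private static final String OUTPUT_TIME_FORMAT = "yyyy-MM-dd HH:mm:ss";

    /** Map each capture group of an already-matched Matcher to the key with the same index. */
    public static Map<String, String> matcherToMap(Matcher matcher, String[] keys) {
        Map<String, String> map = new HashMap<String, String>();
        for (int i = 0; i < keys.length; i++) {
            map.put(keys[i], matcher.group(i + 1));
        }
        return map;
    }

    /** Serialize the map with Java object streams so it can travel as the Event body. */
    public static byte[] mapToByteArray(Map<String, String> map) {
        try {
            ByteArrayOutputStream bos = new ByteArrayOutputStream();
            ObjectOutputStream oos = new ObjectOutputStream(bos);
            oos.writeObject(map);
            oos.close();
            return bos.toByteArray();
        } catch (IOException e) {
            throw new RuntimeException("failed to serialize map", e);
        }
    }

    /** Inverse of mapToByteArray, used by the HBase sink. */
    @SuppressWarnings("unchecked")
    public static Map<String, String> byteArrayToMap(byte[] bytes) {
        try {
            ObjectInputStream ois = new ObjectInputStream(new ByteArrayInputStream(bytes));
            return (Map<String, String>) ois.readObject();
        } catch (Exception e) {
            throw new RuntimeException("failed to deserialize map", e);
        }
    }

    /** Parse $time_local; returns null when the value cannot be parsed. */
    public static Date parseDate(String timeLocal) {
        try {
            return new SimpleDateFormat(NGINX_TIME_FORMAT, Locale.ENGLISH).parse(timeLocal);
        } catch (ParseException e) {
            return null;
        }
    }

    /** Format the access time in a readable form before it is written to HBase. */
    public static String formatDate(Date date) {
        return new SimpleDateFormat(OUTPUT_TIME_FORMAT).format(date);
    }
}

Java serialization keeps the sketch self-contained; any other encoding such as JSON would work just as well, as long as the interceptor and the sink agree on it.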
How the interceptor is wired in: the Channel Processor creates a Builder instance via Class.newInstance(), passes the Context built from the configuration file to Builder.configure() so the interceptor's settings can be read (here regex and keys), and then calls Builder.build() to obtain the Interceptor. Interceptor.intercept(List) is subsequently invoked on each batch of events. The first-tier agent, which runs on each Nginx host, is configured as follows:
agent.sources = execSrc
agent.channels = memoryChannel
agent.sinks = avroSink
# tail the access log in real time with an exec source
agent.sources.execSrc.type = exec
agent.sources.execSrc.command = tail -F /etc/nginx/log/access.log
agent.sources.execSrc.channels = memoryChannel
agent.sources.execSrc.interceptors = logformat
agent.sources.execSrc.interceptors.logformat.type = com.mine.flume.interceptors.LogFormatInterceptor$Builder
agent.sources.execSrc.interceptors.logformat.regex = (\\d+\\.\\d+\\.\\d+\\.\\d+) - ([^ ]*) \\[(.*)\\] \"(.*)\" ([^ ]*) ([^ ]*) \"([^\"]*)\" \"([^\"]*)\"
agent.sources.execSrc.interceptors.logformat.keys = remote_addr,remote_user,time_local,request,status,body_bytes_sent,http_referer,http_user_agent
agent.sinks.avroSink.type = avro
agent.sinks.avroSink.hostname = 192.168.241.10
agent.sinks.avroSink.port = 1111
agent.sinks.avroSink.channel = memoryChannel
agent.channels.memoryChannel.type = memory
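Applied to the sample record shown at the top, this regex and key list produce a map roughly like the one below, which the interceptor then serializes into the event body:

remote_addr     = 192.168.241.1
remote_user     = -
time_local      = 02/Mar/2017:15:22:57 +0800
request         = GET /favicon.ico HTTP/1.1
status          = 404
body_bytes_sent = 209
http_referer    = http://192.168.241.10/
http_user_agent = Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 ...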
package com.mine.flume.sinks;

import com.google.common.base.Charsets;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.FlumeException;
import org.apache.flume.conf.ComponentConfiguration;
import org.apache.flume.sink.hbase.HbaseEventSerializer;
import org.apache.flume.sink.hbase.SimpleRowKeyGenerator;
import org.apache.hadoop.hbase.client.Increment;
import org.apache.hadoop.hbase.client.Put;
import org.apache.hadoop.hbase.client.Row;
import org.apache.log4j.Logger;

import java.util.Date;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

public class MapBodyHbaseSink implements HbaseEventSerializer {

    private static Logger logger = Logger.getLogger(MapBodyHbaseSink.class);

    private String rowPrefix;
    private byte[] incrementRow;
    private byte[] cf;
    private Map<String, String> body;
    private byte[] incCol;
    private KeyType keyType;

    public void configure(Context context) {
        rowPrefix = context.getString("rowPrefix", "default");
        incrementRow = context.getString("incrementRow", "incRow").getBytes(Charsets.UTF_8);
        String suffix = context.getString("suffix", "uuid");
        String incColumn = context.getString("incrementColumn", "iCol");
        if (suffix.equals("timestamp")) {
            keyType = KeyType.TS;
        } else if (suffix.equals("random")) {
            keyType = KeyType.RANDOM;
        } else if (suffix.equals("nano")) {
            keyType = KeyType.TSNANO;
        } else if (suffix.equals("timePrefix")) {
            // custom key type: use a time field as the row-key prefix,
            // rowPrefix names that time field inside the map
            keyType = KeyType.TIMEPREFIX;
        } else {
            keyType = KeyType.UUID;
        }
        if (incColumn != null && !incColumn.isEmpty()) {
            incCol = incColumn.getBytes(Charsets.UTF_8);
        }
    }

    public void configure(ComponentConfiguration conf) {
    }

    public void initialize(Event event, byte[] cf) {
        this.body = Util.byteArrayToMap(event.getBody());
        this.cf = cf;
    }

    public List<Row> getActions() throws FlumeException {
        List<Row> actions = new LinkedList<Row>();
        if (body != null && !body.isEmpty()) {
            byte[] rowKey;
            try {
                if (keyType == KeyType.TS) {
                    rowKey = SimpleRowKeyGenerator.getTimestampKey(rowPrefix);
                } else if (keyType == KeyType.RANDOM) {
                    rowKey = SimpleRowKeyGenerator.getRandomKey(rowPrefix);
                } else if (keyType == KeyType.TSNANO) {
                    rowKey = SimpleRowKeyGenerator.getNanoTimestampKey(rowPrefix);
                } else {
                    // use a local prefix so the configured rowPrefix (the field name) is not overwritten between events
                    String keyPrefix = rowPrefix;
                    if (keyType == KeyType.TIMEPREFIX) {
                        String dateString = body.get(rowPrefix);
                        if (dateString == null) {
                            return actions;
                        }
                        Date accessTime = Util.parseDate(dateString);
                        if (accessTime == null) { // failed to parse the access time
                            logger.error("failed to parse time: " + body.get(rowPrefix));
                            return actions;
                        }
                        body.put(rowPrefix, Util.formatDate(accessTime));
                        keyPrefix = String.valueOf(accessTime.getTime());
                    }
                    rowKey = SimpleRowKeyGenerator.getUUIDKey(keyPrefix);
                }
                for (String col : body.keySet()) {
                    Put put = new Put(rowKey);
                    put.add(cf, col.getBytes(), body.get(col).getBytes());
                    actions.add(put);
                }
            } catch (Exception e) {
                throw new FlumeException("Could not get row key!", e);
            }
        }
        return actions;
    }

    public List<Increment> getIncrements() {
        List<Increment> incs = new LinkedList<Increment>();
        if (incCol != null) {
            Increment inc = new Increment(incrementRow);
            inc.addColumn(cf, incCol, 1);
            incs.add(inc);
        }
        return incs;
    }

    public void close() {
    }

    public enum KeyType {
        UUID,
        RANDOM,
        TS,
        TSNANO,
        TIMEPREFIX
    }
}
How the HBase sink works: configure() reads settings such as the row-key type and the row-key prefix. I added a row-key type timePrefix that lets a time field from the record be used as the row-key prefix; since HBase stores rows in ascending row-key order, this keeps the records sorted by access time. For the sample record above, time_local 02/Mar/2017:15:22:57 +0800 parses to 1488439377000 milliseconds, so the row key is that timestamp followed by a random UUID, and the row gets one column per field under the configured column family. The second-tier agent, which runs on the collector host, is configured as follows:
agent.sources = avroSrc
agent.channels = memoryChannel
agent.sinks = hbaseSink
agent.sources.avroSrc.type = avro
agent.sources.avroSrc.bind = 192.168.241.10
agent.sources.avroSrc.port = 1111
agent.sources.avroSrc.channels = memoryChannel
agent.sinks.hbaseSink.type = hbase
agent.sinks.hbaseSink.table = access_logs
agent.sinks.hbaseSink.columnFamily = log
agent.sinks.hbaseSink.serializer = com.mine.flume.sinks.MapBodyHbaseSink
agent.sinks.hbaseSink.serializer.rowPrefix = time_local
agent.sinks.hbaseSink.serializer.suffix = timePrefix
agent.sinks.hbaseSink.channel = memoryChannel
agent.channels.memoryChannel.type = memory
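The HBase sink expects the target table and column family to exist before the agent starts; with the names used in the configuration above, they can be created in the HBase shell:

create 'access_logs', 'log'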
Package the project as a jar and drop it into Flume's lib directory.
Once everything is configured, start Flume:
flume-ng agent -c /home/user/hadoop/apache-flume-1.7.0-bin/conf -f /home/user/hadoop/apache-flume-1.7.0-bin/conf/flume-conf.properties -n agent
-c: directory containing the Flume configuration files
-f: path to the agent configuration file
-n: agent name (the prefix used in the configuration file, i.e. the agent in agent.sources)
1. NoClassDefFoundError for ProtobufUtil at runtime
This happens when the jar is exported directly from IDEA: the exported jar bundles class files from the dependency jars, which then clash with the jars already present in flume/lib. Building with mvn install instead, and copying only the project's own jar from the local repository into the lib directory, avoids the conflict.
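A minimal sketch of that packaging step; the artifact name below is a placeholder for whatever your pom.xml actually produces:

mvn clean install
cp target/flume-nginx-extensions-1.0.jar /home/user/hadoop/apache-flume-1.7.0-bin/lib/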