At the time of writing, the latest official Flume release still does not support Elasticsearch 2.x; the older Flume 1.6 ElasticSearchSink only supports Elasticsearch 1.7.x, because Elasticsearch 2.x made fairly sweeping changes and deprecated or removed many of the old APIs.
There is a project on GitHub, https://github.com/lucidfrontier45/ElasticsearchSink2, which follows the sink pattern used in the Flume source code, and it is actually quite simple. Recently a project at my company also needed to collect logs into ES for real-time queries, so I went ahead and wrote one myself.
I. Overall architecture:
1. The online business code sends log data to Kafka; the payload is a protobuf (pb) message, so what Kafka stores is a byte stream;
2. A Flume source consumes from Kafka, and a custom interceptor decodes the byte stream into structured data;
3. A custom es-sink writes the structured data into ES (a minimal wiring sketch follows this list).
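In Flume terms the pipeline is Kafka source -> custom interceptor -> channel -> custom es-sink. Below is a minimal wiring sketch using the class names defined later in this post; the agent, source, channel and sink names are placeholders, and the full configuration actually deployed is shown in section III:
agent1.sources.logsource.type = org.apache.flume.source.kafka.KafkaSource
agent1.sources.logsource.interceptors = apifilt
agent1.sources.logsource.interceptors.apifilt.type = com.iqiyi.ttbrain.log.flume.interceptor.ApiInterceptor$Builder
agent1.sources.logsource.channels = mc1
agent1.channels.mc1.type = memory
agent1.sinks.essink.type = com.iqiyi.ttbrain.log.flume.sink.EsSink
agent1.sinks.essink.channel = mc1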
II. Code:
1. The pb definition:
option java_package = "XXX.base.proto";
option java_outer_classname="ApiLogPB";
message ApiLog {
optional string puid = 1;
optional string uId = 2;
optional string reqId = 3;
optional int32 fNum = 4;
optional int32 cost = 5;
optional string chId = 6 [ default = "default"];
optional string strategy = 7;
repeated int64 recId = 8;
optional string txt = 9;
optional string vedio = 10;
optional string gallery = 11;
optional string pMap = 12;
optional string paramMap = 13;
}
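As a quick illustration of what the business code and the interceptor do with this message, here is a serialize/parse round trip using the generated protobuf-java classes; the field values are made up and the demo class name is hypothetical:
import XXX.base.proto.ApiLogPB;
import XXX.base.proto.ApiLogPB.ApiLog;

public class ApiLogPbDemo {
    public static void main(String[] args) throws Exception {
        ApiLog log = ApiLog.newBuilder()
                .setReqId("req-001")   // made-up values
                .setUId("u-123")
                .setChId("default")
                .addRecId(42L)
                .build();
        byte[] bytes = log.toByteArray();                 // the byte stream that is written to Kafka
        ApiLog parsed = ApiLogPB.ApiLog.parseFrom(bytes); // what the interceptor does on the other side
        System.out.println(parsed.getReqId());
    }
}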
2. Building the pb message in the business code:
public Integer call() throws Exception {
Builder newBuilder = ApiLogPB.ApiLog.newBuilder();
newBuilder.setReqId(reqId);
newBuilder.setUId(uId);
newBuilder.setPuid(puid);
newBuilder.setChId(channelId);
newBuilder.setFNum(fNum);
newBuilder.setCost(Integer.parseInt((System.currentTimeMillis()-a)+""));
newBuilder.setStrategy(personalResponse.getStrategy());
List<Long> recIdSet = personalResponse.getRecIdSet();
for (Long fid : recIdSet) {
newBuilder.addRecId(fid);
}
Map<String, List<Long>> typeFeeds = personalResponse.getTypeFeeds();
List<Long> newList = typeFeeds.get("NEWS");
newBuilder.setTxt(newList == null ? "" : newList.toString());
List<Long> galleryList = typeFeeds.get("GALLERY");
newBuilder.setGallery(galleryList == null ? "" : galleryList.toString());
List<Long> vedioList = typeFeeds.get("VIDEO");
newBuilder.setVedio(vedioList == null ? "" : vedioList.toString());
Map<String, List<Long>> mutilFeeds = personalResponse.getMutilFeeds();
StringBuilder pSb = new StringBuilder();
Set<Entry<String, List<Long>>> entrySet = mutilFeeds.entrySet();
int i = 0;
for (Entry<String, List<Long>> entry : entrySet) {
String key = entry.getKey();
List<Long> value = entry.getValue();
if (CollectionUtils.isNotEmpty(value)) {
String v1 = StringUtils.strip(value.toString(), "[]");
pSb.append(key).append(":").append(v1);
if (i < entrySet.size() - 1) {
pSb.append(";");
}
}
i++;
}
newBuilder.setPMap(pSb.toString());
// serialize the message; this byte[] is what the producer writes to Kafka (see the sketch below)
byte[] body = newBuilder.build().toByteArray();
return body.length;
}
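The producer code that actually hands the bytes to Kafka is not shown above. A minimal sketch with the standard Kafka Java client could look like the following; the broker address and the ApiLogSender class are assumptions, while the topic name topic_predict is taken from the Flume configuration in section III:
import java.util.Properties;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;

public class ApiLogSender {
    private final KafkaProducer<String, byte[]> producer;

    public ApiLogSender(String brokers) {
        Properties props = new Properties();
        props.put("bootstrap.servers", brokers); // e.g. "broker1:9092" (assumption)
        props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
        props.put("value.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer");
        this.producer = new KafkaProducer<>(props);
    }

    /** body is the ApiLog.toByteArray() produced by the call() method above. */
    public void send(byte[] body) {
        producer.send(new ProducerRecord<String, byte[]>("topic_predict", body));
    }
}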
3. The entity class shared by the interceptor and the sink:
package XXX.log.common.entity;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.google.protobuf.InvalidProtocolBufferException;
import com.iqiyi.ttbrain.base.proto.ApiLogPB;
import com.iqiyi.ttbrain.base.proto.ApiLogPB.ApiLog;
public class ApiEntity {
public static final int P_N_SIZE = 50;
private static final String flag = "\t";
private String reqId="";
private String uid="";
private String ppuid="";
private String channel="";
private int feedNum=10;
private int cost=0;
private String strategy="";
private String timeStamp="";
private String host="";
private String recFeedId="";
private String txt="";
private String gallery="";
private String vedio="";
private String pMap = "";
private String paraMap = "";
@Override
public String toString() {
StringBuilder sb = new StringBuilder(768);
sb.append(reqId).append(flag).append(uid).append(flag).append(ppuid).append(flag).append(channel).append(flag)
.append(feedNum).append(flag).append(cost).append(flag).append(strategy).append(flag)
.append(timeStamp).append(flag).append(host).append(flag)
.append(recFeedId).append(flag)
.append(txt).append(flag).append(gallery).append(flag).append(vedio).append(flag)
.append(pMap).append(flag)
.append(paraMap).append(flag)
.append("end");
return sb.toString();
}
public static ApiEntity parseFromPB(byte[] bytes) throws InvalidProtocolBufferException{
ApiEntity an = null;
if (bytes != null) {
an = new ApiEntity();
ApiLog apiLog = ApiLogPB.ApiLog.parseFrom(bytes);
an.setReqId(apiLog.getReqId());
an.setUid(apiLog.getUId());
an.setPpuid(apiLog.getPuid());
an.setChannel(apiLog.getChId());
an.setFeedNum(apiLog.getFNum());
an.setCost(apiLog.getCost());
an.setStrategy(apiLog.getStrategy());
StringBuilder sb = new StringBuilder();
List<Long> recIdList = apiLog.getRecIdList();
for (int i = 0; i < recIdList.size(); i++) {
sb.append(recIdList.get(i));
if (i < recIdList.size() - 1) {
sb.append(",");
}
}
an.setRecFeedId(sb.toString());
an.setTxt(apiLog.getTxt());
an.setGallery(apiLog.getGallery());
an.setVedio(apiLog.getVedio());
an.setpMap(apiLog.getPMap());
an.setParaMap(apiLog.getParamMap());
}
return an;
}
// getters and setters omitted
public static Map<String, String> parseToMap(String line) {
Map<String, String> map = new HashMap<>();
if (line != null) {
String[] split = line.split(flag);
if (split.length>=15) {
map.put("reqId", split[0]);
map.put("uid", split[1]);
map.put("ppuid", split[2]);
map.put("channel", split[3]);
map.put("feedNum", split[4]);
map.put("cost", split[5]);
map.put("strategy", split[6]);
map.put("timeStamp", split[7]);
map.put("host", split[8]);
map.put("recFeedId", split[9]);
map.put("txt", split[10]);
map.put("gallery", split[11]);
map.put("vedio", split[12]);
map.put("pMap", split[13]);
map.put("paraMap", split[14]);
}
}
/*BeanMap beanMap = BeanMap.create(an);
for (Object key : beanMap.keySet()) {
map.put((String)key, beanMap.get(key));
}*/
return map;
}
public static void main(String...strings) {
ApiEntity le = new ApiEntity();
le.setChannel("default");
le.setParaMap("sdfsdfdsfdsfds");
le.setpMap("12321321321321");
String line = le.toString();
System.out.println(line);
String[] split = line.split(flag);
System.out.println(split.length);
Map<String, String> parseToMap = parseToMap(line);
System.out.println(parseToMap);
System.out.println(split[15]);
}
}
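The toString()/parseToMap() pair is the glue between the interceptor and the sink: the interceptor writes the tab-separated line into the event body, and the sink parses it back into a field map for ES. A quick round-trip illustration with made-up values (the demo class is hypothetical):
import java.util.Map;
import com.iqiyi.ttbrain.log.common.entity.ApiEntity;

public class ApiEntityDemo {
    public static void main(String[] args) {
        ApiEntity e = new ApiEntity();
        e.setReqId("req-001");   // made-up values
        e.setChannel("default");
        String line = e.toString();                           // what the interceptor puts into the event body
        Map<String, String> doc = ApiEntity.parseToMap(line); // what the sink indexes into ES
        System.out.println(doc.get("reqId"));                 // prints req-001
    }
}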
4. The custom interceptor:
package XXX.log.flume.interceptor;
import java.util.List;
import java.util.Map;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.interceptor.Interceptor;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Lists;
import com.iqiyi.ttbrain.log.common.entity.ApiEntity;
public class ApiInterceptor implements Interceptor {
private static final Logger logger = LoggerFactory.getLogger(ApiInterceptor.class);
@Override
public void close() {
logger.info("flume ApiInterceptor is close");
}
@Override
public void initialize() {
logger.info("flume ApiInterceptor is initialize");
}
@Override
public Event intercept(Event event) {
try {
Map<String, String> headers = event.getHeaders();
byte[] body = event.getBody();
if (body != null) {
ApiEntity apiLog = null;
try{
apiLog = ApiEntity.parseFromPB(body);
} catch (Exception e) {
logger.error("failed to parse pb from event body:", e);
}
if (apiLog != null) {
String hostName = headers.get("hostname");
String timeStamp = headers.get("timestamp");
apiLog.setHost(hostName);
apiLog.setTimeStamp(timeStamp);
// logger.info(apiLog.toString());
event.setBody(apiLog.toString().getBytes());
return event;
}
}
} catch (Exception e ) {
logger.error("intercept:",e);
}
return null;
}
@Override
public List<Event> intercept(List<Event> events) {
List<Event> intercepted = Lists.newArrayListWithCapacity(events.size());
for (Event event : events) {
Event interceptedEvent = intercept(event);
if (interceptedEvent != null) {
intercepted.add(interceptedEvent);
}
}
return intercepted;
}
public static class Builder implements Interceptor.Builder {
// Flume uses this Builder to create the Interceptor instance
@Override
public Interceptor build() {
return new ApiInterceptor();
}
@Override
public void configure(Context arg0) {
}
}
}
5. The EsDao utility class:
package XXX.log.flume.sink.db;
import java.net.InetSocketAddress;
import java.util.ArrayList;
import java.util.Date;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.index.IndexResponse;
import org.elasticsearch.client.transport.TransportClient;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.transport.InetSocketTransportAddress;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
public class EsDao {
private static final Logger logger = LoggerFactory.getLogger(EsDao.class);
// ip:port
private static final String clusterHost = "1.1.1.1:9300";
private static final String clusterName = "test";
private static TransportClient transportClient = null;
static {
Settings settings = Settings.settingsBuilder()
.put("cluster.name", clusterName).build();
transportClient = TransportClient.builder().settings(settings).build();
String[] hostNames = clusterHost.split(",");
InetSocketTransportAddress[] serverAddresses = new InetSocketTransportAddress[hostNames.length];
for (int i = 0; i < hostNames.length; i++) {
String[] hostPort = hostNames[i].trim().split(":");
String host = hostPort[0].trim();
int port = hostPort.length == 2 ? Integer.parseInt(hostPort[1]
.trim()) : 9300;
serverAddresses[i] = new InetSocketTransportAddress(
new InetSocketAddress(host, port));
}
for (InetSocketTransportAddress host : serverAddresses) {
transportClient.addTransportAddress(host);
}
}
private EsDao() {
}
public static void closeClient() {
transportClient.close();
}
/**
 * bulk - add, update or delete multiple documents in a single request
 * @param indexName
 * @param indexType
 * @param datas one Map per document
 * @return true if the whole bulk request succeeded
 */
public static boolean bulk(String indexName, String indexType, List<Map<String, Object>> datas) {
if (datas == null || datas.isEmpty()) {
return true;
}
try {
BulkRequestBuilder bulkRequest = transportClient.prepareBulk();
for (Map<String, Object> data : datas) {
bulkRequest.add(transportClient.prepareIndex(indexName, indexType).setSource(data));
}
BulkResponse bulkResponse = bulkRequest.execute().actionGet();
if (bulkResponse.hasFailures()) {
logger.error("bulk index has failures: {}", bulkResponse.buildFailureMessage());
return false;
}
return true;
} catch (Exception e) {
logger.error("bulk:", e);
return false;
}
}
}
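Indexing is then a single static call. A small usage sketch (the document content is made up; "api" and "api_type" are the index and type defaults used by the sink below, and the demo class is hypothetical):
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import com.iqiyi.ttbrain.log.flume.sink.db.EsDao;

public class EsDaoDemo {
    public static void main(String[] args) {
        Map<String, Object> doc = new HashMap<String, Object>();
        doc.put("reqId", "req-001"); // made-up document
        doc.put("cost", 35);
        List<Map<String, Object>> batch = new ArrayList<Map<String, Object>>();
        batch.add(doc);
        boolean ok = EsDao.bulk("api", "api_type", batch);
        System.out.println("bulk ok: " + ok);
        EsDao.closeClient();
    }
}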
6. The custom es-sink:
package XXX.log.flume.sink;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.flume.Channel;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.Transaction;
import org.apache.flume.conf.Configurable;
import org.apache.flume.sink.AbstractSink;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import com.google.common.collect.Lists;
import com.iqiyi.ttbrain.log.common.entity.ApiEntity;
import com.iqiyi.ttbrain.log.flume.sink.db.EsDao;
public class EsSink extends AbstractSink implements Configurable {
private static final Logger logger = LoggerFactory.getLogger(EsSink.class);
private String indexName = "api";
private String indexType = "api_type";
private int batchSize = 100;
public EsSink() {
logger.info("EsSink start...");
}
@Override
public void start() {
super.start();
}
@Override
public void stop() {
super.stop();
//DataSourceUtils.closeDs();
}
@Override
public Status process() throws EventDeliveryException {
Status result = Status.READY;
Transaction transaction = null;
Event event = null;
String content = "";
List<Map<String, Object>> mapList = Lists.newArrayListWithCapacity(batchSize);
try {
Channel channel = getChannel();
transaction = channel.getTransaction();
transaction.begin();
for (int i = 0; i < batchSize; i++) {
event = channel.take();
if (event == null) {
result = Status.BACKOFF;
break;
}
content = new String(event.getBody());
// the interceptor wrote a tab-separated ApiEntity line; turn it back into a field map for ES
Map<String, String> fields = ApiEntity.parseToMap(content);
if (!fields.isEmpty()) {
mapList.add(new HashMap<String, Object>(fields));
}
}
if (!mapList.isEmpty()) {
EsDao.bulk(indexName, indexType, mapList);
}
transaction.commit();
} catch (Exception e) {
logger.error("process:", e);
if (transaction != null) {
transaction.rollback();
}
result = Status.BACKOFF;
} finally {
if (transaction != null) {
transaction.close();
}
}
return result;
}
@Override
public void configure(Context context) {
// indexName, indexType and batchSize are hard coded above; they could be read from the context here
}
}
III. Deployment:
1) Package the custom interceptor and sink (together with their dependencies) into a jar and put it under Flume's lib directory, so that the classes referenced in the configuration below can be loaded.
2) The Flume configuration is as follows:
agent1.sources = logsource
agent1.channels = mc1 mc2
agent1.sinks = avro-sink sink2
agent1.sources.logsource.channels = mc1 mc2
agent1.sinks.avro-sink.channel = mc1
agent1.sinks.sink2.channel = mc2
#source
agent1.sources.logsource.type = org.apache.flume.source.kafka.KafkaSource
agent1.sources.logsource.zookeeperConnect = ttAlgorithm-kafka-online001-jyltqbs.qiyi.virtual:2181,ttAlgorithm-kafka-online002-jyltqbs.qiyi.virtual:2181,ttAlgorithm-kafka-online003-jyltqbs.qiyi.virtual:2181,ttAlgorithm-kafka-online004-jyltqbs.qiyi.virtual:2181,ttAlgorithm-kafka-online005-jyltqbs.qiyi.virtual:2181
agent1.sources.logsource.topic = topic_predict
agent1.sources.logsource.groupId = flume
agent1.sources.logsource.kafka.consumer.timeout.ms = 100
#interceptor
agent1.sources.logsource.interceptors=filt2 filt3 filt4
agent1.sources.logsource.interceptors.filt2.type=host
agent1.sources.logsource.interceptors.filt2.hostHeader=hostname
agent1.sources.logsource.interceptors.filt2.useIP=true
agent1.sources.logsource.interceptors.filt3.type=timestamp
agent1.sources.logsource.interceptors.filt4.type=com.iqiyi.ttbrain.log.flume.interceptor.PredictInterceptor$Builder
agent1.sources.logsource.selector.type = multiplexing
agent1.sources.logsource.selector.header = isCool
agent1.sources.logsource.selector.mapping.0 = mc1
agent1.sources.logsource.selector.mapping.1 = mc2
agent1.sources.logsource.selector.default = mc1
#channel1
agent1.channels.mc1.type = memory
agent1.channels.mc1.capacity = 10000
agent1.channels.mc1.transactionCapacity = 10000
agent1.channels.mc1.keep-alive = 60
#channel2
agent1.channels.mc2.type = memory
agent1.channels.mc2.capacity = 10000
agent1.channels.mc2.transactionCapacity = 10000
agent1.channels.mc2.keep-alive = 60
#sink1
#agent1.sinks.avro-sink.type = file_roll
#agent1.sinks.avro-sink.sink.directory = /data/mysink
#agent1.sinks.avro-sink.sink.rollInterval = 10000000
agent1.sinks.avro-sink.type = hdfs
agent1.sinks.avro-sink.hdfs.path = hdfs://hadoop-jy-namenode/data/qytt/flume/ttengine_predict/dt=%Y-%m-%d/hour=%H/
agent1.sinks.avro-sink.hdfs.writeFormat = Text
agent1.sinks.avro-sink.hdfs.fileType = DataStream
agent1.sinks.avro-sink.hdfs.fileSuffix = .log
agent1.sinks.avro-sink.hdfs.filePrefix = %Y-%m-%d_%H
agent1.sinks.avro-sink.hdfs.rollInterval = 3600
agent1.sinks.avro-sink.hdfs.rollSize = 0
agent1.sinks.avro-sink.hdfs.rollCount = 0
agent1.sinks.avro-sink.hdfs.batchSize = 1000
agent1.sinks.avro-sink.hdfs.callTimeout = 60000
agent1.sinks.avro-sink.hdfs.appendTimeout = 60000
#sink2
agent1.sinks.sink2.type = hdfs
agent1.sinks.sink2.hdfs.path = hdfs://hadoop-jy-namenode/data/qytt/flume/ttengine_predict_cool_start/dt=%Y-%m-%d/hour=%H/
agent1.sinks.sink2.hdfs.writeFormat = Text
agent1.sinks.sink2.hdfs.fileType = DataStream
agent1.sinks.sink2.hdfs.fileSuffix = .log
agent1.sinks.sink2.hdfs.filePrefix = %Y-%m-%d_%H
agent1.sinks.sink2.hdfs.rollInterval = 3600
agent1.sinks.sink2.hdfs.rollSize = 0
agent1.sinks.sink2.hdfs.rollCount = 0
agent1.sinks.sink2.hdfs.batchSize = 1000
agent1.sinks.sink2.hdfs.callTimeout = 60000
agent1.sinks.sink2.hdfs.appendTimeout = 60000
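Note that the configuration above only ships the intercepted events to HDFS sinks; wiring in the custom es-sink is analogous. A minimal sketch follows (the sink name essink is a placeholder, and since the sink above hard-codes indexName, indexType and batchSize, only type and channel are required). Keep in mind that a Flume channel delivers each event to only one of its sinks, so if the HDFS sink and the ES sink must both receive every event, they need to be fed by separate channels:
# add essink to agent1.sinks and point it at a channel fed by the Kafka source
agent1.sinks.essink.type = com.iqiyi.ttbrain.log.flume.sink.EsSink
agent1.sinks.essink.channel = mc1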
3) Notes:
Flume 1.6 and Elasticsearch 2.3 have conflicting jar versions. When building the flume-es-sink you need to depend on:
<dependency>
    <groupId>org.elasticsearch</groupId>
    <artifactId>elasticsearch</artifactId>
    <version>2.3.2</version>
</dependency>
You will then run into two errors: java.lang.NoSuchMethodError: com.google.common.util.concurrent.MoreExecutors.directExecutor()Ljava/util/concurrent/Executor and Exception in thread "main" java.lang.NoSuchFieldError: FAIL_ON_SYMBOL_HASH_OVERFLOW. The fix is to also add the following to the pom:
<dependency>
    <groupId>com.google.guava</groupId>
    <artifactId>guava</artifactId>
    <version>18.0</version>
</dependency>
and
<dependency>
    <groupId>com.fasterxml.jackson.core</groupId>
    <artifactId>jackson-core</artifactId>
    <version>2.6.2</version>
</dependency>
Reference:
http://tech.lede.com/2017/02/08/rd/server/flumeToEs/