flume连接metaQ传数据到spark进行实时计算

以前我们在使用hadoop的时候,都是使用shell脚本来从线上服务器拉取日志(今天凌晨把昨天生成的日志拉过来)。这种方式适用于日志量较小的情况。但是当日志量很大时,比如一天1T的日志,再使用shell脚本就不能满足要求了。有两个原因:

1. 拉取的速度较慢。因为从线上服务器拉取日志是需要跨机房的,而且机房的带宽有限,如果在拉取日志时不加限制,那么带宽就全被拉取日志占了,留给线上服务的带宽就会很小,所以这样就需要舍弃shell脚本的方式。

2. shell脚本拉取日志经常会出现失败的情况,不是很稳定。

现在我们使用的工具是flume,它的部署方式是多个client向一个server传日志。

flume连接metaQ传数据到spark进行实时计算_第1张图片

部署的client和server都是用同一个安装包,只是配置文件不同。安装地址见:

http://download.csdn.net/detail/aaa1117a8w5s6d/7973839


我们在实时计算的时候需要使用一个MQ,我们选择的是淘宝的MetaQ,但是flume默认是不会向metaQ导数据的,所以就需要我们修改源码,

在flume-ng-core工程(flume的核心代码)下的org.apache.flume.sink包里添加一个类:

/**
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.flume.sink;

import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.SynchronousQueue;
import java.util.concurrent.ThreadPoolExecutor;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicInteger;

import org.apache.flume.Channel;
import org.apache.flume.Context;
import org.apache.flume.Event;
import org.apache.flume.EventDeliveryException;
import org.apache.flume.Transaction;
import org.apache.flume.conf.Configurable;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.taobao.metamorphosis.Message;
import com.taobao.metamorphosis.client.MessageSessionFactory;
import com.taobao.metamorphosis.client.MetaClientConfig;
import com.taobao.metamorphosis.client.MetaMessageSessionFactory;
import com.taobao.metamorphosis.client.producer.MessageProducer;
import com.taobao.metamorphosis.client.producer.SendResult;
import com.taobao.metamorphosis.exception.MetaClientException;
import com.taobao.metamorphosis.utils.ZkUtils.ZKConfig;

/**
 * Flume sink that forwards channel events to a MetaQ (Metamorphosis) topic.
 *
 * <p>
 * Each call to {@link #process()} starts {@code sink.threadNum} worker tasks.
 * Every worker opens its own channel transaction, takes up to
 * {@code sink.batchSize} events, sends them inside one MetaQ producer
 * transaction and then commits both. If anything fails, both transactions are
 * rolled back so the events stay in the channel.
 *
 * <p>
 * Configuration keys (read from the sink's {@link Context}):
 * <ul>
 * <li>{@code sink.zkConnect} - ZooKeeper connect string (required)</li>
 * <li>{@code sink.zkRoot} - ZooKeeper root path (optional)</li>
 * <li>{@code sink.topic} - target topic (required)</li>
 * <li>{@code sink.batchSize} - max events per worker transaction, default
 * 10000</li>
 * <li>{@code sink.threadNum} - workers per {@code process()} call, default
 * 50</li>
 * </ul>
 */
public class MetaQSink extends AbstractSink implements Configurable {

	private static final Logger logger = LoggerFactory
			.getLogger(MetaQSink.class);

	private static final int DEFAULT_BATCH_SIZE = 10000;
	private static final int DEFAULT_THREAD_NUM = 50;
	/** How long stop() waits for in-flight workers before forcing shutdown. */
	private static final long SHUTDOWN_WAIT_SECONDS = 10L;

	private MessageSessionFactory sessionFactory;
	private MessageProducer producer;

	private String zkConnect;
	private String zkRoot;
	private String topic;
	private int batchSize;
	private int threadNum;
	private ExecutorService executor;

	/**
	 * Reads the sink settings, then creates the MetaQ session factory, the
	 * producer and the worker pool.
	 *
	 * @param context
	 *            sink configuration
	 * @throws IllegalArgumentException
	 *             if a required key is missing or a numeric setting is not
	 *             positive
	 * @throws RuntimeException
	 *             if the MetaQ session factory cannot be created
	 */
	@Override
	public void configure(Context context) {

		this.zkConnect = context.getString("sink.zkConnect");
		this.zkRoot = context.getString("sink.zkRoot");
		this.topic = context.getString("sink.topic");
		this.batchSize = context.getInteger("sink.batchSize",
				DEFAULT_BATCH_SIZE);
		this.threadNum = context.getInteger("sink.threadNum",
				DEFAULT_THREAD_NUM);

		if (zkConnect == null || topic == null) {
			throw new IllegalArgumentException(
					"sink.zkConnect and sink.topic are required, got zkConnect:"
							+ zkConnect + ", topic:" + topic);
		}
		if (batchSize <= 0 || threadNum <= 0) {
			throw new IllegalArgumentException(
					"sink.batchSize and sink.threadNum must be positive, got batchSize:"
							+ batchSize + ", threadNum:" + threadNum);
		}

		// If configure() is invoked again, do not leak the previous
		// session/producer/pool.
		releaseResources();

		MetaClientConfig metaClientConfig = new MetaClientConfig();
		ZKConfig zkConfig = new ZKConfig();
		zkConfig.zkConnect = zkConnect;
		if (zkRoot != null) {
			// Leave ZKConfig's own value untouched when no root is configured
			// instead of overwriting it with null.
			zkConfig.zkRoot = zkRoot;
		}
		metaClientConfig.setZkConfig(zkConfig);
		try {
			sessionFactory = new MetaMessageSessionFactory(metaClientConfig);
		} catch (MetaClientException e) {
			logger.error("Failed to create MetaQ session factory, zkConnect:"
					+ zkConnect + ", zkRoot:" + zkRoot, e);
			// Keep the cause so the real failure is visible to the caller.
			throw new RuntimeException("init error", e);
		}
		producer = sessionFactory.createProducer();
		// Create the pool last so a failed init leaves no idle pool behind.
		executor = Executors.newCachedThreadPool();
		logger.info("zkConnect:" + zkConnect + ", zkRoot:" + zkRoot
				+ ", topic:" + topic);
	}

	/**
	 * Releases the worker pool, the producer and the session factory, then
	 * delegates to the superclass.
	 */
	@Override
	public synchronized void stop() {
		releaseResources();
		super.stop();
	}

	/**
	 * Runs {@code threadNum} workers, each delivering one batch, and waits for
	 * all of them to finish.
	 *
	 * @return {@code Status.BACKOFF} when no event was delivered by any worker,
	 *         otherwise {@code Status.READY}
	 * @throws EventDeliveryException
	 *             if the sink has not been configured or was already stopped
	 */
	@Override
	public Status process() throws EventDeliveryException {
		if (producer == null || executor == null) {
			throw new EventDeliveryException(
					"MetaQSink is not configured or has been stopped");
		}
		long start = System.currentTimeMillis();
		producer.publish(topic);
		Status result = Status.READY;
		final Channel channel = getChannel();
		final AtomicInteger delivered = new AtomicInteger(0);
		final CountDownLatch done = new CountDownLatch(threadNum);
		for (int t = 0; t < threadNum; t++) {
			executor.execute(new Runnable() {

				@Override
				public void run() {
					try {
						delivered.addAndGet(deliverBatch(channel));
					} finally {
						// Always release the latch, even if the worker dies,
						// so process() cannot hang.
						done.countDown();
					}
				}
			});
		}
		try {
			done.await();
		} catch (InterruptedException e) {
			// Restore the flag so the calling runner thread can observe the
			// interruption.
			Thread.currentThread().interrupt();
			logger.warn("Interrupted while waiting for MetaQ workers", e);
		}
		if (delivered.get() == 0) {
			result = Status.BACKOFF;
		}

		// NOTE(review): getSize() is assumed to exist on this build's Channel
		// type - confirm against the Flume version in use.
		logger.info("metaqSink_new,process:{},time:{},queue_size:{}",
				new Object[] { delivered.get(),
						System.currentTimeMillis() - start,
						channel.getSize() });
		return result;

	}

	/**
	 * Takes up to {@code batchSize} events from the channel within one channel
	 * transaction and sends them within one producer transaction.
	 *
	 * @param channel
	 *            channel to drain
	 * @return number of events committed, or 0 if the batch was rolled back
	 */
	private int deliverBatch(Channel channel) {
		Transaction transaction = null;
		boolean producerTxOpen = false;
		try {
			transaction = channel.getTransaction();
			transaction.begin();
			int taken = 0;
			while (taken < batchSize) {
				Event event = channel.take();
				if (event == null) {
					// Channel is empty; commit whatever was taken so far.
					break;
				}
				if (!producerTxOpen) {
					producer.beginTransaction();
					producerTxOpen = true;
				}
				final SendResult sendResult = producer
						.sendMessage(new Message(topic, event.getBody()));
				if (!sendResult.isSuccess()) {
					throw new RuntimeException(
							"Send message failed,error message:"
									+ sendResult.getErrorMessage());
				}
				logger.debug("Send message successfully,sent to {}",
						sendResult.getPartition());
				taken++;
			}
			if (producerTxOpen) {
				producer.commit();
				producerTxOpen = false;
			}
			transaction.commit();
			// Count only after the channel commit succeeded.
			return taken;
		} catch (Exception ex) {
			if (ex instanceof InterruptedException) {
				Thread.currentThread().interrupt();
			}
			logger.error("Failed to deliver batch to topic " + topic
					+ ", rolling back", ex);
			// Roll back the producer only if its transaction is still open.
			if (producerTxOpen) {
				try {
					producer.rollback();
				} catch (Exception e) {
					logger.error("MetaQ producer rollback failed", e);
				}
			}
			// transaction is null when getTransaction() itself failed.
			if (transaction != null) {
				try {
					transaction.rollback();
				} catch (Exception e) {
					logger.error("Channel transaction rollback failed", e);
				}
			}
			return 0;
		} finally {
			if (transaction != null) {
				transaction.close();
			}
		}
	}

	/**
	 * Shuts down the worker pool, the producer and the session factory if they
	 * exist. Shutdown errors are logged and do not propagate.
	 */
	private void releaseResources() {
		if (executor != null) {
			executor.shutdown();
			try {
				if (!executor.awaitTermination(SHUTDOWN_WAIT_SECONDS,
						TimeUnit.SECONDS)) {
					executor.shutdownNow();
				}
			} catch (InterruptedException e) {
				executor.shutdownNow();
				Thread.currentThread().interrupt();
			}
			executor = null;
		}
		if (producer != null) {
			try {
				producer.shutdown();
			} catch (Exception e) {
				logger.warn("Failed to shut down MetaQ producer", e);
			}
			producer = null;
		}
		if (sessionFactory != null) {
			try {
				sessionFactory.shutdown();
			} catch (Exception e) {
				logger.warn("Failed to shut down MetaQ session factory", e);
			}
			sessionFactory = null;
		}
	}
}

然后在flume里配置如下:

# example.conf: Flume configuration for the "info" agent.
# One avro source feeds two channels: one drained to local rolling files,
# the other drained to MetaQ by the custom MetaQSink.


# Name the components on this agent
info.sources = info_source
info.sinks = info_sink info_sink_to_metaq
info.channels = info_channel info_channel_to_metaq




# Describe/configure the source
info.sources.info_source.type = avro
info.sources.info_source.bind = 10.0.3.19
info.sources.info_source.port = 58001
info.sources.info_source.threads = 24


# Sink 1: write events to rolling files on local disk
info.sinks.info_sink.type = file_roll
info.sinks.info_sink.sink.directory = /data1/logs/flume/info
info.sinks.info_sink.sink.name= info
# Fixed: the key prefix was misspelled "ifno", so this setting was never
# applied to the "info" agent.
info.sinks.info_sink.sink.batchSize= 20000


# Sink 2: publish events to the MetaQ topic via the custom sink class
info.sinks.info_sink_to_metaq.type = org.apache.flume.sink.MetaQSink
info.sinks.info_sink_to_metaq.sink.zkConnect = 10.0.5.108:2181,10.0.5.109:2181,10.0.5.110:2181
info.sinks.info_sink_to_metaq.sink.zkRoot= /meta
info.sinks.info_sink_to_metaq.sink.topic= info
# Fixed: same "ifno" prefix typo; MetaQSink was silently using its default
# batch size of 10000 instead of this value.
info.sinks.info_sink_to_metaq.sink.batchSize= 20000



# Describe the channels (one per sink)
info.channels.info_channel.type = memory
info.channels.info_channel.capacity = 10000000
info.channels.info_channel.transactionCapacity = 10000000




info.channels.info_channel_to_metaq.type = memory
info.channels.info_channel_to_metaq.capacity = 10000000
info.channels.info_channel_to_metaq.transactionCapacity = 10000000


# Bind the source and sinks to the channels
info.sources.info_source.channels = info_channel info_channel_to_metaq
info.sinks.info_sink.channel = info_channel
info.sinks.info_sink_to_metaq.channel = info_channel_to_metaq








# Flume configuration for the "nginx" agent:
# avro source -> spillable memory channel -> rolling files on local disk.
# Name the components on this agent
nginx.sources = nginx_source
nginx.sinks = nginx_sink
nginx.channels = nginx_channel




# Describe/configure the source
nginx.sources.nginx_source.type = avro
nginx.sources.nginx_source.bind = 10.0.3.19
nginx.sources.nginx_source.port = 58002
#
# Describe the sink: write events to rolling files under sink.directory
nginx.sinks.nginx_sink.type = file_roll
nginx.sinks.nginx_sink.sink.directory = /data1/logs/flume/nginx
nginx.sinks.nginx_sink.sink.name= nginx
nginx.sinks.nginx_sink.sink.batchSize= 2000
#
# Describe the channel
nginx.channels.nginx_channel.type = SPILLABLEMEMORY
# Original note: "1000W 1G" - presumably about 10 million events per 1 GB of
# memory; TODO confirm against real event sizes.
nginx.channels.nginx_channel.memoryCapacity = 20000000
nginx.channels.nginx_channel.overflowCapacity = 200000000
# Checkpoint and data directories used by the channel's overflow storage
nginx.channels.nginx_channel.checkpointDir = /data1/logs/flume/nginx/check
nginx.channels.nginx_channel.dataDirs = /data1/logs/flume/nginx/data
# Bind the source and sink to the channel
nginx.sources.nginx_source.channels = nginx_channel
nginx.sinks.nginx_sink.channel = nginx_channel






# Flume configuration for the "usage" agent:
# avro source -> spillable memory channel -> rolling files on local disk.
# Name the components on this agent
usage.sources = usage_source
usage.sinks = usage_sink
usage.channels = usage_channel




# Describe/configure the source
usage.sources.usage_source.type = avro
usage.sources.usage_source.bind = 10.0.3.19
usage.sources.usage_source.port = 58003
#
# Describe the sink: write events to rolling files under sink.directory
usage.sinks.usage_sink.type = file_roll
usage.sinks.usage_sink.sink.directory = /data1/logs/flume/usage
usage.sinks.usage_sink.sink.name= usage
usage.sinks.usage_sink.sink.batchSize= 1000
#
# Describe the channel
usage.channels.usage_channel.type = SPILLABLEMEMORY
# Original note: "1000W 1G" - presumably about 10 million events per 1 GB of
# memory; TODO confirm against real event sizes.
usage.channels.usage_channel.memoryCapacity = 20000000
usage.channels.usage_channel.overflowCapacity = 200000000
# Checkpoint and data directories used by the channel's overflow storage
usage.channels.usage_channel.checkpointDir = /data1/logs/flume/usage/check
usage.channels.usage_channel.dataDirs = /data1/logs/flume/usage/data
# Bind the source and sink to the channel
usage.sources.usage_source.channels = usage_channel
usage.sinks.usage_sink.channel = usage_channel


注:上面的配置文件是配置了3个flume的信息。

---------------------------------------------------------------------------------------------------------------

metaQ的安装包:

http://download.csdn.net/detail/aaa1117a8w5s6d/7974039

metaQ向spark传数据,见工程:

http://download.csdn.net/detail/aaa1117a8w5s6d/7974053

你可能感兴趣的:(flume连接metaQ传数据到spark进行实时计算)