Flume receiving a Kafka source and landing data locally

Flume consumes from a Kafka source and writes the data to local files, which are then uploaded to HDFS. This avoids having Flume write to HDFS directly.
Maven project
pom.xml

<properties>
    <version.flume>1.7.0</version.flume>
</properties>

<dependencies>
    <dependency>
        <groupId>org.apache.flume</groupId>
        <artifactId>flume-ng-core</artifactId>
        <version>${version.flume}</version>
    </dependency>
    <dependency>
        <groupId>org.apache.flume</groupId>
        <artifactId>flume-ng-configuration</artifactId>
        <version>${version.flume}</version>
    </dependency>
</dependencies>
Java code

package com.qunar.qav.flume;

/**
 * Created by lifei on 2018/7/31.
 */
import java.io.File;
import java.util.concurrent.atomic.AtomicInteger;
public class PathManagerExtra {
    private long seriesTimestamp;
    private String baseDirectory;
    private AtomicInteger fileIndex;

    private File currentFile;

    private String pefix;
    private String suffix;


    public PathManagerExtra() {
        seriesTimestamp = System.currentTimeMillis();
        fileIndex = new AtomicInteger();
    }


    public File nextFile() {
        //(1)  /usr/local/flume/xxxxpjmLog/%Y%m%d — replace %Y%m%d with the current date and return the path
        //     (for simplicity the whole token is replaced, so the config file must use exactly %Y%m%d)
        String dirStr = SinkPjmDefinedUtils.getRealPath(baseDirectory);
        //(2)  flume_bjxd02.%Y%m%d%H%M — replace %Y%m%d%H%M with the current date, hour and minute
        String pefixStr = SinkPjmDefinedUtils.getRealPathFilePrefix(pefix);
        //(3)  build the full file path, e.g. /data/logs/flume/allpjm/20150115/flume_bjxd02.201501151029.1421288975655.log
        //     (a .tmp suffix is appended while the file is still being written)
        String filePath = dirStr+pefixStr+"."+System.currentTimeMillis()+suffix+".tmp";
        currentFile = SinkPjmDefinedUtils.CreateFolderAndFile(dirStr, filePath);

        return currentFile;
    }
    /* public File nextFile() {
       currentFile = new File(baseDirectory, seriesTimestamp + "-"
           + fileIndex.incrementAndGet());
       return currentFile;
     }
   */
    public File getCurrentFile() {
        if (currentFile == null) {
            return nextFile();
        }

        return currentFile;
    }
    public void rotate() {
        currentFile = null;
    }
    public String getBaseDirectory() {
        return baseDirectory;
    }
    public void setBaseDirectory(String baseDirectory) {
        this.baseDirectory = baseDirectory;
    }
    public long getSeriesTimestamp() {
        return seriesTimestamp;
    }
    public AtomicInteger getFileIndex() {
        return fileIndex;
    }


    public String getPefix() {
        return pefix;
    }


    public void setPefix(String pefix) {
        this.pefix = pefix;
    }
    public String getSuffix() {
        return suffix;
    }
    public void setSuffix(String suffix) {
        this.suffix = suffix;
    }
}



package com.qunar.qav.flume;

import java.io.File;
import java.text.SimpleDateFormat;
import java.util.Date;

/**
 * Created by lifei on 2018/7/31.
 */
public class SinkPjmDefinedUtils {

    /**
     * Replaces the %Y%m%d%H token in a directory path with the current date and hour.
     *
     * @author pjm
     * @version 2015-1-15 09:44:46
     */
    public static String getRealPath(String path) {
        if (path.contains("%Y%m%d%H")) {
            Date today = new Date();
            SimpleDateFormat formatter = new SimpleDateFormat("yyyyMMddHH");
            String formattedDate = formatter.format(today);
            System.out.println(formattedDate);
            path = path.replace("%Y%m%d%H", formattedDate);
        }
        return path;
    }

    /**
     * Replaces the %Y%m%d%H%M token in a file prefix with the current date, hour and minute.
     *
     * @author pjm
     * @version 2015-1-15 09:45:32
     */
    public static String getRealPathFilePrefix(String path) {
        if (path.contains("%Y%m%d%H%M")) {
            Date today = new Date();
            SimpleDateFormat formatter = new SimpleDateFormat("yyyyMMddHHmm");
            String formattedDate = formatter.format(today);
            System.out.println(formattedDate);
            path = path.replace("%Y%m%d%H%M", formattedDate);
        }
        return path;
    }

    /**
     * Creates the target directory if necessary and returns the File for the given path.
     *
     * @author pjm
     * @version 2015-1-15 09:45:48
     */
    public static File CreateFolderAndFile(String dirpath, String filepath) {
        //String dirpath = "/data/logs/flume/All/20150115/";
        //String filepath = "/data/logs/flume/All/20150115/flume_bjxd04.201501150900.1421283612463.log";
        //String dirpath = "/usr/local/flume/AllLog/20150115/";
        //String filepath = "/usr/local/flume/AllLog/20150115/flume_bjxd04.201501150900.1421283612463.log";
        File dirFile = new File(dirpath);
        // create the directory
        if (!dirFile.exists()) {
            dirFile.mkdirs();
        }
        File f = new File(filepath);
        /* // create the file
        if (!f.exists()) {
            try {
                f.createNewFile();
                // f.createTempFile("kkk2", ".java", dirFile);
            } catch (IOException e) {
                e.printStackTrace();
            }
        } */
        return f;
    }
}


package com.qunar.qav.flume;

import com.google.common.base.Preconditions;
import com.google.common.util.concurrent.ThreadFactoryBuilder;
import org.apache.commons.io.FileUtils;
import org.apache.flume.*;
import org.apache.flume.conf.Configurable;
import org.apache.flume.instrumentation.SinkCounter;
import org.apache.flume.serialization.EventSerializer;
import org.apache.flume.serialization.EventSerializerFactory;
import org.apache.flume.sink.AbstractSink;
import org.apache.flume.sink.RollingFileSink;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.*;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

/**
 * Created by lifei on 2018/7/31.
 */
public class RollingFileSinkExtra extends AbstractSink implements Configurable {

    private static final Logger logger = LoggerFactory.getLogger(RollingFileSink.class);

    private static final long defaultRollInterval = 30;
    private static final int defaultBatchSize = 100;

    private int batchSize = defaultBatchSize;

    // RollingFileSink declares this as "private File directory"; here it is a String
    // because the %Y%m%d%H tokens still have to be substituted at write time.
    private String directory;
    private long rollInterval;
    private OutputStream outputStream;
    private ScheduledExecutorService rollService;

    private String serializerType;
    private Context serializerContext;
    private EventSerializer serializer;

    private SinkCounter sinkCounter;

    private PathManagerExtra pathController;
    private volatile boolean shouldRotate;

    private String pefix;
    private String suffix;

    public RollingFileSinkExtra() {
        pathController = new PathManagerExtra();
        shouldRotate = false;
    }

    @Override
    public void configure(Context context) {
        // read sink.directory, sink.rollInterval, sink.filePrefix and sink.fileSuffix from the config
        directory = context.getString("sink.directory");
        String rollInterval = context.getString("sink.rollInterval");
        pefix = context.getString("sink.filePrefix");
        suffix = context.getString("sink.fileSuffix");
        serializerType = context.getString("sink.serializer", "TEXT");
        serializerContext = new Context(context.getSubProperties("sink." + EventSerializer.CTX_PREFIX));

        Preconditions.checkArgument(directory != null, "Directory may not be null");
        Preconditions.checkNotNull(serializerType, "Serializer type is undefined");

        if (rollInterval == null) {
            this.rollInterval = defaultRollInterval;
        } else {
            this.rollInterval = Long.parseLong(rollInterval);
        }

        batchSize = context.getInteger("sink.batchSize", defaultBatchSize);

        if (sinkCounter == null) {
            sinkCounter = new SinkCounter(getName());
        }
    }

    @Override
    public void start() {
        logger.info("Starting {}...", this);
        sinkCounter.start();
        super.start();

        pathController.setBaseDirectory(directory);
        pathController.setPefix(pefix);
        pathController.setSuffix(suffix);

        if (rollInterval > 0) {
            rollService = Executors.newScheduledThreadPool(
                    1,
                    new ThreadFactoryBuilder().setNameFormat(
                            "rollingFileSink-roller-" + Thread.currentThread().getId() + "-%d").build());

            /*
             * Every N seconds, mark that it's time to rotate. We purposefully
             * do NOT touch anything other than the indicator flag to avoid
             * error handling issues (e.g. IO exceptions occurring in two
             * different threads). Resist the urge to actually perform rotation
             * in a separate thread!
             */
            rollService.scheduleAtFixedRate(new Runnable() {
                @Override
                public void run() {
                    logger.debug("Marking time to rotate file {}", pathController.getCurrentFile());
                    shouldRotate = true;
                }
            }, rollInterval, rollInterval, TimeUnit.SECONDS);
        } else {
            logger.info("RollInterval is not valid, file rolling will not happen.");
        }

        logger.info("RollingFileSink {} started.", getName());
    }

    @Override
    public Status process() throws EventDeliveryException {
        if (shouldRotate) {
            // shouldRotate == true: stop writing to the current file and roll over to a new one
            logger.debug("Time to rotate {}", pathController.getCurrentFile());

            if (outputStream != null) {
                logger.debug("Closing file {}", pathController.getCurrentFile());

                try {
                    serializer.flush();
                    serializer.beforeClose();
                    outputStream.close();
                    sinkCounter.incrementConnectionClosedCount();
                    shouldRotate = false;
                } catch (Exception e) {
                    sinkCounter.incrementConnectionFailedCount();
                    throw new EventDeliveryException("Unable to rotate file "
                            + pathController.getCurrentFile() + " while delivering event", e);
                } finally {
                    serializer = null;
                    outputStream = null;
                }

                // strip the .tmp suffix (files carry .tmp while they are being written;
                // once the file is complete the suffix is removed)
                File ff = pathController.getCurrentFile();
                try {
                    FileUtils.moveFile(ff, new File(
                            ff.getAbsolutePath().substring(0, ff.getAbsolutePath().indexOf(".tmp"))));
                } catch (IOException e) {
                    e.printStackTrace();
                }

                pathController.rotate();
            }
        }

        if (outputStream == null) {
            File currentFile = pathController.getCurrentFile();
            logger.debug("Opening output stream for file {}", currentFile);
            try {
                outputStream = new BufferedOutputStream(new FileOutputStream(currentFile));
                serializer = EventSerializerFactory.getInstance(serializerType, serializerContext, outputStream);
                serializer.afterCreate();
                sinkCounter.incrementConnectionCreatedCount();
            } catch (IOException e) {
                sinkCounter.incrementConnectionFailedCount();
                throw new EventDeliveryException("Failed to open file "
                        + pathController.getCurrentFile() + " while delivering event", e);
            }
        }

        Channel channel = getChannel();
        Transaction transaction = channel.getTransaction();
        Event event = null;
        Status result = Status.READY;

        try {
            transaction.begin();
            int eventAttemptCounter = 0;
            for (int i = 0; i < batchSize; i++) {
                event = channel.take();
                if (event != null) {
                    sinkCounter.incrementEventDrainAttemptCount();
                    eventAttemptCounter++;
                    serializer.write(event);

                    /*
                     * FIXME: Feature: Rotate on size and time by checking bytes
                     * written and setting shouldRotate = true if we're past a
                     * threshold.
                     */

                    /*
                     * FIXME: Feature: Control flush interval based on time or
                     * number of events. For now, we're super-conservative and
                     * flush on each write.
                     */
                } else {
                    // No events found, request back-off semantics from runner
                    result = Status.BACKOFF;
                    break;
                }
            }
            serializer.flush();
            outputStream.flush();
            transaction.commit();
            sinkCounter.addToEventDrainSuccessCount(eventAttemptCounter);
        } catch (Exception ex) {
            transaction.rollback();
            throw new EventDeliveryException("Failed to process transaction", ex);
        } finally {
            transaction.close();
        }

        return result;
    }

    @Override
    public void stop() {
        logger.info("RollingFile sink {} stopping...", getName());
        sinkCounter.stop();
        super.stop();

        if (outputStream != null) {
            logger.debug("Closing file {}", pathController.getCurrentFile());

            try {
                serializer.flush();
                serializer.beforeClose();
                outputStream.close();
                sinkCounter.incrementConnectionClosedCount();
            } catch (IOException e) {
                sinkCounter.incrementConnectionFailedCount();
                logger.error("Unable to close output stream. Exception follows.", e);
            } finally {
                outputStream = null;
                serializer = null;
            }
        }

        if (rollInterval > 0) {
            rollService.shutdown();

            while (!rollService.isTerminated()) {
                try {
                    rollService.awaitTermination(1, TimeUnit.SECONDS);
                } catch (InterruptedException e) {
                    logger.debug("Interrupted while waiting for roll service to stop. "
                            + "Please report this.", e);
                }
            }
        }

        logger.info("RollingFile sink {} stopped. Event metrics: {}", getName(), sinkCounter);
    }

    public String getDirectory() {
        return directory;
    }

    public void setDirectory(String directory) {
        this.directory = directory;
    }

    public long getRollInterval() {
        return rollInterval;
    }

    public void setRollInterval(long rollInterval) {
        this.rollInterval = rollInterval;
    }
}
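
As a quick sanity check outside of Flume, the path construction can be exercised directly. A minimal sketch follows; the demo class is not part of the original project, the values simply mirror the sink configuration below, and running it will create the dated directory on local disk:

package com.qunar.qav.flume;

import java.io.File;

// Hypothetical demo class: prints the file path that PathManagerExtra would hand to the sink.
public class PathManagerExtraDemo {
    public static void main(String[] args) {
        PathManagerExtra pm = new PathManagerExtra();
        pm.setBaseDirectory("/home/q/performance/shell/data/%Y%m%d%H/");
        pm.setPefix("performance.%Y%m%d%H%M");
        pm.setSuffix(".log");

        // nextFile() substitutes the date tokens, creates the directory and appends ".tmp",
        // e.g. /home/q/performance/shell/data/2018073110/performance.201807311029.1533003000000.log.tmp
        File f = pm.nextFile();
        System.out.println(f.getAbsolutePath());
    }
}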

Flume configuration
Older Kafka versions are configured through zookeeperConnect:

agent.sources = source1
agent.channels = memoryChannel
agent.sinks = k1


#source
agent.sources.source1.type = org.apache.flume.source.kafka.KafkaSource
agent.sources.source1.zookeeperConnect = xxxxxxxxx.com:2181
agent.sources.source1.topic = custom_wireless_m_pub_loganalysts
agent.sources.source1.groupId = test-group3
agent.sources.source1.batchSize = 1000
agent.sources.source1.batchDurationMillis = 1000

#channel
agent.channels.memoryChannel.type = memory
agent.channels.memoryChannel.capacity = 10000
agent.channels.memoryChannel.transactionCapacity = 10000

#sink
#agent.sinks.k1.type = file_roll
#agent.sinks.k1.channel = c1
#agent.sinks.k1.sink.directory = /home/q/performance/apache-flume-1.7.0-bin/testdir/

agent.sinks.k1.type = com.qunar.qav.flume.RollingFileSinkExtra
agent.sinks.k1.sink.directory = /home/q/performance/shell/data/%Y%m%d%H/
agent.sinks.k1.sink.filePrefix = performance.%Y%m%d%H%M
agent.sinks.k1.sink.fileSuffix = .log
agent.sinks.k1.sink.rollInterval = 60

#assemble
agent.sources.source1.channels = memoryChannel
agent.sinks.k1.channel = memoryChannel
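
For reference, against newer Kafka clusters the Flume 1.7 Kafka source is configured with bootstrap servers rather than a ZooKeeper connect string. A hedged sketch, with the broker address as a placeholder and the topic and group reused from above:

agent.sources.source1.type = org.apache.flume.source.kafka.KafkaSource
agent.sources.source1.kafka.bootstrap.servers = xxxxxxxxx.com:9092
agent.sources.source1.kafka.topics = custom_wireless_m_pub_loganalysts
agent.sources.source1.kafka.consumer.group.id = test-group3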

Startup command

bin/flume-ng agent --conf conf/ --conf-file conf/file.conf --name agent -Dflume.root.logger=INFO,console > run.log 2>&1 &

zookeeper-3.3.6.jar
hadoop-yarn-common-2.1.0-beta.jar
hadoop-yarn-api-2.1.0-beta.jar
hadoop-mapreduce-client-core-2.1.0-beta.jar
hadoop-common-2.2.0.jar
hadoop-auth-2.2.0.jar
hadoop-annotations-2.2.0.jar
commons-configuration-1.6.jar
hadoop-hdfs-2.2.0.jar
udf-1.0.jar

put2hdfs.sh

#!/bin/bash
source /etc/profile
dt="$(date -d "$1 3 min ago " +'%Y-%m-%d')"
h="$(date -d "$1 3 min ago " +'%H')"
hour="$(date -d "$1 3 min ago " +'%Y%m%d%H')"
min="$(date -d "$1 3 min ago " +'%Y%m%d%H%M')"
pt="/home/q/performance/shell/data/${hour}"
chmod a+w  ${pt}
cd $pt
pwd

context=`ls $pt | grep "performance.${min}"`
if [ "$context" = "" ];then
        echo "performance.${min} does not exist, skip"
        exit 1
fi
echo ">>>>>>>>>>>>>>>>process: $pt/$context>>>>>>>>>>>>>>>>>>"
# get the file size
FILE_SIZE=`ls -l $context | awk '{print $5}' `
echo ">>>>>>>>>>size: $FILE_SIZE<<<<<<<<<<"
# check the file size; if it is 0, skip the load and just clean up
if [ $FILE_SIZE -ne 0 ];then
    # compress the file
    gzip $context
    SQL=" LOAD DATA LOCAL INPATH '${pt}/${context}.gz' INTO TABLE orig_performance_all PARTITION (dt='${dt}',hour='${h}') "
    echo "$SQL"
    hive -e "use wirelessdata;${SQL};" || exit 1
fi
rm -rf ${context}.gz
echo ">>>>>>>>>>>>>>>>>>done>>>>>>>>>>>>>>>>>"
exit 0
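
Since the sink rolls a new file every 60 seconds and the script always looks three minutes back, it is meant to run periodically. A hedged example crontab entry, with the paths assumed from the directories used above:

* * * * * /home/q/performance/shell/put2hdfs.sh >> /home/q/performance/shell/put2hdfs.log 2>&1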
