Flume Source Code Analysis

1. LifecycleAware

public interface LifecycleAware { 
  public void start(); 
  public void stop(); 
  public LifecycleState getLifecycleState(); 
}

All core components implement this interface. They are started and shut down through start()/stop(), and their state is tracked through LifecycleState.
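
For illustration, a minimal component following this contract might look like the sketch below (a hypothetical example, not actual Flume source; the state transitions mirror what the real runners shown later do):

public class NoopComponent implements LifecycleAware {
  private LifecycleState lifecycleState = LifecycleState.IDLE;

  @Override
  public void start() {
    // acquire resources, spawn worker threads, etc.
    lifecycleState = LifecycleState.START;
  }

  @Override
  public void stop() {
    // release resources
    lifecycleState = LifecycleState.STOP;
  }

  @Override
  public LifecycleState getLifecycleState() {
    return lifecycleState;
  }
}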

2. Application
The main() method of Application is the entry point when a Flume agent starts.

// 1. Define the command-line options: their values and whether each is required
Options options = new Options();
Option option = new Option("n", "name", true, "the name of this agent");
option.setRequired(true);
options.addOption(option);

option = new Option("f", "conf-file", true,
    "specify a config file (required if -z missing)");
option.setRequired(false);
options.addOption(option);

// 2. Parse the command-line arguments
CommandLineParser parser = new GnuParser();
CommandLine commandLine = parser.parse(options, args);

String agentName = commandLine.getOptionValue('n');
boolean reload = !commandLine.hasOption("no-reload-conf");

boolean isZkConfigured = false;
if (commandLine.hasOption('z') || commandLine.hasOption("zkConnString")) {
  isZkConfigured = true;
}

Application application = null;
if (isZkConfigured) {
  // 3. If configured through ZooKeeper, start from the ZooKeeper parameters.
  //    We skip this branch and walk through the config-file path instead.
} else {
  // 4. Open the configuration file and fail fast if it does not exist
  File configurationFile = new File(commandLine.getOptionValue('f'));
  if (!configurationFile.exists()) {
    throw new ParseException(
        "The specified configuration file does not exist: "
        + configurationFile.getPath());
  }
  List<LifecycleAware> components = Lists.newArrayList();

  if (reload) { // 5. Periodic reloading of the config file is enabled
    // 5.1. Use the event bus provided by Guava
    EventBus eventBus = new EventBus(agentName + "-event-bus");
    // 5.2. Read the config file with a polling strategy; by default it is re-read every 30s
    PollingPropertiesFileConfigurationProvider configurationProvider =
        new PollingPropertiesFileConfigurationProvider(
          agentName, configurationFile, eventBus, 30);
    components.add(configurationProvider);
    application = new Application(components); // 5.3. Register the component with the Application
    // 5.4. Register this application on the event bus; EventBus automatically picks up
    //      the methods annotated with @Subscribe in Application
    eventBus.register(application);

  } else { // 6. Periodic reloading is disabled
    PropertiesFileConfigurationProvider configurationProvider =
        new PropertiesFileConfigurationProvider(
          agentName, configurationFile);
    application = new Application();
    // 6.1. Initialize the Flume components directly from the configuration file
    application.handleConfigurationEvent(configurationProvider
      .getConfiguration());
  }
}
// 7. Start the Flume application
application.start();

// 8. Register a JVM shutdown hook that calls Application.stop() when the JVM exits
final Application appReference = application;
Runtime.getRuntime().addShutdownHook(new Thread("agent-shutdown-hook") {
  @Override
  public void run() {
    appReference.stop();
  }
});
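
These options map onto the familiar launch command; a typical invocation (paths and names illustrative) is:

bin/flume-ng agent --conf conf --conf-file conf/flume-conf.properties --name a1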


Application's start() method:

  public synchronized void start() {
    for (LifecycleAware component : components) {
      supervisor.supervise(component,
          new SupervisorPolicy.AlwaysRestartPolicy(), LifecycleState.START);
    }
  }
Here each component is in fact the PollingPropertiesFileConfigurationProvider object created above. LifecycleSupervisor's supervise() method creates a thread and runs the inner class MonitorRunnable's run() method, which in turn calls configurationProvider's start() method.
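
Roughly, supervise() looks like the following (a condensed sketch of LifecycleSupervisor, not the verbatim source; the monitor re-checks the component periodically and keeps calling start()/stop() until the desired state is reached, restarting it on failure):

  public synchronized void supervise(LifecycleAware lifecycleAware,
      SupervisorPolicy policy, LifecycleState desiredState) {
    Supervisoree process = new Supervisoree();
    process.status = new Status();
    process.policy = policy;
    process.status.desiredState = desiredState;

    MonitorRunnable monitorRunnable = new MonitorRunnable();
    monitorRunnable.lifecycleAware = lifecycleAware;
    monitorRunnable.supervisoree = process;

    // MonitorRunnable.run() compares the component's current state with the
    // desired state and calls start()/stop() to converge on it
    monitorService.scheduleWithFixedDelay(monitorRunnable, 0, 3, TimeUnit.SECONDS);
  }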

In PollingPropertiesFileConfigurationProvider's start() method, another thread is created to run the inner class FileWatcherRunnable; its run() method executes eventBus.post(getConfiguration());.
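
The watcher itself is small; a condensed sketch of FileWatcherRunnable (field names follow the real inner class, error handling omitted):

  public class FileWatcherRunnable implements Runnable {
    private final File file;
    private final CounterGroup counterGroup;

    @Override
    public void run() {
      counterGroup.incrementAndGet("file.checks");
      long lastModified = file.lastModified();
      // Only re-parse and re-publish when the file has actually changed
      if (lastModified > lastChange) {
        lastChange = lastModified;
        counterGroup.incrementAndGet("file.loads");
        eventBus.post(getConfiguration());
      }
    }
  }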


getConfiguration() is defined in the AbstractConfigurationProvider class. It parses the configuration file, creates the corresponding source, channel, and sink objects, wraps them, and stores them in conf:

  public MaterializedConfiguration getConfiguration() {
    MaterializedConfiguration conf = new SimpleMaterializedConfiguration();
    FlumeConfiguration fconfig = getFlumeConfiguration();
    AgentConfiguration agentConf = fconfig.getConfigurationFor(getAgentName());
    if (agentConf != null) {
      Map<String, ChannelComponent> channelComponentMap = Maps.newHashMap();
      Map<String, SourceRunner> sourceRunnerMap = Maps.newHashMap();
      Map<String, SinkRunner> sinkRunnerMap = Maps.newHashMap();
      try {
        // Create the channels and store them in channelComponentMap
        loadChannels(agentConf, channelComponentMap);
        // Create the sources and store them in sourceRunnerMap. Each source reads its
        // channels, builds a ChannelSelector over them, and from the selector builds a
        // ChannelProcessor; the selector and interceptors are handed to the processor.
        loadSources(agentConf, channelComponentMap, sourceRunnerMap);
        // Create the sinks and store them in sinkRunnerMap
        loadSinks(agentConf, channelComponentMap, sinkRunnerMap);
        Set<String> channelNames =
            new HashSet<String>(channelComponentMap.keySet());
        for (String channelName : channelNames) {
          ChannelComponent channelComponent = channelComponentMap.
              get(channelName);
          if (channelComponent.components.isEmpty()) {
            LOGGER.warn(String.format("Channel %s has no components connected" +
                " and has been removed.", channelName));
            channelComponentMap.remove(channelName);
            Map<String, Channel> nameChannelMap = channelCache.
                get(channelComponent.channel.getClass());
            if (nameChannelMap != null) {
              nameChannelMap.remove(channelName);
            }
          } else {
            LOGGER.info(String.format("Channel %s connected to %s",
                channelName, channelComponent.components.toString()));
            conf.addChannel(channelName, channelComponent.channel); // store the channel in conf
          }
        }
        for (Map.Entry<String, SourceRunner> entry : sourceRunnerMap.entrySet()) {
          conf.addSourceRunner(entry.getKey(), entry.getValue()); // store the SourceRunner in conf
        }
        for (Map.Entry<String, SinkRunner> entry : sinkRunnerMap.entrySet()) {
          conf.addSinkRunner(entry.getKey(), entry.getValue()); // store the SinkRunner in conf
        }
      } catch (InstantiationException ex) {
        LOGGER.error("Failed to instantiate component", ex);
      } finally {
        channelComponentMap.clear();
        sourceRunnerMap.clear();
        sinkRunnerMap.clear();
      }
    } else {
      LOGGER.warn("No configuration found for this host:{}", getAgentName());
    }
    return conf;
  }

The loadSources() method:

  private void loadSources(AgentConfiguration agentConf,
      Map<String, ChannelComponent> channelComponentMap,
      Map<String, SourceRunner> sourceRunnerMap)
      throws InstantiationException {
    // Get the source names
    Set<String> sourceNames = agentConf.getSourceSet();
    Map<String, ComponentConfiguration> compMap =
        agentConf.getSourceConfigMap();
    /*
     * Components which have a ComponentConfiguration object
     */
    for (String sourceName : sourceNames) {
      ComponentConfiguration comp = compMap.get(sourceName);
      if (comp != null) {
        SourceConfiguration config = (SourceConfiguration) comp;
        // Create the source
        Source source = sourceFactory.create(comp.getComponentName(),
            comp.getType());
        try {
          Configurables.configure(source, config); // configure the source
          Set<String> channelNames = config.getChannels();
          List<Channel> sourceChannels = new ArrayList<Channel>();
          for (String chName : channelNames) {
            ChannelComponent channelComponent = channelComponentMap.get(chName);
            if (channelComponent != null) {
              sourceChannels.add(channelComponent.channel); // attach this source's channels
            }
          }
          if (sourceChannels.isEmpty()) {
            String msg = String.format("Source %s is not connected to a " +
                "channel", sourceName);
            throw new IllegalStateException(msg);
          }
          ChannelSelectorConfiguration selectorConfig =
              config.getSelectorConfiguration();

          // Create the channel selector
          ChannelSelector selector = ChannelSelectorFactory.create(
              sourceChannels, selectorConfig);

          ChannelProcessor channelProcessor = new ChannelProcessor(selector);
          // This ends up calling ChannelProcessor.configure(), which builds the interceptor chain
          Configurables.configure(channelProcessor, config);

          source.setChannelProcessor(channelProcessor);
          sourceRunnerMap.put(comp.getComponentName(),
              SourceRunner.forSource(source));
          for (Channel channel : sourceChannels) {
            ChannelComponent channelComponent = Preconditions.
                checkNotNull(channelComponentMap.get(channel.getName()),
                    String.format("Channel %s", channel.getName()));
            channelComponent.components.add(sourceName);
          }
        } catch (Exception e) {
          String msg = String.format("Source %s has been removed due to an " +
              "error during configuration", sourceName);
          LOGGER.error(msg, e);
        }
      }
    }
    /*
     * Components which DO NOT have a ComponentConfiguration object
     * and use only Context
     */
    Map<String, Context> sourceContexts = agentConf.getSourceContext();
    for (String sourceName : sourceNames) {
      Context context = sourceContexts.get(sourceName);
      if (context != null) {
        Source source =
            sourceFactory.create(sourceName,
                context.getString(BasicConfigurationConstants.CONFIG_TYPE));
        try {
          Configurables.configure(source, context);
          List<Channel> sourceChannels = new ArrayList<Channel>();
          String[] channelNames = context.getString(
              BasicConfigurationConstants.CONFIG_CHANNELS).split("\\s+");
          for (String chName : channelNames) {
            ChannelComponent channelComponent = channelComponentMap.get(chName);
            if (channelComponent != null) {
              sourceChannels.add(channelComponent.channel);
            }
          }
          if (sourceChannels.isEmpty()) {
            String msg = String.format("Source %s is not connected to a " +
                "channel", sourceName);
            throw new IllegalStateException(msg);
          }
          Map<String, String> selectorConfig = context.getSubProperties(
              BasicConfigurationConstants.CONFIG_SOURCE_CHANNELSELECTOR_PREFIX);

          ChannelSelector selector = ChannelSelectorFactory.create(
              sourceChannels, selectorConfig);

          ChannelProcessor channelProcessor = new ChannelProcessor(selector);
          Configurables.configure(channelProcessor, context);
          source.setChannelProcessor(channelProcessor);
          sourceRunnerMap.put(sourceName,
              SourceRunner.forSource(source));
          for (Channel channel : sourceChannels) {
            ChannelComponent channelComponent = Preconditions.
                checkNotNull(channelComponentMap.get(channel.getName()),
                    String.format("Channel %s", channel.getName()));
            channelComponent.components.add(sourceName);
          }
        } catch (Exception e) {
          String msg = String.format("Source %s has been removed due to an " +
              "error during configuration", sourceName);
          LOGGER.error(msg, e);
        }
      }
    }
  }

In loadSources(), each source is wrapped in a SourceRunner, and the runner is stored in sourceRunnerMap.
SourceRunner is an abstract class with two subclasses, EventDrivenSourceRunner and PollableSourceRunner; as the names suggest, one is event-driven and the other actively polls.

The static forSource() method of the abstract class decides which runner is used; it is called from loadSources():

  public static SourceRunner forSource(Source source) {
    SourceRunner runner = null;

    if (source instanceof PollableSource) {
      runner = new PollableSourceRunner();
      runner.setSource(source);
    } else if (source instanceof EventDrivenSource) {
      runner = new EventDrivenSourceRunner();
      runner.setSource(source);
    } else {
      throw new IllegalArgumentException("No known runner type for source "
          + source);
    }

    return runner;
  }



eventBus.post(getConfiguration());

After getConfiguration() has run, eventBus.post() publishes the result, which triggers Application's handleConfigurationEvent() method:

  @Subscribe
  public synchronized void handleConfigurationEvent(MaterializedConfiguration conf) {
    stopAllComponents();
    startAllComponents(conf);
  }


  private void startAllComponents(MaterializedConfiguration materializedConfiguration) {
    logger.info("Starting new configuration:{}", materializedConfiguration);

    this.materializedConfiguration = materializedConfiguration;

    for (Entry<String, Channel> entry :
      materializedConfiguration.getChannels().entrySet()) {
      try {
        logger.info("Starting Channel " + entry.getKey());
        // Take each channel out of conf, spawn a supervised thread, and run the
        // channel's start() method
        supervisor.supervise(entry.getValue(),
            new SupervisorPolicy.AlwaysRestartPolicy(), LifecycleState.START);
      } catch (Exception e) {
        logger.error("Error while starting {}", entry.getValue(), e);
      }
    }

    /*
     * Wait for all channels to start: keep looping until every channel's
     * state is START.
     */
    for (Channel ch : materializedConfiguration.getChannels().values()) {
      while (ch.getLifecycleState() != LifecycleState.START
          && !supervisor.isComponentInErrorState(ch)) {
        try {
          logger.info("Waiting for channel: " + ch.getName() +
              " to start. Sleeping for 500 ms");
          Thread.sleep(500);
        } catch (InterruptedException e) {
          logger.error("Interrupted while waiting for channel to start.", e);
          Throwables.propagate(e);
        }
      }
    }

    // Take each SinkRunner out of conf and hand it to LifecycleSupervisor.supervise(),
    // which again spawns a thread and runs the SinkRunner's start() method
    for (Entry<String, SinkRunner> entry : materializedConfiguration.getSinkRunners()
        .entrySet()) {
      try {
        logger.info("Starting Sink " + entry.getKey());
        supervisor.supervise(entry.getValue(),
          new SupervisorPolicy.AlwaysRestartPolicy(), LifecycleState.START);
      } catch (Exception e) {
        logger.error("Error while starting {}", entry.getValue(), e);
      }
    }

    for (Entry<String, SourceRunner> entry : materializedConfiguration
        .getSourceRunners().entrySet()) {
      try {
        logger.info("Starting Source " + entry.getKey());
        // Runs the SourceRunner's start() method
        supervisor.supervise(entry.getValue(),
          new SupervisorPolicy.AlwaysRestartPolicy(), LifecycleState.START);
      } catch (Exception e) {
        logger.error("Error while starting {}", entry.getValue(), e);
      }
    }

    this.loadMonitoring();
  }
 

3. Source

Source is an interface; PollableSource and EventDrivenSource are sub-interfaces extending it.
In the startAllComponents() method above, the SourceRunner's start() method is called. As we saw, SourceRunner is abstract; its subclasses are EventDrivenSourceRunner and PollableSourceRunner.

public class EventDrivenSourceRunner extends SourceRunner {

  private LifecycleState lifecycleState;

  public EventDrivenSourceRunner() {
    lifecycleState = LifecycleState.IDLE;
  }

  @Override
  public void start() {
    Source source = getSource();
    ChannelProcessor cp = source.getChannelProcessor();
    cp.initialize();
    source.start(); // run the concrete source's start() method
    lifecycleState = LifecycleState.START;
  }

  @Override
  public void stop() {
    Source source = getSource();
    source.stop();
    ChannelProcessor cp = source.getChannelProcessor();
    cp.close();
    lifecycleState = LifecycleState.STOP;
  }

  @Override
  public String toString() {
    return "EventDrivenSourceRunner: { source:" + getSource() + " }";
  }

  @Override
  public LifecycleState getLifecycleState() {
    return lifecycleState;
  }

}
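
For comparison, PollableSourceRunner drives its source actively: start() spawns a polling thread that calls source.process() in a loop and backs off when the source reports BACKOFF. A condensed sketch of that loop (the real class also tracks consecutive backoffs in a CounterGroup and grows the sleep time):

  public static class PollingRunner implements Runnable {
    private PollableSource source;
    private AtomicBoolean shouldStop;

    @Override
    public void run() {
      while (!shouldStop.get()) {
        try {
          // Ask the source for more data; back off briefly if none is available
          if (source.process().equals(PollableSource.Status.BACKOFF)) {
            Thread.sleep(500);
          }
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
        } catch (EventDeliveryException e) {
          // log the failure and let the loop retry
        }
      }
    }
  }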

Let's look at the SpoolDirectorySource class:

package org.apache.flume.source;

import com.google.common.annotations.VisibleForTesting;
import com.google.common.base.Preconditions;
import com.google.common.base.Throwables;
import org.apache.flume.*;
import org.apache.flume.client.avro.ReliableSpoolingFileEventReader;
import org.apache.flume.conf.Configurable;
import org.apache.flume.instrumentation.SourceCounter;
import org.apache.flume.serialization.DecodeErrorPolicy;
import org.apache.flume.serialization.LineDeserializer;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.File;
import java.io.IOException;
import java.util.List;
import java.util.Locale;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

import static org.apache.flume.source.SpoolDirectorySourceConfigurationConstants.*;

public class SpoolDirectorySource extends AbstractSource implements
Configurable, EventDrivenSource {

  private static final Logger logger = LoggerFactory
      .getLogger(SpoolDirectorySource.class);

  // Delay used when polling for new files
  private static final int POLL_DELAY_MS = 500;

  /* Config options */
  private String completedSuffix;
  private String spoolDirectory;
  private boolean fileHeader;
  private String fileHeaderKey;
  private boolean basenameHeader;
  private String basenameHeaderKey;
  private int batchSize;
  private String ignorePattern;
  private String trackerDirPath;
  private String deserializerType;
  private Context deserializerContext;
  private String deletePolicy;
  private String inputCharset;
  private DecodeErrorPolicy decodeErrorPolicy;
  private volatile boolean hasFatalError = false;

  private SourceCounter sourceCounter;
  ReliableSpoolingFileEventReader reader;
  private ScheduledExecutorService executor;
  private boolean backoff = true;
  private boolean hitChannelException = false;
  private int maxBackoff;
  private ConsumeOrder consumeOrder;

  @Override
  public synchronized void start() {
    logger.info("SpoolDirectorySource source starting with directory: {}",
        spoolDirectory);

    // Create a single-threaded scheduled executor
    executor = Executors.newSingleThreadScheduledExecutor();

    // The spool directory, e.g. a3.sources.r1.spoolDir = /opt/flume/logs
    File directory = new File(spoolDirectory);
    try {
      // Build the reader that actually consumes file contents
      reader = new ReliableSpoolingFileEventReader.Builder()
          .spoolDirectory(directory)
          .completedSuffix(completedSuffix)
          .ignorePattern(ignorePattern)
          .trackerDirPath(trackerDirPath)
          .annotateFileName(fileHeader)
          .fileNameHeader(fileHeaderKey)
          .annotateBaseName(basenameHeader)
          .baseNameHeader(basenameHeaderKey)
          .deserializerType(deserializerType)
          .deserializerContext(deserializerContext)
          .deletePolicy(deletePolicy)
          .inputCharset(inputCharset)
          .decodeErrorPolicy(decodeErrorPolicy)
          .consumeOrder(consumeOrder)
          .build();
    } catch (IOException ioe) {
      throw new FlumeException("Error instantiating spooling event parser",
          ioe);
    }

    // Create the runnable, handing it the file reader and the counter,
    // and schedule its run() method every POLL_DELAY_MS milliseconds
    Runnable runner = new SpoolDirectoryRunnable(reader, sourceCounter);
    executor.scheduleWithFixedDelay(
        runner, 0, POLL_DELAY_MS, TimeUnit.MILLISECONDS);

    super.start();
    logger.debug("SpoolDirectorySource source started");
    sourceCounter.start();
  }

  @Override
  public synchronized void stop() {
    executor.shutdown();
    try {
      executor.awaitTermination(10L, TimeUnit.SECONDS);
    } catch (InterruptedException ex) {
      logger.info("Interrupted while awaiting termination", ex);
    }
    executor.shutdownNow();

    super.stop();
    sourceCounter.stop();
    logger.info("SpoolDir source {} stopped. Metrics: {}", getName(),
      sourceCounter);
  }

  @Override
  public String toString() {
    return "Spool Directory source " + getName() +
        ": { spoolDir: " + spoolDirectory + " }";
  }

  @Override
  public synchronized void configure(Context context) {
    spoolDirectory = context.getString(SPOOL_DIRECTORY);
    Preconditions.checkState(spoolDirectory != null,
        "Configuration must specify a spooling directory");

    completedSuffix = context.getString(SPOOLED_FILE_SUFFIX,
        DEFAULT_SPOOLED_FILE_SUFFIX);
    deletePolicy = context.getString(DELETE_POLICY, DEFAULT_DELETE_POLICY);
    fileHeader = context.getBoolean(FILENAME_HEADER,
        DEFAULT_FILE_HEADER);
    fileHeaderKey = context.getString(FILENAME_HEADER_KEY,
        DEFAULT_FILENAME_HEADER_KEY);
    basenameHeader = context.getBoolean(BASENAME_HEADER,
        DEFAULT_BASENAME_HEADER);
    basenameHeaderKey = context.getString(BASENAME_HEADER_KEY,
        DEFAULT_BASENAME_HEADER_KEY);
    batchSize = context.getInteger(BATCH_SIZE,
        DEFAULT_BATCH_SIZE);
    inputCharset = context.getString(INPUT_CHARSET, DEFAULT_INPUT_CHARSET);
    decodeErrorPolicy = DecodeErrorPolicy.valueOf(
        context.getString(DECODE_ERROR_POLICY, DEFAULT_DECODE_ERROR_POLICY)
        .toUpperCase(Locale.ENGLISH));

    ignorePattern = context.getString(IGNORE_PAT, DEFAULT_IGNORE_PAT);
    trackerDirPath = context.getString(TRACKER_DIR, DEFAULT_TRACKER_DIR);

    deserializerType = context.getString(DESERIALIZER, DEFAULT_DESERIALIZER);
    deserializerContext = new Context(context.getSubProperties(DESERIALIZER +
        "."));

    consumeOrder = ConsumeOrder.valueOf(context.getString(CONSUME_ORDER,
        DEFAULT_CONSUME_ORDER.toString()).toUpperCase(Locale.ENGLISH));

    // "Hack" to support backwards compatibility with previous generation of
    // spooling directory source, which did not support deserializers
    Integer bufferMaxLineLength = context.getInteger(BUFFER_MAX_LINE_LENGTH);
    if (bufferMaxLineLength != null && deserializerType != null &&
        deserializerType.equalsIgnoreCase(DEFAULT_DESERIALIZER)) {
      deserializerContext.put(LineDeserializer.MAXLINE_KEY,
          bufferMaxLineLength.toString());
    }

    maxBackoff = context.getInteger(MAX_BACKOFF, DEFAULT_MAX_BACKOFF);
    if (sourceCounter == null) {
      sourceCounter = new SourceCounter(getName());
    }
  }

  @VisibleForTesting
  protected boolean hasFatalError() {
    return hasFatalError;
  }

  /**
   * The class always backs off, this exists only so that we can test without
   * taking a really long time.
   * @param backoff - whether the source should backoff if the channel is full
   */
  @VisibleForTesting
  protected void setBackOff(boolean backoff) {
    this.backoff = backoff;
  }

  @VisibleForTesting
  protected boolean hitChannelException() {
    return hitChannelException;
  }

  @VisibleForTesting
  protected SourceCounter getSourceCounter() {
    return sourceCounter;
  }

  private class SpoolDirectoryRunnable implements Runnable {
    private ReliableSpoolingFileEventReader reader;
    private SourceCounter sourceCounter;

    public SpoolDirectoryRunnable(ReliableSpoolingFileEventReader reader,
        SourceCounter sourceCounter) {
      this.reader = reader;
      this.sourceCounter = sourceCounter;
    }

    @Override
    public void run() {
      int backoffInterval = 250;
      try {
        while (!Thread.interrupted()) {
          // Read a batch of events; by default each event is one line
          List<Event> events = reader.readEvents(batchSize);
          if (events.isEmpty()) {
            break;
          }
          // Update the counters
          sourceCounter.addToEventReceivedCount(events.size());
          sourceCounter.incrementAppendBatchReceivedCount();

          try {
            // Hand the events to the ChannelProcessor
            getChannelProcessor().processEventBatch(events);
            // Commit the batch, marking these events as processed; this ends up
            // calling the deserializer's mark() method
            reader.commit();
          } catch (ChannelException ex) {
            logger.warn("The channel is full, and cannot write data now. The " +
              "source will try again after " + String.valueOf(backoffInterval) +
              " milliseconds");
            hitChannelException = true;
            if (backoff) {
              TimeUnit.MILLISECONDS.sleep(backoffInterval);
              backoffInterval = backoffInterval << 1;
              backoffInterval = backoffInterval >= maxBackoff ? maxBackoff :
                                backoffInterval;
            }
            continue;
          }
          backoffInterval = 250;
          sourceCounter.addToEventAcceptedCount(events.size());
          sourceCounter.incrementAppendBatchAcceptedCount();
        }
      } catch (Throwable t) {
        logger.error("FATAL: " + SpoolDirectorySource.this.toString() + ": " +
            "Uncaught exception in SpoolDirectorySource thread. " +
            "Restart or reconfigure Flume to continue processing.", t);
        hasFatalError = true;
        Throwables.propagate(t);
      }
    }
  }
}
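
To connect the configure() keys above with practice, a typical spooling-directory setup looks like this (agent and component names such as a3, r1, and c1 are illustrative, matching the spoolDir example in the start() comment):

a3.sources = r1
a3.channels = c1
a3.sources.r1.type = spooldir
a3.sources.r1.spoolDir = /opt/flume/logs
a3.sources.r1.fileHeader = true
a3.sources.r1.batchSize = 100
a3.sources.r1.channels = c1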

ReliableSpoolingFileEventReader is the key class that actually reads events; its Javadoc describes the contract:

A ReliableEventReader which reads log data from files stored in a spooling directory and renames each file once all of its data has been read (through EventDeserializer.readEvent() calls). The user must commit() each read, to indicate that the lines have been fully processed.

Read calls will return no data if there are no files left to read. This class, in general, is not thread safe.

This reader assumes that files with unique file names are left in the spooling directory and not modified once they are placed there. Any user behavior which violates these assumptions, when detected, will result in a FlumeException being thrown.
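
The commit() required by that contract is what SpoolDirectoryRunnable calls after a successful processEventBatch(); condensed, it simply delegates to the deserializer's mark() to persist the read position (a sketch, assuming the committed/currentFile fields used below):

  public void commit() throws IOException {
    if (!committed && currentFile.isPresent()) {
      currentFile.get().getDeserializer().mark();
      committed = true;
    }
  }

The readEvents() method itself: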


  public List<Event> readEvents(int numEvents) throws IOException {
    if (!committed) {
      if (!currentFile.isPresent()) {
        throw new IllegalStateException("File should not roll when " +
            "commit is outstanding.");
      }
      logger.info("Last read was never committed - resetting mark position.");
      currentFile.get().getDeserializer().reset();
    } else {
      // Check if new files have arrived since last call
      if (!currentFile.isPresent()) {
        currentFile = getNextFile();
      }
      // Return empty list if no new files
      if (!currentFile.isPresent()) {
        return Collections.emptyList();
      }
    }

    EventDeserializer des = currentFile.get().getDeserializer(); // get the event deserializer
    List<Event> events = des.readEvents(numEvents); // let the deserializer read the events

    /* It's possible that the last read took us just up to a file boundary.
     * If so, try to roll to the next file, if there is one.
     * Loop until events is not empty or there is no next file in case of 0 byte files */
    while (events.isEmpty()) {
      logger.info("Last read took us just up to a file boundary. Rolling to the next file, if there is one.");
      retireCurrentFile();
      currentFile = getNextFile();
      if (!currentFile.isPresent()) {
        return Collections.emptyList();
      }
      events = currentFile.get().getDeserializer().readEvents(numEvents);
    }

    if (annotateFileName) {
      String filename = currentFile.get().getFile().getAbsolutePath();
      for (Event event : events) {
        event.getHeaders().put(fileNameHeader, filename);
      }
    }

    if (annotateBaseName) {
      String basename = currentFile.get().getFile().getName();
      for (Event event : events) {
        event.getHeaders().put(baseNameHeader, basename);
      }
    }

    committed = false;
    lastFileRead = currentFile;
    return events;
  }

Next, the EventDeserializer interface. The default implementation is the line deserializer, LineDeserializer:


  public Event readEvent() throws IOException {
    ensureOpen();
    String line = readLine();
    if (line == null) {
      return null;
    } else {
      return EventBuilder.withBody(line, outputCharset);
    }
  }

  /**
   * Batch line read
   * @param numEvents Maximum number of events to return.
   * @return List of events containing read lines
   * @throws IOException
   */
  @Override
  public List<Event> readEvents(int numEvents) throws IOException {
    ensureOpen();
    List<Event> events = Lists.newLinkedList();
    for (int i = 0; i < numEvents; i++) {
      Event event = readEvent();
      if (event != null) {
        events.add(event);
      } else {
        break;
      }
    }
    return events;
  }

  private String readLine() throws IOException {
    StringBuilder sb = new StringBuilder();
    int c;
    int readChars = 0;
    while ((c = in.readChar()) != -1) {
      readChars++;

      // FIXME: support \r\n
      if (c == '\n') {
        break;
      }

      sb.append((char) c);

      if (readChars >= maxLineLength) {
        logger.warn("Line length exceeds max ({}), truncating line!",
            maxLineLength);
        break;
      }
    }

    if (readChars > 0) {
      return sb.toString();
    } else {
      return null;
    }
  }

  @Override
  public void mark() throws IOException {
    ensureOpen();
    in.mark();
  }

  @Override
  public void reset() throws IOException {
    ensureOpen();
    in.reset();
  }

4. ChannelProcessor

Its core method:

  public void processEventBatch(List<Event> events) {
    Preconditions.checkNotNull(events, "Event list must not be null");

    events = interceptorChain.intercept(events); // run the events through the interceptors

    Map<Channel, List<Event>> reqChannelQueue =
        new LinkedHashMap<Channel, List<Event>>();

    Map<Channel, List<Event>> optChannelQueue =
        new LinkedHashMap<Channel, List<Event>>();

    for (Event event : events) {
      // Get the set of required channels for this event
      List<Channel> reqChannels = selector.getRequiredChannels(event);
      // For each required channel, create an event queue and add the event to it
      for (Channel ch : reqChannels) {
        List<Event> eventQueue = reqChannelQueue.get(ch);
        if (eventQueue == null) {
          eventQueue = new ArrayList<Event>();
          reqChannelQueue.put(ch, eventQueue);
        }
        eventQueue.add(event);
      }

      // Get the set of optional channels for this event
      List<Channel> optChannels = selector.getOptionalChannels(event);
      // For each optional channel, create an event queue and add the event to it
      for (Channel ch : optChannels) {
        List<Event> eventQueue = optChannelQueue.get(ch);
        if (eventQueue == null) {
          eventQueue = new ArrayList<Event>();
          optChannelQueue.put(ch, eventQueue);
        }

        eventQueue.add(event);
      }
    }

    // Process required channels
    for (Channel reqChannel : reqChannelQueue.keySet()) {
      Transaction tx = reqChannel.getTransaction(); // get the transaction
      Preconditions.checkNotNull(tx, "Transaction object must not be null");
      try {
        tx.begin(); // begin the transaction

        // The per-channel event list built above
        List<Event> batch = reqChannelQueue.get(reqChannel);

        for (Event event : batch) {
          reqChannel.put(event); // hand each event to the channel
        }

        tx.commit(); // commit the transaction
      } catch (Throwable t) {
        tx.rollback(); // roll back
        if (t instanceof Error) {
          LOG.error("Error while writing to required channel: " +
              reqChannel, t);
          throw (Error) t;
        } else {
          throw new ChannelException("Unable to put batch on required " +
              "channel: " + reqChannel, t);
        }
      } finally {
        if (tx != null) {
          tx.close();
        }
      }
    }

    // Process optional channels
    for (Channel optChannel : optChannelQueue.keySet()) {
      Transaction tx = optChannel.getTransaction();
      Preconditions.checkNotNull(tx, "Transaction object must not be null");
      try {
        tx.begin();

        List<Event> batch = optChannelQueue.get(optChannel);

        for (Event event : batch) {
          optChannel.put(event);
        }

        tx.commit();
      } catch (Throwable t) {
        tx.rollback();
        LOG.error("Unable to put batch on optional channel: " + optChannel, t);
        if (t instanceof Error) {
          throw (Error) t;
        }
      } finally {
        if (tx != null) {
          tx.close();
        }
      }
    }
  }
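
The interceptorChain used at the top of processEventBatch() is assembled when the ChannelProcessor is configured, i.e. by the Configurables.configure(channelProcessor, ...) call we saw in loadSources(). A condensed sketch of that wiring (not the verbatim source; exception handling omitted):

  private void configureInterceptors(Context context) {
    List<Interceptor> interceptors = Lists.newLinkedList();

    String interceptorListStr = context.getString("interceptors", "");
    if (interceptorListStr.isEmpty()) {
      return;
    }
    String[] interceptorNames = interceptorListStr.split("\\s+");
    Context interceptorContexts =
        new Context(context.getSubProperties("interceptors."));

    // Instantiate each interceptor via its Builder and configure it
    for (String interceptorName : interceptorNames) {
      Context interceptorContext = new Context(
          interceptorContexts.getSubProperties(interceptorName + "."));
      String type = interceptorContext.getString("type");
      Interceptor.Builder builder = InterceptorBuilderFactory.newInstance(type);
      builder.configure(interceptorContext);
      interceptors.add(builder.build());
    }

    interceptorChain.setInterceptors(interceptors);
  }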

5. Channel

MemoryChannel extends BasicChannelSemantics, which defines the put() method:

  @Override
  public void put(Event event) throws ChannelException {
    BasicTransactionSemantics transaction = currentTransaction.get();
    Preconditions.checkState(transaction != null,
        "No transaction exists for this thread");
    transaction.put(event); // delegate to the transaction's put() method
  }

The transaction class here is MemoryChannel's inner class MemoryTransaction, which extends BasicTransactionSemantics. BasicTransactionSemantics defines put(), which delegates to the subclass's doPut() method.

The BasicTransactionSemantics class:

  protected void put(Event event) {
    Preconditions.checkState(Thread.currentThread().getId() == initialThreadId,
        "put() called from different thread than getTransaction()!");
    Preconditions.checkState(state.equals(State.OPEN),
        "put() called when transaction is %s!", state);
    Preconditions.checkArgument(event != null,
        "put() called with null event!");

    try {
      doPut(event);
    } catch (InterruptedException e) {
      Thread.currentThread().interrupt();
      throw new ChannelException(e.toString(), e);
    }
  }

The MemoryTransaction class:

    @Override
    protected void doPut(Event event) throws InterruptedException {
      channelCounter.incrementEventPutAttemptCount();
      int eventByteSize = (int) Math.ceil(estimateEventSize(event) / byteCapacitySlotSize);

      if (!putList.offer(event)) {
        throw new ChannelException(
          "Put queue for MemoryTransaction of capacity " +
            putList.size() + " full, consider committing more frequently, " +
            "increasing capacity or increasing thread count");
      }
      putByteCounter += eventByteSize;
    }

doPut() stores the event in putList.
doCommit() then moves the elements of putList into the channel's main queue:

    @Override
    protected void doCommit() throws InterruptedException {
      int remainingChange = takeList.size() - putList.size();
      if (remainingChange < 0) {
        if (!bytesRemaining.tryAcquire(putByteCounter, keepAlive,
          TimeUnit.SECONDS)) {
          throw new ChannelException("Cannot commit transaction. Byte capacity " +
            "allocated to store event body " + byteCapacity * byteCapacitySlotSize +
            "reached. Please increase heap space/byte capacity allocated to " +
            "the channel as the sinks may not be keeping up with the sources");
        }
        if (!queueRemaining.tryAcquire(-remainingChange, keepAlive, TimeUnit.SECONDS)) {
          bytesRemaining.release(putByteCounter);
          throw new ChannelFullException("Space for commit to queue couldn't be acquired." +
              " Sinks are likely not keeping up with sources, or the buffer size is too tight");
        }
      }
      int puts = putList.size();
      int takes = takeList.size();
      synchronized (queueLock) {
        if (puts > 0) {
          while (!putList.isEmpty()) {
            if (!queue.offer(putList.removeFirst())) {
              throw new RuntimeException("Queue add failed, this shouldn't be able to happen");
            }
          }
        }
        putList.clear();
        takeList.clear();
      }
      bytesRemaining.release(takeByteCounter);
      takeByteCounter = 0;
      putByteCounter = 0;

      queueStored.release(puts);
      if (remainingChange > 0) {
        queueRemaining.release(remainingChange);
      }
      if (puts > 0) {
        channelCounter.addToEventPutSuccessCount(puts);
      }
      if (takes > 0) {
        channelCounter.addToEventTakeSuccessCount(takes);
      }

      channelCounter.setChannelSize(queue.size());
    }

Taking data from the channel means reading from that queue, via the doTake() method:

    @Override
    protected Event doTake() throws InterruptedException {
      channelCounter.incrementEventTakeAttemptCount();
      if (takeList.remainingCapacity() == 0) {
        throw new ChannelException("Take list for MemoryTransaction, capacity " +
            takeList.size() + " full, consider committing more frequently, " +
            "increasing capacity, or increasing thread count");
      }
      if (!queueStored.tryAcquire(keepAlive, TimeUnit.SECONDS)) {
        return null;
      }
      Event event;
      synchronized (queueLock) {
        event = queue.poll();
      }
      Preconditions.checkNotNull(event, "Queue.poll returned NULL despite semaphore " +
          "signalling existence of entry");
      takeList.put(event);

      int eventByteSize = (int) Math.ceil(estimateEventSize(event) / byteCapacitySlotSize);
      takeByteCounter += eventByteSize;

      return event;
    }

6. Sink

Sink is an interface whose central method is process(). The abstract class AbstractSink implements it. Next we analyze HDFSEventSink:

  public Status process() throws EventDeliveryException {
    Channel channel = getChannel(); // get the channel
    Transaction transaction = channel.getTransaction(); // get the transaction
    List<BucketWriter> writers = Lists.newArrayList();
    transaction.begin(); // begin the transaction
    try {
      int txnEventCount = 0;
      for (txnEventCount = 0; txnEventCount < batchSize; txnEventCount++) {
        Event event = channel.take(); // take an event from the channel
        if (event == null) {
          break;
        }

        // reconstruct the path name by substituting place holders
        String realPath = BucketPath.escapeString(filePath, event.getHeaders(),
            timeZone, needRounding, roundUnit, roundValue, useLocalTime);
        String realName = BucketPath.escapeString(fileName, event.getHeaders(),
          timeZone, needRounding, roundUnit, roundValue, useLocalTime);

        String lookupPath = realPath + DIRECTORY_DELIMITER + realName; // the HDFS file path
        BucketWriter bucketWriter;
        HDFSWriter hdfsWriter = null;
        // Callback to remove the reference to the bucket writer from the
        // sfWriters map so that all buffers used by the HDFS file
        // handles are garbage collected.
        WriterCallback closeCallback = new WriterCallback() {
          @Override
          public void run(String bucketPath) {
            LOG.info("Writer callback called.");
            synchronized (sfWritersLock) {
              sfWriters.remove(bucketPath);
            }
          }
        };
        synchronized (sfWritersLock) {
          bucketWriter = sfWriters.get(lookupPath);
          // we haven't seen this file yet, so open it and cache the handle
          if (bucketWriter == null) {
            hdfsWriter = writerFactory.getWriter(fileType);
            bucketWriter = initializeBucketWriter(realPath, realName,
              lookupPath, hdfsWriter, closeCallback); // create the BucketWriter
            sfWriters.put(lookupPath, bucketWriter);
          }
        }

        // track the buckets getting written in this transaction
        if (!writers.contains(bucketWriter)) {
          writers.add(bucketWriter);
        }

        // Write the data to HDFS
        try {
          bucketWriter.append(event);
        } catch (BucketClosedException ex) {
          LOG.info("Bucket was closed while trying to append, " +
            "reinitializing bucket and writing event.");
          hdfsWriter = writerFactory.getWriter(fileType);
          bucketWriter = initializeBucketWriter(realPath, realName,
            lookupPath, hdfsWriter, closeCallback);
          synchronized (sfWritersLock) {
            sfWriters.put(lookupPath, bucketWriter);
          }
          bucketWriter.append(event);
        }
      }

      if (txnEventCount == 0) {
        sinkCounter.incrementBatchEmptyCount();
      } else if (txnEventCount == batchSize) {
        sinkCounter.incrementBatchCompleteCount();
      } else {
        sinkCounter.incrementBatchUnderflowCount();
      }

      // flush all pending buckets before committing the transaction
      for (BucketWriter bucketWriter : writers) {
        bucketWriter.flush();
      }

      transaction.commit();

      if (txnEventCount < 1) {
        return Status.BACKOFF;
      } else {
        sinkCounter.addToEventDrainSuccessCount(txnEventCount);
        return Status.READY;
      }
    } catch (IOException eIO) {
      transaction.rollback();
      LOG.warn("HDFS IO error", eIO);
      return Status.BACKOFF;
    } catch (Throwable th) {
      transaction.rollback();
      LOG.error("process failed", th);
      if (th instanceof Error) {
        throw (Error) th;
      } else {
        throw new EventDeliveryException(th);
      }
    } finally {
      transaction.close();
    }
  }
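
Whether process() returns READY or BACKOFF matters because SinkRunner's polling thread uses the status to pace itself: on BACKOFF it sleeps before calling process() again. A condensed sketch of SinkRunner's loop (not the verbatim source; the real runner grows the delay on consecutive backoffs):

  public static class PollingRunner implements Runnable {
    private SinkProcessor policy;
    private AtomicBoolean shouldStop;

    @Override
    public void run() {
      while (!shouldStop.get()) {
        try {
          if (policy.process().equals(Sink.Status.BACKOFF)) {
            Thread.sleep(500); // back off before asking the sink for more work
          }
        } catch (InterruptedException e) {
          Thread.currentThread().interrupt();
        } catch (Exception e) {
          // log and pause briefly before retrying
        }
      }
    }
  }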


