MapReduce: N keys, N files (Part 4): The Final Solution

In MapReduce: N keys, N files (Part 2) we mentioned that disabling MapReduce's speculative execution avoids producing duplicate, incomplete ORC files on every run. In fact, the same situation still occurs when a failed reduce task is retried.
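
For reference, speculative execution is disabled through the job configuration. A minimal sketch of a Hadoop 2.x driver fragment (the job name is illustrative):

    // Turn off speculative execution so no second attempt writes the same output.
    Configuration conf = new Configuration();
    conf.setBoolean("mapreduce.map.speculative", false);
    conf.setBoolean("mapreduce.reduce.speculative", false);
    Job job = Job.getInstance(conf, "orc-split"); // illustrative job name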

[Solution]

FileOutputCommitter has abortJob and abortTask methods, which are called back when the job or a task fails. So all we need is a subclass that extends FileOutputCommitter and overrides these two methods to clean up the files we wrote.

FileOutputCommitter is a field of FileOutputFormat, but FileOutputFormat exposes no setter for it; the committer is created lazily in getOutputCommitter:

  public synchronized 
     OutputCommitter getOutputCommitter(TaskAttemptContext context
                                        ) throws IOException {
    if (committer == null) {
      Path output = getOutputPath(context);
      committer = new FileOutputCommitter(output, context);
    }
    return committer;
  }

Note that the framework always obtains the committer through this method, so overriding it is all we need. The complete solution:

package is.split;

import org.apache.commons.lang.RandomStringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.orc.OrcFile;
import org.apache.orc.Writer;
import org.apache.orc.mapred.OrcStruct;
import org.apache.orc.mapreduce.OrcMapreduceRecordWriter;
import org.apache.orc.mapreduce.OrcOutputFormat;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;

// Raw OrcOutputFormat on purpose: the record writer below takes Text keys
// (one output file per key) instead of the NullWritable key OrcOutputFormat expects.
public class OrcReNameFileOutputFormat extends OrcOutputFormat {
    // One writer and one output path per key; static so that the committer
    // created by this class can still find the paths when a task is aborted.
    static Map<String, OrcMapreduceRecordWriter<OrcStruct>> recordWriterMap = new HashMap<>();
    static Map<String, Path> pathMap = new HashMap<>();

    private OutputCommitter newCommitter;
    @Override
    public RecordWriter getRecordWriter(TaskAttemptContext taskAttemptContext) throws IOException {
        return new OrcReNameMapreduceRecordWriter(taskAttemptContext);
    }

    @Override
    public synchronized OutputCommitter getOutputCommitter(TaskAttemptContext context)
            throws IOException {
        if (newCommitter == null) {
            Path output = getOutputPath(context);
            this.newCommitter = new OrcReNameFileOutputCommitter(output, context);
        }
        return this.newCommitter;
    }

    private class OrcReNameMapreduceRecordWriter extends RecordWriter<Text, OrcStruct> {

        private TaskAttemptContext taskAttemptContext;
        // Counter used to report how many records this task has written.
        private final String GroupName = "is";
        private final String CounterName = "is_output_record";

        public OrcReNameMapreduceRecordWriter(TaskAttemptContext taskAttemptContext) {
            this.taskAttemptContext = taskAttemptContext;
        }

        public void write(Text key, OrcStruct value) throws IOException, InterruptedException {
            OrcMapreduceRecordWriter<OrcStruct> realWrite = recordWriterMap.get(key.toString());
            if (realWrite == null) {
                // One sub-directory per key; the file name carries a timestamp, the local
                // IP and a random suffix, so a retried attempt never reuses a file name.
                String outputDirPath = taskAttemptContext.getConfiguration().get(FileOutputFormat.OUTDIR) + "/" + key.toString();
                Path filename = new Path(new Path(outputDirPath), ISTool.getCurTime() + "_" + ISTool.getLocalIp() + "_" + RandomStringUtils.randomAlphanumeric(8));
                Writer writer = OrcFile.createWriter(filename, org.apache.orc.mapred.OrcOutputFormat.buildOptions(taskAttemptContext.getConfiguration()));
                realWrite = new OrcMapreduceRecordWriter<>(writer);
                recordWriterMap.put(key.toString(), realWrite);
                pathMap.put(key.toString(), filename);
            }
            realWrite.write(NullWritable.get(), value);
            this.taskAttemptContext.getCounter(GroupName, CounterName).increment(1);
        }

        public void close(TaskAttemptContext context) throws IOException, InterruptedException {
            // Close every per-key writer. pathMap is intentionally left intact so the
            // committer can still delete the files if this attempt is aborted later.
            for (Map.Entry<String, OrcMapreduceRecordWriter<OrcStruct>> entry : recordWriterMap.entrySet()) {
                if (entry.getValue() != null){
                    entry.getValue().close(context);
                }
            }
            recordWriterMap.clear();
        }
    }


    private class OrcReNameFileOutputCommitter extends FileOutputCommitter {

        public OrcReNameFileOutputCommitter(Path outputPath, TaskAttemptContext context) throws IOException {
            super(outputPath, context);
        }

        // Delete every file recorded in pathMap, i.e. everything this task has written.
        public void deleteFiles(JobContext context) {
            for (Map.Entry<String, Path> entry : pathMap.entrySet()) {
                if (entry.getValue() != null){
                    try {
                        FileSystem fs = entry.getValue()
                                .getFileSystem(context.getConfiguration());
                        fs.delete(entry.getValue(), true);
                    } catch (IOException e) {
                        e.printStackTrace();
                    }

                }
            }
            pathMap.clear();
        }

        @Override
        public void abortJob(JobContext context, JobStatus.State state)
                throws IOException {
            // delete the _temporary folder
            super.abortJob(context, state);
            if (state != JobStatus.State.SUCCEEDED){
                deleteFiles(context);
            }
        }

        @Override
        public void abortTask(TaskAttemptContext context) throws IOException {
            // delete the _temporary folder
            super.abortTask(context);
            deleteFiles(context);
        }
    }
}
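
For completeness, here is a minimal driver sketch that wires the custom output format into a job. The mapper/reducer class names, paths and ORC schema are hypothetical; the schema property name follows the orc-mapreduce convention:

    Configuration conf = new Configuration();
    // orc-mapreduce reads the writer schema from this property (hypothetical schema).
    conf.set("orc.mapred.output.schema", "struct<id:string,cnt:bigint>");
    Job job = Job.getInstance(conf, "split-orc-by-key");
    job.setJarByClass(OrcReNameFileOutputFormat.class);
    job.setMapperClass(IsMapper.class);      // hypothetical mapper
    job.setReducerClass(IsReducer.class);    // hypothetical reducer
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(OrcStruct.class);
    job.setOutputFormatClass(OrcReNameFileOutputFormat.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);

Because OrcReNameFileOutputFormat overrides getOutputCommitter, nothing else needs to be configured for the cleanup behaviour: the framework picks up OrcReNameFileOutputCommitter automatically.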

[Question]

1. When a reduce task fails inside its own processing logic, abortTask is called back and the files are cleaned up. But what if the node running the reduce dies?
Answer: if the node dies, i.e. the container executing the task dies, the task attempt still ends up invoking abortTask.

In TaskAttemptImpl's StateMachineFactory, transitions are registered for container fail and kill events, each attached to a TaskCleanupTransition. CommitterEventHandler then processes the resulting TaskCleanupEvent and calls back abortTask:

    // Transitions from FAIL_CONTAINER_CLEANUP state.
     .addTransition(TaskAttemptState.FAIL_CONTAINER_CLEANUP,
         TaskAttemptState.FAIL_TASK_CLEANUP,
         TaskAttemptEventType.TA_CONTAINER_CLEANED, new TaskCleanupTransition())
      // Ignore-able events
     .addTransition(TaskAttemptState.FAIL_CONTAINER_CLEANUP,
         TaskAttemptState.FAIL_CONTAINER_CLEANUP,
         EnumSet.of(TaskAttemptEventType.TA_KILL,
             TaskAttemptEventType.TA_CONTAINER_COMPLETED,
             TaskAttemptEventType.TA_UPDATE,
             TaskAttemptEventType.TA_DIAGNOSTICS_UPDATE,
             TaskAttemptEventType.TA_COMMIT_PENDING,
             TaskAttemptEventType.TA_CONTAINER_LAUNCHED,
             TaskAttemptEventType.TA_DONE,
             TaskAttemptEventType.TA_FAILMSG,
             TaskAttemptEventType.TA_TIMED_OUT))
      // Transitions from KILL_CONTAINER_CLEANUP
     .addTransition(TaskAttemptState.KILL_CONTAINER_CLEANUP,
         TaskAttemptState.KILL_TASK_CLEANUP,
         TaskAttemptEventType.TA_CONTAINER_CLEANED, new TaskCleanupTransition())


  private static class TaskCleanupTransition implements
      SingleArcTransition<TaskAttemptImpl, TaskAttemptEvent> {
    @Override
    public void transition(TaskAttemptImpl taskAttempt, 
        TaskAttemptEvent event) {
      TaskAttemptContext taskContext =
        new TaskAttemptContextImpl(taskAttempt.conf,
            TypeConverter.fromYarn(taskAttempt.attemptId));
      taskAttempt.eventHandler.handle(new TaskCleanupEvent(
          taskAttempt.attemptId,
          taskAttempt.committer,
          taskContext));
    }
  }
  
The handler above is the CommitterEventHandler: it puts every event it receives into a queue, and a background thread processes the events one by one.
    
  public void run() {
      LOG.info("Processing the event " + event.toString());
      switch (event.getType()) {
      case JOB_SETUP:
        handleJobSetup((CommitterJobSetupEvent) event);
        break;
      case JOB_COMMIT:
        handleJobCommit((CommitterJobCommitEvent) event);
        break;
      case JOB_ABORT:
        handleJobAbort((CommitterJobAbortEvent) event);
        break;
      case TASK_ABORT:
        handleTaskAbort((CommitterTaskAbortEvent) event);
        break;
      default:
        throw new YarnRuntimeException("Unexpected committer event "
            + event.toString());
      }
    }
    
The handleTaskAbort path, where abortTask is finally invoked:
  protected void handleTaskAbort(CommitterTaskAbortEvent event) {
    try {
      committer.abortTask(event.getAttemptContext());
    } catch (Exception e) {
      LOG.warn("Task cleanup failed for attempt " + event.getAttemptID(), e);
    }
    context.getEventHandler().handle(
        new TaskAttemptEvent(event.getAttemptID(),
            TaskAttemptEventType.TA_CLEANUP_DONE));
  }

With that, the problem is finally solved.
