MapReduce源码解析之InputFormat(二)


继续花了一些时间钻研昨天未读透的FileInputFormat。首先是listStatus,其中单线程的逻辑比较好理解,这里不再赘述;下面主要看一下多线程模式下的listStatus。


这里涉及到的类是LocatedFileStatusFetcher,其入参包括:配置、输入文件目录、是否递归处理、路径过滤器、是否使用新版Api。

/**
 * Submits one listing task per input directory, blocks until every task
 * (including the follow-up tasks scheduled by the callbacks) has finished or
 * an unexpected error has been registered, and then returns the results.
 *
 * @return the concatenation of all result lists collected in resultQueue
 * @throws InterruptedException if waiting for completion is interrupted, or
 *         if the registered error is an InterruptedException
 * @throws IOException if the registered error is an IOException, or is some
 *         other Throwable (which is wrapped); an InvalidInputException (old or
 *         new API flavour, selected by newApi) is thrown when
 *         invalidInputErrors is not empty
 */
public Iterable getFileStatuses() throws InterruptedException,
            IOException {
        // Increment to make sure a race between the first thread completing and the
        // rest being scheduled does not lead to a termination.
        // This extra count is held by the submitting thread for the duration of the
        // loop below and is released right after it.
        runningTasks.incrementAndGet();
        for (Path p : inputDirs) {
            // Count the task before submitting it.
            runningTasks.incrementAndGet();
            // Submit the first-level listing task for this path.
            ListenableFuture future = exec
                    .submit(new ProcessInitialInputPathCallable(p, conf, inputFilter));
            // The callback schedules the second-level (directory) tasks.
            Futures.addCallback(future, processInitialInputPathCallback);
        }
        // Release the guard count taken on entry.
        runningTasks.decrementAndGet();

        lock.lock();
        try {
            // Wait until all tasks are done or an unexpected error was registered.
            while (runningTasks.get() != 0 && unknownError == null) {
                // Releases the lock while sleeping; re-acquires it when signalled.
                condition.await();
            }
        } finally {
            lock.unlock();
            // Shut the executor down here rather than after the try block so the
            // worker threads are also released when await() is interrupted.
            this.exec.shutdownNow();
        }

        // Re-throw the registered error with its most specific declared type.
        if (this.unknownError != null) {
            if (this.unknownError instanceof Error) {
                throw (Error) this.unknownError;
            } else if (this.unknownError instanceof RuntimeException) {
                throw (RuntimeException) this.unknownError;
            } else if (this.unknownError instanceof IOException) {
                throw (IOException) this.unknownError;
            } else if (this.unknownError instanceof InterruptedException) {
                throw (InterruptedException) this.unknownError;
            } else {
                throw new IOException(this.unknownError);
            }
        }
        // Report invalid input paths using the exception type of the API in use.
        if (this.invalidInputErrors.size() != 0) {
            if (this.newApi) {
                throw new org.apache.hadoop.mapreduce.lib.input.InvalidInputException(
                        invalidInputErrors);
            } else {
                throw new InvalidInputException(invalidInputErrors);
            }
        }
        return Iterables.concat(resultQueue);
    }
其中Callable的实现,实现第一层文件(夹)的读取过程
private static class ProcessInitialInputPathCallable implements
            Callable {
        //文件路径
        private final Path path;
        //配置项
        private final Configuration conf;
        //路径过滤项
        private final PathFilter inputFilter;

        public ProcessInitialInputPathCallable(Path path, Configuration conf,
                                               PathFilter pathFilter) {
            this.path = path;
            this.conf = conf;
            this.inputFilter = pathFilter;
        }

        @Override
        public Result call() throws Exception {
            //初始化返回结果
            Result result = new Result();
            //获取文件系统
            FileSystem fs = path.getFileSystem(conf);
            result.fs = fs;
            FileStatus[] matches = fs.globStatus(path, inputFilter);
            if (matches == null) {
                result.addError(new IOException("Input path does not exist: " + path));
            } else if (matches.length == 0) {
                result.addError(new IOException("Input Pattern " + path
                        + " matches 0 files"));
            } else {
                result.matchedFileStatuses = matches;
            }
            //返回结果
            return result;
        }

        private static class Result {
            private List errors;
            private FileStatus[] matchedFileStatuses;
            private FileSystem fs;

            void addError(IOException ioe) {
                if (errors == null) {
                    errors = new LinkedList();
                }
                errors.add(ioe);
            }
        }
    }

private class ProcessInitialInputPathCallback implements
            FutureCallback {

        /**
         * Handles the result of one initial-path task: registers any recorded
         * input errors, submits one directory task per matched status, and then
         * marks this task as finished. Anything thrown while doing so is passed
         * to registerError.
         */
        @Override
        public void onSuccess(ProcessInitialInputPathCallable.Result result) {
            try {
                if (result.errors != null) {
                    registerInvalidInputError(result.errors);
                }
                final FileStatus[] matches = result.matchedFileStatuses;
                if (matches != null) {
                    for (int i = 0; i < matches.length; i++) {
                        // Count the new task before handing it to the executor.
                        runningTasks.incrementAndGet();
                        ProcessInputDirCallable dirTask = new ProcessInputDirCallable(
                                result.fs, matches[i], recursive, inputFilter);
                        ListenableFuture dirFuture = exec.submit(dirTask);
                        // The directory callback takes over from here.
                        Futures.addCallback(dirFuture, processInputDirCallback);
                    }
                }
                // This task is done: drop its count and check for completion.
                decrementRunningAndCheckCompletion();
            } catch (Throwable t) { // Exception within the callback
                registerError(t);
            }
        }

其中在future执行成功之后,执行第二层文件结构解析,实际上是递归,直接看callback实现就理解了。

/**
 * Callback for directory-listing tasks. On success it queues the located
 * file statuses, submits a new task for every directory that still needs a
 * recursive listing, and then marks the finished task as done. Failures are
 * forwarded to registerError.
 */
private class ProcessInputDirCallback implements
            FutureCallback<ProcessInputDirCallable.Result> {

        @Override
        public void onSuccess(ProcessInputDirCallable.Result result) {
            try {
                // Publish the files found by this task, if any.
                if (result.locatedFileStatuses.size() != 0) {
                    resultQueue.add(result.locatedFileStatuses);
                }
                if (result.dirsNeedingRecursiveCalls.size() != 0) {
                    for (FileStatus fileStatus : result.dirsNeedingRecursiveCalls) {
                        // Count the new task before submitting it.
                        runningTasks.incrementAndGet();
                        // Indirect recursion: the new task reports back to this
                        // same callback instance.
                        ListenableFuture future = exec
                                .submit(new ProcessInputDirCallable(result.fs, fileStatus,
                                        recursive, inputFilter));
                        Futures.addCallback(future, processInputDirCallback);
                    }
                }
                // Drop the count of the finished task and check for completion.
                decrementRunningAndCheckCompletion();
            } catch (Throwable t) { // Error within the callback itself.
                registerError(t);
            }
        }

        @Override
        public void onFailure(Throwable t) {
            // Any generated exceptions. Leads to immediate termination.
            registerError(t);
        }
    }

总结下:

第一层path解析,获取FileStatus,然后第二层递归获取LocatedFileStatus。其中用到lock+condition的方式来控制线程是否都跑结束或者是否发生异常

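此段代码写得比较给力,不过其中有个点不太理解:刚进方法体的时候,先执行了一次:

runningTasks.incrementAndGet();

这个设计有点疑惑,乍一看此处都是单线程进行的,似乎没啥用处。不过需要注意,任务提交之后是在线程池中并发执行的,回调里会对runningTasks做减1操作;按照源码注释的说法,这次额外的+1(for循环结束后再-1)是为了防止"第一个任务已经完成、其余任务尚未提交"这一竞态导致提前终止。至于它是否严格必要,还要结合decrementRunningAndCheckCompletion的具体实现来判断。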


你可能感兴趣的:(Hadoop源码)