Solving the Small-Files Problem with CombineFileInputFormat

The small-files problem can be tackled with the Hadoop API's abstract class CombineFileInputFormat. Its basic idea is that a custom InputFormat packs many small files into one Hadoop split (chunk), so a single map task processes several files instead of one file each. Using CombineFileInputFormat requires implementing two custom classes:

    1. CombineSmallfileInputFormat, a subclass of CombineFileInputFormat that supports the custom input format.

    2. CombineSmallfileRecordReader, a custom RecordReader that reads one small file out of a combined split (a short driver-side sketch follows this list).
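
On the driver side, the job only has to register this input format and cap the size of a combined split. A minimal excerpt of that wiring, assuming a 128 MB cap (an arbitrary example value; the complete driver at the end of this post uses 150 MB):

Configuration conf = new Configuration();
// Stop packing small files into one combined split once it reaches ~128 MB
conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 128L * 1024 * 1024);

Job job = Job.getInstance(conf);
job.setInputFormatClass(CombineSmallfileInputFormat.class);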

Implementation of the custom CombineSmallfileInputFormat class


import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.LocatedFileStatus;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.fs.PathFilter;
import org.apache.hadoop.fs.RemoteIterator;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.CombineFileRecordReader;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.security.TokenCache;
// LocatedFileStatusFetcher lives in org.apache.hadoop.mapred in Hadoop 2.7.x;
// Stopwatch/Lists match the older Guava API (11.x) bundled with Hadoop 2.7.x.
import org.apache.hadoop.mapred.LocatedFileStatusFetcher;
import com.google.common.base.Stopwatch;
import com.google.common.collect.Lists;

public class CombineSmallfileInputFormat extends CombineFileInputFormat<LongWritable, Text> {

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException {

        CombineFileSplit combineFileSplit = (CombineFileSplit) split;
        // CombineFileRecordReader instantiates one CombineSmallfileRecordReader per file in the
        // combined split, via its (CombineFileSplit, TaskAttemptContext, Integer) constructor.
        CombineFileRecordReader<LongWritable, Text> recordReader =
                new CombineFileRecordReader<LongWritable, Text>(combineFileSplit, context, CombineSmallfileRecordReader.class);
        try {
            recordReader.initialize(combineFileSplit, context);
        } catch (InterruptedException e) {
            throw new RuntimeException("Error initializing CombineSmallfileRecordReader.", e);
        }
        return recordReader;
    }



    private final Log LOG = LogFactory.getLog(getClass());

    /**
     * Same logic as FileInputFormat.listStatus(), except that missing or empty
     * input paths are logged instead of failing the job with InvalidInputException.
     */
    @Override
    protected List<FileStatus> listStatus(JobContext job) throws IOException {

        LOG.info("###################### CombineSmallfileInputFormat.listStatus invoked ######################");

        Path[] dirs = getInputPaths(job);
        if (dirs.length == 0) {
            throw new IOException("No input paths specified in job");
        }

        // get tokens for all the required FileSystems..
        TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs, job.getConfiguration());

        // Whether we need to recursively look into the directory structure
        boolean recursive = getInputDirRecursive(job);

        // creates a MultiPathFilter with the hiddenFileFilter and the
        // user provided one (if any).
        List<PathFilter> filters = new ArrayList<PathFilter>();
        filters.add(hiddenFileFilter);
        PathFilter jobFilter = getInputPathFilter(job);
        if (jobFilter != null) {
            filters.add(jobFilter);
        }
        PathFilter inputFilter = new MultiPathFilter(filters);

        List<FileStatus> result = null;

        int numThreads = job.getConfiguration().getInt(LIST_STATUS_NUM_THREADS, DEFAULT_LIST_STATUS_NUM_THREADS);
        Stopwatch sw = new Stopwatch().start();
        if (numThreads == 1) {
            result = singleThreadedListStatus(job, dirs, inputFilter, recursive);
        } else {
            Iterable<FileStatus> locatedFiles = null;
            try {
                LocatedFileStatusFetcher locatedFileStatusFetcher = new LocatedFileStatusFetcher(job.getConfiguration(),
                        dirs, recursive, inputFilter, true);
                locatedFiles = locatedFileStatusFetcher.getFileStatuses();
            } catch (InterruptedException e) {
                throw new IOException("Interrupted while getting file statuses");
            }
            result = Lists.newArrayList(locatedFiles);
        }

        sw.stop();
        if (LOG.isDebugEnabled()) {
            LOG.debug("Time taken to get FileStatuses: " + sw.elapsedMillis());
        }
        LOG.info("Total input paths to process : " + result.size());
        return result;

    }

    private List<FileStatus> singleThreadedListStatus(JobContext job, Path[] dirs, PathFilter inputFilter,
                                                      boolean recursive) throws IOException {

        List<FileStatus> result = new ArrayList<FileStatus>();
        // Unlike FileInputFormat, problems are collected as plain strings and only logged at
        // the end instead of being thrown as an InvalidInputException.
        List<String> errors = new ArrayList<String>();
        for (int i = 0; i < dirs.length; ++i) {
            Path p = dirs[i];
            FileSystem fs = p.getFileSystem(job.getConfiguration());

            FileStatus[] matches = fs.globStatus(p, inputFilter);
            if (matches == null) {
                errors.add("Input path does not exist: " + p);
            } else if (matches.length == 0) {
                errors.add("Input Pattern " + p + " matches 0 files");
            } else {
            } else {
                for (FileStatus globStat : matches) {
                    if (globStat.isDirectory()) {
                        RemoteIterator<LocatedFileStatus> iter = fs.listLocatedStatus(globStat.getPath());
                        while (iter.hasNext()) {
                            LocatedFileStatus stat = iter.next();
                            if (inputFilter.accept(stat.getPath())) {
                                if (recursive && stat.isDirectory()) {
                                    addInputPathRecursively(result, fs, stat.getPath(), inputFilter);
                                } else {
                                    result.add(stat);
                                }
                            }
                        }
                    } else {
                        result.add(globStat);
                    }
                }
            }
        }

        if (!errors.isEmpty()) {
            // FileInputFormat throws InvalidInputException(errors) here; this version only logs
            // the problems so the job still runs on whatever input was found.
            for (String error : errors) {
                LOG.error(error);
            }
        }
        return result;
    }

    /**
     * Proxy PathFilter that accepts a path only if all filters given in the
     * constructor do. Used by the listPaths() to apply the built-in
     * hiddenFileFilter together with a user provided one (if any).
     */
    private static class MultiPathFilter implements PathFilter {
        private List<PathFilter> filters;

        public MultiPathFilter(List<PathFilter> filters) {
            this.filters = filters;
        }

        public boolean accept(Path path) {
            for (PathFilter filter : filters) {
                if (!filter.accept(path)) {
                    return false;
                }
            }
            return true;
        }
    }

    private static final PathFilter hiddenFileFilter = new PathFilter() {
        public boolean accept(Path p) {
            String name = p.getName();
            return !name.startsWith("_") && !name.startsWith(".");
        }
    };



}
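
As an alternative to the mapreduce.input.fileinputformat.split.maxsize property used in the driver later on, the cap can also be hard-wired into the input format through the setMaxSplitSize method inherited from CombineFileInputFormat. A minimal sketch, assuming a 128 MB cap (an example value), as a no-argument constructor added to the class above:

public CombineSmallfileInputFormat() {
    // Inherited from CombineFileInputFormat: once a combined split reaches this many bytes,
    // a new split is started (128 MB is only an example value).
    setMaxSplitSize(128L * 1024 * 1024);
}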


Implementation of the custom CombineSmallfileRecordReader class


import java.io.FileNotFoundException;
import java.io.IOException;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.CombineFileSplit;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

public class CombineSmallfileRecordReader extends RecordReader<LongWritable, Text> {
    private static Log log = LogFactory.getLog(CombineSmallfileRecordReader.class);

    private CombineFileSplit combineFileSplit;
    private LineRecordReader lineRecordReader = new LineRecordReader();
    private Path[] paths;
    private int totalLength;
    private int currentIndex;
    private float currentProgress = 0;
    private LongWritable currentKey;
    private Text currentValue = new Text();


    public CombineSmallfileRecordReader(CombineFileSplit combineFileSplit, TaskAttemptContext context, Integer index) throws IOException {
        super();
        this.combineFileSplit = combineFileSplit;
        this.currentIndex = index; // index of this reader's small-file block within the CombineFileSplit

        // Expose the current file's name and modification time to the mapper via the task
        // configuration (the mapper can read them with context.getConfiguration()).
        try {
            context.getConfiguration().set("file.filename", combineFileSplit.getPath(currentIndex).getName());
            context.getConfiguration().set("file.modificationtime",
                    FileSystem.get(context.getConfiguration()).getFileStatus(combineFileSplit.getPath(currentIndex)).getModificationTime() + "");
        } catch (Exception e) {
            log.error("Failed to record file name/modification time in the configuration", e);
        }
    }

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
        this.combineFileSplit = (CombineFileSplit) split;

        // LineRecordReader reads a single file, so the current small-file block of the
        // CombineFileSplit must be wrapped in a FileSplit before it can be read.
        FileSplit fileSplit = null;
        try {
            fileSplit = new FileSplit(combineFileSplit.getPath(currentIndex), combineFileSplit.getOffset(currentIndex),
                    combineFileSplit.getLength(currentIndex), combineFileSplit.getLocations());
            lineRecordReader.initialize(fileSplit, context);
        } catch (FileNotFoundException e) {
            log.error("File in split no longer exists: " + combineFileSplit.getPath(currentIndex), e);
        } catch (Exception e) {
            log.error("Failed to initialize LineRecordReader for " + combineFileSplit.getPath(currentIndex), e);
        }

        this.paths = combineFileSplit.getPaths();
        totalLength = paths.length;
//        context.getConfiguration().set("map.input.file.path", combineFileSplit.getPath(currentIndex).toString());
    }

    @Override
    public LongWritable getCurrentKey() throws IOException, InterruptedException {
        currentKey = lineRecordReader.getCurrentKey();
        return currentKey;
    }

    @Override
    public Text getCurrentValue() throws IOException, InterruptedException {
        Text content = lineRecordReader.getCurrentValue();
        currentValue.set(content);
//        currentValue.set(content, 0, content.length);
        return currentValue;
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        boolean exist = false;
        if (currentIndex >= 0 && currentIndex < totalLength) {
            try {
                exist = lineRecordReader.nextKeyValue();
            } catch (Exception e) {
                // Treat a read failure in this file as end-of-input instead of failing the whole task
                exist = false;
            }
            return exist;
        } else {
            return false;
        }
    }

    @Override
    public float getProgress() throws IOException {
        if (currentIndex >= 0 && currentIndex < totalLength) {
            currentProgress = (float) currentIndex / totalLength;
            return currentProgress;
        }
        return currentProgress;
    }

    @Override
    public void close() throws IOException {
        lineRecordReader.close();
    }
}
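
Because the constructor above publishes file.filename and file.modificationtime in the task configuration, a mapper can tell which small file each record came from. A minimal sketch of such a mapper (class name and output format are illustrative; only the property names come from CombineSmallfileRecordReader above, and imports follow the other listings). This relies on the record reader and the mapper sharing the task's Configuration object, and on CombineFileRecordReader creating the per-file readers lazily, so the value read inside map() refers to the file currently being processed:

class FilenameTaggingMapper extends Mapper<LongWritable, Text, Text, Text> {

    private final Text outKey = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Property set by CombineSmallfileRecordReader for the file currently being read
        String fileName = context.getConfiguration().get("file.filename", "unknown");
        outKey.set(fileName);
        context.write(outKey, value);
    }
}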

Implementation of the Hadoop job (driver, mapper and reducer)


import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
// StringUtils in the mapper is assumed here to be commons-lang's StringUtils
import org.apache.commons.lang.StringUtils;

public class WordCount {
    private static Logger log = LoggerFactory.getLogger(WordCount.class);
    private static Path in;
    private static Path out;

    public static void main(String[] args) {
        // Needed when launching from a Windows IDE; points at a local Hadoop installation
        System.setProperty("hadoop.home.dir", "D:\\installed\\work\\hadoop-2.7.3");
        if (args.length < 5) {
            log.error("Usage: WordCount <inputPath> <outputPath> <jobName> <queueName> <htablePath>");
            return;
        }
        String inputPath = args[0];
        String outputPath = args[1];
        String jobName = args[2];
        String queueName = args[3];
        String htablePath = args[4];
        log.info("############inputPath:" + inputPath + "############");
        log.info("############outputPath:" + outputPath + "############");

        if (jobName == null || "".equals(jobName)) {
            jobName = "StatisticCount";
        }

        if (queueName == null || "".equals(queueName)) {
            queueName = "statdpi";
        }

        log.info("############jobName:" + jobName + "############");
        log.info("############queueName:" + queueName + "############");

        try {
            execute(inputPath, outputPath, jobName, queueName, htablePath);
        } catch (Exception e) {
            log.error(e.getMessage());
            e.printStackTrace();
        }
    }

    private static void execute(String inputPath, String outputPath,
                                String jobName, String queueName, String htablePath) throws Exception {

        Configuration conf = new Configuration();
//        conf.set("mapreduce.job.queuename", queueName);
        // Many small input files produce a lot of split meta-info, so raise the size limit
        conf.set("mapreduce.jobtracker.split.metainfo.maxsize", "100000000");
        // Allow the job to succeed even if some map/reduce tasks fail
        conf.setLong("mapreduce.map.failures.maxpercent", 100);
        conf.setLong("mapreduce.reduce.failures.maxpercent", 100);
        // Cap each combined split at 157286400 bytes (150 MB) of small files
        conf.setInt("mapreduce.input.fileinputformat.split.minsize", 1);
        conf.setLong("mapreduce.input.fileinputformat.split.maxsize", 157286400);
//        conf.set("htablePath", htablePath);

        out = new Path(outputPath);
        in = new Path(inputPath);

        FileSystem fs = FileSystem.get(conf);

        Job job = Job.getInstance(conf);
        job.setJobName(jobName);
        job.setJarByClass(WordCount.class);
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        job.setNumReduceTasks(1);
        job.setInputFormatClass(CombineSmallfileInputFormat.class);
        // Setting CombineFileInputFormat directly fails: it is abstract and cannot be instantiated
//        job.setInputFormatClass(CombineFileInputFormat.class);


        FileInputFormat.addInputPath(job, in);
//		FileInputFormat.setInputPaths(job, in);
        FileOutputFormat.setOutputPath(job, out);
        //FileOutputFormat.setCompressOutput(job, true);

        if (fs.exists(out)) {
            fs.delete(out, true);
        }

        boolean flag = job.waitForCompletion(true);
        log.info("##################### Job result: " + flag);
        log.info("#############################################################################################");
        
    }
}

class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

    final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString().trim();
        String[] tokens = StringUtils.split(line, "\t");
        for (String tok : tokens) {
            word.set(tok);
            context.write(word, one);
        }
    }
}


class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

    public void reduce(Text key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable val : values) {
            sum += val.get();
        }
        context.write(key, new IntWritable(sum));
    }
}
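
Once packaged, the job is submitted like any other MapReduce program, passing the five positional arguments that main() expects. The jar name and paths below are only placeholders:

hadoop jar wordcount-combine.jar WordCount /data/input/small-files /data/output/wordcount StatisticCount statdpi /data/htable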

