



1、 在客户端将小文件合并为大文件。

2、 使用Hadoop的CombineFileInputFormat实现小文件的合并。







 * 为Hadoop作业驱动程序提供通用小文件进行合并功能。
public class SmallFilesConsolidator {

	private static Logger logger = Logger.getLogger(SmallFilesConsolidator.class);

	// 可配置的HDFS根目录
	private static String MERGED_HDFS_ROOT_DIR = "/tmp/";

	 * 获取Buckets的数量
	 * @param totalFiles:总文件数
	 * @param numberOfMapSlotsAvailable:
	 * @param maxFilesPerBucket:每一个Bucket的最大文件数
	public static int getNumberOfBuckets(int totalFiles, int numberOfMapSlotsAvailable, int maxFilesPerBucket) {
		if (totalFiles <= (maxFilesPerBucket * numberOfMapSlotsAvailable)) {
			return numberOfMapSlotsAvailable;
		} else {
			int numberOfBuckets = totalFiles / maxFilesPerBucket;
			int remainder = totalFiles % maxFilesPerBucket;
			if (remainder == 0) {
				return numberOfBuckets;
			} else {
				return numberOfBuckets + 1;

	 * 为映射器创建Buckets
	public static BucketThread[] createBuckets(int totalFiles, int numberOfMapSlotsAvailable, int maxFilesPerBucket) {
		int numberOfBuckets = getNumberOfBuckets(totalFiles, numberOfMapSlotsAvailable, maxFilesPerBucket);
		BucketThread[] buckets = new BucketThread[numberOfBuckets];
		return buckets;

	 * 填充Bucket
	 * @param buckets:所有Bucket列表
	 * @param smallFiles:小文件数
	 * @param job:Hadoop运行的作业
	 * @param maxFilesPerBucket:每一个Bucket的最大文件数
	/**
	 * Creates a {@code BucketThread} in the slots of {@code buckets} and is meant to spread the
	 * entries of {@code smallFiles} over them.
	 *
	 * <p>The per-bucket file count starts as {@code smallFiles.size() / buckets.length}; an empty
	 * {@code buckets} array therefore fails with a division by zero.
	 *
	 * <p>NOTE(review): this listing looks truncated. Closing braces are absent, as are the
	 * statements that would use {@code remainder}, advance {@code id} and {@code index}, and add
	 * a file to the current bucket. Confirm against the original source before relying on it.
	 *
	 * @param buckets array of bucket slots to populate
	 * @param smallFiles the small files to distribute
	 * @param job the Hadoop job; its configuration is passed to every new bucket
	 * @param maxFilesPerBucket maximum number of files per bucket
	 * @throws Exception declared by the signature; the visible code can raise it when constructing a bucket
	 */
	public static void fillBuckets(BucketThread[] buckets, List smallFiles, Job job, int maxFilesPerBucket)
			throws Exception {

		int numberOfBuckets = buckets.length;
		// Partition all of the small files and fill them into the buckets
		int combinedSize = smallFiles.size();
		int biosetsPerBucket = combinedSize / numberOfBuckets;
		if (biosetsPerBucket < maxFilesPerBucket) {
			int remainder = combinedSize % numberOfBuckets;
			// NOTE(review): the body of this branch is missing; presumably it bumps biosetsPerBucket - confirm
			if (remainder != 0) {

		String parentDir = getParentDir();
		// The bucket's ordinal is used as its id (range 0 to numberOfBuckets-1)
		int id = 0;
		int index = 0;
		boolean done = false;
		// '&' is the non-short-circuit AND: both operands are always evaluated
		while ((!done) & (id < numberOfBuckets)) {
			// Create one bucket object
			buckets[id] = new BucketThread(parentDir, id, job.getConfiguration());
			// Fill the bucket with small files
			for (int b = 0; b < biosetsPerBucket; b++) {
				// stop once every small file has been consumed
				if (index == combinedSize) {
					done = true;

	 * 对于每一个Bucket启动一个线程,并合并小文件
	/**
	 * Walks the bucket array three times, skipping {@code null} slots each time; the last pass
	 * registers every bucket's target directory as an input path of {@code job}.
	 *
	 * <p>NOTE(review): this listing looks truncated. The bodies of both guard clauses and of the
	 * first two loops are missing, along with all closing braces. Presumably the first loop
	 * starts each bucket's thread and the second joins it - confirm against the original source.
	 *
	 * @param buckets the buckets to merge; individual slots may be {@code null}
	 * @param job the Hadoop job that receives the merged input paths
	 * @throws Exception declared by the signature
	 */
	public static void mergeEachBucket(BucketThread[] buckets, Job job) throws Exception {
		// guard: no bucket array at all (branch body missing in this listing)
		if (buckets == null) {

		int numberOfBuckets = buckets.length;
		// guard: empty bucket array (branch body missing in this listing)
		if (numberOfBuckets < 1) {

		// first pass over the non-null buckets (loop body missing in this listing)
		for (int ID = 0; ID < numberOfBuckets; ID++) {
			if (buckets[ID] != null) {

		// Wait for all threads to finish
		for (int ID = 0; ID < numberOfBuckets; ID++) {
			if (buckets[ID] != null) {

		// final pass: hand each bucket's resulting path to the job as input
		for (int ID = 0; ID < numberOfBuckets; ID++) {
			if (buckets[ID] != null) {
				Path biosetPath = buckets[ID].getTargetDir();
				addInputPathWithoutCheck(job, biosetPath);

	private static void addInputPathWithoutCheck(Job job, Path path) {
		try {
			FileInputFormat.addInputPath(job, path);
			logger.info("added path: " + path);
		} catch (Exception e) {
			logger.error("could not add path: " + path, e);

	private static String getParentDir() {
		String guid = UUID.randomUUID().toString();
		return MERGED_HDFS_ROOT_DIR + guid + "/";



 * 这个类提供了将小于块大小的文件合并为一个大于块大小的文件,这样将提交较少的map()作业,提高map的运行效率。
public class BucketThread implements Runnable {

	private static Logger theLogger = Logger.getLogger(BucketThread.class);
	private static final Path NULL_PATH = new Path("/tmp/null");

	private Thread runner = null;
	private List bucket = null;
	private Configuration conf = null;
	private FileSystem fs = null;
	private String parentDir = null;

	private String targetDir = null;
	private String targetFile = null;

	 * 创建一个新的Bucket线程对象
	 * @param parentDir:父目录
	 * @param id:
	 *            每一个Bucket都有一个唯一的ID
	public BucketThread(String parentDir, int id, Configuration conf) throws IOException {
		this.parentDir = parentDir;
		// 存储目标目录
		this.targetDir = parentDir + id;
		// 存储目标文件
		this.targetFile = targetDir + "/" + id;
		this.conf = conf;
		this.runner = new Thread(this);
		this.fs = FileSystem.get(this.conf);
		this.bucket = new ArrayList();

	 * 启动线程
	public void start() {

	 * 连接并等待其他线程
	public void join() throws InterruptedException {

	 * 线程的核心执行
	public void run() {
		try {
		} catch (Exception e) {
			theLogger.error("run(): copyMerge() failed.", e);

	 * @param path
	 *            :添加一个文件到Bucket中
	public void add(String path) {
		if (path == null) {

		Path hdfsPath = new Path(path);
		if (pathExists(hdfsPath)) {

	public List getBucket() {
		return bucket;

	public int size() {
		return bucket.size();

	public Path getTargetDir() {
		if (size() == 0) {
			// 没有文件的空目录
			return NULL_PATH;
		} else if (size() == 1) {
			return bucket.get(0);
		} else {
			// bucket有两个或更多的文件,并且已经被合并
			return new Path(targetDir);

	 * 将多个目录中的所有文件复制到一个输出文件(合并)。
	 * 将bucket中的所有路径合并,并返回一个新的目录(targetDir),该目录包含合并的路径。
	/**
	 * Creates {@code targetFile} and copies into it the bytes of every non-directory entry listed
	 * under each path held in the bucket.
	 *
	 * <p>NOTE(review): this listing looks truncated. The body of the {@code size() < 2} guard,
	 * the contents of both {@code finally} blocks and all closing braces are missing. Presumably
	 * the guard returns early and the {@code finally} blocks close {@code in} and {@code out} -
	 * confirm against the original source.
	 *
	 * @throws IOException declared by the signature; the visible file-system calls can raise it
	 */
	public void copyMerge() throws IOException {
		// If the bucket holds only one path/dir there is nothing to merge
		if (size() < 2) {

		Path hdfsTargetFile = new Path(targetFile);
		OutputStream out = fs.create(hdfsTargetFile);
		try {
			for (int i = 0; i < bucket.size(); i++) {
				FileStatus contents[] = fs.listStatus(bucket.get(i));
				for (int k = 0; k < contents.length; k++) {
					// only plain files are appended; sub-directories are skipped
					if (!contents[k].isDir()) {
						InputStream in = fs.open(contents[k].getPath());
						try {
							// the trailing 'false' is presumably the "close streams" flag - TODO confirm
							IOUtils.copyBytes(in, out, conf, false);
						} finally {

		} finally {


	public String getParentDir() {
		return parentDir;

	 * HDFS目录存在,则返回true,否则返回false
	public boolean pathExists(Path path) {
		if (path == null) {
			return false;

		try {
			return fs.exists(path);
		} catch (Exception e) {
			return false;

	public String toString() {
		return bucket.toString();



 * 使用小文件合并的单词计数驱动程序
public class WordCountDriverWithConsolidator extends Configured implements Tool {

	private static final Logger logger = Logger.getLogger(WordCountDriverWithConsolidator.class);
	private static int NUMBER_OF_MAP_SLOTS_AVAILABLE = 8;
	// 每一个bucket的最大文件数
	private static int MAX_FILES_PER_BUCKET = 5;

	private String inputDir = null;
	private String outputDir = null;
	private Job job = null;

	public WordCountDriverWithConsolidator(String inputDir, String outputDir) {
		this.inputDir = inputDir;
		this.outputDir = outputDir;

	public Job getJob() {
		return this.job;

	 * 启动Job
	public int run(String[] args) throws Exception {
		this.job = new Job(getConf(), "WordCountDriverWithConsolidator");
		job.getConfiguration().setInt("word.count.ignored.length", 3);

		// 将所有jar文件添加到HDFS的分布式缓存中
		HadoopUtil.addJarsToDistributedCache(job, "/lib/");

		// 获取HDFS文件系统
		FileSystem fs = FileSystem.get(job.getConfiguration());
		List smallFiles = HadoopUtil.listDirectoryAsListOfString(inputDir, fs);
		int size = smallFiles.size();
			for (String file : smallFiles) {
				logger.info("file=" + file);
				addInputPath(fs, job, file);
		} else {
			// 创建文件Bucket,每一个Bucket将会添加小文件
			BucketThread[] buckets = SmallFilesConsolidator.createBuckets(size, NUMBER_OF_MAP_SLOTS_AVAILABLE,
			SmallFilesConsolidator.fillBuckets(buckets, smallFiles, job, MAX_FILES_PER_BUCKET);
			SmallFilesConsolidator.mergeEachBucket(buckets, job);

		// 输出路径
		FileOutputFormat.setOutputPath(job, new Path(outputDir));




		boolean status = job.waitForCompletion(true);
		logger.info("run(): status=" + status);
		return status ? 0 : 1;

	 * 添加输入路径
	private void addInputPath(FileSystem fs, Job job, String pathAsString) {
		try {
			Path path = new Path(pathAsString);
			if (HadoopUtil.pathExists(path, fs)) {
				FileInputFormat.addInputPath(job, path);
			} else {
				logger.info("addInputPath(): path does not exist. ignored: " + pathAsString);
		} catch (Exception e) {
			logger.error("addInputPath(): could not add path: " + pathAsString, e);

	 * 提交map/reduce作业
	public static int submitJob(String inputDir, String outputDir) throws Exception {
		WordCountDriverWithConsolidator driver = new WordCountDriverWithConsolidator(inputDir, outputDir);
		int status = ToolRunner.run(driver, null);
		logger.info("submitJob(): status=" + status);
		return status;

	 * Wordcount的map/reduce程序的主驱动程序。调用此方法提交map/reduce作业。
	 * @throws Exception:作业跟踪器通信问题时抛出异常。
	public static void main(String[] args) throws Exception {
		// 确定有两个参数
		if (args.length != 2) {
			logger.warn("2 arguments. , ");
			throw new IllegalArgumentException("2 arguments. , ");

		logger.info("inputDir=" + args[0]);
		logger.info("outputDir=" + args[1]);
		long startTime = System.currentTimeMillis();
		int returnStatus = submitJob(args[0], args[1]);
		long elapsedTime = System.currentTimeMillis() - startTime;
		logger.info("returnStatus=" + returnStatus);
		logger.info("Finished in milliseconds: " + elapsedTime);
 * WordCount Mapper
public class WordCountMapper extends Mapper {

	private int ignoredLength = 3;
	private static final IntWritable one = new IntWritable(1);
	private Text reducerKey = new Text();

	protected void setup(Context context) throws IOException, InterruptedException {
		this.ignoredLength = context.getConfiguration().getInt("word.count.ignored.length", 3);

	public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
		String line = value.toString().trim();
		if ((line == null) || (line.length() < ignoredLength)) {

		String[] words = StringUtils.split(line);
		if (words == null) {

		for (String word : words) {
			if (word.length() < this.ignoredLength) {
			if (word.matches(".*[,.;]$")) {
				word = word.substring(0, word.length() - 1);
			context.write(reducerKey, one);

public class WordCountReducer extends Reducer {

	public void reduce(Text key, Iterable values, Context context)
			throws IOException, InterruptedException {
		int sum = 0;
		for (IntWritable count : values) {
			sum += count.get();
		context.write(key, new IntWritable(sum));



使用Hadoop API(抽象类CombineFileInputFormat)来解决小文件的问题。抽象类CombineFileInputFormat的基本思想是通过使用一个定制的InputFormat允许将小文件合并到Hadoop的分片(split)或块(chunk)中。要使用抽象类CombineFileInputFormat,需要实现3个定制类。

    1、 CustomCFIF要扩展CombineFileInputFormat,创建子类来支持定制格式的输入。

    2、 PairOfStringLong是一个Writable类,会存储小文件名称及其偏移量(Long)。调用compareTo()方法:首先比较文件名,再比较偏移量。

    3、 CustomRecordReader是一个定制RecordReader。


 * 自定义文件输入格式,将较小的文件合并到控制大小为MAX_SPLIT_SIZE_128MB的文件中
public class CustomCFIF extends CombineFileInputFormat {
	final static long MAX_SPLIT_SIZE_128MB = 134217728; // 128 MB = 128*1024*1024

	public CustomCFIF() {

	public RecordReader createRecordReader(InputSplit split, TaskAttemptContext context)
			throws IOException {
		return new CombineFileRecordReader((CombineFileSplit) split, context,

	protected boolean isSplitable(JobContext context, Path file) {
		return false;


 * 自定义记录文件读取类
public class CustomRecordReader extends RecordReader {
	private PairOfStringLong key;
	private Text value;

	// define pos and offsets
	private long startOffset;
	private long endOffset;
	private long pos;

	private FileSystem fs;
	private Path path;
	private FSDataInputStream fileIn;
	private LineReader reader;

	public CustomRecordReader(CombineFileSplit split, TaskAttemptContext context, Integer index) throws IOException {
		path = split.getPath(index);
		fs = path.getFileSystem(context.getConfiguration());
		startOffset = split.getOffset(index);
		endOffset = startOffset + split.getLength(index);
		fileIn = fs.open(path);
		reader = new LineReader(fileIn);
		pos = startOffset;

	public void initialize(InputSplit arg0, TaskAttemptContext arg1) throws IOException, InterruptedException {
		// This will not be called, use custom Constructor

	public void close() throws IOException {

	public float getProgress() throws IOException {
		if (startOffset == endOffset) {
			return 0;
		return Math.min(1.0f, (pos - startOffset) / (float) (endOffset - startOffset));

	public PairOfStringLong getCurrentKey() throws IOException, InterruptedException {
		return key;

	public Text getCurrentValue() throws IOException, InterruptedException {
		return value;

	public boolean nextKeyValue() throws IOException {
		if (key == null) {
			// key.filename = path.getName()
			// key.offset = pos
			key = new PairOfStringLong(path.getName(), pos);
		if (value == null) {
			value = new Text();
		int newSize = 0;
		if (pos < endOffset) {
			newSize = reader.readLine(value);
			pos += newSize;
		if (newSize == 0) {
			key = null;
			value = null;
			return false;
		} else {
			return true;


 * 将小文件合并到大文件的单词计数驱动程序类。
public class CombineSmallFilesDriver extends Configured implements Tool {

	public static void main(String[] args) throws Exception {
		long beginTime = System.currentTimeMillis();
		System.exit(ToolRunner.run(new Configuration(), new CombineSmallFilesDriver(), args));
		long elapsedTime = System.currentTimeMillis() - beginTime;
		System.out.println("elapsed time(millis): " + elapsedTime);

	/**
	 * Builds a job from the tool configuration, adds the jars under {@code /lib/} through
	 * {@code HadoopUtil}, and sets {@code args[0]} as input path and {@code args[1]} as output
	 * path.
	 *
	 * <p>NOTE(review): this listing looks truncated. The statements that should follow the
	 * "input format", "output key/value types", "mapper/reducer classes" and "submit" comments
	 * are missing, as is the closing brace. As shown, the job is never submitted and the method
	 * returns {@code 0} unconditionally - confirm against the original source.
	 *
	 * @param args {@code args[0]} is the input path, {@code args[1]} the output path
	 * @return always {@code 0} in the visible code
	 * @throws Exception declared by the signature
	 */
	public int run(String[] args) throws Exception {
		System.out.println("input path = " + args[0]);
		System.out.println("output path = " + args[1]);

		Configuration conf = getConf();
		Job job = new Job(conf);

		// Add all jar files to the HDFS distributed cache
		HadoopUtil.addJarsToDistributedCache(job, "/lib/");

		// Define the input file format (statement missing in this listing)

		// Define the output key and value types (statements missing in this listing)

		// Define the map and reduce classes (statements missing in this listing)
		// job.setNumReduceTasks(13);

		// Define the input/output paths
		Path inputPath = new Path(args[0]);
		Path outputPath = new Path(args[1]);
		FileInputFormat.addInputPath(job, inputPath);
		FileOutputFormat.setOutputPath(job, outputPath);

		// Submit the job and wait for completion (statement missing in this listing)
		return 0;

 * Wordcount Mapper
public class WordCountMapper extends Mapper {

	final static IntWritable one = new IntWritable(1);
	private Text word = new Text();

	public void map(PairOfStringLong key, Text value, Context context) throws IOException, InterruptedException {
		String line = value.toString().trim();
		String[] tokens = StringUtils.split(line, " ");
		for (String tok : tokens) {
			context.write(word, one);

 * Wordcount Reduce
public class WordCountReducer extends Reducer {

	public void reduce(Text key, Iterable values, Context context)
			throws IOException, InterruptedException {
		int sum = 0;
		for (IntWritable val : values) {
			sum += val.get();
		context.write(key, new IntWritable(sum));



