Merging Small Files with MapReduce

1. Several ways to merge small files:

1. At data-collection time, have the client merge small files (or small batches of data) into larger files before uploading them to HDFS.

2. Before business processing, run a MapReduce job on HDFS to merge the small files.

3. During MapReduce processing, use a combine-style input format such as CombineTextInputFormat to improve efficiency (a driver sketch follows this list).
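
A minimal sketch of option 3, assuming Hadoop's built-in CombineTextInputFormat; the input/output paths (read from args) and the 4 MB split cap are illustrative assumptions, and the mapper/reducer setup is omitted:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class CombineSmallFilesDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "combineSmallFiles");
        job.setJarByClass(CombineSmallFilesDriver.class);

        // Pack many small files into a few large splits instead of one split per file
        job.setInputFormatClass(CombineTextInputFormat.class);
        // Upper bound on each combined split, in bytes (about 4 MB here; tune to your data)
        CombineTextInputFormat.setMaxInputSplitSize(job, 4 * 1024 * 1024L);

        // Mapper, reducer and key/value classes would be configured here as in a normal job
        CombineTextInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}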

2. Merging small files with a custom InputFormat

Subclass FileInputFormat so that small files are not split, and have it return a custom RecordReader. Because splitting is disabled, the custom RecordReader can read the entire content of each small file and write it into a BytesWritable, which is used as v1 (the map input value).

In the driver class, pass the custom FileInputFormat to job.setInputFormatClass, and use SequenceFileOutputFormat on the output side so the results are written as a binary SequenceFile.

3. Custom FileInputFormat implementation


import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

import java.io.IOException;

/**
 * Custom InputFormat that reads each small file as a whole
 */
public class WholeFileInputFormat extends FileInputFormat<NullWritable, BytesWritable> {

    /**
     * Whether the input can be split; small files are read whole, so splitting is disabled
     * @param context
     * @param filename
     * @return
     */
    @Override
    protected boolean isSplitable(JobContext context, Path filename) {
        return false;
    }

    @Override
    public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        WholeFileRecordReader reader = new WholeFileRecordReader();
        reader.initialize(inputSplit,taskAttemptContext);
        return reader;
    }
}

4. Custom RecordReader implementation


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

/**
 * Custom RecordReader that reads an entire small file and wraps its content in a BytesWritable.
 * Core logic:
 * nextKeyValue() reads the data and builds the key and value;
 * getCurrentKey() and getCurrentValue() return the key and value built there.
 */
public class WholeFileRecordReader extends RecordReader<NullWritable, BytesWritable> {

    private FileSplit fileSplit;
    private Configuration configuration;
    // v1: the value handed to the mapper (the whole file content)
    private BytesWritable bytesWritable = new BytesWritable();
    // true once the single file backing this split has been read
    private boolean processed = false;

    /**
     * Initialization
     * @param inputSplit the file split; from it we can locate the file and read its content into a byte array
     * @param taskAttemptContext context object; the job configuration is available through it
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
        this.fileSplit = (FileSplit) inputSplit;
        this.configuration = taskAttemptContext.getConfiguration();
    }

    /**
     * Reads the next key/value pair.
     * Returns true if a key/value pair was produced (the whole file is read on the first call),
     * and false once there is nothing left to read.
     * @return
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (!processed) {
            // WholeFileInputFormat.isSplitable() returns false, so this split covers the whole file
            // and getLength() is the full file size. Read the entire content into a byte array of that size.
            byte[] fileContent = new byte[(int) fileSplit.getLength()];
            Path path = fileSplit.getPath();
            // file:/// local file system, hdfs:// HDFS
            // Get the file system this path belongs to
            FileSystem fileSystem = path.getFileSystem(configuration);
            FSDataInputStream dataInputStream = null;
            try {
                // Open an input stream for the file
                dataInputStream = fileSystem.open(path);
                // Read the whole small file into the byte array and wrap it in the BytesWritable
                IOUtils.readFully(dataInputStream, fileContent, 0, (int) fileSplit.getLength());
                bytesWritable.set(fileContent, 0, fileContent.length);
            } finally {
                IOUtils.closeStream(dataInputStream);
            }
            // Mark the file as processed so the next call returns false
            processed = true;
            return true;
        }
        return false;
    }

    /**
     * Returns k1 (the key); always NullWritable here
     * @return
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    public NullWritable getCurrentKey() throws IOException, InterruptedException {
        return NullWritable.get();
    }

    /**
     * Returns v1 (the value): the file content wrapped in a BytesWritable
     * @return
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return bytesWritable;
    }

    /**
     * Reading progress: 1.0 once the file has been processed, otherwise 0.0
     * @return
     * @throws IOException
     * @throws InterruptedException
     */
    @Override
    public float getProgress() throws IOException, InterruptedException {
        return processed ? 1.0f : 0.0f;
    }

    /**
     * Called after reading is finished; nothing to release here, since the stream is closed in nextKeyValue()
     * @throws IOException
     */
    @Override
    public void close() throws IOException {

    }
}

5. Custom Mapper

import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

import java.io.IOException;

/**
 * Mapper for merging small files: emits (file name, file bytes)
 */
public class OwnInputFormatMapper extends Mapper<NullWritable, BytesWritable, Text, BytesWritable> {

    @Override
    protected void map(NullWritable key, BytesWritable value, Context context) throws IOException, InterruptedException {
        FileSplit split = (FileSplit) context.getInputSplit();
        String fileName = split.getPath().getName();
        context.write(new Text(fileName),value);
    }
}

6. Driver program


import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Merge small files.
 * Uses the custom FileInputFormat and RecordReader implementations:
 * splitting is disabled, each small file is read whole, and its content is wrapped in a BytesWritable as v1
 */
public class OwnInputFormatMain extends Configured implements Tool {
    @Override
    public int run(String[] args) throws Exception {
        Job job = Job.getInstance(super.getConf(), "ownInputFormat");
        job.setInputFormatClass(WholeFileInputFormat.class);
        WholeFileInputFormat.addInputPath(job,new Path("file:///E:\\java\\input"));
        job.setMapperClass(OwnInputFormatMapper.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);


        // Set the job output types; no custom reducer is set, and without these the defaults would be LongWritable and Text
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);

        // Write the job output as a binary SequenceFile via SequenceFileOutputFormat
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        SequenceFileOutputFormat.setOutputPath(job,new Path("file:///E:\\java\\xucjInout"));
        boolean b = job.waitForCompletion(true);


        return b?0:1;
    }

    public static void main(String[] args) throws Exception {
        int run = ToolRunner.run(new Configuration(), new OwnInputFormatMain(), args);
        System.exit(run);
    }
}
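
To check the merged result, the SequenceFile written by the job can be read back with Hadoop's SequenceFile.Reader. The sketch below assumes the output directory used in the driver above and the default single reducer, so the output file is typically named part-r-00000; adjust the path to your environment.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class MergedSequenceFileReader {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Assumed output file of the job above; adjust as needed
        Path path = new Path("file:///E:\\java\\xucjInout\\part-r-00000");
        try (SequenceFile.Reader reader = new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))) {
            Text key = new Text();
            BytesWritable value = new BytesWritable();
            while (reader.next(key, value)) {
                // key = original file name, value = the raw bytes of that file
                System.out.println(key + " -> " + value.getLength() + " bytes");
            }
        }
    }
}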
