1. At data collection time, have the client merge small files (or small batches of data) into larger files before uploading them to HDFS.
2. Before business processing, run a MapReduce job on HDFS to merge the small files.
3. During MapReduce processing, use CombineTextInputFormat so that one map task handles many small files (see the driver sketch below).
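For option 3, a minimal driver sketch looks like the following. The class name, input/output arguments, and the 4 MB split limit are hypothetical; the point is only to show where CombineTextInputFormat is configured so that several small files end up in one split.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class CombineSmallFilesDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "combineSmallFiles");
        //pack many small text files into one split so a single map task reads several files
        job.setInputFormatClass(CombineTextInputFormat.class);
        //hypothetical limit: at most 4 MB of input per split
        CombineTextInputFormat.setMaxInputSplitSize(job, 4 * 1024 * 1024);
        CombineTextInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        //map-only identity job: fewer splits means fewer map tasks and fewer output files
        job.setNumReduceTasks(0);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}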
The example below merges small files by implementing a custom FileInputFormat that marks small files as non-splittable and reads them with a custom RecordReader. Because splitting is disabled, the RecordReader can read the entire content of each small file, write it into a BytesWritable, and hand that BytesWritable to the mapper as the input value (v1).
In the driver class, pass the custom FileInputFormat to job.setInputFormatClass, and use SequenceFileOutputFormat so the merged result is written as a binary SequenceFile.
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import java.io.IOException;
/**
* Custom InputFormat that reads each small file as a whole
*/
public class WholeFileInputFormat extends FileInputFormat<NullWritable, BytesWritable> {
/**
* Whether the file can be split: small files are read whole, so splitting is disabled
* @param context
* @param filename
* @return false, so each small file becomes exactly one split
*/
@Override
protected boolean isSplitable(JobContext context, Path filename) {
return false;
}
@Override
public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
WholeFileRecordReader reader = new WholeFileRecordReader();
reader.initialize(inputSplit,taskAttemptContext);
return reader;
}
}
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
/**
* Custom RecordReader that reads a small file in one go and wraps its content in a BytesWritable.
* Core RecordReader logic:
* nextKeyValue reads the data and builds the key and value;
* getCurrentKey and getCurrentValue return the key and value built there.
*/
public class WholeFileRecordReader extends RecordReader<NullWritable, BytesWritable> {
private FileSplit fileSplit;
private Configuration configuration;
//the value (v1) handed to the mapper
private BytesWritable bytesWritable = new BytesWritable();
//flag: true once the whole file has been read
private boolean nextKeyValue = false;
/**
* Initialization
* @param inputSplit the file split; from it we get the file and can turn its content into a byte array
* @param taskAttemptContext context object; the job configuration is taken from it
* @throws IOException
* @throws InterruptedException
*/
@Override
public void initialize(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException {
this.fileSplit = (FileSplit) inputSplit;
this.configuration = taskAttemptContext.getConfiguration();
}
/**
* Reads the next key/value pair
* Returns true if a key/value pair was produced (here, the whole file content on the first call);
* returns false once there is nothing left to read, so each file is processed exactly once
* @return
* @throws IOException
* @throws InterruptedException
*/
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
if(!nextKeyValue){
//WholeFileInputFormat.isSplitable returns false, so the split length is the full file size
//read the whole file into a byte array sized to the file length, then wrap it in the BytesWritable
byte[] fileContent = new byte[(int) fileSplit.getLength()];
Path path = fileSplit.getPath();
// file:/// is the local file system, hdfs:// is HDFS
//get the file system for this path
FileSystem fileSystem = path.getFileSystem(configuration);
FSDataInputStream dataInputStream = null;
try {
//open an input stream for the file
dataInputStream = fileSystem.open(path);
//read the whole stream into the byte array
IOUtils.readFully(dataInputStream,fileContent,0, (int) fileSplit.getLength());
bytesWritable.set(fileContent,0,fileContent.length);
}finally {
IOUtils.closeStream(dataInputStream);
}
//mark the file as read so the next call returns false
nextKeyValue = true;
return nextKeyValue;
}
return false;
}
/**
* Returns k1: NullWritable, since no key is needed here
* @return
* @throws IOException
* @throws InterruptedException
*/
@Override
public NullWritable getCurrentKey() throws IOException, InterruptedException {
return NullWritable.get();
}
/**
* Returns v1: the BytesWritable holding the file content read in nextKeyValue
* @return
* @throws IOException
* @throws InterruptedException
*/
@Override
public BytesWritable getCurrentValue() throws IOException, InterruptedException {
return bytesWritable;
}
/**
* Progress of reading the file
* @return
* @throws IOException
* @throws InterruptedException
*/
@Override
public float getProgress() throws IOException, InterruptedException {
return nextKeyValue ? 1.0f : 0.0f;
}
/**
* Release resources when reading is done; nothing to do here, the stream is already closed in nextKeyValue
* @throws IOException
*/
@Override
public void close() throws IOException {
}
}
5: Custom Mapper
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import java.io.IOException;
/**
* Mapper for merging small files
*/
public class OwnInputFormatMapper extends Mapper<NullWritable, BytesWritable, Text, BytesWritable> {
@Override
protected void map(NullWritable key, BytesWritable value, Context context) throws IOException, InterruptedException {
//use the source file name as the output key so each merged record can be traced back to its file
FileSplit split = (FileSplit) context.getInputSplit();
String fileName = split.getPath().getName();
context.write(new Text(fileName),value);
}
}
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
* Driver: merge small files
* Uses the custom FileInputFormat and RecordReader implementations
* Small files are not split; each file's content is wrapped in a BytesWritable as the map input value (v1)
*/
public class OwnInputFormatMain extends Configured implements Tool{
@Override
public int run(String[] args) throws Exception {
Job job = Job.getInstance(super.getConf(), "ownInputFormat");
job.setInputFormatClass(WholeFileInputFormat.class);
WholeFileInputFormat.addInputPath(job,new Path("file:///E:\\java\\input"));
job.setMapperClass(OwnInputFormatMapper.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(BytesWritable.class);
//set the job output types; there is no reducer, but the defaults would otherwise be LongWritable/Text
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(BytesWritable.class);
//write the result as a binary SequenceFile: SequenceFileOutputFormat
job.setOutputFormatClass(SequenceFileOutputFormat.class);
SequenceFileOutputFormat.setOutputPath(job,new Path("file:///E:\\java\\xucjInout"));
boolean b = job.waitForCompletion(true);
return b?0:1;
}
public static void main(String[] args) throws Exception {
int run = ToolRunner.run(new Configuration(), new OwnInputFormatMain(), args);
System.exit(run);
}
}
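To check the merged result, the SequenceFile can be read back with SequenceFile.Reader. The sketch below is a minimal, hypothetical inspector: it assumes the output directory from the driver above and that the first map output file has the usual name part-m-00000.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
public class MergedFileInspector {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        //part-m-00000 is the usual name of the first map output file; adjust if needed
        Path path = new Path("file:///E:\\java\\xucjInout\\part-m-00000");
        try (SequenceFile.Reader reader =
                     new SequenceFile.Reader(conf, SequenceFile.Reader.file(path))) {
            Text key = new Text();                      //original file name
            BytesWritable value = new BytesWritable();  //original file content
            while (reader.next(key, value)) {
                System.out.println(key + " : " + value.getLength() + " bytes");
            }
        }
    }
}
Each record printed corresponds to one small file that was merged: the key is the file name written by the mapper, the value its full content.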