1. WholeFileRecordReader.java
package com.jiepu.mr;

import java.io.IOException;

import org.apache.hadoop.fs.FSDataInputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;

/**
 * A RecordReader that emits exactly one record per input file:
 * the key is NullWritable and the value is the entire file content
 * as a BytesWritable.
 */
public class WholeFileRecordReader extends RecordReader<NullWritable, BytesWritable> {

    private FileSplit fileSplit;
    private JobContext jobContext;
    private NullWritable currentKey = NullWritable.get();
    private BytesWritable currentValue;
    private boolean finishConverting = false;

    @Override
    public void initialize(InputSplit split, TaskAttemptContext context)
            throws IOException, InterruptedException {
        this.fileSplit = (FileSplit) split;
        this.jobContext = context;
        // Stash the file name so the mapper can use it as the output key.
        context.getConfiguration().set("map.input.file", fileSplit.getPath().getName());
    }

    @Override
    public boolean nextKeyValue() throws IOException, InterruptedException {
        if (!finishConverting) {
            // Read the whole file into memory in one shot.
            currentValue = new BytesWritable();
            int len = (int) fileSplit.getLength();
            byte[] content = new byte[len];
            Path file = fileSplit.getPath();
            FileSystem fs = file.getFileSystem(jobContext.getConfiguration());
            FSDataInputStream in = null;
            try {
                in = fs.open(file);
                IOUtils.readFully(in, content, 0, len);
                currentValue.set(content, 0, len);
            } finally {
                IOUtils.closeStream(in);
            }
            finishConverting = true;
            return true;
        }
        return false;
    }

    @Override
    public NullWritable getCurrentKey() throws IOException, InterruptedException {
        return currentKey;
    }

    @Override
    public BytesWritable getCurrentValue() throws IOException, InterruptedException {
        return currentValue;
    }

    @Override
    public float getProgress() throws IOException, InterruptedException {
        // Either the whole file has been read (1.0) or nothing has (0.0).
        return finishConverting ? 1.0f : 0.0f;
    }

    @Override
    public void close() throws IOException {
        // Nothing to release: the input stream is closed in nextKeyValue().
    }
}
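The record reader above turns each input file into a single (NullWritable, BytesWritable) pair and stores the file name in the configuration under map.input.file. The driver below plugs it into a job that maps each pair to (file name, file bytes) and writes the combined result out as SequenceFiles.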
2. WholeCombinedSmallfiles.java

package com.jiepu.mr;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;

public class WholeCombinedSmallfiles {

    /** Emits (file name, file bytes) for each whole-file record. */
    static class WholeSmallfilesMapper extends Mapper<NullWritable, BytesWritable, Text, BytesWritable> {

        private Text file = new Text();

        @Override
        protected void map(NullWritable key, BytesWritable value, Context context)
                throws IOException, InterruptedException {
            // The file name was stored here by WholeFileRecordReader.initialize().
            String filename = context.getConfiguration().get("map.input.file");
            file.set(filename);
            context.write(file, value);
        }
    }

    /** Passes every (file name, file bytes) pair through unchanged. */
    static class IdentityReducer extends Reducer<Text, BytesWritable, Text, BytesWritable> {

        @Override
        protected void reduce(Text key, Iterable<BytesWritable> values, Context context)
                throws IOException, InterruptedException {
            for (BytesWritable value : values) {
                context.write(key, value);
            }
        }
    }

    /** An InputFormat that treats each file as a single, unsplittable record. */
    static class WholeFileInputFormat extends FileInputFormat<NullWritable, BytesWritable> {

        @Override
        public RecordReader<NullWritable, BytesWritable> createRecordReader(InputSplit split,
                TaskAttemptContext context) throws IOException, InterruptedException {
            RecordReader<NullWritable, BytesWritable> recordReader = new WholeFileRecordReader();
            recordReader.initialize(split, context);
            return recordReader;
        }

        @Override
        protected boolean isSplitable(JobContext context, Path file) {
            // Never split a file: the whole content must land in one record.
            return false;
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherArgs.length != 2) {
            System.err.println("Usage: WholeCombinedSmallfiles <in> <out>");
            System.exit(2);
        }

        Job job = new Job(conf, "WholeCombinedSmallfiles");
        job.setJarByClass(WholeCombinedSmallfiles.class);
        job.setMapperClass(WholeSmallfilesMapper.class);
        job.setReducerClass(IdentityReducer.class);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(BytesWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(BytesWritable.class);
        job.setInputFormatClass(WholeFileInputFormat.class);
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        job.setNumReduceTasks(5);

        FileInputFormat.addInputPath(job, new Path(otherArgs[0]));
        Path outpath = new Path(otherArgs[1]);
        FileSystem fs = outpath.getFileSystem(conf);
        if (fs.exists(outpath)) {
            // Remove a stale output directory so the job can be rerun.
            fs.delete(outpath, true);
        }
        FileOutputFormat.setOutputPath(job, outpath);

        int exitflag = job.waitForCompletion(true) ? 0 : 1;
        System.exit(exitflag);
    }
}
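With five reduce tasks, the job leaves five SequenceFiles (part-r-00000 through part-r-00004) under the output directory. To check the result, the files can be read back with the standard SequenceFile.Reader API. The following is a minimal verification sketch, not part of the original job; the class name ReadCombinedSmallfiles and the single command-line argument are assumptions for illustration.

package com.jiepu.mr;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

public class ReadCombinedSmallfiles {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // args[0] should point at one reducer output, e.g. <out>/part-r-00000 (assumed path).
        Path seqFile = new Path(args[0]);
        FileSystem fs = seqFile.getFileSystem(conf);
        SequenceFile.Reader reader = new SequenceFile.Reader(fs, seqFile, conf);
        try {
            Text key = new Text();
            BytesWritable value = new BytesWritable();
            // Each record is one original small file: its name and its raw bytes.
            while (reader.next(key, value)) {
                System.out.println(key + "\t" + value.getLength() + " bytes");
            }
        } finally {
            reader.close();
        }
    }
}

The keys come back as file names and the values as raw file contents, so this sketch only prints sizes; restoring the original files would just mean writing value.getBytes(), trimmed to value.getLength(), back to disk under each key's name.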