Total order sorting of large data sets in Hadoop

Key points:

Data sampling: InputSampler.RandomSampler
Partitioning: TotalOrderPartitioner
Custom InputFormat: MyKeyValueTextInputFormat

Other total-sort examples found online use KeyValueTextInputFormat directly. When I tried that, the final output was sorted lexicographically (dictionary order), not truly totally sorted by numeric value. The likely reason is that the default KeyValueTextInputFormat produces keys of type Text, so the sample (partition) file it generates also contains Text keys, and the map output key is Text as well; the partitioning of the map output is then governed by Text's byte-wise ordering and cannot produce a numeric total order. I therefore wrote my own MyKeyValueTextInputFormat, which emits keys of type LongWritable.
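To see the difference concretely, here is a minimal standalone sketch (the class name KeyOrderDemo and the sample values are just for illustration; it is not part of the job code): as Text, "1000" compares before "9" because comparison is byte by byte, while as LongWritable the numeric order is preserved.

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

public class KeyOrderDemo {
    public static void main(String[] args) {
        // Text compares byte by byte: '1' < '9', so "1000" sorts before "9".
        Text a = new Text("1000");
        Text b = new Text("9");
        System.out.println("Text:         1000 vs 9 -> " + a.compareTo(b));   // negative

        // LongWritable compares numeric values: 1000 sorts after 9, as expected.
        LongWritable x = new LongWritable(1000);
        LongWritable y = new LongWritable(9);
        System.out.println("LongWritable: 1000 vs 9 -> " + x.compareTo(y));   // positive
    }
}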

The code for MyKeyValueTextInputFormat and its record reader is as follows:
// Emits LongWritable keys (instead of Text) so that sampling and partitioning
// compare keys numerically rather than lexicographically.
public class MyKeyValueTextInputFormat extends FileInputFormat<LongWritable, Text> {

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit inputSplit, TaskAttemptContext context)
            throws IOException, InterruptedException {
        context.setStatus(inputSplit.toString());
        return new MyKeyValueLineRecordReader(context.getConfiguration());
    }

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        // Splittable unless the file is compressed with a codec that does not support splitting.
        CompressionCodec codec = new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
        return null == codec ? true : codec instanceof SplittableCompressionCodec;
    }
}


import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.RecordReader;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

// Like KeyValueLineRecordReader, but parses the part of each line before the separator
// into a LongWritable key so that sampling and partitioning compare keys numerically.
public class MyKeyValueLineRecordReader extends RecordReader<LongWritable, Text> {
    public static final String KEY_VALUE_SEPERATOR = "mapreduce.input.keyvaluelinerecordreader.key.value.separator";
    private final LineRecordReader lineRecordReader = new LineRecordReader();
    private byte separator = (byte) '\t';   // default key/value separator is TAB
    private Text innerValue;
    private LongWritable key;
    private Text value;

    public MyKeyValueLineRecordReader(Configuration conf) throws IOException {
        String sepStr = conf.get(KEY_VALUE_SEPERATOR, "\t");
        this.separator = (byte) sepStr.charAt(0);
    }

    public Class<LongWritable> getKeyClass() {
        return LongWritable.class;
    }

    @Override
    public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
        this.lineRecordReader.initialize(genericSplit, context);
    }

    // Returns the index of the first occurrence of the separator within the line, or -1 if absent.
    public static int findSeparator(byte[] utf, int start, int length, byte sep) {
        for (int i = start; i < start + length; ++i) {
            if (utf[i] == sep) {
                return i;
            }
        }
        return -1;
    }

    // Everything before the separator becomes the numeric key, everything after it the value.
    // A line with no separator is treated as key only.
    public static void setKeyValue(LongWritable key, Text value, Text line, int lineLen, int pos) {
        if (pos == -1) {
            key.set(Long.parseLong(line.toString().trim()));
            value.set("");
        } else {
            key.set(Long.parseLong(line.toString().substring(0, pos).trim()));
            value.set(line.toString().substring(pos + 1));
        }
    }

    @Override
    public synchronized boolean nextKeyValue() throws IOException {
        while (this.lineRecordReader.nextKeyValue()) {
            this.innerValue = this.lineRecordReader.getCurrentValue();
            int lineLen = this.innerValue.getLength();
            if (lineLen == 0) {
                continue;   // skip blank lines instead of ending the split early
            }
            if (this.key == null) {
                this.key = new LongWritable();
            }
            if (this.value == null) {
                this.value = new Text();
            }
            int pos = findSeparator(this.innerValue.getBytes(), 0, lineLen, this.separator);
            setKeyValue(this.key, this.value, this.innerValue, lineLen, pos);
            return true;
        }
        return false;
    }

    @Override
    public LongWritable getCurrentKey() {
        return this.key;
    }

    @Override
    public Text getCurrentValue() {
        return this.value;
    }

    @Override
    public float getProgress() throws IOException {
        return this.lineRecordReader.getProgress();
    }

    @Override
    public synchronized void close() throws IOException {
        this.lineRecordReader.close();
    }
}
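The record reader takes its separator from the standard mapreduce.input.keyvaluelinerecordreader.key.value.separator property (the KEY_VALUE_SEPERATOR constant above), defaulting to TAB. If the input used a different delimiter, it could be set on the configuration before building the job; a minimal sketch, with the comma purely an assumed example:

import org.apache.hadoop.conf.Configuration;

public class SeparatorConfigExample {
    public static void main(String[] args) {
        // Hypothetical: switch the key/value separator from the default TAB to a comma
        // before constructing the Job that uses MyKeyValueTextInputFormat.
        Configuration conf = new Configuration();
        conf.set(MyKeyValueLineRecordReader.KEY_VALUE_SEPERATOR, ",");
        System.out.println("separator = " + conf.get(MyKeyValueLineRecordReader.KEY_VALUE_SEPERATOR));
    }
}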
 
  
 
  
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class TotalSortV3 extends Configured implements Tool {

    // Passes the numeric key through; the key is also written as the value so the reducer can emit it.
    static class SimpleMapper extends Mapper<LongWritable, Text, LongWritable, LongWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(key, key);
        }
    }

    // Emits every value (duplicates included) in key order: one sorted number per output line.
    static class SimpleReducer extends Reducer<LongWritable, LongWritable, LongWritable, NullWritable> {
        @Override
        protected void reduce(LongWritable key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            for (LongWritable value : values) {
                context.write(value, NullWritable.get());
            }
        }
    }

    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, "Total Order Sorting");
        job.setJarByClass(TotalSortV3.class);

        // Do not use the default TextInputFormat: it makes the byte offset of each line the key,
        // so the sampler would sample offsets instead of the line contents we want to sort.
        job.setInputFormatClass(MyKeyValueTextInputFormat.class);

        FileInputFormat.addInputPath(job, new Path("hdfs://192.168.12.150:9000/datasort/input/hadoopsortdata100000.txt"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.12.150:9000/datasort/output3/result"));
        job.setNumReduceTasks(10);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(NullWritable.class);

        // Sample the input and write the partition file that TotalOrderPartitioner will read.
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path("hdfs://192.168.12.150:9000/datasort/output3/_partition"));
        InputSampler.Sampler<LongWritable, Text> sampler = new InputSampler.RandomSampler<LongWritable, Text>(0.01, 1000, 100);
        InputSampler.writePartitionFile(job, sampler);

        job.setPartitionerClass(TotalOrderPartitioner.class);
        job.setMapperClass(SimpleMapper.class);
        job.setReducerClass(SimpleReducer.class);

        job.setJobName("TotalSortV3");

        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new TotalSortV3(), args);
        System.exit(exitCode);
    }
}
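After the job finishes, a quick way to confirm the result really is a total order is to read the reducer outputs in part-file order and check that the numbers never decrease. A minimal verification sketch under the paths used above (the class name CheckTotalOrder is just for illustration; it assumes the default TextOutputFormat, so each output line is a single number):

import java.io.BufferedReader;
import java.io.InputStreamReader;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class CheckTotalOrder {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("fs.defaultFS", "hdfs://192.168.12.150:9000");
        FileSystem fs = FileSystem.get(conf);

        // part-r-00000 .. part-r-00009 cover ascending key ranges, so reading them
        // in file-name order should yield one globally non-decreasing sequence.
        FileStatus[] parts = fs.globStatus(new Path("/datasort/output3/result/part-r-*"));
        long prev = Long.MIN_VALUE;
        for (FileStatus part : parts) {
            try (BufferedReader in = new BufferedReader(new InputStreamReader(fs.open(part.getPath())))) {
                String line;
                while ((line = in.readLine()) != null) {
                    long cur = Long.parseLong(line.trim());
                    if (cur < prev) {
                        System.out.println("NOT totally sorted: " + cur + " follows " + prev + " in " + part.getPath());
                        return;
                    }
                    prev = cur;
                }
            }
        }
        System.out.println("Output is totally sorted.");
    }
}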

 
  
Generating the test data:
[hadoop@master ~]$ cat hadoopsortdata100000.sh
#!/bin/bash
i=1
while ((i<=100000))
do
 echo $RANDOM >> hadoopsortdata100000.txt
 let  i++ 
done
