Knowledge points:
Data sampling: InputSampler.RandomSampler
Partitioning: TotalOrderPartitioner
Custom InputFormat: MyKeyValueTextInputFormat
Other similar programs posted online use KeyValueTextInputFormat directly. When I tried that, I found the result was sorted lexicographically rather than in true total (numeric) order. The likely cause: the default KeyValueTextInputFormat produces keys of type Text, so the generated sample file also holds Text data and the map output key is Text as well; when the map output is partitioned, it is bound by Text's byte-wise ordering and cannot achieve a numeric total order. So I wrote my own MyKeyValueTextInputFormat that emits keys of type LongWritable.
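To see the difference concretely, here is a minimal sketch (my own illustration, not part of the original program) showing that Text compares raw bytes while LongWritable compares numeric values:

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

public class KeyOrderDemo {
    public static void main(String[] args) {
        // Text compares bytes: '1' < '9', so "100" sorts before "9".
        System.out.println(new Text("100").compareTo(new Text("9")));             // negative
        // LongWritable compares values: 100 > 9, as a numeric sort requires.
        System.out.println(new LongWritable(100).compareTo(new LongWritable(9))); // positive
    }
}

With Text keys, TotalOrderPartitioner would route "100" to an earlier partition than "9", which is exactly the lexicographic result described above.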
The code is as follows:
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.*;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class MyKeyValueTextInputFormat extends FileInputFormat<LongWritable, Text> {

    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit inputSplit,
            TaskAttemptContext context) throws IOException, InterruptedException {
        context.setStatus(inputSplit.toString());
        return new MyKeyValueLineRecordReader(context.getConfiguration());
    }

    @Override
    protected boolean isSplitable(JobContext context, Path file) {
        // Only split uncompressed files or files using a splittable compression codec.
        CompressionCodec codec =
            new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
        return codec == null || codec instanceof SplittableCompressionCodec;
    }
}
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.LineRecordReader;

public class MyKeyValueLineRecordReader extends RecordReader<LongWritable, Text> {

    public static final String KEY_VALUE_SEPERATOR =
        "mapreduce.input.keyvaluelinerecordreader.key.value.separator";

    private final LineRecordReader lineRecordReader = new LineRecordReader();
    private byte separator = (byte) '\t';
    private Text innerValue;
    private LongWritable key;
    private Text value;

    public MyKeyValueLineRecordReader(Configuration conf) throws IOException {
        String sepStr = conf.get(KEY_VALUE_SEPERATOR, "\t");
        this.separator = (byte) sepStr.charAt(0);
    }

    public Class<?> getKeyClass() {
        // The key is the numeric line content, not Text.
        return LongWritable.class;
    }

    @Override
    public void initialize(InputSplit genericSplit, TaskAttemptContext context) throws IOException {
        this.lineRecordReader.initialize(genericSplit, context);
    }

    public static int findSeparator(byte[] utf, int start, int length, byte sep) {
        for (int i = start; i < start + length; i++) {
            if (utf[i] == sep) {
                return i;
            }
        }
        return -1;
    }

    public static void setKeyValue(LongWritable key, Text value, Text line, int lineLen, int pos) {
        if (pos == -1) {
            // No separator: the whole line is the numeric key.
            key.set(Long.parseLong(line.toString()));
            value.set("");
        } else {
            // Split at the separator: key before it, value after it.
            // (lineLen is a byte length; for the ASCII digits used here it equals the char index.)
            key.set(Long.parseLong(line.toString().substring(0, pos)));
            value.set(line.toString().substring(pos + 1));
        }
    }

    @Override
    public synchronized boolean nextKeyValue() throws IOException {
        if (!this.lineRecordReader.nextKeyValue()) {
            return false;
        }
        this.innerValue = this.lineRecordReader.getCurrentValue();
        if (this.innerValue == null || this.innerValue.toString().isEmpty()) {
            return false; // stop on blank lines, matching the original behavior
        }
        int lineLen = this.innerValue.getLength();
        if (this.key == null) {
            this.key = new LongWritable();
        }
        if (this.value == null) {
            this.value = new Text();
        }
        int pos = findSeparator(this.innerValue.getBytes(), 0, lineLen, this.separator);
        setKeyValue(this.key, this.value, this.innerValue, lineLen, pos);
        return true;
    }

    @Override
    public LongWritable getCurrentKey() {
        return this.key;
    }

    @Override
    public Text getCurrentValue() {
        return this.value;
    }

    @Override
    public float getProgress() throws IOException {
        return this.lineRecordReader.getProgress();
    }

    @Override
    public synchronized void close() throws IOException {
        this.lineRecordReader.close();
    }
}
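Since findSeparator and setKeyValue are static, the split logic can be sanity-checked locally without a MapReduce context. A quick hypothetical check (my own snippet, not from the original post):

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;

public class SplitLogicCheck {
    public static void main(String[] args) {
        Text line = new Text("12345\tfoo");          // a line with a tab separator
        LongWritable key = new LongWritable();
        Text value = new Text();
        int pos = MyKeyValueLineRecordReader.findSeparator(
            line.getBytes(), 0, line.getLength(), (byte) '\t');
        MyKeyValueLineRecordReader.setKeyValue(key, value, line, line.getLength(), pos);
        System.out.println(key + " / " + value);     // expected: 12345 / foo

        Text bare = new Text("67890");               // a line with no separator
        pos = MyKeyValueLineRecordReader.findSeparator(
            bare.getBytes(), 0, bare.getLength(), (byte) '\t');
        MyKeyValueLineRecordReader.setKeyValue(key, value, bare, bare.getLength(), pos);
        System.out.println(key + " / " + value);     // expected: 67890 / (empty value)
    }
}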
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class TotalSortV3 extends Configured implements Tool {

    static class SimpleMapper extends Mapper<LongWritable, Text, LongWritable, LongWritable> {
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            context.write(key, key);
        }
    }

    static class SimpleReducer extends Reducer<LongWritable, LongWritable, LongWritable, NullWritable> {
        @Override
        protected void reduce(LongWritable key, Iterable<LongWritable> values, Context context)
                throws IOException, InterruptedException {
            for (LongWritable value : values) {
                context.write(value, NullWritable.get());
            }
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = getConf();
        Job job = Job.getInstance(conf, "Total Order Sorting");
        job.setJarByClass(TotalSortV3.class);
        // We cannot use the default TextInputFormat: it makes the byte offset the key,
        // so the sampler would sample offsets. We need to sample the line contents instead.
        job.setInputFormatClass(MyKeyValueTextInputFormat.class);
        FileInputFormat.addInputPath(job,
            new Path("hdfs://192.168.12.150:9000/datasort/input/hadoopsortdata100000.txt"));
        FileOutputFormat.setOutputPath(job,
            new Path("hdfs://192.168.12.150:9000/datasort/output3/result"));
        job.setNumReduceTasks(10);
        job.setMapOutputKeyClass(LongWritable.class);
        job.setMapOutputValueClass(LongWritable.class);
        job.setOutputKeyClass(LongWritable.class);
        job.setOutputValueClass(NullWritable.class);
        TotalOrderPartitioner.setPartitionFile(job.getConfiguration(),
            new Path("hdfs://192.168.12.150:9000/datasort/output3/_partition"));
        InputSampler.Sampler<LongWritable, Text> sampler =
            new InputSampler.RandomSampler<LongWritable, Text>(0.01, 1000, 100);
        InputSampler.writePartitionFile(job, sampler);
        job.setPartitionerClass(TotalOrderPartitioner.class);
        job.setMapperClass(SimpleMapper.class);
        job.setReducerClass(SimpleReducer.class);
        job.setJobName("TotalSortV3");
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        int exitCode = ToolRunner.run(new TotalSortV3(), args);
        System.exit(exitCode);
    }
}
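For reference, the three RandomSampler constructor arguments control the cost/accuracy trade-off of sampling; annotated below with the values used above (my own annotation of the standard Hadoop API):

InputSampler.Sampler<LongWritable, Text> sampler =
    new InputSampler.RandomSampler<LongWritable, Text>(
        0.01,   // freq: each record is picked with probability 0.01
        1000,   // numSamples: keep at most 1000 sampled keys in total
        100);   // maxSplitsSampled: examine at most 100 input splits while sampling

The sampled keys are sorted and written to the _partition file, from which TotalOrderPartitioner derives the 9 boundary keys separating the 10 reducers, so each reducer's locally sorted output concatenates into one globally sorted result.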
Test data generation:
[hadoop@master ~]$ cat hadoopsortdata100000.sh
#!/bin/bash
# Append 100,000 random integers ($RANDOM is 0..32767), one per line.
i=1
while ((i <= 100000))
do
    echo $RANDOM >> hadoopsortdata100000.txt
    let i++
done