Sorting Big Data with Hadoop MapReduce

1. We know that MapReduce is naturally suited to sorting because it has a shuffle phase. When the data volume is small we can simply set the number of reducers to 1 and get a sorted result, but when the data is large a single reducer either cannot cope or takes far too long, so we have to rethink the sort and use several reducers.
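
A minimal sketch of the single-reducer variant for the small-data case: the class name SingleReducerSortJob is hypothetical, it reuses the SortMapper and SortReducer from the full listing in step 3, and it assumes it lives in the same package as the SortJob class shown there. Without the custom comparator the keys come out in the default ascending order.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SingleReducerSortJob {

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf, "single-reducer sort");
		job.setJarByClass(SingleReducerSortJob.class);
		job.setMapperClass(SortJob.SortMapper.class);
		job.setReducerClass(SortJob.SortReducer.class);
		job.setOutputKeyClass(IntWritable.class);
		job.setOutputValueClass(NullWritable.class);
		// One reducer: the shuffle delivers all keys, already sorted, to a single
		// reduce task, so its lone output file part-r-00000 is globally sorted.
		job.setNumReduceTasks(1);
		FileInputFormat.setInputPaths(job, new Path(args[1]));
		FileOutputFormat.setOutputPath(job, new Path(args[0]));
		System.exit(job.waitForCompletion(true) ? 0 : 1);
	}
}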

2. Suppose our data looks like this, with one number per line:

6
1
56
43
65
15
54
93
47
56
24
65
90
93
57
25
95
36

  The map reads the file line by line and emits each value directly as the key to the reducer. Since there is no real value to carry, we can use NullWritable in its place, so the map output looks roughly like this (using the sample data above):

(6, null) (1, null) (56, null) (43, null) ...

3. Now the most important step: based on the number of reducers we have to decide how to partition the keys, i.e. write a custom partition function so that the results fall into separate ranges. For example, suppose I decide that values greater than 50 belong in one range and that there are 3 reducers in total; the data then falls into three ranges: values greater than 50 go straight to partition 0, values between 25 and 50 go to partition 1, and values below 25 go to partition 2. Because the number of partitions equals the number of reducers, each partition maps to its own reducer: partitions are numbered from 0, so partition 0 is handled by the first reducer, partition 1 by the second, and so on. Each reducer in turn produces one output file, so the first reducer writes part-r-00000, the second writes part-r-00001, and so on; the reducer itself only needs to write each key straight back out. As a result the largest values end up in the first output file, and the global order is simply the order of the file names. We also need a custom Comparator so that each reducer's own output is sorted (here in descending order). The code is as follows:

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SortJob {

	public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
		if (args.length < 2) {
			System.out.println("参数数量不对,至少两个以上参数:<数据文件输出路径>、<输入路径...>");
			System.exit(1);
		}
		String dataOutput = args[0];
		String[] inputs = new String[args.length - 1];
		System.arraycopy(args, 1, inputs, 0, inputs.length);

		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf, "sort test");
		job.setJarByClass(SortJob.class);
		job.setMapperClass(SortMapper.class);
		job.setReducerClass(SortReducer.class);
		job.setSortComparatorClass(SortComparator.class);
		job.setPartitionerClass(SortPartitioner.class);

		job.setNumReduceTasks(3);

		job.setOutputKeyClass(IntWritable.class);
		job.setOutputValueClass(NullWritable.class);

		Path[] inputPathes = new Path[inputs.length];
		for (int i = 0; i < inputs.length; i++) {
			inputPathes[i] = new Path(inputs[i]);
		}
		Path outputPath = new Path(dataOutput);
		FileInputFormat.setInputPaths(job, inputPathes);
		FileOutputFormat.setOutputPath(job, outputPath);
		job.waitForCompletion(true);
	}

	static class SortMapper extends Mapper<LongWritable, Text, IntWritable, NullWritable> {

		@Override
		public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
			// Emit the number itself as the key; the shuffle will sort the keys for us.
			IntWritable in = new IntWritable(Integer.parseInt(value.toString()));
			context.write(in, NullWritable.get());
		}
	}

	static class SortReducer extends Reducer<IntWritable, NullWritable, IntWritable, NullWritable> {

		@Override
		public void reduce(IntWritable key, Iterable<NullWritable> values, Context context) throws IOException,
				InterruptedException {
			// Write the key back out once per occurrence so duplicate numbers are preserved.
			for (NullWritable value : values) {
				System.out.println(key.toString() + value.toString());
				context.write(key, NullWritable.get());
			}
		}

	}

	static class SortPartitioner extends Partitioner<IntWritable, NullWritable> {

		@Override
		public int getPartition(IntWritable key, NullWritable value, int numReduceTasks) {
			int maxValue = 100;
			int keySection = 0;

			// Only compute a section when there is more than one reducer and the key is
			// below maxValue; otherwise the key goes straight to partition 0.
			if (numReduceTasks > 1 && key.get() < maxValue) {
				int sectionValue = maxValue / (numReduceTasks - 1);
				int count = 0;
				while ((key.get() - sectionValue * count) > sectionValue) {
					count++;
				}
				keySection = numReduceTasks - 1 - count;
			}

			return keySection;
		}

	}

	static class SortComparator extends WritableComparator {

		protected SortComparator() {
			super(IntWritable.class, true);
		}

		@Override
		public int compare(WritableComparable a, WritableComparable b) {
			// Negate the default ascending comparison so that keys sort in descending order.
			return -super.compare(a, b);
		}

	}
}
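
To make the partition boundaries concrete, here is a small standalone sketch, not part of the job itself: PartitionDemo is a hypothetical class, assumed to sit in the same package as SortJob, and it simply prints which partition, and therefore which output file, some of the sample values fall into, given the maxValue of 100 used in the partitioner above.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;

public class PartitionDemo {

	public static void main(String[] args) {
		SortJob.SortPartitioner partitioner = new SortJob.SortPartitioner();
		int numReduceTasks = 3;
		// With maxValue = 100 and 3 reducers, each section is 100 / (3 - 1) = 50 wide:
		// values of 100 or more land in partition 0, values from 51 to 99 in partition 1,
		// and values of 50 or less in partition 2.
		int[] samples = { 6, 15, 47, 56, 65, 93 };
		for (int v : samples) {
			int partition = partitioner.getPartition(new IntWritable(v), NullWritable.get(), numReduceTasks);
			System.out.println(v + " -> partition " + partition + " -> part-r-0000" + partition);
		}
	}
}

Because partition 0 holds the largest values and the comparator makes each reducer's output descending, reading part-r-00000, part-r-00001 and part-r-00002 one after another yields the whole data set in descending order.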

