Sorting is one of the most important operations in the MapReduce framework. Both MapTask and ReduceTask sort data by key. This is Hadoop's default behavior: every application's data gets sorted, whether or not the application logic actually needs it.
By default, Hadoop sorts keys in lexicographic (dictionary) order, and the in-memory sort it uses is quicksort.
How many times does a Hadoop job sort its data?
On the map side, the MapTask first places its output in a circular memory buffer. When buffer usage reaches a certain threshold, the buffered data is quick-sorted and spilled to disk as a sorted file; after all input has been processed, the MapTask merge-sorts all the spill files on disk.
On the reduce side, the ReduceTask copies its share of the output from every MapTask. A copied file larger than a certain threshold is spilled to disk, otherwise it is kept in memory; when the number of on-disk files reaches a threshold, they are merged into a larger sorted file. Once all data has been copied, the ReduceTask performs one final merge sort over everything in memory and on disk.
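These thresholds are configurable per job. A minimal sketch, assuming Hadoop 2.x property names (the values shown are the usual defaults; verify them against your own version):

// Set on the job Configuration in the driver, before the Job is created
Configuration configuration = new Configuration();
configuration.set("mapreduce.task.io.sort.mb", "100");         // size of the map-side ring buffer, in MB
configuration.set("mapreduce.map.sort.spill.percent", "0.80"); // buffer fill ratio that triggers a spill
configuration.set("mapreduce.task.io.sort.factor", "10");      // max number of streams merged at once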
To use a custom bean as the key, the bean must implement the WritableComparable interface and override compareTo; the framework then sorts records by that comparison. The SortBean below sorts by age in descending order.
package com.xing.MapReduce.Sort;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class SortBean implements WritableComparable<SortBean> {
    private String name;
    private String className;
    private long age;

    public SortBean() {
    }

    public SortBean(String name, String className, long age) {
        this.name = name;
        this.className = className;
        this.age = age;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public String getClassName() {
        return className;
    }

    public void setClassName(String className) {
        this.className = className;
    }

    public long getAge() {
        return age;
    }

    public void setAge(long age) {
        this.age = age;
    }

    // Sort by age in descending order; returning 0 for equal ages keeps the comparator contract intact
    @Override
    public int compareTo(SortBean sortBean) {
        return Long.compare(sortBean.getAge(), this.age);
    }

    // Serialization
    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(name);
        dataOutput.writeUTF(className);
        dataOutput.writeLong(age);
    }

    // Deserialization: fields must be read in the same order they were written
    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.name = dataInput.readUTF();
        this.className = dataInput.readUTF();
        this.age = dataInput.readLong();
    }

    @Override
    public String toString() {
        return name + "\t" + className + "\t" + age;
    }
}
SortDriver
package com.xing.MapReduce.Sort;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class SortDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        System.setProperty("hadoop.home.dir", "E:\\hadoop-2.7.1");
        String in = "E:\\hdfs\\data\\sort\\input\\demo.txt";
        String out = "E:\\hdfs\\data\\sort\\output";
        Path inPath = new Path(in);
        Path outPath = new Path(out);

        Configuration configuration = new Configuration();
        FileSystem fs = FileSystem.get(configuration);
        if (!fs.exists(inPath) || !fs.isFile(inPath)) {
            System.err.println("input file does not exist: " + in);
            System.exit(-1);
        }
        // The output directory must not already exist, otherwise the job fails
        if (fs.exists(outPath)) {
            if (fs.delete(outPath, true)) {
                System.out.println("deleted existing output directory");
            } else {
                System.err.println("failed to delete output directory");
                System.exit(-1);
            }
        }

        Job job = Job.getInstance(configuration);
        job.setJobName("sort");
        job.setJarByClass(SortDriver.class);
        job.setMapperClass(SortMapper.class);
        job.setReducerClass(SortReducer.class);
        job.setMapOutputKeyClass(SortBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(SortBean.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, inPath);
        FileOutputFormat.setOutputPath(job, outPath);

        boolean rel = job.waitForCompletion(true);
        if (rel) {
            System.out.println("success");
        }
    }
}
SortMapper
package com.xing.MapReduce.Sort;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class SortMapper extends Mapper<LongWritable, Text, SortBean, NullWritable> {
    private SortBean k = new SortBean();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Each input line is expected to be: name \t className \t age
        String[] split = value.toString().split("\t");
        k.setName(split[0]);
        k.setClassName(split[1]);
        k.setAge(Long.valueOf(split[2]));
        System.out.println("############" + k.toString());
        context.write(k, NullWritable.get());
    }
}
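The input file itself is not shown in the original post. Judging from how the mapper splits each line, demo.txt would be tab-separated name, class name, and age; a purely hypothetical example (names chosen to match the partitioner shown later):

小明	class1	20
王强	class2	23
刘能	class1	19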
SortReducer
package com.xing.MapReduce.Sort;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class SortReducer extends Reducer<SortBean, NullWritable, SortBean, NullWritable> {
    @Override
    protected void reduce(SortBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        System.out.println("$$$$$$$$$$$$$" + key.toString());
        // Write one output record per value so duplicate beans are preserved
        for (NullWritable value : values) {
            context.write(key, value);
        }
    }
}
Partition sort is total sort plus a custom partitioner: records are first routed to partitions by a rule, and each partition's output file is then internally sorted. The partitioner below routes records by the first character of the name; see the registration sketch after the class for wiring it into the driver.
SortPatition
package com.xing.MapReduce.Sort;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class SortPatition extends Partitioner<SortBean, NullWritable> {
    @Override
    public int getPartition(SortBean sortBean, NullWritable nullWritable, int numPartitions) {
        // Route records to one of four partitions by the first character of the name
        String name = sortBean.getName();
        if (name.startsWith("小")) {
            return 0;
        } else if (name.startsWith("王")) {
            return 1;
        } else if (name.startsWith("刘")) {
            return 2;
        } else {
            return 3;
        }
    }
}
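The driver shown earlier does not register this partitioner; for it to take effect, the job also needs a matching number of reduce tasks. A minimal sketch of the two extra lines to add in SortDriver before waitForCompletion:

// Use the custom partitioner and run one ReduceTask per partition (getPartition returns 0..3)
job.setPartitionerClass(SortPatition.class);
job.setNumReduceTasks(4);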
Secondary sort, also called grouping sort, is applied on the reduce side: after the ReduceTask's merge sort, a grouping comparator decides which consecutive records belong to the same group and are passed together to a single reduce() call.
Requirement: sort the orders below by order id ascending and price descending, and end up with the maximum price for each order.
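The original post does not include the input file. Based on how OrderMapper reads split[0] (order id) and split[2] (price), a plausible tab-separated demo.txt would look like this (values are purely hypothetical):

0000001	Pdt_01	222.8
0000001	Pdt_05	25.8
0000002	Pdt_03	522.8
0000002	Pdt_04	122.4
0000003	Pdt_06	232.8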
OrderBean (custom bean)
package com.xing.MapReduce.GroupingComparatorDemo;

import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class OrderBean implements WritableComparable<OrderBean> {
    private String id;
    private double money;

    public OrderBean() {
    }

    public OrderBean(String id, double money) {
        this.id = id;
        this.money = money;
    }

    public String getId() {
        return id;
    }

    public void setId(String id) {
        this.id = id;
    }

    public double getMoney() {
        return money;
    }

    public void setMoney(double money) {
        this.money = money;
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeUTF(id);
        dataOutput.writeDouble(money);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        id = dataInput.readUTF();
        money = dataInput.readDouble();
    }

    // Secondary sort: order id ascending first, then money descending within the same id
    @Override
    public int compareTo(OrderBean orderBean) {
        if (id.equals(orderBean.getId())) {
            return Double.compare(orderBean.getMoney(), this.money);
        } else {
            return Long.compare(Long.parseLong(id), Long.parseLong(orderBean.getId()));
        }
    }

    @Override
    public String toString() {
        return id + "\t" + money;
    }
}
OrderDriver (driver class)
package com.xing.MapReduce.GroupingComparatorDemo;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class OrderDriver {
    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        System.setProperty("hadoop.home.dir", "E:\\hadoop-2.7.1");
        Configuration configuration = new Configuration();
        FileSystem fileSystem = FileSystem.get(configuration);
        Path in = new Path("E:\\hdfs\\data\\group\\input\\demo.txt");
        Path out = new Path("E:\\hdfs\\data\\group\\output");

        Job job = Job.getInstance(configuration);
        job.setJobName("test");
        job.setJarByClass(OrderDriver.class);
        job.setMapperClass(OrderMapper.class);
        job.setReducerClass(OrderReduce.class);
        job.setMapOutputKeyClass(OrderBean.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(OrderBean.class);
        job.setOutputValueClass(NullWritable.class);

        // Remove any existing output directory so the job can be rerun
        if (fileSystem.exists(out)) {
            fileSystem.delete(out, true);
        }
        FileInputFormat.setInputPaths(job, in);
        FileOutputFormat.setOutputPath(job, out);

        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
OrderMapper
package com.xing.MapReduce.GroupingComparatorDemo;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

public class OrderMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable> {
    private OrderBean orderBean = new OrderBean();

    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        System.out.println("@@@ value:" + value.toString());
        // split[0] = order id, split[2] = price; the middle column is not used
        String[] split = value.toString().split("\t");
        orderBean.setId(split[0]);
        orderBean.setMoney(Double.parseDouble(split[2]));
        context.write(orderBean, NullWritable.get());
    }
}
OrderReduce
package com.xing.MapReduce.GroupingComparatorDemo;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;

public class OrderReduce extends Reducer<OrderBean, NullWritable, OrderBean, NullWritable> {
    @Override
    protected void reduce(OrderBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
        // Write the key once per group; the values are not iterated
        context.write(key, NullWritable.get());
    }
}
Output (screenshot in the original post):
Looking at the output, no grouping has happened: the key is a whole bean, and beans with the same id still differ in money, so they never compare as equal and are not aggregated into one group.
How do we get grouping? By extending WritableComparator with a custom grouping comparator.
Steps:
OrderComparator (custom grouping comparator)
package com.xing.MapReduce.GroupingComparatorDemo;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class OrderComparator extends WritableComparator {
    protected OrderComparator() {
        // The second argument must be true so the parent class creates OrderBean
        // instances to deserialize into before calling compare()
        super(OrderBean.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        // Records with the same id are treated as the same key, i.e. the same reduce group
        OrderBean orderBeana = (OrderBean) a;
        OrderBean orderBeanb = (OrderBean) b;
        // Return 0 when the ids are equal (same group); any other value otherwise
        return orderBeana.getId().compareTo(orderBeanb.getId());
    }
}
Finally, register the grouping comparator on the job (in OrderDriver, before submitting the job):
job.setGroupingComparatorClass(OrderComparator.class);
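With this in place, all records that share an order id are merged into a single reduce group. Because the secondary sort already put the highest price first within each id, the key passed into reduce() initially holds that record, so the reducer's single context.write per group emits exactly the maximum price for each order.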