Implementing Secondary Sort in MapReduce

SecondarySortMR

1. Use case: MapReduce sorts keys by default, but sometimes the values need to be sorted as well. One option is to collect the values in the reduce phase and sort them there; with a very large number of values, however, that can cause out-of-memory errors. This is exactly the scenario for secondary sort: fold the sorting of values into the MapReduce computation itself (the shuffle) instead of doing it as a separate step.

2. Requirement:
name money
zhangsan 125
lisi 135
wangwu 60
zhangsan 56
wangwu 80
lisi 650
zhangsan 50
wangwu 6
lisi 900

Secondary sort: the primary sort orders the records by name (alphabetically);
the secondary sort then orders each person's records by the amount spent (ascending, as in the expected result below).

Implementation idea: promote the value into the key so the shuffle sorts on both fields
key#value (composite key, here (name, money))    value (here money)

Expected result:
lisi 135
lisi 650
lisi 900
wangwu 6
wangwu 60
wangwu 80
zhangsan 50
zhangsan 56
zhangsan 125

Construct (name, money) as the key and money as the value, then pass them into the map phase.
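For instance, the record "zhangsan&lt;TAB&gt;125" becomes the following map output. This is an illustrative sketch only, assuming the PairWritable class defined below and org.apache.hadoop.io.IntWritable; the Mapper further down does exactly this:

// split one tab-separated line into fields
String[] fields = "zhangsan\t125".split("\t");
// the composite key carries both name and money, so the shuffle sorts on both
PairWritable mapOutputKey = new PairWritable(fields[0], Integer.parseInt(fields[1]));
// the value is still just the money
IntWritable mapOutputValue = new IntWritable(Integer.parseInt(fields[1]));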

Create a bean class with name and money fields and override hashCode, equals, and compareTo; in compareTo, compare by name first and then by money.

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/*
 * name money
 * A custom key usually implements WritableComparable so that it can take part in the shuffle's partitioning, sorting, and grouping.
 */
public class PairWritable implements WritableComparable<PairWritable>{
    private String name;
    private int money;

    public PairWritable() {
    }
    public PairWritable(String name, int money) {
        super();
        this.name = name;
        this.money = money;
    }
    public void  set (String name,int money) {
        this.name = name;
        this.money = money;
    }
    public String getName() {
        return name;
    }
    public void setName(String name) {
        this.name = name;
    }
    public int getMoney() {
        return money;
    }
    public void setMoney(int money) {
        this.money = money;
    }
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(name);
        out.writeInt(money);
    }
    @Override
    public void readFields(DataInput in) throws IOException {
        this.name = in.readUTF();
        this.money = in.readInt();
    }
    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + money;
        result = prime * result + ((name == null) ? 0 : name.hashCode());
        return result;
    }
    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (getClass() != obj.getClass())
            return false;
        PairWritable other = (PairWritable) obj;
        if (money != other.money)
            return false;
        if (name == null) {
            if (other.name != null)
                return false;
        } else if (!name.equals(other.name))
            return false;
        return true;
    }
    @Override
    public int compareTo(PairWritable o) {
        // compare the first field (name) in ascending order
        int comp = this.name.compareTo(o.getName());
        if(0 != comp){
            return comp;
        }
        // compare the second field (money) in ascending order, matching the expected result above
        return Integer.compare(this.money, o.getMoney());
    }
    @Override
    public String toString() {
        return "PairWritable [name=" + name + ", money=" + money + "]";
    }
}
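Before wiring PairWritable into a job, the ordering defined by compareTo can be checked locally. The following is a minimal stand-alone sketch (the class name OrderCheck is made up for illustration) that sorts a few of the sample records in memory with Collections.sort, which relies on the same compareTo:

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class OrderCheck {
    public static void main(String[] args) {
        List<PairWritable> pairs = new ArrayList<PairWritable>();
        pairs.add(new PairWritable("zhangsan", 125));
        pairs.add(new PairWritable("lisi", 650));
        pairs.add(new PairWritable("lisi", 135));
        pairs.add(new PairWritable("wangwu", 6));
        // sorts by name ascending, then money ascending (see compareTo above)
        Collections.sort(pairs);
        for (PairWritable p : pairs) {
            System.out.println(p.getName() + "\t" + p.getMoney());
        }
        // expected order: lisi 135, lisi 650, wangwu 6, zhangsan 125
    }
}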

SecondaryMR (driver, mapper, and reducer)

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;

public class SecondaryMR extends Configured implements Tool{

    public static class SecondaryMapper
            extends Mapper<LongWritable, Text, PairWritable, IntWritable> {
        private PairWritable mapOutputKey = new PairWritable();
        private IntWritable mapOutputValue = new IntWritable();
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // split the line into a String array (the fields are tab-separated)
            String[] strs = value.toString().split("\t");
            // populate the reusable output key and value objects
            mapOutputKey.set(strs[0], Integer.valueOf(strs[1]));
            mapOutputValue.set(Integer.valueOf(strs[1]));
            context.write(mapOutputKey, mapOutputValue);
        }
    }

        public static class SecondaryReducer
                extends Reducer<PairWritable, IntWritable, Text, IntWritable> {

            private Text outputKey  = new Text(); 
            @Override
            protected void reduce(PairWritable key, Iterable<IntWritable> values,
                    Context context) throws IOException, InterruptedException {
                // debug output: print the composite key
                System.out.print(key + "\t");
                // Option 1: recover name and money from the composite key
                /*outputKey=new Text(key.getName());
                context.write(outputKey, new IntWritable(key.getMoney()));
                */
                // Option 2: iterate over the values
                for(IntWritable value :values){
                    outputKey.set(key.getName());
                    context.write(outputKey, value);
                    System.out.print(value+"\t");
                }

            }

        }

        /**
         * Driver (environment, input/output paths, parallelism)
         * @param args
         * @return
         * @throws IOException 
         * @throws InterruptedException 
         * @throws ClassNotFoundException 
         */
        public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException{
            //1. load the configuration
            Configuration conf = new Configuration();
            //      conf.set("yarn.resourcemanager.hostname","linux1");
            //      conf.set("mapreduce.framework.name", "yarn");
            //      conf.set("yarn.nodemanager.aux-services", "mapreduce_shuffle");

            //2. create the job
            Job job= Job.getInstance(conf,this.getClass().getSimpleName());
            job.setJarByClass(this.getClass());

            //3. set the job parameters
            //input -> map -> reduce -> output
            //3.1 input
            Path inPath = new Path(args[0]);
            FileInputFormat.addInputPath(job, inPath);

            //3.2 map class
            job.setMapperClass(SecondaryMapper.class);
            job.setMapOutputKeyClass(PairWritable.class);
            job.setMapOutputValueClass(IntWritable.class);

            //3.3 reduce class
            job.setReducerClass(SecondaryReducer.class);
            job.setOutputKeyClass(Text.class);
            job.setOutputValueClass(IntWritable.class);

            //3.4 output
            Path outPath = new Path(args[1]);
            FileSystem fs = outPath.getFileSystem(conf);
            // delete the output directory if it already exists so the job can be rerun
            if(fs.exists(outPath)){
                fs.delete(outPath,true);
            }

            FileOutputFormat.setOutputPath(job, outPath);

            //4. submit the job and wait for completion
            boolean isSuccess= job.waitForCompletion(true);
            return isSuccess ? 0 : 1;

        }

        public static void main(String [] args) throws ClassNotFoundException, IOException, InterruptedException{
            // hard-coded HDFS paths; note these override any command-line arguments
            args = new String[]{"hdfs://ns1/input/Secondary.txt",
            "hdfs://ns1/output"};

            // run the driver
            int status = new SecondaryMR().run(args);
            System.exit(status);
        }
    }
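Since the driver already extends Configured and implements Tool, an optional alternative is to submit it through ToolRunner (this additionally needs the import org.apache.hadoop.util.ToolRunner). ToolRunner parses generic Hadoop options such as -D settings before delegating to run(); a minimal sketch of a main method written that way, replacing the one above:

    // optional alternative entry point: ToolRunner handles generic options, then calls run(args)
    public static void main(String[] args) throws Exception {
        int status = ToolRunner.run(new Configuration(), new SecondaryMR(), args);
        System.exit(status);
    }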
