MapReduce Secondary Sort: Partitioner and Grouping Comparator Optimization

Custom grouping comparator
NameGroup

package test;

import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.WritableComparator;

public class NameGroup implements RawComparator<ConsumeWritable> {

    // Object-level comparison: group by name only, ignoring money.
    public int compare(ConsumeWritable o1, ConsumeWritable o2) {
        return o1.getName().compareTo(o2.getName());
    }

    /**
     * Byte-level comparison of two serialized composite keys.
     * ConsumeWritable.write() emits the name first (writeUTF) and the 4-byte float last,
     * so comparing all but the last 4 bytes compares only the serialized names.
     * b1/b2: byte arrays holding the serialized keys
     * s1/s2: offset of each key inside its array
     * l1/l2: length of each serialized key
     */
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        return WritableComparator.compareBytes(b1, s1, l1 - 4, b2, s2, l2 - 4);
    }

}
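
The key trick is that sorting and grouping use different comparators: ConsumeWritable.compareTo() (shown next) orders by name and then money, while NameGroup treats every key with the same name as equal. A minimal local check, assuming the classes live in package test as above (the class name GroupVsSortCheck is made up for illustration):

package test;

// Two keys with the same name but different money are *different* for sorting
// (compareTo != 0) yet *equal* for grouping (NameGroup.compare == 0); that is what
// lets one reduce() call receive all money values for a name, already sorted.
public class GroupVsSortCheck {
    public static void main(String[] args) {
        ConsumeWritable a = new ConsumeWritable("zhangsan", 35.00f);
        ConsumeWritable b = new ConsumeWritable("zhangsan", 135.00f);
        System.out.println("sort compare:  " + a.compareTo(b));                // negative: a sorts before b
        System.out.println("group compare: " + new NameGroup().compare(a, b)); // 0: same reduce group
    }
}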

ConsumeWritable

package test;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class ConsumeWritable implements WritableComparable<ConsumeWritable> {
    private  String name;
    private float money;

    public ConsumeWritable() {}


    public ConsumeWritable(String name, float money) {
        super();
        this.name = name;
        this.money = money;
    }
    // Convenience setter, following the set() pattern used by Hadoop's built-in Writable types
    public void  set(String  name,float money){
        this.name=name;
        this.money=money;
    }

    public String getName() {
        return name;
    }


    public void setName(String name) {
        this.name = name;
    }


    public float getMoney() {
        return money;
    }


    public void setMoney(float money) {
        this.money = money;
    }

    // Serialization: write name, then money
    public void write(DataOutput out) throws IOException {
        out.writeUTF(name);
        out.writeFloat(money);

    }
    // Deserialization: read the fields in the same order they were written
    public void readFields(DataInput in) throws IOException {
        name=in.readUTF();
        money=in.readFloat();
    }

    public int compareTo(ConsumeWritable o) {
        // First comparison: by name
        int compareTo = this.getName().compareTo(o.getName());
        if (compareTo != 0) {
            return compareTo;
        }
        // Second comparison: primitives have no compareTo(), so compare via the Float wrapper
        return Float.valueOf(this.getMoney()).compareTo(Float.valueOf(o.getMoney()));
    }

    // Because key objects are compared with each other, equals() and hashCode() must be overridden
    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + Float.floatToIntBits(money);
        result = prime * result + ((name == null) ? 0 : name.hashCode());
        return result;
    }


    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null)
            return false;
        if (getClass() != obj.getClass())
            return false;
        ConsumeWritable other = (ConsumeWritable) obj;
        if (Float.floatToIntBits(money) != Float.floatToIntBits(other.money))
            return false;
        if (name == null) {
            if (other.name != null)
                return false;
        } else if (!name.equals(other.name))
            return false;
        return true;
    }



    @Override
    public String toString() {
        return name + "," + money;
    }



}
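
Why NameGroup can safely drop the last 4 bytes in its byte-level compare: write() always serializes the name first via writeUTF (a 2-byte length prefix followed by the UTF-8 bytes) and the float last as exactly 4 bytes. A standalone sketch that reproduces the same layout outside Hadoop (the class name KeyLayoutCheck is made up for illustration):

import java.io.ByteArrayOutputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class KeyLayoutCheck {
    public static void main(String[] args) throws IOException {
        // Serialize a key the same way ConsumeWritable.write() does.
        ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        DataOutputStream out = new DataOutputStream(buffer);
        out.writeUTF("zhangsan");   // 2-byte length prefix + 8 ASCII bytes
        out.writeFloat(135.00f);    // always exactly 4 bytes
        byte[] bytes = buffer.toByteArray();

        System.out.println("total bytes:  " + bytes.length);        // 14
        System.out.println("name portion: " + (bytes.length - 4));  // 10 = everything except the float
    }
}

The name portion varies in length from key to key, but the trailing float is always 4 bytes, which is exactly what the l1 - 4 and l2 - 4 arguments rely on.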

Driver program:



package test;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Partitioner;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
/**
 * Main idea: the shuffle phase sorts records by key, so packing name and money
 * into a composite key (ConsumeWritable) is what makes the secondary sort possible.
 * @author Administrator
 */
public class SecondSortMapReduce extends Configured implements Tool{

    // Map phase: parse each line into a composite key (name, money) and a FloatWritable value
    public static class  SecondSortMapper extends Mapper<LongWritable, Text, ConsumeWritable, FloatWritable>{
        private ConsumeWritable mapOutPutKey = new ConsumeWritable();
        private FloatWritable mapOutPutValue= new FloatWritable();
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {

            // Convert the incoming Text value to a String
            String line = value.toString();
            // Split on the tab separator
            String[] split = line.split("\t");

            mapOutPutKey.set(split[0], Float.valueOf(split[1]));
            mapOutPutValue.set(Float.parseFloat(split[1]));
            System.err.print("key: "+mapOutPutKey.toString());
            System.err.print("->value: "+mapOutPutValue+"\n");
            context.write(mapOutPutKey, mapOutPutValue);

        }


    }

    // Partitioner: its type parameters are the map output key/value types
    public static class MyPartitoner  extends Partitioner<ConsumeWritable, FloatWritable>{

        @Override
        public int getPartition(ConsumeWritable key, FloatWritable value,
                int numPartitions) {
            // Modeled on the HashPartitioner source, but hashing only the name so all records for a name share one partition
            return (key.getName().hashCode() & Integer.MAX_VALUE) % numPartitions;
        }

    }
    public static class SecondSortReducer extends Reducer<ConsumeWritable, FloatWritable, Text, FloatWritable>{
        private  Text  OutPutKey =new Text();
        private  FloatWritable OutPutValue = new FloatWritable();
        @Override
        protected void reduce(ConsumeWritable key, Iterable<FloatWritable> values, Context context) throws IOException, InterruptedException {
            System.out.print("key:"+key.toString()+"["+"value:");
            OutPutKey.set(key.getName());
            for (FloatWritable floatWritable : values) {
                System.out.print(floatWritable+",");
                OutPutValue.set(floatWritable.get());
                context.write(OutPutKey, OutPutValue);
            }
            System.out.println("]"+"\n");
        }

    }




    public int run(String[] args) throws Exception {
        // 1. Create the Configuration object and load the Hadoop configuration files
        Configuration conf = new Configuration();
        // 2. Build the MapReduce Job object
        Job job = Job.getInstance(conf, this.getClass().getSimpleName());

        job.setJarByClass(getClass());

        // 3. Input directory/file (input) -> map -> reduce -> output path (output)
        // 3.1 Set the input path
        Path inPath = new Path(args[0]);
        FileInputFormat.setInputPaths(job, inPath);

        // 3.2 Set the map output classes
        job.setMapperClass(SecondSortMapper.class);
        job.setMapOutputKeyClass(ConsumeWritable.class);
        job.setMapOutputValueClass(FloatWritable.class);
        // Custom partitioner
        job.setPartitionerClass(MyPartitoner.class);
        // Custom grouping comparator
        job.setGroupingComparatorClass(NameGroup.class);
        // 3.3 Set the reduce output classes
        job.setReducerClass(SecondSortReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FloatWritable.class);

        // 3.4 Set the output path, deleting it first if it already exists
        Path outPath = new Path(args[1]);
        FileSystem fs = outPath.getFileSystem(conf);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        // Submit the job.
        // waitForCompletion(true) prints detailed progress; job.submit() cannot do that.
        boolean isSuccessed = job.waitForCompletion(true);
        // job.submit(); // not recommended

        return isSuccessed ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // For local testing the command-line arguments are overridden with fixed HDFS paths
        args = new String[] {
                "hdfs://hive01:8020/input/ceshi.txt",
                "hdfs://hive01:8020/outputtest1"
        };
        int status = ToolRunner.run(conf, new SecondSortMapReduce(), args);

        System.exit(status);
    }

}
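
The partitioner and the grouping comparator work together: MyPartitoner hashes only the name, so every record for one name reaches the same reducer, and NameGroup then merges those records into a single reduce() call. A minimal local check, assuming three reducers and the hypothetical class name PartitionCheck:

package test;

import org.apache.hadoop.io.FloatWritable;

// Two composite keys that share a name always receive the same partition index,
// no matter what the money field is, because only key.getName() feeds the hash.
public class PartitionCheck {
    public static void main(String[] args) {
        SecondSortMapReduce.MyPartitoner partitioner = new SecondSortMapReduce.MyPartitoner();
        int numPartitions = 3; // hypothetical reducer count
        int p1 = partitioner.getPartition(new ConsumeWritable("zhangsan", 135.00f),
                new FloatWritable(135.00f), numPartitions);
        int p2 = partitioner.getPartition(new ConsumeWritable("zhangsan", 35.00f),
                new FloatWritable(35.00f), numPartitions);
        System.out.println(p1 + " == " + p2); // same index
    }
}

The job itself expects tab-separated input lines of the form name<TAB>money; each reducer then writes one name/money pair per line, with the money values for a given name appearing in ascending order thanks to ConsumeWritable.compareTo().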
