MapReduce TopN: Multiple Implementations

Test data (each line is a name and a score, separated by a single space):

key 1
value 3
aa 4
deng 5
haha 8
tt 8

1. Implementing TopN with a TreeMap

Each mapper keeps a local TreeMap of at most K entries keyed by score and emits its local top K in cleanup(); the reducer then merges these partial results and emits the global top K.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.util.TreeMap;

public class TopN {

    public static final int K = 3; // number of top records to emit

//    public static class MyIntWritable extends IntWritable {
//
//        @Override
//        public int compareTo(IntWritable o) {
//            return -super.compareTo(o);  // reverse IntWritable's default ascending order
//        }
//    }


    public static class MyMapper extends Mapper<LongWritable, Text, Text, IntWritable> {

        // score -> name, kept sorted ascending by score so firstKey() is always the smallest
        private final TreeMap<Integer, String> map = new TreeMap<>();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] arr = value.toString().split(" ");

            int score = Integer.parseInt(arr[1]);

            map.put(score, arr[0]);

            if (map.size() > K) {
                map.remove(map.firstKey()); // evict the smallest entry so only the local top K remain
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // called once when this mapper finishes its split; emit its local top K here
            for (Integer score : map.keySet()) {
                context.write(new Text(map.get(score)), new IntWritable(score));
            }
        }
    }

    public static class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {

        // score -> name, merged from every mapper's local top K
        private final TreeMap<Integer, String> map = new TreeMap<>();

        @Override
        protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            map.put(values.iterator().next().get(), key.toString()); // the TreeMap keeps entries sorted by score

            if (map.size() > K) {
                map.remove(map.firstKey()); // keep only the global top K
            }
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            // emit the merged global top K (iterated in ascending score order)
            for (Integer score : map.keySet()) {
                context.write(new Text(map.get(score)), new IntWritable(score));
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf);

        job.setJarByClass(TopN.class);

        job.setMapperClass(MyMapper.class);

        job.setReducerClass(MyReducer.class);

        job.setOutputKeyClass(Text.class);

        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));

        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // a non-zero exit status indicates the job did not complete successfully
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
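
To run the job on a cluster, something like the following works once the class is packaged into a jar. The jar name and the input/output paths below are placeholders, not from the original post:

hadoop jar topn.jar TopN /input/scores.txt /output/topn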

A caveat: because the TreeMap is keyed by score, records with equal scores collide and only one of them survives in the output.
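
The collision is plain TreeMap behavior and can be reproduced outside MapReduce. The sketch below is illustrative only (class and variable names are made up); it also shows one possible workaround, keeping a list of names per score:

import java.util.ArrayList;
import java.util.List;
import java.util.TreeMap;

public class TreeMapOverwriteDemo {
    public static void main(String[] args) {
        // Same pattern as MyMapper: the score is the key, so equal scores collide.
        TreeMap<Integer, String> single = new TreeMap<>();
        single.put(8, "haha");
        single.put(8, "tt");          // overwrites "haha"
        System.out.println(single);   // {8=tt}

        // One workaround: keep every name that shares a score in a list.
        TreeMap<Integer, List<String>> grouped = new TreeMap<>();
        grouped.computeIfAbsent(8, k -> new ArrayList<>()).add("haha");
        grouped.computeIfAbsent(8, k -> new ArrayList<>()).add("tt");
        System.out.println(grouped);  // {8=[haha, tt]}
    }
}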

2. Implementing TopN with a custom WritableComparable

Here the mapper emits the score itself as the key, a subclass of IntWritable reverses compareTo so that the shuffle sorts keys in descending order, and the reducer simply writes out the first K records it receives.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

public class TopN_2 {

    public static final int K = 3; // number of top records to emit

    public static class MyIntWritable extends IntWritable {

        public MyIntWritable() {
        }

        public MyIntWritable(int value) {
            super(value);
        }

        @Override
        public int compareTo(IntWritable o) {
            return -super.compareTo(o);  // reverse the default ascending order so larger scores sort first
        }
    }


    public static class MyMapper extends Mapper<LongWritable, Text, MyIntWritable, Text> {


        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String[] arr = value.toString().split(" ");

            int score = Integer.parseInt(arr[1]);

            context.write(new MyIntWritable(score), new Text(arr[0]));

        }


    }

    public static class MyReducer extends Reducer<MyIntWritable, Text, Text, MyIntWritable> {

        int num = 0; // total number of records written so far, across all reduce() calls

        @Override
        protected void reduce(MyIntWritable key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            // keys arrive in descending score order, so the first K records written are the top K

            for (Text text : values) {
                if (num < K) {

                    context.write(text, key);
                }
                num++;

            }

        }
    }

    public static void main(String[] args) throws Exception {


        Configuration conf = new Configuration();

//        conf.set("mapreduce.framework.name", "local");
//
//        conf.set("fs.defaultFS", "file:///");

        Job job = Job.getInstance(conf);

//        job.setJar("/Users/f7689781/Desktop/MyMapReduce.jar");

        job.setJarByClass(TopN_2.class);


        job.setMapperClass(MyMapper.class);

        job.setReducerClass(MyReducer.class);

        job.setMapOutputKeyClass(MyIntWritable.class);

        job.setMapOutputValueClass(Text.class);

        job.setOutputKeyClass(Text.class);

        job.setOutputValueClass(MyIntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));

        FileSystem fileSystem = FileSystem.get(conf);

        // delete a pre-existing output directory, otherwise the job fails at submission
        if (fileSystem.exists(new Path(args[1]))) {
            fileSystem.delete(new Path(args[1]), true);
        }

        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // a non-zero exit status indicates the job did not complete successfully
        System.exit(job.waitForCompletion(true) ? 0 : 1);


    }

}

Test output:

tt	8
haha	8
deng	5

With the custom key, records that share the same score are no longer overwritten.
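
The reversed ordering of MyIntWritable can also be checked locally without submitting a job; the snippet below is only an illustrative check that reuses the MyIntWritable nested class from TopN_2 above:

import java.util.Arrays;
import java.util.List;

public class OrderCheck {
    public static void main(String[] args) {
        List<TopN_2.MyIntWritable> keys = Arrays.asList(
                new TopN_2.MyIntWritable(3),
                new TopN_2.MyIntWritable(8),
                new TopN_2.MyIntWritable(5));
        keys.sort(null); // null comparator = natural ordering, i.e. the reversed compareTo
        for (TopN_2.MyIntWritable k : keys) {
            System.out.print(k.get() + " "); // prints: 8 5 3
        }
    }
}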
