Hadoop Case 6 ----- TopN Problem: Find the Largest K Values and Sort Them

1. Requirement analysis

Each record in the input files has four comma-separated fields: orderid,userid,payment,productid. The goal is to find the largest K payment values across all input files and output them in descending order together with their rank (K is passed to the job through the configuration property "N"; the code below defaults to 10).
[root@x00 hd]# cat seventeen_a.txt

1,9819,100,121
2,8918,2000,111
3,2813,1234,22
4,9100,10,1101
5,3210,490,111
6,1298,28,1211
7,1010,281,90
8,1818,9000,20

[root@x00 hd]# cat seventeen_b.txt

100,3333,10,100
101,9321,1000,293
102,3881,701,20
103,6791,910,30
104,8888,11,39
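
With N = 5, for example, the five largest payments across both files are 9000, 2000, 1234, 1000 and 910, so these are the values the job should emit, ranked 1 through 5.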

2. Mapper program:

package cn.edu.bjut.topn;

import java.io.IOException;
import java.util.Arrays;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class TopNMapper extends Mapper<LongWritable, Text, IntWritable, IntWritable> {

    int len;
    int[] top;
    @Override
    protected void cleanup(Context context)
            throws IOException, InterruptedException {
        // Emit this map task's local top N candidates; the payment is written as both key and value.
        for(int i = 1; i <= len; i++) {
            context.write(new IntWritable(top[i]), new IntWritable(top[i]));
        }
    }

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString().trim();
        String[] arr = line.split(",");
        if(4 == arr.length) {
            int payment = Integer.parseInt(arr[2]);
            add(payment);
        }
    }

    private void add(int payment) {
        top[0] = payment;
        Arrays.sort(top);
    }

    @Override
    protected void setup(Context context)
            throws IOException, InterruptedException {
        len = context.getConfiguration().getInt("N", 10);
        top = new int[len+1];
    }

}
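
The helper add() relies on a small trick: the top array has len + 1 slots, and after every Arrays.sort the smallest element sits in top[0]. Overwriting top[0] with each incoming payment and re-sorting therefore keeps the len largest values seen so far in top[1..len]. The standalone sketch below is not part of the job; it only illustrates the trick on the sample payments:

import java.util.Arrays;

public class TopNTrickDemo {
    public static void main(String[] args) {
        int n = 5;
        int[] top = new int[n + 1];   // top[0] is the overflow slot
        int[] payments = {100, 2000, 1234, 10, 490, 28, 281, 9000,
                          10, 1000, 701, 910, 11};
        for (int p : payments) {
            top[0] = p;               // overwrite the current minimum
            Arrays.sort(top);         // the smallest value sinks back to top[0]
        }
        // top[1..n] now holds the five largest payments in ascending order
        System.out.println(Arrays.toString(Arrays.copyOfRange(top, 1, n + 1)));
        // prints: [910, 1000, 1234, 2000, 9000]
    }
}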

3. Reducer program:

package cn.edu.bjut.topn;

import java.io.IOException;
import java.util.Arrays;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class TopNReducer extends Reducer<IntWritable, IntWritable, Text, Text> {

    int len;
    int[] top;
    @Override
    protected void cleanup(Context context)
            throws IOException, InterruptedException {
        for(int i=len; i>0; i--) {
            context.write(new Text(String.valueOf(len-i+1)), new Text(String.valueOf(top[i])));
        }
    }

    @Override
    protected void reduce(IntWritable key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        for(IntWritable value : values) {
            add(value.get());
        }
    }

    private void add(int i) {
        top[0] = i;
        Arrays.sort(top);
    }

    @Override
    protected void setup(Context context)
            throws IOException, InterruptedException {
        len = context.getConfiguration().getInt("N", 10);
        top = new int[len + 1];
    }

}
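
Each map task only emits its own local top N, so the Reducer repeats the same selection over all candidates it receives: reduce() ignores the key and folds every value into the top array, and cleanup() walks the array from top[len] down to top[1], writing the rank (len - i + 1) and the payment. With N = 5 and the sample data above, the default TextOutputFormat would therefore produce the tab-separated lines 1 9000, 2 2000, 3 1234, 4 1000 and 5 910.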

4. Main program:

package cn.edu.bjut.topn;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MainJob {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf, "topn");
        job.setJarByClass(MainJob.class);

        job.setMapperClass(TopNMapper.class);
        job.setMapOutputKeyClass(IntWritable.class);
        job.setMapOutputValueClass(IntWritable.class);

        job.setReducerClass(TopNReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));

        Path outPath = new Path(args[1]);
        FileSystem fs = FileSystem.get(conf);
        if(fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);
        job.waitForCompletion(true);
    }
}
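
Note that N is read from the job configuration but never set by this driver, so the job falls back to the default of 10; and because MainJob does not go through ToolRunner or GenericOptionsParser, a -D N=... option on the command line is not picked up automatically either. A minimal way to control N and to make the single global reducer explicit (assumed additions, not part of the original driver) is to add the following before job.waitForCompletion(true):

        job.getConfiguration().setInt("N", 5);   // keep the 5 largest payments instead of the default 10
        job.setNumReduceTasks(1);                // all per-mapper candidates must meet in a single reducer

The packaged jar can then be run with hadoop jar, passing the input directory as args[0] and the output directory as args[1].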
