MapReduce: Partition

1. What the partitioner does

The partitioner's job is to divide the map output into partitions, one per reduce task; a well-chosen custom partitioner can also be used to address data skew.

2. If no partitioner is defined, how is the data partitioned before it reaches the reducers?

Hadoop provides a default partitioner, the HashPartitioner class, which hashes each map output key (k2) to decide which reduce task the (k2, v2) pair is sent to.
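For reference, the default HashPartitioner is essentially the following one-liner. Masking the hash code with Integer.MAX_VALUE clears the sign bit, so a negative hashCode() can never produce a negative partition index:

public class HashPartitioner<K, V> extends Partitioner<K, V> {

    public int getPartition(K key, V value, int numReduceTasks) {
        // same key -> same hash -> same reduce task
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}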

3. Code example

import java.util.HashMap;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

public class ProvincePartitioner extends Partitioner<Text, FlowBean> {

    // maps a phone-number prefix to a partition number (i.e. a reduce task)
    private static HashMap<String, Integer> provinceMap = new HashMap<String, Integer>();

    static {
        provinceMap.put("135", 0);
        provinceMap.put("136", 1);
        provinceMap.put("137", 2);
        provinceMap.put("138", 3);
        provinceMap.put("139", 4);
    }

    @Override
    public int getPartition(Text key, FlowBean value, int numPartitions) {
        // look up the partition by the first three digits of the phone number
        Integer province = provinceMap.get(key.toString().substring(0, 3));

        // unknown prefixes all fall into the catch-all partition 5
        if (province == null) {
            province = 5;
        }

        return province;
    }
}
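With this mapping, a key such as 13700000000 is routed to partition 2, while any phone number whose prefix is outside 135–139 falls into the catch-all partition 5, so the job needs six reduce tasks in total.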
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FlowBeanRunner {

    static class FlowBeanMapper extends Mapper<LongWritable, Text, Text, FlowBean> {

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();
            // split the tab-delimited record into fields
            String[] fields = line.split("\t");
            String phone = fields[1];

            // upstream traffic field
            long up_flow = Long.parseLong(fields[fields.length - 3]);
            // downstream traffic field
            long d_flow = Long.parseLong(fields[fields.length - 2]);
            // wrap both values in a FlowBean
            FlowBean flowBean = new FlowBean(up_flow, d_flow);

            context.write(new Text(phone), flowBean);
        }

    }


    static class FlowBeanReduce extends Reducer<Text, FlowBean, Text, FlowBean> {

        @Override
        protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {

            long sum_upflow = 0;
            long sum_dflow  = 0;

            for(FlowBean bean : values){
                sum_upflow += bean.getUpflow();
                sum_dflow += bean.getDflow();
            }

            FlowBean resultBean = new FlowBean(sum_upflow,sum_dflow);

            context.write(key, resultBean);
        }

    }


    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(FlowBeanRunner.class);

        job.setMapperClass(FlowBeanMapper.class);
        job.setReducerClass(FlowBeanReduce.class);

        // use the custom partitioner in place of the framework's default HashPartitioner
        job.setPartitionerClass(ProvincePartitioner.class);

        // set the reduce task count to match the number of partitions ProvincePartitioner produces
        job.setNumReduceTasks(6);
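        // Note: with fewer reduce tasks than partitions, records routed to a
        // missing partition would fail the job with an "Illegal partition" error;
        // with more, the extra reducers simply write empty output files.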


        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        // path of the input data; pointing the job at a directory
        // makes every file in that directory part of the input
        FileInputFormat.setInputPaths(job, new Path("/home/hadoop/Desktop/inputflow"));

        // path where the job's results are written
        FileOutputFormat.setOutputPath(job, new Path("/home/hadoop/Desktop/outputflow"));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

}
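The FlowBean class itself is not shown in the original post. A minimal sketch that satisfies the calls made above (the two-argument constructor, getUpflow/getDflow, and Hadoop's Writable contract) might look like the following; the toString format is an assumption about how the totals should be printed:

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class FlowBean implements Writable {

    private long upflow;
    private long dflow;

    // Hadoop needs a no-arg constructor to instantiate the bean during deserialization.
    public FlowBean() {
    }

    public FlowBean(long upflow, long dflow) {
        this.upflow = upflow;
        this.dflow = dflow;
    }

    public long getUpflow() {
        return upflow;
    }

    public long getDflow() {
        return dflow;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upflow);
        out.writeLong(dflow);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        // fields must be read back in the same order they were written
        upflow = in.readLong();
        dflow = in.readLong();
    }

    @Override
    public String toString() {
        // assumed output format: upstream, downstream, total, tab-separated
        return upflow + "\t" + dflow + "\t" + (upflow + dflow);
    }
}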
