In this example, records whose phone numbers share the same prefix (number segment) are partitioned to the same reducer; any phone number whose prefix has no assigned segment falls into a single shared default partition.
import java.util.HashMap;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

import cn.com.bigdata.mr.flowcount.FlowBean;

/**
 * Defines our own rule for distributing (partitioning) data between map and
 * reduce: ProvincePartitioner distributes records by the province the phone
 * number belongs to. The default partitioning component is HashPartitioner.
 */
public class ProvincePartitioner extends Partitioner<Text, FlowBean> {

    static HashMap<String, Integer> provinceMap = new HashMap<String, Integer>();

    static {
        provinceMap.put("135", 0);
        provinceMap.put("136", 1);
        provinceMap.put("137", 2);
        provinceMap.put("138", 3);
        provinceMap.put("139", 4);
    }

    @Override
    public int getPartition(Text key, FlowBean value, int numPartitions) {
        // Look up the partition code for the first three digits of the phone
        // number; any prefix not in the map goes to the default partition 5
        Integer code = provinceMap.get(key.toString().substring(0, 3));
        return code == null ? 5 : code;
    }
}
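Both classes reference cn.com.bigdata.mr.flowcount.FlowBean, which is not shown in this section. A minimal sketch of what such a Writable bean might look like, assuming it only carries the upstream/downstream fields used above plus a derived total (the sumflow field and the toString layout are assumptions, not the original class):

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class FlowBean implements Writable {

    private long upflow;
    private long dflow;
    private long sumflow; // assumed derived field: upflow + dflow

    // A no-arg constructor is required so Hadoop can instantiate the bean via reflection
    public FlowBean() {
    }

    public FlowBean(long upflow, long dflow) {
        this.upflow = upflow;
        this.dflow = dflow;
        this.sumflow = upflow + dflow;
    }

    public long getUpflow() {
        return upflow;
    }

    public long getDflow() {
        return dflow;
    }

    // Serialization: write the fields in a fixed order
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upflow);
        out.writeLong(dflow);
        out.writeLong(sumflow);
    }

    // Deserialization: read the fields back in exactly the same order
    @Override
    public void readFields(DataInput in) throws IOException {
        upflow = in.readLong();
        dflow = in.readLong();
        sumflow = in.readLong();
    }

    // TextOutputFormat calls toString() to render the value in the output file
    @Override
    public String toString() {
        return upflow + "\t" + dflow + "\t" + sumflow;
    }
}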
import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import cn.com.bigdata.mr.flowcount.FlowBean;

public class FlowCountProvince {

    static class FlowCountProvinceMapper extends Mapper<LongWritable, Text, Text, FlowBean> {

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Convert the incoming line to a String
            String line = value.toString();
            // Split the line into its fields
            String[] fields = StringUtils.split(line, "\t");
            // Get the phone number
            String phone = fields[1];
            long upflow = Long.parseLong(fields[fields.length - 3]);
            long dflow = Long.parseLong(fields[fields.length - 2]);
            FlowBean bean = new FlowBean(upflow, dflow);
            context.write(new Text(phone), bean);
        }
    }

    static class FlowCountProvinceReducer extends Reducer<Text, FlowBean, Text, FlowBean> {

        @Override
        protected void reduce(Text key, Iterable<FlowBean> beans, Context context)
                throws IOException, InterruptedException {
            // Define two counters
            long upAmount = 0;
            long dAmount = 0;
            // Iterate over all flow beans for this user and accumulate the totals
            for (FlowBean bean : beans) {
                upAmount += bean.getUpflow();
                dAmount += bean.getDflow();
            }
            // Build a FlowBean holding the final result
            FlowBean countBean = new FlowBean(upAmount, dAmount);
            // Emit the result
            context.write(key, countBean);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(FlowCountProvince.class);
        job.setMapperClass(FlowCountProvinceMapper.class);
        job.setReducerClass(FlowCountProvinceReducer.class);

        /**
         * If the map and reduce output key/value types are identical, there is
         * no need to set the map output types separately.
         */
        /*
         * job.setMapOutputKeyClass(Text.class);
         * job.setMapOutputValueClass(FlowBean.class);
         */
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        /**
         * TextInputFormat and TextOutputFormat are Hadoop's default input and
         * output components, so these two lines can also be omitted.
         */
        /*
         * job.setInputFormatClass(TextInputFormat.class);
         * job.setOutputFormatClass(TextOutputFormat.class);
         */
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        /**
         * Tell the job to use our custom partitioning component, ProvincePartitioner.
         */
        job.setPartitionerClass(ProvincePartitioner.class);

        /**
         * The number of reduce tasks should match the number of partitions
         * produced by the custom strategy. If the reducer count is greater
         * than the partition count, empty result files are produced; if it is
         * smaller, the job fails with an illegal-partition error. With exactly
         * one reducer, however, the partition logic is never invoked: all data
         * goes to the single reduce task, so no error occurs.
         */
        job.setNumReduceTasks(Integer.parseInt(args[2]));

        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
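Since getPartition returns 0 through 4 for the mapped prefixes and 5 for every other number, the job yields six partitions in total, so passing 6 as the reducer-count argument is the natural choice. A hypothetical invocation (the jar name and HDFS paths are placeholders, not from the original text):

hadoop jar flowcount.jar FlowCountProvince /flow/input /flow/output 6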