Custom Partitioning in MapReduce (by Province)

1. Custom partition logic (e.g. by province)

package com.gerry.bigdata.mapreduce.flowpartion;

import java.util.HashMap;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

import com.gerry.bigdata.mapreduce.flow.FlowBean;

/**
 * Partitions records by province (approximated here by phone-number prefix).
 * This class is used by the MapTask: for each key/value pair it produces, the
 * MapTask calls getPartition to decide which ReduceTask the pair is sent to.
 * 
 * @author gerry
 *
 */
public class ProvincePartitioner extends Partitioner<Text, FlowBean> {

	/**
	 * Maps a phone-number prefix to a partition number, so keys with the same
	 * prefix are dispatched to the same ReduceTask.
	 */
	static HashMap<String, Integer> codeMap = new HashMap<>();
	static {
//		map = AttributeJdbcUtil.loadTable();
		codeMap.put("135", 0);
		codeMap.put("136", 1);
		codeMap.put("137", 2);
		codeMap.put("138", 3);
		codeMap.put("139", 4);
	}

	@Override
	public int getPartition(Text key, FlowBean value, int numPartitions) {
		// Look up the partition number by the first three digits of the phone number;
		// prefixes not present in the map fall into the catch-all partition 5.
		Integer code = codeMap.get(key.toString().substring(0, 3));

		return code == null ? 5 : code;
	}

}
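
For reference, a small local check of the prefix-to-partition mapping (this demo class is not part of the original post and is only an illustrative sketch):

package com.gerry.bigdata.mapreduce.flowpartion;

import org.apache.hadoop.io.Text;

public class ProvincePartitionerDemo {

	public static void main(String[] args) {
		ProvincePartitioner partitioner = new ProvincePartitioner();
		// Prefixes listed in codeMap map to their fixed partition numbers.
		System.out.println(partitioner.getPartition(new Text("13512345678"), null, 6)); // 0
		System.out.println(partitioner.getPartition(new Text("13998765432"), null, 6)); // 4
		// Any prefix not in codeMap falls into the catch-all partition 5.
		System.out.println(partitioner.getPartition(new Text("15011112222"), null, 6)); // 5
	}

}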

2. Job submission client

package com.gerry.bigdata.mapreduce.flow;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import com.gerry.bigdata.mapreduce.flowpartion.ProvincePartitioner;

/**
 * Per-user access traffic (flow) statistics.
 * 
 * @author gerry
 *
 */
public class JobSubmiter {

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = Job.getInstance(conf);

		job.setJarByClass(JobSubmiter.class);
//		job.setJar("/home/gerry/pyspark/Bigdata/jars/logs.jar");
		job.setMapperClass(FlowCountMapper.class);
		job.setReducerClass(FlowCountReducer.class);

		// Tell the MapTask which partitioner to use when partitioning its output
		// (if not specified, the default HashPartitioner is used).
		job.setPartitionerClass(ProvincePartitioner.class);
		// ProvincePartitioner can produce 6 different partition numbers (0-5),
		// so 6 reduce tasks are needed to receive them.
		job.setNumReduceTasks(6);

		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(FlowBean.class);

		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(FlowBean.class);

		Path inputPath = new Path("/home/gerry/pyspark/Bigdata/data/logs/input");
		Path outPath = new Path("/home/gerry/pyspark/Bigdata/data/logs/output");
//		FileSystem fs = FileSystem.get(new URI("hdfs://172.16.0.2:9000/"), conf, "root");
//		if (fs.exists(outPath)) {
//			fs.delete(outPath, true);
//		}

		FileInputFormat.setInputPaths(job, inputPath);
		FileOutputFormat.setOutputPath(job, outPath);

//		job.setNumReduceTasks(3);
		boolean result = job.waitForCompletion(true);
		System.exit(result ? 0 : 1);

	}

}
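
Assuming the classes are packaged into the jar referenced by the commented-out setJar call above, the job can be launched with the standard hadoop command, for example:

hadoop jar /home/gerry/pyspark/Bigdata/jars/logs.jar com.gerry.bigdata.mapreduce.flow.JobSubmiter

Note that the output directory must not already exist when the job starts; the commented-out FileSystem code above deletes it beforehand.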

3. Mapper side

package com.gerry.bigdata.mapreduce.flow;

import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean> {

	@Override
	protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
		// Input lines are tab-separated; the phone number is the second field and
		// the up/down flow values are the third- and second-to-last fields.
		String line = value.toString();
		String[] fields = line.split("\t");
		String phoneNum = fields[1];
		int upFlow = Integer.parseInt(fields[fields.length - 3]);
		int downFlow = Integer.parseInt(fields[fields.length - 2]);

		// Emit <phone number, FlowBean> so records of the same phone number are
		// grouped together on the reduce side.
		context.write(new Text(phoneNum), new FlowBean(phoneNum, upFlow, downFlow));

	}

}

4. Reducer side

package com.gerry.bigdata.mapreduce.flow;

import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean> {

	/**
	 * key is a phone number; values are the flow records from all access records
	 * produced by that phone number, e.g.:
	 * 
	 * <135...,FlowBean1> <135...,FlowBean2> <135...,FlowBean3> <135...,FlowBean4>
	 */
	@Override
	protected void reduce(Text key, Iterable<FlowBean> values, Context context)
			throws IOException, InterruptedException {

		int upSum = 0;
		int downSum = 0;
		// Accumulate the upstream and downstream flow of all records for this phone number.
		for (FlowBean value : values) {
			upSum += value.getUpFlow();
			downSum += value.getDownFlow();
		}
		context.write(key, new FlowBean(key.toString(), upSum, downSum));
	}

}
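
The FlowBean class imported by all of the classes above is not shown in the post. A minimal sketch of what it might look like, assuming a plain Hadoop Writable with the constructor and getters used by the mapper and reducer:

package com.gerry.bigdata.mapreduce.flow;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

// Assumed implementation: the original FlowBean is not included in the post.
// Map output values must implement Writable so Hadoop can serialize them.
public class FlowBean implements Writable {

	private String phone;
	private int upFlow;
	private int downFlow;

	// A no-arg constructor is required so Hadoop can instantiate the bean via reflection.
	public FlowBean() {
	}

	public FlowBean(String phone, int upFlow, int downFlow) {
		this.phone = phone;
		this.upFlow = upFlow;
		this.downFlow = downFlow;
	}

	public int getUpFlow() {
		return upFlow;
	}

	public int getDownFlow() {
		return downFlow;
	}

	@Override
	public void write(DataOutput out) throws IOException {
		out.writeUTF(phone);
		out.writeInt(upFlow);
		out.writeInt(downFlow);
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		phone = in.readUTF();
		upFlow = in.readInt();
		downFlow = in.readInt();
	}

	// toString controls how the bean is rendered in the reducer's text output.
	@Override
	public String toString() {
		return upFlow + "\t" + downFlow + "\t" + (upFlow + downFlow);
	}

}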

 
