1. Custom partitioning logic (for example, by province)
package com.gerry.bigdata.mapreduce.flowpartion;
import java.util.HashMap;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
import com.gerry.bigdata.mapreduce.flow.FlowBean;
/**
 * Partitions records by province (approximated here by phone-number prefix).
 * This class is used by the MapTask: for every key/value pair it produces, the MapTask
 * calls this class's getPartition method to decide which ReduceTask the pair is sent to.
 *
 * @author gerry
 *
 */
public class ProvincePartitioner extends Partitioner<Text, FlowBean> {
    /**
     * Records with the same phone-number key are dispatched to the same reducer;
     * the table below maps number prefixes to partition numbers.
     */
    static HashMap<String, Integer> codeMap = new HashMap<>();
    static {
        // map = AttributeJdbcUtil.loadTable();
        codeMap.put("135", 0);
        codeMap.put("136", 1);
        codeMap.put("137", 2);
        codeMap.put("138", 3);
        codeMap.put("139", 4);
    }
    @Override
    public int getPartition(Text key, FlowBean value, int numPartitions) {
        // Look up the partition by the first three digits of the phone number;
        // any prefix not in the table falls into partition 5.
        Integer code = codeMap.get(key.toString().substring(0, 3));
        return code == null ? 5 : code;
    }
}
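A quick way to sanity-check the mapping above is to call getPartition directly. The sketch below is not part of the original post: the demo class name and the phone numbers are made-up examples, and null is passed for the FlowBean value because getPartition never inspects it.
package com.gerry.bigdata.mapreduce.flowpartion;

import org.apache.hadoop.io.Text;

// Hypothetical demo class: checks the prefix-to-partition mapping shown above.
public class ProvincePartitionerDemo {
    public static void main(String[] args) {
        ProvincePartitioner p = new ProvincePartitioner();
        // getPartition only looks at the first three digits of the key; the FlowBean
        // value and the numPartitions argument are not used, so null / 6 are fine here.
        System.out.println(p.getPartition(new Text("13512340000"), null, 6)); // prints 0
        System.out.println(p.getPartition(new Text("15012340000"), null, 6)); // prints 5 (default bucket)
    }
}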
2. Job submission client
package com.gerry.bigdata.mapreduce.flow;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import com.gerry.bigdata.mapreduce.flowpartion.ProvincePartitioner;
/**
 * Per-user access traffic (flow) statistics.
 *
 * @author gerry
 *
 */
public class JobSubmiter {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(JobSubmiter.class);
        // job.setJar("/home/gerry/pyspark/Bigdata/jars/logs.jar");
        job.setMapperClass(FlowCountMapper.class);
        job.setReducerClass(FlowCountReducer.class);
        // Tell the MapTask which partitioner class to use when partitioning its output
        // (if none is specified, the default HashPartitioner is used).
        job.setPartitionerClass(ProvincePartitioner.class);
        // ProvincePartitioner can produce six partition numbers (0-5), so six reduce tasks
        // are needed to receive them.
        job.setNumReduceTasks(6);
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);
        Path inputPath = new Path("/home/gerry/pyspark/Bigdata/data/logs/input");
        Path outPath = new Path("/home/gerry/pyspark/Bigdata/data/logs/output");
        // FileSystem fs = FileSystem.get(new URI("hdfs://172.16.0.2:9000/"), conf, "root");
        // if (fs.exists(outPath)) {
        //     fs.delete(outPath, true);
        // }
        FileInputFormat.setInputPaths(job, inputPath);
        FileOutputFormat.setOutputPath(job, outPath);
        boolean result = job.waitForCompletion(true);
        System.exit(result ? 0 : 1);
    }
}
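For contrast with the comment on setPartitionerClass above: when no partitioner is configured, Hadoop falls back to its built-in HashPartitioner, which spreads keys over the reduce tasks by hash rather than by phone-number prefix. Its core logic is essentially the sketch below (the class name HashPartitionerSketch is illustrative; the real class is org.apache.hadoop.mapreduce.lib.partition.HashPartitioner).
import org.apache.hadoop.mapreduce.Partitioner;

// Sketch of the default partitioner's behaviour, shown for comparison only.
public class HashPartitionerSketch<K, V> extends Partitioner<K, V> {
    @Override
    public int getPartition(K key, V value, int numReduceTasks) {
        // Mask the hash to keep it non-negative, then spread keys over reducers by modulo.
        return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }
}
With that default, a given phone number still always lands in the same reducer, but numbers sharing a province prefix generally do not, which is why this job overrides the partitioner.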
3. Mapper side
package com.gerry.bigdata.mapreduce.flow;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        String line = value.toString();
        String[] fields = line.split("\t");
        // The phone number is the second field of each tab-separated log line; the up/down
        // flow counters are the third-to-last and second-to-last fields.
        String phoneNum = fields[1];
        int upFlow = Integer.parseInt(fields[fields.length - 3]);
        int downFlow = Integer.parseInt(fields[fields.length - 2]);
        context.write(new Text(phoneNum), new FlowBean(phoneNum, upFlow, downFlow));
    }
}
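The FlowBean class used as the map output value (and reused by the reducer below) is referenced throughout but its source is not included in this post. The sketch below shows one plausible shape for it, based only on what the surrounding code requires: the (phone, upFlow, downFlow) constructor and the two getters. The field layout, the no-argument constructor (which Hadoop needs when deserializing), the serialization order, and the toString format (including the total column) are assumptions.
package com.gerry.bigdata.mapreduce.flow;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;

// Hypothetical sketch of the FlowBean value type used by the mapper and reducer.
public class FlowBean implements Writable {
    private String phone;
    private int upFlow;
    private int downFlow;

    // Hadoop instantiates Writables reflectively, so a no-arg constructor is required.
    public FlowBean() {
    }

    public FlowBean(String phone, int upFlow, int downFlow) {
        this.phone = phone;
        this.upFlow = upFlow;
        this.downFlow = downFlow;
    }

    public int getUpFlow() {
        return upFlow;
    }

    public int getDownFlow() {
        return downFlow;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        // Serialization order must match readFields below.
        out.writeUTF(phone);
        out.writeInt(upFlow);
        out.writeInt(downFlow);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        phone = in.readUTF();
        upFlow = in.readInt();
        downFlow = in.readInt();
    }

    @Override
    public String toString() {
        // Controls how each record is rendered by TextOutputFormat in the final output
        // (assumed format: upFlow, downFlow, total, tab-separated).
        return upFlow + "\t" + downFlow + "\t" + (upFlow + downFlow);
    }
}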
4. Reducer side
package com.gerry.bigdata.mapreduce.flow;
import java.io.IOException;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class FlowCountReducer extends Reducer<Text, FlowBean, Text, FlowBean> {
    /**
     * key is a phone number; values are the flow records from all of that phone
     * number's access log entries, e.g.
     *
     * <135,FlowBean1> <135,FlowBean2> <135,FlowBean3> <135,FlowBean4>
     */
    @Override
    protected void reduce(Text key, Iterable<FlowBean> values, Context context)
            throws IOException, InterruptedException {
        int upSum = 0;
        int downSum = 0;
        for (FlowBean value : values) {
            upSum += value.getUpFlow();
            downSum += value.getDownFlow();
        }
        context.write(key, new FlowBean(key.toString(), upSum, downSum));
    }
}
}
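One subtlety in the loop above: Hadoop's reduce-side iterator typically reuses a single FlowBean instance and re-fills it on each iteration, so this code is only correct because it copies the primitive counters out immediately. If the beans themselves had to be kept beyond the loop, each one would need to be copied into a fresh object first, roughly as in this sketch (variable names are illustrative; requires java.util.List and java.util.ArrayList).
// Do NOT store the iterated FlowBean references directly: the iterator reuses one
// instance, so the list would end up holding several references to the same (last) record.
List<FlowBean> kept = new ArrayList<>();
for (FlowBean value : values) {
    // Copy the fields into a new bean before keeping it.
    kept.add(new FlowBean(key.toString(), value.getUpFlow(), value.getDownFlow()));
}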