After sitting through one of 超哥's lessons I finally started to understand partitioning, so I'm writing down my own understanding here. (Thanks, 超哥!)
package partition;
import java.io.IOException;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
/**
 * @ClassName: FlowCount2
 * @Description: Partitioning example: sums up traffic per mobile number and routes well-formed and malformed numbers to separate reduce outputs
 * @author zhangweixiang
 * @date 2014-03-06 15:27:56
 */
/**
 * A partitioning example; it must be packaged as a jar to run.
 * Why partition:
 * 1. Produce multiple output files according to business needs.
 * 2. Run multiple reduce tasks in parallel, improving the overall efficiency of the job.
 */
public class FlowCount2 {
public static final String INPUT_PATH = "hdfs://192.168.0.9:9000/wlan2";
public static final String OUT_PATH = "hdfs://192.168.0.9:9000/myout";
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = new Job(conf, FlowCount2.class.getSimpleName());
// specify the class used to locate the jar for this job
job.setJarByClass(FlowCount2.class);
// 1.1 specify the input path
FileInputFormat.addInputPath(job, new Path(INPUT_PATH));
// specify the input format class
job.setInputFormatClass(TextInputFormat.class);
// 1.2 specify the custom mapper class
job.setMapperClass(MyMapper.class);
// set the map output key/value types
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(FlowWritable.class);
// 1.3 specify the partitioner
job.setPartitionerClass(MyPartition.class);
// set the number of reduce tasks; the map output is split into two partitions, so two reduce tasks are needed so that each partition is written to its own file (one reduce task per partition)
job.setNumReduceTasks(2);
// 1.4 sort and group
// 1.5 combine
// 2.2 specify the custom reducer class
job.setReducerClass(MyReduce.class);
// set the output key/value types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowWritable.class);
// set the output format class
job.setOutputFormatClass(TextOutputFormat.class);
// delete the output path if it already exists
FileSystem fileSystem = FileSystem.get(new URI(OUT_PATH),
new Configuration());
Path path = new Path(OUT_PATH);
if (fileSystem.exists(path)) {
fileSystem.delete(path, true);
}
// 2.3 specify the output path
FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));
// submit the job and wait for it to finish
job.waitForCompletion(true);
}
static class MyMapper extends
Mapper<LongWritable, Text, Text, FlowWritable> {
@Override
protected void map(LongWritable key, Text value, Context context)
throws IOException, InterruptedException {
// split the line into tab-separated fields
String[] split = value.toString().split("\t");
// extract the user's phone number
String mobile = "";
long upPackNum = 0L;
long downPackNum = 0L;
long upPayLoad = 0L;
long downPayLoad = 0L;
// only process records that contain a phone number
if (!("".equals(split[2]))) {
mobile = split[2];
// extract the traffic fields
if (!("".equals(split[21]))) {
upPackNum = Long.parseLong(split[21]);
}
if (!("".equals(split[22]))) {
downPackNum = Long.parseLong(split[22]);
}
if (!("".equals(split[23]))) {
upPayLoad = Long.parseLong(split[23]);
}
if (!("".equals(split[24]))) {
downPayLoad = Long.parseLong(split[24]);
}
FlowWritable flowWritable = new FlowWritable(upPackNum,
downPackNum, upPayLoad, downPayLoad);
context.write(new Text(mobile), flowWritable);
}
}
}
static class MyReduce extends
Reducer<Text, FlowWritable, Text, FlowWritable> {
@Override
protected void reduce(Text k2, Iterable<FlowWritable> v2s,
Context context) throws IOException, InterruptedException {
long upPackNum = 0L;
long downPackNum = 0L;
long upPayLoad = 0L;
long downPayLoad = 0L;
for (FlowWritable flowWritable : v2s) {
upPackNum += flowWritable.upPackNum;
downPackNum += flowWritable.downPackNum;
upPayLoad += flowWritable.upPayLoad;
downPayLoad += flowWritable.downPayLoad;
}
FlowWritable flowWritable = new FlowWritable(upPackNum,
downPackNum, upPayLoad, downPayLoad);
context.write(k2, flowWritable);
}
}
}
package partition;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
/**
 * @ClassName: FlowWritable
 * @Description: Custom type implementing the Writable interface, with four fields (upPackNum = upstream packets,
 *               downPackNum = downstream packets, upPayLoad = upstream traffic, downPayLoad = downstream traffic)
 * @author zhangweixiang
 * @date 2014-03-05 11:37:10
 */
public class FlowWritable implements Writable {
public long upPackNum;
public long downPackNum;
public long upPayLoad;
public long downPayLoad;
public FlowWritable() {
// no-arg constructor required by Hadoop so the framework can instantiate the type before calling readFields
}
public FlowWritable(long upPackNum, long downPackNum, long upPayLoad,
long downPayLoad) {
this.upPackNum = upPackNum;
this.downPackNum = downPackNum;
this.upPayLoad = upPayLoad;
this.downPayLoad = downPayLoad;
}
@Override
public void write(DataOutput out) throws IOException {
// the write order must match the read order in readFields
out.writeLong(upPackNum);
out.writeLong(downPackNum);
out.writeLong(upPayLoad);
out.writeLong(downPayLoad);
}
@Override
public void readFields(DataInput in) throws IOException {
this.upPackNum = in.readLong();
this.downPackNum = in.readLong();
this.upPayLoad = in.readLong();
this.downPayLoad = in.readLong();
}
@Override
public String toString() {
return upPackNum + "\t" + downPackNum + "\t" + upPayLoad + "\t"
+ downPayLoad;
}
}
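Since write and readFields must serialize the fields in exactly the same order, a quick round-trip check can catch any mismatch early. A minimal sketch (the class name FlowWritableCheck is made up for illustration and is not part of the original example):
package partition;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
// Hypothetical round-trip check of FlowWritable serialization.
public class FlowWritableCheck {
public static void main(String[] args) throws Exception {
FlowWritable before = new FlowWritable(1, 2, 3, 4);
ByteArrayOutputStream bytes = new ByteArrayOutputStream();
before.write(new DataOutputStream(bytes));
FlowWritable after = new FlowWritable();
after.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));
// if the field order matches, this prints the same four values that went in
System.out.println(after);
}
}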
package partition;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
/**
 * @ClassName: MyPartition
 * @Description: Partition by phone number: well-formed numbers get partition 0, malformed ones get partition 1.
 *               Two partitions are created here, so two reduce tasks will write their results to two different files (0 and 1).
 * @param K k2 (the map output key), V v2 (the map output value)
 * @author zhangweixiang
 * @date 2014-03-06 15:02:29
 */
public class MyPartition extends HashPartitioner<Text, FlowWritable> {
@Override
public int getPartition(Text key, FlowWritable value, int numReduceTasks) {
// partition 0 for well-formed (11-character) phone numbers
int p = 0;
if (key.toString().length() != 11) {
// everything else (e.g. IP-like keys) goes to partition 1
p = 1;
}
return p;
}
}
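For a quick local sanity check of the partitioning logic outside of a full job, a throwaway class like the one below can be used (MyPartitionCheck and the 11-digit sample number are made up for illustration; 10.80.203.79 is the key that appears in the exception further down):
package partition;
import org.apache.hadoop.io.Text;
// Hypothetical standalone check of MyPartition, not part of the original example.
public class MyPartitionCheck {
public static void main(String[] args) {
MyPartition partition = new MyPartition();
// an 11-character key is treated as a well-formed phone number -> partition 0
System.out.println(partition.getPartition(new Text("13712345678"), null, 2));
// any other key length (here an IP-like key) -> partition 1
System.out.println(partition.getPartition(new Text("10.80.203.79"), null, 2));
}
}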
After the job finishes, two output files are produced (part-r-00000 and part-r-00001), one per partition, each holding a different category of records.
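To confirm which records ended up in which partition, the two part files can be read back from HDFS. A minimal sketch, assuming the same OUT_PATH as above (the class name ReadPartitionOutput is made up for illustration):
package partition;
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
// Hypothetical helper for inspecting the job output, not part of the original example.
public class ReadPartitionOutput {
public static void main(String[] args) throws Exception {
String outPath = "hdfs://192.168.0.9:9000/myout";
FileSystem fs = FileSystem.get(new URI(outPath), new Configuration());
for (String name : new String[] { "part-r-00000", "part-r-00001" }) {
System.out.println("==== " + name + " ====");
BufferedReader reader = new BufferedReader(new InputStreamReader(fs.open(new Path(outPath + "/" + name))));
String line;
while ((line = reader.readLine()) != null) {
System.out.println(line);
}
reader.close();
}
}
}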
Exception thrown when running the job directly from eclipse:
(
14/03/06 15:41:13 WARN mapred.LocalJobRunner: job_local_0001
java.io.IOException: Illegal partition for 10.80.203.79 (1)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.collect(MapTask.java:1073)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.write(MapTask.java:691)
at org.apache.hadoop.mapreduce.TaskInputOutputContext.write(TaskInputOutputContext.java:80)
at partition.FlowCount2$MyMapper.map(FlowCount2.java:120)
at partition.FlowCount2$MyMapper.map(FlowCount2.java:1)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:144)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:764)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:370)
at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:214)
14/03/06 15:41:14 INFO mapred.JobClient: map 0% reduce 0%
14/03/06 15:41:14 INFO mapred.JobClient: Job complete: job_local_0001
14/03/06 15:41:14 INFO mapred.JobClient: Counters: 0
)
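The failure happens because eclipse runs the job with the LocalJobRunner, which in Hadoop 1.x only executes a single reduce task, so every key must map to partition 0; as soon as MyPartition returns 1 the map output collector reports "Illegal partition". Packaging the job as a jar and running it on the cluster gives it the two reduce tasks it asked for. If the example should also survive a single-reducer run, a guarded variant of the partitioner is one option; this is only a sketch (the class name SafeMyPartition is made up, and the original example simply requires running as a jar):
package partition;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
// Hypothetical defensive variant, not part of the original example.
public class SafeMyPartition extends HashPartitioner<Text, FlowWritable> {
@Override
public int getPartition(Text key, FlowWritable value, int numReduceTasks) {
// with a single reducer (e.g. LocalJobRunner) every key has to go to partition 0
if (numReduceTasks <= 1) {
return 0;
}
// 11-character keys are treated as well-formed phone numbers
return key.toString().length() == 11 ? 0 : 1;
}
}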
To record 超哥's summary:
A partitioning example must be packaged as a jar to run.
Uses:
1. Produce multiple output files according to business needs.
2. Run multiple reduce tasks, improving the overall efficiency of the job.