MapReduce: partitioning (partition)

After one of Chao's (超哥) lessons I finally understood partitioning, so I am writing down my own take on it. (Thanks, 超哥!)
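Some background before the code. If you never set a Partitioner, Hadoop falls back to HashPartitioner, which spreads map output keys across the reduce tasks by hash code. Its core logic is essentially the following (a minimal sketch of the stock Hadoop 1.x class; the class name here is mine, not Hadoop's):

package partition;

import org.apache.hadoop.mapreduce.Partitioner;

// Sketch of Hadoop's default partitioning rule: every key whose hash is
// equal modulo numReduceTasks lands in the same partition, hence the same reducer.
public class DefaultHashPartitionerSketch<K, V> extends Partitioner<K, V> {
	@Override
	public int getPartition(K key, V value, int numReduceTasks) {
		// Mask the sign bit so the modulo result is never negative
		return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
	}
}

Overriding getPartition is how you route keys by a business rule instead of by hash, which is exactly what the MyPartition class below does.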

package partition;

import java.io.IOException;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

/**
 * @ClassName: FlowCount2
 * @Description: Sums the traffic counters per phone number and partitions the
 *               output: well-formed phone numbers vs. everything else
 * @author zhangweixiang
 * @date 2014-03-06 15:27:56
 */
/**
 * A partitioning example must be packaged as a jar to run.
 * Why partition:
 *   1. Produce multiple output files, split according to business rules.
 *   2. Run multiple reduce tasks in parallel, improving overall job efficiency.
 */
public class FlowCount2 {

	public static final String INPUT_PATH = "hdfs://192.168.0.9:9000/wlan2";
	public static final String OUT_PATH = "hdfs://192.168.0.9:9000/myout";

	public static void main(String[] args) throws Exception {
		Configuration conf = new Configuration();
		Job job = new Job(conf, FlowCount2.class.getSimpleName());
		
		// Tell Hadoop which jar contains the job classes
		job.setJarByClass(FlowCount2.class);

		// 1.1 Specify the input path
		FileInputFormat.addInputPath(job, new Path(INPUT_PATH));
		// Specify the input format class
		job.setInputFormatClass(TextInputFormat.class);

		// 1.2 Specify the custom Mapper class
		job.setMapperClass(MyMapper.class);
		// Set the map output key/value types
		job.setMapOutputKeyClass(Text.class);
		job.setMapOutputValueClass(FlowWritable.class);

		// 1.3 Specify the Partitioner
		job.setPartitionerClass(MyPartition.class);
		// Set the number of reduce tasks. The partitioner defines two partitions, so two reduce tasks are needed, each writing its own output file (one partition per reduce task)
		job.setNumReduceTasks(2);

		// 1.4 Sorting and grouping (defaults used here)

		// 1.5 Combiner (none used here)

		// 2.2 Specify the custom Reducer class
		job.setReducerClass(MyReduce.class);
		// Set the final output key/value types
		job.setOutputKeyClass(Text.class);
		job.setOutputValueClass(FlowWritable.class);
		// Specify the output format class
		job.setOutputFormatClass(TextOutputFormat.class);

		// Delete the output path if it already exists
		FileSystem fileSystem = FileSystem.get(new URI(OUT_PATH), conf);
		Path path = new Path(OUT_PATH);
		if (fileSystem.exists(path)) {
			fileSystem.delete(path, true);
		}

		// 2.3 Specify the output path
		FileOutputFormat.setOutputPath(job, new Path(OUT_PATH));

		// Submit the job and wait for it to finish
		job.waitForCompletion(true);

	}

	static class MyMapper extends
			Mapper<LongWritable, Text, Text, FlowWritable> {
		@Override
		protected void map(LongWritable key, Text value, Context context)
				throws IOException, InterruptedException {
			// Split the line into tab-separated fields
			String[] split = value.toString().split("\t");
			// The user's phone number
			String mobile = "";
			long upPackNum = 0L;
			long downPackNum = 0L;
			long upPayLoad = 0L;
			long downPayLoad = 0L;

			// Only process records that carry a phone number
			if (!("".equals(split[2]))) {
				mobile = split[2];
				// Pull the four traffic counters, treating empty fields as 0
				if (!("".equals(split[21]))) {
					upPackNum = Long.parseLong(split[21]);
				}
				if (!("".equals(split[22]))) {
					downPackNum = Long.parseLong(split[22]);
				}
				if (!("".equals(split[23]))) {
					upPayLoad = Long.parseLong(split[23]);
				}
				if (!("".equals(split[24]))) {
					downPayLoad = Long.parseLong(split[24]);
				}

				FlowWritable flowWritable = new FlowWritable(upPackNum,
						downPackNum, upPayLoad, downPayLoad);

				context.write(new Text(mobile), flowWritable);
			}

		}
	}

	static class MyReduce extends
			Reducer<Text, FlowWritable, Text, FlowWritable> {
		@Override
		protected void reduce(Text k2, Iterable<FlowWritable> v2s,
				Context context) throws IOException, InterruptedException {

			long upPackNum = 0L;
			long downPackNum = 0L;
			long upPayLoad = 0L;
			long downPayLoad = 0L;

			// Sum all traffic counters received for this phone number
			for (FlowWritable flowWritable : v2s) {
				upPackNum += flowWritable.upPackNum;
				downPackNum += flowWritable.downPackNum;
				upPayLoad += flowWritable.upPayLoad;
				downPayLoad += flowWritable.downPayLoad;
			}

			FlowWritable flowWritable = new FlowWritable(upPackNum,
					downPackNum, upPayLoad, downPayLoad);

			context.write(k2, flowWritable);
		}
	}
	

}
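For orientation: the mapper above assumes tab-separated log lines in which field index 2 (0-based) carries the phone number and indexes 21 through 24 carry the traffic counters. In outline (field meanings taken from the code; any concrete values would be hypothetical):

split[2]  -> mobile (the user's phone number)
split[21] -> upPackNum (upstream packets)
split[22] -> downPackNum (downstream packets)
split[23] -> upPayLoad (upload traffic)
split[24] -> downPayLoad (download traffic)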


package partition;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

/**
 * @ClassName: FlowWritable
 * @Description: A custom type implementing the Writable interface, with four
 *               fields: upPackNum (upstream packets), downPackNum (downstream
 *               packets), upPayLoad (upload traffic), downPayLoad (download traffic)
 * @author zhangweixiang
 * @date 2014-03-05 11:37:10
 */
public class FlowWritable implements Writable {

	public long upPackNum;
	public long downPackNum;
	public long upPayLoad;
	public long downPayLoad;

	public FlowWritable() {
		// A no-arg constructor is required so Hadoop can instantiate
		// the type reflectively during deserialization
	}

	public FlowWritable(long upPackNum, long downPackNum, long upPayLoad,
			long downPayLoad) {
		this.upPackNum = upPackNum;
		this.downPackNum = downPackNum;
		this.upPayLoad = upPayLoad;
		this.downPayLoad = downPayLoad;
	}

	@Override
	public void write(DataOutput out) throws IOException {
		// Fields must be written in exactly the order readFields() reads them
		out.writeLong(upPackNum);
		out.writeLong(downPackNum);
		out.writeLong(upPayLoad);
		out.writeLong(downPayLoad);
	}

	@Override
	public void readFields(DataInput in) throws IOException {
		this.upPackNum = in.readLong();
		this.downPackNum = in.readLong();
		this.upPayLoad = in.readLong();
		this.downPayLoad = in.readLong();
	}

	@Override
	@Override
	public String toString() {
		return upPackNum + "\t" + downPackNum + "\t" + upPayLoad + "\t"
				+ downPayLoad;
	}

}
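One easy mistake with a hand-written Writable is letting write() and readFields() disagree on field order or field count; the values then come out of the shuffle silently scrambled. A quick round-trip check catches this. Below is a minimal sketch using Hadoop's DataOutputBuffer/DataInputBuffer; the FlowWritableCheck class is made up for this example and is not part of the job:

package partition;

import org.apache.hadoop.io.DataInputBuffer;
import org.apache.hadoop.io.DataOutputBuffer;

// Hypothetical helper: serialize a FlowWritable and read it back,
// verifying that write() and readFields() agree
public class FlowWritableCheck {
	public static void main(String[] args) throws Exception {
		FlowWritable original = new FlowWritable(1, 2, 3, 4);

		// Serialize with write() ...
		DataOutputBuffer out = new DataOutputBuffer();
		original.write(out);

		// ... and read the same bytes back with readFields()
		DataInputBuffer in = new DataInputBuffer();
		in.reset(out.getData(), out.getLength());
		FlowWritable copy = new FlowWritable();
		copy.readFields(in);

		// Both lines must print the same four counters
		System.out.println(original);
		System.out.println(copy);
	}
}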

package partition;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;
/**
 * @ClassName: MyPartition
 * @Description: Partitions by phone number: well-formed numbers get partition 0,
 *               everything else gets partition 1. With two partitions, two reduce
 *               tasks run and write to two separate output files (0 and 1).
 * @param K k2 (the map output key); V v2 (the map output value)
 * @author zhangweixiang
 * @date 2014-03-06 15:02:29
 */
public class MyPartition extends HashPartitioner<Text, FlowWritable> {
	@Override
	public int getPartition(Text key, FlowWritable value, int numReduceTasks) {
		// Well-formed mobile numbers have exactly 11 digits and go to
		// partition 0; anything else (an IP address, say) goes to partition 1
		int p = 0;
		if (key.toString().length() != 11) {
			p = 1;
		}
		return p;
	}
}
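One caveat with hard-coded partition numbers: getPartition() must never return a value greater than or equal to numReduceTasks, or the job fails exactly the way the exception below shows. A slightly more defensive variant (my own sketch, not from the lesson; the class name is hypothetical) extends Partitioner directly and clamps the result:

package partition;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical defensive variant of MyPartition
public class SafePartition extends Partitioner<Text, FlowWritable> {
	@Override
	public int getPartition(Text key, FlowWritable value, int numReduceTasks) {
		int p = (key.toString().length() == 11) ? 0 : 1;
		// Clamp so the job still runs when fewer reduce tasks are configured
		// (e.g. the single-reducer local mode used by Eclipse)
		return p % numReduceTasks;
	}
}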

Note: the job must be packaged as a jar, uploaded to the Linux machine, and run there, e.g. with hadoop jar flowcount2.jar partition.FlowCount2 (the jar name is just an example). I initially ran it straight from Eclipse without packaging, and it threw an exception.

When the job finishes it produces two files (part-r-00000 and part-r-00001), each holding the records for one partition.

The exception thrown when running directly from Eclipse:

14/03/06 15:41:13 WARN mapred.LocalJobRunner: job_local_0001
java.io.IOException: Illegal partition for 10.80.203.79 (1)
at org.apache.hadoop.mapred.MapTask$MapOutputBuffer.collect(MapTask.java:1073)
at org.apache.hadoop.mapred.MapTask$NewOutputCollector.write(MapTask.java:691)
at org.apache.hadoop.mapreduce.TaskInputOutputContext.write(TaskInputOutputContext.java:80)
at partition.FlowCount2$MyMapper.map(FlowCount2.java:120)
at partition.FlowCount2$MyMapper.map(FlowCount2.java:1)
at org.apache.hadoop.mapreduce.Mapper.run(Mapper.java:144)
at org.apache.hadoop.mapred.MapTask.runNewMapper(MapTask.java:764)
at org.apache.hadoop.mapred.MapTask.run(MapTask.java:370)
at org.apache.hadoop.mapred.LocalJobRunner$Job.run(LocalJobRunner.java:214)
14/03/06 15:41:14 INFO mapred.JobClient:  map 0% reduce 0%
14/03/06 15:41:14 INFO mapred.JobClient: Job complete: job_local_0001
14/03/06 15:41:14 INFO mapred.JobClient: Counters: 0
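What this means: the offending key, 10.80.203.79, is an IP address rather than an 11-digit phone number, so MyPartition returned partition 1. The LocalJobRunner that Eclipse uses in Hadoop 1.x runs the whole job with a single reduce task, so any partition index other than 0 is out of range. On the cluster, where two reduce tasks actually exist, the same partitioner works fine.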



Chao's (超哥) summary, for the record:

A partitioning example must be packaged as a jar to run.
Uses:
1. Produce multiple output files according to business needs.
2. Run multiple reduce tasks in parallel to improve overall job efficiency.
