1 Mapper, Reducer, and the Main Class
package com.wzt.mapreduce.secondsort;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class SecSortMain {

    public static class SecSortMapper extends Mapper<LongWritable, Text, FirstSortEntity, IntWritable> {

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] spilted = line.split(" ");
            // Print the mapper's output key/value pair to illustrate what is emitted
            System.out.println("Mapper output <" + spilted[0] + "," + spilted[1] + "> " + this);
            context.write(new FirstSortEntity(spilted[0], Integer.parseInt(spilted[1])),
                    new IntWritable(Integer.parseInt(spilted[1])));
        }
    }

    public static class SecSortReducer extends Reducer<FirstSortEntity, IntWritable, FirstSortEntity, IntWritable> {

        @Override
        protected void reduce(FirstSortEntity key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            // Printed once per call to reduce(), i.e. once per group of k2
            System.out.println("Reducer input group <" + key + ",N(N>=1)> " + this);
            StringBuffer sb = new StringBuffer();
            for (IntWritable value : values) {
                // Printed once per incoming <k2,v2> pair
                sb.append(value + " , ");
                System.out.println("Reducer input pair <" + key.toString() + "," + value.get() + "> group " + sb.toString());
            }
            context.write(key, key.getSecondkey());
            // context.write(key.getFirstkey(), new Text(sb.toString()));
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(SecSortMain.class);

        job.setMapperClass(SecSortMapper.class);
        job.setMapOutputKeyClass(FirstSortEntity.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Custom partitioner: decides which reduce task each map output key goes to
        job.setPartitionerClass(SSPartintioner.class);
        // One reduce task per partition; each reduce task produces one output file.
        // With a single reduce task the custom partitioner has no visible effect,
        // which is why its calls may not show up in the logs.
        job.setNumReduceTasks(1);

        // Sort comparator: orders the full set of map output keys
        job.setSortComparatorClass(MySSSortComparator.class);
        // Grouping comparator: groups the sorted keys before they reach reduce()
        job.setGroupingComparatorClass(GroupComparator.class);

        job.setReducerClass(SecSortReducer.class);
        job.setOutputKeyClass(FirstSortEntity.class);
        job.setOutputValueClass(IntWritable.class);

        // FileInputFormat.setInputPaths(job, "/wc/input/xiyou.txt");
        // FileOutputFormat.setOutputPath(job, new Path("/wc/output6"));
        FileInputFormat.setInputPaths(job, "/sort/input");
        FileOutputFormat.setOutputPath(job, new Path("/sort/output1"));

        job.waitForCompletion(true);
    }
}
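For context, the mapper splits each input line on a single space and expects a word followed by an integer count. The post does not show the actual data set; a made-up input file of that shape could look like this:

    hadoop 3
    spark 1
    hadoop 1
    spark 7

With the sort comparator ordering counts ascending within a word and the grouping comparator grouping on the word, each reduce() call then receives one word with its counts already in sorted order.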
2 Custom Composite Key
package com.wzt.mapreduce.secondsort;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

/**
 * Custom composite key: the word plus its count.
 * @author root
 */
public class FirstSortEntity implements WritableComparable<FirstSortEntity> {

    private Text firstkey;
    private IntWritable secondkey;

    public FirstSortEntity() {
    }

    public FirstSortEntity(Text firstkey, IntWritable secondkey) {
        this.firstkey = firstkey;
        this.secondkey = secondkey;
    }

    public FirstSortEntity(String firstkey, int secondkey) {
        this.firstkey = new Text(firstkey);
        this.secondkey = new IntWritable(secondkey);
    }

    public Text getFirstkey() {
        return firstkey;
    }

    public void setFirstkey(Text firstkey) {
        this.firstkey = firstkey;
    }

    public IntWritable getSecondkey() {
        return secondkey;
    }

    public void setSecondkey(IntWritable secondkey) {
        this.secondkey = secondkey;
    }

    /**
     * Serialization.
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(firstkey.toString());
        out.writeInt(secondkey.get());
    }

    /**
     * Deserialization.
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        firstkey = new Text(in.readUTF());
        secondkey = new IntWritable(in.readInt());
    }

    /**
     * Called after the map output is emitted to sort the keys.
     * After the map phase, if a partitioner is configured and there is more
     * than one reduce task, the partitioner is also invoked to split the data.
     */
    @Override
    public int compareTo(FirstSortEntity entity) {
        // Controls ascending vs. descending order:
        // comparing this object against the argument gives ascending order,
        // reversing the two gives descending order.
        return this.firstkey.compareTo(entity.getFirstkey());
        // return this.secondkey.get() > entity.getSecondkey().get() ? 1 : -1;
    }

    @Override
    public String toString() {
        return this.getFirstkey() + " " + this.getSecondkey() + " ";
    }
}
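As a quick sanity check of write() and readFields(), the key can be serialized to a byte array and read back outside the framework. This is a minimal sketch, not part of the original post; the class name FirstSortEntityRoundTrip is made up:

package com.wzt.mapreduce.secondsort;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class FirstSortEntityRoundTrip {
    public static void main(String[] args) throws IOException {
        FirstSortEntity original = new FirstSortEntity("hadoop", 3);

        // Serialize the key the same way the framework would
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        original.write(new DataOutputStream(bytes));

        // Deserialize into a fresh instance and compare
        FirstSortEntity copy = new FirstSortEntity();
        copy.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        System.out.println("original: " + original + " copy: " + copy
                + " compareTo: " + original.compareTo(copy));
    }
}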
3 Custom Partitioner
package com.wzt.mapreduce.secondsort;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.mapreduce.Partitioner;

// Custom partitioner
public class SSPartintioner extends Partitioner<FirstSortEntity, IntWritable> {

    /**
     * key   - the key emitted by the mapper
     * value - the value emitted by the mapper
     * Every map output record passes through this method; records for which it
     * returns the same value end up in the same partition. The data then sits
     * in N partitions, and within each partition it is sorted.
     */
    @Override
    public int getPartition(FirstSortEntity key, IntWritable value, int numPartitions) {
        System.out.println("Partitioner key:" + key.getFirstkey() + " value:" + value + " "
                + ((key.getFirstkey().hashCode() & Integer.MAX_VALUE) % numPartitions) + " " + this);
        // System.out.println("Partitioner key:" + key.getFirstkey() + " value:" + value + " "
        //         + ((key.getSecondkey().get() & Integer.MAX_VALUE) % numPartitions) + " " + this);
        return (key.getFirstkey().hashCode() & Integer.MAX_VALUE) % numPartitions;
        // return (key.getSecondkey().get() & Integer.MAX_VALUE) % numPartitions;
    }
}
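The hashCode() & Integer.MAX_VALUE mask clears the sign bit, so the modulo can never produce a negative partition index. A minimal sketch that exercises the same computation locally; the class name PartitionDemo, the sample words, and the choice of two partitions are made up for illustration:

package com.wzt.mapreduce.secondsort;

import org.apache.hadoop.io.IntWritable;

public class PartitionDemo {
    public static void main(String[] args) {
        SSPartintioner partitioner = new SSPartintioner();
        int numPartitions = 2; // pretend the job ran with two reduce tasks

        for (String word : new String[] {"hadoop", "spark", "hive"}) {
            FirstSortEntity key = new FirstSortEntity(word, 1);
            // Same formula as getPartition(): mask the sign bit, then take the modulo
            int partition = partitioner.getPartition(key, new IntWritable(1), numPartitions);
            System.out.println(word + " -> partition " + partition);
        }
    }
}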
My understanding is that everything above runs during the map phase, i.e. locally on the map side; the pieces below run in the stage between map and reduce.
4 Custom Overall Sort Comparator
package com.wzt.mapreduce.secondsort;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * Custom sort comparator: orders keys by word, then by count within the same word.
 * @author root
 */
public class MySSSortComparator extends WritableComparator {

    public MySSSortComparator() {
        // Register the key type this comparator handles
        super(FirstSortEntity.class, true);
    }

    /**
     * Runs before reduce() processes the data and orders the full key set.
     * Logic: keys with the same first field are ordered by the second field;
     * otherwise they are ordered by the first field.
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        FirstSortEntity e1 = (FirstSortEntity) a;
        FirstSortEntity e2 = (FirstSortEntity) b;
        System.out.println(e1.getFirstkey() + "==MySSSortComparator sorting .. " + e2.getFirstkey());
        // Keys belong to the same group when their first fields are equal
        if (!e1.getFirstkey().equals(e2.getFirstkey())) {
            return e1.getFirstkey().compareTo(e2.getFirstkey());
        } else {
            return e1.getSecondkey().get() - e2.getSecondkey().get();
        }
    }
}
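To see the ordering this comparator defines, it can be applied directly to a few in-memory keys. A minimal sketch, not from the original post; the class name SortComparatorDemo and the sample words are made up:

package com.wzt.mapreduce.secondsort;

import java.util.ArrayList;
import java.util.List;

public class SortComparatorDemo {
    public static void main(String[] args) {
        MySSSortComparator comparator = new MySSSortComparator();

        List<FirstSortEntity> keys = new ArrayList<>();
        keys.add(new FirstSortEntity("spark", 7));
        keys.add(new FirstSortEntity("hadoop", 3));
        keys.add(new FirstSortEntity("hadoop", 1));

        // Sort with the same comparator the shuffle would use:
        // first by word, then by count within the same word
        keys.sort((a, b) -> comparator.compare(a, b));

        for (FirstSortEntity key : keys) {
            System.out.println(key);
        }
        // Expected order: hadoop 1, hadoop 3, spark 7
    }
}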
5 Custom Grouping Comparator
package com.wzt.mapreduce.secondsort;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

// Grouping strategy for the composite key:
// applied before the data reaches reduce() to decide which keys form one group
public class GroupComparator extends WritableComparator {

    public GroupComparator() {
        // Register the key type this comparator handles
        super(FirstSortEntity.class, true);
    }

    /**
     * Groups the sorted data: keys with the same first field are placed in the
     * same group, i.e. their values share one Iterable in a single reduce() call.
     */
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        FirstSortEntity e1 = (FirstSortEntity) a;
        FirstSortEntity e2 = (FirstSortEntity) b;
        System.out.println(e1.getFirstkey() + "==GroupComparator = grouping ==" + e2.getFirstkey());
        return e1.getFirstkey().toString().compareTo(e2.getFirstkey().toString());
        // return e1.getSecondkey().compareTo(e2.getSecondkey());
    }
}
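The practical effect of the grouping comparator is that any two keys it reports as equal (return value 0) share one reduce() call, even when their counts differ. A minimal sketch, not from the original post; the class name GroupComparatorDemo and the sample keys are made up:

package com.wzt.mapreduce.secondsort;

public class GroupComparatorDemo {
    public static void main(String[] args) {
        GroupComparator grouping = new GroupComparator();

        FirstSortEntity a = new FirstSortEntity("hadoop", 1);
        FirstSortEntity b = new FirstSortEntity("hadoop", 3);
        FirstSortEntity c = new FirstSortEntity("spark", 1);

        // 0 means "same group": a and b share a reduce() call although their counts differ
        System.out.println("a vs b: " + grouping.compare(a, b));
        // Non-zero means "different group": c starts a new reduce() call
        System.out.println("a vs c: " + grouping.compare(a, c));
    }
}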
After that, the reduce() method in the main class processes the grouped data.
The class below is kept only as a personal note and is not used in this example:
package com.wzt.mapreduce.secondsort;

import java.io.ByteArrayInputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.IOException;

import org.apache.hadoop.io.RawComparator;

// Custom grouping comparator implemented against RawComparator.
// Kept as a sketch only: it is not registered on the job and has not been tested.
public class SSGroupComparator implements RawComparator<FirstSortEntity> {

    @Override
    public int compare(FirstSortEntity o1, FirstSortEntity o2) {
        return Integer.compare(o1.getSecondkey().get(), o2.getSecondkey().get());
    }

    @Override
    public int compare(byte[] b1, int s1, int l1, byte[] b2, int s2, int l2) {
        // The serialized keys can be deserialized like this
        System.out.println("SSGroupComparator custom grouping =");
        // Honour the offset and length so only the relevant slice of each buffer is read
        ByteArrayInputStream bis = new ByteArrayInputStream(b1, s1, l1);
        DataInput in1 = new DataInputStream(bis);
        FirstSortEntity entity1 = new FirstSortEntity();
        ByteArrayInputStream bis2 = new ByteArrayInputStream(b2, s2, l2);
        DataInput in2 = new DataInputStream(bis2);
        FirstSortEntity entity2 = new FirstSortEntity();
        try {
            entity1.readFields(in1);
            entity2.readFields(in2);
        } catch (IOException e) {
            e.printStackTrace();
        }
        return entity1.getFirstkey().compareTo(entity2.getFirstkey());
    }
}