Hadoop's sorting capability is very powerful: it reportedly sorted 1 TB of data in a little over a minute. The goal of this article is to show how to take advantage of Hadoop's built-in sorting to sort our own data. When designing a generic sort I used Java reflection, and using reflection inside Hadoop can run into a few pitfalls, which I will point out later.
First, some background: Hadoop's sorting happens during the shuffle that follows the map phase (or the combine phase, if one is configured). In the shuffle, Hadoop sorts the key/value pairs produced by the map tasks by key and then ships them to the reducers. This article is not meant to dissect how Hadoop implements this internally; it is meant to show you how to use it to sort your own data, so let's get straight to the point.
Hadoop ships with many types that can serve as keys, such as IntWritable, DoubleWritable and Text. If the key we emit from the map side is a primitive value, we can simply use one of these built-in types directly.
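For instance, a minimal mapper that sorts integers simply by emitting them as IntWritable keys might look like the sketch below. This is only an illustration; the class name and the assumption that every input line holds a single integer are mine, not part of the code discussed later.

import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical example: each input line holds one integer; emitting it as the key
// lets the shuffle phase do the sorting for us.
public class SimpleIntSortMapper extends Mapper<Object, Text, IntWritable, IntWritable> {

    private static final IntWritable ONE = new IntWritable(1);

    @Override
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        int number = Integer.parseInt(value.toString().trim());
        context.write(new IntWritable(number), ONE);   // the key carries the value to be sorted
    }
}

If the key needs to combine several fields, however, we have to implement WritableComparable ourselves. The IntPairs class below is one such composite key; pay attention to the comments about the no-argument constructor and about serializing the length of the list.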
package toolbox.algorithms.neuralnetwork;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

public class IntPairs implements WritableComparable<IntPairs> {

    int number1;
    int number2;
    List<String> list = null;

    /**
     * A no-argument constructor is mandatory: Hadoop instantiates the key class
     * through reflection, and the job fails without it.
     */
    public IntPairs() {
    }

    /**
     * How to restore the object from a SequenceFile.
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        number1 = in.readInt();
        number2 = in.readInt();
        int listLen = in.readInt();         // read the length of the list first
        list = new ArrayList<String>();     // fields not set in the constructor must be initialized here; primitives need no initialization
        for (int i = 0; i < listLen; i++) {
            Text text = new Text();
            text.readFields(in);            // do not use in.readLine() here, it introduces stray characters
            list.add(text.toString());
        }
    }

    /**
     * How to serialize the object into a SequenceFile.
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(number1);
        out.writeInt(number2);              // must mirror the order used in readFields
        out.writeInt(list.size());          // writing the list length is essential
        for (int i = 0; i < list.size(); i++) {
            new Text(list.get(i)).write(out);
        }
    }

    /**
     * How keys compare to each other: order by number1, break ties with number2.
     */
    @Override
    public int compareTo(IntPairs o) {
        return (number1 == o.number1) ? Integer.compare(number2, o.number2)
                                      : Integer.compare(number1, o.number1);
    }
}
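To actually use a composite key like this, a mapper emits IntPairs instances and the job registers the class as the key type. The fragment below is only an illustrative sketch: the input layout ("num1 num2 tag1 tag2 ...") and the mapper name are assumptions of mine, not part of the classes in this article.

package toolbox.algorithms.neuralnetwork;

import java.io.IOException;
import java.util.ArrayList;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

// Hypothetical usage sketch: assumes each input line looks like "num1 num2 tag1 tag2 ...".
public class IntPairsMapper extends Mapper<Object, Text, IntPairs, IntWritable> {

    private static final IntWritable ONE = new IntWritable(1);

    @Override
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] parts = value.toString().split("\\s+");
        IntPairs pair = new IntPairs();            // same package, so the fields are visible here
        pair.number1 = Integer.parseInt(parts[0]);
        pair.number2 = Integer.parseInt(parts[1]);
        pair.list = new ArrayList<String>();
        for (int i = 2; i < parts.length; i++) {
            pair.list.add(parts[i]);
        }
        context.write(pair, ONE);
    }
}

The driver would then register the key class with job.setMapOutputKeyClass(IntPairs.class) (or job.setOutputKeyClass if the reducer emits the same type), and the shuffle will order the records by number1, then number2.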
package toolbox.algorithms.sort;

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.mahout.common.HadoopUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

public class SortDriver {

    private static final Logger log = LoggerFactory.getLogger(SortDriver.class);

    /**
     * args[0] is the first argument (not used; main() starts reading at args[1])
     * args[1] is the input path
     * args[2] is the output path
     * args[3] is whether to sort in descending order
     * args[4] is the key data type
     * @param args
     * @throws ClassNotFoundException
     * @throws InterruptedException
     * @throws IOException
     */
    public static void main(String[] args) throws ClassNotFoundException, IOException, InterruptedException {
        if (args.length < 3) {
            System.out.println("arguments error!");
            System.exit(-1);
        }
        if (args.length == 3)
            run(args[1], args[2], true, "String");
        else if (args.length == 4)
            run(args[1], args[2], Boolean.parseBoolean(args[3]), "String");
        else
            run(args[1], args[2], Boolean.parseBoolean(args[3]), args[4]);
    }

    @SuppressWarnings("unchecked")
    public static void run(String inputPath, String outputPath, boolean desc, String type)
            throws ClassNotFoundException, IOException, InterruptedException {
        Class<? extends WritableComparable> outputType;
        if (type.equals("int"))
            outputType = (Class<? extends WritableComparable>) Class.forName("toolbox.common.IntWritableParser");
        else if (type.equals("String"))
            outputType = (Class<? extends WritableComparable>) Class.forName("toolbox.common.TextWritableParser");
        else {
            log.info("use user-defined output data type!");
            outputType = (Class<? extends WritableComparable>) Class.forName(type);
        }
        run(new Path(inputPath), new Path(outputPath), desc, outputType);
    }

    public static void run(Path inputPath, Path outputPath, boolean desc, Class<? extends WritableComparable> outType)
            throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        // HadoopUtil.delete(conf, outputPath);   // optionally clear the output directory first
        conf.set("toolbox.algorithms.sort.writable_type", outType.getName());   // tell the mapper which key class to build
        Job job = new Job(conf, "sort");
        job.setJarByClass(SortDriver.class);
        job.setMapperClass(SortMapper.class);
        job.setCombinerClass(SortReducer.class);  // the reducer doubles as a combiner since it only sums counts
        job.setReducerClass(SortReducer.class);
        job.setOutputKeyClass(outType);
        job.setOutputValueClass(IntWritable.class);
        // note: the desc flag is accepted but not yet applied to the job's sort order
        FileInputFormat.addInputPath(job, inputPath);
        FileOutputFormat.setOutputPath(job, outputPath);
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
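Assuming the classes are packaged into a jar called toolbox.jar (the jar name is just an example), the driver can be launched with something like:

hadoop jar toolbox.jar toolbox.algorithms.sort.SortDriver sort /path/to/input /path/to/output false int

Note that main() starts reading at args[1], so the first token after the class name ("sort" here) is effectively a placeholder that the driver never reads.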
package toolbox.algorithms.sort;

import java.io.IOException;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Mapper;

/**
 * A generic mapper that builds its output key through reflection. The key class is read
 * from the configuration and must provide a static parse(String) factory returning an
 * instance of the key class, plus a copy constructor taking an instance of the same class.
 */
public class SortMapper extends Mapper<Object, Text, WritableComparable<?>, IntWritable> {

    protected static IntWritable ONE = new IntWritable(1);
    protected static Class<? extends WritableComparable> writable_type;
    protected static Constructor<?> construct_method;
    protected static Method parse_method;

    @SuppressWarnings("unchecked")
    @Override
    public void setup(Context context) {
        Configuration conf = context.getConfiguration();
        try {
            writable_type = (Class<? extends WritableComparable>)
                    Class.forName(conf.get("toolbox.algorithms.sort.writable_type"));
            parse_method = writable_type.getDeclaredMethod("parse", String.class);   // static factory: parse(String)
            construct_method = writable_type.getConstructor(writable_type);          // copy constructor
        } catch (ClassNotFoundException e) {
            e.printStackTrace();
        } catch (NoSuchMethodException e) {
            e.printStackTrace();
        } catch (SecurityException e) {
            e.printStackTrace();
        }
    }

    @Override
    public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
        StringTokenizer stk = new StringTokenizer(value.toString());
        while (stk.hasMoreElements()) {
            String element = stk.nextToken();
            try {
                // parse is static, so the target passed to invoke() is ignored;
                // the copy constructor then wraps the parsed key instance
                Object e = construct_method.newInstance(parse_method.invoke(null, element));
                context.write((WritableComparable<?>) e, ONE);
            } catch (IllegalArgumentException e1) {
                e1.printStackTrace();
            } catch (InstantiationException e1) {
                e1.printStackTrace();
            } catch (IllegalAccessException e1) {
                e1.printStackTrace();
            } catch (InvocationTargetException e1) {
                e1.printStackTrace();
            }
        }
    }
}
package toolbox.algorithms.sort;

import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapreduce.Reducer;

public class SortReducer extends Reducer<WritableComparable<?>, IntWritable, WritableComparable<?>, IntWritable> {

    @Override
    public void reduce(WritableComparable<?> key, Iterable<IntWritable> values, Context context)
            throws IOException, InterruptedException {
        int sum = 0;
        for (IntWritable value : values)
            sum += value.get();
        context.write(key, new IntWritable(sum));
    }
}
package toolbox.common;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.WritableComparable;

public class IntWritableParser implements WritableComparable<IntWritableParser> {

    int oValue;                    // the raw int value
    IntWritable pValue = null;     // the Writable wrapper that is actually serialized

    /**
     * No-argument constructor: required because Hadoop instantiates key classes by reflection.
     */
    public IntWritableParser() {
        pValue = new IntWritable();
    }

    public IntWritableParser(int v) {
        oValue = v;
        pValue = new IntWritable(v);
    }

    public IntWritableParser(IntWritableParser v) {
        oValue = v.getoValue();
        pValue = v.getpValue();
    }

    /**
     * Static factory used by SortMapper through reflection: turn one token into a key instance.
     */
    public static IntWritableParser parse(String arg) {
        return new IntWritableParser(Integer.parseInt(arg));
    }

    @Override
    public void write(DataOutput out) throws IOException {
        pValue.write(out);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        pValue.readFields(in);
        oValue = pValue.get();     // keep the raw value in sync with the wrapper
    }

    @Override
    public int compareTo(IntWritableParser o) {
        return pValue.compareTo(o.pValue);   // during the sort, keys are compared against keys of the same class
    }

    @Override
    public String toString() {
        return pValue.toString();  // TextOutputFormat writes keys with toString()
    }

    public IntWritable getpValue() {
        return pValue;
    }

    public void setpValue(IntWritable pValue) {
        this.pValue = pValue;
    }

    public int getoValue() {
        return oValue;
    }

    public void setoValue(int oValue) {
        this.oValue = oValue;
    }
}
package toolbox.common;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

public class TextWritableParser implements WritableComparable<TextWritableParser> {

    String oValue;     // the raw string value
    Text pValue;       // the Writable wrapper that is actually serialized

    /**
     * No-argument constructor: required because Hadoop instantiates key classes by reflection.
     */
    public TextWritableParser() {
        pValue = new Text();
    }

    public TextWritableParser(String value) {
        oValue = value;
        pValue = new Text(value);
    }

    public TextWritableParser(TextWritableParser value) {
        oValue = value.getoValue();
        pValue = value.getpValue();
    }

    /**
     * Static factory used by SortMapper through reflection: turn one token into a key instance.
     */
    public static TextWritableParser parse(String arg) {
        return new TextWritableParser(arg);
    }

    @Override
    public void write(DataOutput out) throws IOException {
        pValue.write(out);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        pValue.readFields(in);
        oValue = pValue.toString();   // keep the raw value in sync with the wrapper
    }

    @Override
    public int compareTo(TextWritableParser o) {
        return pValue.compareTo(o.pValue);
    }

    @Override
    public String toString() {
        return pValue.toString();     // TextOutputFormat writes keys with toString()
    }

    public String getoValue() {
        return oValue;
    }

    public void setoValue(String oValue) {
        this.oValue = oValue;
    }

    public Text getpValue() {
        return pValue;
    }

    public void setpValue(Text pValue) {
        this.pValue = pValue;
    }
}
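One loose end: SortDriver accepts a desc flag but never applies it, so the job always sorts in the key's natural (ascending) order. One possible way to honor the flag is to register a comparator that reverses the natural order. The sketch below is only an illustration of that idea, not part of the original code; it is hard-wired to IntWritableParser, and a production version would have to pick the key class from the configuration the same way SortMapper does.

package toolbox.algorithms.sort;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

import toolbox.common.IntWritableParser;

// Hypothetical sketch: a comparator that flips the natural key order.
public class DescendingKeyComparator extends WritableComparator {

    public DescendingKeyComparator() {
        super(IntWritableParser.class, true);   // true: let the comparator create key instances for deserialization
    }

    @SuppressWarnings("rawtypes")
    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        return super.compare(b, a);             // swap the arguments to invert the order defined by compareTo()
    }
}

With such a class in place, the driver's run(Path, Path, boolean, Class) method could add "if (desc) job.setSortComparatorClass(DescendingKeyComparator.class);" before submitting the job.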