简化陆嘉恒《Hadoop实战(第2版)》5.4节单表关联的代码时遇到了空指针异常(NullPointerException),经分析是逻辑问题,在此做个记录。
环境:Mac OS X 10.9.5, IntelliJ IDEA 13.1.5, Hadoop 1.2.1
简化后的代码如下;最初运行时在reduce阶段遇到了NullPointerException。
1 public class STjoinEx { 2 private static final String TIMES = "TIMES"; 3 4 public static void main(String[] args) throws Exception { 5 Configuration configuration = new Configuration(); 6 configuration.setInt(TIMES, 1); 7 String[] remainingArgs = new GenericOptionsParser(configuration, args).getRemainingArgs(); 8 if (remainingArgs.length != 2) { 9 System.err.println("STjoinEx <input> <output>"); 10 System.exit(2); 11 } 12 13 Job job = new Job(configuration, STjoinEx.class.getSimpleName()); 14 job.setJarByClass(STjoinEx.class); 15 job.setMapperClass(Map.class); 16 job.setReducerClass(Reduce.class); 17 job.setInputFormatClass(KeyValueTextInputFormat.class); 18 job.setOutputFormatClass(TextOutputFormat.class); 19 job.setOutputKeyClass(Text.class); 20 job.setOutputValueClass(Text.class); 21 22 FileInputFormat.setInputPaths(job, new Path(remainingArgs[0])); 23 FileOutputFormat.setOutputPath(job, new Path(remainingArgs[1])); 24 25 System.exit(job.waitForCompletion(true) ? 0 : 1); 26 27 } 28 29 public static class Map extends Mapper<Text, Text, Text, Text> { 30 final static Text LEFT_TABLE = new Text(); 31 final static Text RIGHT_TABLE = new Text(); 32 33 @Override 34 protected void map(Text key, Text value, Context context) throws IOException, InterruptedException { 35 // left table 36 LEFT_TABLE.set("1 " + value); 37 context.write(key, LEFT_TABLE); 38 // right table 39 RIGHT_TABLE.set("2 " + key); 40 context.write(value, RIGHT_TABLE); 41 } 42 } 43 44 public static class Reduce extends Reducer<Text, Text, Text, Text> { 45 private static final int INDENT = 2; 46 private static final Text GRAND_PARENT = new Text(); 47 private static final Text GRAND_CHILD = new Text(); 48 49 @Override 50 protected void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException { 51 // output header 52 int times = context.getConfiguration().getInt(TIMES, 1); 53 if (times == 1) { 54 context.write(new Text("grandChild"), new Text("grandParent")); 55 
context.getConfiguration().setInt(TIMES, ++times); 56 } 57 58 // prepare matrix 59 int headChar = 0; 60 String[] grandChild = new String[10]; 61 String[] grandParent = new String[10]; 62 int grandChildNum = 0; 63 int grandParentNum = 0; 64 65 for (Text value : values) { 66 headChar = value.charAt(0); 67 if (headChar == '1') { 68 grandParent[grandParentNum] = value.toString().substring(2); 69 grandParentNum++; 70 } else { 71 grandChild[grandChildNum] = value.toString().substring(2); 72 grandChildNum++; 73 } 74 } 75 76 // multiply 77 if (grandChildNum != 0 && grandChildNum != 0) { 78 for (int i = 0; i < grandChildNum; i++) { 79 GRAND_CHILD.set(grandChild[i]); 80 for (int j = 0; j < grandParentNum; j++) { 81 GRAND_PARENT.set(grandParent[j]); 82 context.write(GRAND_CHILD, GRAND_PARENT); 83 } 84 } 85 } 86 } 87 } 88 }
执行输出为
1 14/10/07 11:12:51 INFO mapred.JobClient: map 0% reduce 0% 2 14/10/07 11:12:54 INFO mapred.JobClient: map 100% reduce 0% 3 14/10/07 11:13:01 INFO mapred.JobClient: map 100% reduce 33% 4 14/10/07 11:13:04 INFO mapred.JobClient: Task Id : attempt_201410021756_0048_r_000000_0, Status : FAILED 5 java.lang.NullPointerException 6 at org.apache.hadoop.io.Text.encode(Text.java:388) 7 at org.apache.hadoop.io.Text.set(Text.java:178) 8 at main.ch5.STjoinEx$Reduce.reduce(STjoinEx.java:96) 9 at main.ch5.STjoinEx$Reduce.reduce(STjoinEx.java:61) 10 at org.apache.hadoop.mapreduce.Reducer.run(Reducer.java:177) 11 at org.apache.hadoop.mapred.ReduceTask.runNewReducer(ReduceTask.java:649) 12 at org.apache.hadoop.mapred.ReduceTask.run(ReduceTask.java:418) 13 at org.apache.hadoop.mapred.Child$4.run(Child.java:255) 14 at java.security.AccessController.doPrivileged(Native Method) 15 at javax.security.auth.Subject.doAs(Subject.java:396) 16 at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1190) 17 at org.apache.hadoop.mapred.Child.main(Child.java:249)
从异常堆栈可以看出,出错位置在源码第96行,即 if (grandChildNum != 0 && grandChildNum != 0) 这一行:两个判断条件重复了,将其中一个改成 grandParentNum 即可。
执行结果
1 grandChild grandParent 2 Jone Alice 3 Jone Jesse 4 Tom Alice 5 Tom Jesse 6 Tom Mary 7 Tom Ben 8 Jone Mary 9 Jone Ben 10 Philip Alice 11 Philip Jesse 12 Mark Alice 13 Mark Jesse