hadoop 自定义inputformat和outputformat

http://blackproof.iteye.com/blog/1


hadoop的inputformat和outputformat

 

最好的例子vertica :虽然是在pig中实现的udf,但是就是hadoop的inputformat和outputformat,在hive里也可以照用,贴个下载的地址:http://blackproof.iteye.com/blog/1791995

 

再贴一个项目中,在实现hadoop join时,用的inputformat和outputformat的简单实例:

hadoop join在http://blackproof.iteye.com/blog/1757530

   自定义inputformat(泛型是mapper的input)

Java代码   收藏代码
  1. public class MyInputFormat extends FileInputFormat<MultiKey,Employee> {  
  2.       
  3.     public MyInputFormat(){}  
  4.   
  5.     @Override  
  6.     public RecordReader<MultiKey, Employee> createRecordReader(  
  7.             InputSplit split, TaskAttemptContext context) throws IOException,  
  8.             InterruptedException {  
  9.         // TODO Auto-generated method stub  
  10.         return new MyRecordReader();  
  11.     }  
  12.       
  13.     public static class MyRecordReader extends RecordReader<MultiKey, Employee>{  
  14.   
  15.         public LineReader in;  
  16.         public MultiKey key;  
  17.         public Employee value;  
  18.         public StringTokenizer token = null;  
  19.           
  20.         public Text line;  
  21.           
  22.         @Override  
  23.         public void initialize(InputSplit split, TaskAttemptContext context)  
  24.                 throws IOException, InterruptedException {  
  25.             // TODO Auto-generated method stub  
  26.             FileSplit fileSplit = (FileSplit)split;  
  27.             Configuration job = context.getConfiguration();  
  28.             Path file = fileSplit.getPath();  
  29.             FileSystem fs = file.getFileSystem(job);  
  30.               
  31.             FSDataInputStream filein = fs.open(file);  
  32.             in = new LineReader(filein, job);  
  33.               
  34.             key = new MultiKey();  
  35.             value = new Employee();  
  36.             line = new Text();  
  37.         }  
  38.   
  39.         @Override  
  40.         public boolean nextKeyValue() throws IOException, InterruptedException {  
  41.   
  42.             int linesize = in.readLine(line);  
  43.             if(linesize==0)  
  44.                 return false;  
  45.             String[] pieces = line.toString().split(",");  
  46.             int i = Integer.valueOf(pieces[0]);  
  47.             switch (i) {  
  48.             case 1:  
  49.                 value.setEmpName(pieces[1]);  
  50.                 value.setFlag(1);  
  51.                 break;  
  52.   
  53.             default:  
  54.                 value.setDepartName(pieces[1]);  
  55.                 value.setFlag(2);  
  56.                 break;  
  57.             }  
  58.             value.setDepartId(pieces[2]);  
  59.             value.setDepartNo(pieces[3]);  
  60.               
  61.             key.setDepartId(value.getDepartId());  
  62.             key.setDepartNo(value.getDepartNo());  
  63.             return true;  
  64.         }  
  65.   
  66.         @Override  
  67.         public MultiKey getCurrentKey() throws IOException,  
  68.                 InterruptedException {  
  69.             // TODO Auto-generated method stub  
  70.             return key;  
  71.         }  
  72.   
  73.         @Override  
  74.         public Employee getCurrentValue() throws IOException,  
  75.                 InterruptedException {  
  76.             // TODO Auto-generated method stub  
  77.             return value;  
  78.         }  
  79.   
  80.         @Override  
  81.         public float getProgress() throws IOException, InterruptedException {  
  82.             // TODO Auto-generated method stub  
  83.             return 0;  
  84.         }  
  85.   
  86.         @Override  
  87.         public void close() throws IOException {  
  88.             // TODO Auto-generated method stub  
  89.               
  90.         }  
  91.           
  92.     }  
  93.   
  94. }  

 

 

   自定义outputformat(泛型是reduce的输出)

Java代码   收藏代码
  1. public class MyOutputFormat extends FileOutputFormat<Text, Employee> {  
  2.   
  3.     @Override  
  4.     public RecordWriter<Text, Employee> getRecordWriter(  
  5.             TaskAttemptContext job) throws IOException, InterruptedException {  
  6.         // TODO Auto-generated method stub  
  7.         Configuration conf = job.getConfiguration();  
  8.         Path file = getDefaultWorkFile(job, "");  
  9.         FileSystem fs = file.getFileSystem(conf);  
  10.         FSDataOutputStream fileOut = fs.create(file, false);  
  11.         return new MyRecordWriter(fileOut);  
  12.     }  
  13.       
  14.     public static class MyRecordWriter extends RecordWriter<Text, Employee>{  
  15.   
  16.         protected DataOutputStream out;  
  17.         private final byte[] keyValueSeparator;  
  18.          public static final String NEW_LINE = System.getProperty("line.separator");  
  19.           
  20.         public MyRecordWriter(DataOutputStream out){  
  21.             this(out,":");  
  22.         }  
  23.           
  24.         public MyRecordWriter(DataOutputStream out,String keyValueSeparator){  
  25.             this.out = out;  
  26.             this.keyValueSeparator = keyValueSeparator.getBytes();  
  27.         }  
  28.           
  29.         @Override  
  30.         public void write(Text key, Employee value) throws IOException,  
  31.                 InterruptedException {  
  32.             if(key!=null){  
  33.                 out.write(key.toString().getBytes());  
  34.                 out.write(keyValueSeparator);  
  35.             }  
  36.             out.write(value.toString().getBytes());  
  37.             out.write(NEW_LINE.getBytes());  
  38.         }  
  39.   
  40.         @Override  
  41.         public void close(TaskAttemptContext context) throws IOException,  
  42.                 InterruptedException {  
  43.             out.close();  
  44.         }  
  45.           
  46.     }  
  47.   
  48. }  

 

转自:http://blackproof.iteye.com/blog/1806263

你可能感兴趣的:(hadoop 自定义inputformat和outputformat)