RCFile Output in MapReduce: An RCFileOutputFormat Implementation and Its Application

A custom implementation, RCFileOutputFormat.java:

import java.io.IOException;  
  
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;  
import org.apache.hadoop.fs.Path;   
import org.apache.hadoop.io.compress.DefaultCodec;  
import org.apache.hadoop.hive.ql.io.RCFile;  
import org.apache.hadoop.hive.serde2.columnar.BytesRefArrayWritable;  
import org.apache.hadoop.io.LongWritable;  
import org.apache.hadoop.io.WritableComparable;  
import org.apache.hadoop.io.compress.CompressionCodec;  
import org.apache.hadoop.mapreduce.RecordWriter;  
import org.apache.hadoop.mapreduce.TaskAttemptContext;  
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;  
 
import org.apache.hadoop.util.ReflectionUtils;  
  
/**
 * RCFileOutputFormat: a FileOutputFormat that writes each BytesRefArrayWritable
 * value as one row of an RCFile; the record key is ignored.
 */
public class RCFileOutputFormat  
        extends  
        FileOutputFormat<WritableComparable<LongWritable>, BytesRefArrayWritable> {  
  
     /**
      * Sets the number of columns in the given configuration.
      *
      * @param conf
      *            the configuration on which to set the column count
      * @param columnNum
      *            the number of columns for RCFile's Writer
      */
     public static void setColumnNumber(Configuration conf, int columnNum) {  
         assert columnNum > 0;  
         conf.setInt(RCFile.COLUMN_NUMBER_CONF_STR, columnNum);  
     }  
   
     /**
      * Returns the number of columns set in the conf for writers.
      *
      * @param conf
      *            the configuration to read the column count from
      * @return number of columns for RCFile's writer
      */
     public static int getColumnNumber(Configuration conf) {  
         return conf.getInt(RCFile.COLUMN_NUMBER_CONF_STR, 0);  
    }  
   
     @Override
     public RecordWriter<WritableComparable<LongWritable>, BytesRefArrayWritable> getRecordWriter(
             TaskAttemptContext context) throws IOException, InterruptedException {
         Configuration conf = context.getConfiguration();
         // Force compressed output (old-style property name, read back by
         // getCompressOutput() below).
         conf.setBoolean("mapred.output.compress", true);
         Path outputPath = FileOutputFormat.getOutputPath(context);
         FileSystem fs = outputPath.getFileSystem(conf);
         // Normally the framework creates the output directory; make sure it exists.
         if (!fs.exists(outputPath)) {
             fs.mkdirs(outputPath);
         }
         Path file = getDefaultWorkFile(context, "");
         CompressionCodec codec = null;
         if (getCompressOutput(context)) {
             Class<?> codecClass = getOutputCompressorClass(context,
                     DefaultCodec.class);
             codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass,
                     conf);
         }
         // RCFile.Writer reads the column count (set via setColumnNumber) from conf.
         final RCFile.Writer out = new RCFile.Writer(fs, conf, file, null, codec);

         return new RecordWriter<WritableComparable<LongWritable>, BytesRefArrayWritable>() {

             @Override
             public void write(WritableComparable<LongWritable> key,
                     BytesRefArrayWritable value) throws IOException {
                 // RCFile stores only the row columns; the key is ignored.
                 out.append(value);
             }

             @Override
             public void close(TaskAttemptContext context) throws IOException,
                     InterruptedException {
                 out.close();
             }
         };
     }
}




 
Usage:

In the driver, select this output format and the RCFile value type, and record the column count in the configuration (RCFile.Writer reads it back when the record writer is created, so setColumnNumber must be called before the job is submitted):

    job.setOutputFormatClass(RCFileOutputFormat.class);
    job.setOutputValueClass(BytesRefArrayWritable.class);
    RCFileOutputFormat.setColumnNumber(job.getConfiguration(), COLUMNS);

In the map or reduce task, pack one row's columns into a BytesRefArrayWritable and emit it; the key is ignored by the record writer:

    BytesRefArrayWritable values = new BytesRefArrayWritable(COLUMNS);
    values.set(0, new BytesRefWritable(fuid.getBytes()));
    values.set(1, new BytesRefWritable(this.sid.getBytes()));
    values.set(2, new BytesRefWritable(this.times.getBytes()));
    context.write(new Text(this.uid), values);

This article originally appeared on the Linux公社 website (www.linuxidc.com). Original link: http://www.linuxidc.com/Linux/2012-08/69114.htm
