Understanding OutputFormat in MapReduce

1. What OutputFormat Does

1.1 Checks whether the output path specified for the job already exists (for FileOutputFormat, the job fails fast if it does).

1.2 Writes the job's results to the output files, via the RecordWriter it supplies.
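Both duties map directly onto methods of the new-API abstract class org.apache.hadoop.mapreduce.OutputFormat. An abridged sketch of its contract:

import java.io.IOException;

import org.apache.hadoop.mapreduce.JobContext;
import org.apache.hadoop.mapreduce.OutputCommitter;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;

// Abridged sketch mirroring the real OutputFormat contract.
public abstract class OutputFormat<K, V> {

    // Duty 1.2: returns the RecordWriter that writes key/value
    // pairs to the output files.
    public abstract RecordWriter<K, V> getRecordWriter(TaskAttemptContext context)
            throws IOException, InterruptedException;

    // Duty 1.1: validates the output specification; FileOutputFormat's
    // implementation throws if the output path already exists.
    public abstract void checkOutputSpecs(JobContext context)
            throws IOException, InterruptedException;

    // Coordinates committing task and job output.
    public abstract OutputCommitter getOutputCommitter(TaskAttemptContext context)
            throws IOException, InterruptedException;
}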

 

2. Implementations of OutputFormat

2.1 DBOutputFormat: writes the reduce output to a SQL table (over JDBC).

2.2 FileOutputFormat: writes the reduce output to files. Its main subclasses:

2.2.1 MapFileOutputFormat: writes the output as MapFiles (a special kind of SequenceFile).

2.2.2 SequenceFileOutputFormat: writes the output as SequenceFiles (see the sketch after this list for how to select it).

2.2.3 TextOutputFormat: writes plain-text output; this is the default implementation.

2.3 FilterOutputFormat: a convenience wrapper (proxy) around another OutputFormat (I haven't used it).

2.4 NullOutputFormat: discards all output, the equivalent of writing to /dev/null (I haven't used it).
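To pick one of these instead of the default, set it on the Job in the driver. A minimal sketch, assuming the mapper/reducer and input/output paths are configured elsewhere (the class name SeqOutputDriver and the key/value types are made up for illustration):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;

public class SeqOutputDriver {
    public static void main(String[] args) throws Exception {
        Job job = Job.getInstance(new Configuration(), "seq-output-demo");
        // Replace the default TextOutputFormat with SequenceFileOutputFormat.
        job.setOutputFormatClass(SequenceFileOutputFormat.class);
        // The output key/value classes must match what the job emits.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // ... mapper, reducer, input/output paths would be set here ...
    }
}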

 


3. MultipleOutputs

In some scenarios we need to write the MapReduce output to multiple files; the MultipleOutputs class handles this.

The steps for using MultipleOutputs:

3.1 Instantiate MultipleOutputs in the Mapper's setup method.

3.2 In the map method, write through the MultipleOutputs object, passing the base file name into the write call.

3.3 When finished, close the MultipleOutputs object in the cleanup method.

3.4 The generated files are named after the base name you passed in, in the form {name}-m-00000 (map side) or {name}-r-00000 (reduce side). The complete example below walks through these steps.

import java.io.IOException;
import java.util.Random;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.Tool;

public class OutputMultipleFile extends Configured implements Tool {

    public static class OutputMultipleMapper
            extends Mapper<LongWritable, Text, Text, Text> {

        private Text key1 = new Text();
        private Text value1 = new Text();
        private MultipleOutputs<Text, Text> mos;

        @Override
        protected void setup(Context context)
                throws IOException, InterruptedException {
            super.setup(context);
            // Step 3.1: instantiate MultipleOutputs once per task.
            mos = new MultipleOutputs<Text, Text>(context);
        }

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            if (value == null) {
                return;
            }
            StringTokenizer tokenizer = new StringTokenizer(value.toString());
            while (tokenizer.hasMoreTokens()) {
                String token = tokenizer.nextToken();
                key1.set(token);
                value1.set("=>" + key1);
                // Step 3.2: write through MultipleOutputs, passing the
                // base file name this record should go to.
                mos.write(key1, value1, generateFileName(key1));
            }
        }

        @Override
        protected void cleanup(Context context)
                throws IOException, InterruptedException {
            super.cleanup(context);
            // Step 3.3: close MultipleOutputs, otherwise output may be lost.
            mos.close();
        }

        // Route short tokens to "primary" and long ones to "extended".
        private String generateFileName(Text key) {
            if (key == null) {
                return "default";
            }
            int len = key.toString().length();
            if (len < 5) {
                return "primary";
            }
            return "extended";
        }
    }

    @Override
    public int run(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
        // Validate the argument count.
        if (otherArgs.length < 2) {
            System.err.println("Usage: OutputMultipleFile <in> <out>");
            System.exit(2);
        }

        Job job = Job.getInstance(conf, this.getClass().getSimpleName());
        // Set the jar by finding where this class came from.
        job.setJarByClass(OutputMultipleFile.class);

        // Set the input path.
        Path in = new Path(otherArgs[0]);
        FileInputFormat.addInputPath(job, in);

        // Set the output path.
        Path out = new Path(otherArgs[1]);
        FileOutputFormat.setOutputPath(job, out);

        // Set the Mapper to run.
        job.setMapperClass(OutputMultipleMapper.class);
        // The Mapper emits Text keys and Text values; with zero reducers
        // these are also the job's final output types.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        // Map-only job.
        job.setNumReduceTasks(0);

        boolean isSuccess = job.waitForCompletion(true);
        return isSuccess ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // Randomize the default output directory so reruns do not collide.
        int num = new Random().nextInt(1000);
        if (args == null || args.length == 0) {
            args = new String[] {
                "hdfs://hdfs-cluster/user/hadoop/input",
                "hdfs://hdfs-cluster/user/hadoop/output" + num
            };
        }

        int status = new OutputMultipleFile().run(args);
        System.exit(status);
    }
}
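Running this map-only job produces files like primary-m-00000 and extended-m-00000 under the output directory: tokens shorter than five characters land in "primary", the rest in "extended". One caveat worth knowing: because the job's default OutputFormat still opens a writer per task, empty part-m-00000 files appear alongside the MultipleOutputs files; calling LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class) in the driver is the usual way to suppress them.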

 
