Customizing MapReduce Output File Names

Preface:

By default, MapReduce produces one output file per reducer, named name-r-nnnnn, where name defaults to part and nnnnn is a sequence number counting up from 00000, which guarantees that no two reducers write to the same file.
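For reference, recent Hadoop 2.x releases also expose the base name as a configuration property; a minimal sketch, assuming your version of FileOutputFormat reads it from mapreduce.output.basename:

// Assumption: Hadoop 2.x, where FileOutputFormat takes its base name from
// "mapreduce.output.basename" (default "part"). Setting it renames the output
// files to score-r-00000, score-r-00001, ... without any custom classes.
Configuration conf = new Configuration();
conf.set("mapreduce.output.basename", "score");
Job job = Job.getInstance(conf, "word count");

The two approaches below go further: the first overrides the base name per write() call with MultipleOutputs, the second takes over the file name entirely with a custom RecordWriter.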
 

Part 1: Replace only the base file name part, so the output becomes score-r-00000

1. Use the org.apache.hadoop.mapreduce.lib.output.MultipleOutputs class.
2. The MultipleOutputs instance must be initialized in the reducer's setup() method and is best closed in cleanup().
3. Even then the job still produces an empty part-r-00000 file; add LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); to suppress it.

Code sample:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

import java.io.IOException;

/**
 * Created by HuiQ on 2019-10-16.
 */
public class WordCount {

    public static class WordCountMapper extends Mapper<Object,Text,Text,IntWritable>{
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();
        @Override
        public void map(Object key,Text value,Context context) throws IOException, InterruptedException {
            String[] words = value.toString().split(" ");
            for (String str: words){
                word.set(str);
                context.write(word,one);
            }
        }
    }

    public static class WordCountReducer extends Reducer<Text,IntWritable,Text,IntWritable> {

        private MultipleOutputs<Text, IntWritable> multipleOutputs;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            multipleOutputs = new MultipleOutputs<Text, IntWritable>(context);
        }

        @Override
        public void reduce(Text key,Iterable<IntWritable> values,Context context) throws IOException, InterruptedException {
            int total = 0;
            for (IntWritable val : values){
                total += val.get();
            }
            // Custom base file name: the third argument replaces "part", so the
            // output files become score-r-00000, score-r-00001, ...
            multipleOutputs.write(key, new IntWritable(total), "score");
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            multipleOutputs.close();
        }
    }

    public static void main (String[] args) throws Exception{
        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Suppresses the empty part-r-00000 / part-m-00000 files that would otherwise be created in the output directory
        LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); // Note: for the fully customized file name in Part 2, this line must NOT be present, otherwise the output is still named part-r-00000
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path("/huiqiang/output"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
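Beyond replacing the base name, MultipleOutputs can also route records to several differently named outputs from the same reducer. A minimal sketch based on the example above (the named-output variant is an addition of mine, not part of the original code; the name "score" is illustrative):

// In the driver, before submitting the job, declare a named output:
MultipleOutputs.addNamedOutput(job, "score", TextOutputFormat.class, Text.class, IntWritable.class);

// In the reducer, write to that named output; the files are then named score-r-00000, ...
multipleOutputs.write("score", key, new IntWritable(total));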
Part 2: To fully customize the file name, override RecordWriter

The reducer's output is fully customized by subclassing FileOutputFormat and RecordWriter: override the write() method of the RecordWriter, and have the FileOutputFormat subclass return an instance of that RecordWriter from getRecordWriter().

Code sample:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * Created by HuiQ on 2019-10-16.
 */
public class WordCount {

    public static class WordCountMapper extends Mapper<Object,Text,Text,IntWritable>{
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();
        @Override
        public void map(Object key,Text value,Context context) throws IOException, InterruptedException {
            String[] words = value.toString().split(" ");
            for (String str: words){
                word.set(str);
                context.write(word,one);
            }
        }
    }

    public static class WordCountReducer extends Reducer<Text,IntWritable,Text,IntWritable> {
    
        @Override
        public void reduce(Text key,Iterable<IntWritable> values,Context context) throws IOException, InterruptedException {
            int total = 0;
            for (IntWritable val : values){
                total += val.get();
            }
            context.write(key, new IntWritable(total));
        }
    }

    // Note: 1. the nested class must be declared static; 2. the type parameters of FileOutputFormat must match the reducer's output key/value types
    public static class MyFileOutputFormat extends FileOutputFormat<Text,IntWritable>{
        @Override
        public RecordWriter<Text, IntWritable> getRecordWriter(TaskAttemptContext job)throws IOException, InterruptedException {

            FileSystem fileSystem=FileSystem.newInstance(job.getConfiguration());
            // Fully customized output path
            final FSDataOutputStream title=fileSystem.create(new Path("/huiqiang/output/test.txt"));
            RecordWriter<Text,IntWritable> recordWriter=new RecordWriter<Text, IntWritable>() {

                @Override
                public void close(TaskAttemptContext arg0) throws IOException,
                        InterruptedException {
                    if(title!=null){
                        title.close();
                    }
                }

                @Override
                public void write(Text key, IntWritable value) throws IOException,
                        InterruptedException {
                    String fenGe = " ";      // separator between key and value
                    String charSet = "UTF-8";
                    System.out.println("key=" + key.toString());  // debug output
                    // Write the key
                    title.write(key.toString().getBytes(charSet), 0, key.toString().getBytes(charSet).length);
                    // Write the separator between key and value
                    title.write(fenGe.getBytes(charSet), 0, fenGe.getBytes(charSet).length);
                    // Write the value, followed by a newline
                    title.write(value.toString().getBytes(charSet), 0, value.toString().getBytes(charSet).length);
                    title.write("\n".getBytes(charSet), 0, "\n".getBytes(charSet).length);
                    title.flush();
                }
            };
            return recordWriter;
        }
    }

    public static void main (String[] args) throws Exception{
        Configuration conf = new Configuration();

        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        job.setOutputFormatClass(MyFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        MyFileOutputFormat.setOutputPath(job, new Path("/huiqiang/output"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
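One caveat with the RecordWriter above: the output path is hard-coded, so it only behaves as intended when the job runs a single reduce task; with several reducers every task would try to create the same /huiqiang/output/test.txt. A possible tweak (my own sketch, not from the original post) is to build the file name from the task id inside getRecordWriter():

// Sketch: give each reduce task its own file, e.g. test-0.txt, test-1.txt, ...
int taskId = job.getTaskAttemptID().getTaskID().getId();
final FSDataOutputStream title = fileSystem.create(new Path("/huiqiang/output/test-" + taskId + ".txt"));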

Reference: https://blog.csdn.net/smallpizza/article/details/78060638
