By default, MapReduce produces one output file per reducer, named name-r-nnnnn: name defaults to part, and nnnnn is a five-digit sequence number starting at 00000 (so a job with two reducers produces part-r-00000 and part-r-00001), which guarantees that no two reducers write to the same file. The two approaches below show how to customize these file names.
1. Use the org.apache.hadoop.mapreduce.lib.output.MultipleOutputs class.
2. Create the MultipleOutputs instance in the reducer's setup() method and close it in cleanup().
3. Even then, an empty part-r-00000 file is still generated; to suppress it, call LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class); so that the default output file is only created when something is actually written to it.
Sample code:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.LazyOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import java.io.IOException;
/**
* Created by HuiQ on 2019-10-16.
*/
public class WordCount {

    public static class WordCountMapper extends Mapper<Object, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String[] words = value.toString().split(" ");
            for (String str : words) {
                word.set(str);
                context.write(word, one);
            }
        }
    }

    public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        private MultipleOutputs<Text, IntWritable> multipleOutputs;

        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            multipleOutputs = new MultipleOutputs<Text, IntWritable>(context);
        }

        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int total = 0;
            for (IntWritable val : values) {
                total += val.get();
            }
            // Custom output file name: the third argument becomes the file name prefix,
            // so the output is written to score-r-nnnnn instead of part-r-nnnnn.
            multipleOutputs.write(key, new IntWritable(total), "score");
        }

        @Override
        protected void cleanup(Context context) throws IOException, InterruptedException {
            multipleOutputs.close();
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Register the output format lazily instead of via job.setOutputFormatClass(TextOutputFormat.class);
        // otherwise an empty part-r-00000 (or part-m-00000) file is still created in the output directory.
        LazyOutputFormat.setOutputFormatClass(job, TextOutputFormat.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path("/huiqiang/output"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
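With this setup the job's output directory contains files named score-r-00000, score-r-00001, ... instead of part-r-nnnnn. The third argument of multipleOutputs.write() is a base output path relative to the job output directory and can be computed per record, so different keys can even go to different files. Below is a minimal sketch of that idea, assuming one file prefix per first letter of the word; the generateFileName helper is a hypothetical name, not part of the MultipleOutputs API.

    // Hypothetical helper inside WordCountReducer: derive a file prefix from the key.
    private String generateFileName(Text key) {
        String word = key.toString();
        String first = word.isEmpty() ? "other" : word.substring(0, 1).toLowerCase();
        // The base path may contain subdirectories; "-r-nnnnn" is appended automatically,
        // so this produces files such as byletter/a-r-00000.
        return "byletter/" + first;
    }

    @Override
    public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int total = 0;
        for (IntWritable val : values) {
            total += val.get();
        }
        multipleOutputs.write(key, new IntWritable(total), generateFileName(key));
    }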
Alternatively, the reducer output can be customized by subclassing FileOutputFormat and implementing a RecordWriter: override the write() method of RecordWriter to control how each key/value pair is written, and have FileOutputFormat's getRecordWriter() return that RecordWriter instance.
Sample code:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
* Created by HuiQ on 2019-10-16.
*/
public class WordCount {

    public static class WordCountMapper extends Mapper<Object, Text, Text, IntWritable> {
        private final static IntWritable one = new IntWritable(1);
        private Text word = new Text();

        @Override
        public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
            String[] words = value.toString().split(" ");
            for (String str : words) {
                word.set(str);
                context.write(word, one);
            }
        }
    }

    public static class WordCountReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
        @Override
        public void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
            int total = 0;
            for (IntWritable val : values) {
                total += val.get();
            }
            context.write(key, new IntWritable(total));
        }
    }

    // Note: 1. the static keyword on this nested class is required;
    //       2. the type parameters of FileOutputFormat must match the reducer's output key/value types.
    public static class MyFileOutputFormat extends FileOutputFormat<Text, IntWritable> {
        @Override
        public RecordWriter<Text, IntWritable> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException {
            FileSystem fileSystem = FileSystem.newInstance(job.getConfiguration());
            // Custom output path
            final FSDataOutputStream title = fileSystem.create(new Path("/huiqiang/output/test.txt"));
            RecordWriter<Text, IntWritable> recordWriter = new RecordWriter<Text, IntWritable>() {
                @Override
                public void close(TaskAttemptContext arg0) throws IOException, InterruptedException {
                    if (title != null) {
                        title.close();
                    }
                }

                @Override
                public void write(Text key, IntWritable value) throws IOException, InterruptedException {
                    String fenGe = " ";      // separator between key and value
                    String charSet = "UTF-8";
                    System.out.println("key=" + key.toString());
                    // Write the key
                    title.write(key.toString().getBytes(charSet), 0, key.toString().getBytes(charSet).length);
                    // Write the separator between key and value
                    title.write(fenGe.getBytes(charSet), 0, fenGe.getBytes(charSet).length);
                    // Write the value, followed by a newline
                    title.write(value.toString().getBytes(charSet), 0, value.toString().getBytes(charSet).length);
                    title.write("\n".getBytes(charSet), 0, "\n".getBytes(charSet).length);
                    title.flush();
                }
            };
            return recordWriter;
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "word count");
        job.setJarByClass(WordCount.class);
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReducer.class);
        job.setOutputFormatClass(MyFileOutputFormat.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        MyFileOutputFormat.setOutputPath(job, new Path("/huiqiang/output"));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
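One caveat with the hard-coded /huiqiang/output/test.txt path above: every reduce task opens the same file, so running the job with more than one reducer makes the tasks conflict on that file. Setting job.setNumReduceTasks(1) is the simplest workaround. If several reducers are needed, a minimal sketch of an alternative is to fold the task id into the path inside getRecordWriter; the perTaskOutputPath helper and the test-<taskid>.txt naming pattern are illustrations of this idea, not a Hadoop convention.

    // Hypothetical helper inside MyFileOutputFormat: one output file per reduce task,
    // e.g. /huiqiang/output/test-0.txt, /huiqiang/output/test-1.txt, ...
    private Path perTaskOutputPath(TaskAttemptContext context) {
        int taskId = context.getTaskAttemptID().getTaskID().getId();
        return new Path("/huiqiang/output/test-" + taskId + ".txt");
    }

getRecordWriter() would then call fileSystem.create(perTaskOutputPath(job)) instead of creating the fixed test.txt path.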
Reference: https://blog.csdn.net/smallpizza/article/details/78060638