代码测试环境:Hadoop2.4
应用场景:当需要定制输出数据格式时可以采用此技巧,包括定制输出数据的展现形式,输出路径,输出文件名称等。
Hadoop内置的输出文件格式有:
1)FileOutputFormat
2)TextOutputFormat
3)SequenceFileOutputFormat
4)MultipleOutputs
5) NullOutputFormat
6)LazyOutputFormat
步骤:
类似输入数据格式,自定义输出数据格式同样可以参考下面的步骤
1) 定义一个继承自OutputFormat的类,不过一般继承FileOutputFormat即可;
2)实现其getRecordWriter方法,返回一个RecordWriter类型;
3)自定义一个继承RecordWriter的类,定义其write方法,针对每个
实例1(修改文件默认的输出文件名以及默认的key和value的分隔符):
输入数据:
自定义CustomFileOutputFormat(把默认文件名前缀替换掉):
package fz.outputformat;
import java.io.IOException;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class CustomOutputFormat extends FileOutputFormat {
private String prefix = "custom_";
@Override
public RecordWriter getRecordWriter(TaskAttemptContext job)
throws IOException, InterruptedException {
// 新建一个可写入的文件
Path outputDir = FileOutputFormat.getOutputPath(job);
// System.out.println("outputDir.getName():"+outputDir.getName()+",otuputDir.toString():"+outputDir.toString());
String subfix = job.getTaskAttemptID().getTaskID().toString();
Path path = new Path(outputDir.toString()+"/"+prefix+subfix.substring(subfix.length()-5, subfix.length()));
FSDataOutputStream fileOut = path.getFileSystem(job.getConfiguration()).create(path);
return new CustomRecordWriter(fileOut);
}
}
自定义CustomWriter(指定key,value分隔符):
package fz.outputformat;
import java.io.IOException;
import java.io.PrintWriter;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
public class CustomRecordWriter extends RecordWriter {
private PrintWriter out;
private String separator =",";
public CustomRecordWriter(FSDataOutputStream fileOut) {
out = new PrintWriter(fileOut);
}
@Override
public void write(LongWritable key, Text value) throws IOException,
InterruptedException {
out.println(key.get()+separator+value.toString());
}
@Override
public void close(TaskAttemptContext context) throws IOException,
InterruptedException {
out.close();
}
}
package fz.outputformat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class FileOutputFormatDriver extends Configured implements Tool{
/**
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
// TODO Auto-generated method stub
ToolRunner.run(new Configuration(), new FileOutputFormatDriver(),args);
}
@Override
public int run(String[] arg0) throws Exception {
if(arg0.length!=3){
System.err.println("Usage:\nfz.outputformat.FileOutputFormatDriver ");
return -1;
}
Configuration conf = getConf();
Path in = new Path(arg0[0]);
Path out= new Path(arg0[1]);
boolean delete=out.getFileSystem(conf).delete(out, true);
System.out.println("deleted "+out+"?"+delete);
Job job = Job.getInstance(conf,"fileouttputformat test job");
job.setJarByClass(getClass());
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(CustomOutputFormat.class);
job.setMapperClass(Mapper.class);
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(LongWritable.class);
job.setOutputValueClass(Text.class);
job.setNumReduceTasks(Integer.parseInt(arg0[2]));
job.setReducerClass(Reducer.class);
FileInputFormat.setInputPaths(job, in);
FileOutputFormat.setOutputPath(job, out);
return job.waitForCompletion(true)?0:-1;
}
}
从输出结果可以看到输出格式以及文件名确实按照预想输出了。
实例2(根据key和value值输出数据到不同目录):
自定义主类(主类其实就是修改了输出的方式而已):
package fz.multipleoutputformat;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class FileOutputFormatDriver extends Configured implements Tool{
/**
* @param args
* @throws Exception
*/
public static void main(String[] args) throws Exception {
// TODO Auto-generated method stub
ToolRunner.run(new Configuration(), new FileOutputFormatDriver(),args);
}
@Override
public int run(String[] arg0) throws Exception {
if(arg0.length!=3){
System.err.println("Usage:\nfz.multipleoutputformat.FileOutputFormatDriver ");
return -1;
}
Configuration conf = getConf();
Path in = new Path(arg0[0]);
Path out= new Path(arg0[1]);
boolean delete=out.getFileSystem(conf).delete(out, true);
System.out.println("deleted "+out+"?"+delete);
Job job = Job.getInstance(conf,"fileouttputformat test job");
job.setJarByClass(getClass());
job.setInputFormatClass(TextInputFormat.class);
// job.setOutputFormatClass(CustomOutputFormat.class);
MultipleOutputs.addNamedOutput(job, "ignore", TextOutputFormat.class,
LongWritable.class, Text.class);
MultipleOutputs.addNamedOutput(job, "other", TextOutputFormat.class,
LongWritable.class, Text.class);
job.setMapperClass(Mapper.class);
job.setMapOutputKeyClass(LongWritable.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(LongWritable.class);
job.setOutputValueClass(Text.class);
job.setNumReduceTasks(Integer.parseInt(arg0[2]));
job.setReducerClass(MultipleReducer.class);
FileInputFormat.setInputPaths(job, in);
FileOutputFormat.setOutputPath(job, out);
return job.waitForCompletion(true)?0:-1;
}
}
自定义reducer(因为要根据key和value的值输出数据到不同目录,所以需要自定义逻辑)
package fz.multipleoutputformat;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
public class MultipleReducer extends
Reducer {
private MultipleOutputs out;
@Override
public void setup(Context cxt){
out = new MultipleOutputs(cxt);
}
@Override
public void reduce(LongWritable key ,Iterable value,Context cxt)throws IOException,InterruptedException{
for(Text v:value){
if(v.toString().startsWith("ignore")){
// System.out.println("ignore--------------------value:"+v);
out.write("ignore", key, v, "ign");
}else{
// System.out.println("other---------------------value:"+v);
out.write("other", key, v, "oth");
}
}
}
@Override
public void cleanup(Context cxt)throws IOException,InterruptedException{
out.close();
}
}
可以看到输出的数据确实根据value的不同值被写入了不同的文件目录中,但是这里同样可以看到有默认的文件生成,同时注意到这个文件的大小是0,这个暂时还没解决。
总结:自定义输出格式,可以定制一些特殊需求,不过一般使用Hadoop内置的输出格式即可,这点来说其应用意义不是很大。不过使用Hadoop内置的MultipleOutputs可以根据数据的不同特性输出到不同的目录,还是很有实际意义的。
分享,成长,快乐
转载请注明blog地址:http://blog.csdn.net/fansy1990