There are essentially two built-in implementation classes: TextOutputFormat and SequenceFileOutputFormat.
Under the hood, Spark's rdd.saveAsTextFile() also defaults to TextOutputFormat, which has two problems:
1. You cannot specify the output file name. (Not a real problem: logically you only ever need to name the directory, since in a distributed job the output is necessarily split into multiple parts, and naming each part individually is meaningless.)
2. It cannot satisfy more specialized requirements.
Hence the need for a custom OutputFormat.
What an OutputFormat does is turn the in-memory key/value pairs, one by one, into files.
Note that the OutputFormat runs in every partition, so the file names it saves to must be kept distinct, otherwise you get a file-already-exists error. So if you ultimately want a single file, merging the parts after they are generated is still unavoidable.
The write method's parameters are the key and the value, so the data is already in hand, but how do you get the output stream? You have to add the code that creates it yourself; that is the whole point of customizing. Two things are needed here (see the sketch below): (1) open the output stream when the RecordWriter is created, i.e. in getRecordWriter(), and (2) close it in the RecordWriter's close() method.
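To make those two steps concrete, here is a minimal Scala sketch (the class name SketchOutputFormat, the String key/value types, the .txt extension and the UTF-8 charset are illustrative choices, not from the original). Note that getDefaultWorkFile() already embeds the task id in the name, which is what keeps partitions from colliding:

import org.apache.hadoop.fs.{FSDataOutputStream, Path}
import org.apache.hadoop.mapreduce.{RecordWriter, TaskAttemptContext}
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat

class SketchOutputFormat extends FileOutputFormat[String, String] {
  override def getRecordWriter(job: TaskAttemptContext): RecordWriter[String, String] = {
    // Step 1: create the output stream ourselves; getDefaultWorkFile
    // yields a per-task name like part-r-00000, so partitions never clash.
    val file: Path = getDefaultWorkFile(job, ".txt")
    val out: FSDataOutputStream =
      file.getFileSystem(job.getConfiguration).create(file, false)
    new RecordWriter[String, String] {
      override def write(key: String, value: String): Unit =
        out.write((key + "\t" + value + "\n").getBytes("UTF-8"))
      // Step 2: close the stream when the task is done with the writer.
      override def close(context: TaskAttemptContext): Unit = out.close()
    }
  }
}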
The above is only the basic example; real-world use has plenty of pitfalls. Here is a record of an actual case:
df.rdd.saveAsTextFile("",classOf[GzipCodec])
By default this uses TextOutputFormat, which offers no way to set the character encoding. In the TextOutputFormat source, note that the new class lives in org.apache.hadoop.mapreduce.lib.output, while the old one lives in org.apache.hadoop.mapred. When using the old method saveAsHadoopFile, every corresponding parameter must likewise come from the old package, otherwise it will not compile. This is a huge pit!!!
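To illustrate the split, a Scala sketch saving the same pair RDD through both APIs (the local master, sample data and /tmp paths are placeholders):

import org.apache.spark.sql.SparkSession
import org.apache.hadoop.io.{IntWritable, Text}

val spark = SparkSession.builder.appName("save-demo").master("local[*]").getOrCreate()
val pairs = spark.sparkContext
  .parallelize(Seq(("a", 1), ("b", 2)))
  .map { case (k, v) => (new Text(k), new IntWritable(v)) }

// Old API: the format class must come from org.apache.hadoop.mapred.
pairs.saveAsHadoopFile(
  "/tmp/out-old",
  classOf[Text], classOf[IntWritable],
  classOf[org.apache.hadoop.mapred.TextOutputFormat[Text, IntWritable]])

// New API: the format class must come from org.apache.hadoop.mapreduce.lib.output.
pairs.saveAsNewAPIHadoopFile(
  "/tmp/out-new",
  classOf[Text], classOf[IntWritable],
  classOf[org.apache.hadoop.mapreduce.lib.output.TextOutputFormat[Text, IntWritable]])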
This goes for both TextOutputFormat and FileOutputFormat: the old and new classes share the same names and differ only in package. The new FileOutputFormat is an abstract class that already implements the OutputFormat contract's methods (apart from getRecordWriter, which you supply), whereas with the old API, after extending FileOutputFormat you still have to implement the old OutputFormat interface's methods yourself.
In the source, if either side of the pair is null or NullWritable, the separator is not written and only the non-empty side is output: TextOutputFormat's write() checks whether the key or the value is null or of type NullWritable, and in that case writes neither that side nor the separator. So a pair RDD can set its key (or value) to null or NullWritable and thereby drive the pair-RDD save methods with what is effectively a single-value RDD. NullWritable is the recommended choice.
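A sketch of that trick, reusing the spark session from the snippet above (the path is a placeholder): with the key set to NullWritable, the output contains only the values and no separator.

import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat

val lines = spark.sparkContext.parallelize(Seq("line1", "line2"))
lines
  .map(l => (NullWritable.get(), new Text(l)))   // key is NullWritable
  .saveAsNewAPIHadoopFile(
    "/tmp/out-values-only",
    classOf[NullWritable], classOf[Text],
    classOf[TextOutputFormat[NullWritable, Text]])
// Each output line is just "line1" / "line2": no key, no tab.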
In both of the two places involved, where the format class is declared and where it is referenced at the call site, the same API generation must be used consistently.
My first thought was to extend FileOutputFormat directly, but FileOutputFormat turns out to have too many non-abstract methods to deal with; to configure compression alone, for example, there are four parameters to set one by one.
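For reference, these are the standard new-API Hadoop properties behind the compression setters; a sketch only, and whether these are exactly the four parameters meant above is my assumption:

val hconf = spark.sparkContext.hadoopConfiguration
hconf.setBoolean("mapreduce.output.fileoutputformat.compress", true)
hconf.set("mapreduce.output.fileoutputformat.compress.codec",
  classOf[org.apache.hadoop.io.compress.GzipCodec].getName)
// Only consulted for SequenceFile output:
hconf.set("mapreduce.output.fileoutputformat.compress.type", "BLOCK")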
The simplest approach turned out to be copying the code of TextOutputFormat verbatim into a custom CustomOutputFormat class and then patching the two pits: the character encoding that TextOutputFormat hard-codes, and the branch that writes Text values without applying any encoding. With those fixed, the functionality is exactly right.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.ReflectionUtils;

import java.io.DataOutputStream;
import java.io.IOException;
import java.io.UnsupportedEncodingException;

public class CustomOutputFormat<K, V> extends FileOutputFormat<K, V> {

    // Same config key (and spelling) as Hadoop's own TextOutputFormat.
    public static String SEPERATOR = "mapreduce.output.textoutputformat.separator";

    protected static class LineRecordWriter<K, V> extends RecordWriter<K, V> {
        // private static final String utf8 = "UTF-8"; // changed here
        private static final String encod = "GBK";
        private static final byte[] newline;

        static {
            try {
                newline = "\n".getBytes(encod);
            } catch (UnsupportedEncodingException uee) {
                throw new IllegalArgumentException("can't find " + encod + " encoding");
            }
        }

        protected DataOutputStream out;
        private final byte[] keyValueSeparator;

        public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {
            this.out = out;
            try {
                this.keyValueSeparator = keyValueSeparator.getBytes(encod);
            } catch (UnsupportedEncodingException uee) {
                throw new IllegalArgumentException("can't find " + encod + " encoding");
            }
        }

        public LineRecordWriter(DataOutputStream out) {
            this(out, "\t");
        }

        /**
         * Write the object to the byte stream, handling Text as a special
         * case.
         *
         * @param o the object to print
         * @throws IOException if the write throws, we pass it on
         */
        private void writeObject(Object o) throws IOException {
            // Changed here: the original special-cased Text and wrote its raw
            // UTF-8 bytes, which bypassed the configured encoding:
            // if (o instanceof Text) {
            //     Text to = (Text) o;
            //     out.write(to.getBytes(), 0, to.getLength());
            // } else {
            out.write(o.toString().getBytes(encod));
        }

        public synchronized void write(K key, V value) throws IOException {
            // null or NullWritable on either side: skip that side and the separator.
            boolean nullKey = key == null || key instanceof NullWritable;
            boolean nullValue = value == null || value instanceof NullWritable;
            if (nullKey && nullValue) {
                return;
            }
            if (!nullKey) {
                writeObject(key);
            }
            if (!(nullKey || nullValue)) {
                out.write(keyValueSeparator);
            }
            if (!nullValue) {
                writeObject(value);
            }
            out.write(newline);
        }

        public synchronized void close(TaskAttemptContext context) throws IOException {
            out.close();
        }
    }

    public RecordWriter<K, V> getRecordWriter(TaskAttemptContext job)
            throws IOException, InterruptedException {
        Configuration conf = job.getConfiguration();
        boolean isCompressed = getCompressOutput(job);
        String keyValueSeparator = conf.get(SEPERATOR, "\t");
        CompressionCodec codec = null;
        String extension = "";
        if (isCompressed) {
            // Gzip is the fallback codec if none is configured.
            Class<? extends CompressionCodec> codecClass =
                    getOutputCompressorClass(job, GzipCodec.class);
            codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
            extension = codec.getDefaultExtension();
        }
        Path file = getDefaultWorkFile(job, extension);
        FileSystem fs = file.getFileSystem(conf);
        if (!isCompressed) {
            FSDataOutputStream fileOut = fs.create(file, false);
            return new CustomOutputFormat.LineRecordWriter<K, V>(fileOut, keyValueSeparator);
        } else {
            FSDataOutputStream fileOut = fs.create(file, false);
            return new CustomOutputFormat.LineRecordWriter<K, V>(
                    new DataOutputStream(codec.createOutputStream(fileOut)),
                    keyValueSeparator);
        }
    }
}
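Finally, a hedged sketch of wiring the class in from Spark (df, the comma join of the row, and the output path are placeholders):

import org.apache.hadoop.io.{NullWritable, Text}

df.rdd
  .map(row => (NullWritable.get(), new Text(row.mkString(","))))  // values only
  .saveAsNewAPIHadoopFile(
    "/tmp/out-gbk",
    classOf[NullWritable], classOf[Text],
    classOf[CustomOutputFormat[NullWritable, Text]])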