关于hadoop的自定义输入格式:
之前在网上找过很多教程,但是大多数都是在不支持分片的前提下进行自定义输入。本文实现了一种在能够分片的基础上支持自定义输入格式的方法。
先贴代码:
import java.io.IOException; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.fs.FSDataInputStream; import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.fs.Seekable; import org.apache.hadoop.io.IntWritable; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.compress.CodecPool; import org.apache.hadoop.io.compress.CompressionCodec; import org.apache.hadoop.io.compress.CompressionCodecFactory; import org.apache.hadoop.io.compress.Decompressor; import org.apache.hadoop.io.compress.SplitCompressionInputStream; import org.apache.hadoop.io.compress.SplittableCompressionCodec; import org.apache.hadoop.mapreduce.InputSplit; import org.apache.hadoop.mapreduce.JobContext; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; import org.apache.hadoop.util.LineReader; import org.apache.hadoop.mapreduce.lib.input.FileSplit; public class ZInputFormat extends FileInputFormat<IntWritable,IntWritable>{ @Override public RecordReader<IntWritable, IntWritable> createRecordReader( InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { return new ZRecordReader(); } //自定义的数据类型 public static class ZRecordReader extends RecordReader<IntWritable,IntWritable> { //data private LineReader in; //输入流 private boolean more = true;//提示后续还有没有数据 private IntWritable key = null; private IntWritable value = null; //这三个保存当前读取到位置(即文件中的位置) private long start; private long end; private long pos; private Log LOG = LogFactory.getLog(ZRecordReader.class);//日志写入系统,可加可不加 @Override public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { // 初始化函数 FileSplit inputsplit = (FileSplit)split; start = inputsplit.getStart(); //得到此分片开始位置 end = start + 
inputsplit.getLength();//结束此分片位置 final Path file = inputsplit.getPath(); // 打开文件 FileSystem fs = file.getFileSystem(context.getConfiguration()); FSDataInputStream fileIn = fs.open(inputsplit.getPath()); //关键位置2 //将文件指针移动到当前分片,因为每次默认打开文件时,其指针指向开头 fileIn.seek(start); in = new LineReader(fileIn, context.getConfiguration()); if (start != 0) { System.out.println("4"); //关键解决位置1 //如果这不是第一个分片,那么假设第一个分片是0——4,那么,第4个位置已经被读取,则需要跳过4,否则会产生读入错误,因为你回头又去读之前读过的地方 start += in.readLine(new Text(), 0, maxBytesToConsume(start)); } pos = start; } private int maxBytesToConsume(long pos) { return (int) Math.min(Integer.MAX_VALUE, end - pos); } @Override public boolean nextKeyValue() throws IOException, InterruptedException { //下一组值 //tips:以后在这种函数中最好不要有输出,费时 //LOG.info("正在读取下一个,嘿嘿"); if(null == key) { key = new IntWritable(); } if(null == value) { value = new IntWritable(); } Text nowline = new Text();//保存当前行的内容 int readsize = in.readLine(nowline); //更新当前读取到位置 pos += readsize; //关键位置3 //如果pos的值大于等于end,说明此分片已经读取完毕 if(pos >= end) { more = false; return false; } if(0 == readsize) { key = null; value = null; more = false;//说明此时已经读取到文件末尾,则more为false return false; } String[] keyandvalue = nowline.toString().split(","); //排除第一行 if(keyandvalue[0].endsWith("\"CITING\"")) { readsize = in.readLine(nowline); //更新当前读取到位置 pos += readsize; if(0 == readsize) { more = false;//说明此时已经读取到文件末尾,则more为false return false; } //重新划分 keyandvalue = nowline.toString().split(","); } //得到key和value //LOG.info("key is :" + key +"value is" + value); key.set(Integer.parseInt(keyandvalue[0])); value.set(Integer.parseInt(keyandvalue[1])); return true; } @Override public IntWritable getCurrentKey() throws IOException, InterruptedException { //得到当前的Key return key; } @Override public IntWritable getCurrentValue() throws IOException, InterruptedException { //得到当前的value return value; } @Override public float getProgress() throws IOException, InterruptedException { //计算对于当前片的处理进度 if( false == more || end == start) { return 0f; } 
else { return Math.min(1.0f, (pos - start)/(end - start)); } } @Override public void close() throws IOException { //关闭此输入流 if(null != in) { in.close(); } } } }
说明:代码注释中标为“关键位置x(1,2,3)”的三处,是在支持分片的模式下实现自定义输入的关键,这也是本文和网上大多数流传的教程的不同之处。
PS:写出这段代码用了2天,在网上找了不少资料,最后发现还是系统自带的输入格式文件最管用。其实对比一下,这个和KeyValueTextInputFormat类的实现大同小异,只不过去除了压缩判定部分。
PPS:感谢那些写教程的人们~嘿嘿