/**
 * Note from the original author: Text has a default maximum size of
 * 1024*1024 bytes, i.e. 1 MB.
 */
public class SmallFileRecordReader extends RecordReader<NullWritable, Text>{
// Logger for this reader.
private static final Log LOG = LogFactory.getLog(SmallFileRecordReader.class);
// Split handed to initialize(); provides the path, start offset and length.
private FileSplit fileSplit;
// Task context handed to initialize(); used to obtain the job Configuration.
private JobContext jobContext;
// Every record uses the same NullWritable key.
private NullWritable currentKey = NullWritable.get();
// Value returned by getCurrentValue(); allocated lazily in nextKeyValue().
private Text currentValue;
// Set to true once nextKeyValue() has consumed the split; also drives getProgress().
private boolean finishConverting =false;
// Last path component of the file being read; stored under "map.input.file".
private String readFileName;
// Offset at which reading starts (moved past the first line when the split does not start at 0).
private long start;
// Running offset, advanced by the byte count each readLine() call reports.
private long pos;
// Upper limit of the split (start + length, or the codec-adjusted end).
private long end;
// Line-oriented reader over the (possibly decompressed) file stream.
private LineReader in;
// Stream whose getPos() is consulted by getFilePosition() for compressed input.
private Seekable filePosition;
// Read from "mapreduce.input.linerecordreader.line.maxlength"; lines at least this long are skipped.
private int maxLineLength;
// True when a compression codec was found for the file.
private boolean isCompressedInput;
// Decompressor borrowed from CodecPool in initialize() and returned in close().
private Decompressor decompressor;
// Codec resolved for the file, or null when no codec matches.
private CompressionCodec codec;
// Factory used to look up the codec for the file path.
private CompressionCodecFactory compressionCodecs = null;
/**
 * Opens the file behind {@code split} and positions a {@link LineReader} at the
 * first complete line belonging to the split.
 *
 * <p>The file name (last path component) is stored in the task configuration
 * under {@code "map.input.file"}.
 *
 * <p>This method may be invoked more than once on the same instance (for
 * example by an InputFormat and then again by the framework). Resources
 * acquired by an earlier call are released first so that the open stream and
 * the pooled decompressor are not leaked.
 *
 * @param split   the split to read; must be a {@link FileSplit}
 * @param context the task attempt context supplying the configuration
 * @throws IOException if the file cannot be opened or read
 * @throws InterruptedException declared by the overridden method
 */
@Override
public void initialize(InputSplit split, TaskAttemptContext context)
        throws IOException, InterruptedException {
    // Release whatever a previous initialize() call acquired and reset read state.
    close();
    in = null;
    filePosition = null;
    decompressor = null;
    isCompressedInput = false;
    finishConverting = false;
    currentValue = null;

    this.fileSplit = (FileSplit) split;
    this.jobContext = context;
    Configuration job = this.jobContext.getConfiguration();
    this.maxLineLength = job.getInt("mapreduce.input.linerecordreader.line.maxlength", Integer.MAX_VALUE);
    start = this.fileSplit.getStart();
    end = start + this.fileSplit.getLength();
    final Path file = this.fileSplit.getPath();
    readFileName = file.getName();
    context.getConfiguration().set("map.input.file", readFileName);
    compressionCodecs = new CompressionCodecFactory(job);
    // Resolve the codec (if any) for the file from the job configuration and
    // the split's path. Codecs mentioned by the original author:
    // DEFAULT, GZIP, BZIP2, LZO, LZ4, SNAPPY.
    codec = compressionCodecs.getCodec(file);
    final FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(file);
    if (null != codec) {
        isCompressedInput = true;
        decompressor = CodecPool.getDecompressor(codec);
        if (codec instanceof SplittableCompressionCodec) {
            final SplitCompressionInputStream cIn =
                    ((SplittableCompressionCodec) codec).createInputStream(fileIn,
                            decompressor, start, end, SplittableCompressionCodec.READ_MODE.BYBLOCK);
            in = new LineReader(cIn, job);
            // The codec may move the split boundaries; use its adjusted values.
            start = cIn.getAdjustedStart();
            end = cIn.getAdjustedEnd();
            filePosition = cIn;
        } else {
            in = new LineReader(codec.createInputStream(fileIn, decompressor), job);
            filePosition = fileIn;
        }
    } else {
        fileIn.seek(start);
        in = new LineReader(fileIn, job);
        filePosition = fileIn;
    }
    // If this is not the first split, we always throw away first record
    // because we always (except the last split) read one extra line in
    // next() method.
    if (start != 0) {
        start += in.readLine(new Text(), 0, maxBytesToConsume(start));
    }
    this.pos = start;
}
/**
 * Returns the key of the current record.
 *
 * @return the {@link NullWritable} instance held in {@code currentKey}
 */
@Override
public NullWritable getCurrentKey() throws IOException, InterruptedException {
    return this.currentKey;
}
/**
 * Returns the value of the current record.
 *
 * @return the {@code currentValue} field; it has no initializer and is
 *         allocated by {@code nextKeyValue()}
 */
@Override
public Text getCurrentValue() throws IOException, InterruptedException {
    return this.currentValue;
}
/**
 * Computes the byte budget passed to {@code LineReader.readLine}.
 *
 * @param pos the offset reading would start from
 * @return {@code Integer.MAX_VALUE} for compressed input; otherwise the
 *         larger of {@code maxLineLength} and the bytes left before
 *         {@code end} (capped at {@code Integer.MAX_VALUE})
 */
private int maxBytesToConsume(long pos) {
    if (isCompressedInput) {
        return Integer.MAX_VALUE;
    }
    final long remainingInSplit = Math.min(Integer.MAX_VALUE, end - pos);
    return (int) Math.max(remainingInSplit, maxLineLength);
}
/**
 * Reports the position used to decide whether the split is exhausted.
 *
 * @return the underlying stream's position when the input is compressed and
 *         a position source is available; otherwise the tracked {@code pos}
 * @throws IOException if the stream position cannot be obtained
 */
private long getFilePosition() throws IOException {
    final boolean useStreamPosition = isCompressedInput && filePosition != null;
    return useStreamPosition ? filePosition.getPos() : pos;
}
/** Scratch buffer that receives one line at a time from the line reader. */
private Text value;

/**
 * Reads every line of the split and exposes them as one single record.
 *
 * <p>The first call consumes the whole split: each accepted line is appended
 * to the current value, with a single {@code '\n'} inserted between
 * consecutive lines (the line reader strips the original terminators).
 * Lines whose consumed size is not below {@code maxLineLength} are skipped
 * and logged. The call returns {@code true} even when no line was read, in
 * which case the value is empty. Every later call returns {@code false}.
 *
 * @return {@code true} on the first call, {@code false} afterwards
 * @throws IOException if reading from the underlying stream fails
 * @throws InterruptedException declared by the overridden method
 */
@Override
public boolean nextKeyValue() throws IOException, InterruptedException {
    if (finishConverting) {
        return false;
    }
    if (currentValue == null) {
        currentValue = new Text();
    }
    if (value == null) {
        value = new Text();
    }
    // Line terminators are stripped by the reader, so put one back between lines.
    final byte[] lineSeparator = {'\n'};
    boolean firstLine = true;
    while (readNextLine()) {
        if (!firstLine) {
            currentValue.append(lineSeparator, 0, lineSeparator.length);
        }
        // getBytes() exposes the backing array; only getLength() bytes are valid.
        currentValue.append(value.getBytes(), 0, value.getLength());
        firstLine = false;
    }
    // The scratch buffer is no longer needed once the split is consumed.
    value = null;
    finishConverting = true;
    LOG.info("Wlz finished read file: " + readFileName);
    return true;
}

/**
 * Reads the next acceptable line of the split into the scratch buffer.
 *
 * @return {@code true} if a line was stored in {@code value}; {@code false}
 *         when the split limit has been passed or the stream is exhausted
 * @throws IOException if reading from the underlying stream fails
 */
private boolean readNextLine() throws IOException {
    // We always read one extra line, which lies outside the upper
    // split limit i.e. (end - 1)
    while (getFilePosition() <= end) {
        final int newSize = in.readLine(value, maxLineLength, maxBytesToConsume(pos));
        pos += newSize;
        if (newSize == 0) {
            // Nothing consumed: end of stream.
            return false;
        }
        if (newSize < maxLineLength) {
            return true;
        }
        // line too long. try again
        LOG.info("Skipped line of size " + newSize + " at pos " +
                (pos - newSize));
    }
    return false;
}
/**
 * Reports coarse progress for this reader.
 *
 * @return {@code 1} once the split has been consumed, {@code 0} before that
 */
@Override
public float getProgress() throws IOException, InterruptedException {
    return finishConverting ? 1.0f : 0.0f;
}
/**
 * Closes the line reader and hands the decompressor back to the codec pool.
 *
 * <p>The decompressor reference is cleared after it has been returned so
 * that calling {@code close()} again does not return the same instance to
 * the pool a second time.
 *
 * @throws IOException if closing the underlying reader fails
 */
@Override
public synchronized void close() throws IOException {
    try {
        if (in != null) {
            in.close();
        }
    } finally {
        // Return the decompressor even when closing the reader failed.
        if (decompressor != null) {
            CodecPool.returnDecompressor(decompressor);
            decompressor = null;
        }
    }
}
/**
 * Input format whose records are produced by {@link SmallFileRecordReader}:
 * keys are {@link NullWritable} and values are {@link Text}.
 */
public class SmallFileInputFormat extends FileInputFormat<NullWritable, Text> {
    /**
     * Creates the record reader for a split.
     *
     * <p>The reader is returned uninitialized on purpose: the MapReduce
     * framework calls {@code initialize(split, context)} on it before the
     * split is used. Initializing it here as well would open the file and
     * borrow a decompressor twice.
     *
     * @param split   the split the reader will be used for
     * @param context the task attempt context
     * @return a new, not yet initialized {@link SmallFileRecordReader}
     */
    @Override
    public RecordReader<NullWritable, Text> createRecordReader(
            InputSplit split, TaskAttemptContext context) throws IOException,
            InterruptedException {
        return new SmallFileRecordReader();
    }
}
/**
 * Mapper that re-keys each incoming record by the name of the input file.
 */
public class SmallfilesMapper extends Mapper<NullWritable, Text, Text, Text> {
    private static final Log LOG = LogFactory.getLog(SmallfilesMapper.class);

    /** Reusable output key holding the input file name. */
    private Text file = new Text();

    /**
     * Emits {@code value} unchanged under the input file name.
     *
     * @param key     ignored
     * @param value   the record value, passed through as-is
     * @param context supplies the configuration and receives the output
     * @throws IOException if writing the output fails
     * @throws InterruptedException if writing the output is interrupted
     */
    @Override
    protected void map(NullWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        // The input file name is looked up in the configuration and used as the output key.
        file.set(context.getConfiguration().get("map.input.file"));
        context.write(file, value);
    }
}