A simple word-count job is implemented with three small classes: WordCount, MyMapper and MyReducer.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCount {
public static void main(String[] args) throws Exception {
// submit the job as the "root" Hadoop user
System.setProperty("HADOOP_USER_NAME", "root");
Configuration conf = new Configuration(true);
Job job = Job.getInstance(conf);
job.setJarByClass(WordCount.class);
job.setJobName("myjob");
//declare the mapper's output key and value types
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setMapperClass(MyMapper.class);
job.setReducerClass(MyReducer.class);
Path input = new Path("/temp/wc/input");
FileInputFormat.addInputPath(job, input);
Path output = new Path("/temp/wc/output");
if (output.getFileSystem(conf).exists(output)) {
output.getFileSystem(conf).delete(output, true);
}
FileOutputFormat.setOutputPath(job, output);
job.waitForCompletion(true);
}
}
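For completeness, a driver usually also declares the final (reducer) output types; the two lines below are not part of the original listing, but they would sit next to the setMapOutput* calls above:
// final (reducer) output types; for this job they happen to match the map output types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);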
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class MyMapper extends Mapper<Object, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
/**
 * @param key the byte offset of the current line within the split
 * @param value the content of the current line
 */
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one);
}
}
}
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class MyReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values, Context context)
throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
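Because summing counts is associative and commutative, the same reducer class can also be registered as a combiner so that partial sums are computed on the map side. The line below is an optional addition to the driver, not part of the original code; it ties in with the CombinerRunner.create(...) call we will meet later in MapOutputBuffer.init():
// optional: run MyReducer as a map-side combiner to shrink the shuffled data
job.setCombinerClass(MyReducer.class);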
The MapTask class has a run() method, which ends up calling runNewMapper():
@SuppressWarnings("unchecked")
private <INKEY,INVALUE,OUTKEY,OUTVALUE>
void runNewMapper(final JobConf job,
final TaskSplitIndex splitIndex,
final TaskUmbilicalProtocol umbilical,
TaskReporter reporter
) throws IOException, ClassNotFoundException,
InterruptedException {
// make a task context so we can get the classes
org.apache.hadoop.mapreduce.TaskAttemptContext taskContext =
new org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl(job,
getTaskID(),
reporter);
/* make a mapper: the Mapper instance is obtained via reflection. If the user configured a Mapper class it is used, otherwise the default Mapper is used. Step into taskContext.getMapperClass() to see how it is read from the configuration. */
org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE> mapper =
(org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>)
ReflectionUtils.newInstance(taskContext.getMapperClass(), job);
/* make the input format: the InputFormat is obtained via reflection as well, the user-configured class if present, otherwise the default. Step into taskContext.getInputFormatClass() to see how the InputFormat is read from the configuration. */
org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE> inputFormat =
(org.apache.hadoop.mapreduce.InputFormat<INKEY,INVALUE>)
ReflectionUtils.newInstance(taskContext.getInputFormatClass(), job);
/* rebuild the input split: fetch the split information, which includes the file, the start offset, the length, the hosts, and so on */
org.apache.hadoop.mapreduce.InputSplit split = null;
split = getSplitDetails(new Path(splitIndex.getSplitLocation()),
splitIndex.getStartOffset());
LOG.info("Processing split: " + split);
/* With the split, inputFormat, reporter and taskContext objects, build the input. The RecordReader obtained here is a NewTrackingRecordReader wrapping a LineRecordReader. */
org.apache.hadoop.mapreduce.RecordReader<INKEY,INVALUE> input =
new NewTrackingRecordReader<INKEY,INVALUE>
(split, inputFormat, reporter, taskContext);
job.setBoolean(JobContext.SKIP_RECORDS, isSkipping());
org.apache.hadoop.mapreduce.RecordWriter output = null;
// get an output object: obtain the output record writer
if (job.getNumReduceTasks() == 0) {
output =
new NewDirectOutputCollector(taskContext, job, umbilical, reporter);
} else {
output = new NewOutputCollector(taskContext, job, umbilical, reporter);
}
org.apache.hadoop.mapreduce.MapContext<INKEY, INVALUE, OUTKEY, OUTVALUE>
mapContext =
new MapContextImpl<INKEY, INVALUE, OUTKEY, OUTVALUE>(job, getTaskID(),
input, output,
committer,
reporter, split);
org.apache.hadoop.mapreduce.Mapper<INKEY,INVALUE,OUTKEY,OUTVALUE>.Context
mapperContext =
new WrappedMapper<INKEY, INVALUE, OUTKEY, OUTVALUE>().getMapContext(
mapContext);
try {
//initialize the input
input.initialize(split, mapperContext);
mapper.run(mapperContext);
mapPhase.complete();
setPhase(TaskStatus.Phase.SORT);
statusUpdate(umbilical);
input.close();
input = null;
output.close(mapperContext);
output = null;
} finally {
closeQuietly(input);
closeQuietly(output, mapperContext);
}
}
From the source above we know that input wraps a LineRecordReader.
input.initialize(split, mapperContext) performs the input initialization, and the implementing class is LineRecordReader. Let's take a look inside this method:
public void initialize(InputSplit genericSplit,
TaskAttemptContext context) throws IOException {
FileSplit split = (FileSplit) genericSplit;
Configuration job = context.getConfiguration();
this.maxLineLength = job.getInt(MAX_LINE_LENGTH, Integer.MAX_VALUE);
//the next three lines fetch the split's start offset, end position and file information
start = split.getStart();
end = start + split.getLength();
final Path file = split.getPath();
// open the file and seek to the start of the split
final FileSystem fs = file.getFileSystem(job);
//open an input stream on the split's file
fileIn = fs.open(file);
CompressionCodec codec = new CompressionCodecFactory(job).getCodec(file);
if (null!=codec) {
isCompressedInput = true;
decompressor = CodecPool.getDecompressor(codec);
if (codec instanceof SplittableCompressionCodec) {
final SplitCompressionInputStream cIn =
((SplittableCompressionCodec)codec).createInputStream(
fileIn, decompressor, start, end,
SplittableCompressionCodec.READ_MODE.BYBLOCK);
in = new CompressedSplitLineReader(cIn, job,
this.recordDelimiterBytes);
start = cIn.getAdjustedStart();
end = cIn.getAdjustedEnd();
filePosition = cIn;
} else {
in = new SplitLineReader(codec.createInputStream(fileIn,
decompressor), job, this.recordDelimiterBytes);
filePosition = fileIn;
}
} else {
//the input is not compressed: seek the raw stream to the split's start offset
fileIn.seek(start);
//in here is an UncompressedSplitLineReader built on top of fileIn
in = new UncompressedSplitLineReader(
fileIn, job, this.recordDelimiterBytes, split.getLength());
filePosition = fileIn;
}
/* If this is not the first split, we always throw away the first record, because we always (except for the last split) read one extra line in the next() method. In plain terms: if the start offset is not 0, i.e. this is not the first split, the reader discards the first (possibly partial) line and starts from the next one, because a single line of data may have been cut across two blocks when the file was split into blocks. */
if (start != 0) {
start += in.readLine(new Text(), 0, maxBytesToConsume(start));
}
this.pos = start;
}
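To make the "throw away the first record" rule concrete, here is a tiny standalone sketch (plain Java, not Hadoop code; the data and the offsets are made up) that mimics what readLine does for a split whose start offset falls in the middle of a line:
import java.nio.charset.StandardCharsets;
public class SplitBoundaryDemo {
  public static void main(String[] args) {
    byte[] data = "alpha beta\ngamma delta\nepsilon\n".getBytes(StandardCharsets.UTF_8);
    int splitStart = 15; // hypothetical second split, starting inside "gamma delta"
    int pos = splitStart;
    if (splitStart != 0) {
      // mimic in.readLine(new Text(), 0, ...): discard everything up to and including
      // the next '\n'; the previous split's reader already consumed that line by
      // reading one extra line past its own end
      while (pos < data.length && data[pos++] != '\n') { }
    }
    // prints 23, the offset where "epsilon" (the first full line of this split) begins
    System.out.println("second split starts reading at offset " + pos);
  }
}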
Back in MapTask.runNewMapper(), we have:
//initialize the input
input.initialize(split, mapperContext);
//once the input is initialized, the mapper phase starts
mapper.run(mapperContext);
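For orientation, mapper.run(mapperContext) is the per-record loop defined in the Mapper base class; stripped of comments it is essentially the following (minor details differ between Hadoop versions):
public void run(Context context) throws IOException, InterruptedException {
  setup(context);
  try {
    while (context.nextKeyValue()) {
      // nextKeyValue() is ultimately answered by the LineRecordReader initialized above
      map(context.getCurrentKey(), context.getCurrentValue(), context);
    }
  } finally {
    cleanup(context);
  }
}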
Back in MapTask.runNewMapper(), look at how the map output object is created:
// get an output object: obtain the output record writer
//if the number of reduce tasks is 0, a NewDirectOutputCollector is created
if (job.getNumReduceTasks() == 0) {
output =
new NewDirectOutputCollector(taskContext, job, umbilical, reporter);
} else {
output = new NewOutputCollector(taskContext, job, umbilical, reporter);
}
Here we analyze the case where the number of reduce tasks is greater than 0. Step into NewOutputCollector():
private class NewOutputCollector<K,V>
extends org.apache.hadoop.mapreduce.RecordWriter<K,V> {
private final MapOutputCollector<K,V> collector;
private final org.apache.hadoop.mapreduce.Partitioner<K,V> partitioner;
private final int partitions;
@SuppressWarnings("unchecked")
NewOutputCollector(org.apache.hadoop.mapreduce.JobContext jobContext,
JobConf job,
TaskUmbilicalProtocol umbilical,
TaskReporter reporter
) throws IOException, ClassNotFoundException {
collector = createSortingCollector(job, reporter);
//there are as many partitions as there are reduce tasks
partitions = jobContext.getNumReduceTasks();
if (partitions > 1) {
//reflection again: the object is either the user-configured Partitioner or the default one
partitioner = (org.apache.hadoop.mapreduce.Partitioner<K,V>)
ReflectionUtils.newInstance(jobContext.getPartitionerClass(), job);
} else {
partitioner = new org.apache.hadoop.mapreduce.Partitioner<K,V>() {
@Override
public int getPartition(K key, V value, int numPartitions) {
return partitions - 1;
}
};
}
}
@Override
public void write(K key, V value) throws IOException, InterruptedException {
collector.collect(key, value,
partitioner.getPartition(key, value, partitions));
}
@Override
public void close(TaskAttemptContext context
) throws IOException,InterruptedException {
try {
collector.flush();
} catch (ClassNotFoundException cnf) {
throw new IOException("can't find class ", cnf);
}
collector.close();
}
}
The NewOutputCollector constructor creates the collector:
collector = createSortingCollector(job, reporter);
Let's step in and see how createSortingCollector(job, reporter) is implemented:
@SuppressWarnings("unchecked")
private <KEY, VALUE> MapOutputCollector<KEY, VALUE>
createSortingCollector(JobConf job, TaskReporter reporter)
throws IOException, ClassNotFoundException {
MapOutputCollector.Context context =
new MapOutputCollector.Context(this, job, reporter);
/* Reflection again: if the user configured MAP_OUTPUT_COLLECTOR_CLASS_ATTR, collectorClasses takes that class; otherwise it falls back to MapOutputBuffer.class. MapOutputBuffer is a fairly involved class that is rarely replaced by a custom implementation, so in practice the default MapOutputBuffer.class is what gets used. */
Class<?>[] collectorClasses = job.getClasses(
JobContext.MAP_OUTPUT_COLLECTOR_CLASS_ATTR, MapOutputBuffer.class);
int remainingCollectors = collectorClasses.length;
for (Class clazz : collectorClasses) {
try {
if (!MapOutputCollector.class.isAssignableFrom(clazz)) {
throw new IOException("Invalid output collector class: " + clazz.getName() +
" (does not implement MapOutputCollector)");
}
Class<? extends MapOutputCollector> subclazz =
clazz.asSubclass(MapOutputCollector.class);
LOG.debug("Trying map output collector class: " + subclazz.getName());
MapOutputCollector<KEY, VALUE> collector =
ReflectionUtils.newInstance(subclazz, job);
/* After the collector is obtained it is initialized; normally the collector is the default MapOutputBuffer. */
collector.init(context);
LOG.info("Map output collector class = " + collector.getClass().getName());
return collector;
} catch (Exception e) {
String msg = "Unable to initialize MapOutputCollector " + clazz.getName();
if (--remainingCollectors > 0) {
msg += " (" + remainingCollectors + " more collector(s) to try)";
}
LOG.warn(msg, e);
}
}
throw new IOException("Unable to initialize any output collector");
}
As analyzed above, the collector is normally a MapOutputBuffer. Let's see what collector.init(context) does during initialization:
@SuppressWarnings("unchecked")
public void init(MapOutputCollector.Context context
) throws IOException, ClassNotFoundException {
job = context.getJobConf();
reporter = context.getReporter();
mapTask = context.getMapTask();
mapOutputFile = mapTask.getMapOutputFile();
sortPhase = mapTask.getSortPhase();
spilledRecordsCounter = reporter.getCounter(TaskCounter.SPILLED_RECORDS);
partitions = job.getNumReduceTasks();
rfs = ((LocalFileSystem)FileSystem.getLocal(job)).getRaw();
//sanity checks
final float spillper =
job.getFloat(JobContext.MAP_SORT_SPILL_PERCENT, (float)0.8);
final int sortmb = job.getInt(JobContext.IO_SORT_MB, 100);
indexCacheMemoryLimit = job.getInt(JobContext.INDEX_CACHE_MEMORY_LIMIT,
INDEX_CACHE_MEMORY_LIMIT_DEFAULT);
if (spillper > (float)1.0 || spillper <= (float)0.0) {
throw new IOException("Invalid \"" + JobContext.MAP_SORT_SPILL_PERCENT +
"\": " + spillper);
}
if ((sortmb & 0x7FF) != sortmb) {
throw new IOException(
"Invalid \"" + JobContext.IO_SORT_MB + "\": " + sortmb);
}
sorter = ReflectionUtils.newInstance(job.getClass("map.sort.class",
QuickSort.class, IndexedSorter.class), job);
// buffers and accounting
int maxMemUsage = sortmb << 20;
maxMemUsage -= maxMemUsage % METASIZE;
kvbuffer = new byte[maxMemUsage];
bufvoid = kvbuffer.length;
kvmeta = ByteBuffer.wrap(kvbuffer)
.order(ByteOrder.nativeOrder())
.asIntBuffer();
setEquator(0);
bufstart = bufend = bufindex = equator;
kvstart = kvend = kvindex;
maxRec = kvmeta.capacity() / NMETA;
softLimit = (int)(kvbuffer.length * spillper);
bufferRemaining = softLimit;
LOG.info(JobContext.IO_SORT_MB + ": " + sortmb);
LOG.info("soft limit at " + softLimit);
LOG.info("bufstart = " + bufstart + "; bufvoid = " + bufvoid);
LOG.info("kvstart = " + kvstart + "; length = " + maxRec);
// k/v serialization
/* Obtain the key comparator: the user-defined comparator if one was configured, otherwise the default comparator for the map output key class. The code behind job.getOutputKeyComparator() is simple enough that we won't step into it here. */
comparator = job.getOutputKeyComparator();
keyClass = (Class)job.getMapOutputKeyClass();
valClass = (Class)job.getMapOutputValueClass();
serializationFactory = new SerializationFactory(job);
keySerializer = serializationFactory.getSerializer(keyClass);
keySerializer.open(bb);
valSerializer = serializationFactory.getSerializer(valClass);
valSerializer.open(bb);
// output counters
mapOutputByteCounter = reporter.getCounter(TaskCounter.MAP_OUTPUT_BYTES);
mapOutputRecordCounter =
reporter.getCounter(TaskCounter.MAP_OUTPUT_RECORDS);
fileOutputByteCounter = reporter
.getCounter(TaskCounter.MAP_OUTPUT_MATERIALIZED_BYTES);
// compression
if (job.getCompressMapOutput()) {
Class<? extends CompressionCodec> codecClass =
job.getMapOutputCompressorClass(DefaultCodec.class);
codec = ReflectionUtils.newInstance(codecClass, job);
} else {
codec = null;
}
// combiner
final Counters.Counter combineInputCounter =
reporter.getCounter(TaskCounter.COMBINE_INPUT_RECORDS);
combinerRunner = CombinerRunner.create(job, getTaskID(),
combineInputCounter,
reporter, null);
if (combinerRunner != null) {
final Counters.Counter combineOutputCounter =
reporter.getCounter(TaskCounter.COMBINE_OUTPUT_RECORDS);
combineCollector= new CombineOutputCollector(combineOutputCounter, reporter, job);
} else {
combineCollector = null;
}
spillInProgress = false;
minSpillsForCombine = job.getInt(JobContext.MAP_COMBINE_MIN_SPILLS, 3);
spillThread.setDaemon(true);
spillThread.setName("SpillThread");
spillLock.lock();
try {
//start the spill thread that writes buffer contents out to disk
spillThread.start();
while (!spillThreadRunning) {
spillDone.await();
}
} catch (InterruptedException e) {
throw new IOException("Spill thread failed to initialize", e);
} finally {
spillLock.unlock();
}
if (sortSpillException != null) {
throw new IOException("Spill thread failed to initialize",
sortSpillException);
}
}
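The numbers read at the top of init() come straight from the job configuration. If the defaults (a 100 MB buffer and a 0.8 spill threshold) are too small for a job, they can be overridden in the driver; a minimal sketch, using the Hadoop 2.x property names behind JobContext.IO_SORT_MB and JobContext.MAP_SORT_SPILL_PERCENT:
Configuration conf = new Configuration(true);
// size of the in-memory map output buffer (kvbuffer), in MB
conf.setInt("mapreduce.task.io.sort.mb", 200);
// fraction of the buffer that triggers a spill to disk
conf.setFloat("mapreduce.map.sort.spill.percent", 0.9f);
Job job = Job.getInstance(conf);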
The code above starts spillThread.start(); to see how the map output is spilled to disk, step into the SpillThread's run() method:
protected class SpillThread extends Thread {
@Override
public void run() {
spillLock.lock();
spillThreadRunning = true;
try {
while (true) {
spillDone.signal();
while (!spillInProgress) {
spillReady.await();
}
try {
spillLock.unlock();
//sort the buffered records and spill them to disk
sortAndSpill();
} catch (Throwable t) {
sortSpillException = t;
} finally {
spillLock.lock();
if (bufend < bufstart) {
bufvoid = kvbuffer.length;
}
kvstart = kvend;
bufstart = bufend;
spillInProgress = false;
}
}
} catch (InterruptedException e) {
Thread.currentThread().interrupt();
} finally {
spillLock.unlock();
spillThreadRunning = false;
}
}
}
Let's see how the SpillThread actually performs sortAndSpill():
private void sortAndSpill() throws IOException, ClassNotFoundException,
InterruptedException {
//approximate the length of the output file to be the length of the
//buffer + header lengths for the partitions
final long size = distanceTo(bufstart, bufend, bufvoid) +
partitions * APPROX_HEADER_LENGTH;
FSDataOutputStream out = null;
try {
// create spill file
final SpillRecord spillRec = new SpillRecord(partitions);
final Path filename =
mapOutputFile.getSpillFileForWrite(numSpills, size);
out = rfs.create(filename);
final int mstart = kvend / NMETA;
final int mend = 1 + // kvend is a valid record
(kvstart >= kvend
? kvstart
: kvmeta.capacity() + kvstart) / NMETA;
/* The sorter sorts the buffered map output; the comparator used during the sort is the one we obtained earlier (see above). The details of the sort are not analyzed here; step in if you are curious. */
sorter.sort(MapOutputBuffer.this, mstart, mend, reporter);
int spindex = mstart;
final IndexRecord rec = new IndexRecord();
final InMemValBytes value = new InMemValBytes();
for (int i = 0; i < partitions; ++i) {
IFile.Writer writer = null;
try {
long segmentStart = out.getPos();
FSDataOutputStream partitionOut = CryptoUtils.wrapIfNecessary(job, out);
writer = new Writer(job, partitionOut, keyClass, valClass, codec,
spilledRecordsCounter);
if (combinerRunner == null) {
// spill directly
DataInputBuffer key = new DataInputBuffer();
while (spindex < mend &&
kvmeta.get(offsetFor(spindex % maxRec) + PARTITION) == i) {
final int kvoff = offsetFor(spindex % maxRec);
int keystart = kvmeta.get(kvoff + KEYSTART);
int valstart = kvmeta.get(kvoff + VALSTART);
key.reset(kvbuffer, keystart, valstart - keystart);
getVBytesForOffset(kvoff, value);
writer.append(key, value);
++spindex;
}
} else {
int spstart = spindex;
while (spindex < mend &&
kvmeta.get(offsetFor(spindex % maxRec)
+ PARTITION) == i) {
++spindex;
}
// Note: we would like to avoid the combiner if we've fewer
// than some threshold of records for a partition
if (spstart != spindex) {
combineCollector.setWriter(writer);
RawKeyValueIterator kvIter =
new MRResultIterator(spstart, spindex);
combinerRunner.combine(kvIter, combineCollector);
}
}
// close the writer
writer.close();
// record offsets
rec.startOffset = segmentStart;
rec.rawLength = writer.getRawLength() + CryptoUtils.cryptoPadding(job);
rec.partLength = writer.getCompressedLength() + CryptoUtils.cryptoPadding(job);
spillRec.putIndex(rec, i);
writer = null;
} finally {
if (null != writer) writer.close();
}
}
if (totalIndexCacheMemory >= indexCacheMemoryLimit) {
// create spill index file
Path indexFilename =
mapOutputFile.getSpillIndexFileForWrite(numSpills, partitions
* MAP_OUTPUT_INDEX_RECORD_LENGTH);
spillRec.writeToFile(indexFilename, job);
} else {
indexCacheList.add(spillRec);
totalIndexCacheMemory +=
spillRec.size() * MAP_OUTPUT_INDEX_RECORD_LENGTH;
}
LOG.info("Finished spill " + numSpills);
++numSpills;
} finally {
if (out != null) out.close();
}
}
That was a long detour, but the goal was to obtain the NewOutputCollector collector, and the analysis above shows exactly how it is created. Why do we care about this collector?
Because MyMapper contains the line context.write(word, one);:
public void map(Object key, Text value, Context context) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
//this write is where the map output starts its journey toward the reducers
context.write(word, one);
}
}
Let's follow context.write(word, one). The mapper's Context here is a WrappedMapper.Context, whose write() looks like this:
@Override
public void write(KEYOUT key, VALUEOUT value) throws IOException,
InterruptedException {
mapContext.write(key, value);
}
Stepping into mapContext.write(key, value), which is TaskInputOutputContextImpl.write():
public void write(KEYOUT key, VALUEOUT value
) throws IOException, InterruptedException {
output.write(key, value);
}
The output here is exactly the NewOutputCollector we obtained above, and its write() hands each record to collector.collect(...) together with its partition number. With that, the Map output path has been traced roughly end to end.
Next, let's look at how the map side guarantees that records with the same key are sent to the same reduce task.
Back in the NewOutputCollector class, when the number of reduce tasks is greater than 1:
if (partitions > 1) {
//reflection again: the user-configured Partitioner if present, otherwise the default
partitioner = (org.apache.hadoop.mapreduce.Partitioner<K,V>)
ReflectionUtils.newInstance(jobContext.getPartitionerClass(), job);
}
The runtime type of jobContext here is JobContextImpl; step into its implementation of jobContext.getPartitionerClass():
@SuppressWarnings("unchecked")
public Class<? extends Partitioner<?,?>> getPartitionerClass()
throws ClassNotFoundException {
/* If the user configured PARTITIONER_CLASS_ATTR, that class is returned; otherwise HashPartitioner.class is used. */
return (Class<? extends Partitioner<?,?>>)
conf.getClass(PARTITIONER_CLASS_ATTR, HashPartitioner.class);
}
Let's take a look at the logic inside HashPartitioner:
public class HashPartitioner<K, V> extends Partitioner<K, V> {
/** Use {@link Object#hashCode()} to partition. */
public int getPartition(K key, V value,
int numReduceTasks) {
return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
}
}
As the code shows, hashing the key and taking the result modulo the number of reduce tasks guarantees that equal keys always map to the same partition; in plain terms, records with the same key end up in the same reduce task.
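If hash partitioning is not what a job needs, a custom Partitioner can be plugged in; that user class is exactly what jobContext.getPartitionerClass() would then return. Below is a small hypothetical sketch for the WordCount key/value types above (the class name and the routing rule are invented for illustration):
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
// words starting with a-m go to partition 0, everything else to partition 1
public class FirstLetterPartitioner extends Partitioner<Text, IntWritable> {
  @Override
  public int getPartition(Text key, IntWritable value, int numReduceTasks) {
    if (numReduceTasks < 2 || key.getLength() == 0) {
      return 0;
    }
    char first = Character.toLowerCase(key.toString().charAt(0));
    return (first >= 'a' && first <= 'm') ? 0 : 1;
  }
}
It would be registered in the driver with job.setPartitionerClass(FirstLetterPartitioner.class) together with job.setNumReduceTasks(2), so that the partitions > 1 branch above is taken.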
Back in the NewOutputCollector class:
@SuppressWarnings("unchecked")
NewOutputCollector(org.apache.hadoop.mapreduce.JobContext jobContext,
JobConf job,
TaskUmbilicalProtocol umbilical,
TaskReporter reporter
) throws IOException, ClassNotFoundException {
collector = createSortingCollector(job, reporter);
//there are as many partitions as there are reduce tasks
partitions = jobContext.getNumReduceTasks();
if (partitions > 1) {
/* Reflection again: the object is either the user-configured Partitioner or the default. These two lines obtain the partitioner that maps each key emitted by the map to a partition and hence to the reduce task that will process it. */
partitioner = (org.apache.hadoop.mapreduce.Partitioner<K,V>)
ReflectionUtils.newInstance(jobContext.getPartitionerClass(), job);
} else {
//if there is only one reduce task, every key is sent to that single reduce
partitioner = new org.apache.hadoop.mapreduce.Partitioner<K,V>() {
@Override
public int getPartition(K key, V value, int numPartitions) {
return partitions - 1;
}
};
}
}