Overview

Flow diagram: the stages a MapReduce job passes through, each paired with its default implementation (or the class WordCount plugs in):

| Stage | Implementation |
| --- | --- |
| InputFormat | TextInputFormat |
| RecordReader | LineRecordReader |
| InputSplit | FileSplit |
| map | Mapper (an identity map by default; normally user-supplied, e.g. TokenizerMapper) |
| combine | none by default (optional; WordCount reuses IntSumReducer) |
| Partitioner | HashPartitioner |
| GroupComparator | this one is magical; you could say it is where the miracle happens. It can be configured in two places: inside the key class or in the driver class (see the sketch after this table) |
| reduce | Reducer (an identity reduce by default; the old API's equivalent, IdentityReducer, lives in the legacy jar) |
| OutputFormat | TextOutputFormat (a FileOutputFormat) |
| RecordWriter | LineRecordWriter |
| OutputCommitter | FileOutputCommitter |
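Since the grouping comparator gets a row of its own above, here is a minimal sketch of both registration points. The comparator class and its grouping rule are illustrative, not from the original post:

// Option 1: a standalone comparator set in the driver class.
// Groups keys by their first character only, so one reduce() call
// sees the values of every key sharing that character.
public class FirstCharGroupComparator extends WritableComparator {
  protected FirstCharGroupComparator() {
    super(Text.class, true); // true: instantiate keys so object-level compare works
  }
  @Override
  public int compare(WritableComparable a, WritableComparable b) {
    return a.toString().substring(0, 1).compareTo(b.toString().substring(0, 1));
  }
}
// registered on the job:
job.setGroupingComparatorClass(FirstCharGroupComparator.class);

// Option 2: inside the key class itself, via a static registration block --
// exactly what Text does with WritableComparator.define(), shown later in this post.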
/**
 * A base class for file-based {@link InputFormat}s.
 *
 * <p><code>FileInputFormat</code> is the base class for all file-based
 * <code>InputFormat</code>s. This provides a generic implementation of
 * {@link #getSplits(JobContext)}.
 * Subclasses of <code>FileInputFormat</code> can also override the
 * {@link #isSplitable(JobContext, Path)} method to ensure input-files are
 * not split-up and are processed as a whole by {@link Mapper}s.
 */
public abstract class FileInputFormat<K, V> extends InputFormat<K, V> {
public static enum Counter {
BYTES_READ
}
private static final Log LOG = LogFactory.getLog(FileInputFormat.class);
private static final double SPLIT_SLOP = 1.1; // 10% slop
private static final PathFilter hiddenFileFilter = new PathFilter(){
public boolean accept(Path p){
String name = p.getName();
return !name.startsWith("_") && !name.startsWith(".");
}
};
static final String NUM_INPUT_FILES = "mapreduce.input.num.files";
/**
* Proxy PathFilter that accepts a path only if all filters given in the
* constructor do. Used by the listPaths() to apply the built-in
* hiddenFileFilter together with a user provided one (if any).
*/
private static class MultiPathFilter implements PathFilter {
private List<PathFilter> filters;
public MultiPathFilter(List<PathFilter> filters) {
this.filters = filters;
}
public boolean accept(Path path) {
for (PathFilter filter : filters) {
if (!filter.accept(path)) {
return false;
}
}
return true;
}
}
/**
* Get the lower bound on split size imposed by the format.
* @return the number of bytes of the minimal split for this format
*/
protected long getFormatMinSplitSize() {
return 1;
}
/**
 * Is the given filename splitable? Usually, true, but if the file is
 * stream compressed, it will not be.
 *
 * <p><code>FileInputFormat</code> implementations can override this and
 * return <code>false</code> to ensure that individual input files are
 * never split-up so that {@link Mapper}s process entire files.
 *
 * @param context the job context
 * @param filename the file name to check
 * @return is this file splitable?
 */
protected boolean isSplitable(JobContext context, Path filename) {
return true;
}
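A format that needs each file handled by exactly one mapper can override this hook. A minimal sketch (the class name is illustrative), reusing the TextInputFormat shown later in this post:

public class WholeFileTextInputFormat extends TextInputFormat {
  @Override
  protected boolean isSplitable(JobContext context, Path file) {
    return false; // never split: each mapper processes one whole file
  }
}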
/**
* Set a PathFilter to be applied to the input paths for the map-reduce job.
* @param job the job to modify
* @param filter the PathFilter class use for filtering the input paths.
*/
public static void setInputPathFilter(Job job,
Class<? extends PathFilter> filter) {
job.getConfiguration().setClass("mapred.input.pathFilter.class", filter,
PathFilter.class);
}
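A minimal sketch of a user-supplied filter (the class name is illustrative); listStatus() below combines it with the built-in hiddenFileFilter through the MultiPathFilter above:

public class LogOnlyFilter implements PathFilter {
  public boolean accept(Path path) {
    return path.getName().endsWith(".log"); // keep only .log inputs
  }
}
// in the driver:
FileInputFormat.setInputPathFilter(job, LogOnlyFilter.class);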
/**
* Set the minimum input split size
* @param job the job to modify
* @param size the minimum size
*/
public static void setMinInputSplitSize(Job job,
long size) {
job.getConfiguration().setLong("mapred.min.split.size", size);
}
/**
* Get the minimum split size
* @param job the job
* @return the minimum number of bytes that can be in a split
*/
public static long getMinSplitSize(JobContext job) {
return job.getConfiguration().getLong("mapred.min.split.size", 1L);
}
/**
* Set the maximum split size
* @param job the job to modify
* @param size the maximum split size
*/
public static void setMaxInputSplitSize(Job job,
long size) {
job.getConfiguration().setLong("mapred.max.split.size", size);
}
/**
* Get the maximum split size.
* @param context the job to look at.
* @return the maximum number of bytes a split can include
*/
public static long getMaxSplitSize(JobContext context) {
return context.getConfiguration().getLong("mapred.max.split.size",
Long.MAX_VALUE);
}
/**
* Get a PathFilter instance of the filter set for the input paths.
*
* @return the PathFilter instance set for the job, NULL if none has been set.
*/
public static PathFilter getInputPathFilter(JobContext context) {
Configuration conf = context.getConfiguration();
Class<?> filterClass = conf.getClass("mapred.input.pathFilter.class", null,
PathFilter.class);
return (filterClass != null) ?
(PathFilter) ReflectionUtils.newInstance(filterClass, conf) : null;
}
/** List input directories.
* Subclasses may override to, e.g., select only files matching a regular
* expression.
*
* @param job the job to list input paths for
* @return array of FileStatus objects
* @throws IOException if zero items.
*/
protected List<FileStatus> listStatus(JobContext job
) throws IOException {
List<FileStatus> result = new ArrayList<FileStatus>();
Path[] dirs = getInputPaths(job);
if (dirs.length == 0) {
throw new IOException("No input paths specified in job");
}
// get tokens for all the required FileSystems..
TokenCache.obtainTokensForNamenodes(job.getCredentials(), dirs,
job.getConfiguration());
List<IOException> errors = new ArrayList<IOException>();
// creates a MultiPathFilter with the hiddenFileFilter and the
// user provided one (if any).
List<PathFilter> filters = new ArrayList<PathFilter>();
filters.add(hiddenFileFilter);
PathFilter jobFilter = getInputPathFilter(job);
if (jobFilter != null) {
filters.add(jobFilter);
}
PathFilter inputFilter = new MultiPathFilter(filters);
for (int i=0; i < dirs.length; ++i) {
Path p = dirs[i];
FileSystem fs = p.getFileSystem(job.getConfiguration());
FileStatus[] matches = fs.globStatus(p, inputFilter);
if (matches == null) {
errors.add(new IOException("Input path does not exist: " + p));
} else if (matches.length == 0) {
errors.add(new IOException("Input Pattern " + p + " matches 0 files"));
} else {
for (FileStatus globStat: matches) {
if (globStat.isDir()) {
for(FileStatus stat: fs.listStatus(globStat.getPath(),
inputFilter)) {
result.add(stat);
}
} else {
result.add(globStat);
}
}
}
}
if (!errors.isEmpty()) {
throw new InvalidInputException(errors);
}
LOG.info("Total input paths to process : " + result.size());
return result;
}
/**
* Generate the list of files and make them into FileSplits.
*/
public List<InputSplit> getSplits(JobContext job
) throws IOException {
long minSize = Math.max(getFormatMinSplitSize(), getMinSplitSize(job));
long maxSize = getMaxSplitSize(job);
// generate splits
List<InputSplit> splits = new ArrayList<InputSplit>();
List<FileStatus> files = listStatus(job);
for (FileStatus file: files) {
Path path = file.getPath();
FileSystem fs = path.getFileSystem(job.getConfiguration());
long length = file.getLen();
BlockLocation[] blkLocations = fs.getFileBlockLocations(file, 0, length);
if ((length != 0) && isSplitable(job, path)) {
long blockSize = file.getBlockSize();
long splitSize = computeSplitSize(blockSize, minSize, maxSize);
long bytesRemaining = length;
while (((double) bytesRemaining)/splitSize > SPLIT_SLOP) {
int blkIndex = getBlockIndex(blkLocations, length-bytesRemaining);
splits.add(new FileSplit(path, length-bytesRemaining, splitSize,
blkLocations[blkIndex].getHosts()));
bytesRemaining -= splitSize;
}
if (bytesRemaining != 0) {
splits.add(new FileSplit(path, length-bytesRemaining, bytesRemaining,
blkLocations[blkLocations.length-1].getHosts()));
}
} else if (length != 0) {
splits.add(new FileSplit(path, 0, length, blkLocations[0].getHosts()));
} else {
//Create empty hosts array for zero length files
splits.add(new FileSplit(path, 0, length, new String[0]));
}
}
// Save the number of input files in the job-conf
job.getConfiguration().setLong(NUM_INPUT_FILES, files.size());
LOG.debug("Total # of splits: " + splits.size());
return splits;
}
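A worked example of the 10% SPLIT_SLOP: with a 100 MB file and a 64 MB split size, 100/64 ≈ 1.56 > 1.1, so a 64 MB split is cut and the remaining 36 MB (36/64 ≈ 0.56 ≤ 1.1) becomes the final split. A 68 MB file, by contrast, never enters the loop (68/64 ≈ 1.06 ≤ 1.1) and produces a single 68 MB split instead of a 64 MB split plus a 4 MB sliver.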
protected long computeSplitSize(long blockSize, long minSize,
long maxSize) {
return Math.max(minSize, Math.min(maxSize, blockSize));
}
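So splitSize = max(minSize, min(maxSize, blockSize)): with the defaults (minSize = 1, maxSize = Long.MAX_VALUE) a split is exactly one block. The setters shown earlier move it in either direction; a hypothetical driver snippet:

// Either knob alone changes the outcome (on a cluster with 64 MB blocks):
FileInputFormat.setMinInputSplitSize(job, 128L * 1024 * 1024); // max(128M, min(MAX, 64M)) = 128 MB splits
// ...or, instead:
FileInputFormat.setMaxInputSplitSize(job, 16L * 1024 * 1024);  // max(1, min(16M, 64M)) = 16 MB splits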
protected int getBlockIndex(BlockLocation[] blkLocations,
long offset) {
for (int i = 0 ; i < blkLocations.length; i++) {
// is the offset inside this block?
if ((blkLocations[i].getOffset() <= offset) &&
(offset < blkLocations[i].getOffset() + blkLocations[i].getLength())){
return i;
}
}
BlockLocation last = blkLocations[blkLocations.length -1];
long fileLength = last.getOffset() + last.getLength() -1;
throw new IllegalArgumentException("Offset " + offset +
" is outside of file (0.." +
fileLength + ")");
}
/**
* Sets the given comma separated paths as the list of inputs
* for the map-reduce job.
*
* @param job the job
* @param commaSeparatedPaths Comma separated paths to be set as
* the list of inputs for the map-reduce job.
*/
public static void setInputPaths(Job job,
String commaSeparatedPaths
) throws IOException {
setInputPaths(job, StringUtils.stringToPath(
getPathStrings(commaSeparatedPaths)));
}
/**
* Add the given comma separated paths to the list of inputs for
* the map-reduce job.
*
* @param job The job to modify
* @param commaSeparatedPaths Comma separated paths to be added to
* the list of inputs for the map-reduce job.
*/
public static void addInputPaths(Job job,
String commaSeparatedPaths
) throws IOException {
for (String str : getPathStrings(commaSeparatedPaths)) {
addInputPath(job, new Path(str));
}
}
/**
* Set the array of {@link Path}s as the list of inputs
* for the map-reduce job.
*
* @param job The job to modify
* @param inputPaths the {@link Path}s of the input directories/files
* for the map-reduce job.
*/
public static void setInputPaths(Job job,
Path... inputPaths) throws IOException {
Configuration conf = job.getConfiguration();
Path path = inputPaths[0].getFileSystem(conf).makeQualified(inputPaths[0]);
StringBuffer str = new StringBuffer(StringUtils.escapeString(path.toString()));
for(int i = 1; i < inputPaths.length;i++) {
str.append(StringUtils.COMMA_STR);
path = inputPaths[i].getFileSystem(conf).makeQualified(inputPaths[i]);
str.append(StringUtils.escapeString(path.toString()));
}
conf.set("mapred.input.dir", str.toString());
}
/**
* Add a {@link Path} to the list of inputs for the map-reduce job.
*
* @param job The {@link Job} to modify
* @param path {@link Path} to be added to the list of inputs for
* the map-reduce job.
*/
public static void addInputPath(Job job,
Path path) throws IOException {
Configuration conf = job.getConfiguration();
path = path.getFileSystem(conf).makeQualified(path);
String dirStr = StringUtils.escapeString(path.toString());
String dirs = conf.get("mapred.input.dir");
conf.set("mapred.input.dir", dirs == null ? dirStr : dirs + "," + dirStr);
}
// Splits the comma separated path list, ignoring commas that appear
// inside glob groups such as {a,b}.
private static String[] getPathStrings(String commaSeparatedPaths) {
  int length = commaSeparatedPaths.length();
  int curlyOpen = 0;
  int pathStart = 0;
  boolean globPattern = false;
  List<String> pathStrings = new ArrayList<String>();
  for (int i = 0; i < length; i++) {
    char ch = commaSeparatedPaths.charAt(i);
    if (ch == '{') {                        // entering a {a,b} glob group
      curlyOpen++;
      globPattern = true;
    } else if (ch == '}') {                 // leaving the glob group
      if (--curlyOpen == 0) {
        globPattern = false;
      }
    } else if (ch == ',' && !globPattern) { // a real path separator
      pathStrings.add(commaSeparatedPaths.substring(pathStart, i));
      pathStart = i + 1;
    }
  }
  pathStrings.add(commaSeparatedPaths.substring(pathStart, length));
  return pathStrings.toArray(new String[0]);
}
}
public class TextInputFormat extends FileInputFormat<LongWritable, Text> {
@Override
public RecordReader<LongWritable, Text>
createRecordReader(InputSplit split,
TaskAttemptContext context) {
return new LineRecordReader();
}
@Override
protected boolean isSplitable(JobContext context, Path file) {
CompressionCodec codec =
new CompressionCodecFactory(context.getConfiguration()).getCodec(file);
return codec == null;
}
}
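A practical consequence of this override: a gzip file has a codec and therefore is never split, so however large it is, it is read as a single split by a single mapper.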
public class FileSplit extends InputSplit implements Writable {
private Path file;
private long start;
private long length;
private String[] hosts;
FileSplit() {}
/** Constructs a split with host information
*
* @param file the file name
* @param start the position of the first byte in the file to process
* @param length the number of bytes in the file to process
* @param hosts the list of hosts containing the block, possibly null
*/
public FileSplit(Path file, long start, long length, String[] hosts) {
this.file = file;
this.start = start;
this.length = length;
this.hosts = hosts;
}
/** The file containing this split's data. */
public Path getPath() { return file; }
/** The position of the first byte in the file to process. */
public long getStart() { return start; }
/** The number of bytes in the file to process. */
@Override
public long getLength() { return length; }
@Override
public String toString() { return file + ":" + start + "+" + length; }
////////////////////////////////////////////
// Writable methods
////////////////////////////////////////////
@Override
public void write(DataOutput out) throws IOException {
Text.writeString(out, file.toString());
out.writeLong(start);
out.writeLong(length);
}
@Override
public void readFields(DataInput in) throws IOException {
file = new Path(Text.readString(in));
start = in.readLong();
length = in.readLong();
hosts = null;
}
@Override
public String[] getLocations() throws IOException {
if (this.hosts == null) {
return new String[]{};
} else {
return this.hosts;
}
}
}
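Note that write()/readFields() deliberately drop hosts: the block locations are only consulted on the scheduling side for data-local task placement, so they never need to travel with the serialized split (after deserialization, getLocations() simply returns an empty array).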
public class LineRecordReader extends RecordReader<LongWritable, Text> {
private static final Log LOG = LogFactory.getLog(LineRecordReader.class);
private CompressionCodecFactory compressionCodecs = null;
private long start;
private long pos;
private long end;
private LineReader in;
private int maxLineLength;
private LongWritable key = null;
private Text value = null;
public void initialize(InputSplit genericSplit,
TaskAttemptContext context) throws IOException {
FileSplit split = (FileSplit) genericSplit;
Configuration job = context.getConfiguration();
this.maxLineLength = job.getInt("mapred.linerecordreader.maxlength",
Integer.MAX_VALUE);
start = split.getStart();
end = start + split.getLength();
final Path file = split.getPath();
compressionCodecs = new CompressionCodecFactory(job);
final CompressionCodec codec = compressionCodecs.getCodec(file);
// open the file and seek to the start of the split
FileSystem fs = file.getFileSystem(job);
FSDataInputStream fileIn = fs.open(split.getPath());
boolean skipFirstLine = false;
if (codec != null) {
in = new LineReader(codec.createInputStream(fileIn), job);
end = Long.MAX_VALUE;
} else {
if (start != 0) {
skipFirstLine = true;
--start;
fileIn.seek(start);
}
in = new LineReader(fileIn, job);
}
if (skipFirstLine) { // skip first line and re-establish "start".
start += in.readLine(new Text(), 0,
(int)Math.min((long)Integer.MAX_VALUE, end - start));
}
this.pos = start;
}
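Why the one-byte back-up and skipped line: a split boundary usually falls in the middle of a line. The convention is that every split except the first discards everything up to and including its first newline, while the previous split's reader is allowed to read past its own end to finish the line it started. Backing start up by one byte makes this work even when the split happens to begin exactly at a line boundary. The net effect is that every line is read exactly once, by exactly one mapper.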
public boolean nextKeyValue() throws IOException {
if (key == null) {
key = new LongWritable();
}
key.set(pos);
if (value == null) {
value = new Text();
}
int newSize = 0;
while (pos < end) {
newSize = in.readLine(value, maxLineLength,
Math.max((int)Math.min(Integer.MAX_VALUE, end-pos),
maxLineLength));
if (newSize == 0) {
break;
}
pos += newSize;
if (newSize < maxLineLength) {
break;
}
// line too long. try again
LOG.info("Skipped line of size " + newSize + " at pos " +
(pos - newSize));
}
if (newSize == 0) {
key = null;
value = null;
return false;
} else {
return true;
}
}
@Override
public LongWritable getCurrentKey() {
return key;
}
@Override
public Text getCurrentValue() {
return value;
}
/**
* Get the progress within the split
*/
public float getProgress() {
if (start == end) {
return 0.0f;
} else {
return Math.min(1.0f, (pos - start) / (float)(end - start));
}
}
public synchronized void close() throws IOException {
if (in != null) {
in.close();
}
}
}
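For context, nextKeyValue()/getCurrentKey()/getCurrentValue() are driven by the framework; Mapper.run() in this Hadoop version is essentially:

public void run(Context context) throws IOException, InterruptedException {
  setup(context);
  while (context.nextKeyValue()) {   // delegates to LineRecordReader.nextKeyValue()
    map(context.getCurrentKey(), context.getCurrentValue(), context);
  }
  cleanup(context);
}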
public class TokenizerMapper
extends Mapper<Object, Text, Text, IntWritable> {
private final static IntWritable one = new IntWritable(1);
private Text word = new Text();
public void map(Object key, Text value, Context context
) throws IOException, InterruptedException {
StringTokenizer itr = new StringTokenizer(value.toString());
while (itr.hasMoreTokens()) {
word.set(itr.nextToken());
context.write(word, one); // emit (word, 1) for every token
}
}
}
public class IntSumReducer
extends Reducer<Text, IntWritable, Text, IntWritable> {
private IntWritable result = new IntWritable();
public void reduce(Text key, Iterable<IntWritable> values,
Context context
) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable val : values) {
sum += val.get();
}
result.set(sum);
context.write(key, result);
}
}
public class HashPartitioner<K, V> extends Partitioner<K, V> {
/** Use {@link Object#hashCode()} to partition. */
public int getPartition(K key, V value, int numReduceTasks) {
return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
}
}
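Note the & Integer.MAX_VALUE: hashCode() may be negative and Java's % keeps the dividend's sign, so the sign bit is masked off first, guaranteeing a partition number in [0, numReduceTasks). Because equal keys have equal hash codes, all records for a key land in the same reduce partition.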
public static class Comparator extends WritableComparator {
public Comparator() {
super(Text.class);
}
public int compare(byte[] b1, int s1, int l1,
byte[] b2, int s2, int l2) {
int n1 = WritableUtils.decodeVIntSize(b1[s1]);
int n2 = WritableUtils.decodeVIntSize(b2[s2]);
return compareBytes(b1, s1+n1, l1-n1, b2, s2+n2, l2-n2);
}
}
static {
// register this comparator
WritableComparator.define(Text.class, new Comparator());
}
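This Comparator, together with the static registration block, lives inside the Text class itself; it is the "inside the key class" option from the table at the top. Because it is a raw WritableComparator, the sort and group phases can compare keys on their serialized bytes without deserializing them: Text serializes as a vint length prefix followed by UTF-8 bytes, so decodeVIntSize() skips the prefix and compareBytes() compares the payloads.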
The reduce phase runs the same IntSumReducer already shown above: WordCount registers that single class as both the combiner and the reducer.
public abstract class FileOutputFormat<K, V> extends OutputFormat<K, V> {
protected static final String BASE_OUTPUT_NAME = "mapreduce.output.basename";
protected static final String PART = "part";
public static enum Counter {
BYTES_WRITTEN
}
/** Construct output file names so that, when an output directory listing is
* sorted lexicographically, positions correspond to output partitions.*/
private static final NumberFormat NUMBER_FORMAT = NumberFormat.getInstance();
static {
NUMBER_FORMAT.setMinimumIntegerDigits(5);
NUMBER_FORMAT.setGroupingUsed(false);
}
private FileOutputCommitter committer = null;
/**
* Set whether the output of the job is compressed.
* @param job the job to modify
* @param compress should the output of the job be compressed?
*/
public static void setCompressOutput(Job job, boolean compress) {
job.getConfiguration().setBoolean("mapred.output.compress", compress);
}
/**
 * Is the job output compressed?
 * @param job the Job to look in
 * @return <code>true</code> if the job output should be compressed,
 *         <code>false</code> otherwise
 */
public static boolean getCompressOutput(JobContext job) {
return job.getConfiguration().getBoolean("mapred.output.compress", false);
}
/**
* Set the {@link CompressionCodec} to be used to compress job outputs.
* @param job the job to modify
* @param codecClass the {@link CompressionCodec} to be used to
* compress the job outputs
*/
public static void
setOutputCompressorClass(Job job,
Class<? extends CompressionCodec> codecClass) {
setCompressOutput(job, true);
job.getConfiguration().setClass("mapred.output.compression.codec",
codecClass,
CompressionCodec.class);
}
/**
* Get the {@link CompressionCodec} for compressing the job outputs.
* @param job the {@link Job} to look in
* @param defaultValue the {@link CompressionCodec} to return if not set
* @return the {@link CompressionCodec} to be used to compress the
* job outputs
* @throws IllegalArgumentException if the class was specified, but not found
*/
public static Class<? extends CompressionCodec>
getOutputCompressorClass(JobContext job,
Class<? extends CompressionCodec> defaultValue) {
Class<? extends CompressionCodec> codecClass = defaultValue;
Configuration conf = job.getConfiguration();
String name = conf.get("mapred.output.compression.codec");
if (name != null) {
try {
codecClass =
conf.getClassByName(name).asSubclass(CompressionCodec.class);
} catch (ClassNotFoundException e) {
throw new IllegalArgumentException("Compression codec " + name +
" was not found.", e);
}
}
return codecClass;
}
public abstract RecordWriter<K, V>
getRecordWriter(TaskAttemptContext job
) throws IOException, InterruptedException;
public void checkOutputSpecs(JobContext job
) throws FileAlreadyExistsException, IOException{
// Ensure that the output directory is set and not already there
Path outDir = getOutputPath(job);
if (outDir == null) {
throw new InvalidJobConfException("Output directory not set.");
}
// get delegation token for outDir's file system
TokenCache.obtainTokensForNamenodes(job.getCredentials(),
new Path[] {outDir},
job.getConfiguration());
if (outDir.getFileSystem(job.getConfiguration()).exists(outDir)) {
throw new FileAlreadyExistsException("Output directory " + outDir +
" already exists");
}
}
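This check is why resubmitting a job against an existing output directory fails immediately with FileAlreadyExistsException: delete the directory or pick a new output path before rerunning.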
/**
* Set the {@link Path} of the output directory for the map-reduce job.
*
* @param job The job to modify
* @param outputDir the {@link Path} of the output directory for
* the map-reduce job.
*/
public static void setOutputPath(Job job, Path outputDir) {
job.getConfiguration().set("mapred.output.dir", outputDir.toString());
}
/**
* Get the {@link Path} to the output directory for the map-reduce job.
*
* @return the {@link Path} to the output directory for the map-reduce job.
* @see FileOutputFormat#getWorkOutputPath(TaskInputOutputContext)
*/
public static Path getOutputPath(JobContext job) {
String name = job.getConfiguration().get("mapred.output.dir");
return name == null ? null: new Path(name);
}
/**
 * Get the {@link Path} to the task's temporary output directory
 * for the map-reduce job.
 *
 * <h4>Tasks' Side-Effect Files</h4>
 *
 * <p>Some applications need to create/write-to side-files, which differ from
 * the actual job-outputs.</p>
 *
 * <p>In such cases there could be issues with 2 instances of the same TIP
 * (running simultaneously e.g. speculative tasks) trying to open/write-to the
 * same file (path) on HDFS. Hence the application-writer will have to pick
 * unique names per task-attempt (e.g. using the attemptid, say
 * <tt>attempt_200709221812_0001_m_000000_0</tt>), not just per TIP.</p>
 *
 * <p>To get around this the Map-Reduce framework helps the application-writer
 * out by maintaining a special
 * <tt>${mapred.output.dir}/_temporary/_${taskid}</tt>
 * sub-directory for each task-attempt on HDFS where the output of the
 * task-attempt goes. On successful completion of the task-attempt the files
 * in the <tt>${mapred.output.dir}/_temporary/_${taskid}</tt> (only)
 * are promoted to <tt>${mapred.output.dir}</tt>. Of course, the
 * framework discards the sub-directory of unsuccessful task-attempts. This
 * is completely transparent to the application.</p>
 *
 * <p>The application-writer can take advantage of this by creating any
 * side-files required in a work directory during execution
 * of his task i.e. via
 * {@link #getWorkOutputPath(TaskInputOutputContext)}, and
 * the framework will move them out similarly - thus she doesn't have to pick
 * unique paths per task-attempt.</p>
 *
 * <p>The entire discussion holds true for maps of jobs with
 * reducer=NONE (i.e. 0 reduces) since output of the map, in that case,
 * goes directly to HDFS.</p>
 *
 * @return the {@link Path} to the task's temporary output directory
 * for the map-reduce job.
 */
public static Path getWorkOutputPath(TaskInputOutputContext<?,?,?,?> context
) throws IOException,
InterruptedException {
FileOutputCommitter committer = (FileOutputCommitter)
context.getOutputCommitter();
return committer.getWorkPath();
}
/**
 * Helper function to generate a {@link Path} for a file that is unique for
 * the task within the job output directory.
 *
 * <p>The path can be used to create custom files from within the map and
 * reduce tasks. The path name will be unique for each task. The path parent
 * will be the job output directory.</p>
 *
 * <p>This method uses the {@link #getUniqueFile} method to make the file name
 * unique for the task.</p>
 *
 * @param context the context for the task.
 * @param name the name for the file.
 * @param extension the extension for the file
 * @return a unique path across all tasks of the job.
 */
public
static Path getPathForWorkFile(TaskInputOutputContext<?,?,?,?> context,
String name,
String extension
) throws IOException, InterruptedException {
return new Path(getWorkOutputPath(context),
getUniqueFile(context, name, extension));
}
/**
* Generate a unique filename, based on the task id, name, and extension
* @param context the task that is calling this
* @param name the base filename
* @param extension the filename extension
* @return a string like $name-[mr]-$id$extension
*/
public synchronized static String getUniqueFile(TaskAttemptContext context,
String name,
String extension) {
TaskID taskId = context.getTaskAttemptID().getTaskID();
int partition = taskId.getId();
StringBuilder result = new StringBuilder();
result.append(name);
result.append('-');
result.append(taskId.isMap() ? 'm' : 'r');
result.append('-');
result.append(NUMBER_FORMAT.format(partition));
result.append(extension);
return result.toString();
}
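For example, reduce task 0 writing the default base name yields part-r-00000 (five digits from NUMBER_FORMAT), and map task 3 with a .gz extension would get part-m-00003.gz.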
/**
* Get the default path and filename for the output format.
* @param context the task context
* @param extension an extension to add to the filename
* @return a full path $output/_temporary/$taskid/part-[mr]-$id
* @throws IOException
*/
public Path getDefaultWorkFile(TaskAttemptContext context,
String extension) throws IOException{
FileOutputCommitter committer =
(FileOutputCommitter) getOutputCommitter(context);
return new Path(committer.getWorkPath(), getUniqueFile(context,
getOutputName(context), extension));
}
/**
* Get the base output name for the output file.
*/
protected static String getOutputName(JobContext job) {
return job.getConfiguration().get(BASE_OUTPUT_NAME, PART);
}
/**
* Set the base output name for output file to be created.
*/
protected static void setOutputName(JobContext job, String name) {
job.getConfiguration().set(BASE_OUTPUT_NAME, name);
}
public synchronized
OutputCommitter getOutputCommitter(TaskAttemptContext context
) throws IOException {
if (committer == null) {
Path output = getOutputPath(context);
committer = new FileOutputCommitter(output, context);
}
return committer;
}
}
public class TextOutputFormat<K, V> extends FileOutputFormat<K, V> {
protected static class LineRecordWriter<K, V>
extends RecordWriter<K, V> {
private static final String utf8 = "UTF-8";
private static final byte[] newline;
static {
try {
newline = "\n".getBytes(utf8);
} catch (UnsupportedEncodingException uee) {
throw new IllegalArgumentException("can't find " + utf8 + " encoding");
}
}
protected DataOutputStream out;
private final byte[] keyValueSeparator;
public LineRecordWriter(DataOutputStream out, String keyValueSeparator) {
this.out = out;
try {
this.keyValueSeparator = keyValueSeparator.getBytes(utf8);
} catch (UnsupportedEncodingException uee) {
throw new IllegalArgumentException("can't find " + utf8 + " encoding");
}
}
public LineRecordWriter(DataOutputStream out) {
this(out, "\t");
}
/**
* Write the object to the byte stream, handling Text as a special
* case.
* @param o the object to print
* @throws IOException if the write throws, we pass it on
*/
private void writeObject(Object o) throws IOException {
if (o instanceof Text) {
Text to = (Text) o;
out.write(to.getBytes(), 0, to.getLength());
} else {
out.write(o.toString().getBytes(utf8));
}
}
public synchronized void write(K key, V value)
throws IOException {
boolean nullKey = key == null || key instanceof NullWritable;
boolean nullValue = value == null || value instanceof NullWritable;
if (nullKey && nullValue) {
return;
}
if (!nullKey) {
writeObject(key);
}
if (!(nullKey || nullValue)) {
out.write(keyValueSeparator);
}
if (!nullValue) {
writeObject(value);
}
out.write(newline);
}
public synchronized
void close(TaskAttemptContext context) throws IOException {
out.close();
}
}
public RecordWriter<K, V>
getRecordWriter(TaskAttemptContext job
) throws IOException, InterruptedException {
Configuration conf = job.getConfiguration();
boolean isCompressed = getCompressOutput(job);
String keyValueSeparator= conf.get("mapred.textoutputformat.separator",
"\t");
CompressionCodec codec = null;
String extension = "";
if (isCompressed) {
Class<? extends CompressionCodec> codecClass =
getOutputCompressorClass(job, GzipCodec.class);
codec = (CompressionCodec) ReflectionUtils.newInstance(codecClass, conf);
extension = codec.getDefaultExtension();
}
Path file = getDefaultWorkFile(job, extension);
FileSystem fs = file.getFileSystem(conf);
if (!isCompressed) {
FSDataOutputStream fileOut = fs.create(file, false);
return new LineRecordWriter<K, V>(fileOut, keyValueSeparator);
} else {
FSDataOutputStream fileOut = fs.create(file, false);
return new LineRecordWriter<K, V>(new DataOutputStream
(codec.createOutputStream(fileOut)),
keyValueSeparator);
}
}
}
public class FileOutputCommitter extends OutputCommitter {
private static final Log LOG = LogFactory.getLog(FileOutputCommitter.class);
/**
* Temporary directory name
*/
protected static final String TEMP_DIR_NAME = "_temporary";
public static final String SUCCEEDED_FILE_NAME = "_SUCCESS";
static final String SUCCESSFUL_JOB_OUTPUT_DIR_MARKER =
"mapreduce.fileoutputcommitter.marksuccessfuljobs";
private FileSystem outputFileSystem = null;
private Path outputPath = null;
private Path workPath = null;
/**
* Create a file output committer
* @param outputPath the job's output path
* @param context the task's context
* @throws IOException
*/
public FileOutputCommitter(Path outputPath,
TaskAttemptContext context) throws IOException {
if (outputPath != null) {
this.outputPath = outputPath;
outputFileSystem = outputPath.getFileSystem(context.getConfiguration());
workPath = new Path(outputPath,
(FileOutputCommitter.TEMP_DIR_NAME + Path.SEPARATOR +
"_" + context.getTaskAttemptID().toString()
)).makeQualified(outputFileSystem);
}
}
/**
* Create the temporary directory that is the root of all of the task
* work directories.
* @param context the job's context
*/
public void setupJob(JobContext context) throws IOException {
if (outputPath != null) {
Path tmpDir = new Path(outputPath, FileOutputCommitter.TEMP_DIR_NAME);
FileSystem fileSys = tmpDir.getFileSystem(context.getConfiguration());
if (!fileSys.mkdirs(tmpDir)) {
LOG.error("Mkdirs failed to create " + tmpDir.toString());
}
}
}
private static boolean shouldMarkOutputDir(Configuration conf) {
return conf.getBoolean(SUCCESSFUL_JOB_OUTPUT_DIR_MARKER,
true);
}
// Mark the output dir of the job for which the context is passed.
private void markOutputDirSuccessful(JobContext context)
throws IOException {
if (outputPath != null) {
FileSystem fileSys = outputPath.getFileSystem(context.getConfiguration());
if (fileSys.exists(outputPath)) {
// create a file in the folder to mark it
Path filePath = new Path(outputPath, SUCCEEDED_FILE_NAME);
fileSys.create(filePath).close();
}
}
}
/**
* Delete the temporary directory, including all of the work directories.
* This is called for all jobs whose final run state is SUCCEEDED
* @param context the job's context.
*/
public void commitJob(JobContext context) throws IOException {
// delete the _temporary folder
cleanupJob(context);
// check if the o/p dir should be marked
if (shouldMarkOutputDir(context.getConfiguration())) {
// create a _success file in the o/p folder
markOutputDirSuccessful(context);
}
}
@Override
@Deprecated
public void cleanupJob(JobContext context) throws IOException {
if (outputPath != null) {
Path tmpDir = new Path(outputPath, FileOutputCommitter.TEMP_DIR_NAME);
FileSystem fileSys = tmpDir.getFileSystem(context.getConfiguration());
if (fileSys.exists(tmpDir)) {
fileSys.delete(tmpDir, true);
}
} else {
LOG.warn("Output path is null in cleanup");
}
}
/**
* Delete the temporary directory, including all of the work directories.
* @param context the job's context
* @param state final run state of the job, should be FAILED or KILLED
*/
@Override
public void abortJob(JobContext context, JobStatus.State state)
throws IOException {
cleanupJob(context);
}
/**
* No task setup required.
*/
@Override
public void setupTask(TaskAttemptContext context) throws IOException {
// FileOutputCommitter's setupTask doesn't do anything. Because the
// temporary task directory is created on demand when the
// task is writing.
}
/**
* Move the files from the work directory to the job output directory
* @param context the task context
*/
public void commitTask(TaskAttemptContext context)
throws IOException {
TaskAttemptID attemptId = context.getTaskAttemptID();
if (workPath != null) {
context.progress();
if (outputFileSystem.exists(workPath)) {
// Move the task outputs to their final place
moveTaskOutputs(context, outputFileSystem, outputPath, workPath);
// Delete the temporary task-specific output directory
if (!outputFileSystem.delete(workPath, true)) {
LOG.warn("Failed to delete the temporary output" +
" directory of task: " + attemptId + " - " + workPath);
}
LOG.info("Saved output of task '" + attemptId + "' to " +
outputPath);
}
}
}
/**
* Move all of the files from the work directory to the final output
* @param context the task context
* @param fs the output file system
* @param jobOutputDir the final output directory
* @param taskOutput the work path
* @throws IOException
*/
private void moveTaskOutputs(TaskAttemptContext context,
FileSystem fs,
Path jobOutputDir,
Path taskOutput)
throws IOException {
TaskAttemptID attemptId = context.getTaskAttemptID();
context.progress();
if (fs.isFile(taskOutput)) {
Path finalOutputPath = getFinalPath(jobOutputDir, taskOutput,
workPath);
if (!fs.rename(taskOutput, finalOutputPath)) {
if (!fs.delete(finalOutputPath, true)) {
throw new IOException("Failed to delete earlier output of task: " +
attemptId);
}
if (!fs.rename(taskOutput, finalOutputPath)) {
throw new IOException("Failed to save output of task: " +
attemptId);
}
}
LOG.debug("Moved " + taskOutput + " to " + finalOutputPath);
} else if(fs.getFileStatus(taskOutput).isDir()) {
FileStatus[] paths = fs.listStatus(taskOutput);
Path finalOutputPath = getFinalPath(jobOutputDir, taskOutput, workPath);
fs.mkdirs(finalOutputPath);
if (paths != null) {
for (FileStatus path : paths) {
moveTaskOutputs(context, fs, jobOutputDir, path.getPath());
}
}
}
}
/**
* Delete the work directory
*/
@Override
public void abortTask(TaskAttemptContext context) {
try {
if (workPath != null) {
context.progress();
outputFileSystem.delete(workPath, true);
}
} catch (IOException ie) {
LOG.warn("Error discarding output" + StringUtils.stringifyException(ie));
}
}
/**
* Find the final name of a given output file, given the job output directory
* and the work directory.
* @param jobOutputDir the job's output directory
* @param taskOutput the specific task output file
* @param taskOutputPath the job's work directory
* @return the final path for the specific output file
* @throws IOException
*/
private Path getFinalPath(Path jobOutputDir, Path taskOutput,
Path taskOutputPath) throws IOException {
URI taskOutputUri = taskOutput.toUri();
URI relativePath = taskOutputPath.toUri().relativize(taskOutputUri);
if (taskOutputUri == relativePath) {
throw new IOException("Can not get the relative path: base = " +
taskOutputPath + " child = " + taskOutput);
}
if (relativePath.getPath().length() > 0) {
return new Path(jobOutputDir, relativePath.getPath());
} else {
return jobOutputDir;
}
}
/**
* Did this task write any files in the work directory?
* @param context the task's context
*/
@Override
public boolean needsTaskCommit(TaskAttemptContext context
) throws IOException {
return workPath != null && outputFileSystem.exists(workPath);
}
/**
* Get the directory that the task should write results into
* @return the work directory
* @throws IOException
*/
public Path getWorkPath() throws IOException {
return workPath;
}
}
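Putting the committer's pieces together, an output file moves through roughly these locations (paths follow the code above; the attempt id is illustrative):

${mapred.output.dir}/_temporary/_attempt_200709221812_0001_r_000000_0/part-r-00000   (task writes here, via getWorkPath())
${mapred.output.dir}/part-r-00000   (after commitTask() moves it)
${mapred.output.dir}/_SUCCESS       (after commitJob(), which also deletes _temporary)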
public class WordCount {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
if (otherArgs.length != 2) {
System.err.println("Usage: wordcount ");
System.exit(2);
}
Job job = new Job(conf, "word count");
job.setJarByClass(WordCount.class);
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(job, new Path(otherArgs[0]));
job.setMapperClass(TokenizerMapper.class);
job.setCombinerClass(IntSumReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setGroupingComparatorClass(Text.Comparator.class);
job.setReducerClass(IntSumReducer.class);
job.setOutputFormatClass(TextOutputFormat.class);
TextOutputFormat.setOutputPath(job, new Path(otherArgs[1]));
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}
hadoop --config <config directory> jar <path to jar> WordCount in out