Core Concepts
- Split: the chunk of data processed by a MapReduce job, and the smallest unit of computation in MapReduce. By default a split maps one-to-one to an HDFS block; the ratio between them can be tuned manually, but this is not recommended (see the split-size sketch after the InputFormat source below).
- InputFormat: splits the input data into splits.
package org.apache.hadoop.mapred;
import java.io.IOException;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.fs.FileSystem;
/**
* InputFormat describes the input-specification for a Map-Reduce job.
*
* The Map-Reduce framework relies on the InputFormat of the job to:
*
*   1. Validate the input-specification of the job.
*   2. Split up the input file(s) into logical {@link InputSplit}s, each of
*      which is then assigned to an individual {@link Mapper}.
*   3. Provide the {@link RecordReader} implementation to be used to glean
*      input records from the logical InputSplit for processing by the
*      {@link Mapper}.
*
* The default behavior of file-based {@link InputFormat}s, typically
* sub-classes of {@link FileInputFormat}, is to split the input into logical
* {@link InputSplit}s based on the total size, in bytes, of the input files.
* However, the {@link FileSystem} blocksize of the input files is treated as
* an upper bound for input splits. A lower bound on the split size can be set
* via mapreduce.input.fileinputformat.split.minsize.
*
* Clearly, logical splits based on input size are insufficient for many
* applications, since record boundaries must be respected. In such cases the
* application also has to implement a {@link RecordReader}, whose
* responsibility it is to respect record boundaries and present a
* record-oriented view of the logical InputSplit to the individual task.
*
* @see InputSplit
* @see RecordReader
* @see JobClient
* @see FileInputFormat
*/
@InterfaceAudience.Public
@InterfaceStability.Stable
public interface InputFormat<K, V> {
/**
* Logically split the set of input files for the job.
*
* Each {@link InputSplit} is then assigned to an individual {@link Mapper}
* for processing.
*
* Note: The split is a logical split of the inputs and the
* input files are not physically split into chunks. For e.g. a split could
* be <input-file-path, start, offset> tuple.
*
* @param job job configuration.
* @param numSplits the desired number of splits, a hint.
* @return an array of {@link InputSplit}s for the job.
*/
InputSplit[] getSplits(JobConf job, int numSplits) throws IOException;
/**
* Get the {@link RecordReader} for the given {@link InputSplit}.
*
* It is the responsibility of the RecordReader to respect record boundaries
* while processing the logical split to present a record-oriented view to
* the individual task.
*
* @param split the {@link InputSplit}
* @param job the job that this split belongs to
* @return a {@link RecordReader}
*/
RecordReader<K, V> getRecordReader(InputSplit split,
                                   JobConf job,
                                   Reporter reporter) throws IOException;
}
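As noted in the Split bullet above, the HDFS block size acts as the upper bound for a split and the lower bound is configurable. A minimal, hedged sketch of tuning split sizes from the driver (the sizes are arbitrary example values; in practice the defaults are usually fine):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

public class SplitSizeExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "split-size-demo");

    // Lower bound: equivalent to mapreduce.input.fileinputformat.split.minsize.
    FileInputFormat.setMinInputSplitSize(job, 64L * 1024 * 1024);   // 64 MB
    // Upper bound: equivalent to mapreduce.input.fileinputformat.split.maxsize.
    FileInputFormat.setMaxInputSplitSize(job, 256L * 1024 * 1024);  // 256 MB
    // ... rest of the job setup as usual ...
  }
}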
- OutputFormat: writes the job's output to the file system (a driver-side sketch follows the OutputFormat source below)
package org.apache.hadoop.mapred;
import java.io.IOException;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.util.Progressable;
/**
* OutputFormat describes the output-specification for a Map-Reduce job.
*
* The Map-Reduce framework relies on the OutputFormat of the job to:
*
*   1. Validate the output-specification of the job, e.g. check that the
*      output directory doesn't already exist.
*   2. Provide the {@link RecordWriter} implementation to be used to write
*      out the output files of the job. Output files are stored in a
*      {@link FileSystem}.
*
* @see RecordWriter
* @see JobConf
*/
@InterfaceAudience.Public
@InterfaceStability.Stable
public interface OutputFormat<K, V> {
/**
* Get the {@link RecordWriter} for the given job.
*
* @param ignored
* @param job configuration for the job whose output is being written.
* @param name the unique name for this part of the output.
* @param progress mechanism for reporting progress while writing to file.
* @return a {@link RecordWriter} to write the output for the job.
* @throws IOException
*/
RecordWriter<K, V> getRecordWriter(FileSystem ignored, JobConf job,
                                   String name, Progressable progress)
    throws IOException;
/**
* Check for validity of the output-specification for the job.
*
* This is to validate the output specification for the job when it is
* submitted. Typically it checks that the output does not already exist,
* throwing an exception when it already exists, so that output is not
* overwritten.
*
* @param ignored
* @param job job configuration.
* @throws IOException when output should not be attempted
*/
void checkOutputSpecs(FileSystem ignored, JobConf job) throws IOException;
}
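A minimal driver-side sketch of choosing an output format and output path, as referenced in the OutputFormat bullet above (TextOutputFormat is just one common choice, and the output path is a placeholder):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class OutputFormatExample {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "output-format-demo");

    // Key/value types that the RecordWriter will serialize.
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // TextOutputFormat writes "key<TAB>value" lines; checkOutputSpecs() will
    // fail the submission if the output directory already exists.
    job.setOutputFormatClass(TextOutputFormat.class);
    FileOutputFormat.setOutputPath(job, new Path("/tmp/demo-output"));
  }
}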
- Mapper
package org.apache.hadoop.mapreduce;
import java.io.IOException;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.RawComparator;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.mapreduce.task.MapContextImpl;
/**
* Maps input key/value pairs to a set of intermediate key/value pairs.
*
* Maps are the individual tasks which transform input records into
* intermediate records. The transformed intermediate records need not be of
* the same type as the input records. A given input pair may map to zero or
* many output pairs.
*
* The Hadoop Map-Reduce framework spawns one map task for each
* {@link InputSplit} generated by the {@link InputFormat} for the job.
* Mapper implementations can access the {@link Configuration} for the job
* via {@link JobContext#getConfiguration()}.
*
* The framework first calls
* {@link #setup(org.apache.hadoop.mapreduce.Mapper.Context)}, followed by
* {@link #map(Object, Object, Context)} for each key/value pair in the
* InputSplit. Finally {@link #cleanup(Context)} is called.
*
* All intermediate values associated with a given output key are
* subsequently grouped by the framework, and passed to a {@link Reducer} to
* determine the final output. Users can control the sorting and grouping by
* specifying two key {@link RawComparator} classes.
*
* The Mapper outputs are partitioned per Reducer. Users can control which
* keys (and hence records) go to which Reducer by implementing a custom
* {@link Partitioner}.
*
* Users can optionally specify a combiner, via
* {@link Job#setCombinerClass(Class)}, to perform local aggregation of the
* intermediate outputs, which helps to cut down the amount of data
* transferred from the Mapper to the Reducer.
*
* Applications can specify if and how the intermediate outputs are to be
* compressed, and which {@link CompressionCodec}s are to be used, via the
* Configuration.
*
* If the job has zero reduces then the output of the Mapper is directly
* written to the {@link OutputFormat} without sorting by keys.
*
* Example:
*
*   public class TokenCounterMapper
*       extends Mapper<Object, Text, Text, IntWritable> {
*
*     private final static IntWritable one = new IntWritable(1);
*     private Text word = new Text();
*
*     public void map(Object key, Text value, Context context
*                     ) throws IOException, InterruptedException {
*       StringTokenizer itr = new StringTokenizer(value.toString());
*       while (itr.hasMoreTokens()) {
*         word.set(itr.nextToken());
*         context.write(word, one);
*       }
*     }
*   }
*
* Applications may override the {@link #run(Context)} method to exert
* greater control on map processing, e.g. multi-threaded Mappers etc.
*
* @see InputFormat
* @see JobContext
* @see Partitioner
* @see Reducer
*/
@InterfaceAudience.Public
@InterfaceStability.Stable
public class Mapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {
/**
* The Context passed on to the {@link Mapper} implementations.
*/
public abstract class Context
    implements MapContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {
}
/**
* Called once at the beginning of the task.
*/
protected void setup(Context context
) throws IOException, InterruptedException {
// NOTHING
}
/**
* Called once for each key/value pair in the input split. Most applications
* should override this, but the default is the identity function.
*/
@SuppressWarnings("unchecked")
protected void map(KEYIN key, VALUEIN value,
Context context) throws IOException, InterruptedException {
context.write((KEYOUT) key, (VALUEOUT) value);
}
/**
* Called once at the end of the task.
*/
protected void cleanup(Context context
) throws IOException, InterruptedException {
// NOTHING
}
/**
* Expert users can override this method for more complete control over the
* execution of the Mapper.
* @param context
* @throws IOException
*/
public void run(Context context) throws IOException, InterruptedException {
setup(context);
try {
while (context.nextKeyValue()) {
map(context.getCurrentKey(), context.getCurrentValue(), context);
}
} finally {
cleanup(context);
}
}
}
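The javadoc above mentions combiners and compressed intermediate output; below is a minimal sketch of wiring both from the driver. The property names are the standard MapReduce keys; SnappyCodec is only an example codec (it needs the native Snappy library), and the stock org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer is used here just to have a concrete combiner class:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.reduce.IntSumReducer;

public class MapOutputTuning {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Compress the map-side intermediate output before it is shuffled.
    conf.setBoolean("mapreduce.map.output.compress", true);
    conf.setClass("mapreduce.map.output.compress.codec",
        SnappyCodec.class, CompressionCodec.class);

    Job job = Job.getInstance(conf, "map-output-tuning");
    // Local aggregation of intermediate output; a combiner is just a Reducer.
    job.setCombinerClass(IntSumReducer.class);
  }
}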
- Reducer
package org.apache.hadoop.mapreduce;
import java.io.IOException;
import org.apache.hadoop.classification.InterfaceAudience;
import org.apache.hadoop.classification.InterfaceStability;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.task.annotation.Checkpointable;
import java.util.Iterator;
/**
* Reduces a set of intermediate values which share a key to a smaller set of
* values.
*
* Reducer implementations can access the {@link Configuration} for the job
* via the {@link JobContext#getConfiguration()} method.
*
* Reducer has 3 primary phases:
*
* 1. Shuffle
*
*    The Reducer copies the sorted output from each {@link Mapper} using
*    HTTP across the network.
*
* 2. Sort
*
*    The framework merge-sorts Reducer inputs by keys (since different
*    Mappers may have output the same key).
*
*    The shuffle and sort phases occur simultaneously, i.e. while outputs
*    are being fetched they are merged.
*
*    SecondarySort
*
*    To achieve a secondary sort on the values returned by the value
*    iterator, the application should extend the key with the secondary
*    key and define a grouping comparator. The keys will be sorted using the
*    entire key, but will be grouped using the grouping comparator to decide
*    which keys and values are sent in the same call to reduce. The grouping
*    comparator is specified via {@link Job#setGroupingComparatorClass(Class)}.
*    The sort order is controlled by {@link Job#setSortComparatorClass(Class)}.
*
*    For example, say that you want to find duplicate web pages and tag them
*    all with the url of the "best" known example. You would set up the job
*    like:
*      - Map Input Key: url
*      - Map Input Value: document
*      - Map Output Key: document checksum, url pagerank
*      - Map Output Value: url
*      - Partitioner: by checksum
*      - OutputKeyComparator: by checksum and then decreasing pagerank
*      - OutputValueGroupingComparator: by checksum
*
* 3. Reduce
*
*    In this phase the {@link #reduce(Object, Iterable, Context)} method is
*    called for each <key, (collection of values)> in the sorted inputs.
*    The output of the reduce task is typically written to a
*    {@link RecordWriter} via {@link Context#write(Object, Object)}.
*
* The output of the Reducer is not re-sorted.
*
* Example:
*
*   public class IntSumReducer<Key> extends Reducer<Key, IntWritable,
*                                                   Key, IntWritable> {
*     private IntWritable result = new IntWritable();
*
*     public void reduce(Key key, Iterable<IntWritable> values,
*                        Context context) throws IOException, InterruptedException {
*       int sum = 0;
*       for (IntWritable val : values) {
*         sum += val.get();
*       }
*       result.set(sum);
*       context.write(key, result);
*     }
*   }
*
* @see Mapper
* @see Partitioner
*/
@Checkpointable
@InterfaceAudience.Public
@InterfaceStability.Stable
public class Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {
/**
* The Context passed on to the {@link Reducer} implementations.
*/
public abstract class Context
    implements ReduceContext<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {
}
/**
* Called once at the start of the task.
*/
protected void setup(Context context
) throws IOException, InterruptedException {
// NOTHING
}
/**
* This method is called once for each key. Most applications will define
* their reduce class by overriding this method. The default implementation
* is an identity function.
*/
@SuppressWarnings("unchecked")
protected void reduce(KEYIN key, Iterable<VALUEIN> values, Context context
) throws IOException, InterruptedException {
for(VALUEIN value: values) {
context.write((KEYOUT) key, (VALUEOUT) value);
}
}
/**
* Called once at the end of the task.
*/
protected void cleanup(Context context
) throws IOException, InterruptedException {
// NOTHING
}
/**
* Advanced application writers can use the
* {@link #run(org.apache.hadoop.mapreduce.Reducer.Context)} method to
* control how the reduce task works.
*/
public void run(Context context) throws IOException, InterruptedException {
setup(context);
try {
while (context.nextKey()) {
reduce(context.getCurrentKey(), context.getValues(), context);
// If a back up store is used, reset it
Iterator<VALUEIN> iter = context.getValues().iterator();
if(iter instanceof ReduceContext.ValueIterator) {
((ReduceContext.ValueIterator<VALUEIN>)iter).resetBackupStore();
}
}
} finally {
cleanup(context);
}
}
}
- Combiner: optional local aggregation of map output before the shuffle; implemented as a Reducer
- Partitioner: decides which reduce task each intermediate key is sent to (see the wiring sketch below)
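A minimal sketch of where these pieces are plugged into a job. It uses the identity Mapper/Reducer and the default HashPartitioner purely to show the wiring; real jobs substitute their own classes:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.partition.HashPartitioner;

public class WiringExample {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "wiring-demo");

    job.setMapperClass(Mapper.class);               // identity mapper
    job.setCombinerClass(Reducer.class);            // a combiner is just a Reducer
    job.setPartitionerClass(HashPartitioner.class); // key -> reduce task mapping
    job.setReducerClass(Reducer.class);             // identity reducer
    job.setNumReduceTasks(2);                       // number of partitions == number of reduce tasks
  }
}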
wordcount: word-frequency counting
- wordcount 1.0
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class WordCount {
  /**
   * The Mapper implementation processes one line at a time via its map()
   * method, with input supplied by the specified TextInputFormat.
   */
  public static class TokenizerMapper
       extends Mapper<Object, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }
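  // The notes above stop at the mapper; the rest of the class (summing reducer
  // and driver) is sketched here, following the standard WordCount v1.0 from
  // the Hadoop MapReduce tutorial.
  public static class IntSumReducer
       extends Reducer<Text, IntWritable, Text, IntWritable> {
    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values,
                       Context context) throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}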
- Mapper
The MapReduce framework spawns one map task for each InputSplit generated by the job's InputFormat.
Output pairs are collected by calling context.write(WritableComparable, Writable).
Users can control grouping by specifying a Comparator via Job.setGroupingComparatorClass(Class); it usually extends WritableComparator.
Users can control how keys are sorted before they are passed to the Reducer by specifying a Comparator via Job.setSortComparatorClass(Class); it usually extends WritableComparator.
Users can control the mapping of keys to reduces by specifying a Partitioner via Job.setPartitionerClass(Class); the number of partitions equals the number of reduce tasks.
Users can specify a combiner via Job.setCombinerClass(Class) to perform local aggregation of the map's intermediate output, cutting down the data transferred from map to reduce; the combiner extends Reducer.
- Reducer
The Reducer has 3 primary phases: shuffle, sort and reduce. The grouping comparator and partitioner below come from the secondary-sort pattern; a wiring sketch follows the KeyPartitioner.
// Grouping comparator for secondary sort: groups records by the first field of
// the composite key. (IntPair is the composite key type from the secondary-sort
// example; its definition is not included in these notes.)
public static class GroupingComparator extends WritableComparator {
  protected GroupingComparator() {
    super(IntPair.class, true);
  }

  // Compare two WritableComparables by their first field only.
  @Override
  public int compare(WritableComparable w1, WritableComparable w2) {
    IntPair ip1 = (IntPair) w1;
    IntPair ip2 = (IntPair) w2;
    int l = ip1.getFirst();
    int r = ip2.getFirst();
    return l == r ? 0 : (l < r ? -1 : 1);
  }
}
// Partitioner for secondary sort: partitions on the natural (first) key only,
// so all records with the same first key reach the same reduce task.
// (TextInt is a composite key type with a getFirstKey() accessor; its
// definition is not included in these notes.)
public class KeyPartitioner extends Partitioner<TextInt, IntWritable> {
  @Override
  public int getPartition(TextInt key, IntWritable value, int numPartitions) {
    return (key.getFirstKey().hashCode() & Integer.MAX_VALUE) % numPartitions;
  }
}
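As referenced above, a sketch of how these pieces plug into the job for a secondary sort. Note that the two snippets above happen to use different composite key types (IntPair vs. TextInt); a real job would use a single composite key, and the sort comparator class named here is only a placeholder:
// Driver fragment for the secondary-sort pattern; FirstKeySortComparator is a
// placeholder for a full-key RawComparator that these notes do not define.
Job job = Job.getInstance(new Configuration(), "secondary sort");
job.setPartitionerClass(KeyPartitioner.class);            // partition by the first (natural) key only
job.setGroupingComparatorClass(GroupingComparator.class); // group values for reduce() by the first key
job.setSortComparatorClass(FirstKeySortComparator.class); // sort by the entire composite key (placeholder)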
Run: $ bin/hadoop jar wc.jar WordCount /user/joe/wordcount/input /user/joe/wordcount/output
- -files: a comma-separated list of paths; the files are made available in the task's current working directory
- -libjars: a comma-separated list of jars added to the map and reduce classpaths
- -archives: a comma-separated list of archives; the archives are unarchived, and a link with the name of the archive is created in the task's current working directory.
The myarchive.zip below is unzipped into a directory named "myarchive.zip":
bin/hadoop jar hadoop-mapreduce-examples-.jar wordcount -files cachefile.txt -libjars mylib.jar -archives myarchive.zip input output
- WordCount 2.0
package cn.test.mapreduce;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Counter;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.hadoop.util.StringUtils;
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.net.URI;
import java.util.*;
public class WordCount2 {

  public static class TokenizerMapper
       extends Mapper<Object, Text, Text, IntWritable> {
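    // The notes stop at the class declaration; below is a condensed sketch of
    // the v2.0 mapper body from the Hadoop MapReduce tutorial: case folding,
    // skip patterns pulled from the distributed cache, and a counter of input
    // words. Details are abbreviated; see the tutorial for the full version.
    static enum CountersEnum { INPUT_WORDS }

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();
    private boolean caseSensitive;
    private Set<String> patternsToSkip = new HashSet<String>();

    @Override
    public void setup(Context context) throws IOException, InterruptedException {
      Configuration conf = context.getConfiguration();
      caseSensitive = conf.getBoolean("wordcount.case.sensitive", true);
      if (conf.getBoolean("wordcount.skip.patterns", false)) {
        for (URI patternsURI : context.getCacheFiles()) {
          // Cached files are symlinked into the task's working directory,
          // so they can be opened by their base name.
          String fileName = new Path(patternsURI.getPath()).getName();
          BufferedReader fis = new BufferedReader(new FileReader(fileName));
          String pattern;
          while ((pattern = fis.readLine()) != null) {
            patternsToSkip.add(pattern);
          }
          fis.close();
        }
      }
    }

    @Override
    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
      String line = caseSensitive ? value.toString()
                                  : value.toString().toLowerCase();
      for (String pattern : patternsToSkip) {
        line = line.replaceAll(pattern, "");
      }
      StringTokenizer itr = new StringTokenizer(line);
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
        // Enum class name = counter group name, enum member = counter name.
        Counter counter = context.getCounter(CountersEnum.class.getName(),
            CountersEnum.INPUT_WORDS.toString());
        counter.increment(1);
      }
    }
  }
  // The IntSumReducer is identical to WordCount 1.0; the driver additionally
  // uses GenericOptionsParser, job.addCacheFile(), and the
  // wordcount.case.sensitive / wordcount.skip.patterns switches (see the
  // notes and sketches below).
}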
- GenericOptionsParser
Parses the generic Hadoop command-line options, so that applications can easily specify the namenode, the ResourceManager, extra configuration resources, and so on.
Supported syntax: bin/hadoop command [genericOptions] [commandOptions]
-conf <configuration file>   add a configuration file: $ bin/hadoop dfs -conf core-site.xml -conf hdfs-site.xml -ls /data
-D <property=value>          use the given value for a property: $ bin/hadoop dfs -D fs.default.name=darwin:8020 -ls /data
-fs <local|namenode:port>    specify the namenode: $ bin/hadoop dfs -fs darwin:8020 -ls /data
-jt <local|resourcemanager>  specify the ResourceManager: $ bin/hadoop job -jt local -submit job.xml
-files    comma-separated files to distribute (see above)
-libjars  comma-separated jars to add to the task classpath (see above)
-archives comma-separated archives to unarchive on the cluster (see above)
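A minimal sketch of using GenericOptionsParser in a driver so the generic options above are consumed before the application sees its own arguments (the usage string is just an example):
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.GenericOptionsParser;

public class ParseArgsExample {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Strips -conf/-D/-fs/-jt/-files/-libjars/-archives and applies them to conf;
    // whatever is left over are the application-specific arguments.
    String[] remainingArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
    if (remainingArgs.length < 2) {
      System.err.println("Usage: wordcount <in> <out> [-skip skipPatternFile]");
      System.exit(2);
    }
  }
}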
- DistributedCache
Efficiently distributes large, read-only, application-specific files; it is used to cache files the application needs (text, archives, jars, etc.).
Files/archives can be distributed by setting the property mapreduce.job.cache.{files|archives} (comma-separated), or from the application via Job.addCacheFile(URI) / Job.addCacheArchive(URI) (the URIs refer to files on HDFS by default).
Streaming can distribute files from the command line via -cacheFile/-cacheArchive.
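A minimal driver-side sketch of adding a cache file (the HDFS path is a placeholder); tasks can then read the file locally, as in the mapper sketch above:
import java.net.URI;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

public class CacheFileExample {
  public static void main(String[] args) throws Exception {
    Job job = Job.getInstance(new Configuration(), "cache-file-demo");
    // Placeholder HDFS path; the file is shipped to every task's working directory.
    job.addCacheFile(new URI("/user/joe/wordcount/patterns.txt"));
    job.getConfiguration().setBoolean("wordcount.skip.patterns", true);
  }
}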
- Counter
The enum class name is used as the counter group name, and the enum members are the counter names.
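A brief sketch of the idiom inside a map() or reduce() method (assuming an enum such as CountersEnum { INPUT_WORDS }, as in the mapper sketch above); both calls name the same counter:
// Group name = enum class name, counter name = enum member name.
context.getCounter(CountersEnum.INPUT_WORDS).increment(1);
context.getCounter(CountersEnum.class.getName(),
                   CountersEnum.INPUT_WORDS.toString()).increment(1);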
// Delete the output directory if it already exists, so the job submission
// does not fail the output-specification check.
// (Requires org.apache.hadoop.fs.FileSystem.)
Path outputPath = new Path(args[1]);
FileSystem fileSystem = FileSystem.get(conf);
if (fileSystem.exists(outputPath)) {
  fileSystem.delete(outputPath, true);
  System.out.println("existing output directory deleted");
}
- jobhistory
Records information about completed MapReduce jobs into a specified HDFS directory; the feature is not enabled by default.
Add the following properties to mapred-site.xml:
<property>
  <name>mapreduce.jobhistory.address</name>
  <value>master:10020</value>
</property>
<property>
  <name>mapreduce.jobhistory.webapp.address</name>
  <value>master:19888</value>
</property>
<property>
  <name>mapreduce.jobhistory.done-dir</name>
  <value>/history/done</value>
</property>
<property>
  <name>mapreduce.jobhistory.intermediate-done-dir</name>
  <value>/history/done_intermediate</value>
</property>
Start it with: mr-jobhistory-daemon.sh start historyserver
jps should then show a JobHistoryServer process.
Add the following to yarn-site.xml to enable log aggregation, so task logs can be viewed:
<property>
  <name>yarn.log-aggregation-enable</name>
  <value>true</value>
</property>