CHAPTER 6:Metapatterns
**Oozie**
Job Chaining
CombineFileInputFormat takes
smaller blocks and lumps them together to make a larger input split
before being processed by the mapper.
You can also fire off multiple jobs in parallel by using **Job.submit()** instead of
**Job.waitForCompletion()**. The submit method returns immediately to the current thread and
runs the job in the background. This allows you to run several jobs at once. Use
Job.isComplete(), a nonblocking job completion check, to periodically poll to see whether all
of the jobs are complete.
Problem: Given a data set of StackOverflow posts, bin users based on whether they are below
or above the average number of posts per user. Also enrich each user with his or her
reputation from a separate data set when generating the output.
Job one mapper:
/**
 * Job one mapper: turns each post record into a (user ID, 1) pair.
 *
 * <p>Records without an {@code OwnerUserId} attribute are dropped. Every
 * emitted pair also increments the {@link #RECORDS_COUNTER_NAME} counter in
 * the {@code AVERAGE_CALC_GROUP} counter group.
 */
public static class UserIdCountMapper extends
        Mapper<Object, Text, Text, LongWritable> {

    /** Name of the counter that tracks how many records were emitted. */
    public static final String RECORDS_COUNTER_NAME = "Records";

    /** Shared constant value written for every post. */
    private static final LongWritable ONE = new LongWritable(1);

    /** Reused output key holding the current user ID. */
    private Text outkey = new Text();

    /**
     * Emits (OwnerUserId, 1) for the given post, if the post has an owner.
     *
     * @param key     input key, unused
     * @param value   one post record, parsed by MRDPUtils.transformXmlToMap
     * @param context task context used for output and counters
     */
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        String record = value.toString();
        Map<String, String> attributes = MRDPUtils.transformXmlToMap(record);

        String ownerId = attributes.get("OwnerUserId");
        if (ownerId == null) {
            // No owner on this record: nothing to count.
            return;
        }

        outkey.set(ownerId);
        context.write(outkey, ONE);
        context.getCounter(AVERAGE_CALC_GROUP, RECORDS_COUNTER_NAME)
                .increment(1);
    }
}
Job one reducer:
/**
 * Job one reducer: sums the counts received for one user ID and writes
 * (user ID, total).
 *
 * <p>Each call to {@code reduce} also increments the
 * {@link #USERS_COUNTER_NAME} counter in the {@code AVERAGE_CALC_GROUP}
 * group, since every reduce group corresponds to one distinct key.
 */
public static class UserIdSumReducer extends
        Reducer<Text, LongWritable, Text, LongWritable> {

    /** Name of the counter that tracks the number of distinct users. */
    public static final String USERS_COUNTER_NAME = "Users";

    /** Reused output value holding the summed count. */
    private LongWritable outvalue = new LongWritable();

    /**
     * Sums all values for {@code key} and writes the total.
     *
     * @param key     the user ID
     * @param values  partial counts for this user
     * @param context task context used for output and counters
     */
    public void reduce(Text key, Iterable<LongWritable> values,
            Context context) throws IOException, InterruptedException {
        // Increment user counter, as each reduce group represents one user
        context.getCounter(AVERAGE_CALC_GROUP, USERS_COUNTER_NAME).increment(1);

        // Accumulate in a long: the values are longs, and "int += long"
        // would silently narrow the running total on every iteration.
        long sum = 0L;
        for (LongWritable value : values) {
            sum += value.get();
        }

        outvalue.set(sum);
        context.write(key, outvalue);
    }
}
Job two mapper:
The setup phase accomplishes three different things. The average number of posts per user
is pulled from the Context object that was set during job configuration. The MultipleOutputs
utility is initialized as well. This is used to write the output to different bins. Finally,
the user data set is parsed from the DistributedCache to build a map of user ID to
reputation. This map is used for the desired data enrichment during output.
/**
 * Job two mapper: bins each (user ID, post count) line into a "below
 * average" or "above average" named output, appending the user's reputation
 * looked up from the files in the DistributedCache.
 *
 * <p>The average is passed through the job configuration under
 * {@link #AVERAGE_POSTS_PER_USER}.
 */
public static class UserIdBinningMapper extends
        Mapper<Object, Text, Text, Text> {

    /** Configuration key under which the average posts per user is stored. */
    public static final String AVERAGE_POSTS_PER_USER = "avg.posts.per.user";

    /**
     * Stores the average posts per user in the job's configuration.
     *
     * @param job the job being configured
     * @param avg the average to make available to the mapper
     */
    public static void setAveragePostsPerUser(Job job, double avg) {
        job.getConfiguration().set(AVERAGE_POSTS_PER_USER,
                Double.toString(avg));
    }

    /**
     * Reads back the average stored by {@link #setAveragePostsPerUser}.
     *
     * @param conf the task's configuration
     * @return the configured average posts per user
     */
    public static double getAveragePostsPerUser(Configuration conf) {
        return Double.parseDouble(conf.get(AVERAGE_POSTS_PER_USER));
    }

    private double average = 0.0;
    private MultipleOutputs<Text, Text> mos = null;
    private Text outkey = new Text(), outvalue = new Text();
    private HashMap<String, String> userIdToReputation =
            new HashMap<String, String>();

    /**
     * Loads the average from the configuration, creates the MultipleOutputs
     * helper, and builds the user ID to reputation map from every gzipped
     * file in the DistributedCache.
     */
    protected void setup(Context context) throws IOException,
            InterruptedException {
        average = getAveragePostsPerUser(context.getConfiguration());
        mos = new MultipleOutputs<Text, Text>(context);

        Path[] files = DistributedCache.getLocalCacheFiles(context
                .getConfiguration());
        if (files == null) {
            // No cache files were registered; leave the lookup map empty
            // instead of failing with a NullPointerException.
            return;
        }

        // Read all files in the DistributedCache
        for (Path p : files) {
            BufferedReader rdr = new BufferedReader(
                    new InputStreamReader(
                            new GZIPInputStream(new FileInputStream(
                                    new File(p.toString())))));
            try {
                String line;
                // For each record in the user file
                while ((line = rdr.readLine()) != null) {
                    // Get the user ID and reputation
                    Map<String, String> parsed = MRDPUtils
                            .transformXmlToMap(line);
                    // Map the user ID to the reputation
                    userIdToReputation.put(parsed.get("Id"),
                            parsed.get("Reputation"));
                }
            } finally {
                // Always release the file handle, even if parsing fails.
                rdr.close();
            }
        }
    }

    /**
     * Splits a tab-separated "userId, postCount" line and writes
     * (userId, "postCount TAB reputation") to the below- or above-average
     * named output. A user with no entry in the reputation map gets the
     * literal text "null" as reputation.
     */
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        String[] tokens = value.toString().split("\t");
        String userId = tokens[0];
        // The count is parsed as a long so that values beyond the int range
        // do not fail with a NumberFormatException.
        long posts = Long.parseLong(tokens[1]);

        outkey.set(userId);
        outvalue.set(posts + "\t" + userIdToReputation.get(userId));

        if ((double) posts < average) {
            mos.write(MULTIPLE_OUTPUTS_BELOW_NAME, outkey, outvalue,
                    MULTIPLE_OUTPUTS_BELOW_NAME + "/part");
        } else {
            mos.write(MULTIPLE_OUTPUTS_ABOVE_NAME, outkey, outvalue,
                    MULTIPLE_OUTPUTS_ABOVE_NAME + "/part");
        }
    }

    /** Closes the MultipleOutputs helper so all named outputs are flushed. */
    protected void cleanup(Context context) throws IOException,
            InterruptedException {
        mos.close();
    }
}
Driver Code
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Path postInput = new Path(args[0]);
Path userInput = new Path(args[1]);
Path outputDirIntermediate = new Path(args[2] + "_int");
Path outputDir = new Path(args[2]);
// Setup first job to counter user posts
Job countingJob = new Job(, "JobChaining-Counting");
countingJob.setJarByClass(JobChainingDriver.class);
// Set our mapper and reducer, we can use the API's long sum reducer for
// a combiner!
countingJob.setMapperClass(UserIdCountMapper.class);
countingJob.setCombinerClass(LongSumReducer.class);
countingJob.setReducerClass(UserIdSumReducer.class);
countingJob.setOutputKeyClass(Text.class);
countingJob.setOutputValueClass(LongWritable.class);
countingJob.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(countingJob, postInput);
countingJob.setOutputFormatClass(TextOutputFormat.class);
TextOutputFormat.setOutputPath(countingJob, outputDirIntermediate);
// Execute job and grab exit code
int code = countingJob.waitForCompletion(true) ? 0 : 1;
if (code == 0) {
// Calculate the average posts per user by getting counter values
double numRecords = (double) countingJob
.getCounters()
.findCounter(AVERAGE_CALC_GROUP,
UserIdCountMapper.RECORDS_COUNTER_NAME).getValue();
double numUsers = (double) countingJob
.getCounters()
.findCounter(AVERAGE_CALC_GROUP,
UserIdSumReducer.USERS_COUNTER_NAME).getValue();
double averagePostsPerUser = numRecords / numUsers;
// Setup binning job
Job binningJob = new Job(new Configuration(), "JobChaining-Binning");
binningJob.setJarByClass(JobChainingDriver.class);
// Set mapper and the average posts per user
binningJob.setMapperClass(UserIdBinningMapper.class);
UserIdBinningMapper.setAveragePostsPerUser(binningJob,
averagePostsPerUser);
binningJob.setNumReduceTasks(0);
binningJob.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(binningJob, outputDirIntermediate);
// Add two named outputs for below/above average
MultipleOutputs.addNamedOutput(binningJob,
MULTIPLE_OUTPUTS_BELOW_NAME, TextOutputFormat.class,
Text.class, Text.class);
MultipleOutputs.addNamedOutput(binningJob,
MULTIPLE_OUTPUTS_ABOVE_NAME, TextOutputFormat.class,
Text.class, Text.class);
MultipleOutputs.setCountersEnabled(binningJob, true);
TextOutputFormat.setOutputPath(binningJob, outputDir);
// Add the user files to the DistributedCache
FileStatus[] userFiles = FileSystem.get(conf).listStatus(userInput);
for (FileStatus status : userFiles) {
DistributedCache.addCacheFile(status.getPath().toUri(),
binningJob.getConfiguration());
}
// Execute job and grab exit code
code = binningJob.waitForCompletion(true) ? 0 : 1;
}
// Clean up the intermediate output
FileSystem.get(conf).delete(outputDirIntermediate, true);
System.exit(code);
}
*Parallel job chaining*
Problem: Given the previous example’s output of binned users, run parallel jobs over
both bins to calculate the average reputation of each user.
MapCode
/**
 * Emits the reputation found in the third tab-separated column of every
 * input line under one shared key, so that all reputations end up in a
 * single reduce group.
 */
public static class AverageReputationMapper extends
        Mapper<LongWritable, Text, Text, DoubleWritable> {

    /** The single key every reputation is grouped under. */
    private static final Text GROUP_ALL_KEY = new Text("Average Reputation:");

    /** Reused output value holding the parsed reputation. */
    private DoubleWritable outvalue = new DoubleWritable();

    /**
     * @param key     input key, unused
     * @param value   a tab-separated line whose third column is a number
     * @param context task context used for output
     */
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString();
        String[] columns = line.split("\t");

        // Third column carries the reputation.
        String reputationColumn = columns[2];
        outvalue.set(Double.parseDouble(reputationColumn));

        context.write(GROUP_ALL_KEY, outvalue);
    }
}
Reduce Code
/**
 * Writes, for each key, the arithmetic mean of all values received for it.
 */
public static class AverageReputationReducer extends
        Reducer<Text, DoubleWritable, Text, DoubleWritable> {

    /** Reused output value holding the computed mean. */
    private DoubleWritable outvalue = new DoubleWritable();

    /**
     * @param key     the grouping key, written through unchanged
     * @param values  the values to average
     * @param context task context used for output
     */
    protected void reduce(Text key, Iterable<DoubleWritable> values,
            Context context) throws IOException, InterruptedException {
        double total = 0.0;
        double seen = 0;

        Iterator<DoubleWritable> it = values.iterator();
        while (it.hasNext()) {
            total += it.next().get();
            ++seen;
        }

        double mean = total / seen;
        outvalue.set(mean);
        context.write(key, outvalue);
    }
}
Driver code:
/**
 * Parallel-jobs driver: submits one averaging job per bin, polls until both
 * have completed, reports each job's outcome, and exits with 0 only if both
 * succeeded.
 *
 * @param args args[0] = below-average input, args[1] = above-average input,
 *             args[2] = below-average output, args[3] = above-average output
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    Path belowAvgInputDir = new Path(args[0]);
    Path aboveAvgInputDir = new Path(args[1]);
    Path belowAvgOutputDir = new Path(args[2]);
    Path aboveAvgOutputDir = new Path(args[3]);

    // Both jobs are launched before either is waited on.
    Job belowAvgJob = submitJob(conf, belowAvgInputDir, belowAvgOutputDir);
    Job aboveAvgJob = submitJob(conf, aboveAvgInputDir, aboveAvgOutputDir);

    // Sleep until both jobs report completion
    while (!(belowAvgJob.isComplete() && aboveAvgJob.isComplete())) {
        Thread.sleep(5000);
    }

    String belowMessage = belowAvgJob.isSuccessful()
            ? "Below average job completed successfully!"
            : "Below average job failed!";
    System.out.println(belowMessage);

    String aboveMessage = aboveAvgJob.isSuccessful()
            ? "Above average job completed successfully!"
            : "Above average job failed!";
    System.out.println(aboveMessage);

    int exitCode;
    if (belowAvgJob.isSuccessful() && aboveAvgJob.isSuccessful()) {
        exitCode = 0;
    } else {
        exitCode = 1;
    }
    System.exit(exitCode);
}
/**
 * Configures an averaging job over {@code inputDir} and submits it without
 * waiting for it to finish.
 *
 * @param conf      base configuration for the job
 * @param inputDir  directory read with TextInputFormat
 * @param outputDir directory written with TextOutputFormat
 * @return the submitted job, for the caller to poll
 */
private static Job submitJob(Configuration conf, Path inputDir,
        Path outputDir) throws Exception {
    Job averagingJob = new Job(conf, "ParallelJobs");
    averagingJob.setJarByClass(ParallelJobs.class);

    // Input side
    averagingJob.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(averagingJob, inputDir);

    // Processing
    averagingJob.setMapperClass(AverageReputationMapper.class);
    averagingJob.setReducerClass(AverageReputationReducer.class);

    // Output side
    averagingJob.setOutputKeyClass(Text.class);
    averagingJob.setOutputValueClass(DoubleWritable.class);
    averagingJob.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(averagingJob, outputDir);

    // Fire and return; the caller checks for completion.
    averagingJob.submit();
    return averagingJob;
}
*With Shell Scripting*
Wrapping any Hadoop MapReduce job in a script, whether it be a single
Java MapReduce job, a Pig job, or whatever, has a number of benefits.
This includes post-processing, data flows, data preparation, additional
logging, and more.
The script is broken into two pieces: setting variables to actually execute
the jobs, and then executing them.
#!/bin/bash
# Part one: define every value and command line used by the execution part.

# Jar and driver classes
JAR_FILE="mrdp.jar"
JOB_CHAIN_CLASS="mrdp.ch6.JobChainingDriver"
PARALLEL_JOB_CLASS="mrdp.ch6.ParallelJobs"

# Location of the hadoop launcher found on the PATH
HADOOP="$(which hadoop)"

# Inputs of the job-chaining driver
POST_INPUT="posts"
USER_INPUT="users"

# Output of the job-chaining driver; its two bins feed the parallel jobs
JOBCHAIN_OUTDIR="jobchainout"
BELOW_AVG_INPUT="${JOBCHAIN_OUTDIR}/belowavg"
ABOVE_AVG_INPUT="${JOBCHAIN_OUTDIR}/aboveavg"

# Outputs of the two parallel averaging jobs
BELOW_AVG_REP_OUTPUT="belowavgrep"
ABOVE_AVG_REP_OUTPUT="aboveavgrep"

# Command lines: the chaining driver, then the parallel driver
JOB_1_CMD="${HADOOP} jar ${JAR_FILE} ${JOB_CHAIN_CLASS} ${POST_INPUT} ${USER_INPUT} ${JOBCHAIN_OUTDIR}"
JOB_2_CMD="${HADOOP} jar ${JAR_FILE} ${PARALLEL_JOB_CLASS} ${BELOW_AVG_INPUT} ${ABOVE_AVG_INPUT} ${BELOW_AVG_REP_OUTPUT} ${ABOVE_AVG_REP_OUTPUT}"

# Commands that print the result of each averaging job
CAT_BELOW_OUTPUT_CMD="${HADOOP} fs -cat ${BELOW_AVG_REP_OUTPUT}/part-*"
CAT_ABOVE_OUTPUT_CMD="${HADOOP} fs -cat ${ABOVE_AVG_REP_OUTPUT}/part-*"

# Command that removes every output directory created by the two drivers
RMR_CMD="${HADOOP} fs -rmr ${JOBCHAIN_OUTDIR} ${BELOW_AVG_REP_OUTPUT} ${ABOVE_AVG_REP_OUTPUT}"

# Log file name carries the current epoch seconds
LOG_FILE="avgrep_$(date +%s).txt"
The next part of the script echoes each command prior to running it. It executes the first
job, and then checks the return code to see whether it failed. If it did, output is deleted
and the script exits. Upon success, the second job is executed and the same error condition
is checked. If the second job completes successfully, the output of each job is
written to the log file and all the output is deleted. All the extra output is not required,
and since the final output of each file consists of only one line, storing it in the log file
is worthwhile, instead of keeping it in HDFS.
# Part two: run both drivers in order, logging every command and its output.
# Each driver's exit status is saved right after it runs, because the cleanup
# command that follows overwrites $?; a failed job must not turn into a
# successful script exit just because the cleanup itself succeeded.
{
    echo ${JOB_1_CMD}
    ${JOB_1_CMD}
    JOB_1_STATUS=$?

    # The first job failed: clean up and exit with the job's status
    if [ ${JOB_1_STATUS} -ne 0 ]
    then
        echo "First job failed!"
        echo ${RMR_CMD}
        ${RMR_CMD}
        exit ${JOB_1_STATUS}
    fi

    echo ${JOB_2_CMD}
    ${JOB_2_CMD}
    JOB_2_STATUS=$?

    # The second job failed: clean up and exit with the job's status
    if [ ${JOB_2_STATUS} -ne 0 ]
    then
        echo "Second job failed!"
        echo ${RMR_CMD}
        ${RMR_CMD}
        exit ${JOB_2_STATUS}
    fi

    # Display the second job's results
    echo ${CAT_BELOW_OUTPUT_CMD}
    ${CAT_BELOW_OUTPUT_CMD}
    echo ${CAT_ABOVE_OUTPUT_CMD}
    ${CAT_ABOVE_OUTPUT_CMD}

    # Remove the temporary dirs
    echo ${RMR_CMD}
    ${RMR_CMD}
    exit 0
} &> ${LOG_FILE} # redirect stdout and stderr to the log file
----------
execute the script in cmd
/home/mrdp/hadoop/bin/hadoop jar mrdp.jar mrdp.ch6.JobChainingDriver posts \
users jobchainout
**The jobchainout is on HDFS?**
*With JobControl*
/** Counter group under which the "Records" and "Users" counters are kept. */
public static final String AVERAGE_CALC_GROUP = "AverageCalculation";
/** Named output (and output subdirectory) for users at or above the average. */
public static final String MULTIPLE_OUTPUTS_ABOVE_NAME = "aboveavg";
/** Named output (and output subdirectory) for users below the average. */
public static final String MULTIPLE_OUTPUTS_BELOW_NAME = "belowavg";
/**
 * JobControl driver: runs the counting job synchronously, then hands the
 * binning job and the two averaging jobs (which depend on it) to a
 * JobControl, waits for all of them, and deletes the intermediate output.
 *
 * @param args args[0] = post input, args[1] = user input,
 *             args[2] = below-average reputation output,
 *             args[3] = above-average reputation output (also used as the
 *             prefix for the "_count" and "_bins" intermediate directories)
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path postInput = new Path(args[0]);
    Path userInput = new Path(args[1]);
    Path countingOutput = new Path(args[3] + "_count");
    Path binningOutputRoot = new Path(args[3] + "_bins");
    Path binningOutputBelow = new Path(binningOutputRoot + "/"
            + JobChainingDriver.MULTIPLE_OUTPUTS_BELOW_NAME);
    Path binningOutputAbove = new Path(binningOutputRoot + "/"
            + JobChainingDriver.MULTIPLE_OUTPUTS_ABOVE_NAME);
    Path belowAverageRepOutput = new Path(args[2]);
    Path aboveAverageRepOutput = new Path(args[3]);

    Job countingJob = getCountingJob(conf, postInput, countingOutput);
    int code = 1;

    // The counting job must finish first: the binning job's configuration
    // is derived from its counters.
    if (countingJob.waitForCompletion(true)) {
        ControlledJob binningControlledJob = new ControlledJob(
                getBinningJobConf(countingJob, conf, countingOutput,
                        userInput, binningOutputRoot));

        ControlledJob belowAvgControlledJob = new ControlledJob(
                getAverageJobConf(conf, binningOutputBelow,
                        belowAverageRepOutput));
        belowAvgControlledJob.addDependingJob(binningControlledJob);

        ControlledJob aboveAvgControlledJob = new ControlledJob(
                getAverageJobConf(conf, binningOutputAbove,
                        aboveAverageRepOutput));
        aboveAvgControlledJob.addDependingJob(binningControlledJob);

        JobControl jc = new JobControl("AverageReputation");
        jc.addJob(binningControlledJob);
        jc.addJob(belowAvgControlledJob);
        jc.addJob(aboveAvgControlledJob);

        // JobControl.run() keeps looping until stop() is called, so calling
        // it directly would block this thread forever. Run it on its own
        // thread, poll until every job has finished, then stop it.
        Thread controlThread = new Thread(jc, "AverageReputation-JobControl");
        controlThread.setDaemon(true);
        controlThread.start();

        while (!jc.allFinished()) {
            Thread.sleep(500);
        }

        code = jc.getFailedJobList().isEmpty() ? 0 : 1;
        jc.stop();
    }

    // Remove the intermediate directories regardless of the outcome
    FileSystem fs = FileSystem.get(conf);
    fs.delete(countingOutput, true);
    fs.delete(binningOutputRoot, true);

    System.exit(code);
}
/**
 * Builds (but does not run) the job that counts posts per user.
 *
 * @param conf                  base configuration for the job
 * @param postInput             post data, read with TextInputFormat
 * @param outputDirIntermediate where the (user ID, count) pairs are written
 * @return the configured, not yet submitted job
 */
public static Job getCountingJob(Configuration conf, Path postInput,
        Path outputDirIntermediate) throws IOException {
    Job job = new Job(conf, "JobChaining-Counting");
    job.setJarByClass(JobChainingDriver.class);

    // Input and output formats with their paths
    job.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(job, postInput);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, outputDirIntermediate);

    // Mapper, reducer, and the library's long-sum reducer as combiner
    job.setMapperClass(UserIdCountMapper.class);
    job.setCombinerClass(LongSumReducer.class);
    job.setReducerClass(UserIdSumReducer.class);

    // Output record types
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);

    return job;
}
/**
 * Builds the configuration of the map-only binning job.
 *
 * <p>The average posts per user is computed from the finished counting
 * job's "Records" and "Users" counters and stored in the configuration for
 * the binning mapper. All files under {@code userInput} are added to the
 * DistributedCache.
 *
 * @param countingJob    the completed counting job whose counters are read
 * @param conf           base configuration for the binning job
 * @param jobchainOutdir output of the counting job, used as input here
 * @param userInput      directory holding the user data files
 * @param binningOutput  root output directory of the binning job
 * @return the binning job's configuration, ready to wrap in a ControlledJob
 */
public static Configuration getBinningJobConf(Job countingJob,
        Configuration conf, Path jobchainOutdir, Path userInput,
        Path binningOutput) throws IOException {
    // Calculate the average posts per user by getting counter values
    double numRecords = (double) countingJob
            .getCounters()
            .findCounter(JobChainingDriver.AVERAGE_CALC_GROUP,
                    UserIdCountMapper.RECORDS_COUNTER_NAME).getValue();
    double numUsers = (double) countingJob
            .getCounters()
            .findCounter(JobChainingDriver.AVERAGE_CALC_GROUP,
                    UserIdSumReducer.USERS_COUNTER_NAME).getValue();
    double averagePostsPerUser = numRecords / numUsers;

    // Setup binning job
    Job binningJob = new Job(conf, "JobChaining-Binning");
    binningJob.setJarByClass(JobChainingDriver.class);

    // Set mapper and the average posts per user
    binningJob.setMapperClass(UserIdBinningMapper.class);
    UserIdBinningMapper.setAveragePostsPerUser(binningJob,
            averagePostsPerUser);

    // Map-only job
    binningJob.setNumReduceTasks(0);

    binningJob.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(binningJob, jobchainOutdir);

    // Add two named outputs for below/above average
    MultipleOutputs.addNamedOutput(binningJob,
            JobChainingDriver.MULTIPLE_OUTPUTS_BELOW_NAME,
            TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.addNamedOutput(binningJob,
            JobChainingDriver.MULTIPLE_OUTPUTS_ABOVE_NAME,
            TextOutputFormat.class, Text.class, Text.class);
    MultipleOutputs.setCountersEnabled(binningJob, true);

    // The bins are written beneath this root directory
    TextOutputFormat.setOutputPath(binningJob, binningOutput);

    // Add the user files to the DistributedCache
    FileStatus[] userFiles = FileSystem.get(conf).listStatus(userInput);
    for (FileStatus status : userFiles) {
        DistributedCache.addCacheFile(status.getPath().toUri(),
                binningJob.getConfiguration());
    }

    // Hand back the configuration only; the job is not run here
    return binningJob.getConfiguration();
}
/**
 * Builds the configuration of one averaging job; the job itself is not run.
 *
 * @param conf             base configuration for the job
 * @param averageOutputDir directory the averaging job reads from
 * @param outputDir        directory the averaging job writes to
 * @return the job's configuration, ready to wrap in a ControlledJob
 */
public static Configuration getAverageJobConf(Configuration conf,
        Path averageOutputDir, Path outputDir) throws IOException {
    Job job = new Job(conf, "ParallelJobs");
    job.setJarByClass(ParallelJobs.class);

    // Input side
    job.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(job, averageOutputDir);

    // Processing
    job.setMapperClass(AverageReputationMapper.class);
    job.setReducerClass(AverageReputationReducer.class);

    // Output side
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(DoubleWritable.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    TextOutputFormat.setOutputPath(job, outputDir);

    Configuration jobConf = job.getConfiguration();
    return jobConf;
}
Chain Folding
The most expensive parts of a MapReduce job are
typically pushing data through the pipeline: loading the data, the
shuffle/sort, and storing the data.
The ChainMapper and ChainReducer Approach
Each chained map phase feeds into the next in the pipeline. The output of the first is then processed by the second, which is then processed by the third, and so on. The map phases on the backend of the reducer take the output of the reducer and do additional computation. This is useful for post-processing operations or additional filtering.
Problem: Given a set of user posts and user information, bin users based on whether their reputation is below or above 5,000.
Parsing mapper code. This mapper implementation gets the user ID from the input post record and outputs it with a count of 1
/**
 * Parsing mapper (old API): emits (OwnerUserId, 1) for every post record
 * that has an {@code OwnerUserId} attribute.
 */
public static class UserIdCountMapper extends MapReduceBase implements
        Mapper<Object, Text, Text, LongWritable> {

    public static final String RECORDS_COUNTER_NAME = "Records";

    /** Shared constant value written for every post. */
    private static final LongWritable ONE = new LongWritable(1);

    /** Reused output key holding the current user ID. */
    private Text outkey = new Text();

    /**
     * @param key      input key, unused
     * @param value    one post record, parsed by MRDPUtils.transformXmlToMap
     * @param output   collector receiving (user ID, 1)
     * @param reporter unused
     */
    public void map(Object key, Text value,
            OutputCollector<Text, LongWritable> output, Reporter reporter)
            throws IOException {
        Map<String, String> parsed = MRDPUtils.transformXmlToMap(value
                .toString());

        // Get the value for the OwnerUserId attribute
        String userId = parsed.get("OwnerUserId");

        // Skip records without an owner: setting a null string on the
        // output key would fail the whole task.
        if (userId != null) {
            outkey.set(userId);
            output.collect(outkey, ONE);
        }
    }
}
Replicated join mapper code.
public static class UserIdReputationEnrichmentMapper extends MapReduceBase
implements Mapper<Text, LongWritable, Text, LongWritable> {
private Text outkey = new Text();
private HashMap<String, String> userIdToReputation =
new HashMap<String, String>();
public void configure(JobConf job) {
Path[] files = DistributedCache.getLocalCacheFiles(job);
// Read all files in the DistributedCache
for (Path p : files) {
BufferedReader rdr = new BufferedReader(
new InputStreamReader(
new GZIPInputStream(new FileInputStream(
new File(p.toString())))));
String line;
// For each record in the user file
while ((line = rdr.readLine()) != null) {
// Get the user ID and reputation
Map<String, String> parsed = MRDPUtils
.transformXmlToMap(line);
// Map the user ID to the reputation
userIdToReputation.put(parsed.get("Id",
parsed.get("Reputation"));
}
}
}
public void map(Text key, LongWritable value,
OutputCollector<Text, LongWritable> output, Reporter reporter)
throws IOException {
String reputation = userIdToReputation.get(key.toString());
if (reputation != null) {
outkey.set(value.get() + "\t" + reputation);
output.collect(outkey, value);
}
}
}
ChainMapper is first used to add the two map implementations that will be called back to back before any sorting and shuffling occurs. Then, the ChainReducer static methods are used to set the reducer implementation, and then finally a mapper on the end. Note that you don’t use ChainMapper to add a mapper after a reducer: use ChainReducer.
public static void main(String[] args) throws Exception {
JobConf conf = new JobConf("ChainMapperReducer");
conf.setJarByClass(ChainMapperDriver.class);
Path postInput = new Path(args[0]);
Path userInput = new Path(args[1]);
Path outputDir = new Path(args[2]);
ChainMapper.addMapper(conf, UserIdCountMapper.class,
LongWritable.class, Text.class, Text.class, LongWritable.class,
false, new JobConf(false));
ChainMapper.addMapper(conf, UserIdReputationEnrichmentMapper.class,
Text.class, LongWritable.class, Text.class, LongWritable.class,
false, new JobConf(false));
ChainReducer.setReducer(conf, LongSumReducer.class, Text.class,
LongWritable.class, Text.class, LongWritable.class, false,
new JobConf(false));
ChainReducer.addMapper(conf, UserIdBinningMapper.class, Text.class,
LongWritable.class, Text.class, LongWritable.class, false,
new JobConf(false));
conf.setCombinerClass(LongSumReducer.class);
conf.setInputFormat(TextInputFormat.class);
TextInputFormat.setInputPaths(conf, postInput);
// Configure multiple outputs
conf.setOutputFormat(NullOutputFormat.class);
FileOutputFormat.setOutputPath(conf, outputDir);
MultipleOutputs.addNamedOutput(conf, MULTIPLE_OUTPUTS_ABOVE_5000,
TextOutputFormat.class, Text.class, LongWritable.class);
MultipleOutputs.addNamedOutput(conf, MULTIPLE_OUTPUTS_BELOW_5000,
conf.setOutputKeyClass(Text.class);
conf.setOutputValueClass(LongWritable.class);
// Add the user files to the DistributedCache
FileStatus[] userFiles = FileSystem.get(conf).listStatus(userInput);
for (FileStatus status : userFiles) {
DistributedCache.addCacheFile(status.getPath().toUri(), conf);
}
RunningJob job = JobClient.runJob(conf);
while (!job.isComplete()) {
Thread.sleep(5000);
}
System.exit(job.isSuccessful() ? 0 : 1);
}
Job Merging
Problem: Given a set of comments, generate an anonymized version of the data and a distinct set of user IDs.
/**
 * A key made of a string tag plus a Text payload, ordered by tag first and
 * then by text.
 *
 * <p>{@link #equals(Object)} and {@link #hashCode()} are defined on the
 * content (tag and text) so that two instances holding the same data are
 * equal and hash alike. Without them the class would inherit identity-based
 * hashing, and hash-based partitioning could send equal keys to different
 * reducers.
 */
public static class TaggedText implements WritableComparable<TaggedText> {

    private String tag = "";
    private Text text = new Text();

    /** Creates an instance with an empty tag and empty text. */
    public TaggedText() { }

    public void setTag(String tag) {
        this.tag = tag;
    }

    public String getTag() {
        return tag;
    }

    /** Copies the content of {@code text} into this key's own Text. */
    public void setText(Text text) {
        this.text.set(text);
    }

    /** Stores {@code text} in this key's own Text. */
    public void setText(String text) {
        this.text.set(text);
    }

    public Text getText() {
        return text;
    }

    /** Reads the tag, then the text, in the order written by write(). */
    public void readFields(DataInput in) throws IOException {
        tag = in.readUTF();
        text.readFields(in);
    }

    /** Writes the tag, then the text. */
    public void write(DataOutput out) throws IOException {
        out.writeUTF(tag);
        text.write(out);
    }

    /** Orders by tag; ties are broken by the text. */
    public int compareTo(TaggedText obj) {
        int compare = tag.compareTo(obj.getTag());
        if (compare == 0) {
            return text.compareTo(obj.getText());
        } else {
            return compare;
        }
    }

    /** Two keys are equal when both their tags and their texts are equal. */
    @Override
    public boolean equals(Object other) {
        if (this == other) {
            return true;
        }
        if (!(other instanceof TaggedText)) {
            return false;
        }
        TaggedText that = (TaggedText) other;
        return tag.equals(that.tag) && text.equals(that.text);
    }

    /** Content-based hash, consistent with equals(). */
    @Override
    public int hashCode() {
        return 31 * tag.hashCode() + text.hashCode();
    }

    /** Returns "tag:text". */
    public String toString() {
        return tag + ":" + text.toString();
    }
}
Merged Mapper Code:
Each helper map method parses the input record, but this parsing should instead be done once inside the actual map method. The resulting Map<String,String> can then be passed to both helper methods. Any little optimizations like this can be very beneficial in the long run and should be implemented.
/**
 * Merged mapper that performs two logical map functions on every record:
 * it writes an anonymized copy of the record under an "A"-tagged random
 * key, and the record's user ID under a "D"-tagged key.
 *
 * <p>The record is parsed once in {@code map} and the parsed attributes are
 * shared by both helpers.
 */
public static class AnonymizeDistinctMergedMapper extends
        Mapper<Object, Text, TaggedText, Text> {

    /** Empty value written alongside every "D"-tagged key. */
    private static final Text DISTINCT_OUT_VALUE = new Text();

    private Random rndm = new Random();
    private TaggedText anonymizeOutkey = new TaggedText(),
            distinctOutkey = new TaggedText();
    private Text anonymizeOutvalue = new Text();

    /** Parses the record once and runs both logical mappers on it. */
    public void map(Object key, Text value, Context context)
            throws IOException, InterruptedException {
        Map<String, String> parsed = MRDPUtils.transformXmlToMap(value
                .toString());
        anonymizeMap(parsed, context);
        distinctMap(parsed, context);
    }

    /**
     * Rebuilds the record without the "UserId" and "Id" attributes and with
     * the time portion of "CreationDate" removed, then writes it under an
     * "A"-tagged key whose text is a random integer. Records that parse to
     * no attributes are skipped.
     */
    private void anonymizeMap(Map<String, String> parsed, Context context)
            throws IOException, InterruptedException {
        if (parsed.size() > 0) {
            StringBuilder bldr = new StringBuilder();
            bldr.append("<row ");
            for (Entry<String, String> entry : parsed.entrySet()) {
                if (entry.getKey().equals("UserId")
                        || entry.getKey().equals("Id")) {
                    // ignore these fields
                } else if (entry.getKey().equals("CreationDate")) {
                    // Strip out the time, anything after the 'T' in the
                    // value. A value without a 'T' has no time portion and
                    // is kept whole rather than failing in substring().
                    String date = entry.getValue();
                    int timeStart = date.indexOf('T');
                    if (timeStart >= 0) {
                        date = date.substring(0, timeStart);
                    }
                    bldr.append(entry.getKey() + "=\"" + date + "\" ");
                } else {
                    // Otherwise, output this.
                    bldr.append(entry.getKey() + "=\""
                            + entry.getValue() + "\" ");
                }
            }
            bldr.append(">");

            anonymizeOutkey.setTag("A");
            anonymizeOutkey.setText(Integer.toString(rndm.nextInt()));
            anonymizeOutvalue.set(bldr.toString());
            context.write(anonymizeOutkey, anonymizeOutvalue);
        }
    }

    /**
     * Writes the record's user ID under a "D"-tagged key with an empty
     * value. Records without a "UserId" attribute are skipped.
     */
    private void distinctMap(Map<String, String> parsed, Context context)
            throws IOException, InterruptedException {
        String userId = parsed.get("UserId");
        if (userId == null) {
            // Nothing to emit; a null string cannot be stored in the key.
            return;
        }

        // Set our output key to the user's id, tagged with a "D"
        distinctOutkey.setTag("D");
        distinctOutkey.setText(userId);

        // Write the user's id with an empty value
        context.write(distinctOutkey, DISTINCT_OUT_VALUE);
    }
}
Merged reducer code. The reducer’s calls to setup and cleanup handle the creation and closing of the MultipleOutputs utility.
/**
 * Merged reducer: routes each reduce group by its key's tag. Groups tagged
 * "A" have every value written to the anonymize named output; all other
 * groups have their key text written once to the distinct named output.
 */
public static class AnonymizeDistinctMergedReducer extends
        Reducer<TaggedText, Text, Text, NullWritable> {

    /** Writer for the two named outputs; created in setup, closed in cleanup. */
    private MultipleOutputs<Text, NullWritable> mos = null;

    /** Creates the MultipleOutputs helper for this task. */
    protected void setup(Context context) throws IOException,
            InterruptedException {
        mos = new MultipleOutputs<Text, NullWritable>(context);
    }

    /** Dispatches on the tag of {@code key}. */
    protected void reduce(TaggedText key, Iterable<Text> values,
            Context context) throws IOException, InterruptedException {
        boolean anonymizeGroup = key.getTag().equals("A");

        if (!anonymizeGroup) {
            // Distinct: one line per group, carrying the key text only.
            Text userId = key.getText();
            mos.write(MULTIPLE_OUTPUTS_DISTINCT, userId, NullWritable.get(),
                    MULTIPLE_OUTPUTS_DISTINCT + "/part");
            return;
        }

        // Anonymize: one line per value; the random key text is discarded.
        Text ignoredKeyText = key.getText();
        for (Text record : values) {
            mos.write(MULTIPLE_OUTPUTS_ANONYMIZE, record,
                    NullWritable.get(), MULTIPLE_OUTPUTS_ANONYMIZE + "/part");
        }
    }

    /** Closes the MultipleOutputs helper. */
    protected void cleanup(Context context) throws IOException,
            InterruptedException {
        mos.close();
    }
}
Driver code.
/**
 * Merged-job driver: runs the combined anonymize/distinct job with ten
 * reduce tasks and exits with 0 on success, 1 on failure.
 *
 * @param args args[0] = input path, args[1] = output path
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job mergedJob = new Job(conf, "MergedJob");
    mergedJob.setJarByClass(MergedJobDriver.class);

    // Merged mapper and reducer
    mergedJob.setMapperClass(AnonymizeDistinctMergedMapper.class);
    mergedJob.setReducerClass(AnonymizeDistinctMergedReducer.class);
    mergedJob.setNumReduceTasks(10);

    // Input and output locations
    Path inputPath = new Path(args[0]);
    TextInputFormat.setInputPaths(mergedJob, inputPath);
    Path outputPath = new Path(args[1]);
    TextOutputFormat.setOutputPath(mergedJob, outputPath);

    // One named output per logical job
    MultipleOutputs.addNamedOutput(mergedJob, MULTIPLE_OUTPUTS_ANONYMIZE,
            TextOutputFormat.class, Text.class, NullWritable.class);
    MultipleOutputs.addNamedOutput(mergedJob, MULTIPLE_OUTPUTS_DISTINCT,
            TextOutputFormat.class, Text.class, NullWritable.class);

    // Record types emitted by the mapper
    mergedJob.setOutputKeyClass(TaggedText.class);
    mergedJob.setOutputValueClass(Text.class);

    boolean succeeded = mergedJob.waitForCompletion(true);
    System.exit(succeeded ? 0 : 1);
}
Note: setOutputKeyClass sets the key type for both the map output and the reduce output.