CHAPTER 7: Input and Output Patterns
// Driver entry point for the random data generation job.
// Positional arguments: args[0] number of map tasks, args[1] number of
// records per task, args[2] path to the word list, args[3] output directory.
// NOTE(review): this listing looks truncated -- the closing brace is not
// visible, and numRecordsPerTask is parsed but never handed to
// setNumRecordPerTask in the lines shown; confirm against the full source.
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
// Integer.parseInt throws NumberFormatException on non-numeric input.
int numMapTasks = Integer.parseInt(args[0]);
int numRecordsPerTask = Integer.parseInt(args[1]);
Path wordList = new Path(args[2]);
Path outputDir = new Path(args[3]);
Job job = new Job(conf, "RandomDataGenerationDriver");
// Record the number of splits in the job configuration and add the word
// list file to the DistributedCache.
RandomStackOverflowInputFormat.setNumMapTasks(job, numMapTasks);
RandomStackOverflowInputFormat.setRandomWordList(job, wordList);
TextOutputFormat.setOutputPath(job, outputDir);
// Wait for the job to finish; exit status 0 on success, 2 on failure.
System.exit(job.waitForCompletion(true) ? 0 : 2);
// Input split that carries no state of its own: it reports a length of 0
// and an empty list of locations.
// NOTE(review): the bodies of readFields and write, and all closing braces,
// are not visible in this listing; presumably both methods are empty because
// there are no fields to serialize -- confirm against the full source.
public static class FakeInputSplit extends InputSplit implements
Writable {
public void readFields(DataInput arg0) throws IOException {
public void write(DataOutput arg0) throws IOException {
// Always reports zero bytes.
public long getLength() throws IOException, InterruptedException {
return 0;
// Always reports no preferred hosts.
public String[] getLocations() throws IOException,
InterruptedException {
return new String[0];
// InputFormat that produces a configurable number of FakeInputSplit objects
// and hands each one to a RandomStackOverflowRecordReader.
// NOTE(review): closing braces are not visible anywhere in this listing.
public static class RandomStackOverflowInputFormat extends
InputFormat {
// Configuration keys used by the static setters below.
// NOTE(review): the first key is an empty string and the next two
// declarations have no visible right-hand side -- the literals were
// probably lost in extraction; restore them from the full source.
public static final String NUM_MAP_TASKS = "";
public static final String NUM_RECORDS_PER_TASK =
public static final String RANDOM_WORD_LIST =
// Builds one FakeInputSplit per configured map task. The count is read with
// a default of -1, so when the key is unset the loop does not run and an
// empty list is returned.
public List getSplits(JobContext job) throws IOException {
// Get the number of map tasks configured for
int numSplits = job.getConfiguration().getInt(NUM_MAP_TASKS, -1);
// Create a number of input splits equivalent to the number of tasks
ArrayList splits = new ArrayList();
for (int i = 0; i < numSplits; ++i) {
splits.add(new FakeInputSplit());
return splits;
// Creates the record reader for a split and initializes it before returning.
// NOTE(review): the Hadoop task runtime normally calls initialize on the
// returned reader as well; confirm whether this explicit call is intended.
public RecordReader createRecordReader(
InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
// Create a new RandomStackOverflowRecordReader and initialize it
RandomStackOverflowRecordReader rr =
new RandomStackOverflowRecordReader();
rr.initialize(split, context);
return rr;
// Stores the number of map tasks (splits) in the job configuration.
public static void setNumMapTasks(Job job, int i) {
job.getConfiguration().setInt(NUM_MAP_TASKS, i);
// Stores the number of records each task should create.
public static void setNumRecordPerTask(Job job, int i) {
job.getConfiguration().setInt(NUM_RECORDS_PER_TASK, i);
// Adds the word list file to the DistributedCache for this job.
public static void setRandomWordList(Job job, Path file) {
DistributedCache.addCacheFile(file.toUri(), job.getConfiguration());
// RecordReader that creates records from random numbers and random words
// instead of reading them from an input source. The key is a Text and the
// value is the NullWritable singleton.
// NOTE(review): this listing is heavily truncated. Several statements stop
// mid-expression, closing braces are missing, and the lines that would
// populate randomWords, set the key, and increment createdRecords are not
// visible. getProgress divides two floats, so it yields NaN when
// numRecordsToCreate is 0. Confirm all of this against the full source.
public static class RandomStackOverflowRecordReader extends
RecordReader {
// Target record count and the number created so far.
private int numRecordsToCreate = 0;
private int createdRecords = 0;
private Text key = new Text();
private NullWritable value = NullWritable.get();
private Random rndm = new Random();
// Raw list; presumably holds the words read from the cached word list.
private ArrayList randomWords = new ArrayList();
// This object will format the creation date string into a Date
// object
private SimpleDateFormat frmt = new SimpleDateFormat(
// Reads the record count from the configuration and the word list from the
// DistributedCache. The statements below are truncated in this listing.
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
// Get the number of records to create from the configuration
this.numRecordsToCreate = context.getConfiguration().getInt(
// Get the list of random words from the DistributedCache
URI[] files = DistributedCache.getCacheFiles(context
// Read the list of random words into a list
BufferedReader rdr = new BufferedReader(new FileReader(
String line;
while ((line = rdr.readLine()) != null) {
// Returns true while createdRecords is below numRecordsToCreate, false after.
// NOTE(review): Math.abs(Integer.MIN_VALUE) is still negative, so each of
// the modulo results below can be negative in rare cases.
public boolean nextKeyValue() throws IOException,
InterruptedException {
// If we still have records to create
if (createdRecords < numRecordsToCreate) {
// Generate random data
int score = Math.abs(rndm.nextInt()) % 15000;
int rowId = Math.abs(rndm.nextInt()) % 1000000000;
int postId = Math.abs(rndm.nextInt()) % 100000000;
int userId = Math.abs(rndm.nextInt()) % 1000000;
String creationDate = frmt
// Create a string of text from the random words
String text = getRandomText();
String randomRecord = "
return true;
} else {
// We are done creating records
return false;
private String getRandomText() {
StringBuilder bldr = new StringBuilder();
int numWords = Math.abs(rndm.nextInt()) % 30 + 1;
for (int i = 0; i < numWords; ++i) {
% randomWords.size())
+ " ");
return bldr.toString();
public Text getCurrentKey() throws IOException,
InterruptedException {
return key;
public NullWritable getCurrentValue() throws IOException,
InterruptedException {
return value;
public float getProgress() throws IOException, InterruptedException {
return (float) createdRecords / (float) numRecordsToCreate;
public void close() throws IOException {
// nothing to do here...
// OutputFormat that writes job output into a Redis hash through
// RedisHashRecordWriter. The host list and the hash key are carried in the
// job configuration.
// NOTE(review): the listing is truncated -- the configuration key literals,
// the end of the getRecordWriter call, one get call in checkOutputSpecs and
// all closing braces are not visible.
public static class RedisHashOutputFormat extends OutputFormat {
// Configuration keys for the host list and the hash key (literals missing).
public static final String REDIS_HOSTS_CONF =
public static final String REDIS_HASH_KEY_CONF =
// Stores the host list string in the job configuration.
public static void setRedisHosts(Job job, String hosts) {
job.getConfiguration().set(REDIS_HOSTS_CONF, hosts);
// Stores the name of the target Redis hash in the job configuration.
public static void setRedisHashKey(Job job, String hashKey) {
job.getConfiguration().set(REDIS_HASH_KEY_CONF, hashKey);
// Creates the writer, passing the hash key first; the second argument is
// cut off here, presumably the host list -- confirm.
public RecordWriter getRecordWriter(TaskAttemptContext job)
throws IOException, InterruptedException {
return new RedisHashRecordWriter(job.getConfiguration().get(
REDIS_HASH_KEY_CONF), job.getConfiguration().get(
// Validates the configuration: throws IOException when either the host list
// or the hash key is null or empty.
public void checkOutputSpecs(JobContext job) throws IOException {
String hosts = job.getConfiguration().get(REDIS_HOSTS_CONF);
if (hosts == null || hosts.isEmpty()) {
throw new IOException(REDIS_HOSTS_CONF
+ " is not set in configuration.");
String hashKey = job.getConfiguration().get(
if (hashKey == null || hashKey.isEmpty()) {
throw new IOException(REDIS_HASH_KEY_CONF
+ " is not set in configuration.");
// Delegates to the committer supplied by NullOutputFormat.
public OutputCommitter getOutputCommitter(TaskAttemptContext context)
throws IOException, InterruptedException {
return (new NullOutputFormat()).getOutputCommitter(context);
// Header of the nested writer class; its body is not shown at this point.
public static class RedisHashRecordWriter extends RecordWriter {
// NOTE(review): this listing repeats the RedisHashOutputFormat definition
// that appears immediately before it in this file, line for line; one of
// the two copies is probably an extraction artifact.
// OutputFormat that writes job output into a Redis hash through
// RedisHashRecordWriter, configured by a host list and a hash key.
public static class RedisHashOutputFormat extends OutputFormat {
// Configuration keys (the string literals are not visible).
public static final String REDIS_HOSTS_CONF =
public static final String REDIS_HASH_KEY_CONF =
// Stores the host list string in the job configuration.
public static void setRedisHosts(Job job, String hosts) {
job.getConfiguration().set(REDIS_HOSTS_CONF, hosts);
// Stores the name of the target Redis hash in the job configuration.
public static void setRedisHashKey(Job job, String hashKey) {
job.getConfiguration().set(REDIS_HASH_KEY_CONF, hashKey);
// Creates the writer from the configured hash key and a second value whose
// lookup is cut off in this listing.
public RecordWriter getRecordWriter(TaskAttemptContext job)
throws IOException, InterruptedException {
return new RedisHashRecordWriter(job.getConfiguration().get(
REDIS_HASH_KEY_CONF), job.getConfiguration().get(
// Throws IOException when the host list or the hash key is null or empty.
public void checkOutputSpecs(JobContext job) throws IOException {
String hosts = job.getConfiguration().get(REDIS_HOSTS_CONF);
if (hosts == null || hosts.isEmpty()) {
throw new IOException(REDIS_HOSTS_CONF
+ " is not set in configuration.");
String hashKey = job.getConfiguration().get(
if (hashKey == null || hashKey.isEmpty()) {
throw new IOException(REDIS_HASH_KEY_CONF
+ " is not set in configuration.");
// Delegates to the committer supplied by NullOutputFormat.
public OutputCommitter getOutputCommitter(TaskAttemptContext context)
throws IOException, InterruptedException {
return (new NullOutputFormat()).getOutputCommitter(context);
// Header of the nested writer class; its body is not shown at this point.
public static class RedisHashRecordWriter extends RecordWriter {
// RecordWriter that spreads key/value pairs over several Redis instances
// and stores each pair as a field of one named hash.
// NOTE(review): closing braces, the increment of the index i in the
// constructor and the body of the loop in close are not visible in this
// listing. As shown, every put would use index 0.
public static class RedisHashRecordWriter extends RecordWriter {
// Maps an integer index to a Jedis connection (raw HashMap).
private HashMap jedisMap = new HashMap();
private String hashKey = null;
// hashKey: name of the Redis hash to write into.
// hosts: comma-separated host list; one Jedis client is created per entry.
public RedisHashRecordWriter(String hashKey, String hosts) {
this.hashKey = hashKey;
// Create a connection to Redis for each host
// Map an integer 0-(numRedisInstances - 1) to the instance
int i = 0;
for (String host : hosts.split(",")) {
Jedis jedis = new Jedis(host);
jedisMap.put(i, jedis);
// Picks a connection from the hash code of the key and writes the pair as
// a field of hashKey.
// NOTE(review): Math.abs(Integer.MIN_VALUE) is negative, so the computed
// index can be negative for such a hash code and the lookup would miss.
public void write(Text key, Text value) throws IOException,
InterruptedException {
// Get the Jedis instance that this key/value pair will be
// written to
Jedis j = jedisMap.get(Math.abs(key.hashCode()) % jedisMap.size());
// Write the key/value pair
j.hset(hashKey, key.toString(), value.toString());
// Iterates over every stored connection; the loop body is missing here.
public void close(TaskAttemptContext context) throws IOException,
InterruptedException {
// For each jedis instance, disconnect it
for (Jedis jedis : jedisMap.values()) {
// NOTE(review): only the first line of the RedisOutputMapper declaration is
// visible; its superclass and body are missing from this listing.
public static class RedisOutputMapper extends
// Driver entry point for the Redis output job.
// Positional arguments: args[0] input path, args[1] Redis host list,
// args[2] name of the Redis hash to write.
// NOTE(review): the listing is truncated -- no mapper, output format or
// output types are set in the visible lines, the computed exit code is not
// used, and the closing brace is missing.
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Path inputPath = new Path(args[0]);
String hosts = args[1];
String hashName = args[2];
Job job = new Job(conf, "Redis Output");
TextInputFormat.setInputPaths(job, inputPath);
// Pass the Redis connection details through the job configuration.
RedisHashOutputFormat.setRedisHosts(job, hosts);
RedisHashOutputFormat.setRedisHashKey(job, hashName);
// 0 when the job succeeds, 2 otherwise.
int code = job.waitForCompletion(true) ? 0 : 2;
// Input split that identifies one Redis host and the hash to read from it.
// NOTE(review): closing braces and the body of write are not visible;
// presumably write emits location and hashKey with writeUTF, mirroring
// readFields -- confirm against the full source.
public static class RedisHashInputSplit extends InputSplit implements Writable {
private String location = null;
private String hashKey = null;
public RedisHashInputSplit() {
// Default constructor for reflection
// redisHost: host returned by getLocations; hash: name of the Redis hash.
public RedisHashInputSplit(String redisHost, String hash) {
this.location = redisHost;
this.hashKey = hash;
public String getHashKey() {
return this.hashKey;
// Reads the location and then the hash key, each as a UTF string.
public void readFields(DataInput in) throws IOException {
this.location = in.readUTF();
this.hashKey = in.readUTF();
public void write(DataOutput out) throws IOException {
// Always reports zero bytes.
public long getLength() throws IOException, InterruptedException {
return 0;
// Reports the single Redis host as the location of this split.
public String[] getLocations() throws IOException, InterruptedException {
return new String[] { location };
// InputFormat that creates one RedisHashInputSplit per configured Redis
// host, all reading the same hash key.
// NOTE(review): the listing is truncated -- the configuration key literals,
// the logger initializer, the logging call that preceded the message in
// getSplits and all closing braces are not visible.
public static class RedisHashInputFormat extends InputFormat {
// Configuration keys for the host list and the hash key (literals missing).
public static final String REDIS_HOSTS_CONF =
public static final String REDIS_HASH_KEY_CONF =
private static final Logger LOG = Logger
// Stores the host list string in the job configuration.
public static void setRedisHosts(Job job, String hosts) {
job.getConfiguration().set(REDIS_HOSTS_CONF, hosts);
// Stores the name of the Redis hash to read in the job configuration.
public static void setRedisHashKey(Job job, String hashKey) {
job.getConfiguration().set(REDIS_HASH_KEY_CONF, hashKey);
// Validates the configuration and returns one split per comma-separated
// host. Throws IOException when the host list or hash key is null or empty.
public List getSplits(JobContext job) throws IOException {
String hosts = job.getConfiguration().get(REDIS_HOSTS_CONF);
if (hosts == null || hosts.isEmpty()) {
throw new IOException(REDIS_HOSTS_CONF
+ " is not set in configuration.");
String hashKey = job.getConfiguration().get(REDIS_HASH_KEY_CONF);
if (hashKey == null || hashKey.isEmpty()) {
throw new IOException(REDIS_HASH_KEY_CONF
+ " is not set in configuration.");
// Create an input split for each host
List splits = new ArrayList();
for (String host : hosts.split(",")) {
splits.add(new RedisHashInputSplit(host, hashKey));
}"Input splits to process: " + splits.size());
return splits;
// Returns a new, uninitialized RedisHashRecordReader.
public RecordReader createRecordReader(InputSplit split,
TaskAttemptContext context) throws IOException,
InterruptedException {
return new RedisHashRecordReader();
// Placeholder headers for the nested classes; the bodies are listed
// separately.
public static class RedisHashRecordReader extends RecordReader {
// code in next section
public static class RedisHashInputSplit extends
InputSplit implements Writable {
// code in next section
// RecordReader that loads every field/value pair of one Redis hash from one
// host and iterates over the result.
// NOTE(review): the listing is truncated -- the logger initializer, the
// generic type of the iterator, the logging calls, the statements that
// advance the iterator and set key and value, any increment of
// processedKVs, and all closing braces are not visible. The Jedis client
// created in initialize is not disconnected in the visible lines.
public static class RedisHashRecordReader extends RecordReader {
private static final Logger LOG =
private Iterator> keyValueMapIter = null;
private Text key = new Text(), value = new Text();
// Progress counters, kept as floats so getProgress can divide directly.
private float processedKVs = 0, totalKVs = 0;
private Entry currentEntry = null;
// Connects to the first location of the split, records the size of the
// hash and takes an iterator over all of its entries.
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
// Get the host location from the InputSplit
String host = split.getLocations()[0];
String hashKey = ((RedisHashInputSplit) split).getHashKey();"Connecting to " + host + " and reading from "
+ hashKey);
Jedis jedis = new Jedis(host);
// Get all the key/value pairs from the Redis instance and store
// them in memory
totalKVs = jedis.hlen(hashKey);
keyValueMapIter = jedis.hgetAll(hashKey).entrySet().iterator();"Got " + totalKVs + " from " + hashKey);
// Returns true while the iterator has entries left, false afterwards.
public boolean nextKeyValue() throws IOException,
InterruptedException {
// If the key/value map still has values
if (keyValueMapIter.hasNext()) {
// Get the current entry and set the Text objects to the entry
currentEntry =;
return true;
} else {
// No more values? return false.
return false;
public Text getCurrentKey() throws IOException,
InterruptedException {
return key;
public Text getCurrentValue() throws IOException,
InterruptedException {
return value;
// Float division: the result is NaN when totalKVs is 0 (an empty hash).
public float getProgress() throws IOException, InterruptedException {
return processedKVs / totalKVs;
public void close() throws IOException {
// nothing to do here
// Driver entry point for the Redis input job.
// Reads the host list, the hash key and the output directory from the
// first three entries of otherArgs.
// NOTE(review): otherArgs is not declared in the visible lines (presumably
// the remaining arguments after generic option parsing -- confirm). The
// input format and output types are not set here, and the closing brace is
// missing.
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String hosts = otherArgs[0];
String hashKey = otherArgs[1];
Path outputDir = new Path(otherArgs[2]);
Job job = new Job(conf, "Redis Input");
// Use the identity mapper
RedisHashInputFormat.setRedisHosts(job, hosts);
RedisHashInputFormat.setRedisHashKey(job, hashKey);
TextOutputFormat.setOutputPath(job, outputDir);
// Exit status 0 on success, 3 on failure.
System.exit(job.waitForCompletion(true) ? 0 : 3);
// Composite key made of a last-access month number and a field name.
// Ordering is by month first and by field second.
// NOTE(review): the listing is truncated -- the bodies of setField and
// write, part of readFields and all closing braces are not visible.
// hashCode is defined but no equals override appears in the visible lines.
public static class RedisKey implements WritableComparable {
private int lastAccessMonth = 0;
private Text field = new Text();
public int getLastAccessMonth() {
return this.lastAccessMonth;
public void setLastAccessMonth(int lastAccessMonth) {
this.lastAccessMonth = lastAccessMonth;
public Text getField() {
return this.field;
public void setField(String field) {
// Reads the month as an int; any read of the field is not visible here.
public void readFields(DataInput in) throws IOException {
lastAccessMonth = in.readInt();
public void write(DataOutput out) throws IOException {
// Compares by month, falling back to the field text when months are equal.
public int compareTo(RedisKey rhs) {
if (this.lastAccessMonth == rhs.getLastAccessMonth()) {
return this.field.compareTo(rhs.getField());
} else {
return this.lastAccessMonth < rhs.getLastAccessMonth() ? -1 : 1;
// Month and field separated by a tab.
public String toString() {
return this.lastAccessMonth + "\t" + this.field.toString();
// Hash of the tab-separated string form.
public int hashCode() {
return toString().hashCode();
// OutputFormat that hands out RedisLastAccessRecordWriter instances.
// checkOutputSpecs has no visible body, so no validation is shown here.
// NOTE(review): closing braces are not visible in this listing.
public static class RedisLastAccessOutputFormat
extends OutputFormat {
// Returns a new writer; nothing is read from the task context here.
public RecordWriter getRecordWriter(
TaskAttemptContext job) throws IOException, InterruptedException {
return new RedisLastAccessRecordWriter();
public void checkOutputSpecs(JobContext context) throws IOException,
InterruptedException {
// Delegates to the committer supplied by NullOutputFormat.
public OutputCommitter getOutputCommitter(TaskAttemptContext context)
throws IOException, InterruptedException {
return (new NullOutputFormat()).getOutputCommitter(context);
// Placeholder header for the nested writer; its body is listed separately.
public static class RedisLastAccessRecordWriter
extends RecordWriter {
// Code in next section
// RecordWriter that routes each record to a Redis instance chosen by the
// last-access month of its key, and stores it in a hash named after that
// month.
// NOTE(review): closing braces and the body of the loop in close are not
// visible. MONTH_FROM_INT and MRDPUtils.REDIS_INSTANCES are defined outside
// this listing.
public static class RedisLastAccessRecordWriter
extends RecordWriter {
// Maps a month index to a Jedis connection (raw HashMap).
private HashMap jedisMap = new HashMap();
// Opens one connection per configured instance and registers it under two
// consecutive indices, so each instance serves two months.
public RedisLastAccessRecordWriter() {
// Create a connection to Redis for each host
int i = 0;
for (String host : MRDPUtils.REDIS_INSTANCES) {
Jedis jedis = new Jedis(host);
jedisMap.put(i, jedis);
jedisMap.put(i + 1, jedis);
i += 2;
// Looks up the connection by the month of the key and writes the field and
// value into the hash named by MONTH_FROM_INT for that month.
// NOTE(review): a month with no entry in jedisMap would make j null.
public void write(RedisKey key, Text value) throws IOException,
InterruptedException {
// Get the Jedis instance that this key/value pair will be
// written to -- (0,1)->0, (2-3)->1, ... , (10-11)->5
Jedis j = jedisMap.get(key.getLastAccessMonth());
// Write the key/value pair
j.hset(MONTH_FROM_INT.get(key.getLastAccessMonth()), key
.getField().toString(), value.toString());
// Iterates over the map values; each connection appears twice because of
// the double registration above. The loop body is missing here.
public void close(TaskAttemptContext context) throws IOException,
InterruptedException {
// For each jedis instance, disconnect it
for (Jedis jedis : jedisMap.values()) {
// NOTE(review): only the first line of the RedisLastAccessOutputMapper
// declaration is visible; its superclass and body are missing here.
public static class RedisLastAccessOutputMapper extends
// Driver entry point for the last-access output job; args[0] is the input
// path.
// NOTE(review): the listing is truncated -- no mapper, output format or
// output types are set in the visible lines, the computed exit code is not
// used, and the closing brace is missing.
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Path inputPath = new Path(args[0]);
Job job = new Job(conf, "Redis Last Access Output");
TextInputFormat.setInputPaths(job, inputPath);
// 0 when the job succeeds, 2 otherwise.
int code = job.waitForCompletion(true) ? 0 : 2;
// Input split that names one Redis host together with a list of hash keys
// to read from it.
// NOTE(review): the listing is truncated -- the bodies of addHashKey and
// removeHashKey, the loop bodies in readFields and write, whatever write
// emits before its loop, and all closing braces are not visible.
public static class RedisLastAccessInputSplit
extends InputSplit implements Writable {
private String location = null;
// Raw list of hash key names.
private List hashKeys = new ArrayList();
public RedisLastAccessInputSplit() {
// Default constructor for reflection
public RedisLastAccessInputSplit(String redisHost) {
this.location = redisHost;
public void addHashKey(String key) {
public void removeHashKey(String key) {
// Returns the internal list itself, not a copy.
public List getHashKeys() {
return hashKeys;
// Reads the location, then a key count, then loops that many times.
public void readFields(DataInput in) throws IOException {
location = in.readUTF();
int numKeys = in.readInt();
for (int i = 0; i < numKeys; ++i) {
public void write(DataOutput out) throws IOException {
for (String key : hashKeys) {
// Always reports zero bytes.
public long getLength() throws IOException, InterruptedException {
return 0;
// Reports the single Redis host as the location of this split.
public String[] getLocations() throws IOException, InterruptedException {
return new String[] { location };
// InputFormat that turns a comma-separated list of months into input
// splits, keeping one RedisLastAccessInputSplit per Redis host.
// NOTE(review): the listing is heavily truncated -- the configuration key
// literal, the logger initializer, the contents of the static initializer,
// the throw statement in getSplits, the calls that would add each month to
// its split, the logging call and all closing braces are not visible.
public static class RedisLastAccessInputFormat
extends InputFormat {
public static final String REDIS_SELECTED_MONTHS_CONF =
// Lookup tables filled in by the static initializer below; judging by the
// names and the comments there, month code to integer and month code to
// Redis host.
private static final HashMap MONTH_FROM_STRING =
new HashMap();
private static final HashMap MONTH_TO_INST_MAP =
new HashMap();
private static final Logger LOG = Logger
static {
// Initialize month to Redis instance map
// Initialize month 3 character code to integer
// Stores the selected months string in the job configuration.
public static void setRedisLastAccessMonths(Job job, String months) {
job.getConfiguration().set(REDIS_SELECTED_MONTHS_CONF, months);
// Groups the requested months by host and returns the resulting splits.
// NOTE(review): a month that is absent from MONTH_TO_INST_MAP yields a null
// host, which is then used as a map key and as the split location.
public List getSplits(JobContext job) throws IOException {
String months = job.getConfiguration().get(
if (months == null || months.isEmpty()) {
+ " is null or empty.");
// Create input splits from the input months
HashMap instanceToSplitMap =
new HashMap();
for (String month : months.split(",")) {
String host = MONTH_TO_INST_MAP.get(month);
RedisLastAccessInputSplit split = instanceToSplitMap.get(host);
if (split == null) {
split = new RedisLastAccessInputSplit(host);
instanceToSplitMap.put(host, split);
} else {
}"Input splits to process: " +
return new ArrayList(instanceToSplitMap.values());
// Returns a new, uninitialized RedisLastAccessRecordReader.
public RecordReader createRecordReader(
InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
return new RedisLastAccessRecordReader();
// Placeholder header for the nested reader; its body is listed separately.
public static class RedisLastAccessRecordReader
extends RecordReader {
// Code in next section
// RecordReader that walks through the hash keys of a split one after
// another, loading each hash from Redis and iterating over its entries.
// NOTE(review): the listing is heavily truncated -- the logger initializer,
// generic types, the logging call, the expressions that advance both
// iterators, the code that fills key and value, the remaining RecordReader
// methods and all closing braces are not visible. A new Jedis client is
// created for every hash key and none is disconnected in the visible lines.
// MONTH_FROM_STRING is declared private in RedisLastAccessInputFormat in
// this file; confirm how this class is meant to reach it.
public static class RedisLastAccessRecordReader
extends RecordReader {
private static final Logger LOG = Logger
private Entry currentEntry = null;
private float processedKVs = 0, totalKVs = 0;
// Month number of the hash currently being iterated.
private int currentHashMonth = 0;
// Entries of the current hash, and the remaining hash keys of the split.
private Iterator> hashIterator = null;
private Iterator hashKeys = null;
private RedisKey key = new RedisKey();
private String host = null;
private Text value = new Text();
// Remembers the host and takes an iterator over the hash keys of the split.
// No Redis connection is opened in the visible lines of this method.
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
// Get the host location from the InputSplit
host = split.getLocations()[0];
// Get an iterator of all the hash keys we want to read
hashKeys = ((RedisLastAccessInputSplit) split)
.getHashKeys().iterator();"Connecting to " + host);
// Advances to the next entry, moving on to the next hash key whenever the
// current hash is exhausted. Returns false once no hash keys remain.
public boolean nextKeyValue() throws IOException,
InterruptedException {
// Set when a freshly loaded hash turns out to be empty, so the loop repeats
// with the following hash key.
boolean nextHashKey = false;
do {
// if this is the first call or the iterator does not have a
// next
if (hashIterator == null || !hashIterator.hasNext()) {
// if we have reached the end of our hash keys, return
// false
if (!hashKeys.hasNext()) {
// ultimate end condition, return false
return false;
} else {
// Otherwise, connect to Redis and get all
// the name/value pairs for this hash key
Jedis jedis = new Jedis(host);
String strKey =;
currentHashMonth = MONTH_FROM_STRING.get(strKey);
hashIterator = jedis.hgetAll(strKey).entrySet()
// If the key/value map still has values
if (hashIterator.hasNext()) {
// Get the current entry and set
// the Text objects to the entry
currentEntry =;
} else {
nextHashKey = true;
} while (nextHashKey);
return true;
// Driver entry point for the last-access input job.
// Positional arguments: args[0] months to read, args[1] output directory.
// NOTE(review): lastAccessMonths is read but not passed to
// setRedisLastAccessMonths in the visible lines, no input format is set,
// and the closing brace is missing; confirm against the full source.
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String lastAccessMonths = args[0];
Path outputDir = new Path(args[1]);
Job job = new Job(conf, "Redis Input");
// Use the identity mapper
TextOutputFormat.setOutputPath(job, outputDir);
// Exit status 0 on success, 2 on failure.
System.exit(job.waitForCompletion(true) ? 0 : 2);