CHAPTER 7: Input and Output Patterns
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
int numMapTasks = Integer.parseInt(args[0]);
int numRecordsPerTask = Integer.parseInt(args[1]);
Path wordList = new Path(args[2]);
Path outputDir = new Path(args[3]);
Job job = new Job(conf, "RandomDataGenerationDriver");
job.setJarByClass(RandomDataGenerationDriver.class);
job.setNumReduceTasks(0);
job.setInputFormatClass(RandomStackOverflowInputFormat.class);
RandomStackOverflowInputFormat.setNumMapTasks(job, numMapTasks);
RandomStackOverflowInputFormat.setNumRecordPerTask(job,
numRecordsPerTask);
RandomStackOverflowInputFormat.setRandomWordList(job, wordList);
TextOutputFormat.setOutputPath(job, outputDir);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(NullWritable.class);
System.exit(job.waitForCompletion(true) ? 0 : 2);
}
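// The "fake" input split below carries no real information: the input data is
// generated by the record reader rather than read from storage, so the split
// reports a length of zero and no data-local hosts.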
public static class FakeInputSplit extends InputSplit implements
Writable {
public void readFields(DataInput arg0) throws IOException {
}
public void write(DataOutput arg0) throws IOException {
}
public long getLength() throws IOException, InterruptedException {
return 0;
}
public String[] getLocations() throws IOException,
InterruptedException {
return new String[0];
}
}
public static class RandomStackOverflowInputFormat extends
InputFormat<Text, NullWritable> {
public static final String NUM_MAP_TASKS = "random.generator.map.tasks";
public static final String NUM_RECORDS_PER_TASK =
"random.generator.num.records.per.map.task";
public static final String RANDOM_WORD_LIST =
"random.generator.random.word.file";
public List<InputSplit> getSplits(JobContext job) throws IOException {
// Get the number of map tasks configured for the job
int numSplits = job.getConfiguration().getInt(NUM_MAP_TASKS, -1);
// Create a number of input splits equivalent to the number of tasks
ArrayList<InputSplit> splits = new ArrayList<InputSplit>();
for (int i = 0; i < numSplits; ++i) {
splits.add(new FakeInputSplit());
}
return splits;
}
public RecordReader<Text, NullWritable> createRecordReader(
InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
// Create a new RandomStackOverflowRecordReader and initialize it
RandomStackOverflowRecordReader rr =
new RandomStackOverflowRecordReader();
rr.initialize(split, context);
return rr;
}
public static void setNumMapTasks(Job job, int i) {
job.getConfiguration().setInt(NUM_MAP_TASKS, i);
}
public static void setNumRecordPerTask(Job job, int i) {
job.getConfiguration().setInt(NUM_RECORDS_PER_TASK, i);
}
public static void setRandomWordList(Job job, Path file) {
DistributedCache.addCacheFile(file.toUri(), job.getConfiguration());
}
}
public static class RandomStackOverflowRecordReader extends
RecordReader<Text, NullWritable> {
private int numRecordsToCreate = 0;
private int createdRecords = 0;
private Text key = new Text();
private NullWritable value = NullWritable.get();
private Random rndm = new Random();
private ArrayList<String> randomWords = new ArrayList<String>();
// This object will format the creation date string into a Date
// object
private SimpleDateFormat frmt = new SimpleDateFormat(
"yyyy-MM-dd'T'HH:mm:ss.SSS");
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
// Get the number of records to create from the configuration
this.numRecordsToCreate = context.getConfiguration().getInt(
RandomStackOverflowInputFormat.NUM_RECORDS_PER_TASK, -1);
// Get the list of random words from the DistributedCache
URI[] files = DistributedCache.getCacheFiles(context
.getConfiguration());
// Read the list of random words into a list
BufferedReader rdr = new BufferedReader(new FileReader(
files[0].toString()));
String line;
while ((line = rdr.readLine()) != null) {
randomWords.add(line);
}
rdr.close();
}
public boolean nextKeyValue() throws IOException,
InterruptedException {
// If we still have records to create
if (createdRecords < numRecordsToCreate) {
// Generate random data
int score = Math.abs(rndm.nextInt()) % 15000;
int rowId = Math.abs(rndm.nextInt()) % 1000000000;
int postId = Math.abs(rndm.nextInt()) % 100000000;
int userId = Math.abs(rndm.nextInt()) % 1000000;
String creationDate = frmt
.format(Math.abs(rndm.nextLong()));
// Create a string of text from the random words
String text = getRandomText();
String randomRecord = "<row Id=\"" + rowId
+ "\" PostId=\"" + postId + "\" Score=\"" + score
+ "\" Text=\"" + text + "\" OwnerUserId=\"" + userId
+ "\" CreationDate=\"" + creationDate + "\" />";
key.set(randomRecord);
++createdRecords;
return true;
} else {
// We are done creating records
return false;
}
}
private String getRandomText() {
StringBuilder bldr = new StringBuilder();
int numWords = Math.abs(rndm.nextInt()) % 30 + 1;
for (int i = 0; i < numWords; ++i) {
bldr.append(randomWords.get(Math.abs(rndm.nextInt())
% randomWords.size())
+ " ");
}
return bldr.toString();
}
public Text getCurrentKey() throws IOException,
InterruptedException {
return key;
}
public NullWritable getCurrentValue() throws IOException,
InterruptedException {
return value;
}
public float getProgress() throws IOException, InterruptedException {
return (float) createdRecords / (float) numRecordsToCreate;
}
public void close() throws IOException {
// nothing to do here...
}
}
public static class RedisHashOutputFormat extends OutputFormat<Text, Text> {
public static final String REDIS_HOSTS_CONF =
"mapred.redishashoutputformat.hosts";
public static final String REDIS_HASH_KEY_CONF =
"mapred.redishashoutputformat.key";
public static void setRedisHosts(Job job, String hosts) {
job.getConfiguration().set(REDIS_HOSTS_CONF, hosts);
}
public static void setRedisHashKey(Job job, String hashKey) {
job.getConfiguration().set(REDIS_HASH_KEY_CONF, hashKey);
}
public RecordWriter<Text, Text> getRecordWriter(TaskAttemptContext job)
throws IOException, InterruptedException {
return new RedisHashRecordWriter(job.getConfiguration().get(
REDIS_HASH_KEY_CONF), job.getConfiguration().get(
REDIS_HOSTS_CONF));
}
public void checkOutputSpecs(JobContext job) throws IOException {
String hosts = job.getConfiguration().get(REDIS_HOSTS_CONF);
if (hosts == null || hosts.isEmpty()) {
throw new IOException(REDIS_HOSTS_CONF
+ " is not set in configuration.");
}
String hashKey = job.getConfiguration().get(
REDIS_HASH_KEY_CONF);
if (hashKey == null || hashKey.isEmpty()) {
throw new IOException(REDIS_HASH_KEY_CONF
+ " is not set in configuration.");
}
}
public OutputCommitter getOutputCommitter(TaskAttemptContext context)
throws IOException, InterruptedException {
return (new NullOutputFormat()).getOutputCommitter(context);
}
public static class RedisHashRecordWriter extends RecordWriter<Text, Text> {
// Code in next section
}
}
public static class RedisHashRecordWriter extends RecordWriter<Text, Text> {
private HashMap<Integer, Jedis> jedisMap = new HashMap<Integer, Jedis>();
private String hashKey = null;
public RedisHashRecordWriter(String hashKey, String hosts) {
this.hashKey = hashKey;
// Create a connection to Redis for each host
// Map an integer 0-(numRedisInstances - 1) to the instance
int i = 0;
for (String host : hosts.split(",")) {
Jedis jedis = new Jedis(host);
jedis.connect();
jedisMap.put(i, jedis);
++i;
}
}
public void write(Text key, Text value) throws IOException,
InterruptedException {
// Get the Jedis instance that this key/value pair will be
// written to
Jedis j = jedisMap.get(Math.abs(key.hashCode()) % jedisMap.size());
// Write the key/value pair
j.hset(hashKey, key.toString(), value.toString());
}
public void close(TaskAttemptContext context) throws IOException,
InterruptedException {
// For each jedis instance, disconnect it
for (Jedis jedis : jedisMap.values()) {
jedis.disconnect();
}
}
}
public static class RedisOutputMapper extends
Mapper<Object, Text, Text, Text> {
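// The map() implementation is not shown above. A minimal sketch, assuming the
// input is the Stack Overflow users data set, parsed with the book's
// MRDPUtils.transformXmlToMap helper, and that each user's "Id" becomes the
// Redis hash field with "Reputation" as its value (both attribute names are
// assumptions for illustration):
private Text outkey = new Text();
private Text outvalue = new Text();

public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
Map<String, String> parsed = MRDPUtils.transformXmlToMap(value.toString());
String userId = parsed.get("Id");
String reputation = parsed.get("Reputation");
// Skip records that are missing either attribute
if (userId == null || reputation == null) {
return;
}
// The user ID becomes the hash field and the reputation its stored value
outkey.set(userId);
outvalue.set(reputation);
context.write(outkey, outvalue);
}
}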
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Path inputPath = new Path(args[0]);
String hosts = args[1];
String hashName = args[2];
Job job = new Job(conf, "Redis Output");
job.setJarByClass(RedisOutputDriver.class);
job.setMapperClass(RedisOutputMapper.class);
job.setNumReduceTasks(0);
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.setInputPaths(job, inputPath);
job.setOutputFormatClass(RedisHashOutputFormat.class);
RedisHashOutputFormat.setRedisHosts(job, hosts);
RedisHashOutputFormat.setRedisHashKey(job, hashName);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
int code = job.waitForCompletion(true) ? 0 : 2;
System.exit(code);
}
public static class RedisHashInputSplit extends InputSplit implements Writable {
private String location = null;
private String hashKey = null;
public RedisHashInputSplit() {
// Default constructor for reflection
}
public RedisHashInputSplit(String redisHost, String hash) {
this.location = redisHost;
this.hashKey = hash;
}
public String getHashKey() {
return this.hashKey;
}
public void readFields(DataInput in) throws IOException {
this.location = in.readUTF();
this.hashKey = in.readUTF();
}
public void write(DataOutput out) throws IOException {
out.writeUTF(location);
out.writeUTF(hashKey);
}
public long getLength() throws IOException, InterruptedException {
return 0;
}
public String[] getLocations() throws IOException, InterruptedException {
return new String[] { location };
}
}
public static class RedisHashInputFormat extends InputFormat<Text, Text> {
public static final String REDIS_HOSTS_CONF =
"mapred.redishashinputformat.hosts";
public static final String REDIS_HASH_KEY_CONF =
"mapred.redishashinputformat.key";
private static final Logger LOG = Logger
.getLogger(RedisHashInputFormat.class);
public static void setRedisHosts(Job job, String hosts) {
job.getConfiguration().set(REDIS_HOSTS_CONF, hosts);
}
public static void setRedisHashKey(Job job, String hashKey) {
job.getConfiguration().set(REDIS_HASH_KEY_CONF, hashKey);
}
public List<InputSplit> getSplits(JobContext job) throws IOException {
String hosts = job.getConfiguration().get(REDIS_HOSTS_CONF);
if (hosts == null || hosts.isEmpty()) {
throw new IOException(REDIS_HOSTS_CONF
+ " is not set in configuration.");
}
String hashKey = job.getConfiguration().get(REDIS_HASH_KEY_CONF);
if (hashKey == null || hashKey.isEmpty()) {
throw new IOException(REDIS_HASH_KEY_CONF
+ " is not set in configuration.");
}
// Create an input split for each host
List<InputSplit> splits = new ArrayList<InputSplit>();
for (String host : hosts.split(",")) {
splits.add(new RedisHashInputSplit(host, hashKey));
}
LOG.info("Input splits to process: " + splits.size());
return splits;
}
public RecordReader<Text, Text> createRecordReader(InputSplit split,
TaskAttemptContext context) throws IOException,
InterruptedException {
return new RedisHashRecordReader();
}
public static class RedisHashRecordReader extends RecordReader<Text, Text> {
// code in next section
}
public static class RedisHashInputSplit extends
InputSplit implements Writable {
// code in next section
}
}
public static class RedisHashRecordReader extends RecordReader<Text, Text> {
private static final Logger LOG =
Logger.getLogger(RedisHashRecordReader.class);
private Iterator<Entry<String, String>> keyValueMapIter = null;
private Text key = new Text(), value = new Text();
private float processedKVs = 0, totalKVs = 0;
private Entry<String, String> currentEntry = null;
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
// Get the host location from the InputSplit
String host = split.getLocations()[0];
String hashKey = ((RedisHashInputSplit) split).getHashKey();
LOG.info("Connecting to " + host + " and reading from "
+ hashKey);
Jedis jedis = new Jedis(host);
jedis.connect();
jedis.getClient().setTimeoutInfinite();
// Get all the key/value pairs from the Redis instance and store
// them in memory
totalKVs = jedis.hlen(hashKey);
keyValueMapIter = jedis.hgetAll(hashKey).entrySet().iterator();
LOG.info("Got " + totalKVs + " from " + hashKey);
jedis.disconnect();
}
public boolean nextKeyValue() throws IOException,
InterruptedException {
// If the key/value map still has values
if (keyValueMapIter.hasNext()) {
// Get the current entry and set the Text objects to the entry
currentEntry = keyValueMapIter.next();
key.set(currentEntry.getKey());
value.set(currentEntry.getValue());
// Track how many pairs have been processed for getProgress()
++processedKVs;
return true;
} else {
// No more values? return false.
return false;
}
}
public Text getCurrentKey() throws IOException,
InterruptedException {
return key;
}
public Text getCurrentValue() throws IOException,
InterruptedException {
return value;
}
public float getProgress() throws IOException, InterruptedException {
return processedKVs / totalKVs;
}
public void close() throws IOException {
// nothing to do here
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String[] otherArgs = new GenericOptionsParser(conf, args).getRemainingArgs();
String hosts = otherArgs[0];
String hashKey = otherArgs[1];
Path outputDir = new Path(otherArgs[2]);
Job job = new Job(conf, "Redis Input");
job.setJarByClass(RedisInputDriver.class);
// Use the identity mapper
job.setNumReduceTasks(0);
job.setInputFormatClass(RedisHashInputFormat.class);
RedisHashInputFormat.setRedisHosts(job, hosts);
RedisHashInputFormat.setRedisHashKey(job, hashKey);
job.setOutputFormatClass(TextOutputFormat.class);
TextOutputFormat.setOutputPath(job, outputDir);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
System.exit(job.waitForCompletion(true) ? 0 : 3);
}
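// RedisKey is a composite WritableComparable: the month of a user's last access
// plus a secondary text field. Keys compare by month first and then by field, and
// the month determines which Redis hash (and instance) a record is written to.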
public static class RedisKey implements WritableComparable<RedisKey> {
private int lastAccessMonth = 0;
private Text field = new Text();
public int getLastAccessMonth() {
return this.lastAccessMonth;
}
public void setLastAccessMonth(int lastAccessMonth) {
this.lastAccessMonth = lastAccessMonth;
}
public Text getField() {
return this.field;
}
public void setField(String field) {
this.field.set(field);
}
public void readFields(DataInput in) throws IOException {
lastAccessMonth = in.readInt();
this.field.readFields(in);
}
public void write(DataOutput out) throws IOException {
out.writeInt(lastAccessMonth);
this.field.write(out);
}
public int compareTo(RedisKey rhs) {
if (this.lastAccessMonth == rhs.getLastAccessMonth()) {
return this.field.compareTo(rhs.getField());
} else {
return this.lastAccessMonth < rhs.getLastAccessMonth() ? -1 : 1;
}
}
public String toString() {
return this.lastAccessMonth + "\t" + this.field.toString();
}
public int hashCode() {
return toString().hashCode();
}
}
public static class RedisLastAccessOutputFormat
extends OutputFormat<RedisKey, Text> {
public RecordWriter<RedisKey, Text> getRecordWriter(
TaskAttemptContext job) throws IOException, InterruptedException {
return new RedisLastAccessRecordWriter();
}
public void checkOutputSpecs(JobContext context) throws IOException,
InterruptedException {
}
public OutputCommitter getOutputCommitter(TaskAttemptContext context)
throws IOException, InterruptedException {
return (new NullOutputFormat()).getOutputCommitter(context);
}
public static class RedisLastAccessRecordWriter
extends RecordWriter<RedisKey, Text> {
// Code in next section
}
}
public static class RedisLastAccessRecordWriter
extends RecordWriter<RedisKey, Text> {
private HashMap<Integer, Jedis> jedisMap = new HashMap<Integer, Jedis>();
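// MONTH_FROM_INT (used by write() below) maps a month index (0-11) to the
// three-character hash name stored in Redis. Its definition is not shown in this
// listing; a minimal sketch of what it is assumed to contain:
private static final HashMap<Integer, String> MONTH_FROM_INT =
new HashMap<Integer, String>();
static {
String[] months = { "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
"JUL", "AUG", "SEP", "OCT", "NOV", "DEC" };
for (int m = 0; m < months.length; ++m) {
MONTH_FROM_INT.put(m, months[m]);
}
}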
public RedisLastAccessRecordWriter() {
// Create a connection to Redis for each host
int i = 0;
for (String host : MRDPUtils.REDIS_INSTANCES) {
Jedis jedis = new Jedis(host);
jedis.connect();
jedisMap.put(i, jedis);
jedisMap.put(i + 1, jedis);
i += 2;
}
}
public void write(RedisKey key, Text value) throws IOException,
InterruptedException {
// Get the Jedis instance that this key/value pair will be
// written to -- (0,1)->0, (2-3)->1, ... , (10-11)->5
Jedis j = jedisMap.get(key.getLastAccessMonth());
// Write the key/value pair
j.hset(MONTH_FROM_INT.get(key.getLastAccessMonth()), key
.getField().toString(), value.toString());
}
public void close(TaskAttemptContext context) throws IOException,
InterruptedException {
// For each jedis instance, disconnect it
for (Jedis jedis : jedisMap.values()) {
jedis.disconnect();
}
}
}
public static class RedisLastAccessOutputMapper extends
Mapper<Object, Text, RedisKey, Text> {
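// The map() implementation is not shown above. A minimal sketch, assuming the
// input is the Stack Overflow users data set, parsed with the book's
// MRDPUtils.transformXmlToMap helper, and that the "Id", "Reputation", and
// "LastAccessDate" attribute names are as assumed here:
private final static SimpleDateFormat frmt = new SimpleDateFormat(
"yyyy-MM-dd'T'HH:mm:ss.SSS");
private RedisKey outkey = new RedisKey();
private Text outvalue = new Text();

public void map(Object key, Text value, Context context)
throws IOException, InterruptedException {
Map<String, String> parsed = MRDPUtils.transformXmlToMap(value.toString());
String userId = parsed.get("Id");
String reputation = parsed.get("Reputation");
String strDate = parsed.get("LastAccessDate");
// Skip records that are missing any of the needed attributes
if (userId == null || reputation == null || strDate == null) {
return;
}
try {
// Pull the month out of the last access date for the output key
Calendar cal = Calendar.getInstance();
cal.setTime(frmt.parse(strDate));
outkey.setLastAccessMonth(cal.get(Calendar.MONTH));
} catch (ParseException e) {
throw new IOException(e);
}
outkey.setField(userId);
outvalue.set(reputation);
context.write(outkey, outvalue);
}
}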
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Path inputPath = new Path(args[0]);
Job job = new Job(conf, "Redis Last Access Output");
job.setJarByClass(PartitionPruningOutputDriver.class);
job.setMapperClass(RedisLastAccessOutputMapper.class);
job.setNumReduceTasks(0);
job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.setInputPaths(job, inputPath);
job.setOutputFormatClass(RedisLastAccessOutputFormat.class);
job.setOutputKeyClass(RedisKey.class);
job.setOutputValueClass(Text.class);
int code = job.waitForCompletion(true) ? 0 : 2;
System.exit(code);
}
public static class RedisLastAccessInputSplit
extends InputSplit implements Writable {
private String location = null;
private List<String> hashKeys = new ArrayList<String>();
public RedisLastAccessInputSplit() {
// Default constructor for reflection
}
public RedisLastAccessInputSplit(String redisHost) {
this.location = redisHost;
}
public void addHashKey(String key) {
hashKeys.add(key);
}
public void removeHashKey(String key) {
hashKeys.remove(key);
}
public List<String> getHashKeys() {
return hashKeys;
}
public void readFields(DataInput in) throws IOException {
location = in.readUTF();
int numKeys = in.readInt();
hashKeys.clear();
for (int i = 0; i < numKeys; ++i) {
hashKeys.add(in.readUTF());
}
}
public void write(DataOutput out) throws IOException {
out.writeUTF(location);
out.writeInt(hashKeys.size());
for (String key : hashKeys) {
out.writeUTF(key);
}
}
public long getLength() throws IOException, InterruptedException {
return 0;
}
public String[] getLocations() throws IOException, InterruptedException {
return new String[] { location };
}
}
public static class RedisLastAccessInputFormat
extends InputFormat<RedisKey, Text> {
public static final String REDIS_SELECTED_MONTHS_CONF =
"mapred.redilastaccessinputformat.months";
private static final HashMap<String, Integer> MONTH_FROM_STRING =
new HashMap<String, Integer>();
private static final HashMap<String, String> MONTH_TO_INST_MAP =
new HashMap<String, String>();
private static final Logger LOG = Logger
.getLogger(RedisLastAccessInputFormat.class);
static {
// Initialize month to Redis instance map
// Initialize month 3 character code to integer
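// The initialization itself is not shown; a sketch of what it is assumed to do,
// given that MRDPUtils.REDIS_INSTANCES is an array of six hosts and two
// consecutive months are stored on each instance:
String[] months = { "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
"JUL", "AUG", "SEP", "OCT", "NOV", "DEC" };
for (int m = 0; m < months.length; ++m) {
MONTH_FROM_STRING.put(months[m], m);
MONTH_TO_INST_MAP.put(months[m], MRDPUtils.REDIS_INSTANCES[m / 2]);
}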
}
public static void setRedisLastAccessMonths(Job job, String months) {
job.getConfiguration().set(REDIS_SELECTED_MONTHS_CONF, months);
}
public List<InputSplit> getSplits(JobContext job) throws IOException {
String months = job.getConfiguration().get(
REDIS_SELECTED_MONTHS_CONF);
if (months == null || months.isEmpty()) {
throw new IOException(REDIS_SELECTED_MONTHS_CONF
+ " is null or empty.");
}
// Create input splits from the input months
HashMap<String, RedisLastAccessInputSplit> instanceToSplitMap =
new HashMap<String, RedisLastAccessInputSplit>();
for (String month : months.split(",")) {
String host = MONTH_TO_INST_MAP.get(month);
RedisLastAccessInputSplit split = instanceToSplitMap.get(host);
if (split == null) {
split = new RedisLastAccessInputSplit(host);
split.addHashKey(month);
instanceToSplitMap.put(host, split);
} else {
split.addHashKey(month);
}
}
LOG.info("Input splits to process: " +
instanceToSplitMap.values().size());
return new ArrayList<InputSplit>(instanceToSplitMap.values());
}
public RecordReader<RedisKey, Text> createRecordReader(
InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
return new RedisLastAccessRecordReader();
}
public static class RedisLastAccessRecordReader
extends RecordReader<RedisKey, Text> {
// Code in next section
}
}
public static class RedisLastAccessRecordReader
extends RecordReader<RedisKey, Text> {
private static final Logger LOG = Logger
.getLogger(RedisLastAccessRecordReader.class);
private Entry<String, String> currentEntry = null;
private float processedKVs = 0, totalKVs = 0;
private int currentHashMonth = 0;
private Iterator<Entry<String, String>> hashIterator = null;
private Iterator<String> hashKeys = null;
private RedisKey key = new RedisKey();
private String host = null;
private Text value = new Text();
public void initialize(InputSplit split, TaskAttemptContext context)
throws IOException, InterruptedException {
// Get the host location from the InputSplit
host = split.getLocations()[0];
// Get an iterator of all the hash keys we want to read
hashKeys = ((RedisLastAccessInputSplit) split)
.getHashKeys().iterator();
LOG.info("Connecting to " + host);
}
public boolean nextKeyValue() throws IOException,
InterruptedException {
boolean nextHashKey = false;
do {
// if this is the first call or the iterator does not have a
// next
if (hashIterator == null || !hashIterator.hasNext()) {
// if we have reached the end of our hash keys, return
// false
if (!hashKeys.hasNext()) {
// ultimate end condition, return false
return false;
} else {
// Otherwise, connect to Redis and get all
// the name/value pairs for this hash key
Jedis jedis = new Jedis(host);
jedis.connect();
String strKey = hashKeys.next();
currentHashMonth = RedisLastAccessInputFormat.MONTH_FROM_STRING.get(strKey);
hashIterator = jedis.hgetAll(strKey).entrySet()
.iterator();
jedis.disconnect();
}
}
// If the key/value map still has values
if (hashIterator.hasNext()) {
// Get the current entry and set
// the Text objects to the entry
currentEntry = hashIterator.next();
key.setLastAccessMonth(currentHashMonth);
key.setField(currentEntry.getKey());
value.set(currentEntry.getValue());
// A record is ready to be returned, so exit the loop
nextHashKey = false;
} else {
// This hash is exhausted; loop around to the next hash key
nextHashKey = true;
}
} while (nextHashKey);
return true;
}
...
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
String lastAccessMonths = args[0];
Path outputDir = new Path(args[1]);
Job job = new Job(conf, "Redis Input");
job.setJarByClass(PartitionPruningInputDriver.class);
// Use the identity mapper
job.setNumReduceTasks(0);
job.setInputFormatClass(RedisLastAccessInputFormat.class);
RedisLastAccessInputFormat.setRedisLastAccessMonths(job,
lastAccessMonths);
job.setOutputFormatClass(TextOutputFormat.class);
TextOutputFormat.setOutputPath(job, outputDir);
job.setOutputKeyClass(RedisKey.class);
job.setOutputValueClass(Text.class);
System.exit(job.waitForCompletion(true) ? 0 : 2);
}