CHAPTER 7: Input and Output Patterns
/**
 * Entry point of the random data generation job.
 *
 * The job is map-only and uses RandomStackOverflowInputFormat as its input
 * format. Arguments, in order: number of map tasks, number of records per
 * task, path of the random word list, output directory.
 *
 * Exits with status 0 when the job completes successfully, 2 otherwise.
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    // Positional command line arguments
    int mapTaskCount = Integer.parseInt(args[0]);
    int recordsPerTask = Integer.parseInt(args[1]);
    Path wordListPath = new Path(args[2]);
    Path outputPath = new Path(args[3]);

    Job job = new Job(conf, "RandomDataGenerationDriver");
    job.setJarByClass(RandomDataGenerationDriver.class);

    // Map-only job: no reducers are scheduled
    job.setNumReduceTasks(0);

    // Input side: records are generated, not read
    job.setInputFormatClass(RandomStackOverflowInputFormat.class);
    RandomStackOverflowInputFormat.setNumMapTasks(job, mapTaskCount);
    RandomStackOverflowInputFormat.setNumRecordPerTask(job, recordsPerTask);
    RandomStackOverflowInputFormat.setRandomWordList(job, wordListPath);

    // Output side
    TextOutputFormat.setOutputPath(job, outputPath);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(NullWritable.class);

    boolean succeeded = job.waitForCompletion(true);
    System.exit(succeeded ? 0 : 2);
}
/**
 * An input split that carries no state at all. It serializes to nothing,
 * reports a length of zero and names no preferred locations.
 */
public static class FakeInputSplit extends InputSplit implements Writable {

    /** No fields to restore, so nothing is read from the stream. */
    public void readFields(DataInput in) throws IOException {
        // intentionally empty
    }

    /** No fields to persist, so nothing is written to the stream. */
    public void write(DataOutput out) throws IOException {
        // intentionally empty
    }

    /** @return always zero */
    public long getLength() throws IOException, InterruptedException {
        return 0L;
    }

    /** @return an empty array; the split names no hosts */
    public String[] getLocations() throws IOException, InterruptedException {
        return new String[] {};
    }
}
public static class RandomStackOverflowInputFormat extends InputFormat<Text, NullWritable> { public static final String NUM_MAP_TASKS = "random.generator.map.tasks"; public static final String NUM_RECORDS_PER_TASK = "random.generator.num.records.per.map.task"; public static final String RANDOM_WORD_LIST = "random.generator.random.word.file"; public List<InputSplit> getSplits(JobContext job) throws IOException { // Get the number of map tasks configured for int numSplits = job.getConfiguration().getInt(NUM_MAP_TASKS, -1); // Create a number of input splits equivalent to the number of tasks ArrayList<InputSplit> splits = new ArrayList<InputSplit>(); for (int i = 0; i < numSplits; ++i) { splits.add(new FakeInputSplit()); } return splits; } public RecordReader<Text, NullWritable> createRecordReader( InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { // Create a new RandomStackOverflowRecordReader and initialize it RandomStackOverflowRecordReader rr = new RandomStackOverflowRecordReader(); rr.initialize(split, context); return rr; } public static void setNumMapTasks(Job job, int i) { job.getConfiguration().setInt(NUM_MAP_TASKS, i); } public static void setNumRecordPerTask(Job job, int i) { job.getConfiguration().setInt(NUM_RECORDS_PER_TASK, i); } public static void setRandomWordList(Job job, Path file) { DistributedCache.addCacheFile(file.toUri(), job.getConfiguration()); } }
public static class RandomStackOverflowRecordReader extends RecordReader<Text, NullWritable> { private int numRecordsToCreate = 0; private int createdRecords = 0; private Text key = new Text(); private NullWritable value = NullWritable.get(); private Random rndm = new Random(); private ArrayList<String> randomWords = new ArrayList<String>(); // This object will format the creation date string into a Date // object private SimpleDateFormat frmt = new SimpleDateFormat( "yyyy-MM-dd'T'HH:mm:ss.SSS"); public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { // Get the number of records to create from the configuration this.numRecordsToCreate = context.getConfiguration().getInt( NUM_RECORDS_PER_TASK, -1); // Get the list of random words from the DistributedCache URI[] files = DistributedCache.getCacheFiles(context .getConfiguration()); // Read the list of random words into a list BufferedReader rdr = new BufferedReader(new FileReader( files[0].toString())); String line; while ((line = rdr.readLine()) != null) { randomWords.add(line); } rdr.close(); } public boolean nextKeyValue() throws IOException, InterruptedException { // If we still have records to create if (createdRecords < numRecordsToCreate) { // Generate random data int score = Math.abs(rndm.nextInt()) % 15000; int rowId = Math.abs(rndm.nextInt()) % 1000000000; int postId = Math.abs(rndm.nextInt()) % 100000000; int userId = Math.abs(rndm.nextInt()) % 1000000; String creationDate = frmt .format(Math.abs(rndm.nextLong())); // Create a string of text from the random words String text = getRandomText(); String randomRecord = "<row Id=\"" + rowId + "\" PostId=\"" + postId + "\" Score=\"" + score + "\" Text=\"" + text + "\" CreationDate=\"" + creationDate + "\" UserId\"=" + userId + "\" />"; key.set(randomRecord); ++createdRecords; return true; } else { // We are done creating records return false; } } private String getRandomText() { StringBuilder bldr = new 
StringBuilder(); int numWords = Math.abs(rndm.nextInt()) % 30 + 1; for (int i = 0; i < numWords; ++i) { bldr.append(randomWords.get(Math.abs(rndm.nextInt()) % randomWords.size()) + " "); } return bldr.toString(); } public Text getCurrentKey() throws IOException, InterruptedException { return key; } public NullWritable getCurrentValue() throws IOException, InterruptedException { return value; } public float getProgress() throws IOException, InterruptedException { return (float) createdRecords / (float) numRecordsToCreate; } public void close() throws IOException { // nothing to do here... } }
public static class RedisHashOutputFormat extends OutputFormat<Text, Text> { public static final String REDIS_HOSTS_CONF = "mapred.redishashoutputformat.hosts"; public static final String REDIS_HASH_KEY_CONF = "mapred.redishashinputformat.key"; public static void setRedisHosts(Job job, String hosts) { job.getConfiguration().set(REDIS_HOSTS_CONF, hosts); } public static void setRedisHashKey(Job job, String hashKey) { job.getConfiguration().set(REDIS_HASH_KEY_CONF, hashKey); } public RecordWriter<Text, Text> getRecordWriter(TaskAttemptContext job) throws IOException, InterruptedException { return new RedisHashRecordWriter(job.getConfiguration().get( REDIS_HASH_KEY_CONF), job.getConfiguration().get( REDIS_HOSTS_CONF)); } public void checkOutputSpecs(JobContext job) throws IOException { String hosts = job.getConfiguration().get(REDIS_HOSTS_CONF); if (hosts == null || hosts.isEmpty()) { throw new IOException(REDIS_HOSTS_CONF + " is not set in configuration."); } String hashKey = job.getConfiguration().get( REDIS_HASH_KEY_CONF); if (hashKey == null || hashKey.isEmpty()) { throw new IOException(REDIS_HASH_KEY_CONF + " is not set in configuration."); } } public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException { return (new NullOutputFormat<Text, Text>()).getOutputCommitter(context); } public static class RedisHashRecordWriter extends RecordWriter<Text, Text> { public static class RedisHashOutputFormat extends OutputFormat<Text, Text> { public static final String REDIS_HOSTS_CONF = "mapred.redishashoutputformat.hosts"; public static final String REDIS_HASH_KEY_CONF = "mapred.redishashinputformat.key"; public static void setRedisHosts(Job job, String hosts) { job.getConfiguration().set(REDIS_HOSTS_CONF, hosts); } public static void setRedisHashKey(Job job, String hashKey) { job.getConfiguration().set(REDIS_HASH_KEY_CONF, hashKey); } public RecordWriter<Text, Text> getRecordWriter(TaskAttemptContext job) throws 
IOException, InterruptedException { return new RedisHashRecordWriter(job.getConfiguration().get( REDIS_HASH_KEY_CONF), job.getConfiguration().get( REDIS_HOSTS_CONF)); } public void checkOutputSpecs(JobContext job) throws IOException { String hosts = job.getConfiguration().get(REDIS_HOSTS_CONF); if (hosts == null || hosts.isEmpty()) { throw new IOException(REDIS_HOSTS_CONF + " is not set in configuration."); } String hashKey = job.getConfiguration().get( REDIS_HASH_KEY_CONF); if (hashKey == null || hashKey.isEmpty()) { throw new IOException(REDIS_HASH_KEY_CONF + " is not set in configuration."); } } public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException { return (new NullOutputFormat<Text, Text>()).getOutputCommitter(context); } public static class RedisHashRecordWriter extends RecordWriter<Text, Text> { public static class RedisHashRecordWriter extends RecordWriter<Text, Text> { private HashMap<Integer, Jedis> jedisMap = new HashMap<Integer, Jedis>(); private String hashKey = null; public RedisHashRecordWriter(String hashKey, String hosts) { this.hashKey = hashKey; // Create a connection to Redis for each host // Map an integer 0-(numRedisInstances - 1) to the instance int i = 0; for (String host : hosts.split(",")) { Jedis jedis = new Jedis(host); jedis.connect(); jedisMap.put(i, jedis); ++i; } } public void write(Text key, Text value) throws IOException, InterruptedException { // Get the Jedis instance that this key/value pair will be // written to Jedis j = jedisMap.get(Math.abs(key.hashCode()) % jedisMap.size()); // Write the key/value pair j.hset(hashKey, key.toString(), value.toString()); } public void close(TaskAttemptContext context) throws IOException, InterruptedException { // For each jedis instance, disconnect it for (Jedis jedis : jedisMap.values()) { jedis.disconnect(); } } }
public static class RedisOutputMapper extends Mapper<Object, Text, Text, Text> { private Text outkey = new Text(); private Text outvalue = new Text(); public void map(Object key, Text value, Context context) throws IOException, InterruptedException { Map<String, String> parsed = MRDPUtils.transformXmlToMap(value .toString()); String userId = parsed.get("Id"); String reputation = parsed.get("Reputation"); // Set our output key and values outkey.set(userId); outvalue.set(reputation); context.write(outkey, outvalue); } }
/**
 * Entry point of the Redis output job: a map-only job that reads text input
 * and writes through RedisHashOutputFormat.
 *
 * Arguments, in order: input path, comma separated Redis hosts, name of the
 * Redis hash. Exits with status 0 on success and 2 on failure.
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();

    Path source = new Path(args[0]);
    String redisHosts = args[1];
    String redisHash = args[2];

    Job job = new Job(conf, "Redis Output");
    job.setJarByClass(RedisOutputDriver.class);

    // Mapper only, no reduce phase
    job.setMapperClass(RedisOutputMapper.class);
    job.setNumReduceTasks(0);

    // Plain text in
    job.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.setInputPaths(job, source);

    // Redis hash out
    job.setOutputFormatClass(RedisHashOutputFormat.class);
    RedisHashOutputFormat.setRedisHosts(job, redisHosts);
    RedisHashOutputFormat.setRedisHashKey(job, redisHash);

    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(Text.class);

    System.exit(job.waitForCompletion(true) ? 0 : 2);
}
public static class RedisHashInputSplit extends InputSplit implements Writable { private String location = null; private String hashKey = null; public RedisHashInputSplit() { // Default constructor for reflection } public RedisHashInputSplit(String redisHost, String hash) { this.location = redisHost; this.hashKey = hash; } public String getHashKey() { return this.hashKey; } public void readFields(DataInput in) throws IOException { this.location = in.readUTF(); this.hashKey = in.readUTF(); } public void write(DataOutput out) throws IOException { out.writeUTF(location); out.writeUTF(hashKey); } public long getLength() throws IOException, InterruptedException { return 0; } public String[] getLocations() throws IOException, InterruptedException { return new String[] { location }; } }
public static class RedisHashInputFormat extends InputFormat<Text, Text> { public static final String REDIS_HOSTS_CONF = "mapred.redishashinputformat.hosts"; public static final String REDIS_HASH_KEY_CONF = "mapred.redishashinputformat.key"; private static final Logger LOG = Logger .getLogger(RedisHashInputFormat.class); public static void setRedisHosts(Job job, String hosts) { job.getConfiguration().set(REDIS_HOSTS_CONF, hosts); } public static void setRedisHashKey(Job job, String hashKey) { job.getConfiguration().set(REDIS_HASH_KEY_CONF, hashKey); } public List<InputSplit> getSplits(JobContext job) throws IOException { String hosts = job.getConfiguration().get(REDIS_HOSTS_CONF); if (hosts == null || hosts.isEmpty()) { throw new IOException(REDIS_HOSTS_CONF + " is not set in configuration."); } String hashKey = job.getConfiguration().get(REDIS_HASH_KEY_CONF); if (hashKey == null || hashKey.isEmpty()) { throw new IOException(REDIS_HASH_KEY_CONF + " is not set in configuration."); } // Create an input split for each host List<InputSplit> splits = new ArrayList<InputSplit>(); for (String host : hosts.split(",")) { splits.add(new RedisHashInputSplit(host, hashKey)); } LOG.info("Input splits to process: " + splits.size()); return splits; } public RecordReader<Text, Text> createRecordReader(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { return new RedisHashRecordReader(); } public static class RedisHashRecordReader extends RecordReader<Text, Text> { // code in next section } public static class RedisHashInputSplit extends InputSplit implements Writable { // code in next section } }
public static class RedisHashRecordReader extends RecordReader<Text, Text> { private static final Logger LOG = Logger.getLogger(RedisHashRecordReader.class); private Iterator<Entry<String, String>> keyValueMapIter = null; private Text key = new Text(), value = new Text(); private float processedKVs = 0, totalKVs = 0; private Entry<String, String> currentEntry = null; public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { // Get the host location from the InputSplit String host = split.getLocations()[0]; String hashKey = ((RedisHashInputSplit) split).getHashKey(); LOG.info("Connecting to " + host + " and reading from " + hashKey); Jedis jedis = new Jedis(host); jedis.connect(); jedis.getClient().setTimeoutInfinite(); // Get all the key/value pairs from the Redis instance and store // them in memory totalKVs = jedis.hlen(hashKey); keyValueMapIter = jedis.hgetAll(hashKey).entrySet().iterator(); LOG.info("Got " + totalKVs + " from " + hashKey); jedis.disconnect(); } public boolean nextKeyValue() throws IOException, InterruptedException { // If the key/value map still has values if (keyValueMapIter.hasNext()) { // Get the current entry and set the Text objects to the entry currentEntry = keyValueMapIter.next(); key.set(currentEntry.getKey()); value.set(currentEntry.getValue()); return true; } else { // No more values? return false. return false; } } public Text getCurrentKey() throws IOException, InterruptedException { return key; } public Text getCurrentValue() throws IOException, InterruptedException { return value; } public float getProgress() throws IOException, InterruptedException { return processedKVs / totalKVs; } public void close() throws IOException { // nothing to do here } }
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String hosts = otherArgs[0]; String hashKey = otherArgs[1]; Path outputDir = new Path(otherArgs[2]); Job job = new Job(conf, "Redis Input"); job.setJarByClass(RedisInputDriver.class); // Use the identity mapper job.setNumReduceTasks(0); job.setInputFormatClass(RedisHashInputFormat.class); RedisHashInputFormat.setRedisHosts(job, hosts); RedisHashInputFormat.setRedisHashKey(job, hashKey); job.setOutputFormatClass(TextOutputFormat.class); TextOutputFormat.setOutputPath(job, outputDir); job.setOutputKeyClass(Text.class); job.setOutputValueClass(Text.class); System.exit(job.waitForCompletion(true) ? 0 : 3); }
/**
 * Composite key made of a last-access month and a field name. Keys order by
 * month first and by field second.
 */
public static class RedisKey implements WritableComparable<RedisKey> {

    private int lastAccessMonth = 0;
    private Text field = new Text();

    public int getLastAccessMonth() {
        return this.lastAccessMonth;
    }

    public void setLastAccessMonth(int lastAccessMonth) {
        this.lastAccessMonth = lastAccessMonth;
    }

    public Text getField() {
        return this.field;
    }

    public void setField(String field) {
        this.field.set(field);
    }

    /** Reads the month as an int, then the field, mirroring write. */
    public void readFields(DataInput in) throws IOException {
        lastAccessMonth = in.readInt();
        this.field.readFields(in);
    }

    /** Writes the month as an int, then the field. */
    public void write(DataOutput out) throws IOException {
        out.writeInt(lastAccessMonth);
        this.field.write(out);
    }

    /** Compares by month; ties are broken by comparing the fields. */
    public int compareTo(RedisKey rhs) {
        if (this.lastAccessMonth == rhs.getLastAccessMonth()) {
            return this.field.compareTo(rhs.getField());
        } else {
            return this.lastAccessMonth < rhs.getLastAccessMonth() ? -1 : 1;
        }
    }

    /**
     * Two keys are equal when month and field are both equal. Defined
     * because hashCode is overridden; without it, keys with identical
     * content and identical hash codes would still compare unequal.
     */
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof RedisKey)) {
            return false;
        }
        RedisKey other = (RedisKey) obj;
        return this.lastAccessMonth == other.lastAccessMonth
                && this.field.equals(other.field);
    }

    /** @return the month and the field separated by a tab */
    public String toString() {
        return this.lastAccessMonth + "\t" + this.field.toString();
    }

    /** Hash of the string form, so equal keys share a hash code. */
    public int hashCode() {
        return toString().hashCode();
    }
}
public static class RedisLastAccessOutputFormat extends OutputFormat<RedisKey, Text> { public RecordWriter<RedisKey, Text> getRecordWriter( TaskAttemptContext job) throws IOException, InterruptedException { return new RedisLastAccessRecordWriter(); } public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException { } public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException { return (new NullOutputFormat<Text, Text>()).getOutputCommitter(context); } public static class RedisLastAccessRecordWriter extends RecordWriter<RedisKey, Text> { // Code in next section } }
public static class RedisLastAccessRecordWriter extends RecordWriter<RedisKey, Text> { private HashMap<Integer, Jedis> jedisMap = new HashMap<Integer, Jedis>(); public RedisLastAccessRecordWriter() { // Create a connection to Redis for each host int i = 0; for (String host : MRDPUtils.REDIS_INSTANCES) { Jedis jedis = new Jedis(host); jedis.connect(); jedisMap.put(i, jedis); jedisMap.put(i + 1, jedis); i += 2; } } public void write(RedisKey key, Text value) throws IOException, InterruptedException { // Get the Jedis instance that this key/value pair will be // written to -- (0,1)->0, (2-3)->1, ... , (10-11)->5 Jedis j = jedisMap.get(key.getLastAccessMonth()); // Write the key/value pair j.hset(MONTH_FROM_INT.get(key.getLastAccessMonth()), key .getField().toString(), value.toString()); } public void close(TaskAttemptContext context) throws IOException, InterruptedException { // For each jedis instance, disconnect it for (Jedis jedis : jedisMap.values()) { jedis.disconnect(); } } }
public static class RedisLastAccessOutputMapper extends Mapper<Object, Text, RedisKey, Text> { // This object will format the creation date string into a Date object private final static SimpleDateFormat frmt = new SimpleDateFormat( "yyyy-MM-dd'T'HH:mm:ss.SSS"); private RedisKey outkey = new RedisKey(); private Text outvalue = new Text(); public void map(Object key, Text value, Context context) throws IOException, InterruptedException { Map<String, String> parsed = MRDPUtils.transformXmlToMap(value .toString()); String userId = parsed.get("Id"); String reputation = parsed.get("Reputation"); // Grab the last access date String strDate = parsed.get("LastAccessDate"); // Parse the string into a Calendar object Calendar cal = Calendar.getInstance(); cal.setTime(frmt.parse(strDate)); // Set our output key and values outkey.setLastAccessMonth(cal.get(Calendar.MONTH)); outkey.setField(userId); outvalue.set(reputation); context.write(outkey, outvalue); } }
/**
 * Entry point of the partition pruning output job: a map-only job that
 * turns text input into (RedisKey, Text) pairs and writes them through
 * RedisLastAccessOutputFormat.
 *
 * Arguments: the input path. Exits with status 0 on success and 2 on
 * failure.
 */
public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path inputPath = new Path(args[0]);

    Job job = new Job(conf, "Redis Last Access Output");
    job.setJarByClass(PartitionPruningOutputDriver.class);

    job.setMapperClass(RedisLastAccessOutputMapper.class);
    job.setNumReduceTasks(0);

    job.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.setInputPaths(job, inputPath);

    // RedisLastAccessOutputFormat is the OutputFormat<RedisKey, Text>
    // matching the key and value classes declared below
    job.setOutputFormatClass(RedisLastAccessOutputFormat.class);

    job.setOutputKeyClass(RedisKey.class);
    job.setOutputValueClass(Text.class);

    int code = job.waitForCompletion(true) ? 0 : 2;
    System.exit(code);
}
public static class RedisLastAccessInputSplit extends InputSplit implements Writable { private String location = null; private List<String> hashKeys = new ArrayList<String>(); public RedisLastAccessInputSplit() { // Default constructor for reflection } public RedisLastAccessInputSplit(String redisHost) { this.location = redisHost; } public void addHashKey(String key) { hashKeys.add(key); } public void removeHashKey(String key) { hashKeys.remove(key); } public List<String> getHashKeys() { return hashKeys; } public void readFields(DataInput in) throws IOException { location = in.readUTF(); int numKeys = in.readInt(); hashKeys.clear(); for (int i = 0; i < numKeys; ++i) { hashKeys.add(in.readUTF()); } } public void write(DataOutput out) throws IOException { out.writeUTF(location); out.writeInt(hashKeys.size()); for (String key : hashKeys) { out.writeUTF(key); } } public long getLength() throws IOException, InterruptedException { return 0; } public String[] getLocations() throws IOException, InterruptedException { return new String[] { location }; } }
public static class RedisLastAccessInputFormat extends InputFormat<RedisKey, Text> { public static final String REDIS_SELECTED_MONTHS_CONF = "mapred.redilastaccessinputformat.months"; private static final HashMap<String, Integer> MONTH_FROM_STRING = new HashMap<String, Integer>(); private static final HashMap<String, String> MONTH_TO_INST_MAP = new HashMap<String, String>(); private static final Logger LOG = Logger .getLogger(RedisLastAccessInputFormat.class); static { // Initialize month to Redis instance map // Initialize month 3 character code to integer } public static void setRedisLastAccessMonths(Job job, String months) { job.getConfiguration().set(REDIS_SELECTED_MONTHS_CONF, months); } public List<InputSplit> getSplits(JobContext job) throws IOException { String months = job.getConfiguration().get( REDIS_SELECTED_MONTHS_CONF); if (months == null || months.isEmpty()) { throw new IOException(REDIS_SELECTED_MONTHS_CONF + " is null or empty."); } // Create input splits from the input months HashMap<String, RedisLastAccessInputSplit> instanceToSplitMap = new HashMap<String, RedisLastAccessInputSplit>(); for (String month : months.split(",")) { String host = MONTH_TO_INST_MAP.get(month); RedisLastAccessInputSplit split = instanceToSplitMap.get(host); if (split == null) { split = new RedisLastAccessInputSplit(host); split.addHashKey(month); instanceToSplitMap.put(host, split); } else { split.addHashKey(month); } } LOG.info("Input splits to process: " + instanceToSplitMap.values().size()); return new ArrayList<InputSplit>(instanceToSplitMap.values()); } public RecordReader<RedisKey, Text> createRecordReader( InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { return new RedisLastAccessRecordReader(); } public static class RedisLastAccessRecordReader extends RecordReader<RedisKey, Text> { // Code in next section } }
public static class RedisLastAccessRecordReader extends RecordReader<RedisKey, Text> { private static final Logger LOG = Logger .getLogger(RedisLastAccessRecordReader.class); private Entry<String, String> currentEntry = null; private float processedKVs = 0, totalKVs = 0; private int currentHashMonth = 0; private Iterator<Entry<String, String>> hashIterator = null; private Iterator<String> hashKeys = null; private RedisKey key = new RedisKey(); private String host = null; private Text value = new Text(); public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException { // Get the host location from the InputSplit host = split.getLocations()[0]; // Get an iterator of all the hash keys we want to read hashKeys = ((RedisLastAccessInputSplit) split) .getHashKeys().iterator(); LOG.info("Connecting to " + host); } public boolean nextKeyValue() throws IOException, InterruptedException { boolean nextHashKey = false; do { // if this is the first call or the iterator does not have a // next if (hashIterator == null || !hashIterator.hasNext()) { // if we have reached the end of our hash keys, return // false if (!hashKeys.hasNext()) { // ultimate end condition, return false return false; } else { // Otherwise, connect to Redis and get all // the name/value pairs for this hash key Jedis jedis = new Jedis(host); jedis.connect(); String strKey = hashKeys.next(); currentHashMonth = MONTH_FROM_STRING.get(strKey); hashIterator = jedis.hgetAll(strKey).entrySet() .iterator(); jedis.disconnect(); } } // If the key/value map still has values if (hashIterator.hasNext()) { // Get the current entry and set // the Text objects to the entry currentEntry = hashIterator.next(); key.setLastAccessMonth(currentHashMonth); key.setField(currentEntry.getKey()); value.set(currentEntry.getValue()); } else { nextHashKey = true; } } while (nextHashKey); return true; } ... }
public static void main(String[] args) throws Exception { Configuration conf = new Configuration(); String lastAccessMonths = args[0]; Path outputDir = new Path(args[1]); Job job = new Job(conf, "Redis Input"); job.setJarByClass(PartitionPruningInputDriver.class); // Use the identity mapper job.setNumReduceTasks(0); job.setInputFormatClass(RedisLastAccessInputFormat.class); RedisLastAccessInputFormat.setRedisLastAccessMonths(job, lastAccessMonths); job.setOutputFormatClass(TextOutputFormat.class); TextOutputFormat.setOutputPath(job, outputDir); job.setOutputKeyClass(RedisKey.class); job.setOutputValueClass(Text.class); System.exit(job.waitForCompletion(true) ? 0 : 2); }