Hadoop comes with a set of samplers for you to choose from. The idea behind a sampler is that you can get a fairly even set of partitions by sampling the key space. You look at a small subset of the keys to approximate the key distribution, which is then used to construct partitions.
- SplitSampler: this sampler samples only the first n records in a split. It's not a good choice for sorted data because it doesn't select keys from throughout the split. In some applications, it's common for some of the input to already be sorted, or at least partially sorted, so SplitSampler is not ideal in those circumstances.
/** * Samples the first n records from s splits. * Inexpensive way to sample random data. */ public static class SplitSampler<K,V> implements Sampler<K,V> { private final int numSamples; private final int maxSplitsSampled; /** * Create a SplitSampler sampling <em>all</em> splits. * Takes the first numSamples / numSplits records from each split. * @param numSamples Total number of samples to obtain from all selected * splits. */ public SplitSampler(int numSamples) { this(numSamples, Integer.MAX_VALUE); } /** * Create a new SplitSampler. * @param numSamples Total number of samples to obtain from all selected * splits. * @param maxSplitsSampled The maximum number of splits to examine. */ public SplitSampler(int numSamples, int maxSplitsSampled) { this.numSamples = numSamples; this.maxSplitsSampled = maxSplitsSampled; } /** * From each split sampled, take the first numSamples / numSplits records. */ @SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type public K[] getSample(InputFormat<K,V> inf, Job job) throws IOException, InterruptedException { List<InputSplit> splits = inf.getSplits(job); ArrayList<K> samples = new ArrayList<K>(numSamples); int splitsToSample = Math.min(maxSplitsSampled, splits.size()); int samplesPerSplit = numSamples / splitsToSample; long records = 0; for (int i = 0; i < splitsToSample; ++i) { TaskAttemptContext samplingContext = new TaskAttemptContext( job.getConfiguration(), new TaskAttemptID()); RecordReader<K,V> reader = inf.createRecordReader( splits.get(i), samplingContext); reader.initialize(splits.get(i), samplingContext); while (reader.nextKeyValue()) { samples.add(ReflectionUtils.copy(job.getConfiguration(), reader.getCurrentKey(), null)); ++records; if ((i+1) * samplesPerSplit <= records) { break; } } reader.close(); } return (K[])samples.toArray(); } }
- IntervalSampler: this sampler chooses keys at regular intervals through the split, which makes it a better choice for sorted data.
/** * Sample from s splits at regular intervals. * Useful for sorted data. */ public static class IntervalSampler<K,V> implements Sampler<K,V> { private final double freq; private final int maxSplitsSampled; /** * Create a new IntervalSampler sampling <em>all</em> splits. * @param freq The frequency with which records will be emitted. */ public IntervalSampler(double freq) { this(freq, Integer.MAX_VALUE); } /** * Create a new IntervalSampler. * @param freq The frequency with which records will be emitted. * @param maxSplitsSampled The maximum number of splits to examine. * @see #getSample */ public IntervalSampler(double freq, int maxSplitsSampled) { this.freq = freq; this.maxSplitsSampled = maxSplitsSampled; } /** * For each split sampled, emit when the ratio of the number of records * retained to the total record count is less than the specified * frequency. */ @SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type public K[] getSample(InputFormat<K,V> inf, Job job) throws IOException, InterruptedException { List<InputSplit> splits = inf.getSplits(job); ArrayList<K> samples = new ArrayList<K>(); int splitsToSample = Math.min(maxSplitsSampled, splits.size()); long records = 0; long kept = 0; for (int i = 0; i < splitsToSample; ++i) { TaskAttemptContext samplingContext = new TaskAttemptContext( job.getConfiguration(), new TaskAttemptID()); RecordReader<K,V> reader = inf.createRecordReader( splits.get(i), samplingContext); reader.initialize(splits.get(i), samplingContext); while (reader.nextKeyValue()) { ++records; if ((double) kept / records < freq) { samples.add(ReflectionUtils.copy(job.getConfiguration(), reader.getCurrentKey(), null)); ++kept; } } reader.close(); } return (K[])samples.toArray(); } }
- RandomSampler: this is a good general-purpose sampler. It takes numSamples / maxSplitsSampled inputs from each split.
/** * Sample from random points in the input. * General-purpose sampler. Takes numSamples / maxSplitsSampled inputs from * each split. */ public static class RandomSampler<K,V> implements Sampler<K,V> { private double freq; private final int numSamples; private final int maxSplitsSampled; /** * Create a new RandomSampler sampling <em>all</em> splits. * This will read every split at the client, which is very expensive. * @param freq Probability with which a key will be chosen. * @param numSamples Total number of samples to obtain from all selected * splits. */ public RandomSampler(double freq, int numSamples) { this(freq, numSamples, Integer.MAX_VALUE); } /** * Create a new RandomSampler. * @param freq Probability with which a key will be chosen. * @param numSamples Total number of samples to obtain from all selected * splits. * @param maxSplitsSampled The maximum number of splits to examine. */ public RandomSampler(double freq, int numSamples, int maxSplitsSampled) { this.freq = freq; this.numSamples = numSamples; this.maxSplitsSampled = maxSplitsSampled; } /** * Randomize the split order, then take the specified number of keys from * each split sampled, where each key is selected with the specified * probability and possibly replaced by a subsequently selected key when * the quota of keys from that split is satisfied. 
*/ @SuppressWarnings("unchecked") // ArrayList::toArray doesn't preserve type public K[] getSample(InputFormat<K,V> inf, Job job) throws IOException, InterruptedException { List<InputSplit> splits = inf.getSplits(job); ArrayList<K> samples = new ArrayList<K>(numSamples); int splitsToSample = Math.min(maxSplitsSampled, splits.size()); Random r = new Random(); long seed = r.nextLong(); r.setSeed(seed); LOG.debug("seed: " + seed); // shuffle splits for (int i = 0; i < splits.size(); ++i) { InputSplit tmp = splits.get(i); int j = r.nextInt(splits.size()); splits.set(i, splits.get(j)); splits.set(j, tmp); } // our target rate is in terms of the maximum number of sample splits, // but we accept the possibility of sampling additional splits to hit // the target sample keyset for (int i = 0; i < splitsToSample || (i < splits.size() && samples.size() < numSamples); ++i) { TaskAttemptContext samplingContext = new TaskAttemptContext( job.getConfiguration(), new TaskAttemptID()); RecordReader<K,V> reader = inf.createRecordReader( splits.get(i), samplingContext); reader.initialize(splits.get(i), samplingContext); while (reader.nextKeyValue()) { if (r.nextDouble() <= freq) { if (samples.size() < numSamples) { samples.add(ReflectionUtils.copy(job.getConfiguration(), reader.getCurrentKey(), null)); } else { // When exceeding the maximum number of samples, replace a // random element with this one, then adjust the frequency // to reflect the possibility of existing elements being // pushed out int ind = r.nextInt(numSamples); if (ind != numSamples) { samples.set(ind, ReflectionUtils.copy(job.getConfiguration(), reader.getCurrentKey(), null)); } freq *= (numSamples - 1) / (double) numSamples; } } } reader.close(); } return (K[])samples.toArray(); } }
- If none of these suits your application, you can write your own implementation of the Sampler interface. Remember that the point of sampling is to produce partitions that are approximately equal in size.