Partitioner
Hadoop's built-in implementations are HashPartitioner, TotalOrderPartitioner, KeyFieldBasedPartitioner, and BinaryPartitioner. All of them extend the abstract Partitioner base class:
public abstract class Partitioner<KEY, VALUE> {
  public abstract int getPartition(KEY key, VALUE value, int numPartitions);
}
public class HashPartitioner<K, V> extends Partitioner<K, V> {
  public int getPartition(K key, V value, int numReduceTasks) {
    return (key.hashCode() & Integer.MAX_VALUE) % numReduceTasks;
  }
}
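HashPartitioner is the default partitioner; when hashing the whole key is not the right routing rule, the same getPartition contract can be implemented directly. Below is a minimal sketch (not from the Hadoop source) of a custom partitioner that routes Text keys by their first character; the class name and the routing rule are illustrative only.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

// Hypothetical example: keys that start with the same character always go
// to the same reduce task.
public class FirstCharPartitioner extends Partitioner<Text, IntWritable> {
  @Override
  public int getPartition(Text key, IntWritable value, int numReduceTasks) {
    if (key.getLength() == 0) {
      return 0;                     // send empty keys to partition 0
    }
    int firstChar = key.charAt(0);  // Unicode code point of the first character
    return (firstChar & Integer.MAX_VALUE) % numReduceTasks;
  }
}

It is enabled on the driver with job.setPartitionerClass(FirstCharPartitioner.class); whatever the rule, the returned value must always fall in [0, numReduceTasks).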
public int getPartition(BinaryComparable key, V value, int numPartitions) {
  int length = key.getLength();
  int leftIndex = (leftOffset + length) % length;
  int rightIndex = (rightOffset + length) % length;
  int hash = WritableComparator.hashBytes(key.getBytes(),
      leftIndex, rightIndex - leftIndex + 1);
  return (hash & Integer.MAX_VALUE) % numPartitions;
}
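BinaryPartitioner hashes only the byte range [leftOffset, rightOffset] of the serialized key, with negative offsets counting from the end of the key. A minimal driver sketch of configuring that range, assuming the static setOffsets helper and BytesWritable map output keys; the job name and offsets are placeholders:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.BinaryPartitioner;

public class BinaryPartitionDemo {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Hash only bytes 0..3 of each serialized key; -1 would mean "the last byte".
    BinaryPartitioner.setOffsets(conf, 0, 3);
    Job job = Job.getInstance(conf, "binary-partition-demo");
    job.setMapOutputKeyClass(BytesWritable.class);  // keys must be BinaryComparable
    job.setMapOutputValueClass(Text.class);
    job.setPartitionerClass(BinaryPartitioner.class);
    // ... mapper/reducer/input/output setup omitted
  }
}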
public int getPartition(K2 key, V2 value, int numReduceTasks) {
  byte[] keyBytes;

  List<KeyDescription> allKeySpecs = keyFieldHelper.keySpecs();
  if (allKeySpecs.size() == 0) {
    return getPartition(key.toString().hashCode(), numReduceTasks);
  }

  try {
    keyBytes = key.toString().getBytes("UTF-8");
  } catch (UnsupportedEncodingException e) {
    throw new RuntimeException("The current system does not " +
        "support UTF-8 encoding!", e);
  }
  // return 0 if the key is empty
  if (keyBytes.length == 0) {
    return 0;
  }

  int[] lengthIndicesFirst = keyFieldHelper.getWordLengths(keyBytes, 0, keyBytes.length);
  int currentHash = 0;
  for (KeyDescription keySpec : allKeySpecs) {
    int startChar = keyFieldHelper.getStartOffset(keyBytes, 0, keyBytes.length,
        lengthIndicesFirst, keySpec);
    // no key found! continue
    if (startChar < 0) {
      continue;
    }
    int endChar = keyFieldHelper.getEndOffset(keyBytes, 0, keyBytes.length,
        lengthIndicesFirst, keySpec);
    currentHash = hashCode(keyBytes, startChar, endChar, currentHash);
  }
  return getPartition(currentHash, numReduceTasks);
}

protected int hashCode(byte[] b, int start, int end, int currentHash) {
  for (int i = start; i <= end; i++) {
    currentHash = 31 * currentHash + b[i];
  }
  return currentHash;
}

protected int getPartition(int hash, int numReduceTasks) {
  return (hash & Integer.MAX_VALUE) % numReduceTasks;
}
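KeyFieldBasedPartitioner is driven by sort-style key specs (-k pos1[,pos2]) that select which fields of the textual key are hashed; this is the partitioner commonly used with Hadoop Streaming. A minimal driver sketch, assuming tab-separated Text keys; the "-k1,2" spec and job name are illustrative only:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.partition.KeyFieldBasedPartitioner;

public class KeyFieldPartitionDemo {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    // Hash only the first two fields of the key; fields are split on the
    // default separator (tab).
    conf.set("mapreduce.partition.keypartitioner.options", "-k1,2");
    Job job = Job.getInstance(conf, "key-field-partition-demo");
    job.setMapOutputKeyClass(Text.class);
    job.setPartitionerClass(KeyFieldBasedPartitioner.class);
    // ... mapper/reducer/input/output setup omitted
  }
}

With this configuration, records that share the same first two key fields land in the same partition, so a single reducer sees all of them together.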
public class TotalOrderPartitioner<K extends WritableComparable<?>, V>
    extends Partitioner<K, V> implements Configurable {

  private K[] readPartitions(FileSystem fs, Path p, Class<K> keyClass,
      Configuration conf) throws IOException { ... }
}
public class InputSampler<K, V> extends Configured implements Tool {

  /**
   * Samples the first n records from s splits.
   * Inexpensive way to sample random data.
   */
  public static class SplitSampler<K, V> implements Sampler<K, V> { ... }

  /**
   * Sample from random points in the input.
   * General-purpose sampler. Takes numSamples / maxSplitsSampled inputs from
   * each split.
   */
  public static class RandomSampler<K, V> implements Sampler<K, V> { ... }

  /**
   * Sample from s splits at regular intervals.
   * Useful for sorted data.
   */
  public static class IntervalSampler<K, V> implements Sampler<K, V> { ... }

  /**
   * Write a partition file for the given job, using the provided sampler.
   * Queries the sampler for a sample keyset, sorts by the output key
   * comparator, selects the keys for each rank, and writes to the destination
   * returned from {@link TotalOrderPartitioner#getPartitionFile}.
   */
  public static <K, V> void writePartitionFile(Job job, Sampler<K, V> sampler)
      throws IOException, ClassNotFoundException, InterruptedException { ... }
}
3) IntervalSampler<K,V> | fixed-interval sampling | sampling frequency, number of splits | medium |
IntervalSampler<K,V> is particularly well suited to input that is already sorted.
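For such sorted input, an IntervalSampler can stand in for the RandomSampler shown below; a hedged one-liner, assuming the two-argument constructor (frequency, max splits sampled) and the same IntWritable/Text key-value types:

InputSampler.Sampler<IntWritable, Text> intervalSampler =
    new InputSampler.IntervalSampler<IntWritable, Text>(0.01, 10);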
InputSampler.Sampler<IntWritable, Text> sampler = new InputSampler.RandomSampler<IntWritable, Text>( 0.1, 10000, 10);
RandomSampler's three constructor arguments are the sampling frequency, the maximum number of samples, and the maximum number of splits to sample.

/**
 * Create a new RandomSampler.
 * @param freq Probability with which a key will be chosen.
 * @param numSamples Total number of samples to obtain from all selected
 *                   splits.
 * @param maxSplitsSampled The maximum number of splits to examine.
 */
public RandomSampler(double freq, int numSamples, int maxSplitsSampled) {
  this.freq = freq;
  this.numSamples = numSamples;
  this.maxSplitsSampled = maxSplitsSampled;
}
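Putting the pieces together: the sampler produces the split points, writePartitionFile stores them in the partition file, and TotalOrderPartitioner uses that file to route every key to the reducer that owns its range, so concatenating the reducer outputs yields a globally sorted result. A minimal driver sketch, assuming sequence-file input with IntWritable keys and Text values; the paths, job name, and reducer count are placeholders:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat;
import org.apache.hadoop.mapreduce.lib.partition.InputSampler;
import org.apache.hadoop.mapreduce.lib.partition.TotalOrderPartitioner;

public class TotalOrderSortDemo {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "total-order-sort-demo");
    job.setJarByClass(TotalOrderSortDemo.class);
    job.setInputFormatClass(SequenceFileInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    job.setOutputKeyClass(IntWritable.class);
    job.setOutputValueClass(Text.class);
    job.setNumReduceTasks(10);                          // one key range per reducer
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    job.setPartitionerClass(TotalOrderPartitioner.class);
    // Tell the partitioner where its partition file lives, then sample the
    // input to produce numReduceTasks - 1 split points.
    TotalOrderPartitioner.setPartitionFile(job.getConfiguration(), new Path(args[2]));
    InputSampler.Sampler<IntWritable, Text> sampler =
        new InputSampler.RandomSampler<IntWritable, Text>(0.1, 10000, 10);
    InputSampler.writePartitionFile(job, sampler);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}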