Implementing a global sort with MapReduce

The code is attached directly below; the approach is the TeraSort one: sample keys from the input splits, write the sorted boundary keys to a partition file, and range-partition the map output so that the reducer outputs, concatenated in order, form a globally sorted result. The remaining explanations are in the source comments.

package com.hadoop.totalsort;

import java.io.IOException;
import java.util.ArrayList;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.LineRecordReader;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.util.IndexedSortable;
import org.apache.hadoop.util.QuickSort;

public class SamplerInputFormat extends FileInputFormat<Text, Text> {

    static final String PARTITION_FILENAME = "_partition.lst";
    static final String SAMPLE_SIZE = "terasort.partitions.sample";
    private static JobConf lastConf = null;
    private static InputSplit[] lastResult = null;

    static class TextSampler implements IndexedSortable {

        public ArrayList<Text> records = new ArrayList<Text>();

        public int compare(int arg0, int arg1) {
            Text left = records.get(arg0);
            Text right = records.get(arg1);
            return left.compareTo(right);
        }

        public void swap(int arg0, int arg1) {
            Text left = records.get(arg0);
            Text right = records.get(arg1);
            records.set(arg0, right);
            records.set(arg1, left);
        }

        public void addKey(Text key) {
            records.add(new Text(key));
        }

        // Sort the sampled keys and pick numPartitions - 1 evenly spaced
        // boundary keys from them.
        public Text[] createPartitions(int numPartitions) {
            int numRecords = records.size();
            if (numPartitions > numRecords) {
                throw new IllegalArgumentException("Requested more partitions than input keys (" + numPartitions +
                        " > " + numRecords + ")");
            }
            new QuickSort().sort(this, 0, records.size());
            // Roughly 100 keys were sampled from up to 10 splits; take one
            // cut point every stepSize records, numPartitions - 1 in total.
            float stepSize = numRecords / (float) numPartitions;
            Text[] result = new Text[numPartitions - 1];
            for (int i = 1; i < numPartitions; ++i) {
                result[i - 1] = records.get(Math.round(stepSize * i));
            }
            return result;
        }

    }
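
    // Illustrative example (numbers assumed, not from the original post):
    // with the default sample size of 100 keys and 4 reduce tasks,
    // stepSize = 100 / 4 = 25, so createPartitions returns records.get(25),
    // records.get(50) and records.get(75) as the three boundary keys.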

    public static void writePartitionFile(JobConf conf, Path partFile) throws IOException {
        // First, sample keys from the input splits; sampler.addKey stores
        // them in the TextSampler's records list.
        SamplerInputFormat inputFormat = new SamplerInputFormat();
        TextSampler sampler = new TextSampler();
        Text key = new Text();
        Text value = new Text();

        int partitions = conf.getNumReduceTasks(); // number of reducer tasks
        long sampleSize = conf.getLong(SAMPLE_SIZE, 100); // total number of key/value pairs to sample
        InputSplit[] splits = inputFormat.getSplits(conf, conf.getNumMapTasks()); // the input splits
        int samples = Math.min(10, splits.length); // number of splits to sample from (at most 10)
        long recordsPerSample = sampleSize / samples; // keys to take from each sampled split
        int sampleStep = splits.length / samples; // stride between sampled splits (total splits / sampled splits)
        long records = 0;

        for (int i = 0; i < samples; i++) { // iterate over the sampled splits
            RecordReader<Text, Text> reader = inputFormat.getRecordReader(splits[sampleStep * i], conf, null);
            while (reader.next(key, value)) {
                sampler.addKey(key); // store the key in the sampler's records list
                records += 1;
                // Stop once the running total reaches this split's quota, so
                // the sample stays evenly spread across the chosen splits.
                if ((i + 1) * recordsPerSample <= records) {
                    break;
                }
            }
            reader.close();
        }
        FileSystem outFs = partFile.getFileSystem(conf);
        if (outFs.exists(partFile)) {
            outFs.delete(partFile, false);
        }
        SequenceFile.Writer writer = SequenceFile.createWriter(outFs, conf, partFile, Text.class, NullWritable.class);
        NullWritable nullValue = NullWritable.get();
        // Sort the sampled keys and write the partitions - 1 boundary keys.
        for (Text split : sampler.createPartitions(partitions)) {
            writer.append(split, nullValue);
        }
        writer.close();
    }
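
    // The _partition.lst written above is meant for a range partitioner such
    // as org.apache.hadoop.mapred.lib.TotalOrderPartitioner: keys below the
    // first boundary go to reducer 0, keys between boundaries i-1 and i go to
    // reducer i, so the reducer outputs concatenate into one sorted sequence.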

    static class TeraRecordReader implements RecordReader<Text, Text> {

        private LineRecordReader in;
        private LongWritable junk = new LongWritable();
        private Text line = new Text();
        private static int KEY_LENGTH = 10;

        public TeraRecordReader(Configuration job, FileSplit split) throws IOException {
            in = new LineRecordReader(job, split);
        }

        public void close() throws IOException {
            in.close();
        }

        public Text createKey() {
            return new Text();
        }

        public Text createValue() {
            return new Text();
        }

        public long getPos() throws IOException {
            return in.getPos();
        }

        public float getProgress() throws IOException {
            return in.getProgress();
        }

        public boolean next(Text arg0, Text arg1) throws IOException {
            // Delegate to the wrapped LineRecordReader and use the whole
            // line as the key, with an empty value.
            if (in.next(junk, line)) {
//                if (line.getLength() < KEY_LENGTH) {
                    arg0.set(line);
                    arg1.clear();
//                } else {
//                    // Alternative (TeraSort-style): the first KEY_LENGTH
//                    // bytes become the key, the remaining bytes the value.
//                    byte[] bytes = line.getBytes();
//                    arg0.set(bytes, 0, KEY_LENGTH);
//                    arg1.set(bytes, KEY_LENGTH, line.getLength() - KEY_LENGTH);
//                }
                return true;
            } else {
                return false;
            }
        }

    }
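
    // Note: with this reader each map input record is (whole line, empty
    // Text), so the shuffle sorts complete lines by Text's byte-wise
    // lexicographic comparison.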

    // Cache the split computation so repeated calls with the same JobConf
    // (e.g. from writePartitionFile and from job submission) reuse the
    // same split list.
    @Override
    public InputSplit[] getSplits(JobConf conf, int splits) throws IOException {
        if (conf == lastConf) {
            return lastResult;
        }
        lastConf = conf;
        lastResult = super.getSplits(lastConf, splits);
        return lastResult;
    }

    public RecordReader<Text, Text> getRecordReader(InputSplit arg0, JobConf arg1,
            Reporter arg2) throws IOException {
        return new TeraRecordReader(arg1, (FileSplit) arg0);
    }

}
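
SamplerInputFormat on its own only supplies the splits, the record reader, and the partition file; a driver still has to assemble the actual job. Below is a minimal sketch of such a driver, assuming Hadoop's old mapred API and org.apache.hadoop.mapred.lib.TotalOrderPartitioner; the TotalSortJob class name, the reducer count, and the argument handling are illustrative assumptions, not part of the original code.

package com.hadoop.totalsort;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.IdentityReducer;
import org.apache.hadoop.mapred.lib.TotalOrderPartitioner;

// Hypothetical driver, not part of the original post.
public class TotalSortJob {

    public static void main(String[] args) throws Exception {
        JobConf conf = new JobConf(TotalSortJob.class);
        conf.setJobName("totalsort");

        FileInputFormat.setInputPaths(conf, new Path(args[0]));
        FileOutputFormat.setOutputPath(conf, new Path(args[1]));

        conf.setInputFormat(SamplerInputFormat.class);
        conf.setOutputFormat(TextOutputFormat.class);
        conf.setOutputKeyClass(Text.class);
        conf.setOutputValueClass(Text.class);

        // Identity map and reduce: the shuffle's sort-by-key plus range
        // partitioning produce the global order.
        conf.setMapperClass(IdentityMapper.class);
        conf.setReducerClass(IdentityReducer.class);
        conf.setNumReduceTasks(4); // illustrative; must be set before sampling

        // Sample the input and write the boundary keys, then have the
        // partitioner route each key to the reducer owning its key range.
        Path partitionFile = new Path(args[0], SamplerInputFormat.PARTITION_FILENAME);
        SamplerInputFormat.writePartitionFile(conf, partitionFile);
        conf.setPartitionerClass(TotalOrderPartitioner.class);
        TotalOrderPartitioner.setPartitionFile(conf, partitionFile);

        JobClient.runJob(conf);
    }
}

Because the partition file name starts with an underscore, FileInputFormat treats it as a hidden file and skips it when computing splits, so it can safely sit in the input directory, which is the same trick TeraSort uses.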

Reposted from: http://www.open-open.com/lib/view/open1381329062408.html
