MapReduce TopK统计加排序

Hadoop技术内幕中指出Top K算法有两步,一是统计词频,二是找出词频最高的前K个词。在网上找了很多MapReduce的Top K案例,这些案例都只有排序功能,所以自己写了个案例。

这个案例分两个步骤,第一个就是wordCount案例,第二个就是排序功能。

一,统计词频

 1 package TopK;

 2 import java.io.IOException;

 3 import java.util.StringTokenizer;

 4 

 5 import org.apache.hadoop.conf.Configuration;

 6 import org.apache.hadoop.fs.Path;

 7 import org.apache.hadoop.io.IntWritable;

 8 import org.apache.hadoop.io.Text;

 9 import org.apache.hadoop.mapreduce.Job;

10 import org.apache.hadoop.mapreduce.Mapper;

11 import org.apache.hadoop.mapreduce.Reducer;

12 import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;

13 import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

14 

15 /**

16  * 统计词频

17  * @author zx

18  * [email protected]

19  */

20 public class WordCount {

21     

22     /**

23      * 读取单词

24      * @author zx

25      *

26      */

27     public static class Map extends Mapper<Object,Text,Text,IntWritable>{

28 

29         IntWritable count = new IntWritable(1);

30         

31         @Override

32         protected void map(Object key, Text value, Context context)

33                 throws IOException, InterruptedException {

34             StringTokenizer st = new StringTokenizer(value.toString());

35             while(st.hasMoreTokens()){    

36                 String word = st.nextToken().replaceAll("\"", "").replace("'", "").replace(".", "");

37                 context.write(new Text(word), count);

38             }

39         }

40         

41     }

42     

43     /**

44      * 统计词频

45      * @author zx

46      *

47      */

48     public static class Reduce extends Reducer<Text,IntWritable,Text,IntWritable>{

49 

50         @SuppressWarnings("unused")

51         @Override

52         protected void reduce(Text key, Iterable<IntWritable> values,Context context)

53                 throws IOException, InterruptedException {

54             int count = 0;

55             for (IntWritable intWritable : values) {

56                 count ++;

57             }

58             context.write(key,new IntWritable(count));

59         }

60         

61     }

62     

63     @SuppressWarnings("deprecation")

64     public static boolean run(String in,String out) throws IOException, ClassNotFoundException, InterruptedException{

65         

66         Configuration conf = new Configuration();

67         

68         Job job = new Job(conf,"WordCount");

69         job.setJarByClass(WordCount.class);

70         job.setMapperClass(Map.class);

71         job.setReducerClass(Reduce.class);

72         

73         // 设置Map输出类型

74         job.setMapOutputKeyClass(Text.class);

75         job.setMapOutputValueClass(IntWritable.class);

76 

77         // 设置Reduce输出类型

78         job.setOutputKeyClass(Text.class);

79         job.setOutputValueClass(IntWritable.class);

80 

81         // 设置输入和输出目录

82         FileInputFormat.addInputPath(job, new Path(in));

83         FileOutputFormat.setOutputPath(job, new Path(out));

84         

85         return job.waitForCompletion(true);

86     }

87     

88 }

二,排序 并求出频率最高的前K个词

  1 package TopK;

  2 

import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;
import java.util.Map.Entry;
import java.util.Set;
import java.util.StringTokenizer;
import java.util.TreeMap;
import java.util.regex.Pattern;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.MultipleOutputs;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

 22 

 23 /**

 24  * 以单词出现的频率排序

 25  * 

 26  * @author zx

 27  * [email protected]

 28  */

 29 public class Sort {

 30 

 31     /**

 32      * 读取单词(词频 word)

 33      * 

 34      * @author zx

 35      * 

 36      */

 37     public static class Map extends Mapper<Object, Text, IntWritable, Text> {

 38 

 39         // 输出key 词频

 40         IntWritable outKey = new IntWritable();

 41         Text outValue = new Text();

 42 

 43         @Override

 44         protected void map(Object key, Text value, Context context)

 45                 throws IOException, InterruptedException {

 46 

 47             StringTokenizer st = new StringTokenizer(value.toString());

 48             while (st.hasMoreTokens()) {

 49                 String element = st.nextToken();

 50                 if (Pattern.matches("\\d+", element)) {

 51                     outKey.set(Integer.parseInt(element));

 52                 } else {

 53                     outValue.set(element);

 54                 }

 55             }

 56 

 57             context.write(outKey, outValue);

 58         }

 59 

 60     }

 61 

 62     /**

 63      * 根据词频排序

 64      * 

 65      * @author zx

 66      * 

 67      */

 68     public static class Reduce extends

 69             Reducer<IntWritable, Text, Text, IntWritable> {

 70         

 71         private static MultipleOutputs<Text, IntWritable> mos = null;

 72         

 73         //要获得前K个频率最高的词

 74         private static final int k = 10;

 75         

 76         //用TreeMap存储可以利用它的排序功能

 77         //这里用 MyInt 因为TreeMap是对key排序,且不能唯一,而词频可能相同,要以词频为Key就必需对它封装

 78         private static TreeMap<MyInt, String> tm = new TreeMap<MyInt, String>(new Comparator<MyInt>(){

 79             /**

 80              * 默认是从小到大的顺序排的,现在修改为从大到小

 81              * @param o1

 82              * @param o2

 83              * @return

 84              */

 85             @Override

 86             public int compare(MyInt o1, MyInt o2) {

 87                 return o2.compareTo(o1);

 88             }

 89             

 90         }) ;

 91         

 92         /*

 93          * 以词频为Key是要用到reduce的排序功能

 94          */

 95         @Override

 96         protected void reduce(IntWritable key, Iterable<Text> values,

 97                 Context context) throws IOException, InterruptedException {

 98             for (Text text : values) {

 99                 context.write(text, key);

100                 tm.put(new MyInt(key.get()),text.toString());

101                 

102                 //TreeMap以对内部数据进行了排序,最后一个必定是最小的

103                 if(tm.size() > k){

104                     tm.remove(tm.lastKey());

105                 }

106                 

107             }

108         }

109 

110         @Override

111         protected void cleanup(Context context)

112                 throws IOException, InterruptedException {

113             String path = context.getConfiguration().get("topKout");

114             mos = new MultipleOutputs<Text, IntWritable>(context);

115             Set<Entry<MyInt, String>> set = tm.entrySet();

116             for (Entry<MyInt, String> entry : set) {

117                 mos.write("topKMOS", new Text(entry.getValue()), new IntWritable(entry.getKey().getValue()), path);

118             }

119             mos.close();

120         }

121 

122         

123         

124     }

125 

126     @SuppressWarnings("deprecation")

127     public static void run(String in, String out,String topKout) throws IOException,

128             ClassNotFoundException, InterruptedException {

129 

130         Path outPath = new Path(out);

131 

132         Configuration conf = new Configuration();

133         

134         //前K个词要输出到哪个目录

135         conf.set("topKout",topKout);

136         

137         Job job = new Job(conf, "Sort");

138         job.setJarByClass(Sort.class);

139         job.setMapperClass(Map.class);

140         job.setReducerClass(Reduce.class);

141 

142         // 设置Map输出类型

143         job.setMapOutputKeyClass(IntWritable.class);

144         job.setMapOutputValueClass(Text.class);

145 

146         // 设置Reduce输出类型

147         job.setOutputKeyClass(Text.class);

148         job.setOutputValueClass(IntWritable.class);

149 

150         //设置MultipleOutputs的输出格式

151         //这里利用MultipleOutputs进行对文件输出

152         MultipleOutputs.addNamedOutput(job,"topKMOS",TextOutputFormat.class,Text.class,Text.class);

153         

154         // 设置输入和输出目录

155         FileInputFormat.addInputPath(job, new Path(in));

156         FileOutputFormat.setOutputPath(job, outPath);

157         job.waitForCompletion(true);

158 

159     }

160 

161 }

自己封装的Int

 1 package TopK;

 2 

 3 public class MyInt implements Comparable<MyInt>{

 4     private Integer value;

 5 

 6     public MyInt(Integer value){

 7         this.value = value;

 8     }

 9     

10     public int getValue() {

11         return value;

12     }

13 

14     public void setValue(int value) {

15         this.value = value;

16     }

17 

18     @Override

19     public int compareTo(MyInt o) {

20         return value.compareTo(o.getValue());

21     }

22     

23     

24 }

运行入口

 1 package TopK;

 2 

 3 import java.io.IOException;

 4 

 5 /**

 6  * 

 7  * @author zx

 8  *[email protected]

 9  */

10 public class TopK {

11     public static void main(String args[]) throws ClassNotFoundException, IOException, InterruptedException{

12         

13         //要统计字数,排序的文字

14         String in = "hdfs://localhost:9000/input/MaDing.text";

15         

16         //统计字数后的结果

17         String wordCout = "hdfs://localhost:9000/out/wordCout";

18         

19         //对统计完后的结果再排序后的内容

20         String sort = "hdfs://localhost:9000/out/sort";

21         

22         //前K条

23         String topK = "hdfs://localhost:9000/out/topK";

24         

25         //如果统计字数的job完成后就开始排序

26         if(WordCount.run(in, wordCout)){

27             Sort.run(wordCout, sort,topK);

28         }

29         

30     }

31 }

 

你可能感兴趣的:(mapreduce)