Custom Key:
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class SecondSortClass implements WritableComparable<SecondSortClass> {

    /**
     * Fields of the custom key type; in this example both are used for
     * sorting. Later examples will add fields serving other purposes.
     */
    private int first;
    private String second;

    public SecondSortClass() {}

    public SecondSortClass(int first, String second) {
        this.first = first;
        this.second = second;
    }

    /**
     * Deserialization: rebuild the custom key from the binary stream.
     */
    @Override
    public void readFields(DataInput input) throws IOException {
        this.first = input.readInt();
        this.second = input.readUTF();
    }

    /**
     * Serialization: turn the custom key into binary for transmission.
     */
    @Override
    public void write(DataOutput output) throws IOException {
        output.writeInt(first);
        output.writeUTF(second);
    }

    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + first;
        result = prime * result + ((second == null) ? 0 : second.hashCode());
        return result;
    }

    @Override
    public boolean equals(Object obj) {
        if (this == obj)
            return true;
        if (obj == null || getClass() != obj.getClass())
            return false;
        SecondSortClass other = (SecondSortClass) obj;
        if (first != other.first)
            return false;
        if (second == null)
            return other.second == null;
        return second.equals(other.second);
    }

    /**
     * Drives the sort in the map and reduce phases, and (since no separate
     * grouping comparator is registered) the grouping of keys in the reduce
     * phase. Both fields are compared in descending order.
     */
    @Override
    public int compareTo(SecondSortClass o) {
        if (this.first != o.getFirst()) {
            // compare rather than subtract, to avoid int overflow
            return this.first > o.getFirst() ? -1 : 1;
        } else if (!this.second.equals(o.getSecond())) {
            return -this.second.compareTo(o.getSecond());
        }
        return 0;
    }

    public int getFirst() {
        return first;
    }

    public void setFirst(int first) {
        this.first = first;
    }

    public String getSecond() {
        return second;
    }

    public void setSecond(String second) {
        this.second = second;
    }
}
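Because both comparisons are inverted, keys sort from the largest cost down, which is what lets the reducer simply emit the first K records it sees. A minimal standalone sketch of the resulting order (the SortOrderDemo class is only for illustration, not part of the job):

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class SortOrderDemo {
    public static void main(String[] args) {
        List<SecondSortClass> keys = new ArrayList<SecondSortClass>();
        keys.add(new SecondSortClass(123, "mr2"));
        keys.add(new SecondSortClass(12345, "mr5"));
        keys.add(new SecondSortClass(12345, "mr8"));
        Collections.sort(keys);              // uses SecondSortClass.compareTo
        for (SecondSortClass k : keys) {
            System.out.println(k.getFirst() + "," + k.getSecond());
        }
        // prints, descending by cost and then by name:
        // 12345,mr8
        // 12345,mr5
        // 123,mr2
    }
}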
Map phase:
import java.io.IOException;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

public class SecondMapper extends Mapper<LongWritable, Text, SecondSortClass, Text> {

    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        String line = value.toString().trim();
        if (line.length() > 0) {
            // input format: uid,name,cost (cost is assumed to be numeric)
            String[] arr = line.split(",");
            if (arr.length == 3) {
                // the key carries (cost, name) so the framework sorts for us;
                // the value keeps "name,cost" for the final output
                context.write(new SecondSortClass(Integer.valueOf(arr[2]), arr[1]),
                        new Text(arr[1] + "," + arr[2]));
            }
        }
    }
}
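To check the mapper in isolation, one could write a small MRUnit test; a sketch, assuming the MRUnit library (org.apache.mrunit) and JUnit are on the classpath:

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mrunit.mapreduce.MapDriver;
import org.junit.Test;

public class SecondMapperTest {
    @Test
    public void emitsCompositeKeyAndValue() throws Exception {
        MapDriver.newMapDriver(new SecondMapper())
                .withInput(new LongWritable(0), new Text("1,mr1,3234"))
                // cost and name travel in the key; the value is "name,cost"
                .withOutput(new SecondSortClass(3234, "mr1"), new Text("mr1,3234"))
                .runTest();
    }
}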
Reduce phase:
import java.io.IOException;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

public class SecondReducer extends Reducer<SecondSortClass, Text, NullWritable, Text> {

    int len;

    /**
     * Called once when the reduce task starts.
     */
    @Override
    protected void setup(Context context) throws IOException, InterruptedException {
        // read the K of TopK that the driver put into the job configuration
        len = context.getConfiguration().getInt("K", 10);
    }

    @Override
    protected void reduce(SecondSortClass key, Iterable<Text> values, Context context)
            throws IOException, InterruptedException {
        // Keys arrive in descending order, so emitting the first K records
        // yields the TopK. Each (cost, name) key forms its own group here,
        // so every group holds exactly one value.
        if (len > 0) {
            context.write(NullWritable.get(), values.iterator().next());
            len--;
        }
        // If a grouping comparator merged several records into one group,
        // one would loop over the values instead:
        // for (Text val : values) {
        //     if (len <= 0) {
        //         break;
        //     }
        //     context.write(NullWritable.get(), val);
        //     len--;
        // }
    }
}
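Because no grouping comparator is registered, grouping falls back to SecondSortClass.compareTo, and ties in cost (for example the two 12345 rows in the sample data) still arrive as separate groups. If one instead wanted to group by cost alone, the classic secondary-sort move, a comparator along these lines could be registered in the driver; a sketch, not part of the original job:

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

public class FirstGroupingComparator extends WritableComparator {

    protected FirstGroupingComparator() {
        super(SecondSortClass.class, true); // true: create key instances to compare
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        SecondSortClass left = (SecondSortClass) a;
        SecondSortClass right = (SecondSortClass) b;
        // group purely on the numeric field, descending to match the sort order
        if (left.getFirst() == right.getFirst()) {
            return 0;
        }
        return left.getFirst() > right.getFirst() ? -1 : 1;
    }
}

// registered in the driver with:
// job.setGroupingComparatorClass(FirstGroupingComparator.class);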
Driver:
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JobMain {

    public static void main(String[] args) throws Exception {
        Configuration configuration = new Configuration();
        /*
         * Put the command-line argument into the Configuration; the mapper
         * or reducer can then read it back through the Configuration. This
         * is one of the ways Hadoop passes parameters into tasks.
         */
        configuration.set("K", args[2]);
        Job job = new Job(configuration, "third-sort-job");
        job.setJarByClass(JobMain.class);

        job.setMapperClass(SecondMapper.class);
        job.setMapOutputKeyClass(SecondSortClass.class);
        job.setMapOutputValueClass(Text.class);

        job.setReducerClass(SecondReducer.class);
        // the reducer emits NullWritable keys, so declare NullWritable here
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);

        FileInputFormat.addInputPath(job, new Path(args[0]));
        Path outputDir = new Path(args[1]);
        FileSystem fs = FileSystem.get(configuration);
        if (fs.exists(outputDir)) {
            fs.delete(outputDir, true);  // remove any stale output directory
        }
        FileOutputFormat.setOutputPath(job, outputDir);

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
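Since configuration.set is only one of the ways to pass parameters, a common alternative is to extend Configured and implement Tool, letting ToolRunner pick K up from a generic -D option instead of a positional argument. A hypothetical variant (JobMainTool is not in the original code):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;

public class JobMainTool extends Configured implements Tool {

    @Override
    public int run(String[] args) throws Exception {
        // getConf() already contains anything passed as -D name=value
        Configuration conf = getConf();
        Job job = new Job(conf, "third-sort-job");
        job.setJarByClass(JobMainTool.class);
        job.setMapperClass(SecondMapper.class);
        job.setMapOutputKeyClass(SecondSortClass.class);
        job.setMapOutputValueClass(Text.class);
        job.setReducerClass(SecondReducer.class);
        job.setOutputKeyClass(NullWritable.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.addInputPath(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        return job.waitForCompletion(true) ? 0 : 1;
    }

    public static void main(String[] args) throws Exception {
        // e.g. ./hadoop jar mr.jar ...JobMainTool -D K=3 /input/two /output/two14
        System.exit(ToolRunner.run(new Configuration(), new JobMainTool(), args));
    }
}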
Run command (arguments: input path, output path, K):
./hadoop jar mr.jar com.seven.mapreduce.test1.JobMain /input/two /output/two14 3
Input data (columns: uid,name,cost):
1,mr1,3234
2,mr2,123
3,mr3,9877
4,mr4,348
5,mr5,12345
6,mr6,6646
7,mr7,98
8,mr8,12345
Run result:
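With K=3, the expected output (descending by cost, ties broken by descending name, as derived from the comparator above) would be:

mr8,12345
mr5,12345
mr3,9877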