1.应用场景:MR默认会对键进行排序,然而有时我们也需要对值进行排序。一种做法是在reduce阶段对收集到的values进行排序,但如果values数量巨大,就可能导致内存溢出等问题。这正是二次排序的应用场景——将对值的排序也交给MR的计算过程来完成,而不是在reduce中单独处理。
2.需求:
name money
zhangsan 125
lisi 135
wangwu 60
zhangsan 56
wangwu 80
lisi 650
zhangsan 50
wangwu 6
lisi 900
二次排序:第一次排序要求按照姓名的字典序进行排序
第二次排序要求对同一个人的消费金额按从小到大的顺序进行排序
分析实现的思路:将原来的key与value组合成新的key,原来的value仍然作为value
key#value value
统计结果
lisi 135
lisi 650
lisi 900
wangwu 6
wangwu 60
wangwu 80
zhangsan 50
zhangsan 56
zhangsan 125
构造(name,money)作为key, money作为value,然后进入map流程
新建一个Bean类,定义name和money属性,重写hashCode、equals、compareTo方法;在compareTo中先比较name,name相同时再比较money
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
 * Composite key made of a name and a money amount.
 *
 * <p>Implements {@code WritableComparable} so it can be serialized and compared as a
 * MapReduce key. The natural ordering is by {@code name} first and, for equal names,
 * by {@code money} in ascending order.
 *
 * <p>{@code name} must be non-null before {@link #write(DataOutput)} or
 * {@link #compareTo(PairWritable)} is called; both dereference it directly.
 */
public class PairWritable implements WritableComparable<PairWritable> {
    private String name;
    private int money;

    /** Creates an empty pair; fields are filled later via {@link #set} or {@link #readFields}. */
    public PairWritable() {
    }

    /**
     * Creates a pair with the given contents.
     *
     * @param name  the name part of the key
     * @param money the money part of the key
     */
    public PairWritable(String name, int money) {
        this.name = name;
        this.money = money;
    }

    /**
     * Replaces both fields at once, allowing one instance to be reused across records.
     *
     * @param name  the name part of the key
     * @param money the money part of the key
     */
    public void set(String name, int money) {
        this.name = name;
        this.money = money;
    }

    /** @return the name part of the key */
    public String getName() {
        return name;
    }

    /** @param name the name part of the key */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the money part of the key */
    public int getMoney() {
        return money;
    }

    /** @param money the money part of the key */
    public void setMoney(int money) {
        this.money = money;
    }

    /**
     * Serializes the name (modified UTF-8) followed by the money amount.
     *
     * @param out the sink to write to
     * @throws IOException if writing fails
     */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(name);
        out.writeInt(money);
    }

    /**
     * Restores the fields in the same order {@link #write(DataOutput)} emitted them.
     *
     * @param in the source to read from
     * @throws IOException if reading fails
     */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.name = in.readUTF();
        this.money = in.readInt();
    }

    /** Hash over both fields; consistent with {@link #equals(Object)}. */
    @Override
    public int hashCode() {
        final int prime = 31;
        int result = 1;
        result = prime * result + money;
        result = prime * result + ((name == null) ? 0 : name.hashCode());
        return result;
    }

    /** Two pairs are equal when they are of the same class and both fields match. */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (obj == null || getClass() != obj.getClass()) {
            return false;
        }
        PairWritable other = (PairWritable) obj;
        if (money != other.money) {
            return false;
        }
        return (name == null) ? other.name == null : name.equals(other.name);
    }

    /**
     * Orders by name, then by money ascending.
     *
     * @param o the pair to compare against
     * @return negative, zero, or positive as this pair sorts before, equal to, or after {@code o}
     */
    @Override
    public int compareTo(PairWritable o) {
        // Primary field: name.
        int byName = this.name.compareTo(o.getName());
        if (byName != 0) {
            return byName;
        }
        // Secondary field: money. "this" goes first so smaller amounts sort first;
        // swapping the operands would silently reverse the order.
        return Integer.compare(this.money, o.getMoney());
    }

    @Override
    public String toString() {
        return "PairWritable [name=" + name + ", money=" + money + "]";
    }
}
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.Tool;
/**
 * Driver, mapper and reducer for a secondary-sort job.
 *
 * <p>The mapper turns each {@code name<TAB>money} line into a {@link PairWritable} key
 * (name, money) with the money as value; the reducer writes {@code name, money} pairs in
 * the order the keys arrive. No custom partitioner or grouping comparator is configured
 * here, so every distinct (name, money) key forms its own reduce group.
 *
 * <p>NOTE(review): with more than one reducer, partitioning presumably hashes the whole
 * key, so records of one name may end up in different output files — confirm before
 * raising the reducer count.
 */
public class SecondaryMR extends Configured implements Tool {

    /** Input path used by {@link #main(String[])} when no arguments are supplied. */
    private static final String DEFAULT_INPUT = "hdfs://ns1/input/Secondary.txt";

    /** Output path used by {@link #main(String[])} when no arguments are supplied. */
    private static final String DEFAULT_OUTPUT = "hdfs://ns1/output";

    /** Emits (name, money) as the key and money as the value for each input line. */
    public static class SecondaryMapper
            extends Mapper<LongWritable, Text, PairWritable, IntWritable> {
        // Reused across map() calls to avoid allocating per record.
        private final PairWritable mapOutputKey = new PairWritable();
        private final IntWritable mapOutputValue = new IntWritable();

        /**
         * Parses one tab-separated line and writes the composite key with its amount.
         *
         * @param key     the input key supplied by the input format (unused)
         * @param value   one line of text: name, a tab, then an integer amount
         * @param context sink for the map output
         * @throws IOException          if writing the output fails
         * @throws InterruptedException if the task is interrupted
         */
        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            // Split the line into its tab-separated fields.
            String[] fields = value.toString().split("\t");
            // Parse the amount once; a malformed line still fails the task as before.
            int money = Integer.parseInt(fields[1]);
            mapOutputKey.set(fields[0], money);
            mapOutputValue.set(money);
            context.write(mapOutputKey, mapOutputValue);
        }
    }

    /** Writes one (name, money) record per incoming value. */
    public static class SecondaryReducer
            extends Reducer<PairWritable, IntWritable, Text, IntWritable> {
        private final Text outputKey = new Text();

        /**
         * Emits the key's name together with every value of the group.
         *
         * <p>The amount could equally be read from {@code key.getMoney()}; this
         * implementation takes it from the iterated values instead.
         *
         * @param key     composite (name, money) key of the group
         * @param values  amounts belonging to the group
         * @param context sink for the reduce output
         * @throws IOException          if writing the output fails
         * @throws InterruptedException if the task is interrupted
         */
        @Override
        protected void reduce(PairWritable key, Iterable<IntWritable> values, Context context)
                throws IOException, InterruptedException {
            System.out.print(key + "\t");
            for (IntWritable value : values) {
                // Name is re-read on every iteration rather than cached before the loop.
                outputKey.set(key.getName());
                context.write(outputKey, value);
                System.out.print(value + "\t");
            }
        }
    }

    /**
     * Configures and runs the job (environment, input/output paths, map and reduce classes).
     *
     * <p>An existing output directory is deleted recursively before the job starts.
     *
     * @param args {@code args[0]} is the input path, {@code args[1]} the output path
     * @return 0 when the job succeeds, 1 when it fails, 2 when the arguments are missing
     * @throws IOException            if the file system or job submission fails
     * @throws InterruptedException   if waiting for the job is interrupted
     * @throws ClassNotFoundException if a job class cannot be loaded
     */
    @Override
    public int run(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        if (args == null || args.length < 2) {
            System.err.println("Usage: " + getClass().getSimpleName() + " <input path> <output path>");
            return 2;
        }

        // 1. Obtain the configuration: prefer the one injected through Configured,
        //    fall back to a fresh one when the tool is instantiated directly.
        Configuration conf = getConf();
        if (conf == null) {
            conf = new Configuration();
        }
        // Example overrides, left disabled:
        // conf.set("yarn.resourcemanager.hostname","linux1");
        // conf.set("mapreduce.framework.name", "yarn");
        // conf.set("yarn.nodemanager.aux-services", "mapreduce_shuffle");

        // 2. Create the job.
        Job job = Job.getInstance(conf, this.getClass().getSimpleName());
        job.setJarByClass(this.getClass());

        // 3. Wire the pipeline: input -> map -> reduce -> output.
        // 3.1 input
        Path inPath = new Path(args[0]);
        FileInputFormat.addInputPath(job, inPath);

        // 3.2 map class
        job.setMapperClass(SecondaryMapper.class);
        job.setMapOutputKeyClass(PairWritable.class);
        job.setMapOutputValueClass(IntWritable.class);

        // 3.3 reduce class
        job.setReducerClass(SecondaryReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // 3.4 output: remove a previous run's directory so the job can write to it again.
        Path outPath = new Path(args[1]);
        FileSystem fs = outPath.getFileSystem(conf);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        FileOutputFormat.setOutputPath(job, outPath);

        // 4. Submit the job and wait for it to finish.
        boolean isSuccess = job.waitForCompletion(true);
        return isSuccess ? 0 : 1;
    }

    /**
     * Command-line entry point.
     *
     * @param args input and output paths; when none are given the built-in default
     *             HDFS paths are used
     * @throws ClassNotFoundException if a job class cannot be loaded
     * @throws IOException            if the file system or job submission fails
     * @throws InterruptedException   if waiting for the job is interrupted
     */
    public static void main(String[] args)
            throws ClassNotFoundException, IOException, InterruptedException {
        // Only fall back to the defaults when the caller passed nothing, so paths
        // given on the command line are honored.
        if (args == null || args.length == 0) {
            args = new String[] {DEFAULT_INPUT, DEFAULT_OUTPUT};
        }
        int status = new SecondaryMR().run(args);
        System.exit(status);
    }
}