(MapReduce case study) Using a GroupingComparator to find the largest-amount transaction within each order

Given the following order data:

Order id         Product id    Amount
Order_0000001    Pdt_01        222.8
Order_0000001    Pdt_05        25.8
Order_0000002    Pdt_03        522.8
Order_0000002    Pdt_04        122.4
Order_0000002    Pdt_05        722.4
Order_0000003    Pdt_01        222.8

 

We now need to find, for each order, the single transaction with the largest amount.
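
Assuming each record is stored as one comma-separated line in the input file (the mapper below splits on ","), the raw input would look like this:

Order_0000001,Pdt_01,222.8
Order_0000001,Pdt_05,25.8
Order_0000002,Pdt_03,522.8
Order_0000002,Pdt_04,122.4
Order_0000002,Pdt_05,722.4
Order_0000003,Pdt_01,222.8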


Analysis:

1. Use a bean holding the order id and the amount as the map output key, so that all records read in the map phase are partitioned by order id and sorted by amount (descending) before being sent to the reducers.

2. On the reduce side, use a GroupingComparator to group the key-value pairs that share the same order id; the first record in each group is then the maximum.
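
For the sample data above, the expected output is one line per order (OrderBean.toString() prints the order id and the amount separated by a tab):

Order_0000001	222.8
Order_0000002	722.4
Order_0000003	222.8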


The GroupingComparator is used to group the data when reduce() is called:

package cn.itcastcat.bigdata.secondarysort;

import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.io.WritableComparator;

/**
 * A reduce-side GroupingComparator that makes a group of beans look like one single key
 * @author [email protected]
 */
public class ItemidGroupingComparator extends WritableComparator {

    // Pass in the class of the bean used as the key, and tell the framework
    // (via the "true" flag) to create instances of it by reflection
    protected ItemidGroupingComparator() {
        super(OrderBean.class, true);
    }

    @Override
    public int compare(WritableComparable a, WritableComparable b) {
        OrderBean abean = (OrderBean) a;
        OrderBean bbean = (OrderBean) b;

        // When comparing two beans, only compare the order id;
        // beans with the same order id are placed in the same group
        return abean.getItemid().compareTo(bbean.getItemid());
    }

}
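
Note that the grouping comparator is only applied to adjacent keys in the reducer's sorted input stream, so grouping by order id works here precisely because OrderBean.compareTo() already sorts records by order id first (and by amount within an order).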


The Partitioner is used to assign records to partitions:

package cn.itcastcat.bigdata.secondarysort;

import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Partitioner;

public class ItemIdPartitioner extends Partitioner<OrderBean, NullWritable> {

    @Override
    public int getPartition(OrderBean bean, NullWritable value, int numReduceTasks) {
        // Order beans with the same order id are sent to the same partition,
        // and the number of partitions matches the number of reduce tasks set by the user
        return (bean.getItemid().hashCode() & Integer.MAX_VALUE) % numReduceTasks;
    }

}
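
Partitioning on the order id (rather than on the whole bean) guarantees that all records belonging to one order end up on the same reduce task, which is what makes the grouping step possible when more than one reducer is used (the driver below configures two).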


The compareTo() method of WritableComparable is used for sorting:

package cn.itcastcat.bigdata.secondarysort;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableComparable;

/**
 * @author [email protected]
 */
public class OrderBean implements WritableComparable<OrderBean> {

    private Text itemid;
    private DoubleWritable amount;

    public OrderBean() {
    }

    public OrderBean(Text itemid, DoubleWritable amount) {
        set(itemid, amount);
    }

    public void set(Text itemid, DoubleWritable amount) {
        this.itemid = itemid;
        this.amount = amount;
    }

    public Text getItemid() {
        return itemid;
    }

    public DoubleWritable getAmount() {
        return amount;
    }

    @Override
    public int compareTo(OrderBean o) {
        // Sort by order id first; within the same order, sort by amount in
        // descending order so the largest amount comes first
        int cmp = this.itemid.compareTo(o.getItemid());
        if (cmp == 0) {
            cmp = -this.amount.compareTo(o.getAmount());
        }
        return cmp;
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(itemid.toString());
        out.writeDouble(amount.get());
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        String readUTF = in.readUTF();
        double readDouble = in.readDouble();

        this.itemid = new Text(readUTF);
        this.amount = new DoubleWritable(readDouble);
    }

    @Override
    public String toString() {
        return itemid.toString() + "\t" + amount.get();
    }

}


The driver program:

package cn.itcastcat.bigdata.secondarysort;

import java.io.IOException;

import org.apache.commons.lang.StringUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.DoubleWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

/**
 * @author [email protected]
 */
public class SecondarySort {

    static class SecondarySortMapper extends Mapper<LongWritable, Text, OrderBean, NullWritable> {

        OrderBean bean = new OrderBean();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {

            // Each input line is "orderId,productId,amount"; the key carries the order id and the amount
            String line = value.toString();
            String[] fields = StringUtils.split(line, ",");

            bean.set(new Text(fields[0]), new DoubleWritable(Double.parseDouble(fields[2])));

            context.write(bean, NullWritable.get());
        }

    }

    static class SecondarySortReducer extends Reducer<OrderBean, NullWritable, OrderBean, NullWritable> {

        // By the time reduce() is called, all beans with the same order id are treated as one group,
        // and the bean with the largest amount is the first in that group
        @Override
        protected void reduce(OrderBean key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
            context.write(key, NullWritable.get());
        }
    }

    public static void main(String[] args) throws Exception {

        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        job.setJarByClass(SecondarySort.class);

        job.setMapperClass(SecondarySortMapper.class);
        job.setReducerClass(SecondarySortReducer.class);

        job.setOutputKeyClass(OrderBean.class);
        job.setOutputValueClass(NullWritable.class);

        FileInputFormat.setInputPaths(job, new Path("c:/wordcount/gpinput"));
        FileOutputFormat.setOutputPath(job, new Path("c:/wordcount/gpoutput"));

        // Register the custom GroupingComparator
        job.setGroupingComparatorClass(ItemidGroupingComparator.class);
        // Register the custom Partitioner
        job.setPartitionerClass(ItemIdPartitioner.class);

        job.setNumReduceTasks(2);

        job.waitForCompletion(true);

    }

}
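
Since two reduce tasks are configured, the output directory will contain two result files (part-r-00000 and part-r-00001), each holding the maximum-amount line for the orders that hashed to that partition.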
