1. Requirements and implementation approach
(1) Requirements
There are two data files: order data and product information.
Order data table: order
Product information table: product
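For illustration, assume both files are plain text with one record per line and space-separated fields, in the order the mappers below parse them (these rows are made up):

order (order_id date pid amount):
1001 20150710 P0001 2
1002 20150710 P0002 3

product (pid pname category_id price):
P0001 xiaomi 1000 1999.9
P0002 apple 1000 5999.9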
A MapReduce program is needed to implement the following SQL join query:
select o.id order_id, o.date, o.amount, p.id p_id, p.pname, p.category_id, p.price
from t_order o join t_product p on o.pid = p.id
(2) Implementation approach
The result of this SQL query is, in effect, each order record supplemented with the matching fields from the product table.
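With the made-up rows above, the join produces one output row per order, in the column order of the SQL select list:

1001 20150710 2 P0001 xiaomi 1000 1999.9
1002 20150710 3 P0002 apple 1000 5999.9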
The approach, step by step:
1) Define a bean
Wrap the columns of the SQL result into a bean class that implements Hadoop's Writable serialization.
The bean also needs an extra flag field that records whether the object carries order data or product data.
2) Map phase
The map side processes records from both files; because each input file is bound to its own mapper (via MultipleInputs below), we know whether the current record is an order or a product.
For each record, create a bean, fill in the corresponding fields, and set the flag ("0" for order, "1" for product).
Emit the join key, the product id "pid", as the output key and the bean as the value.
3) Reduce phase
Each reduce() call receives the group of beans that share one pid.
Iterate over the beans: order beans are copied into a new list, while the product bean is copied into a standalone product bean. Then walk the order list and complete each order bean with the product bean's fields.
The result is every order record joined with its product information.
package join;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

// Bean for one joined record; flag marks the source of the data: "0" = order, "1" = product.
public class InfoBean implements Writable {

    private int order_id;
    private String dateString;
    private String p_id;
    private int amount;
    private String pname;
    private int category_id;
    private float price;
    private String flag;

    public InfoBean() {
    }

    public void set(int order_id, String dateString, String p_id, int amount,
            String pname, int category_id, float price, String flag) {
        this.order_id = order_id;
        this.dateString = dateString;
        this.p_id = p_id;
        this.amount = amount;
        this.pname = pname;
        this.category_id = category_id;
        this.price = price;
        this.flag = flag;
    }

    // Deserialization: fields must be read in exactly the order write() emits them.
    public void readFields(DataInput in) throws IOException {
        this.order_id = in.readInt();
        this.dateString = in.readUTF();
        this.p_id = in.readUTF();
        this.amount = in.readInt();
        this.pname = in.readUTF();
        this.category_id = in.readInt();
        this.price = in.readFloat();
        this.flag = in.readUTF();
    }

    public void write(DataOutput out) throws IOException {
        out.writeInt(order_id);
        out.writeUTF(dateString);
        out.writeUTF(p_id);
        out.writeInt(amount);
        out.writeUTF(pname);
        out.writeInt(category_id);
        out.writeFloat(price);
        out.writeUTF(flag);
    }

    public int getOrder_id() {
        return order_id;
    }

    public void setOrder_id(int order_id) {
        this.order_id = order_id;
    }

    public String getDateString() {
        return dateString;
    }

    public void setDateString(String dateString) {
        this.dateString = dateString;
    }

    public String getP_id() {
        return p_id;
    }

    public void setP_id(String p_id) {
        this.p_id = p_id;
    }

    public int getAmount() {
        return amount;
    }

    public void setAmount(int amount) {
        this.amount = amount;
    }

    public String getPname() {
        return pname;
    }

    public void setPname(String pname) {
        this.pname = pname;
    }

    public int getCategory_id() {
        return category_id;
    }

    public void setCategory_id(int category_id) {
        this.category_id = category_id;
    }

    public float getPrice() {
        return price;
    }

    public void setPrice(float price) {
        this.price = price;
    }

    public String getFlag() {
        return flag;
    }

    public void setFlag(String flag) {
        this.flag = flag;
    }

    @Override
    public String toString() {
        return "InfoBean [order_id=" + order_id + ", dateString=" + dateString
                + ", p_id=" + p_id + ", amount=" + amount + ", pname=" + pname
                + ", category_id=" + category_id + ", price=" + price + "]";
    }
}
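Since readFields() must consume fields in exactly the order and types that write() produces, a quick local round-trip check catches mismatches before the job runs on the cluster. This is just a sanity-check sketch placed alongside InfoBean, not part of the job itself; the values are arbitrary:

package join;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;

public class InfoBeanCheck {
    public static void main(String[] args) throws IOException {
        InfoBean before = new InfoBean();
        before.set(1001, "20150710", "P0001", 2, "", 0, 0f, "0");
        // Serialize with write() ...
        ByteArrayOutputStream buf = new ByteArrayOutputStream();
        before.write(new DataOutputStream(buf));
        // ... then read it back with readFields(); the printed fields should match.
        InfoBean after = new InfoBean();
        after.readFields(new DataInputStream(new ByteArrayInputStream(buf.toByteArray())));
        System.out.println(after);
    }
}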
package join;

import java.io.IOException;
import java.util.ArrayList;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.MultipleInputs;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class JoinMR {

    // Mapper for the order file: fields are order_id, date, pid, amount (space-separated).
    static class JoinMRMapper_1 extends Mapper<LongWritable, Text, Text, InfoBean> {
        InfoBean bean = new InfoBean();
        Text k = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] fields = line.split(" ");
            // Product fields are left empty; flag "0" marks this as an order record.
            bean.set(Integer.parseInt(fields[0]), fields[1], fields[2],
                    Integer.parseInt(fields[3]), "", 0, 0f, "0");
            k.set(fields[2]); // the join key: product id
            context.write(k, bean);
        }
    }

    // Mapper for the product file: fields are pid, pname, category_id, price (space-separated).
    static class JoinMRMapper_2 extends Mapper<LongWritable, Text, Text, InfoBean> {
        InfoBean bean = new InfoBean();
        Text k = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] fields = line.split(" ");
            // Order fields are left empty; flag "1" marks this as a product record.
            bean.set(0, "", fields[0], 0, fields[1], Integer.parseInt(fields[2]),
                    Float.parseFloat(fields[3]), "1");
            k.set(fields[0]); // the join key: product id
            context.write(k, bean);
        }
    }

    static class JoinMRReducer extends Reducer<Text, InfoBean, InfoBean, NullWritable> {
        @Override
        protected void reduce(Text pid, Iterable<InfoBean> beans, Context context)
                throws IOException, InterruptedException {
            InfoBean pdBean = new InfoBean();
            ArrayList<InfoBean> orderBeans = new ArrayList<>();
            try {
                for (InfoBean bean : beans) {
                    if ("1".equals(bean.getFlag())) {
                        // The (single) product bean for this pid.
                        BeanUtils.copyProperties(pdBean, bean);
                    } else {
                        // Order beans must be copied: the framework reuses the iterated object.
                        InfoBean odbean = new InfoBean();
                        BeanUtils.copyProperties(odbean, bean);
                        orderBeans.add(odbean);
                    }
                }
            } catch (Exception e) {
                throw new IOException(e);
            }
            // Complete every order bean with the product's fields, then emit it.
            for (InfoBean bean : orderBeans) {
                bean.setPname(pdBean.getPname());
                bean.setCategory_id(pdBean.getCategory_id());
                bean.setPrice(pdBean.getPrice());
                context.write(bean, NullWritable.get());
            }
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);

        // Delete the output directory if it already exists.
        FileSystem fs = FileSystem.get(conf);
        if (fs.exists(new Path(args[args.length - 1]))) {
            fs.delete(new Path(args[args.length - 1]), true);
        }

        job.setJarByClass(JoinMR.class);
        job.setJobName("JoinMR");

        // Bind each input path to its own mapper instead of a single setMapperClass().
        MultipleInputs.addInputPath(job, new Path(args[0]), TextInputFormat.class, JoinMRMapper_1.class);
        MultipleInputs.addInputPath(job, new Path(args[1]), TextInputFormat.class, JoinMRMapper_2.class);
        job.setReducerClass(JoinMRReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(InfoBean.class);
        job.setOutputKeyClass(InfoBean.class);
        job.setOutputValueClass(NullWritable.class);

        FileOutputFormat.setOutputPath(job, new Path(args[args.length - 1]));

        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}
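Assuming the classes are packaged into a jar named join.jar (the jar name and HDFS paths below are illustrative, not from the original), the job takes the two input directories followed by the output directory:

hadoop jar join.jar join.JoinMR /join/input/order /join/input/product /join/output

The joined records land in /join/output as one InfoBean per line, each order row completed with its product's pname, category_id, and price.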