MapReduce Hands-on Exercise 2: Joining Two Tables

Requirement:

Order table t_order:

id      date        pid      amount
1001    20150710    P0001    2
1002    20150710    P0001    3
1002    20150710    P0002    3

Product information table t_product:

id       pname     category_id    price
P0001    小米5      1000           2
P0002    锤子T1     1000           3

Suppose the data volume is huge and both tables are stored as files in HDFS. We need a MapReduce program to implement the following SQL query:

select a.id, a.date, b.pname, b.category_id, b.price from t_order a join t_product b on a.pid = b.id

(Fields in the test files are separated by commas.)
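For reference, the two input files might look like this (the names order.txt and product.txt are assumptions; the mapper below only requires that the order file's name start with "order"):

order.txt:

1001,20150710,P0001,2
1002,20150710,P0001,3
1002,20150710,P0002,3

product.txt:

P0001,小米5,1000,2
P0002,锤子T1,1000,3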


InfoBean encapsulates the fields of both tables:

package com.bpf.mr.rjoin;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.Writable;

public class InfoBean implements Writable {

    private int order_id;
    private String dateString;
    private String p_id;
    private int amount;
    
    private String pname;
    private int category_id;
    private float price;
    
    //flag = 0 means this object holds an order record
    //flag = 1 means this object holds a product record
    private int flag;
    public InfoBean() {}
   
    
    
    
    public void set(int order_id, String dateString, String p_id, int amount, String pname, int category_id, float price, int flag) {
        this.order_id = order_id;
        this.dateString = dateString;
        this.p_id = p_id;
        this.amount = amount;
        this.pname = pname;
        this.category_id = category_id;
        this.price = price;
        this.flag = flag;
    }



    public int getOrder_id() {
        return order_id;
    }



    public void setOrder_id(int order_id) {
        this.order_id = order_id;
    }



    public String getDateString() {
        return dateString;
    }



    public void setDateString(String dateString) {
        this.dateString = dateString;
    }



    public String getP_id() {
        return p_id;
    }



    public void setP_id(String p_id) {
        this.p_id = p_id;
    }



    public int getAmount() {
        return amount;
    }



    public void setAmount(int amount) {
        this.amount = amount;
    }



    public String getPname() {
        return pname;
    }



    public void setPname(String pname) {
        this.pname = pname;
    }



    public int getCategory_id() {
        return category_id;
    }



    public void setCategory_id(int category_id) {
        this.category_id = category_id;
    }



    public float getPrice() {
        return price;
    }



    public void setPrice(float price) {
        this.price = price;
    }


    public int getFlag() {
        return flag;
    }


    public void setFlag(int flag) {
        this.flag = flag;
    }




    @Override
    public void readFields(DataInput in) throws IOException {
        this.order_id = in.readInt();
        this.dateString = in.readUTF();
        this.p_id = in.readUTF();
        this.amount = in.readInt();
        this.pname = in.readUTF();
        this.category_id = in.readInt();
        this.price = in.readFloat();
        this.flag = in.readInt();
        
    }

    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(order_id);
        out.writeUTF(dateString);
        out.writeUTF(p_id);
        out.writeInt(amount);
        out.writeUTF(pname);
        out.writeInt(category_id);
        out.writeFloat(price);
        out.writeInt(flag);
        
    }

    @Override
    public String toString() {
        return "order_id=" + order_id + ", dateString=" + dateString + ", p_id=" + p_id + ", amount=" + amount + ", pname=" + pname + ", category_id=" + category_id + ", price=" + price + ", flag=" + flag;
    }

    
}
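One detail worth keeping in mind: the field order written in write() must exactly match the order read back in readFields(), otherwise the bean is silently deserialized with scrambled fields. Below is a minimal, self-contained sketch for sanity-checking the round trip; the class InfoBeanRoundTrip is purely illustrative and not part of the original job.

package com.bpf.mr.rjoin;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;

public class InfoBeanRoundTrip {

    public static void main(String[] args) throws Exception {
        InfoBean in = new InfoBean();
        in.set(1001, "20150710", "P0001", 2, "", 0, 0, 0);

        // Serialize with write(), exactly as the MapReduce framework would
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        in.write(new DataOutputStream(bytes));

        // Deserialize into a fresh bean with readFields()
        InfoBean out = new InfoBean();
        out.readFields(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())));

        // If write() and readFields() agree on field order, the two printed lines match
        System.out.println(in);
        System.out.println(out);
    }
}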


MapReduce code:

package com.bpf.mr.rjoin;

import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;


public class Rjoin {

    static class RjoinMapper extends Mapper<LongWritable, Text, Text, InfoBean> {
        
        InfoBean bean = new InfoBean();
        Text t = new Text();
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
                
            String line = value.toString();
            
            FileSplit split = (FileSplit) context.getInputSplit();
            String name = split.getPath().getName();
            
            String pid = "";
            //Determine which table this line comes from by the input file name
            if(name.startsWith("order")) {
                String[] field = line.split(",");
                bean.set(Integer.parseInt(field[0]), field[1], field[2], Integer.parseInt(field[3]), "", 0, 0, 0);
                pid = field[2];                
            }else {
                String[] field = line.split(",");
                bean.set(0, "", field[0], 0, field[1], Integer.parseInt(field[2]), Float.parseFloat(field[3]), 1);
                pid = field[0];
            }
            t.set(pid);
            context.write(t, bean);
        }
    }
    
    
    static class RjoinReducer extends Reducer<Text, InfoBean, InfoBean, NullWritable> {
        @Override
        protected void reduce(Text pid, Iterable<InfoBean> beans, Context context) throws IOException, InterruptedException {
            //Each pid maps to one product record and possibly many order records
            InfoBean pdBean = new InfoBean();
            ArrayList<InfoBean> orderBeans = new ArrayList<InfoBean>();
            
            for (InfoBean infoBean : beans) {
                if(infoBean.getFlag() == 1 ) {
                    try {
                        BeanUtils.copyProperties(pdBean, infoBean);
                    } catch (Exception e) { 
                        e.printStackTrace();
                    }
                }else {
                    InfoBean orderBean = new InfoBean();
                    try {
                        BeanUtils.copyProperties(orderBean, infoBean);
                        orderBeans.add(orderBean);
                    } catch (Exception e) { 
                        e.printStackTrace();
                    }
                }
            }
            
            //Join the product fields onto each order record to form the final result
            for (InfoBean bean : orderBeans) {
                bean.setPname(pdBean.getPname());
                bean.setCategory_id(pdBean.getCategory_id());
                bean.setPrice(pdBean.getPrice());
                context.write(bean, NullWritable.get());
            }
        }
    }

    public static void main(String[] args) throws Exception {
        final Configuration conf = new Configuration();
        final Job job = Job.getInstance(conf);
        job.setJarByClass(Rjoin.class);

        job.setMapperClass(RjoinMapper.class);
        job.setReducerClass(RjoinReducer.class);

        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(InfoBean.class);
        job.setOutputKeyClass(InfoBean.class);
        job.setOutputValueClass(NullWritable.class);

        //For easier testing: if the output directory already exists, delete it
        Path outPath = new Path("hdfs://Master:9000/output");
        FileSystem fs = FileSystem.get(new URI("hdfs://Master:9000"), conf);
        if (fs.exists(outPath)) {
            fs.delete(outPath, true);
        }
        //Input and output paths are directories, not files
        FileInputFormat.setInputPaths(job, "hdfs://Master:9000/bpf");
        FileOutputFormat.setOutputPath(job, outPath);

        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
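Assuming the classes are packaged into a jar (the name rjoin.jar below is arbitrary), the job can be submitted with:

hadoop jar rjoin.jar com.bpf.mr.rjoin.Rjoin

For the sample data above, the output file (part-r-00000) should contain one joined line per order record, formatted by InfoBean.toString(); the ordering of the lines may vary:

order_id=1001, dateString=20150710, p_id=P0001, amount=2, pname=小米5, category_id=1000, price=2.0, flag=0
order_id=1002, dateString=20150710, p_id=P0001, amount=3, pname=小米5, category_id=1000, price=2.0, flag=0
order_id=1002, dateString=20150710, p_id=P0002, amount=3, pname=锤子T1, category_id=1000, price=3.0, flag=0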


 
