Requirement:
Order table t_order:
id   | date     | pid   | amount
1001 | 20150710 | P0001 | 2
1002 | 20150710 | P0001 | 3
1002 | 20150710 | P0002 | 3
Product table t_product:
id    | pname  | category_id | price
P0001 | 小米5   | 1000        | 2
P0002 | 锤子T1  | 1000        | 3
Suppose the data volume is huge and both tables are stored as files on HDFS; we need a MapReduce program to implement the following SQL query (fields in the test files are separated by commas):
select a.id, a.date, b.pname, b.category_id, b.price from t_order a join t_product b on a.pid = b.id
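For reference, the test files below hold the sample data above in comma-separated form. The file names order.txt and product.txt are only an assumption; the mapper shown later only requires that the order file's name start with "order".
order.txt (id,date,pid,amount):
1001,20150710,P0001,2
1002,20150710,P0001,3
1002,20150710,P0002,3
product.txt (id,pname,category_id,price):
P0001,小米5,1000,2
P0002,锤子T1,1000,3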
An InfoBean is used to encapsulate the fields of both tables:
package com.bpf.mr.rjoin;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
public class InfoBean implements Writable {
private int order_id;
private String dateString;
private String p_id;
private int amount;
private String pname;
private int category_id;
private float price;
//flag = 0 means this object wraps a record from the order table
//flag = 1 means this object wraps a record from the product table
private int flag;
public InfoBean() {}
public void set(int order_id, String dateString, String p_id, int amount, String pname, int category_id, float price, int flag) {
this.order_id = order_id;
this.dateString = dateString;
this.p_id = p_id;
this.amount = amount;
this.pname = pname;
this.category_id = category_id;
this.price = price;
this.flag = flag;
}
public int getOrder_id() {
return order_id;
}
public void setOrder_id(int order_id) {
this.order_id = order_id;
}
public String getDateString() {
return dateString;
}
public void setDateString(String dateString) {
this.dateString = dateString;
}
public String getP_id() {
return p_id;
}
public void setP_id(String p_id) {
this.p_id = p_id;
}
public int getAmount() {
return amount;
}
public void setAmount(int amount) {
this.amount = amount;
}
public String getPname() {
return pname;
}
public void setPname(String pname) {
this.pname = pname;
}
public int getCategory_id() {
return category_id;
}
public void setCategory_id(int category_id) {
this.category_id = category_id;
}
public float getPrice() {
return price;
}
public void setPrice(float price) {
this.price = price;
}
public int getFlag() {
return flag;
}
public void setFlag(int flag) {
this.flag = flag;
}
@Override
public void readFields(DataInput in) throws IOException {
this.order_id = in.readInt();
this.dateString = in.readUTF();
this.p_id = in.readUTF();
this.amount = in.readInt();
this.pname = in.readUTF();
this.category_id = in.readInt();
this.price = in.readFloat();
this.flag = in.readInt();
}
@Override
public void write(DataOutput out) throws IOException {
out.writeInt(order_id);
out.writeUTF(dateString);
out.writeUTF(p_id);
out.writeInt(amount);
out.writeUTF(pname);
out.writeInt(category_id);
out.writeFloat(price);
out.writeInt(flag);
}
@Override
public String toString() {
return "order_id=" + order_id + ", dateString=" + dateString + ", p_id=" + p_id + ", amount=" + amount + ", pname=" + pname + ", category_id=" + category_id + ", price=" + price + ", flag=" + flag;
}
}
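The Rjoin class below contains the Mapper, the Reducer and the driver. The Mapper tags each input line with the table it came from (based on the input file name) and emits it keyed by pid; the Reducer then receives, for each pid, the single product record plus all of its orders and copies the product fields onto every order.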
package com.bpf.mr.rjoin;
import java.io.IOException;
import java.net.URI;
import java.util.ArrayList;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class Rjoin {
static class RjoinMapper extends Mapper<LongWritable, Text, Text, InfoBean> {
InfoBean bean = new InfoBean();
Text t = new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
FileSplit split = (FileSplit) context.getInputSplit();
String name = split.getPath().getName();
String pid = "";
//Use the input file name to tell which table this line comes from
if(name.startsWith("order")) {
String[] field = line.split(",");
bean.set(Integer.parseInt(field[0]), field[1], field[2], Integer.parseInt(field[3]), "", 0, 0, 0);
pid = field[2];
}else {
String[] field = line.split(",");
bean.set(0, "", field[0], 0, field[1], Integer.parseInt(field[2]), Float.parseFloat(field[3]), 1);
pid = field[0];
}
t.set(pid);
context.write(t, bean);
}
}
static class RjoinReducer extends Reducer<Text, InfoBean, InfoBean, NullWritable> {
@Override
protected void reduce(Text pid, Iterable<InfoBean> beans, Context context) throws IOException, InterruptedException {
//Each pid maps to one product record and possibly many order records
InfoBean pdBean = new InfoBean();
ArrayList<InfoBean> orderBeans = new ArrayList<InfoBean>();
for (InfoBean infoBean : beans) {
if(infoBean.getFlag() == 1 ) {
try {
BeanUtils.copyProperties(pdBean, infoBean);
} catch (Exception e) {
e.printStackTrace();
}
}else {
InfoBean orderBean = new InfoBean();
try {
BeanUtils.copyProperties(orderBean, infoBean);
orderBeans.add(orderBean);
} catch (Exception e) {
e.printStackTrace();
}
}
}
//Join the two kinds of records to produce the final result
for (InfoBean bean : orderBeans) {
bean.setPname(pdBean.getPname());
bean.setCategory_id(pdBean.getCategory_id());
bean.setPrice(pdBean.getPrice());
context.write(bean, NullWritable.get());
}
}
}
public static void main(String[] args) throws Exception {
final Configuration conf = new Configuration();
final Job job = Job.getInstance(conf);
job.setJarByClass(Rjoin.class);
job.setMapperClass(RjoinMapper.class);
job.setReducerClass(RjoinReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(InfoBean.class);
// TODO: specify output types
job.setOutputKeyClass(InfoBean.class);
job.setOutputValueClass(NullWritable.class);
//For easier testing: if the output directory already exists, delete it
Path outPath = new Path("hdfs://Master:9000/output");
FileSystem fs = FileSystem.get(new URI("hdfs://Master:9000"), conf);
if(fs.exists(outPath)) {
fs.delete(outPath,true);
}
// TODO: specify input and output DIRECTORIES (not files)
FileInputFormat.setInputPaths(job, "hdfs://Master:9000/bpf");
FileOutputFormat.setOutputPath(job, outPath);
job.waitForCompletion(true);
}
}
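One way to try the job (assuming the test files have been uploaded to hdfs://Master:9000/bpf and the program has been packaged as rjoin.jar; the jar name and the upload step are assumptions, not part of the code above) is:
hadoop jar rjoin.jar com.bpf.mr.rjoin.Rjoin
With the sample data, the part files under /output should then contain one line per order (line order may differ), roughly:
order_id=1001, dateString=20150710, p_id=P0001, amount=2, pname=小米5, category_id=1000, price=2.0, flag=0
order_id=1002, dateString=20150710, p_id=P0001, amount=3, pname=小米5, category_id=1000, price=2.0, flag=0
order_id=1002, dateString=20150710, p_id=P0002, amount=3, pname=锤子T1, category_id=1000, price=3.0, flag=0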