Sorting by the output results
Take the already aggregated results and run one more MapReduce pass over them;
as the map output is written it is partitioned (the province example) and sorted (this example).
package hadoop.mapreduce.flowsort;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
@program:bigdata
@package:hadoop.mapreduce.flowsort
@filename:FlowBean.java
@create:2019.09.24.16.56
@author:Administrator
@description:
*/
public class FlowBean implements WritableComparable<FlowBean> {
private long upFlow;
private long dFlow;
private long sumFlow;
public long getUpFlow() {
return upFlow;
}
public void setUpFlow(long upFlow) {
this.upFlow = upFlow;
}
public long getdFlow() {
return dFlow;
}
public void setdFlow(long dFlow) {
this.dFlow = dFlow;
}
public FlowBean(long upFlow, long dFlow) {
this.upFlow = upFlow;
this.dFlow = dFlow;
this.sumFlow=upFlow+dFlow;
}
public void set(long upFlow, long dFlow) {
this.upFlow = upFlow;
this.dFlow = dFlow;
this.sumFlow=upFlow+dFlow;
}
public long getSumFlow() {
return sumFlow;
}
public void setSumFlow(long sumFlow) {
this.sumFlow = sumFlow;
}
//Deserialization calls the no-arg constructor via reflection, so it must be defined explicitly
public FlowBean() {
}
@Override
public String toString() {
return upFlow+"\t"+dFlow+"\t"+sumFlow;
}
//Serialization: write the fields out in a fixed order
@Override
public void write(DataOutput out) throws IOException {
out.writeLong(upFlow);
out.writeLong(dFlow);
out.writeLong(sumFlow);
}
//Deserialization: read the fields back in the same order
@Override
public void readFields(DataInput in) throws IOException {
this.upFlow = in.readLong();
this.dFlow = in.readLong();
this.sumFlow = in.readLong();
}
//Sort keys in descending order of total traffic
@Override
public int compareTo(FlowBean o) {
return this.sumFlow > o.sumFlow ? -1 : 1;
}
}
package hadoop.mapreduce.flowsort;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
@program:bigdata
@package:hadoop.mapreduce.flowsort
@filename:FlowCountSort.java
@create:2019.09.25.18.40
@author:Administrator
@description:
*/
public class FlowCountSort {
static class FlowCountSortMapper extends Mapper<LongWritable, Text, FlowBean, Text> {
FlowBean bean=new FlowBean();
Text v=new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//The input here is the previous job's output: each phone number's aggregated traffic
String line = value.toString();
String[] fields = line.split("\t");
String phoneNum = fields[0];
long upFlow = Long.parseLong(fields[1]);
long dFlow = Long.parseLong(fields[2]);
bean.set(upFlow,dFlow);
v.set(phoneNum);
context.write(bean,v);
}
}
static class FlowCountSortReducer extends Reducer<FlowBean, Text, Text, FlowBean> {
@Override
protected void reduce(FlowBean key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
context.write(values.iterator().next(),key);
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
//Specify the local path of the jar that contains this program
job.setJarByClass(FlowCountSort.class);
//Specify the mapper/reducer business classes this job uses
job.setMapperClass(FlowCountSortMapper.class);
job.setReducerClass(FlowCountSortReducer.class);
//Specify the kv types of the mapper output
job.setMapOutputKeyClass(FlowBean.class);
job.setMapOutputValueClass(Text.class);
//Specify the final output kv types
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
//Specify the directory containing the job's original input files
FileInputFormat.setInputPaths(job,new Path(args[0]));
//Specify the directory for the job's output
FileOutputFormat.setOutputPath(job,new Path(args[1]));
//Submit the job's configured parameters, and the jar containing the job's java classes, to yarn to run
// job.submit();//returns without waiting, so you cannot see the result
boolean res = job.waitForCompletion(true);
System.exit(res?0:1);
}
}
02. A closer look at the shuffle process inside MR
The combiner's logic is the same as the reducer's logic, so it is enough to pass the reducer class to job.setCombinerClass().
On splitting a large number of small files: use the CombineTextInputFormat split mode (the default is TextInputFormat). In general, number of splits = number of maptasks, usually kept close to the machine's number of CPU cores (see the sketch below).
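A minimal, self-contained word-count style sketch (not part of the original notes) that applies both points at once: the reducer is reused as the combiner, and CombineTextInputFormat packs small files into larger splits. The class names, package name and the 4MB split size are illustrative assumptions:
package hadoop.mapreduce.sketch;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
public class WordCountCombineSketch {
static class WcMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
Text k = new Text();
IntWritable one = new IntWritable(1);
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
for (String word : value.toString().split("\\s+")) {
if (word.isEmpty()) continue;
k.set(word);
context.write(k, one);
}
}
}
static class WcReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
@Override
protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
int sum = 0;
for (IntWritable v : values) {
sum += v.get();
}
context.write(key, new IntWritable(sum));
}
}
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
job.setJarByClass(WordCountCombineSketch.class);
job.setMapperClass(WcMapper.class);
job.setReducerClass(WcReducer.class);
//Reuse the reducer as the combiner; this is safe here because summing is commutative and associative
job.setCombinerClass(WcReducer.class);
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(IntWritable.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(IntWritable.class);
//Pack many small files into fewer splits; the default TextInputFormat produces at least one split per file
job.setInputFormatClass(CombineTextInputFormat.class);
CombineTextInputFormat.setMaxInputSplitSize(job, 4 * 1024 * 1024);//about 4MB per split, an example value
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}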
How mapreduce & yarn work
Debugging an MR program in local mode:
(1) The mapreduce program is submitted to LocalJobRunner and runs locally as a single process.
(2) The data it processes and the results it outputs can be on the local file system or on hdfs.
(3) How do you make it run locally? Write the program without the cluster configuration files (in essence it comes down to whether your mr program's conf contains the mapreduce.framework.name=local and yarn.resourcemanager.hostname parameters).
(4) Local mode is very convenient for debugging business logic: just set breakpoints in eclipse.
To run local mode on Windows and test the program logic, configure these environment variables on Windows:
%HADOOP_HOME% = d:/hadoop-2.6.1
%PATH% = %HADOOP_HOME%\bin
and replace the lib and bin directories under d:/hadoop-2.6.1 with versions compiled for the Windows platform.
// //Submit to run locally in simulation; with no configuration at all, running on a local machine defaults to local mode anyway
// conf.set("mapreduce.framework.name","local");
// When running an mr program in local mode, the input and output data may be local or on hdfs
// conf.set("fs.defaultFS","file:///");
If you submit the job with the java -jar command, you must point it at the jar manually, e.g. job.setJar("/home/hadoop/wc.jar") (setJarByClass takes a Class, not a path string),
and set the parameters by hand: conf.set("mapreduce.framework.name","yarn")
conf.set("yarn.resourcemanager.hostname","mini1")
conf.set("fs.defaultFS","hdfs://mini:9000")
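A consolidated sketch of such a driver, reusing the FlowCountSort classes from earlier in these notes; the jar path /home/hadoop/flowsort.jar is a hypothetical packaging location, and mini1/mini are the example hostnames quoted above:
package hadoop.mapreduce.flowsort;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
public class FlowCountSortYarnSubmit {
public static void main(String[] args) throws Exception {
Configuration conf = new Configuration();
//These settings replace the cluster config files that a plain "java -jar" launch does not put on the classpath
conf.set("mapreduce.framework.name", "yarn");
conf.set("yarn.resourcemanager.hostname", "mini1");
conf.set("fs.defaultFS", "hdfs://mini:9000");
Job job = Job.getInstance(conf);
//Point at the jar file explicitly instead of relying on setJarByClass
job.setJar("/home/hadoop/flowsort.jar");
job.setMapperClass(FlowCountSort.FlowCountSortMapper.class);
job.setReducerClass(FlowCountSort.FlowCountSortReducer.class);
job.setMapOutputKeyClass(FlowBean.class);
job.setMapOutputValueClass(Text.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(FlowBean.class);
FileInputFormat.setInputPaths(job, new Path(args[0]));
FileOutputFormat.setOutputPath(job, new Path(args[1]));
System.exit(job.waitForCompletion(true) ? 0 : 1);
}
}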
4.4.1 Reduce-side join implementation
1. Requirement:
Order table t_order:
id date pid amount
1001 20150710 P0001 2
1002 20150710 P0001 3
1002 20150710 P0002 3
Product table t_product:
id pname category_id price
P0001 小米5 1000 2
P0002 锤子T1 1000 3
Merge the two tables, i.e. attach pname, category_id and price from t_product to every order row, matching on pid.
package hadoop.mapreduce.join;
import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;
/**
@program:bigdata
@package:hadoop.mapreduce.join
@filename:RJoin.java
@create:2019.09.26.10.18
@author:Administrator
@description:
*/
public class RJoin {
static class RJoinMapper extends Mapper<LongWritable, Text, Text, InfoBean> {
InfoBean bean=new InfoBean();
Text k=new Text();
@Override
protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String line = value.toString();
FileSplit inputSplit = (FileSplit) context.getInputSplit();
//get the name of the file this split comes from
String name = inputSplit.getPath().getName();
String pid="";
//use the file name to tell which kind of data this line is
if (name.endsWith("order")){
String[] fields = line.split(",");
pid=fields[2];
bean.set(Integer.parseInt(fields[0]),fields[1],pid,Integer.parseInt(fields[3]),
"",0,0,"0");
}else {
String[] fields = line.split(",");
pid=fields[0];
bean.set(0,"",pid,0,fields[1],Integer.parseInt(fields[2]),Float.parseFloat(fields[3]),
"1");
}
k.set(pid);
context.write(k,bean);
}
}
static class RJoinReducer extends Reducer<Text, InfoBean, InfoBean, NullWritable> {
@Override
protected void reduce(Text key, Iterable<InfoBean> values, Context context) throws IOException, InterruptedException {
//the single product record for this pid
InfoBean pdBean = new InfoBean();
//all order records for this pid
ArrayList<InfoBean> orderBeans = new ArrayList<>();
for (InfoBean bean:values){
if ("1".equals(bean.getFlag())){
try {
BeanUtils.copyProperties(pdBean,bean);
} catch (IllegalAccessException e) {
e.printStackTrace();
} catch (InvocationTargetException e) {
e.printStackTrace();
}
}else {
InfoBean odBean = new InfoBean();
try {
BeanUtils.copyProperties(odBean,bean);
} catch (IllegalAccessException e) {
e.printStackTrace();
} catch (InvocationTargetException e) {
e.printStackTrace();
}
orderBeans.add(odBean);
}
}
//fill the product fields into every order record and emit
for (InfoBean bean :orderBeans){
bean.setPname(pdBean.getPname());
bean.setCategory_id(pdBean.getCategory_id());
bean.setPrice(pdBean.getPrice());
context.write(bean,NullWritable.get());
}
}
}
public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
Configuration conf = new Configuration();
Job job = Job.getInstance(conf);
//Specify the local path of the jar that contains this program
job.setJarByClass(RJoin.class);
//Specify the mapper/reducer business classes this job uses
job.setMapperClass(RJoinMapper.class);
job.setReducerClass(RJoinReducer.class);
//Specify the kv types of the mapper output
job.setMapOutputKeyClass(Text.class);
job.setMapOutputValueClass(InfoBean.class);
//Specify the final output kv types
job.setOutputKeyClass(InfoBean.class);
job.setOutputValueClass(NullWritable.class);
//Specify the directory containing the job's original input files
FileInputFormat.setInputPaths(job,new Path(args[0]));
//Specify the directory for the job's output
FileOutputFormat.setOutputPath(job,new Path(args[1]));
//Submit the job's configured parameters, and the jar containing the job's java classes, to yarn to run
// job.submit();//returns without waiting, so you cannot see the result
boolean res = job.waitForCompletion(true);
System.exit(res?0:1);
}
}
package hadoop.mapreduce.join;
import org.apache.hadoop.io.Writable;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
/**
@program:bigdata
@package:hadoop.mapreduce.join
@filename:InfoBean.java
@create:2019.09.26.10.22
@author:Administrator
@description:
*/
public class InfoBean implements Writable {
private int order_id;
private String dataString;
private String p_id;
private int amount;
private String pname;
private int category_id;
private float price;
//flag=0 means this object wraps a record from the order table
//flag=1 means this object wraps a record from the product table
private String flag;
public InfoBean() { }
public String getFlag() {
return flag;
}
public void setFlag(String flag) {
this.flag = flag;
}
public void set(int order_id, String dataString, String p_id, int amount, String pname, int category_id, float price, String flag) {
this.order_id = order_id;
this.dataString = dataString;
this.p_id = p_id;
this.amount = amount;
this.pname = pname;
this.category_id = category_id;
this.price = price;
this.flag=flag;
}
public int getOrder_id() {
return order_id;
}
public void setOrder_id(int order_id) {
this.order_id = order_id;
}
public String getDataString() {
return dataString;
}
public void setDataString(String dataString) {
this.dataString = dataString;
}
public String getP_id() {
return p_id;
}
public void setP_id(String p_id) {
this.p_id = p_id;
}
public int getAmount() {
return amount;
}
public void setAmount(int amount) {
this.amount = amount;
}
public String getPname() {
return pname;
}
public void setPname(String pname) {
this.pname = pname;
}
public int getCategory_id() {
return category_id;
}
public void setCategory_id(int category_id) {
this.category_id = category_id;
}
public float getPrice() {
return price;
}
public void setPrice(float price) {
this.price = price;
}
@Override
public void write(DataOutput dataOutput) throws IOException {
dataOutput.writeInt(order_id);
dataOutput.writeUTF(dataString);
dataOutput.writeUTF(p_id);
dataOutput.writeInt(amount);
dataOutput.writeUTF(pname);
dataOutput.writeInt(category_id);
dataOutput.writeFloat(price);
dataOutput.writeUTF(flag);
}
@Override
public void readFields(DataInput dataInput) throws IOException {
this.order_id = dataInput.readInt();
this.dataString = dataInput.readUTF();
this.p_id = dataInput.readUTF();
this.amount = dataInput.readInt();
this.pname=dataInput.readUTF();
this.category_id=dataInput.readInt();
this.price=dataInput.readFloat();
this.flag=dataInput.readUTF();
}
@Override
public String toString() {
return
"order_id=" + order_id +
", dataString='" + dataString + '\'' +
", p_id=" + p_id +
", amount=" + amount +
", pname='" + pname + '\'' +
", category_id=" + category_id +
", price=" + price +
", flag='" + flag + '\'';
}
}