How MR and YARN work

Sorting by the output results
Run a second MapReduce job over the already-aggregated results;
the map output is partitioned (see the province example) and sorted by key (this example).
package hadoop.mapreduce.flowsort;

import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * @program: bigdata
 * @package: hadoop.mapreduce.flowsum
 * @filename: FlowBean.java
 * @create: 2019.09.24.16.56
 * @author: Administrator
 * @description:
 */
public class FlowBean implements WritableComparable<FlowBean> {
    private long upFlow;
    private long dFlow;
    private long sumFlow;

    public long getUpFlow() {
        return upFlow;
    }

    public void setUpFlow(long upFlow) {
        this.upFlow = upFlow;
    }

    public long getdFlow() {
        return dFlow;
    }

    public void setdFlow(long dFlow) {
        this.dFlow = dFlow;
    }

    public FlowBean(long upFlow, long dFlow) {
        this.upFlow = upFlow;
        this.dFlow = dFlow;
        this.sumFlow = upFlow + dFlow;
    }

    public void set(long upFlow, long dFlow) {
        this.upFlow = upFlow;
        this.dFlow = dFlow;
        this.sumFlow = upFlow + dFlow;
    }

    public long getSumFlow() {
        return sumFlow;
    }

    public void setSumFlow(long sumFlow) {
        this.sumFlow = sumFlow;
    }

    // Deserialization calls the no-arg constructor via reflection, so define one explicitly
    public FlowBean() {
    }

    @Override
    public String toString() {
        return upFlow + "\t" + dFlow + "\t" + sumFlow;
    }

    /**
     * Serialization method
     */
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeLong(upFlow);
        dataOutput.writeLong(dFlow);
        dataOutput.writeLong(sumFlow);
    }

    /**
     * Deserialization method
     */
    public void readFields(DataInput dataInput) throws IOException {
        upFlow = dataInput.readLong();
        dFlow = dataInput.readLong();
        sumFlow = dataInput.readLong();
    }

    // Sort in descending order of total traffic
    public int compareTo(FlowBean o) {
        return this.sumFlow > o.sumFlow ? -1 : 1;
    }

}

package hadoop.mapreduce.flowsort;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * @program: bigdata
 * @package: hadoop.mapreduce.flowsort
 * @filename: FlowCountSortMapper.java
 * @create: 2019.09.25.18.40
 * @author: Administrator
 * @description:
 */
public class FlowCountSort {

    static class FlowCountSortMapper extends Mapper<LongWritable, Text, FlowBean, Text> {
        FlowBean bean = new FlowBean();
        Text v = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // The input is the output of the previous job: each phone number's aggregated traffic
            String line = value.toString();
            String[] fields = line.split("\t");
            String phoneNum = fields[0];
            long upFlow = Long.parseLong(fields[1]);
            long dFlow = Long.parseLong(fields[2]);
            bean.set(upFlow, dFlow);
            v.set(phoneNum);
            context.write(bean, v);
        }
    }

    static class FlowCountSortReducer extends Reducer<FlowBean, Text, Text, FlowBean> {
        @Override
        protected void reduce(FlowBean key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            context.write(values.iterator().next(), key);
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // Specify the local path of the jar that contains this program
        job.setJarByClass(FlowCountSort.class);

        // Specify the mapper/reducer business classes this job uses
        job.setMapperClass(FlowCountSortMapper.class);
        job.setReducerClass(FlowCountSortReducer.class);

        // Specify the mapper output key/value types
        job.setMapOutputKeyClass(FlowBean.class);
        job.setMapOutputValueClass(Text.class);

        // Specify the final output types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        // Specify the directory of the job's input files
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // Specify the directory for the job's output
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Submit the job configuration and the jar containing the job's classes to YARN to run
        // job.submit(); // does not show the result
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }

}

02. The MR shuffle process in detail

The combiner logic is the same as the reduce logic; just pass the reducer class to job.setCombinerClass().
On splits for large numbers of small files: use the CombineTextInputFormat split strategy (the default is TextInputFormat), so that many small files are packed into fewer splits. The number of splits determines the number of map tasks (the notes' rule of thumb: CPU cores ≈ splits ≈ map tasks). A configuration sketch follows below.
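A minimal driver sketch of both settings, assuming a hypothetical helper class and an illustrative 4 MB max split size; reducerClass stands for whatever reducer the job already uses (its output key/value types must match its input types for it to double as a combiner):

import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.CombineTextInputFormat;

public class SplitAndCombinerConfig {
    // Apply the combiner and small-file split settings described above to an existing job
    public static void apply(Job job, Class<? extends Reducer> reducerClass) {
        // Combiner logic equals the reduce logic, so reuse the reducer class
        job.setCombinerClass(reducerClass);

        // Pack many small files into fewer splits instead of one split (and one map task) per file
        job.setInputFormatClass(CombineTextInputFormat.class);
        CombineTextInputFormat.setMaxInputSplitSize(job, 4L * 1024 * 1024); // 4 MB, illustrative
    }
}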
How MapReduce & YARN work

Debugging an MR program in local mode:
(1) The MapReduce program is submitted to LocalJobRunner and runs locally as a single process
(2) The input data and output results can live on the local file system or on HDFS
(3) How do you get a local run? Simply don't ship the cluster configuration files with the program (what actually matters is whether your MR program's conf contains mapreduce.framework.name=local and a yarn.resourcemanager.hostname parameter)
(4) Local mode is very convenient for debugging business logic: just set breakpoints in Eclipse

To run local mode on Windows to test program logic, configure these environment variables on Windows:
%HADOOP_HOME% = d:/hadoop-2.6.1
%PATH% = %HADOOP_HOME%\bin
and replace the lib and bin directories under d:/hadoop-2.6.1 with versions compiled for Windows.
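As a minimal sketch (assuming the same d:/hadoop-2.6.1 layout), the hadoop.home.dir system property can also be set from code and serves the same purpose as the %HADOOP_HOME% environment variable:

public class WindowsLocalDebug {
    public static void main(String[] args) throws Exception {
        // Assumes d:/hadoop-2.6.1 contains bin/ and lib/ compiled for Windows (winutils.exe etc.)
        System.setProperty("hadoop.home.dir", "d:/hadoop-2.6.1");
        // ...then build the Configuration/Job as usual and debug with breakpoints in the IDE
    }
}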

// // Submit for simulated local execution; with no configuration at all, local is already the default
// conf.set("mapreduce.framework.name", "local");
// When running an MR program in local mode, the input/output data can be local or on HDFS
// conf.set("fs.defaultFS", "file:///");

If you submit the job with the java -jar command, you must specify the jar location by hand with job.setJar("/home/hadoop/wc.jar")
and set the cluster parameters manually (a consolidated sketch follows below):
conf.set("mapreduce.framework.name", "yarn")
conf.set("yarn.resourcemanager.hostname", "mini1")
conf.set("fs.defaultFS", "hdfs://mini:9000")
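A minimal driver sketch assembling those settings; the hostnames, port, and jar path are the ones quoted above and specific to that cluster, and the mapper/reducer setup is elided:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class RemoteSubmitSketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        conf.set("mapreduce.framework.name", "yarn");       // run on YARN instead of locally
        conf.set("yarn.resourcemanager.hostname", "mini1"); // ResourceManager host from the notes
        conf.set("fs.defaultFS", "hdfs://mini:9000");       // NameNode address from the notes

        Job job = Job.getInstance(conf);
        // Explicit jar path, needed when submitting with java -jar as described above
        job.setJar("/home/hadoop/wc.jar");

        // Mapper/reducer classes and key/value types would be set here, as in the drivers above

        FileInputFormat.setInputPaths(job, new Path(args[0]));
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}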

4.4.1 Reduce-side join implementation
1. Requirement:
Order table t_order:
id    date      pid    amount
1001  20150710  P0001  2
1002  20150710  P0001  3
1002  20150710  P0002  3

Product table t_product:
id     pname   category_id  price
P0001  小米5   1000         2
P0002  锤子T1  1000         3

Goal: produce the merged (joined) table, matching t_order.pid with t_product.id.
package hadoop.mapreduce.join;

import org.apache.commons.beanutils.BeanUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;
import java.lang.reflect.InvocationTargetException;
import java.util.ArrayList;

/**
 * @program: bigdata
 * @package: hadoop.mapreduce.join
 * @filename: RJoin.java
 * @create: 2019.09.26.10.18
 * @author: Administrator
 * @description:
 */
public class RJoin {

    static class RJoinMapper extends Mapper<LongWritable, Text, Text, InfoBean> {
        InfoBean bean = new InfoBean();
        Text k = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            String line = value.toString();

            FileSplit inputSplit = (FileSplit) context.getInputSplit();
            // Get the file name
            String name = inputSplit.getPath().getName();
            String pid = "";
            // Use the file name to tell which kind of record this line is
            if (name.endsWith("order")) {
                String[] fields = line.split(",");
                pid = fields[2];
                bean.set(Integer.parseInt(fields[0]), fields[1], pid, Integer.parseInt(fields[3]),
                        "", 0, 0, "0");
            } else {
                String[] fields = line.split(",");
                pid = fields[0];
                bean.set(0, "", pid, 0, fields[1], Integer.parseInt(fields[2]), Float.parseFloat(fields[3]),
                        "1");
            }
            k.set(pid);
            context.write(k, bean);
        }
    }

    static class RJoinReducer extends Reducer<Text, InfoBean, InfoBean, NullWritable> {

        @Override
        protected void reduce(Text key, Iterable<InfoBean> values, Context context) throws IOException, InterruptedException {
            InfoBean pdBean = new InfoBean();
            ArrayList<InfoBean> orderBeans = new ArrayList<>();
            for (InfoBean bean : values) {
                if ("1".equals(bean.getFlag())) {
                    // Product record: keep a copy of it
                    try {
                        BeanUtils.copyProperties(pdBean, bean);
                    } catch (IllegalAccessException | InvocationTargetException e) {
                        e.printStackTrace();
                    }
                } else {
                    // Order record: copy it out of the reused values object and collect it
                    InfoBean odBean = new InfoBean();
                    try {
                        BeanUtils.copyProperties(odBean, bean);
                    } catch (IllegalAccessException | InvocationTargetException e) {
                        e.printStackTrace();
                    }
                    orderBeans.add(odBean);
                }
            }
            // Fill the product fields into every order record and emit the joined result
            for (InfoBean bean : orderBeans) {
                bean.setPname(pdBean.getPname());
                bean.setCategory_id(pdBean.getCategory_id());
                bean.setPrice(pdBean.getPrice());
                context.write(bean, NullWritable.get());
            }
        }
    }

    public static void main(String[] args) throws IOException, ClassNotFoundException, InterruptedException {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // Specify the local path of the jar that contains this program
        job.setJarByClass(RJoin.class);

        // Specify the mapper/reducer business classes this job uses
        job.setMapperClass(RJoinMapper.class);
        job.setReducerClass(RJoinReducer.class);

        // Specify the mapper output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(InfoBean.class);

        // Specify the final output types
        job.setOutputKeyClass(InfoBean.class);
        job.setOutputValueClass(NullWritable.class);

        // Specify the directory of the job's input files
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // Specify the directory for the job's output
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Submit the job configuration and the jar containing the job's classes to YARN to run
        // job.submit(); // does not show the result
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}

package hadoop.mapreduce.join;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

/**
 * @program: bigdata
 * @package: hadoop.mapreduce.join
 * @filename: InfoBean.java
 * @create: 2019.09.26.10.22
 * @author: Administrator
 * @description:
 */
public class InfoBean implements Writable {

    private int order_id;
    private String dataString;
    private String p_id;
    private int amount;
    private String pname;
    private int category_id;
    private float price;

    // flag = "0" means this object wraps an order record
    // flag = "1" means this object wraps a product record
    private String flag;

    public InfoBean() { }

    public String getFlag() {
        return flag;
    }

    public void setFlag(String flag) {
        this.flag = flag;
    }

    public void set(int order_id, String dataString, String p_id, int amount, String pname, int category_id, float price, String flag) {
        this.order_id = order_id;
        this.dataString = dataString;
        this.p_id = p_id;
        this.amount = amount;
        this.pname = pname;
        this.category_id = category_id;
        this.price = price;
        this.flag = flag;
    }

    public int getOrder_id() {
        return order_id;
    }

    public void setOrder_id(int order_id) {
        this.order_id = order_id;
    }

    public String getDataString() {
        return dataString;
    }

    public void setDataString(String dataString) {
        this.dataString = dataString;
    }

    public String getP_id() {
        return p_id;
    }

    public void setP_id(String p_id) {
        this.p_id = p_id;
    }

    public int getAmount() {
        return amount;
    }

    public void setAmount(int amount) {
        this.amount = amount;
    }

    public String getPname() {
        return pname;
    }

    public void setPname(String pname) {
        this.pname = pname;
    }

    public int getCategory_id() {
        return category_id;
    }

    public void setCategory_id(int category_id) {
        this.category_id = category_id;
    }

    public float getPrice() {
        return price;
    }

    public void setPrice(float price) {
        this.price = price;
    }

    @Override
    public void write(DataOutput dataOutput) throws IOException {
        dataOutput.writeInt(order_id);
        dataOutput.writeUTF(dataString);
        dataOutput.writeUTF(p_id);
        dataOutput.writeInt(amount);
        dataOutput.writeUTF(pname);
        dataOutput.writeInt(category_id);
        dataOutput.writeFloat(price);
        dataOutput.writeUTF(flag);
    }

    @Override
    public void readFields(DataInput dataInput) throws IOException {
        this.order_id = dataInput.readInt();
        this.dataString = dataInput.readUTF();
        this.p_id = dataInput.readUTF();
        this.amount = dataInput.readInt();
        this.pname = dataInput.readUTF();
        this.category_id = dataInput.readInt();
        this.price = dataInput.readFloat();
        this.flag = dataInput.readUTF();
    }

    @Override
    public String toString() {
        return "order_id=" + order_id +
                ", dataString='" + dataString + '\'' +
                ", p_id=" + p_id +
                ", amount=" + amount +
                ", pname='" + pname + '\'' +
                ", category_id=" + category_id +
                ", price=" + price +
                ", flag='" + flag + '\'';
    }
}
