MapReduce

1. Design philosophy of the MapReduce framework:

2. A simple word count:
Mapper:
package hadoop.mapreduce.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;

import java.io.IOException;

/**
 * @program: bigdata
 * @package: hadoop.mapreduce.wordcount
 * @filename: WordCountMapper.java
 * @create: 2019.09.24.09.08
 * @author: Administrator
 * @description:
 */

/*
 * Type parameter 1: by default, the starting offset of the line of text read by the MR framework, i.e. a Long.
 *                   Hadoop has a more compact serialization interface, so LongWritable is used instead of Long.
 * Type parameter 2: by default, the content of the line of text read by the MR framework, String -> Text.
 * Type parameter 3: the key output by the user-defined logic; here it is the word, String -> Text.
 * Type parameter 4: the value output by the user-defined logic; here it is the word count, Integer -> IntWritable.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    /*
     * The business logic of the map phase goes into the overridden map() method.
     * The map task calls our map() method once for every line of input.
     */
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Convert the text passed in by the map task to a String
        String line = value.toString();
        // Split on spaces
        String[] words = line.split(" ");
        // Emit <word, 1> for each word
        for (String word : words) {
            // Use the word as the key and 1 as the value, so that the shuffle
            // sends identical words to the same reduce task
            context.write(new Text(word), new IntWritable(1));
        }
    }
}

Reducer:
package hadoop.mapreduce.wordcount;

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;

import java.io.IOException;
import java.util.Iterator;

/**
 * @program: bigdata
 * @package: hadoop.mapreduce.wordcount
 * @filename: WordCountReduce.java
 * @create: 2019.09.24.09.47
 * @author: Administrator
 * @description:
 */

/*
 * Type parameters 1,2: match the mapper's output types
 * Type parameters 3,4: the output types of our reduce program
 */
public class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    /*
     * key is the key shared by one group of values (here, one word)
     */
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        // An explicit iterator also works:
        // Iterator<IntWritable> iterator = values.iterator();
        // but a for-each loop is simpler
        for (IntWritable value : values) {
            count += value.get();
        }
        context.write(new Text(key), new IntWritable(count));
    }
}

Driver class:
package hadoop.mapreduce.wordcount;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * @program: bigdata
 * @package: hadoop.mapreduce.wordcount
 * @filename: WordCountDriver.java
 * @create: 2019.09.24.10.09
 * @author: Administrator
 * @description: Acts as a client of the YARN cluster.
 *               It wraps up the parameters of our MR job, points at the jar,
 *               and finally submits the job to YARN.
 */
public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // Specify the local path of the jar that contains this program
        job.setJarByClass(WordCountDriver.class);

        // Specify the mapper/reducer classes this job uses
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReduce.class);

        // Specify the mapper's output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);

        // Specify the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);

        // Specify the directory of the job's input files
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // Specify the directory of the job's output
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Submit the job configuration and the jar containing the job's classes to YARN
        // job.submit(); // submits without waiting, so no progress or result is reported
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}

Running it as a plain jar:
java -cp bigdata-1.0-SNAPSHOT.jar hadoop.mapreduce.wordcount.WordCountDriver /wordcount/input /wordcount/output
This fails because the Hadoop jars are not on the classpath, so run it with hadoop jar instead:
hadoop jar bigdata-1.0-SNAPSHOT.jar hadoop.mapreduce.wordcount.WordCountDriver /wordcount/input /wordcount/output

3. How the wordcount job runs

4: Traffic (flow) summing program
package hadoop.mapreduce.flowsum;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

import java.io.IOException;

/**
 * @program: bigdata
 * @package: hadoop.mapreduce.flowsum
 * @filename: FlowCountMapper.java
 * @create: 2019.09.24.16.25
 * @author: Administrator
 * @description:
 */
public class FlowCount {

    static class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Convert the line to a String
            String line = value.toString();
            // Split on tabs
            String[] fields = line.split("\t");
            // Extract the phone number
            String phoneNbr = fields[1];
            // Extract the upstream traffic
            long upFlow = Long.parseLong(fields[fields.length - 3]);
            // Extract the downstream traffic
            long dFlow = Long.parseLong(fields[fields.length - 2]);
            context.write(new Text(phoneNbr), new FlowBean(upFlow, dFlow));
        }
    }

    static class FlowCountReduce extends Reducer<Text, FlowBean, Text, FlowBean> {
        @Override
        protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
            long sum_upFlow = 0;
            long sum_dFlow = 0;

            for (FlowBean bean : values) {
                sum_upFlow += bean.getUpFlow();
                sum_dFlow += bean.getdFlow();
            }
            context.write(new Text(key), new FlowBean(sum_upFlow, sum_dFlow));
        }
    }

    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // Specify the local path of the jar that contains this program
        job.setJarByClass(FlowCount.class);

        // Specify the mapper/reducer classes this job uses
        job.setMapperClass(FlowCountMapper.class);
        job.setReducerClass(FlowCountReduce.class);

        // Specify the mapper's output key/value types
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);

        // Specify the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);

        // Specify the directory of the job's input files
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // Specify the directory of the job's output
        FileOutputFormat.setOutputPath(job, new Path(args[1]));

        // Submit the job configuration and the jar containing the job's classes to YARN
        // job.submit(); // submits without waiting, so no progress or result is reported
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}

Run it with: hadoop jar bigdata-1.0-SNAPSHOT.jar hadoop.mapreduce.flowsum.FlowCount /flowsum/input /flowsum/output
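
The mapper, reducer, and driver above rely on a FlowBean class in the same package that implements Hadoop's Writable interface; it is not shown in this post. The following is only a minimal sketch: the (upFlow, dFlow) constructor and the getUpFlow()/getdFlow() getters are taken from the calls above, while the field layout, toString() format, and everything else are assumptions.

package hadoop.mapreduce.flowsum;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

// Minimal sketch of a FlowBean that satisfies the code above; only the
// constructor and getters are taken from it, the rest is an assumption.
public class FlowBean implements Writable {
    private long upFlow;
    private long dFlow;
    private long sumFlow;

    // Hadoop needs a no-arg constructor to deserialize the bean via reflection
    public FlowBean() {
    }

    public FlowBean(long upFlow, long dFlow) {
        this.upFlow = upFlow;
        this.dFlow = dFlow;
        this.sumFlow = upFlow + dFlow;
    }

    public long getUpFlow() {
        return upFlow;
    }

    public long getdFlow() {
        return dFlow;
    }

    // Serialization order must match deserialization order
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upFlow);
        out.writeLong(dFlow);
        out.writeLong(sumFlow);
    }

    @Override
    public void readFields(DataInput in) throws IOException {
        this.upFlow = in.readLong();
        this.dFlow = in.readLong();
        this.sumFlow = in.readLong();
    }

    // toString() controls how the reducer's output value appears in the output files
    @Override
    public String toString() {
        return upFlow + "\t" + dFlow + "\t" + sumFlow;
    }
}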

4.1: Custom partitioning, building on section 4
Partition by province
package hadoop.mapreduce.province;

import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;

import java.util.HashMap;

/**
 * @program: bigdata
 * @package: hadoop.mapreduce.province
 * @filename: ProvincePartitioner.java
 * @create: 2019.09.25.09.27
 * @author: Administrator
 * @description:
 */

/*
 * Type parameters 1,2 are the mapper's output key/value types
 */
public class ProvincePartitioner extends Partitioner<Text, FlowBean> {
    public static HashMap<String, Integer> prId = new HashMap<String, Integer>();
    static {
        prId.put("136", 0);
        prId.put("137", 1);
        prId.put("138", 2);
        prId.put("139", 3);
    }

    @Override
    public int getPartition(Text text, FlowBean flowBean, int numPartitions) {
        String prefix = text.toString().substring(0, 3);
        Integer provinceId = prId.get(prefix);
        return (provinceId == null) ? 4 : provinceId;
    }
}

The rest of the code is the same as in section 4; just add the following to the main method:
// Use our custom partitioner
job.setPartitionerClass(ProvincePartitioner.class);
// Set the number of reduce tasks to match the number of partitions
job.setNumReduceTasks(5);
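
With 5 reduce tasks, the job writes 5 output files (part-r-00000 through part-r-00004), one per partition; any phone number whose prefix is not 136-139 falls into partition 4.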

5: MapTask split assignment mechanism

1.3.1 How mapTask parallelism is decided
The map-phase parallelism of a job is decided by the client when the job is submitted.
The client's basic logic for planning map-phase parallelism is:
logically split the input data (i.e., divide it into multiple logical splits according to a particular split size), then assign one mapTask instance to process each split.

This logic, and the split plan file it produces, are implemented by the getSplits() method of the FileInputFormat implementation class; the mechanism is described below.

1.3.2 The FileInputFormat split mechanism
1. Splitting is defined by the getSplits() method of the InputFormat class
2. FileInputFormat's default split mechanism:
a) Split simply by the length of each file's content
b) The split size defaults to the block size
c) Splitting does not look at the dataset as a whole; each file is split individually
For example, suppose the input consists of two files:
file1.txt    320M
file2.txt    10M

After FileInputFormat's split computation, the resulting splits are:
file1.txt.split1 -- 0~128M
file1.txt.split2 -- 128~256M
file1.txt.split3 -- 256~320M
file2.txt.split1 -- 0~10M
If (remaining file size) / (split size) < 1.1, the remainder is kept as a single split.
3. Parameters that control the split size in FileInputFormat
From the source, FileInputFormat computes the split size as Math.max(minSize, Math.min(maxSize, blockSize)); the split size is mainly determined by these values:
minSize: default 1, configured via mapreduce.input.fileinputformat.split.minsize
maxSize: default Long.MAX_VALUE, configured via mapreduce.input.fileinputformat.split.maxsize
blockSize
Therefore, by default, split size = blockSize.
maxSize (maximum split size): if set smaller than blockSize, the split becomes smaller and equal to this configured value.
minSize (minimum split size): if set larger than blockSize, the split can be made larger than blockSize.
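
To make the formula and the 1.1 rule above concrete, here is a minimal Java sketch of the computation; the class and method names (SplitPlanSketch, computeSplitSize, planSplits) are illustrative, not Hadoop's actual API.

import java.util.ArrayList;
import java.util.List;

public class SplitPlanSketch {

    private static final double SPLIT_SLOP = 1.1; // keep splitting while remaining/splitSize > 1.1

    // Mirrors Math.max(minSize, Math.min(maxSize, blockSize))
    static long computeSplitSize(long blockSize, long minSize, long maxSize) {
        return Math.max(minSize, Math.min(maxSize, blockSize));
    }

    // Plans the splits of one file of the given length, applying the 1.1 rule:
    // once the remainder is no more than 1.1 * splitSize, it becomes a single split.
    static List<long[]> planSplits(long fileLength, long splitSize) {
        List<long[]> splits = new ArrayList<>(); // each entry is {offset, length}
        long remaining = fileLength;
        while (((double) remaining) / splitSize > SPLIT_SLOP) {
            splits.add(new long[]{fileLength - remaining, splitSize});
            remaining -= splitSize;
        }
        if (remaining > 0) {
            splits.add(new long[]{fileLength - remaining, remaining});
        }
        return splits;
    }

    public static void main(String[] args) {
        long blockSize = 128L * 1024 * 1024;                                // 128M block
        long splitSize = computeSplitSize(blockSize, 1L, Long.MAX_VALUE);   // defaults -> 128M
        // file1.txt (320M) -> 3 splits: 0~128M, 128~256M, 256~320M, as in the example above
        for (long[] s : planSplits(320L * 1024 * 1024, splitSize)) {
            System.out.println("offset=" + s[0] + " length=" + s[1]);
        }
    }
}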

Factors that influence the choice of parallelism:
1. Hardware configuration of the compute nodes
2. Type of workload: CPU-bound or IO-bound
3. Data volume of the job
6: The overall submission flow of a MapReduce program
