1. Design philosophy of the MapReduce framework:
2. A simple word count:
map:
package hadoop.mapreduce.wordcount;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import java.io.IOException;
/**
 * @program: bigdata
 * @package: hadoop.mapreduce.wordcount
 * @filename: WordCountMapper.java
 * @author: Administrator
 * @description: Splits each input line into words and emits (word, 1) pairs.
 */
public class WordCountMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    @Override
    protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
        // Convert the line to a String and split it into words (assuming space-separated input)
        String line = value.toString();
        String[] words = line.split(" ");
        for (String word : words) {
            // Emit each word with a count of 1
            context.write(new Text(word), new IntWritable(1));
        }
    }
}
reduce:
package hadoop.mapreduce.wordcount;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
import java.io.IOException;
import java.util.Iterator;
/**
 * @program: bigdata
 * @package: hadoop.mapreduce.wordcount
 * @filename: WordCountReduce.java
 * @author: Administrator
 * @description: Sums the counts of each word emitted by the mapper.
 */
public class WordCountReduce extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context) throws IOException, InterruptedException {
        int count = 0;
        // Accumulate all counts for this word
        Iterator<IntWritable> it = values.iterator();
        while (it.hasNext()) {
            count += it.next().get();
        }
        context.write(key, new IntWritable(count));
    }
}
Driver class:
package hadoop.mapreduce.wordcount;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * @program: bigdata
 * @package: hadoop.mapreduce.wordcount
 * @filename: WordCountDriver.java
 * @create: 2019.09.24.10.09
 * @author: Administrator
 * @description: Acts as a client of the YARN cluster.
 * It is where we package the parameters of our MR program and specify the jar,
 * then finally submit the job to YARN.
 */
public class WordCountDriver {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // Specify the local path of the jar that contains this program
        job.setJarByClass(WordCountDriver.class);
        // Specify the mapper/reducer business classes this job uses
        job.setMapperClass(WordCountMapper.class);
        job.setReducerClass(WordCountReduce.class);
        // Specify the key/value types of the mapper output
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(IntWritable.class);
        // Specify the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Specify the directory of the job's input files
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // Specify the directory of the job's output
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Submit the job's configuration and the jar containing its classes to YARN to run
        // job.submit(); // returns immediately, so you cannot see the result
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
Running it as a plain jar:
java -cp bigdata-1.0-SNAPSHOT.jar hadoop.mapreduce.wordcount.WordCountDriver /wordcount/input /wordcount/output
This fails because the Hadoop jars are not on the classpath, so use hadoop jar instead:
hadoop jar bigdata-1.0-SNAPSHOT.jar hadoop.mapreduce.wordcount.WordCountDriver /wordcount/input /wordcount/output
3. Analysis of the wordcount execution process
4. Traffic (flow) summarization program
package hadoop.mapreduce.flowsum;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import java.io.IOException;
/**
 * @program: bigdata
 * @package: hadoop.mapreduce.flowsum
 * @filename: FlowCountMapper.java
 * @create: 2019.09.24.16.25
 * @author: Administrator
 * @description: Sums the upstream and downstream traffic per phone number.
 */
public class FlowCount {

    static class FlowCountMapper extends Mapper<LongWritable, Text, Text, FlowBean> {
        @Override
        protected void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
            // Convert the line to a String
            String line = value.toString();
            // Split the line into fields
            String[] fields = line.split("\t");
            // Extract the phone number
            String phoneNbr = fields[1];
            // Extract the upstream traffic
            long upFlow = Long.parseLong(fields[fields.length - 3]);
            // Extract the downstream traffic
            long dFlow = Long.parseLong(fields[fields.length - 2]);
            context.write(new Text(phoneNbr), new FlowBean(upFlow, dFlow));
        }
    }

    static class FlowCountReduce extends Reducer<Text, FlowBean, Text, FlowBean> {
        @Override
        protected void reduce(Text key, Iterable<FlowBean> values, Context context) throws IOException, InterruptedException {
            long sum_upFlow = 0;
            long sum_dFlow = 0;
            // Accumulate the upstream and downstream traffic for this phone number
            for (FlowBean bean : values) {
                sum_upFlow += bean.getUpFlow();
                sum_dFlow += bean.getdFlow();
            }
            context.write(new Text(key), new FlowBean(sum_upFlow, sum_dFlow));
        }
    }
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        // Specify the local path of the jar that contains this program
        job.setJarByClass(FlowCount.class);
        // Specify the mapper/reducer business classes this job uses
        job.setMapperClass(FlowCountMapper.class);
        job.setReducerClass(FlowCountReduce.class);
        // Specify the key/value types of the mapper output
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(FlowBean.class);
        // Specify the final output key/value types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);
        // Specify the directory of the job's input files
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // Specify the directory of the job's output
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Submit the job's configuration and the jar containing its classes to YARN to run
        // job.submit(); // returns immediately, so you cannot see the result
        boolean res = job.waitForCompletion(true);
        System.exit(res ? 0 : 1);
    }
}
Run: hadoop jar bigdata-1.0-SNAPSHOT.jar hadoop.mapreduce.flowsum.FlowCount /flowsum/input /flowsum/output
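The FlowBean class used above is not listed in this post. Below is a minimal sketch of a Writable bean that matches the calls made above (the (upFlow, dFlow) constructor, getUpFlow()/getdFlow(), plus the no-argument constructor Hadoop needs for deserialization); the toString() format is my own choice for the text output:
package hadoop.mapreduce.flowsum;

import org.apache.hadoop.io.Writable;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

public class FlowBean implements Writable {
    private long upFlow;
    private long dFlow;

    // Hadoop needs a no-argument constructor to create the bean during deserialization
    public FlowBean() {
    }

    public FlowBean(long upFlow, long dFlow) {
        this.upFlow = upFlow;
        this.dFlow = dFlow;
    }

    public long getUpFlow() {
        return upFlow;
    }

    public long getdFlow() {
        return dFlow;
    }

    // Serialization: write the fields in a fixed order
    public void write(DataOutput out) throws IOException {
        out.writeLong(upFlow);
        out.writeLong(dFlow);
    }

    // Deserialization: read the fields in the same order they were written
    public void readFields(DataInput in) throws IOException {
        upFlow = in.readLong();
        dFlow = in.readLong();
    }

    // Determines how a FlowBean is rendered in the job's text output: up, down, total
    @Override
    public String toString() {
        return upFlow + "\t" + dFlow + "\t" + (upFlow + dFlow);
    }
}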
4.1 Custom partitioning on top of Section 4
Partition by province:
package hadoop.mapreduce.province;
import hadoop.mapreduce.flowsum.FlowBean; // FlowBean from Section 4
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
import java.util.HashMap;
/**
 * @program: bigdata
 * @package: hadoop.mapreduce.province
 * @filename: ProvincePartitioner.java
 * @create: 2019.09.25.09.27
 * @author: Administrator
 * @description: Generic parameters 1 and 2 are the mapper's output key/value types.
 */
public class ProvincePartitioner extends Partitioner<Text, FlowBean> {

    // Maps a phone-number prefix to a partition id
    public static HashMap<String, Integer> prId = new HashMap<>();

    static {
        prId.put("136", 0);
        prId.put("137", 1);
        prId.put("138", 2);
        prId.put("139", 3);
    }

    @Override
    public int getPartition(Text text, FlowBean flowBean, int numPartitions) {
        String prefix = text.toString().substring(0, 3);
        Integer provinceId = prId.get(prefix);
        // Unknown prefixes go to the default partition 4
        return (provinceId == null) ? 4 : provinceId;
    }
}
The rest of the code is the same as in Section 4; just add the following to the main method:
        // Specify our custom partitioner
        job.setPartitionerClass(ProvincePartitioner.class);
        // Set the number of reduce tasks to match the number of partitions (4 province partitions + 1 default)
        job.setNumReduceTasks(5);
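A quick local check of how the partitioner assigns records to the five reduce tasks; the phone numbers below are made up, and FlowBean is the bean sketched in Section 4:
package hadoop.mapreduce.province;

import hadoop.mapreduce.flowsum.FlowBean;
import org.apache.hadoop.io.Text;

// Calls getPartition() directly, without a cluster, to see which partition each key lands in
public class ProvincePartitionerDemo {
    public static void main(String[] args) {
        ProvincePartitioner p = new ProvincePartitioner();
        FlowBean bean = new FlowBean(10, 20);
        System.out.println(p.getPartition(new Text("13600000000"), bean, 5)); // 0 -> part-r-00000
        System.out.println(p.getPartition(new Text("13912345678"), bean, 5)); // 3 -> part-r-00003
        System.out.println(p.getPartition(new Text("15000000000"), bean, 5)); // 4 -> default partition
    }
}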
5. The MapTask split (slice) mechanism
5.1 What determines MapTask parallelism
The map-phase parallelism of a job is decided by the client when the job is submitted.
The client's basic logic for planning map-phase parallelism is:
logically slice the input data (i.e., divide it into multiple logical splits according to a specific split size), then assign one MapTask instance to process each split.
This logic, and the split-plan file it produces, are implemented by the getSplits() method of the FileInputFormat implementation class.
5.2 The FileInputFormat split mechanism
1. Splitting is defined in the getSplits() method of the InputFormat class.
2. FileInputFormat's default split mechanism:
a) It slices simply by file length.
b) The split size defaults to the block size.
c) It does not consider the data set as a whole; each file is split separately.
For example, suppose the input consists of two files:
file1.txt  320M
file2.txt  10M
After FileInputFormat's split computation, the split plan is:
file1.txt.split1 -- 0~128M
file1.txt.split2 -- 128~256M
file1.txt.split3 -- 256~320M
file2.txt.split1 -- 0~10M
If the remaining file size divided by the split size is not more than 1.1, the remainder is kept as a single split.
3. Configuring the split size in FileInputFormat
From the source code, FileInputFormat computes the split size as Math.max(minSize, Math.min(maxSize, blockSize)); the split size is determined mainly by these values (see the sketch after this list):
minsize: default 1; config parameter mapreduce.input.fileinputformat.split.minsize
maxsize: default Long.MAX_VALUE; config parameter mapreduce.input.fileinputformat.split.maxsize
blocksize: the HDFS block size
Therefore, by default, split size = blocksize.
maxsize (maximum split size): if it is set smaller than blocksize, the split becomes smaller, equal to the configured value.
minsize (minimum split size): if it is set larger than blockSize, the split can become larger than blocksize.
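To make the formula and the 1.1x remainder rule concrete, here is a small standalone sketch; the values are illustrative, the real planning happens inside FileInputFormat.getSplits(), and SPLIT_SLOP = 1.1 mirrors the constant in the Hadoop source:
public class SplitSizeDemo {
    // FileInputFormat only cuts another split while remaining/splitSize > 1.1
    static final double SPLIT_SLOP = 1.1;

    // The split-size formula quoted above
    static long computeSplitSize(long blockSize, long minSize, long maxSize) {
        return Math.max(minSize, Math.min(maxSize, blockSize));
    }

    public static void main(String[] args) {
        long MB = 1024L * 1024;
        long blockSize = 128 * MB;

        // Defaults (minsize = 1, maxsize = Long.MAX_VALUE) give split size = block size = 128M
        long splitSize = computeSplitSize(blockSize, 1, Long.MAX_VALUE);

        // Plan splits for a single 320M file, as in the example above:
        // prints 0M ~ 128M, 128M ~ 256M, 256M ~ 320M
        long fileSize = 320 * MB;
        long remaining = fileSize;
        long offset = 0;
        while ((double) remaining / splitSize > SPLIT_SLOP) {
            System.out.println(offset / MB + "M ~ " + (offset + splitSize) / MB + "M");
            offset += splitSize;
            remaining -= splitSize;
        }
        if (remaining > 0) { // a remainder of at most 1.1x the split size stays in one split
            System.out.println(offset / MB + "M ~ " + fileSize / MB + "M");
        }
    }
}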
Factors to consider when choosing the degree of parallelism:
1. The hardware configuration of the compute nodes
2. The type of task: CPU-intensive or IO-intensive
3. The data volume of the task
6. The overall submission flow of a MapReduce program