案例一:单词对应的目录统计
// Orders scores from highest to lowest.
// Integer.compare is used instead of subtraction because "o.score - this.score"
// can overflow (and flip sign) when the two values are far apart.
@Override
public int compareTo(Score o) {
    return Integer.compare(o.score, this.score);
}
package cn.tedu.invert;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver for the inverted-index job: wires up {@link InvertMapper} and
 * {@link InvertReducer}, declares Text/Text output, and runs the job against
 * hard-coded HDFS input and output paths.
 */
public class InvertDriver {
    /**
     * Configures the job, submits it and blocks until it finishes.
     *
     * @param args command-line arguments (not used)
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "JobName");
        job.setJarByClass(InvertDriver.class);
        job.setMapperClass(InvertMapper.class);
        job.setReducerClass(InvertReducer.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Text.class);
        FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.253.129:9000/mr/invert/"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.253.129:9000/result/invert"));
        // Surface a failed job through the process exit status. The previous
        // "if (!ok) return;" did nothing, so a failed job still exited with 0.
        if (!job.waitForCompletion(true)) {
            System.exit(1);
        }
    }
}
package cn.tedu.invert;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileSplit;
/*
* 单词对应的目录统计
* */
public class InvertMapper
extends Mapper<LongWritable, Text, Text, Text> {
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
//获取文件名
FileSplit fs=(FileSplit)context.getInputSplit();
String name=fs.getPath().getName();
//拆分单词
String[] arr=value.toString().split(" ");
for (String str : arr) {
//单词放前边,文件名放后边
context.write(new Text(str), new Text(name));
}
}
}
package cn.tedu.invert;
import java.io.IOException;
import java.util.HashSet;
import java.util.Set;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class InvertReducer extends Reducer<Text, Text, Text, Text> {
public void reduce(Text key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
// 考虑文件名的去重问题
Set<String> set=new HashSet<>();
for (Text val : values) {
set.add(val.toString());
}
StringBuilder sb=new StringBuilder();
for (String str : set) {
sb.append(str).append("\t");
}
context.write(key, new Text(sb.toString()));
}
}
案例二:按月份统计每一个人的总成绩
package cn.tedu.partscore;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Partitioner;
/**
 * Routes each record to a partition chosen by its month, so that every
 * month ends up in its own reduce task / output file.
 */
public class MonthPartitioner extends Partitioner<Text,Score>{
    /**
     * Maps month 1, 2, 3, ... to partition 0, 1, 2, ...
     *
     * @param key            the map output key (not used for partitioning)
     * @param value          the record whose month selects the partition
     * @param numReduceTasks number of partitions available
     * @return a partition index in the range [0, numReduceTasks)
     */
    @Override
    public int getPartition(Text key, Score value, int numReduceTasks) {
        // Month is 1-based, partitions are 0-based.
        int partition = (value.getMonth() - 1) % numReduceTasks;
        // Wrap into range: a month larger than the number of reduce tasks, or
        // a month <= 0, previously produced an out-of-range partition index.
        return partition < 0 ? partition + numReduceTasks : partition;
    }
}
package cn.tedu.partscore;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver for the per-month total-score job: uses {@link PartScoreMapper},
 * {@link PartScoreReducer} and {@link MonthPartitioner} with three reduce
 * tasks, reading from and writing to hard-coded HDFS paths.
 */
public class PartScoreDriver {
    /**
     * Configures the job, submits it and blocks until it finishes.
     *
     * @param args command-line arguments (not used)
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "JobName");
        job.setJarByClass(PartScoreDriver.class);
        job.setMapperClass(PartScoreMapper.class);
        job.setReducerClass(PartScoreReducer.class);
        // Partition by month; presumably the input holds months 1-3, hence
        // three reduce tasks (one per partition).
        job.setPartitionerClass(MonthPartitioner.class);
        job.setNumReduceTasks(3);
        // Map output types differ from the final output types, so both are declared.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(Score.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.253.129:9000/mr/score1/score1.txt"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.253.129:9000/result/score1"));
        // Surface a failed job through the process exit status. The previous
        // "if (!ok) return;" did nothing, so a failed job still exited with 0.
        if (!job.waitForCompletion(true)) {
            System.exit(1);
        }
    }
}
package cn.tedu.partscore;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class PartScoreMapper
extends Mapper<LongWritable, Text, Text, Score> {
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] arr=value.toString().split(" ");
Score s=new Score();
s.setMonth(Integer.parseInt(arr[0]));
s.setName(arr[1]);
s.setScore(Integer.parseInt(arr[2]));
context.write(new Text(s.getName()), s);
}
}
package cn.tedu.partscore;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
/*
* 按月份统计每一个人的总成绩
*/
public class PartScoreReducer
extends Reducer<Text, Score, Text, IntWritable> {
public void reduce(Text key, Iterable<Score> values, Context context) throws IOException, InterruptedException {
int sum=0;
for (Score val : values) {
sum+=val.getScore();
}
context.write(key, new IntWritable(sum));
}
}
package cn.tedu.partscore;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.Writable;
/**
 * Serializable record holding a month, a student's name and a score.
 * The wire format is: int month, UTF string name, int score.
 */
public class Score implements Writable {
    private int month;
    private String name;
    private int score;

    /** Writes the fields in the order month, name, score. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(this.month);
        out.writeUTF(this.name);
        out.writeInt(this.score);
    }

    /** Reads the fields back in the same order in which write() emits them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        month = in.readInt();
        name = in.readUTF();
        score = in.readInt();
    }

    public int getMonth() {
        return this.month;
    }

    public void setMonth(int month) {
        this.month = month;
    }

    public String getName() {
        return this.name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public int getScore() {
        return this.score;
    }

    public void setScore(int score) {
        this.score = score;
    }

    /** Returns a readable dump of all three fields. */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder();
        text.append("Score [month=").append(month);
        text.append(", name=").append(name);
        text.append(", score=").append(score);
        text.append("]");
        return text.toString();
    }
}
二、分区- Partition
1.分区的作用是将数据进行分类
2.有无分区并不影响Map和Reduce的执行逻辑
3.分区默认是从0开始依次递增的
4.在MapReduce中,每一个分区要对应一个ReduceTask,每一个ReduceTask都会产生一个结果文件。默认情况下只有1个分区,也就只有1个ReduceTask,只产生一个结果文件
5.在MapReduce中,如果没有手动指定Partitioner,那么默认使用的分区类是
HashPartitioner
将姓名和分数输出(分数是降序)
package cn.tedu.sortscore;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
 * Map output key holding a student's name and score. It is serializable
 * (wire format: UTF string name, int score) and sortable: higher scores come
 * first, ties are ordered by name.
 */
public class Score implements WritableComparable<Score>{
    private String name;
    private int score;

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public int getScore() {
        return score;
    }

    public void setScore(int score) {
        this.score = score;
    }

    @Override
    public String toString() {
        return "Score [name=" + name + ", score=" + score + "]";
    }

    /** Reads the fields in the same order in which write() emits them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.name=in.readUTF();
        this.score=in.readInt();
    }

    /** Writes the fields in the order name, score. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeUTF(name);
        out.writeInt(score);
    }

    /**
     * Orders by score descending, then by name ascending.
     *
     * <p>The name tie-break matters: comparing on score alone returns 0 for
     * two different students with the same score, and MapReduce then treats
     * them as one key and groups them together. Integer.compare replaces
     * subtraction, which can overflow and report the wrong order.
     *
     * @param o the score to compare against
     * @return negative, zero or positive per the Comparable contract
     */
    @Override
    public int compareTo(Score o) {
        int byScore = Integer.compare(o.score, this.score);
        if (byScore != 0) {
            return byScore;
        }
        return compareNames(this.name, o.name);
    }

    /** Null-safe string comparison; null sorts before any non-null name. */
    private static int compareNames(String a, String b) {
        if (a == b) {
            return 0;
        }
        if (a == null) {
            return -1;
        }
        if (b == null) {
            return 1;
        }
        return a.compareTo(b);
    }

    /** Equal when both name and score match, consistent with compareTo. */
    @Override
    public boolean equals(Object obj) {
        if (this == obj) {
            return true;
        }
        if (!(obj instanceof Score)) {
            return false;
        }
        Score other = (Score) obj;
        return score == other.score && compareNames(name, other.name) == 0;
    }

    /** Content-based hash so equal keys hash identically in every JVM. */
    @Override
    public int hashCode() {
        return 31 * (name == null ? 0 : name.hashCode()) + score;
    }
}
package cn.tedu.sortscore;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver for the score-sorting job: {@link SortScoreMapper} emits Score keys
 * (sorted by the framework) and {@link SortScoreReducer} writes them out as
 * (name, score). Input and output are hard-coded HDFS paths.
 */
public class SortScoreDriver {
    /**
     * Configures the job, submits it and blocks until it finishes.
     *
     * @param args command-line arguments (not used)
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "JobName");
        job.setJarByClass(SortScoreDriver.class);
        job.setMapperClass(SortScoreMapper.class);
        job.setReducerClass(SortScoreReducer.class);
        // Map output types differ from the final output types, so both are declared.
        job.setMapOutputKeyClass(Score.class);
        job.setMapOutputValueClass(NullWritable.class);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(IntWritable.class);
        // Files whose names start with "_" are treated by MapReduce as hidden
        // and are not read as input.
        FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.253.129:9000/mr/sortscore/score2/"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.253.129:9000/result/sortscore"));
        // Surface a failed job through the process exit status. The previous
        // "if (!ok) return;" did nothing, so a failed job still exited with 0.
        if (!job.waitForCompletion(true)) {
            System.exit(1);
        }
    }
}
package cn.tedu.sortscore;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class SortScoreMapper
extends Mapper<LongWritable, Text, Score, NullWritable> {
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] arr=value.toString().split("\t");
Score s=new Score();
s.setName(arr[0]);
s.setScore(Integer.parseInt(arr[1]));
context.write(s, NullWritable.get());
}
}
package cn.tedu.sortscore;
import java.io.IOException;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Reducer;
public class SortScoreReducer
extends Reducer<Score, NullWritable, Text, IntWritable> {
public void reduce(Score key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
//
context.write(new Text(key.getName()), new IntWritable(key.getScore()));
}
}
三、排序
1.在MapReduce中,自动对键进行排序
2.要求键所对应的类必须实现Comparable接口,但是考虑到键需要序列化,所以一般实现的WritableComparable
3.在排序过程中,如果compareTo方法的返回值为0,则MapReduce会认为这两个键是同一个,则将这两个键的值放到一组(去重操作),是个bug
案例:练习∶按照月份升序排序,如果月份一样,则按照利润降序排序(文件:profit2.txt )
package cn.tedu.sortprofit;
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import org.apache.hadoop.io.WritableComparable;
/**
 * Map output key holding a month, a name and a profit. It is serializable
 * (wire format: int month, UTF string name, int profit) and sortable by
 * month ascending, then profit descending.
 */
public class Profit implements WritableComparable<Profit>{
    private int month;
    private String name;
    private int profit;

    public int getMonth() {
        return month;
    }

    public void setMonth(int month) {
        this.month = month;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

    public int getProfit() {
        return profit;
    }

    public void setProfit(int profit) {
        this.profit = profit;
    }

    @Override
    public String toString() {
        return "Profit [month=" + month + ", name=" + name + ", profit=" + profit + "]";
    }

    /** Reads the fields in the same order in which write() emits them. */
    @Override
    public void readFields(DataInput in) throws IOException {
        this.month=in.readInt();
        this.name=in.readUTF();
        this.profit=in.readInt();
    }

    /** Writes the fields in the order month, name, profit. */
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeInt(month);
        out.writeUTF(name);
        out.writeInt(profit);
    }

    /**
     * Orders by month ascending; within a month, by profit descending.
     *
     * <p>Integer.compare replaces subtraction, which can overflow and report
     * the wrong order for values that are far apart.
     *
     * <p>NOTE: this method deliberately never returns 0. Returning 0 makes
     * MapReduce treat two records as the same key and group them, and a
     * reducer that writes once per group would then drop all but one of
     * them. Reporting "greater" on a tie keeps every record in its own
     * group. This knowingly breaks the Comparable contract
     * (x.compareTo(x) != 0), so do not use this class with general-purpose
     * sorted collections.
     *
     * @param o the record to compare against
     * @return a negative or positive value; never 0 (see note)
     */
    @Override
    public int compareTo(Profit o) {
        int byMonth = Integer.compare(this.month, o.month);
        if (byMonth != 0) {
            return byMonth;
        }
        int byProfit = Integer.compare(o.profit, this.profit);
        return byProfit == 0 ? 1 : byProfit;
    }
}
package cn.tedu.sortprofit;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
/**
 * Driver for the profit-sorting job: {@link ScoreProfitMapper} emits Profit
 * keys (sorted by the framework) and {@link ScoreProfitReducer} writes them
 * out. Input and output are hard-coded HDFS paths.
 */
public class ScoreProfitDriver {
    /**
     * Configures the job, submits it and blocks until it finishes.
     *
     * @param args command-line arguments (not used)
     * @throws Exception if the job cannot be configured or submitted
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf, "JobName");
        job.setJarByClass(ScoreProfitDriver.class);
        job.setMapperClass(ScoreProfitMapper.class);
        job.setReducerClass(ScoreProfitReducer.class);
        // Mapper and reducer emit the same types, so only the output types are declared.
        job.setOutputKeyClass(Profit.class);
        job.setOutputValueClass(NullWritable.class);
        FileInputFormat.setInputPaths(job, new Path("hdfs://192.168.253.129:9000/mr/profit2/profit2.txt"));
        FileOutputFormat.setOutputPath(job, new Path("hdfs://192.168.253.129:9000/result/profit2"));
        // Surface a failed job through the process exit status. The previous
        // "if (!ok) return;" did nothing, so a failed job still exited with 0.
        if (!job.waitForCompletion(true)) {
            System.exit(1);
        }
    }
}
package cn.tedu.sortprofit;
import java.io.IOException;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Mapper;
public class ScoreProfitMapper extends Mapper<LongWritable, Text, Profit, NullWritable> {
public void map(LongWritable key, Text value, Context context) throws IOException, InterruptedException {
String[] arr=value.toString().split(" ");
Profit p=new Profit();
p.setMonth(Integer.parseInt(arr[0]));
p.setName(arr[1]);
p.setProfit(Integer.parseInt(arr[2]));
context.write(p, NullWritable.get());
}
}
package cn.tedu.sortprofit;
import java.io.IOException;
import org.apache.hadoop.io.NullWritable;
import org.apache.hadoop.mapreduce.Reducer;
public class ScoreProfitReducer
extends Reducer<Profit, NullWritable, Profit, NullWritable> {
public void reduce(Profit key, Iterable<NullWritable> values, Context context) throws IOException, InterruptedException {
context.write(key, NullWritable.get());
}
}
四、合并-Combine
1.大多数情况下,MapTask的数量要远远多于ReduceTask的数量,导致计算压力几乎全部落在了ReduceTask上,ReduceTask的计算效率就成为整个MapReduce的瓶颈
2.合并的逻辑和Reducer的逻辑是一样的-只需要在Driver中添加job.setCombinerClass(Reducer.class);
3.合并的特点:减少数据总量但是不改变计算结果
4.如果进行汇总、获取最值、去重之类操作可以使用Combiner,但是例如求平均之类的操作不能使用Combiner
求平均不适合Combine
细节
一、数据本地化策略
1.当JobTracker收到MR程序的时候,会访问NameNode获取文件信息。文件
信息包含文件大小以及块信息
2.JobTracker对这个文件进行切片处理。注意:切片是逻辑切分不是物理切分。切片数量决定了MapTask的数量。默认情况下,Split和Block是等大的
3.JobTracker会将划分出来的MapTask分配到TaskTracker上执行
4.因为MapTask在执行过程中需要读取数据,而数据在DataNode上,所以将
DataNode和TaskTracker部署在相同的节点上以减少跨节点的网络传输
5.为了减少网络资源的消耗,在分配任务的时候会考虑Block的位置。哪个节点
上有要处理的数据,将任务分配给哪个节点,这个过程称之为数据本地化
6.切片产生过程︰
a.如果文件为空,则整个文件作为一个切片处理
b.在MapReduce中,文件要区分可切或者不可切,例如绝大部分的压缩文件就是不可切的
c.如果文件不可切,则整个文件作为一个切片处理
d.如果需要减小splitsize,需要调小maxsize;如果需要调大splitsize ,需要调大minsize
e.在计算切片的时候,需要考虑切片阈值– SPLIT_SLOP =1.1
二、MR执行流程
1.准备阶段︰
a.检查输入和输出路径
b.计算切片数量
c.如果有必要,设置缓存存根
d.将jar包和配置上传到HDFS上
e.将任务提交给JobTracker,并且可以选择是否监控这个任务
2.执行阶段∶
a. JobTracker收到Job任务之后,会将这个任务进行拆分,拆分成MapTask
和ReduceTask。MapTask的数量由切片决定;ReduceTask的数量由分区数量决定
b. JobTracker在拆分完成任务之后,会等待TaskTracker的心跳,然后给TaskTracker分配任务。分配任务的时候,MapTask尽量满足数据本地化策略,ReduceTask无法满足数据本地化,所以ReduceTask在分配的时候是考虑节点的空闲
c. TaskTracker通过心跳领取任务,领取到任务之后,会去对应节点上下载jar包,这一步体现的思想是"逻辑移动,数据固定"(移动计算而不是移动数据)
d. TaskTracker会在本节点上开启JVM子进程执行MapTask或者ReduceTask。注意:每一个MapTask或者ReduceTask的执行都会开启一次JVM子进程
shuffle
一、Map端的shuffle
1.map方法在处理完成数据之后会将结果写出到MapTask自带的缓冲区中-每一个MapTask自带一个缓冲区- MapOutputCollector
2.数据在缓冲区中进行分区、排序,如果指定了Combiner,那么数据在缓冲区中还会进行combine。注意:在缓冲区中的排序是将完全无序的数据整理成有序数据,采取的是快速排序
3.缓冲区是维系在内存中,默认是100M
4.当缓冲区的使用达到一定限度(溢写阈值:0.8)的时候,会将缓冲区中的数
据溢写(spill)到磁盘上,map方法后续产生的结果会继续写到缓冲区中
5.每一次溢写都会产生一个新的溢写文件-单个溢写文件中的数据是分区且有序的,所有的溢写文件是局部有序的
6.在map方法完成之后,将所有的溢写文件进行合并(merge),将所有的溢写文件合并成一个结果文件(final out),在merge过程中,数据会再次进行分区排序- final out是整体分区且有序的.merge过程中的排序是将局部有序变成整体有序,所以采取的是归并排序。
7.如果map方法执行完成之后,缓冲区中依然有数据,则会直接合并到最后的final out中
8.在merge过程中,如果spill文件个数>=3并且指定了Combiner,则在merge的时候会再进行一次combine
9.注意问题:
a. spill过程不一定产生
b.默认情况下,溢写文件的大小不一定是80M,考虑序列化因素(可能只序列化一部分内容)
c.缓冲区本质上是一个环形的字节数组,设置为环形的目的是为了避免寻址,能够重复利用缓冲区
d.阈值的作用是为了减少写入的阻塞
二、Reduce端的shuffle
1.ReduceTask启动多个fetch线程去MapTask处抓取对应分区的数据
2.ReduceTask将从每一个MapTask上抓取过来的数据存储在一个本地文件中
3.将抓取来数据进行一次merge,合并成一个大文件,在merge过程中,会再次进行排序,采用的是归并排序
4. merge完成之后,ReduceTask会再将相同的键对应的值放到一个迭代器中,这个过程称之为分组(group)
5.分组完成之后,每一个键对应一个迭代器,每一个键调用一次reduce方法
6.注意问题:
a. ReduceTask的启动阈值:0.05-当5%的MapTask结束,就会启动ReduceTask去抓取数据
b. fetch线程通过HTTP请求获取数据
c. fetch线程的数量默认为5
d. merge因子:10 -每10个小文件合并成1个大文件
搭建Hadoop需要的服务器类型:倾向于内存比较好的服务器