FairScheduler是由Facebook公司提出的,目的是解决Facebook需要同时处理生产型作业(数据分析、Hive)、大型批处理作业(数据挖掘、机器学习)和小型交互作业(Hive查询)的问题。不同用户提交的作业在计算时间、存储空间、数据流量和响应时间上都有不同的需求,为了让Hadoop MapReduce框架在这种情况下仍能应对多种类型作业的并行执行,并使用户获得良好的体验,Facebook提出了该调度算法。
负载均衡器: mapred.fairscheduler.loadmanager;任务选择器: mapred.fairscheduler.taskselector;权重调整器: mapred.fairscheduler.weightadjuster。另外,FairScheduler的属性assignMultiple被用来控制给一个TaskTracker节点分配任务的数量,如果该值配置为true,则最多可以给一个TaskTracker节点分配一个Map任务和一个Reduce任务,否则最多只为其分配一个Map任务或者一个Reduce任务。该值通过 mapred.fairscheduler.assignmultiple 来设置;属性sizeBasedWeight被用来决定在更新作业的权重时是否应该考虑该作业尚未完成任务的大小,可通过 mapred.fairscheduler.sizebasedweight 来设置。
private double calculateRawWeight(JobInProgress job, TaskType taskType) { if (!isRunnable(job)) {//作业是否在当前可调度的作业集中 return 0; } else { double weight = 1.0; if (sizeBasedWeight) { // 作业还未完成的任务数量 weight = Math.log1p(runnableTasks(job, taskType)) / Math.log(2); } weight *= getPriorityFactor(job.getPriority());//作业的优先级 if (weightAdjuster != null) { //用户来调整作业的权重 weight = weightAdjuster.adjustWeight(job, taskType, weight); } return weight; } }刚才说过,FairScheduler会先基于FIFO的策略从User和Pool的限制层面上选择一批作业作为当前可调度作业集,这里的User限制是指在这个可调度作业集中属于该User的作业数量不能超过他的上限, Pool限制则指在这个可调度作业集中属于该Pool的作业数量不能超过它的上限,各个User、Pool的限制都保存在PoolManager中,而PoolManager是通过加载配置文件来得到这些限制信息的。而这个配置文件的路径又可以通过JobTracker节点的配置文件来设置,对应的配置项为:mapred.fairscheduler.allocation.file,同时在这个配置文件中,还可以指定一个Poll至少可分得集群中多少Map/Reduce计算资源。关于如何配置User、Pool的限制及Pool的计算资源,感兴趣的同学可以参考Hadoop的官网。另外,对于如何指定一个作业属于哪一个Pool(请注意,这里的Pool不同于作业所属的队列,但可以通过配置让Pool等价于作业队列),可以通过作业的配置文件来执行,对应的配置项名则又是由JobTracker节点的配置文件中的mapred.fairscheduler.poolnameproperty项所决定。
//应该分得计算资源(*FairShare)与实际得到的资源(running*s)之间的差乘以处于这种"不公平"状态的时间timeDelta private void updateDeficits(long timeDelta) { for (JobInfo info: infos.values()) { info.mapDeficit += (info.mapFairShare - info.runningMaps) * timeDelta; info.reduceDeficit += (info.reduceFairShare - info.runningReduces) * timeDelta; } }每一个作业池都配置有固定的计算资源(如果在配置文件中没有明确配置,那么该Poo的计算资源就默认为0),因此就需要将该作业池的计算资源分配给该Pool中的当前可调度作业。而Pool中的每一个可调度作业到底要被分配多少个计算资源主要依赖于该作业的全局权重。这种基于Pool的计算资源和作业的全局权重来分配计算资源的方法如下:
private void updateMinSlots() { //Clear old minSlots for (JobInfo info: infos.values()) { info.minMaps = 0; info.minReduces = 0; } // 为每一个Pool中的可调度作业分配计算资源. PoolManager poolMgr = getPoolManager(); for (Pool pool: poolMgr.getPools()) { for (final TaskType type: TaskType.values()) { Set<JobInProgress> jobs = new HashSet<JobInProgress>(pool.getJobs()); //该Pool的计算资源总量 int slotsLeft = poolMgr.getAllocation(pool.getName(), type); //给该Pool中所有可调度的作业分配计算资源 while (slotsLeft > 0) { // Figure out total weight of jobs that still need slots double totalWeight = 0; for (Iterator<JobInProgress> it = jobs.iterator(); it.hasNext();) { JobInProgress job = it.next(); //选择该Pool中还需要计算资源的可调度作业并统计它们的权重和 if (isRunnable(job) && runnableTasks(job, type) > minTasks(job, type)) { totalWeight += weight(job, type); } else { it.remove(); } } if (totalWeight == 0) break; //对于还需要计算资源的可调度作业,根据它们的权重比重把该Pool中剩余的计算资源分配给他们 int oldSlots = slotsLeft; for (JobInProgress job: jobs) { double weight = weight(job, type); int share = (int) Math.floor(oldSlots * weight / totalWeight); slotsLeft = giveMinSlots(job, type, slotsLeft, share); } if (slotsLeft == oldSlots) { // No tasks were assigned; do another pass using ceil, giving the // extra slots to jobs in order of weight then deficit List<JobInProgress> sortedJobs = new ArrayList<JobInProgress>(jobs); Collections.sort(sortedJobs, new Comparator<JobInProgress>() { public int compare(JobInProgress j1, JobInProgress j2) { double dif = weight(j2, type) - weight(j1, type); if (dif == 0) // Weights are equal, compare by deficit dif = deficit(j2, type) - deficit(j1, type); return (int) Math.signum(dif); } }); for (JobInProgress job: sortedJobs) { double weight = weight(job, type); int share = (int) Math.ceil(oldSlots * weight / totalWeight); slotsLeft = giveMinSlots(job, type, slotsLeft, share); } if (slotsLeft > 0) { LOG.warn("Had slotsLeft = " + slotsLeft + " after the final loop in updateMinSlots. 
This probably means some fair scheduler weights are being set to NaN or Infinity."); } break; } }//while }//for }//for }对于通过集群的计算资源和所有可调度作业的全局权重来最终确定该作业的应该分配的公平份额的算法,笔者认为这个算法可能存在某些问题而导致很难理解,所以本文不会详细讨论,有知道的博友可以@我。该算法的实现源码如下:
private void updateFairShares(ClusterStatus clusterStatus) { // Clear old fairShares for (JobInfo info: infos.values()) { info.mapFairShare = 0; info.reduceFairShare = 0; } // 计算每一个可调度作业应该分得的每类计算资源. for (TaskType type: TaskType.values()) { //选择未完成并且可调度的作业 HashSet<JobInfo> jobsLeft = new HashSet<JobInfo>(); for (Entry<JobInProgress, JobInfo> entry: infos.entrySet()) { JobInProgress job = entry.getKey(); JobInfo info = entry.getValue(); if (isRunnable(job) && runnableTasks(job, type) > 0) { jobsLeft.add(info); } } //获取整个集群的计算资源 double slotsLeft = getTotalSlots(type, clusterStatus); //计算每一个未完成并且可调度作业应该分得的某一类计算资源 while (!jobsLeft.isEmpty()) { double totalWeight = 0; //统计所有未完成并可调度作业的权重和 for (JobInfo info: jobsLeft) { double weight = (type == TaskType.MAP ? info.mapWeight : info.reduceWeight); totalWeight += weight; } boolean recomputeSlots = false; double oldSlots = slotsLeft; // Copy slotsLeft so we can modify it for (Iterator<JobInfo> iter = jobsLeft.iterator(); iter.hasNext();) { JobInfo info = iter.next(); double minSlots = (type == TaskType.MAP ? info.minMaps : info.minReduces); double weight = (type == TaskType.MAP ? info.mapWeight : info.reduceWeight); //基于公平性计算该作业应该分配的计算资源 double fairShare = weight / totalWeight * oldSlots; //对于以Pool的计算为准来更新作业的公平资源配额 if (minSlots > fairShare) { if (type == TaskType.MAP) info.mapFairShare = minSlots; else info.reduceFairShare = minSlots; slotsLeft -= minSlots; iter.remove(); recomputeSlots = true; } }//for if (!recomputeSlots) { // All minimums are met. Give each job its fair share of excess slots. for (JobInfo info: jobsLeft) { double weight = (type == TaskType.MAP ? info.mapWeight : info.reduceWeight); double fairShare = weight / totalWeight * oldSlots; if (type == TaskType.MAP) info.mapFairShare = fairShare; else info.reduceFairShare = fairShare; } break; } }//while } }
private class DeficitComparator implements Comparator<JobInProgress> {
private final TaskType taskType;
private DeficitComparator(TaskType taskType) {
this.taskType = taskType;
public int compare(JobInProgress j1, JobInProgress j2) {
JobInfo j1Info = infos.get(j1);
JobInfo j2Info = infos.get(j2);
long deficitDif;
boolean j1Needy, j2Needy;
if (taskType == TaskType.MAP) {
j1Needy = j1.runningMaps() < Math.floor(j1Info.minMaps);
j2Needy = j2.runningMaps() < Math.floor(j2Info.minMaps);
deficitDif = j2Info.mapDeficit - j1Info.mapDeficit;
} else {
j1Needy = j1.runningReduces() < Math.floor(j1Info.minReduces);
j2Needy = j2.runningReduces() < Math.floor(j2Info.minReduces);
deficitDif = j2Info.reduceDeficit - j1Info.reduceDeficit;
if (j1Needy && !j2Needy)
return -1;
else if (j2Needy && !j1Needy)
return 1;
else // Both needy or both non-needy; compare by deficit
return (int) Math.signum(deficitDif);