CapacityScheduler#nodeUpdate()方法
/**
 * Processes an update for the given node and, unless scheduling is done
 * asynchronously, immediately attempts to allocate containers on that node.
 *
 * <p>The elapsed time of the whole call is reported to
 * {@link CapacitySchedulerMetrics} via {@code addNodeUpdate}.
 *
 * @param rmNode the node this update is for
 */
protected void nodeUpdate(RMNode rmNode) {
  long begin = System.nanoTime();

  // Acquire the lock BEFORE entering the try block: if lock() itself fails,
  // the finally clause must not call unlock() on a lock we never obtained.
  readLock.lock();
  try {
    setLastNodeUpdateTime(Time.now());
    super.nodeUpdate(rmNode);
  } finally {
    readLock.unlock();
  }

  // Try to do scheduling
  if (!scheduleAsynchronously) {
    // Same lock-then-try ordering as above, for the same reason.
    writeLock.lock();
    try {
      ActivitiesLogger.NODE.startNodeUpdateRecording(activitiesManager,
          rmNode.getNodeID());

      // reset allocation and reservation stats before we start doing any
      // work
      updateSchedulerHealth(lastNodeUpdateTime, rmNode.getNodeID(),
          CSAssignment.NULL_ASSIGNMENT);

      allocateContainersToNode(rmNode.getNodeID(), true);

      ActivitiesLogger.NODE.finishNodeUpdateRecording(activitiesManager,
          rmNode.getNodeID());
    } finally {
      writeLock.unlock();
    }
  }

  long latency = System.nanoTime() - begin;
  CapacitySchedulerMetrics.getMetrics().addNodeUpdate(latency);
}
CapacityScheduler#allocateContainersToNode()方法
/**
 * Runs allocation for the node identified by {@code nodeId}. Nothing happens
 * when {@link #getNode} does not return a node for that id. Otherwise a
 * {@link CandidateNodeSet} is built for the node and handed to the
 * candidate-based overload.
 *
 * <p>When the pass is driven by a node heartbeat and the first attempt
 * returned an assignment, further attempts are made on the same node for as
 * long as {@code canAllocateMore} allows.
 *
 * @param nodeId id of the node to allocate on
 * @param withNodeHeartbeat whether this pass was triggered by a node heartbeat
 */
private void allocateContainersToNode(NodeId nodeId,
    boolean withNodeHeartbeat) {
  FiCaSchedulerNode targetNode = getNode(nodeId);
  if (targetNode == null) {
    return;
  }

  CandidateNodeSet candidates = getCandidateNodeSet(targetNode);
  CSAssignment assignment =
      allocateContainersToNode(candidates, withNodeHeartbeat);

  // Only check if we can allocate more container on the same node when
  // scheduling is triggered by node heartbeat
  if (assignment == null || !withNodeHeartbeat) {
    return;
  }

  // Tally what the first attempt produced.
  int offswitchCount = 0;
  int assignedContainers = 0;
  if (assignment.getType() == NodeType.OFF_SWITCH) {
    offswitchCount++;
  }
  if (Resources.greaterThan(calculator, getClusterResource(),
      assignment.getResource(), Resources.none())) {
    assignedContainers++;
  }

  // Try to see if it is possible to allocate multiple container for the
  // same node heartbeat
  while (canAllocateMore(assignment, offswitchCount, assignedContainers)) {
    assignment = allocateContainersToNode(candidates, true);
    if (assignment != null) {
      if (assignment.getType() == NodeType.OFF_SWITCH) {
        offswitchCount++;
      }
      if (Resources.greaterThan(calculator, getClusterResource(),
          assignment.getResource(), Resources.none())) {
        assignedContainers++;
      }
    }
  }

  if (offswitchCount >= offswitchPerHeartbeatLimit && LOG.isDebugEnabled()) {
    LOG.debug("Assigned maximum number of off-switch containers: "
        + offswitchCount + ", assignments so far: " + assignment);
  }
}
CapacityScheduler#allocateContainersToNode()方法
/**
 * Allocates containers for the given candidate nodes, using either the
 * single-node or the multi-node code path depending on
 * {@code multiNodePlacementEnabled}.
 *
 * @param candidates the candidate nodes to allocate on
 * @param withNodeHeartbeat whether this pass was triggered by a node heartbeat
 * @return the resulting assignment; {@code null} when work-preserving
 *         recovery is enabled and the scheduler is not yet ready to allocate,
 *         or when the chosen allocation path returns {@code null}
 */
CSAssignment allocateContainersToNode(
    CandidateNodeSet candidates,
    boolean withNodeHeartbeat) {
  boolean notReadyAfterRecovery = rmContext.isWorkPreservingRecoveryEnabled()
      && !rmContext.isSchedulerReadyForAllocatingContainers();
  if (notReadyAfterRecovery) {
    return null;
  }

  long startTime = System.nanoTime();

  // Backward compatible way to make sure previous behavior which allocation
  // driven by node heartbeat works.
  FiCaSchedulerNode node = CandidateNodeSetUtils.getSingleNode(candidates);

  // Single-node and multi-node placement are handled by different logic.
  CSAssignment assignment = multiNodePlacementEnabled
      ? allocateContainersOnMultiNodes(candidates)
      : allocateContainerOnSingleNode(candidates, node, withNodeHeartbeat);

  // Only passes that actually allocated something feed the allocate metric.
  boolean allocatedSomething = assignment != null
      && assignment.getAssignmentInformation() != null
      && assignment.getAssignmentInformation().getNumAllocations() > 0;
  if (allocatedSomething) {
    long allocateTime = System.nanoTime() - startTime;
    CapacitySchedulerMetrics.getMetrics().addAllocate(allocateTime);
  }
  return assignment;
}
CapacityScheduler#allocateContainersOnMultiNodes()方法
/**
 * New behavior, allocate containers considering multiple nodes.
 *
 * <p>Scheduling is skipped when the root queue's used capacity for the
 * candidates' partition is at or above 1.0 and the preemption manager reports
 * no killable resource for the root queue in that partition.
 *
 * @param candidates the candidate nodes to allocate on
 * @return the assignment produced by {@code allocateOrReserveNewContainers},
 *         or {@code null} when the partition has neither available nor
 *         killable resource
 */
private CSAssignment allocateContainersOnMultiNodes(
    CandidateNodeSet candidates) {
  // When this time look at multiple nodes, try schedule if the
  // partition has any available resource or killable resource
  boolean partitionFullyUsed = getRootQueue().getQueueCapacities()
      .getUsedCapacity(candidates.getPartition()) >= 1.0f;

  // Compare by value rather than by reference, so a zero-valued Resource that
  // is not the shared Resources.none() instance is still treated as "nothing
  // killable". Calling equals() on the constant also tolerates a null result.
  // The killable lookup stays short-circuited behind the capacity check.
  if (partitionFullyUsed
      && Resources.none().equals(preemptionManager.getKillableResource(
          CapacitySchedulerConfiguration.ROOT, candidates.getPartition()))) {
    if (LOG.isDebugEnabled()) {
      // Message previously read "available orkillable" (missing space).
      LOG.debug("This node or this node partition doesn't have available or "
          + "killable resource");
    }
    return null;
  }

  return allocateOrReserveNewContainers(candidates, false);
}
CapacityScheduler#allocateOrReserveNewContainers()方法
/**
 * Asks the root queue to assign containers for the candidate nodes, first
 * respecting partition exclusivity and, if that yields nothing and the
 * partition permits it, a second time ignoring partition exclusivity.
 *
 * <p>Each assignment obtained from the root queue is tagged with the
 * scheduling mode used and submitted via {@code submitResourceCommitRequest}.
 *
 * @param candidates the candidate nodes to allocate on
 * @param withNodeHeartbeat when {@code true}, scheduler health is updated
 *        after a successful exclusive-mode assignment
 * @return the assignment that was made; {@code null} when the exclusive pass
 *         assigned nothing and the partition is the default one, is
 *         exclusive, or its exclusivity could not be determined
 */
private CSAssignment allocateOrReserveNewContainers(
    CandidateNodeSet candidates,
    boolean withNodeHeartbeat) {
  // First pass: RESPECT_PARTITION_EXCLUSIVITY, limited by the resource of
  // the candidates' own partition.
  CSAssignment exclusiveResult = getRootQueue().assignContainers(
      getClusterResource(), candidates,
      new ResourceLimits(labelManager.getResourceByLabel(
          candidates.getPartition(), getClusterResource())),
      SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
  exclusiveResult.setSchedulingMode(
      SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
  submitResourceCommitRequest(getClusterResource(), exclusiveResult);

  boolean exclusiveAssigned = Resources.greaterThan(calculator,
      getClusterResource(), exclusiveResult.getResource(), Resources.none());
  if (exclusiveAssigned) {
    FiCaSchedulerNode node = CandidateNodeSetUtils.getSingleNode(candidates);
    NodeId nodeId = (node != null) ? node.getNodeID() : null;
    if (withNodeHeartbeat) {
      updateSchedulerHealth(lastNodeUpdateTime, nodeId, exclusiveResult);
    }
    return exclusiveResult;
  }

  // Only do non-exclusive allocation when node has node-labels.
  boolean defaultPartition = StringUtils.equals(candidates.getPartition(),
      RMNodeLabelsManager.NO_LABEL);
  if (defaultPartition) {
    return null;
  }

  // Only do non-exclusive allocation when the node-label supports that
  try {
    boolean exclusiveLabel = rmContext.getNodeLabelManager()
        .isExclusiveNodeLabel(candidates.getPartition());
    if (exclusiveLabel) {
      return null;
    }
  } catch (IOException e) {
    LOG.warn(
        "Exception when trying to get exclusivity of node label=" + candidates
            .getPartition(), e);
    return null;
  }

  // Second pass: try to use NON_EXCLUSIVE
  CSAssignment nonExclusiveResult = getRootQueue().assignContainers(
      getClusterResource(), candidates,
      // TODO, now we only consider limits for parent for non-labeled
      // resources, should consider labeled resources as well.
      new ResourceLimits(labelManager.getResourceByLabel(
          RMNodeLabelsManager.NO_LABEL, getClusterResource())),
      SchedulingMode.IGNORE_PARTITION_EXCLUSIVITY);
  nonExclusiveResult.setSchedulingMode(
      SchedulingMode.IGNORE_PARTITION_EXCLUSIVITY);
  submitResourceCommitRequest(getClusterResource(), nonExclusiveResult);

  return nonExclusiveResult;
}
ParentQueue#assignContainers()方法
/**
 * Tries to assign containers under this parent queue by delegating to its
 * child queues, and records the outcome with {@code ActivitiesLogger}.
 *
 * <p>Returns {@code CSAssignment.NULL_ASSIGNMENT} early when the scheduling
 * mode is RESPECT_PARTITION_EXCLUSIVITY and this queue cannot access the
 * candidates' partition, or when this queue has no pending resource request
 * for that partition. Otherwise a fresh {@code CSAssignment} is created and
 * populated from whatever the child queues assigned.
 *
 * @param clusterResource the cluster resource passed through to limit checks
 *        and to the child queues
 * @param candidates the candidate nodes to allocate on
 * @param resourceLimits limits applied to this queue for this pass
 * @param schedulingMode whether partition exclusivity is respected or ignored
 * @return the accumulated assignment, or {@code NULL_ASSIGNMENT} on the early
 *         exits described above
 */
public CSAssignment assignContainers(Resource clusterResource,
CandidateNodeSet candidates,
ResourceLimits resourceLimits, SchedulingMode schedulingMode) {
// Single node of the candidate set; used below only for activity logging,
// the canAssign() check and the node != null test.
FiCaSchedulerNode node = CandidateNodeSetUtils.getSingleNode(candidates);
// if our queue cannot access this node, just return
if (schedulingMode == SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY
&& !accessibleToPartition(candidates.getPartition())) {
if (LOG.isDebugEnabled()) {
long now = System.currentTimeMillis();
// Do logging every 1 sec to avoid excessive logging.
if (now - this.lastSkipQueueDebugLoggingTimestamp > 1000) {
LOG.debug("Skip this queue=" + getQueuePath()
+ ", because it is not able to access partition=" + candidates
.getPartition());
this.lastSkipQueueDebugLoggingTimestamp = now;
}
}
ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
getParentName(), getQueueName(), ActivityState.REJECTED,
ActivityDiagnosticConstant.NOT_ABLE_TO_ACCESS_PARTITION
+ candidates.getPartition());
// Only the root queue closes the node-level activity record.
if (rootQueue) {
ActivitiesLogger.NODE.finishSkippedNodeAllocation(activitiesManager,
node);
}
return CSAssignment.NULL_ASSIGNMENT;
}
// Check if this queue need more resource, simply skip allocation if this
// queue doesn't need more resources.
if (!super.hasPendingResourceRequest(candidates.getPartition(),
clusterResource, schedulingMode)) {
if (LOG.isDebugEnabled()) {
long now = System.currentTimeMillis();
// Do logging every 1 sec to avoid excessive logging.
// (Shares lastSkipQueueDebugLoggingTimestamp with the check above.)
if (now - this.lastSkipQueueDebugLoggingTimestamp > 1000) {
LOG.debug("Skip this queue=" + getQueuePath()
+ ", because it doesn't need more resource, schedulingMode="
+ schedulingMode.name() + " node-partition=" + candidates
.getPartition());
this.lastSkipQueueDebugLoggingTimestamp = now;
}
}
ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
getParentName(), getQueueName(), ActivityState.SKIPPED,
ActivityDiagnosticConstant.QUEUE_DO_NOT_NEED_MORE_RESOURCE);
if (rootQueue) {
ActivitiesLogger.NODE.finishSkippedNodeAllocation(activitiesManager,
node);
}
return CSAssignment.NULL_ASSIGNMENT;
}
// Accumulator for this pass; starts with a zero resource and NODE_LOCAL
// type, the type is overwritten from the child's result below.
CSAssignment assignment = new CSAssignment(Resources.createResource(0, 0),
NodeType.NODE_LOCAL);
// NOTE: every path through the loop body ends in a break, so the body runs
// at most once per call; the loop acts as a guarded block.
while (canAssign(clusterResource, node)) {
if (LOG.isDebugEnabled()) {
LOG.debug("Trying to assign containers to child-queue of "
+ getQueueName());
}
// Are we over maximum-capacity for this queue?
// This will also consider parent's limits and also continuous reservation
// looking
if (!super.canAssignToThisQueue(clusterResource,
candidates.getPartition(),
resourceLimits, Resources
.createResource(getMetrics().getReservedMB(),
getMetrics().getReservedVirtualCores()), schedulingMode)) {
ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
getParentName(), getQueueName(), ActivityState.SKIPPED,
ActivityDiagnosticConstant.QUEUE_MAX_CAPACITY_LIMIT);
if (rootQueue) {
ActivitiesLogger.NODE.finishSkippedNodeAllocation(activitiesManager,
node);
}
break;
}
// Schedule
CSAssignment assignedToChild = assignContainersToChildQueues(
clusterResource, candidates, resourceLimits, schedulingMode);
// These attributes are copied from the child's result whether or not the
// child actually assigned any resource.
assignment.setType(assignedToChild.getType());
assignment.setRequestLocalityType(
assignedToChild.getRequestLocalityType());
assignment.setExcessReservation(assignedToChild.getExcessReservation());
assignment.setContainersToKill(assignedToChild.getContainersToKill());
// A child assigned something when its resource is greater than none;
// otherwise (else branch further down) we are done.
if (Resources.greaterThan(resourceCalculator, clusterResource,
assignedToChild.getResource(), Resources.none())) {
ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
getParentName(), getQueueName(), ActivityState.ACCEPTED,
ActivityDiagnosticConstant.EMPTY);
// The result counts as a reservation when the child reported a non-empty
// list of reservation details.
boolean isReserved =
assignedToChild.getAssignmentInformation().getReservationDetails()
!= null && !assignedToChild.getAssignmentInformation()
.getReservationDetails().isEmpty();
if (node != null && !isReserved) {
if (rootQueue) {
ActivitiesLogger.NODE.finishAllocatedNodeAllocation(
activitiesManager, node,
assignedToChild.getAssignmentInformation()
.getFirstAllocatedOrReservedContainerId(),
AllocationState.ALLOCATED);
}
} else{
// Reached when the result is a reservation OR when node is null; both
// cases are recorded as RESERVED.
if (rootQueue) {
ActivitiesLogger.NODE.finishAllocatedNodeAllocation(
activitiesManager, node,
assignedToChild.getAssignmentInformation()
.getFirstAllocatedOrReservedContainerId(),
AllocationState.RESERVED);
}
}
// Track resource utilization in this pass of the scheduler
Resources.addTo(assignment.getResource(),
assignedToChild.getResource());
Resources.addTo(assignment.getAssignmentInformation().getAllocated(),
assignedToChild.getAssignmentInformation().getAllocated());
Resources.addTo(assignment.getAssignmentInformation().getReserved(),
assignedToChild.getAssignmentInformation().getReserved());
assignment.getAssignmentInformation().incrAllocations(
assignedToChild.getAssignmentInformation().getNumAllocations());
assignment.getAssignmentInformation().incrReservations(
assignedToChild.getAssignmentInformation().getNumReservations());
assignment.getAssignmentInformation().getAllocationDetails().addAll(
assignedToChild.getAssignmentInformation()
.getAllocationDetails());
assignment.getAssignmentInformation().getReservationDetails().addAll(
assignedToChild.getAssignmentInformation()
.getReservationDetails());
assignment.setIncreasedAllocation(
assignedToChild.isIncreasedAllocation());
if (LOG.isDebugEnabled()) {
LOG.debug("assignedContainer reserved=" + isReserved + " queue="
+ getQueueName() + " usedCapacity=" + getUsedCapacity()
+ " absoluteUsedCapacity=" + getAbsoluteUsedCapacity() + " used="
+ queueUsage.getUsed() + " cluster=" + clusterResource);
LOG.debug(
"ParentQ=" + getQueueName() + " assignedSoFarInThisIteration="
+ assignment.getResource() + " usedCapacity="
+ getUsedCapacity() + " absoluteUsedCapacity="
+ getAbsoluteUsedCapacity());
}
} else{
// No child assigned anything: propagate why the child skipped, record
// the skip, and stop.
assignment.setSkippedType(assignedToChild.getSkippedType());
ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
getParentName(), getQueueName(), ActivityState.SKIPPED,
ActivityDiagnosticConstant.EMPTY);
if (rootQueue) {
ActivitiesLogger.NODE.finishSkippedNodeAllocation(activitiesManager,
node);
}
break;
}
/*
* Previously here, we can allocate more than one container for each
* allocation under rootQ. Now this logic is not proper any more
* in global scheduling world.
*
* So here do not try to allocate more than one container for each
* allocation, let top scheduler make the decision.
*/
break;
}
return assignment;
}
ParentQueue#assignContainersToChildQueues()方法
/**
 * Offers the candidate nodes to the child queues, in the order produced by
 * {@code sortAndGetChildrenAllocationIterator}, until one of them assigns a
 * resource greater than none.
 *
 * <p>When a child is skipped with {@code SkippedType.QUEUE_LIMIT}, its
 * headroom is added to the blocked headroom of {@code limits} before the next
 * child is tried.
 *
 * @param cluster the cluster resource
 * @param candidates the candidate nodes to allocate on
 * @param limits this queue's limits; its blocked headroom may be increased
 * @param schedulingMode whether partition exclusivity is respected or ignored
 * @return the first child assignment with a non-empty resource; otherwise the
 *         first QUEUE_LIMIT-skipped child assignment if there was one;
 *         otherwise {@code CSAssignment.NULL_ASSIGNMENT}
 */
private CSAssignment assignContainersToChildQueues(Resource cluster,
    CandidateNodeSet candidates, ResourceLimits limits,
    SchedulingMode schedulingMode) {
  CSAssignment assignment = CSAssignment.NULL_ASSIGNMENT;

  printChildQueues();

  // Try to assign to most 'under-served' sub-queue.
  // The iterator is parameterized: with a raw Iterator, next() returns Object
  // and cannot be assigned to a CSQueue without a cast.
  for (Iterator<CSQueue> iter = sortAndGetChildrenAllocationIterator(
      candidates.getPartition()); iter.hasNext(); ) {
    CSQueue childQueue = iter.next();
    if (LOG.isDebugEnabled()) {
      LOG.debug("Trying to assign to queue: " + childQueue.getQueuePath()
          + " stats: " + childQueue);
    }

    // Get ResourceLimits of child queue before assign containers
    ResourceLimits childLimits =
        getResourceLimitsOfChild(childQueue, cluster, limits.getNetLimit(),
            candidates.getPartition());

    CSAssignment childAssignment = childQueue.assignContainers(cluster,
        candidates, childLimits, schedulingMode);
    if (LOG.isDebugEnabled()) {
      LOG.debug("Assigned to queue: " + childQueue.getQueuePath() +
          " stats: " + childQueue + " --> " +
          childAssignment.getResource() + ", " + childAssignment.getType());
    }

    if (Resources.greaterThan(
        resourceCalculator, cluster,
        childAssignment.getResource(), Resources.none())) {
      // First child that assigned something wins.
      assignment = childAssignment;
      break;
    } else if (childAssignment.getSkippedType() ==
        CSAssignment.SkippedType.QUEUE_LIMIT) {
      // Remember only the first QUEUE_LIMIT skip as the tentative result.
      if (assignment.getSkippedType() !=
          CSAssignment.SkippedType.QUEUE_LIMIT) {
        assignment = childAssignment;
      }

      // Leaf queues contribute their headroom, parent queues the blocked
      // headroom they accumulated from their own children.
      Resource blockedHeadroom = (childQueue instanceof LeafQueue)
          ? childLimits.getHeadroom()
          : childLimits.getBlockedHeadroom();

      // Clamp at zero so a negative headroom never increases the limit.
      Resource resourceToSubtract = Resources.max(resourceCalculator,
          cluster, blockedHeadroom, Resources.none());
      limits.addBlockedHeadroom(resourceToSubtract);

      if (LOG.isDebugEnabled()) {
        LOG.debug("Decrease parentLimits " + limits.getLimit() +
            " for " + this.getQueueName() + " by " +
            resourceToSubtract + " as childQueue=" +
            childQueue.getQueueName() + " is blocked");
      }
    }
  }

  return assignment;
}
LeafQueue#assignContainers()方法
/**
 * Tries to assign a container from this leaf queue to one of its
 * applications, visiting them in the order given by the ordering policy's
 * assignment iterator.
 *
 * <p>Order of checks: a reserved container is tried first; then partition
 * accessibility (under RESPECT_PARTITION_EXCLUSIVITY) and pending demand are
 * verified; then, per application, the queue limit and the user limit are
 * checked before the application is asked to assign containers.
 *
 * @param clusterResource the cluster resource
 * @param candidates the candidate nodes to allocate on
 * @param currentResourceLimits limits applied to this queue for this pass
 * @param schedulingMode whether partition exclusivity is respected or ignored
 * @return the assignment produced by the reserved-container path or by an
 *         application (either with a non-empty resource or skipped with
 *         QUEUE_LIMIT); otherwise {@code CSAssignment.NULL_ASSIGNMENT}
 */
public CSAssignment assignContainers(Resource clusterResource,
    CandidateNodeSet candidates,
    ResourceLimits currentResourceLimits, SchedulingMode schedulingMode) {
  updateCurrentResourceLimits(currentResourceLimits, clusterResource);
  FiCaSchedulerNode node = CandidateNodeSetUtils.getSingleNode(candidates);

  if (LOG.isDebugEnabled()) {
    LOG.debug("assignContainers: partition=" + candidates.getPartition()
        + " #applications=" + orderingPolicy.getNumSchedulableEntities());
  }

  setPreemptionAllowed(currentResourceLimits, candidates.getPartition());

  // Check for reserved resources, try to allocate reserved container first.
  CSAssignment assignment = allocateFromReservedContainer(clusterResource,
      candidates, currentResourceLimits, schedulingMode);
  if (null != assignment) {
    return assignment;
  }

  // if our queue cannot access this node, just return
  if (schedulingMode == SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY
      && !accessibleToPartition(candidates.getPartition())) {
    ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
        getParent().getQueueName(), getQueueName(), ActivityState.REJECTED,
        ActivityDiagnosticConstant.NOT_ABLE_TO_ACCESS_PARTITION + candidates
            .getPartition());
    return CSAssignment.NULL_ASSIGNMENT;
  }

  // Check if this queue need more resource, simply skip allocation if this
  // queue doesn't need more resources.
  if (!hasPendingResourceRequest(candidates.getPartition(), clusterResource,
      schedulingMode)) {
    if (LOG.isDebugEnabled()) {
      LOG.debug("Skip this queue=" + getQueuePath()
          + ", because it doesn't need more resource, schedulingMode="
          + schedulingMode.name() + " node-partition=" + candidates
          .getPartition());
    }
    ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
        getParent().getQueueName(), getQueueName(), ActivityState.SKIPPED,
        ActivityDiagnosticConstant.QUEUE_DO_NOT_NEED_MORE_RESOURCE);
    return CSAssignment.NULL_ASSIGNMENT;
  }

  // Per-call cache of user limits, keyed by application.getUser().
  // The map and the iterator below are parameterized: with raw types, get()
  // and next() return Object and the assignments would need casts.
  // NOTE(review): key type String assumes getUser() returns String - confirm.
  Map<String, CachedUserLimit> userLimits = new HashMap<>();
  boolean needAssignToQueueCheck = true;
  for (Iterator<FiCaSchedulerApp> assignmentIterator =
       orderingPolicy.getAssignmentIterator();
       assignmentIterator.hasNext(); ) {
    FiCaSchedulerApp application = assignmentIterator.next();

    ActivitiesLogger.APP.startAppAllocationRecording(activitiesManager,
        node, SystemClock.getInstance().getTime(), application);

    // Check queue max-capacity limit
    Resource appReserved = application.getCurrentReservation();
    if (needAssignToQueueCheck) {
      if (!super.canAssignToThisQueue(clusterResource,
          candidates.getPartition(), currentResourceLimits, appReserved,
          schedulingMode)) {
        ActivitiesLogger.APP.recordRejectedAppActivityFromLeafQueue(
            activitiesManager, node, application, application.getPriority(),
            ActivityDiagnosticConstant.QUEUE_MAX_CAPACITY_LIMIT);
        ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
            getParent().getQueueName(), getQueueName(), ActivityState.SKIPPED,
            ActivityDiagnosticConstant.EMPTY);
        return CSAssignment.NULL_ASSIGNMENT;
      }
      // If there was no reservation and canAssignToThisQueue returned
      // true, there is no reason to check further.
      if (!this.reservationsContinueLooking
          || appReserved.equals(Resources.none())) {
        needAssignToQueueCheck = false;
      }
    }

    // Reuse the user limit computed earlier in this pass for the same user.
    CachedUserLimit cul = userLimits.get(application.getUser());
    Resource cachedUserLimit = null;
    if (cul != null) {
      cachedUserLimit = cul.userLimit;
    }
    Resource userLimit = computeUserLimitAndSetHeadroom(application,
        clusterResource, candidates.getPartition(), schedulingMode,
        cachedUserLimit);
    if (cul == null) {
      cul = new CachedUserLimit(userLimit);
      userLimits.put(application.getUser(), cul);
    }

    // Check user limit
    boolean userAssignable;
    if (!cul.canAssign && Resources.fitsIn(appReserved, cul.reservation)) {
      // The user was already found over limit with a reservation at least as
      // large as this app's, so the full check is skipped.
      userAssignable = false;
    } else {
      userAssignable = canAssignToUser(clusterResource, application.getUser(),
          userLimit, application, candidates.getPartition(),
          currentResourceLimits);
      if (!userAssignable && Resources.fitsIn(cul.reservation, appReserved)) {
        cul.canAssign = false;
        cul.reservation = appReserved;
      }
    }
    if (!userAssignable) {
      application.updateAMContainerDiagnostics(AMState.ACTIVATED,
          "User capacity has reached its maximum limit.");
      ActivitiesLogger.APP.recordRejectedAppActivityFromLeafQueue(
          activitiesManager, node, application, application.getPriority(),
          ActivityDiagnosticConstant.USER_CAPACITY_MAXIMUM_LIMIT);
      continue;
    }

    // Try to schedule
    assignment = application.assignContainers(clusterResource,
        candidates, currentResourceLimits, schedulingMode, null);

    if (LOG.isDebugEnabled()) {
      LOG.debug("post-assignContainers for application " + application
          .getApplicationId());
      application.showRequests();
    }

    // Did we schedule or reserve a container?
    Resource assigned = assignment.getResource();

    if (Resources.greaterThan(resourceCalculator, clusterResource, assigned,
        Resources.none())) {
      ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
          getParent().getQueueName(), getQueueName(),
          ActivityState.ACCEPTED, ActivityDiagnosticConstant.EMPTY);
      return assignment;
    } else if (assignment.getSkippedType()
        == CSAssignment.SkippedType.OTHER) {
      // Skipped by the application itself: move on to the next application.
      ActivitiesLogger.APP.finishSkippedAppAllocationRecording(
          activitiesManager, application.getApplicationId(),
          ActivityState.SKIPPED, ActivityDiagnosticConstant.EMPTY);
      application.updateNodeInfoForAMDiagnostics(node);
    } else if (assignment.getSkippedType()
        == CSAssignment.SkippedType.QUEUE_LIMIT) {
      return assignment;
    } else {
      // If we don't allocate anything, and it is not skipped by application,
      // we will return to respect FIFO of applications
      ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
          getParent().getQueueName(), getQueueName(), ActivityState.SKIPPED,
          ActivityDiagnosticConstant.RESPECT_FIFO);
      ActivitiesLogger.APP.finishSkippedAppAllocationRecording(
          activitiesManager, application.getApplicationId(),
          ActivityState.SKIPPED, ActivityDiagnosticConstant.EMPTY);
      return CSAssignment.NULL_ASSIGNMENT;
    }
  }

  // No application produced an assignment.
  ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
      getParent().getQueueName(), getQueueName(), ActivityState.SKIPPED,
      ActivityDiagnosticConstant.EMPTY);

  return CSAssignment.NULL_ASSIGNMENT;
}
FiCaSchedulerApp#assignContainers()方法
/**
 * Delegates container assignment for this application to its
 * {@code containerAllocator}, logging the application's requests first when
 * debug logging is enabled.
 *
 * @param clusterResource the cluster resource
 * @param ps the candidate nodes to allocate on
 * @param currentResourceLimits limits applied for this pass
 * @param schedulingMode whether partition exclusivity is respected or ignored
 * @param reservedContainer a reserved container passed through to the
 *        allocator; callers in this file also pass {@code null}
 * @return whatever the container allocator returns
 */
public CSAssignment assignContainers(Resource clusterResource,
    CandidateNodeSet ps,
    ResourceLimits currentResourceLimits, SchedulingMode schedulingMode,
    RMContainer reservedContainer) {
  boolean debugEnabled = LOG.isDebugEnabled();
  if (debugEnabled) {
    LOG.debug("pre-assignContainers for application "
        + getApplicationId());
    showRequests();
  }

  // Note the allocator takes schedulingMode before the resource limits.
  CSAssignment allocatorResult = containerAllocator.assignContainers(
      clusterResource, ps, schedulingMode, currentResourceLimits,
      reservedContainer);
  return allocatorResult;
}
RegularContainerAllocator#assignContainers()方法
/**
 * Allocates for the application either from an existing reserved container
 * or, when there is none, by walking the application's scheduler keys in
 * order.
 *
 * @param clusterResource the cluster resource
 * @param candidates the candidate nodes to allocate on
 * @param schedulingMode whether partition exclusivity is respected or ignored
 * @param resourceLimits limits applied for this pass
 * @param reservedContainer the reserved container to allocate from, or
 *        {@code null} to allocate a new container
 * @return the assignment built from the allocation result, or
 *         {@code CSAssignment.SKIP_ASSIGNMENT} when the application has no
 *         pending request or every scheduler key was skipped
 */
public CSAssignment assignContainers(Resource clusterResource,
    CandidateNodeSet candidates,
    SchedulingMode schedulingMode, ResourceLimits resourceLimits,
    RMContainer reservedContainer) {
  FiCaSchedulerNode node = CandidateNodeSetUtils.getSingleNode(candidates);

  // Reserved-container path: allocate for the key the reservation was made
  // under.
  if (reservedContainer != null) {
    ContainerAllocation reservedResult =
        allocate(clusterResource, candidates, schedulingMode, resourceLimits,
            reservedContainer.getReservedSchedulerKey(), reservedContainer);
    return getCSAssignmentFromAllocateResult(clusterResource, reservedResult,
        reservedContainer, node);
  }

  // Check if application needs more resource, skip if it doesn't need more.
  boolean hasPending = application.hasPendingResourceRequest(
      candidates.getPartition(), schedulingMode);
  if (!hasPending) {
    if (LOG.isDebugEnabled()) {
      LOG.debug("Skip app_attempt=" + application.getApplicationAttemptId()
          + ", because it doesn't need more resource, schedulingMode="
          + schedulingMode.name() + " node-label=" + candidates
          .getPartition());
    }
    ActivitiesLogger.APP.recordSkippedAppActivityWithoutAllocation(
        activitiesManager, node, application, application.getPriority(),
        ActivityDiagnosticConstant.APPLICATION_DO_NOT_NEED_RESOURCE);
    return CSAssignment.SKIP_ASSIGNMENT;
  }

  // Schedule in priority order; the first key that is not PRIORITY_SKIPPED
  // decides the outcome.
  for (SchedulerRequestKey schedulerKey : application.getSchedulerKeys()) {
    ContainerAllocation keyResult = allocate(clusterResource, candidates,
        schedulingMode, resourceLimits, schedulerKey, null);
    if (keyResult.getAllocationState() != AllocationState.PRIORITY_SKIPPED) {
      return getCSAssignmentFromAllocateResult(clusterResource, keyResult,
          null, node);
    }
  }

  // We will reach here if we skipped all priorities of the app, so we will
  // skip the app.
  ActivitiesLogger.APP.recordSkippedAppActivityWithoutAllocation(
      activitiesManager, node, application, application.getPriority(),
      ActivityDiagnosticConstant.SKIPPED_ALL_PRIORITIES);
  return CSAssignment.SKIP_ASSIGNMENT;
}
CapacityScheduler的比较器
/**
 * Comparator that both looks at priority and utilization.
 *
 * <p>Ordering rules, applied in sequence for the partition read from
 * {@code partitionToLookAt}:
 * <ol>
 *   <li>a queue that can access the partition goes before one that cannot;</li>
 *   <li>a queue with positive absolute capacity goes before one with zero
 *       absolute capacity;</li>
 *   <li>otherwise queues are ordered by priority and used capacity, with
 *       configured capacity as the tie-breaker.</li>
 * </ol>
 */
private class PriorityQueueComparator implements Comparator<CSQueue> {

  /**
   * Orders two queues for allocation on the partition currently stored in
   * {@code partitionToLookAt}.
   *
   * @param q1 first queue
   * @param q2 second queue
   * @return negative when {@code q1} should be served first, positive when
   *         {@code q2} should, zero when they tie
   */
  @Override
  public int compare(CSQueue q1, CSQueue q2) {
    String p = partitionToLookAt.get();

    int rc = compareQueueAccessToPartition(q1, q2, p);
    if (0 != rc) {
      return rc;
    }

    float q1AbsCapacity = q1.getQueueCapacities().getAbsoluteCapacity(p);
    float q2AbsCapacity = q2.getQueueCapacities().getAbsoluteCapacity(p);

    //If q1's abs capacity > 0 and q2 is 0, then prioritize q1
    if (Float.compare(q1AbsCapacity, 0f) > 0 && Float.compare(q2AbsCapacity,
        0f) == 0) {
      return -1;
      //If q2's abs capacity > 0 and q1 is 0, then prioritize q2
    } else if (Float.compare(q2AbsCapacity, 0f) > 0 && Float.compare(
        q1AbsCapacity, 0f) == 0) {
      return 1;
    } else if (Float.compare(q1AbsCapacity, 0f) == 0 && Float.compare(
        q2AbsCapacity, 0f) == 0) {
      // both q1 has 0 and q2 has 0 capacity, then fall back to using
      // priority, abs used capacity to prioritize
      float used1 = q1.getQueueCapacities().getAbsoluteUsedCapacity(p);
      float used2 = q2.getQueueCapacities().getAbsoluteUsedCapacity(p);

      return compare(q1, q2, used1, used2, p);
    } else {
      // both q1 has positive abs capacity and q2 has positive abs
      // capacity
      float used1 = q1.getQueueCapacities().getUsedCapacity(p);
      float used2 = q2.getQueueCapacities().getUsedCapacity(p);

      return compare(q1, q2, used1, used2, p);
    }
  }

  /**
   * Compares two queues by priority (only when {@code respectPriority} is
   * set) and the supplied used-capacity figures, breaking ties by configured
   * capacity.
   */
  private int compare(CSQueue q1, CSQueue q2, float q1Used, float q2Used,
      String partition) {

    int p1 = 0;
    int p2 = 0;
    if (respectPriority) {
      p1 = q1.getPriority().getPriority();
      p2 = q2.getPriority().getPriority();
    }

    int rc = PriorityUtilizationQueueOrderingPolicy.compare(q1Used, q2Used,
        p1, p2);

    // For queue with same used ratio / priority, queue with higher configured
    // capacity goes first
    if (0 == rc) {
      Resource minEffRes1 =
          q1.getQueueResourceQuotas().getConfiguredMinResource(partition);
      Resource minEffRes2 =
          q2.getQueueResourceQuotas().getConfiguredMinResource(partition);
      // Configured minimum resources are only compared when both are set.
      if (!minEffRes1.equals(Resources.none()) && !minEffRes2.equals(
          Resources.none())) {
        return minEffRes2.compareTo(minEffRes1);
      }

      float abs1 = q1.getQueueCapacities().getAbsoluteCapacity(partition);
      float abs2 = q2.getQueueCapacities().getAbsoluteCapacity(partition);
      return Float.compare(abs2, abs1);
    }

    return rc;
  }

  /**
   * Orders two queues by whether they can access {@code partition}: an
   * accessible queue goes before an inaccessible one.
   */
  private int compareQueueAccessToPartition(CSQueue q1, CSQueue q2,
      String partition) {
    // Everybody has access to default partition
    if (StringUtils.equals(partition, RMNodeLabelsManager.NO_LABEL)) {
      return 0;
    }

    /*
     * Check accessible to given partition, if one queue accessible and
     * the other not, accessible queue goes first.
     */
    boolean q1Accessible = canAccessPartition(q1, partition);
    boolean q2Accessible = canAccessPartition(q2, partition);
    if (q1Accessible && !q2Accessible) {
      return -1;
    } else if (!q1Accessible && q2Accessible) {
      return 1;
    }

    return 0;
  }

  /**
   * Returns whether the queue's accessible node labels contain the partition
   * or the wildcard {@code RMNodeLabelsManager.ANY}.
   *
   * <p>The null check guards BOTH lookups. Written inline as
   * {@code a != null && a.contains(p) || a.contains(ANY)}, operator
   * precedence leaves the second lookup unguarded, which throws
   * {@link NullPointerException} when the label set is null.
   */
  private boolean canAccessPartition(CSQueue queue, String partition) {
    return queue.getAccessibleNodeLabels() != null
        && (queue.getAccessibleNodeLabels().contains(partition)
            || queue.getAccessibleNodeLabels().contains(
                RMNodeLabelsManager.ANY));
  }
}
PriorityUtilizationQueueOrderingPolicy#compare()方法
/**
 * Compare two queues with possibly different priority and assigned capacity,
 * Will be used by preemption policy as well.
 *
 * <p>Rules:
 * <ul>
 *   <li>Equal priorities: the queue with the smaller relative assigned
 *       capacity goes first.</li>
 *   <li>Different priorities, and both queues are below 1.0 or both are at
 *       or above 1.0: the queue with the higher priority goes first.</li>
 *   <li>Different priorities, one queue below 1.0 and the other not: the
 *       queue with the smaller relative assigned capacity goes first.</li>
 * </ul>
 *
 * @param relativeAssigned1 relative assigned capacity of the first queue
 * @param relativeAssigned2 relative assigned capacity of the second queue
 * @param priority1 priority of the first queue
 * @param priority2 priority of the second queue
 * @return negative when the first queue goes first, positive when the second
 *         queue goes first, zero when they tie
 */
public static int compare(double relativeAssigned1, double relativeAssigned2,
    int priority1, int priority2) {
  // Both sides of the guarantee boundary are tested explicitly (rather than
  // negating one test) so NaN inputs fall through to Double.compare.
  boolean bothUnderGuarantee =
      relativeAssigned1 < 1.0f && relativeAssigned2 < 1.0f;
  boolean bothAtOrOverGuarantee =
      relativeAssigned1 >= 1.0f && relativeAssigned2 >= 1.0f;

  boolean priorityDecides = priority1 != priority2
      && (bothUnderGuarantee || bothAtOrOverGuarantee);
  if (priorityDecides) {
    // Arguments are reversed so the higher priority sorts first.
    return Integer.compare(priority2, priority1);
  }

  // Same priority, or the queues sit on opposite sides of their guarantee:
  // the less-utilized queue goes first.
  return Double.compare(relativeAssigned1, relativeAssigned2);
}