yarn3.2源码分析之CapacityScheduler的心跳调度

CapacityScheduler#nodeUpdate()方法

protected void nodeUpdate(RMNode rmNode) {
    long begin = System.nanoTime();
    try {
      readLock.lock();
      setLastNodeUpdateTime(Time.now());
      super.nodeUpdate(rmNode);
    } finally {
      readLock.unlock();
    }

    // Try to do scheduling
    if (!scheduleAsynchronously) {
      try {
        writeLock.lock();
        ActivitiesLogger.NODE.startNodeUpdateRecording(activitiesManager,
            rmNode.getNodeID());

        // reset allocation and reservation stats before we start doing any
        // work
        updateSchedulerHealth(lastNodeUpdateTime, rmNode.getNodeID(),
            CSAssignment.NULL_ASSIGNMENT);

        allocateContainersToNode(rmNode.getNodeID(), true);
        ActivitiesLogger.NODE.finishNodeUpdateRecording(activitiesManager,
            rmNode.getNodeID());
      } finally {
        writeLock.unlock();
      }
    }

    long latency = System.nanoTime() - begin;
    CapacitySchedulerMetrics.getMetrics().addNodeUpdate(latency);
  }

CapacityScheduler#allocateContainersToNode()方法

/**
   * We need to make sure when doing allocation, Node should be existed
   * And we will construct a {@link CandidateNodeSet} before proceeding
   */
  private void allocateContainersToNode(NodeId nodeId,
      boolean withNodeHeartbeat) {
    FiCaSchedulerNode node = getNode(nodeId);
    if (null != node) {
      int offswitchCount = 0;
      int assignedContainers = 0;

      CandidateNodeSet candidates = getCandidateNodeSet(
          node);
      CSAssignment assignment = allocateContainersToNode(candidates,
          withNodeHeartbeat);
      // Only check if we can allocate more container on the same node when
      // scheduling is triggered by node heartbeat
      if (null != assignment && withNodeHeartbeat) {
        if (assignment.getType() == NodeType.OFF_SWITCH) {
          offswitchCount++;
        }

        if (Resources.greaterThan(calculator, getClusterResource(),
            assignment.getResource(), Resources.none())) {
          assignedContainers++;
        }

        while (canAllocateMore(assignment, offswitchCount,
            assignedContainers)) {
          // Try to see if it is possible to allocate multiple container for
          // the same node heartbeat
          assignment = allocateContainersToNode(candidates, true);

          if (null != assignment
              && assignment.getType() == NodeType.OFF_SWITCH) {
            offswitchCount++;
          }

          if (null != assignment
              && Resources.greaterThan(calculator, getClusterResource(),
                  assignment.getResource(), Resources.none())) {
            assignedContainers++;
          }
        }

        if (offswitchCount >= offswitchPerHeartbeatLimit) {
          if (LOG.isDebugEnabled()) {
            LOG.debug("Assigned maximum number of off-switch containers: "
                + offswitchCount + ", assignments so far: " + assignment);
          }
        }
      }
    }
  }

CapacityScheduler#allocateContainersToNode()方法

 CSAssignment allocateContainersToNode(
      CandidateNodeSet candidates,
      boolean withNodeHeartbeat) {
    if (rmContext.isWorkPreservingRecoveryEnabled() && !rmContext
        .isSchedulerReadyForAllocatingContainers()) {
      return null;
    }

    long startTime = System.nanoTime();

    // Backward compatible way to make sure previous behavior which allocation
    // driven by node heartbeat works.
    FiCaSchedulerNode node = CandidateNodeSetUtils.getSingleNode(candidates);

    // We have two different logics to handle allocation on single node / multi
    // nodes.
    CSAssignment assignment;
    if (!multiNodePlacementEnabled) {
      assignment = allocateContainerOnSingleNode(candidates,
          node, withNodeHeartbeat);
    } else{
      assignment = allocateContainersOnMultiNodes(candidates);
    }

    if (assignment != null && assignment.getAssignmentInformation() != null
        && assignment.getAssignmentInformation().getNumAllocations() > 0) {
      long allocateTime = System.nanoTime() - startTime;
      CapacitySchedulerMetrics.getMetrics().addAllocate(allocateTime);
    }
    return assignment;
  }

CapacityScheduler#allocateContainersOnMultiNodes()方法

/*
   * New behavior, allocate containers considering multiple nodes
   */
  private CSAssignment allocateContainersOnMultiNodes(
      CandidateNodeSet candidates) {
    // When this time look at multiple nodes, try schedule if the
    // partition has any available resource or killable resource
    if (getRootQueue().getQueueCapacities().getUsedCapacity(
        candidates.getPartition()) >= 1.0f
        && preemptionManager.getKillableResource(
        CapacitySchedulerConfiguration.ROOT, candidates.getPartition())
        == Resources.none()) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("This node or this node partition doesn't have available or"
            + "killable resource");
      }
      return null;
    }

    return allocateOrReserveNewContainers(candidates, false);
  }

CapacityScheduler#allocateOrReserveNewContainers()方法

private CSAssignment allocateOrReserveNewContainers(
      CandidateNodeSet candidates,
      boolean withNodeHeartbeat) {
    CSAssignment assignment = getRootQueue().assignContainers(
        getClusterResource(), candidates, new ResourceLimits(labelManager
            .getResourceByLabel(candidates.getPartition(),
                getClusterResource())),
        SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);

    assignment.setSchedulingMode(SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY);
    submitResourceCommitRequest(getClusterResource(), assignment);

    if (Resources.greaterThan(calculator, getClusterResource(),
        assignment.getResource(), Resources.none())) {
      FiCaSchedulerNode node = CandidateNodeSetUtils.getSingleNode(candidates);
      NodeId nodeId = null;
      if (node != null) {
        nodeId = node.getNodeID();
      }
      if (withNodeHeartbeat) {
        updateSchedulerHealth(lastNodeUpdateTime, nodeId, assignment);
      }
      return assignment;
    }

    // Only do non-exclusive allocation when node has node-labels.
    if (StringUtils.equals(candidates.getPartition(),
        RMNodeLabelsManager.NO_LABEL)) {
      return null;
    }

    // Only do non-exclusive allocation when the node-label supports that
    try {
      if (rmContext.getNodeLabelManager().isExclusiveNodeLabel(
          candidates.getPartition())) {
        return null;
      }
    } catch (IOException e) {
      LOG.warn(
          "Exception when trying to get exclusivity of node label=" + candidates
          .getPartition(), e);
      return null;
    }

    // Try to use NON_EXCLUSIVE
    assignment = getRootQueue().assignContainers(getClusterResource(),
        candidates,
        // TODO, now we only consider limits for parent for non-labeled
        // resources, should consider labeled resources as well.
        new ResourceLimits(labelManager
            .getResourceByLabel(RMNodeLabelsManager.NO_LABEL,
                getClusterResource())),
        SchedulingMode.IGNORE_PARTITION_EXCLUSIVITY);
    assignment.setSchedulingMode(SchedulingMode.IGNORE_PARTITION_EXCLUSIVITY);
    submitResourceCommitRequest(getClusterResource(), assignment);

    return assignment;
  }

ParentQueue#assignContainers()方法

public CSAssignment assignContainers(Resource clusterResource,
      CandidateNodeSet candidates,
      ResourceLimits resourceLimits, SchedulingMode schedulingMode) {
    FiCaSchedulerNode node = CandidateNodeSetUtils.getSingleNode(candidates);

    // if our queue cannot access this node, just return
    if (schedulingMode == SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY
        && !accessibleToPartition(candidates.getPartition())) {
      if (LOG.isDebugEnabled()) {
        long now = System.currentTimeMillis();
        // Do logging every 1 sec to avoid excessive logging.
        if (now - this.lastSkipQueueDebugLoggingTimestamp > 1000) {
          LOG.debug("Skip this queue=" + getQueuePath()
              + ", because it is not able to access partition=" + candidates
              .getPartition());
          this.lastSkipQueueDebugLoggingTimestamp = now;
        }
      }

      ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
          getParentName(), getQueueName(), ActivityState.REJECTED,
          ActivityDiagnosticConstant.NOT_ABLE_TO_ACCESS_PARTITION
              + candidates.getPartition());
      if (rootQueue) {
        ActivitiesLogger.NODE.finishSkippedNodeAllocation(activitiesManager,
            node);
      }

      return CSAssignment.NULL_ASSIGNMENT;
    }

    // Check if this queue need more resource, simply skip allocation if this
    // queue doesn't need more resources.
    if (!super.hasPendingResourceRequest(candidates.getPartition(),
        clusterResource, schedulingMode)) {
      if (LOG.isDebugEnabled()) {
        long now = System.currentTimeMillis();
        // Do logging every 1 sec to avoid excessive logging.
        if (now - this.lastSkipQueueDebugLoggingTimestamp > 1000) {
          LOG.debug("Skip this queue=" + getQueuePath()
              + ", because it doesn't need more resource, schedulingMode="
              + schedulingMode.name() + " node-partition=" + candidates
              .getPartition());
          this.lastSkipQueueDebugLoggingTimestamp = now;
        }
      }

      ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
          getParentName(), getQueueName(), ActivityState.SKIPPED,
          ActivityDiagnosticConstant.QUEUE_DO_NOT_NEED_MORE_RESOURCE);
      if (rootQueue) {
        ActivitiesLogger.NODE.finishSkippedNodeAllocation(activitiesManager,
            node);
      }

      return CSAssignment.NULL_ASSIGNMENT;
    }

    CSAssignment assignment = new CSAssignment(Resources.createResource(0, 0),
        NodeType.NODE_LOCAL);

    while (canAssign(clusterResource, node)) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("Trying to assign containers to child-queue of "
            + getQueueName());
      }

      // Are we over maximum-capacity for this queue?
      // This will also consider parent's limits and also continuous reservation
      // looking
      if (!super.canAssignToThisQueue(clusterResource,
          candidates.getPartition(),
          resourceLimits, Resources
              .createResource(getMetrics().getReservedMB(),
                  getMetrics().getReservedVirtualCores()), schedulingMode)) {

        ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
            getParentName(), getQueueName(), ActivityState.SKIPPED,
            ActivityDiagnosticConstant.QUEUE_MAX_CAPACITY_LIMIT);
        if (rootQueue) {
          ActivitiesLogger.NODE.finishSkippedNodeAllocation(activitiesManager,
              node);
        }

        break;
      }

      // Schedule
      CSAssignment assignedToChild = assignContainersToChildQueues(
          clusterResource, candidates, resourceLimits, schedulingMode);
      assignment.setType(assignedToChild.getType());
      assignment.setRequestLocalityType(
          assignedToChild.getRequestLocalityType());
      assignment.setExcessReservation(assignedToChild.getExcessReservation());
      assignment.setContainersToKill(assignedToChild.getContainersToKill());

      // Done if no child-queue assigned anything
      if (Resources.greaterThan(resourceCalculator, clusterResource,
          assignedToChild.getResource(), Resources.none())) {

        ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
            getParentName(), getQueueName(), ActivityState.ACCEPTED,
            ActivityDiagnosticConstant.EMPTY);

        boolean isReserved =
            assignedToChild.getAssignmentInformation().getReservationDetails()
                != null && !assignedToChild.getAssignmentInformation()
                .getReservationDetails().isEmpty();
        if (node != null && !isReserved) {
          if (rootQueue) {
            ActivitiesLogger.NODE.finishAllocatedNodeAllocation(
                activitiesManager, node,
                assignedToChild.getAssignmentInformation()
                    .getFirstAllocatedOrReservedContainerId(),
                AllocationState.ALLOCATED);
          }
        } else{
          if (rootQueue) {
            ActivitiesLogger.NODE.finishAllocatedNodeAllocation(
                activitiesManager, node,
                assignedToChild.getAssignmentInformation()
                    .getFirstAllocatedOrReservedContainerId(),
                AllocationState.RESERVED);
          }
        }

        // Track resource utilization in this pass of the scheduler
        Resources.addTo(assignment.getResource(),
            assignedToChild.getResource());
        Resources.addTo(assignment.getAssignmentInformation().getAllocated(),
            assignedToChild.getAssignmentInformation().getAllocated());
        Resources.addTo(assignment.getAssignmentInformation().getReserved(),
            assignedToChild.getAssignmentInformation().getReserved());
        assignment.getAssignmentInformation().incrAllocations(
            assignedToChild.getAssignmentInformation().getNumAllocations());
        assignment.getAssignmentInformation().incrReservations(
            assignedToChild.getAssignmentInformation().getNumReservations());
        assignment.getAssignmentInformation().getAllocationDetails().addAll(
            assignedToChild.getAssignmentInformation()
                .getAllocationDetails());
        assignment.getAssignmentInformation().getReservationDetails().addAll(
            assignedToChild.getAssignmentInformation()
                .getReservationDetails());
        assignment.setIncreasedAllocation(
            assignedToChild.isIncreasedAllocation());

        if (LOG.isDebugEnabled()) {
          LOG.debug("assignedContainer reserved=" + isReserved + " queue="
              + getQueueName() + " usedCapacity=" + getUsedCapacity()
              + " absoluteUsedCapacity=" + getAbsoluteUsedCapacity() + " used="
              + queueUsage.getUsed() + " cluster=" + clusterResource);

          LOG.debug(
              "ParentQ=" + getQueueName() + " assignedSoFarInThisIteration="
                  + assignment.getResource() + " usedCapacity="
                  + getUsedCapacity() + " absoluteUsedCapacity="
                  + getAbsoluteUsedCapacity());
        }
      } else{
        assignment.setSkippedType(assignedToChild.getSkippedType());

        ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
            getParentName(), getQueueName(), ActivityState.SKIPPED,
            ActivityDiagnosticConstant.EMPTY);
        if (rootQueue) {
          ActivitiesLogger.NODE.finishSkippedNodeAllocation(activitiesManager,
              node);
        }

        break;
      }

      /*
       * Previously here, we can allocate more than one container for each
       * allocation under rootQ. Now this logic is not proper any more
       * in global scheduling world.
       *
       * So here do not try to allocate more than one container for each
       * allocation, let top scheduler make the decision.
       */
      break;
    }

    return assignment;
  }

ParentQueue#assignContainersToChildQueues()方法

private CSAssignment assignContainersToChildQueues(Resource cluster,
      CandidateNodeSet candidates, ResourceLimits limits,
      SchedulingMode schedulingMode) {
    CSAssignment assignment = CSAssignment.NULL_ASSIGNMENT;

    printChildQueues();

    // Try to assign to most 'under-served' sub-queue
    for (Iterator iter = sortAndGetChildrenAllocationIterator(
        candidates.getPartition()); iter.hasNext(); ) {
      CSQueue childQueue = iter.next();
      if(LOG.isDebugEnabled()) {
        LOG.debug("Trying to assign to queue: " + childQueue.getQueuePath()
          + " stats: " + childQueue);
      }

      // Get ResourceLimits of child queue before assign containers
      ResourceLimits childLimits =
          getResourceLimitsOfChild(childQueue, cluster, limits.getNetLimit(),
              candidates.getPartition());

      CSAssignment childAssignment = childQueue.assignContainers(cluster,
          candidates, childLimits, schedulingMode);
      if(LOG.isDebugEnabled()) {
        LOG.debug("Assigned to queue: " + childQueue.getQueuePath() +
            " stats: " + childQueue + " --> " +
            childAssignment.getResource() + ", " + childAssignment.getType());
      }

      if (Resources.greaterThan(
              resourceCalculator, cluster, 
              childAssignment.getResource(), Resources.none())) {
        assignment = childAssignment;
        break;
      } else if (childAssignment.getSkippedType() ==
          CSAssignment.SkippedType.QUEUE_LIMIT) {
        if (assignment.getSkippedType() !=
            CSAssignment.SkippedType.QUEUE_LIMIT) {
          assignment = childAssignment;
        }
        Resource blockedHeadroom = null;
        if (childQueue instanceof LeafQueue) {
          blockedHeadroom = childLimits.getHeadroom();
        } else {
          blockedHeadroom = childLimits.getBlockedHeadroom();
        }
        Resource resourceToSubtract = Resources.max(resourceCalculator,
            cluster, blockedHeadroom, Resources.none());
        limits.addBlockedHeadroom(resourceToSubtract);
        if(LOG.isDebugEnabled()) {
          LOG.debug("Decrease parentLimits " + limits.getLimit() +
              " for " + this.getQueueName() + " by " +
              resourceToSubtract + " as childQueue=" +
              childQueue.getQueueName() + " is blocked");
        }
      }
    }

    return assignment;
  }

LeafQueue#assignContainers()方法

public CSAssignment assignContainers(Resource clusterResource,
      CandidateNodeSet candidates,
      ResourceLimits currentResourceLimits, SchedulingMode schedulingMode) {
    updateCurrentResourceLimits(currentResourceLimits, clusterResource);
    FiCaSchedulerNode node = CandidateNodeSetUtils.getSingleNode(candidates);

    if (LOG.isDebugEnabled()) {
      LOG.debug("assignContainers: partition=" + candidates.getPartition()
          + " #applications=" + orderingPolicy.getNumSchedulableEntities());
    }

    setPreemptionAllowed(currentResourceLimits, candidates.getPartition());

    // Check for reserved resources, try to allocate reserved container first.
    CSAssignment assignment = allocateFromReservedContainer(clusterResource,
        candidates, currentResourceLimits, schedulingMode);
    if (null != assignment) {
      return assignment;
    }

    // if our queue cannot access this node, just return
    if (schedulingMode == SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY
        && !accessibleToPartition(candidates.getPartition())) {
      ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
          getParent().getQueueName(), getQueueName(), ActivityState.REJECTED,
          ActivityDiagnosticConstant.NOT_ABLE_TO_ACCESS_PARTITION + candidates
              .getPartition());
      return CSAssignment.NULL_ASSIGNMENT;
    }

    // Check if this queue need more resource, simply skip allocation if this
    // queue doesn't need more resources.
    if (!hasPendingResourceRequest(candidates.getPartition(), clusterResource,
        schedulingMode)) {
      if (LOG.isDebugEnabled()) {
        LOG.debug("Skip this queue=" + getQueuePath()
            + ", because it doesn't need more resource, schedulingMode="
            + schedulingMode.name() + " node-partition=" + candidates
            .getPartition());
      }
      ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
          getParent().getQueueName(), getQueueName(), ActivityState.SKIPPED,
          ActivityDiagnosticConstant.QUEUE_DO_NOT_NEED_MORE_RESOURCE);
      return CSAssignment.NULL_ASSIGNMENT;
    }

    Map userLimits = new HashMap<>();
    boolean needAssignToQueueCheck = true;
    for (Iterator assignmentIterator =
         orderingPolicy.getAssignmentIterator();
         assignmentIterator.hasNext(); ) {
      FiCaSchedulerApp application = assignmentIterator.next();

      ActivitiesLogger.APP.startAppAllocationRecording(activitiesManager,
          node, SystemClock.getInstance().getTime(), application);

      // Check queue max-capacity limit
      Resource appReserved = application.getCurrentReservation();
      if (needAssignToQueueCheck) {
        if (!super.canAssignToThisQueue(clusterResource,
            candidates.getPartition(), currentResourceLimits, appReserved,
            schedulingMode)) {
          ActivitiesLogger.APP.recordRejectedAppActivityFromLeafQueue(
              activitiesManager, node, application, application.getPriority(),
              ActivityDiagnosticConstant.QUEUE_MAX_CAPACITY_LIMIT);
          ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
              getParent().getQueueName(), getQueueName(), ActivityState.SKIPPED,
              ActivityDiagnosticConstant.EMPTY);
          return CSAssignment.NULL_ASSIGNMENT;
        }
        // If there was no reservation and canAssignToThisQueue returned
        // true, there is no reason to check further.
        if (!this.reservationsContinueLooking
            || appReserved.equals(Resources.none())) {
          needAssignToQueueCheck = false;
        }
      }

      CachedUserLimit cul = userLimits.get(application.getUser());
      Resource cachedUserLimit = null;
      if (cul != null) {
        cachedUserLimit = cul.userLimit;
      }
      Resource userLimit = computeUserLimitAndSetHeadroom(application,
          clusterResource, candidates.getPartition(), schedulingMode,
          cachedUserLimit);
      if (cul == null) {
        cul = new CachedUserLimit(userLimit);
        userLimits.put(application.getUser(), cul);
      }
      // Check user limit
      boolean userAssignable = true;
      if (!cul.canAssign && Resources.fitsIn(appReserved, cul.reservation)) {
        userAssignable = false;
      } else {
        userAssignable = canAssignToUser(clusterResource, application.getUser(),
            userLimit, application, candidates.getPartition(),
            currentResourceLimits);
        if (!userAssignable && Resources.fitsIn(cul.reservation, appReserved)) {
          cul.canAssign = false;
          cul.reservation = appReserved;
        }
      }
      if (!userAssignable) {
        application.updateAMContainerDiagnostics(AMState.ACTIVATED,
            "User capacity has reached its maximum limit.");
        ActivitiesLogger.APP.recordRejectedAppActivityFromLeafQueue(
            activitiesManager, node, application, application.getPriority(),
            ActivityDiagnosticConstant.USER_CAPACITY_MAXIMUM_LIMIT);
        continue;
      }

      // Try to schedule
      assignment = application.assignContainers(clusterResource,
          candidates, currentResourceLimits, schedulingMode, null);

      if (LOG.isDebugEnabled()) {
        LOG.debug("post-assignContainers for application " + application
            .getApplicationId());
        application.showRequests();
      }

      // Did we schedule or reserve a container?
      Resource assigned = assignment.getResource();

      if (Resources.greaterThan(resourceCalculator, clusterResource, assigned,
          Resources.none())) {
        ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
            getParent().getQueueName(), getQueueName(),
            ActivityState.ACCEPTED, ActivityDiagnosticConstant.EMPTY);
        return assignment;
      } else if (assignment.getSkippedType()
          == CSAssignment.SkippedType.OTHER) {
        ActivitiesLogger.APP.finishSkippedAppAllocationRecording(
            activitiesManager, application.getApplicationId(),
            ActivityState.SKIPPED, ActivityDiagnosticConstant.EMPTY);
        application.updateNodeInfoForAMDiagnostics(node);
      } else if (assignment.getSkippedType()
          == CSAssignment.SkippedType.QUEUE_LIMIT) {
        return assignment;
      } else{
        // If we don't allocate anything, and it is not skipped by application,
        // we will return to respect FIFO of applications
        ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
            getParent().getQueueName(), getQueueName(), ActivityState.SKIPPED,
            ActivityDiagnosticConstant.RESPECT_FIFO);
        ActivitiesLogger.APP.finishSkippedAppAllocationRecording(
            activitiesManager, application.getApplicationId(),
            ActivityState.SKIPPED, ActivityDiagnosticConstant.EMPTY);
        return CSAssignment.NULL_ASSIGNMENT;
      }
    }
    ActivitiesLogger.QUEUE.recordQueueActivity(activitiesManager, node,
        getParent().getQueueName(), getQueueName(), ActivityState.SKIPPED,
        ActivityDiagnosticConstant.EMPTY);

    return CSAssignment.NULL_ASSIGNMENT;
  }

FiCaSchedulerApp#assignContainers()方法

public CSAssignment assignContainers(Resource clusterResource,
      CandidateNodeSet ps,
      ResourceLimits currentResourceLimits, SchedulingMode schedulingMode,
      RMContainer reservedContainer) {
    if (LOG.isDebugEnabled()) {
      LOG.debug("pre-assignContainers for application "
          + getApplicationId());
      showRequests();
    }

    return containerAllocator.assignContainers(clusterResource, ps,
        schedulingMode, currentResourceLimits, reservedContainer);
  }

RegularContainerAllocator#assignContainer()方法

public CSAssignment assignContainers(Resource clusterResource,
      CandidateNodeSet candidates,
      SchedulingMode schedulingMode, ResourceLimits resourceLimits,
      RMContainer reservedContainer) {
    FiCaSchedulerNode node = CandidateNodeSetUtils.getSingleNode(candidates);

    if (reservedContainer == null) {
      // Check if application needs more resource, skip if it doesn't need more.
      if (!application.hasPendingResourceRequest(candidates.getPartition(),
          schedulingMode)) {
        if (LOG.isDebugEnabled()) {
          LOG.debug("Skip app_attempt=" + application.getApplicationAttemptId()
              + ", because it doesn't need more resource, schedulingMode="
              + schedulingMode.name() + " node-label=" + candidates
              .getPartition());
        }
        ActivitiesLogger.APP.recordSkippedAppActivityWithoutAllocation(
            activitiesManager, node, application, application.getPriority(),
            ActivityDiagnosticConstant.APPLICATION_DO_NOT_NEED_RESOURCE);
        return CSAssignment.SKIP_ASSIGNMENT;
      }
      
      // Schedule in priority order
      for (SchedulerRequestKey schedulerKey : application.getSchedulerKeys()) {
        ContainerAllocation result = allocate(clusterResource, candidates,
            schedulingMode, resourceLimits, schedulerKey, null);

        AllocationState allocationState = result.getAllocationState();
        if (allocationState == AllocationState.PRIORITY_SKIPPED) {
          continue;
        }
        return getCSAssignmentFromAllocateResult(clusterResource, result,
            null, node);
      }

      // We will reach here if we skipped all priorities of the app, so we will
      // skip the app.
      ActivitiesLogger.APP.recordSkippedAppActivityWithoutAllocation(
          activitiesManager, node, application, application.getPriority(),
          ActivityDiagnosticConstant.SKIPPED_ALL_PRIORITIES);
      return CSAssignment.SKIP_ASSIGNMENT;
    } else {
      ContainerAllocation result =
          allocate(clusterResource, candidates, schedulingMode, resourceLimits,
              reservedContainer.getReservedSchedulerKey(), reservedContainer);
      return getCSAssignmentFromAllocateResult(clusterResource, result,
          reservedContainer, node);
    }
  }

CapacityScheduler的比较器

 /**
   * Comparator that both looks at priority and utilization
   */
  private class PriorityQueueComparator implements Comparator {

    @Override
    public int compare(CSQueue q1, CSQueue q2) {
      String p = partitionToLookAt.get();

      int rc = compareQueueAccessToPartition(q1, q2, p);
      if (0 != rc) {
        return rc;
      }

      float q1AbsCapacity = q1.getQueueCapacities().getAbsoluteCapacity(p);
      float q2AbsCapacity = q2.getQueueCapacities().getAbsoluteCapacity(p);

      //If q1's abs capacity > 0 and q2 is 0, then prioritize q1
      if (Float.compare(q1AbsCapacity, 0f) > 0 && Float.compare(q2AbsCapacity,
          0f) == 0) {
        return -1;
        //If q2's abs capacity > 0 and q1 is 0, then prioritize q2
      } else if (Float.compare(q2AbsCapacity, 0f) > 0 && Float.compare(
          q1AbsCapacity, 0f) == 0) {
        return 1;
      } else if (Float.compare(q1AbsCapacity, 0f) == 0 && Float.compare(
          q2AbsCapacity, 0f) == 0) {
        // both q1 has 0 and q2 has 0 capacity, then fall back to using
        // priority, abs used capacity to prioritize
        float used1 = q1.getQueueCapacities().getAbsoluteUsedCapacity(p);
        float used2 = q2.getQueueCapacities().getAbsoluteUsedCapacity(p);

        return compare(q1, q2, used1, used2, p);
      } else{
        // both q1 has positive abs capacity and q2 has positive abs
        // capacity
        float used1 = q1.getQueueCapacities().getUsedCapacity(p);
        float used2 = q2.getQueueCapacities().getUsedCapacity(p);

        return compare(q1, q2, used1, used2, p);
      }
    }

    private int compare(CSQueue q1, CSQueue q2, float q1Used, float q2Used,
        String partition) {

      int p1 = 0;
      int p2 = 0;
      if (respectPriority) {
        p1 = q1.getPriority().getPriority();
        p2 = q2.getPriority().getPriority();
      }

      int rc = PriorityUtilizationQueueOrderingPolicy.compare(q1Used, q2Used,
          p1, p2);

      // For queue with same used ratio / priority, queue with higher configured
      // capacity goes first
      if (0 == rc) {
        Resource minEffRes1 =
            q1.getQueueResourceQuotas().getConfiguredMinResource(partition);
        Resource minEffRes2 =
            q2.getQueueResourceQuotas().getConfiguredMinResource(partition);
        if (!minEffRes1.equals(Resources.none()) && !minEffRes2.equals(
            Resources.none())) {
          return minEffRes2.compareTo(minEffRes1);
        }

        float abs1 = q1.getQueueCapacities().getAbsoluteCapacity(partition);
        float abs2 = q2.getQueueCapacities().getAbsoluteCapacity(partition);
        return Float.compare(abs2, abs1);
      }

      return rc;
    }

    private int compareQueueAccessToPartition(CSQueue q1, CSQueue q2,
        String partition) {
      // Everybody has access to default partition
      if (StringUtils.equals(partition, RMNodeLabelsManager.NO_LABEL)) {
        return 0;
      }

      /*
       * Check accessible to given partition, if one queue accessible and
       * the other not, accessible queue goes first.
       */
      boolean q1Accessible =
          q1.getAccessibleNodeLabels() != null && q1.getAccessibleNodeLabels()
              .contains(partition) || q1.getAccessibleNodeLabels().contains(
              RMNodeLabelsManager.ANY);
      boolean q2Accessible =
          q2.getAccessibleNodeLabels() != null && q2.getAccessibleNodeLabels()
              .contains(partition) || q2.getAccessibleNodeLabels().contains(
              RMNodeLabelsManager.ANY);
      if (q1Accessible && !q2Accessible) {
        return -1;
      } else if (!q1Accessible && q2Accessible) {
        return 1;
      }

      return 0;
    }
  }

PriorityUtilizationQueueOrderingPolicy#compare()方法

/**
   * Compare two queues with possibly different priority and assigned capacity,
   * Will be used by preemption policy as well.
   *
   * @param relativeAssigned1 relativeAssigned1
   * @param relativeAssigned2 relativeAssigned2
   * @param priority1 p1
   * @param priority2 p2
   * @return compared result
   */
  public static int compare(double relativeAssigned1, double relativeAssigned2,
      int priority1, int priority2) {
    if (priority1 == priority2) {
      // The queue with less relative used-capacity goes first
      return Double.compare(relativeAssigned1, relativeAssigned2);
    } else{
      // When priority is different:
      if ((relativeAssigned1 < 1.0f && relativeAssigned2 < 1.0f) || (
          relativeAssigned1 >= 1.0f && relativeAssigned2 >= 1.0f)) {
        // When both the queues are under their guaranteed capacities,
        // Or both the queues are over or meeting their guaranteed capacities
        // queue with higher used-capacity goes first
        return Integer.compare(priority2, priority1);
      } else{
        // Otherwise, when one of the queues is over or meeting their
        // guaranteed capacities and the other is under: The queue that is
        // under its capacity guarantee gets the resources.
        return Double.compare(relativeAssigned1, relativeAssigned2);
      }
    }
  }

 

你可能感兴趣的:(Yarn)