public static final long DISABLED_MEMORY_LIMIT = -1L; static final String MAPRED_CLUSTER_MAP_MEMORY_MB_PROPERTY = "mapred.cluster.map.memory.mb"; static final String MAPRED_CLUSTER_REDUCE_MEMORY_MB_PROPERTY = "mapred.cluster.reduce.memory.mb"; maxCurrentMapTasks = conf.getInt("mapred.tasktracker.map.tasks.maximum", 2); maxCurrentReduceTasks = conf.getInt("mapred.tasktracker.reduce.tasks.maximum", 2); mapSlotMemorySizeOnTT = fConf.getLong( JobTracker.MAPRED_CLUSTER_MAP_MEMORY_MB_PROPERTY, JobConf.DISABLED_MEMORY_LIMIT); reduceSlotSizeMemoryOnTT = fConf.getLong(JobTracker.MAPRED_CLUSTER_REDUCE_MEMORY_MB_PROPERTY, JobConf.DISABLED_MEMORY_LIMIT); totalMemoryAllottedForTasks = maxCurrentMapTasks * mapSlotMemorySizeOnTT + maxCurrentReduceTasks * reduceSlotSizeMemoryOnTT;
首先必须强调的是,TaskTracker节点的内存管理器所监控的内存使用量指的是JVM实例使用的内存(JVM进程是该工作节点为执行分配的Map/Reduce任务而开启的)。当一个TaskTracker节点设置了内存使用上限值时,它就会在启动的时候开启这个内存管理器TaskMemoryManagerThread,显然,TaskMemoryManagerThread是一个后台工作线程,它的工作流程如下:
public void run() { LOG.info("Starting thread: " + this.getClass()); while (true) { // Print the processTrees for debugging. if (LOG.isDebugEnabled()) { StringBuffer tmp = new StringBuffer("[ "); for (ProcessTreeInfo p : processTreeInfoMap.values()) { tmp.append(p.getPID()); tmp.append(" "); } LOG.debug("Current ProcessTree list : " + tmp.substring(0, tmp.length()) + "]"); } //监控新添加的任务 synchronized (tasksToBeAdded) { processTreeInfoMap.putAll(tasksToBeAdded); tasksToBeAdded.clear(); } //取消对已完成任务的监控 synchronized (tasksToBeRemoved) { for (TaskAttemptID tid : tasksToBeRemoved) { processTreeInfoMap.remove(tid); } tasksToBeRemoved.clear(); } long memoryStillInUsage = 0; //计算正在节点上执行的任务所占用的内存总和 for (Iterator<Map.Entry<TaskAttemptID, ProcessTreeInfo>> it = processTreeInfoMap.entrySet().iterator(); it.hasNext();) { Map.Entry<TaskAttemptID, ProcessTreeInfo> entry = it.next(); TaskAttemptID tid = entry.getKey(); ProcessTreeInfo ptInfo = entry.getValue(); try { String pId = ptInfo.getPID(); // Initialize any uninitialized processTrees if (pId == null) { // get pid from pid-file pId = getPid(ptInfo.pidFile); if (pId != null) { // PID will be null, either if the pid file is yet to be created // or if the tip is finished and we removed pidFile, but the TIP // itself is still retained in runningTasks till successful // transmission to JT // create process tree object ProcfsBasedProcessTree pt = new ProcfsBasedProcessTree(pId); LOG.debug("Tracking ProcessTree " + pId + " for the first time"); ptInfo.setPid(pId); ptInfo.setProcessTree(pt); } } // End of initializing any uninitialized processTrees if (pId == null) { continue; // processTree cannot be tracked } LOG.debug("Constructing ProcessTree for : PID = " + pId + " TID = " + tid); ProcfsBasedProcessTree pTree = ptInfo.getProcessTree(); pTree = pTree.getProcessTree(); // get the updated process-tree ptInfo.setProcessTree(pTree); // update ptInfo with proces-tree of // updated state long currentMemUsage = pTree.getCumulativeVmem(); 
// as processes begin with an age 1, we want to see if there // are processes more than 1 iteration old. long curMemUsageOfAgedProcesses = pTree.getCumulativeVmem(1); long limit = ptInfo.getMemLimit(); LOG.info("Memory usage of ProcessTree " + pId + " :" + currentMemUsage + "bytes. Limit : " + limit + "bytes"); //检查当前任务所占用的内存是否超过了它所设置的最大内存使用量 if (isProcessTreeOverLimit(tid.toString(), currentMemUsage, curMemUsageOfAgedProcesses, limit)) { // Task (the root process) is still alive and overflowing memory. // Clean up. String msg = "TaskTree [pid=" + pId + ",tipID=" + tid + "] is running beyond memory-limits. Current usage : " + currentMemUsage + "bytes. Limit : " + limit + "bytes. Killing task."; LOG.warn(msg); taskTracker.cleanUpOverMemoryTask(tid, true, msg); //kill掉当前正在执行的任务,由于它的内存使用超过限制. pTree.destroy(); it.remove(); LOG.info("Removed ProcessTree with root " + pId); } else { // Accounting the total memory in usage for all tasks that are still // alive and within limits. memoryStillInUsage += currentMemUsage; } } catch (Exception e) { // Log the exception and proceed to the next task. LOG.warn("Uncaught exception in TaskMemoryManager " + "while managing memory of " + tid + " : " + StringUtils.stringifyException(e)); } } //如果内存使用总量超过设置的上限值则组要kill合适的正在执行的任务 if (memoryStillInUsage > maxMemoryAllowedForAllTasks) { LOG.warn("The total memory in usage " + memoryStillInUsage + " is still overflowing TTs limits " + maxMemoryAllowedForAllTasks + ". Trying to kill a few tasks with the least progress."); killTasksWithLeastProgress(memoryStillInUsage); } // Sleep for some time before beginning next cycle try { LOG.debug(this.getClass() + " : Sleeping for " + monitoringInterval + " ms"); Thread.sleep(monitoringInterval); } catch (InterruptedException ie) { LOG.warn(this.getClass() + " interrupted. 
Finishing the thread and returning."); return; } }从上面的代码可以看出,TaskMemoeryManagerThread的工作流程很简单,它每隔 monitoringInterval ms 就会统计一次正在运行的任务所占用的系统总内存,如果该TaskTracker节点当前正在执行的任务占用的总内存超过设置的阈值,内存管理器就会kill掉一些正在执行的任务,以保证内存使用总量低于这个阈值。不过,在统计之前,它需要加上新运行的任务,删除已经运行完了的任务。Task内存使用量的统计间隔时间 monitoringInterval是通过TaskTracker节点的配置文件来设置的,对应的配置项为:mapred.tasktracker.taskmemory.monitoring-interval。这里就有一个问题了,TaskTracker节点是把每一个Map/Reduce任务交给对应的一个JVM实例来执行的,那么内存管理器是如何准确的获取到这些JVM进程的内存使用量的?
话又说回来,一个不会开启子进程的任务所能使用的内存上限最终取决于系统分配给对应的JVM实例的内存总量,为了解决一些特殊的作业内存限制问题,Hadoop在Job级别开放了一个设置参数来配置运行该作业任务的JVM内存分配,该配置项为:mapred.child.java.opts,值的形式如:-Xms256m -Xmx256m -Xmn64m。