BlockPlacementPolicyDefault这个类负责为一个数据块的各副本选择目标数据结点。
副本放置策略如下:如果写入者在一个datanode上,那么第一个副本放在本机,否则随机选取一个结点。第二个副本放在另一个机架的结点上,第三个副本放在与第二个副本相同的机架、但不同的数据结点上。
/** * The class is responsible for choosing the desired number of targets * for placing block replicas. * The replica placement strategy is that if the writer is on a datanode, * the 1st replica is placed on the local machine, * otherwise a random datanode. The 2nd replica is placed on a datanode * that is on a different rack. The 3rd replica is placed on a datanode * which is on a different node of the rack as the second replica. */
initialize方法如下:
/**
 * Records the cluster collaborators handed in by the caller and reads the
 * placement-related settings from the configuration.
 *
 * @param conf configuration supplying the consider-load flag, the heartbeat
 *             interval, the heartbeat tolerance multiplier and the stale
 *             datanode interval
 * @param stats cluster statistics object, stored as-is
 * @param clusterMap network topology, stored as-is
 * @param host2datanodeMap host-to-datanode map, stored as-is
 */
@Override
public void initialize(Configuration conf, FSClusterStats stats,
    NetworkTopology clusterMap, Host2NodesMap host2datanodeMap) {
  // Collaborators are kept by reference; nothing is copied.
  this.stats = stats;
  this.clusterMap = clusterMap;
  this.host2datanodeMap = host2datanodeMap;

  // Load-aware placement is on unless the configuration says otherwise.
  final boolean loadAware = conf.getBoolean(
      DFSConfigKeys.DFS_NAMENODE_REPLICATION_CONSIDERLOAD_KEY, true);
  this.considerLoad = loadAware;

  // The configured heartbeat interval is scaled by 1000 before being stored
  // (presumably seconds -> milliseconds; confirm against DFSConfigKeys).
  final long configuredHeartbeat = conf.getLong(
      DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_KEY,
      DFSConfigKeys.DFS_HEARTBEAT_INTERVAL_DEFAULT);
  this.heartbeatInterval = configuredHeartbeat * 1000;

  final int heartbeatTolerance = conf.getInt(
      DFSConfigKeys.DFS_NAMENODE_TOLERATE_HEARTBEAT_MULTIPLIER_KEY,
      DFSConfigKeys.DFS_NAMENODE_TOLERATE_HEARTBEAT_MULTIPLIER_DEFAULT);
  this.tolerateHeartbeatMultiplier = heartbeatTolerance;

  final long staleAfter = conf.getLong(
      DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_KEY,
      DFSConfigKeys.DFS_NAMENODE_STALE_DATANODE_INTERVAL_DEFAULT);
  this.staleInterval = staleAfter;
}
以下的方法为写入器选择numOfReplicas个数据结点来存储一个数据块的副本,数据块大小为blocksize。如果数量不够numOfReplicas,尽可能多返回。
@param srcPath 这个方法返回的数据块是哪个文件的。
@param numOfReplicas 还需要额外选择的副本数量。
@param writer 写入器所在的服务器,如果不是集群中的服务器,则为空。
@param chosen 已经选择作为目标的数据结点。
@param returnChosenNodes 如果为真,那么返回已经选择的数据结点。
@param excludedNodes 这个列表中的结点应该排除在外,不能被选为目标结点。
@param blocksize 要写入的数据块大小。
@return 返回DatanodeDescriptor的实例数组,这些结点作为此数据块的目标结点,并且被为作一个pipeline被排序。/** This is the implementation. */ private DatanodeStorageInfo[] chooseTarget(int numOfReplicas, Node writer, List<DatanodeStorageInfo> chosenStorage, boolean returnChosenNodes, Set<Node> excludedNodes, long blocksize, final BlockStoragePolicy storagePolicy) { if (numOfReplicas == 0 || clusterMap.getNumOfLeaves()==0) { return DatanodeStorageInfo.EMPTY_ARRAY; } if (excludedNodes == null) { excludedNodes = new HashSet<Node>(); } //在本机调试 int[] result = getMaxNodesPerRack(chosenStorage.size(), numOfReplicas); numOfReplicas = result[0]; int maxNodesPerRack = result[1]; final List<DatanodeStorageInfo> results = new ArrayList<DatanodeStorageInfo>(chosenStorage); for (DatanodeStorageInfo storage : chosenStorage) { // add localMachine and related nodes to excludedNodes addToExcludedNodes(storage.getDatanodeDescriptor(), excludedNodes); } boolean avoidStaleNodes = (stats != null && stats.isAvoidingStaleDataNodesForWrite()); final Node localNode = chooseTarget(numOfReplicas, writer, excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes, storagePolicy, EnumSet.noneOf(StorageType.class), results.isEmpty()); if (!returnChosenNodes) { results.removeAll(chosenStorage); } // sorting nodes to form a pipeline return getPipeline( (writer != null && writer instanceof DatanodeDescriptor) ? writer : localNode, results.toArray(new DatanodeStorageInfo[results.size()])); }getMaxNodesPerRack方法计算每个机架允许分配的最大副本数,它也限定了一个集群中所有结点的总共副本数量。
在本机调试,numOfChosen为0,numOfReplicas
/**
 * Calculate the maximum number of replicas to allocate per rack. It also
 * limits the total number of replicas to the total number of nodes in the
 * cluster. Caller should adjust the replica count to the return value.
 *
 * <p>When the topology reports zero or one rack, or when at most one replica
 * is involved in total, no per-rack calculation is performed and the
 * (possibly capped) total is returned as the per-rack limit. Treating zero
 * racks like one rack avoids a division by zero in the per-rack formula.
 *
 * @param numOfChosen The number of already chosen nodes.
 * @param numOfReplicas The number of additional nodes to allocate.
 * @return integer array. Index 0: The number of nodes allowed to allocate
 *         in addition to already chosen nodes.
 *         Index 1: The maximum allowed number of nodes per rack. This
 *         is independent of the number of chosen nodes, as it is calculated
 *         using the target number of replicas.
 */
private int[] getMaxNodesPerRack(int numOfChosen, int numOfReplicas) {
  int clusterSize = clusterMap.getNumOfLeaves(); // number of datanodes
  int totalNumOfReplicas = numOfChosen + numOfReplicas; // total replica count
  if (totalNumOfReplicas > clusterSize) {
    // More replicas requested than there are datanodes: shrink the request
    // by the excess and cap the total at the cluster size.
    numOfReplicas -= (totalNumOfReplicas - clusterSize);
    totalNumOfReplicas = clusterSize;
  }
  // No calculation needed when there is at most one rack or picking one node.
  int numOfRacks = clusterMap.getNumOfRacks();
  // "<= 1" rather than "== 1": a rack count of 0 would otherwise reach the
  // division below and throw ArithmeticException.
  if (numOfRacks <= 1 || totalNumOfReplicas <= 1) {
    return new int[] {numOfReplicas, totalNumOfReplicas};
  }

  // With integer division:
  //   totalNumOfReplicas-1 <  numOfRacks  ->  2
  //   totalNumOfReplicas-1 == numOfRacks  ->  3
  //   totalNumOfReplicas-1 >  numOfRacks  ->  per-rack average plus 2
  int maxNodesPerRack = (totalNumOfReplicas - 1) / numOfRacks + 2;
  // At this point, there are more than one racks and more than one replicas
  // to store. Avoid all replicas being in the same rack.
  //
  // maxNodesPerRack has the following properties at this stage.
  //   1) maxNodesPerRack >= 2
  //   2) (maxNodesPerRack-1) * numOfRacks >= totalNumOfReplicas
  //      when numOfRacks > 1
  //
  // Thus, the following adjustment will still result in a value that forces
  // multi-rack allocation and gives enough number of total nodes.
  if (maxNodesPerRack == totalNumOfReplicas) {
    maxNodesPerRack--;
  }
  return new int[] {numOfReplicas, maxNodesPerRack};
}
接下来,调用以下方法:
/**
 * Choose <i>numOfReplicas</i> targets from all data nodes, appending each
 * chosen storage to <i>results</i>.
 *
 * The selection proceeds in stages keyed on how many results existed on
 * entry: a local storage first, then a remote rack, then a third placement
 * that depends on whether the first two share a rack, and finally random
 * nodes for whatever remains. If a stage throws
 * {@link NotEnoughReplicasException}, the method logs the shortfall and
 * retries itself recursively with relaxed constraints (see the catch block).
 *
 * @param numOfReplicas additional number of replicas wanted
 * @param writer the writer's machine, could be a non-DatanodeDescriptor node
 * @param excludedNodes datanodes that should not be considered as targets
 * @param blocksize size of the data to be written
 * @param maxNodesPerRack max nodes allowed per rack
 * @param results the target nodes already chosen
 * @param avoidStaleNodes avoid stale nodes in replica choosing
 * @param storagePolicy policy asked for the list of required storage types
 * @param unavailableStorages storage types treated as unavailable; this set
 *                            is added to when a retry is attempted
 * @param newBlock passed to the storage policy; also selects which node the
 *                 third-replica local-rack choice is anchored on
 * @return local node of writer (not chosen node)
 */
private Node chooseTarget(int numOfReplicas,
    Node writer,
    final Set<Node> excludedNodes,
    final long blocksize,
    final int maxNodesPerRack,
    final List<DatanodeStorageInfo> results,
    final boolean avoidStaleNodes,
    final BlockStoragePolicy storagePolicy,
    final EnumSet<StorageType> unavailableStorages,
    final boolean newBlock) {
  // Nothing to place, or an empty cluster: report the writer only if it is
  // itself a datanode.
  if (numOfReplicas == 0 || clusterMap.getNumOfLeaves()==0) {
    return (writer instanceof DatanodeDescriptor) ? writer : null;
  }
  final int numOfResults = results.size();
  final int totalReplicasExpected = numOfReplicas + numOfResults;
  // For an existing block whose writer is not a datanode, use the datanode
  // of the first already-chosen storage as the writer.
  if ((writer == null || !(writer instanceof DatanodeDescriptor)) && !newBlock) {
    writer = results.get(0).getDatanodeDescriptor();
  }

  // Keep a copy of original excludedNodes
  final Set<Node> oldExcludedNodes = new HashSet<Node>(excludedNodes);

  // choose storage types; use fallbacks for unavailable storages
  final List<StorageType> requiredStorageTypes = storagePolicy
      .chooseStorageTypes((short) totalReplicasExpected,
          DatanodeStorageInfo.toStorageTypes(results),
          unavailableStorages, newBlock);
  final EnumMap<StorageType, Integer> storageTypes =
      getRequiredStorageTypes(requiredStorageTypes);
  if (LOG.isTraceEnabled()) {
    LOG.trace("storageTypes=" + storageTypes);
  }

  try {
    // From here on numOfReplicas is the number of storage types the policy
    // asked for, not the value passed in.
    if ((numOfReplicas = requiredStorageTypes.size()) == 0) {
      throw new NotEnoughReplicasException(
          "All required storage types are unavailable: "
          + " unavailableStorages=" + unavailableStorages
          + ", storagePolicy=" + storagePolicy);
    }

    // Stage 1: no result on entry -> choose a local storage; its datanode
    // becomes the writer that is returned.
    if (numOfResults == 0) {
      writer = chooseLocalStorage(writer, excludedNodes, blocksize,
          maxNodesPerRack, results, avoidStaleNodes, storageTypes, true)
              .getDatanodeDescriptor();
      if (--numOfReplicas == 0) {
        return writer;
      }
    }
    final DatanodeDescriptor dn0 = results.get(0).getDatanodeDescriptor();
    // Stage 2: at most one result on entry -> choose one node on a rack
    // remote to the first result.
    if (numOfResults <= 1) {
      chooseRemoteRack(1, dn0, excludedNodes, blocksize, maxNodesPerRack,
          results, avoidStaleNodes, storageTypes);
      if (--numOfReplicas == 0) {
        return writer;
      }
    }
    // Stage 3: at most two results on entry.
    if (numOfResults <= 2) {
      final DatanodeDescriptor dn1 = results.get(1).getDatanodeDescriptor();
      if (clusterMap.isOnSameRack(dn0, dn1)) {
        // First two share a rack: go to a rack remote to the first.
        chooseRemoteRack(1, dn0, excludedNodes, blocksize, maxNodesPerRack,
            results, avoidStaleNodes, storageTypes);
      } else if (newBlock){
        // New block: stay on the second result's rack.
        chooseLocalRack(dn1, excludedNodes, blocksize, maxNodesPerRack,
            results, avoidStaleNodes, storageTypes);
      } else {
        // Existing block: stay on the writer's rack.
        chooseLocalRack(writer, excludedNodes, blocksize, maxNodesPerRack,
            results, avoidStaleNodes, storageTypes);
      }
      if (--numOfReplicas == 0) {
        return writer;
      }
    }
    // Stage 4: whatever is still needed is chosen at random from the root.
    chooseRandom(numOfReplicas, NodeBase.ROOT, excludedNodes, blocksize,
        maxNodesPerRack, results, avoidStaleNodes, storageTypes);
  } catch (NotEnoughReplicasException e) {
    final String message = "Failed to place enough replicas, still in need of "
        + (totalReplicasExpected - results.size()) + " to reach "
        + totalReplicasExpected
        + " (unavailableStorages=" + unavailableStorages
        + ", storagePolicy=" + storagePolicy
        + ", newBlock=" + newBlock + ")";

    // Stack trace only at trace level; otherwise a one-line warning.
    if (LOG.isTraceEnabled()) {
      LOG.trace(message, e);
    } else {
      LOG.warn(message + " " + e.getMessage());
    }

    if (avoidStaleNodes) {
      // Retry chooseTarget again, this time not avoiding stale nodes.

      // excludedNodes contains the initial excludedNodes and nodes that were
      // not chosen because they were stale, decommissioned, etc.
      // We need to additionally exclude the nodes that were added to the
      // result list in the successful calls to choose*() above.
      for (DatanodeStorageInfo resultStorage : results) {
        addToExcludedNodes(resultStorage.getDatanodeDescriptor(), oldExcludedNodes);
      }
      // Set numOfReplicas, since it can get out of sync with the result list
      // if the NotEnoughReplicasException was thrown in chooseRandom().
      numOfReplicas = totalReplicasExpected - results.size();
      return chooseTarget(numOfReplicas, writer, oldExcludedNodes, blocksize,
          maxNodesPerRack, results, false, storagePolicy, unavailableStorages,
          newBlock);
    }

    boolean retry = false;
    // simply add all the remaining types into unavailableStorages and give
    // another try. No best effort is guaranteed here.
    for (StorageType type : storageTypes.keySet()) {
      if (!unavailableStorages.contains(type)) {
        unavailableStorages.add(type);
        retry = true;
      }
    }
    // Retry only if the loop above marked at least one new type unavailable;
    // otherwise the retry would repeat the same attempt.
    if (retry) {
      for (DatanodeStorageInfo resultStorage : results) {
        addToExcludedNodes(resultStorage.getDatanodeDescriptor(),
            oldExcludedNodes);
      }
      numOfReplicas = totalReplicasExpected - results.size();
      return chooseTarget(numOfReplicas, writer, oldExcludedNodes, blocksize,
          maxNodesPerRack, results, false, storagePolicy, unavailableStorages,
          newBlock);
    }
  }
  return writer;
}
/** * Choose <i>localMachine</i> as the target. * if <i>localMachine</i> is not available, * choose a node on the same rack * @return the chosen storage */ protected DatanodeStorageInfo chooseLocalStorage(Node localMachine, Set<Node> excludedNodes, long blocksize, int maxNodesPerRack, List<DatanodeStorageInfo> results, boolean avoidStaleNodes, EnumMap<StorageType, Integer> storageTypes, boolean fallbackToLocalRack) throws NotEnoughReplicasException { // if no local machine, randomly choose one node if (localMachine == null) { return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes); } if (preferLocalNode && localMachine instanceof DatanodeDescriptor) { DatanodeDescriptor localDatanode = (DatanodeDescriptor) localMachine; // otherwise try local machine first if (excludedNodes.add(localMachine)) { // was not in the excluded list for (Iterator<Map.Entry<StorageType, Integer>> iter = storageTypes .entrySet().iterator(); iter.hasNext(); ) { Map.Entry<StorageType, Integer> entry = iter.next(); for (DatanodeStorageInfo localStorage : DFSUtil.shuffle( localDatanode.getStorageInfos())) { StorageType type = entry.getKey(); if (addIfIsGoodTarget(localStorage, excludedNodes, blocksize, maxNodesPerRack, false, results, avoidStaleNodes, type) >= 0) { int num = entry.getValue(); if (num == 1) { iter.remove(); } else { entry.setValue(num - 1); } return localStorage; } } } } } if (!fallbackToLocalRack) { return null; } // try a node on local rack return chooseLocalRack(localMachine, excludedNodes, blocksize, maxNodesPerRack, results, avoidStaleNodes, storageTypes); }
/** * If the given storage is a good target, add it to the result list and * update the set of excluded nodes. * @return -1 if the given is not a good target; * otherwise, return the number of nodes added to excludedNodes set. */ int addIfIsGoodTarget(DatanodeStorageInfo storage, Set<Node> excludedNodes, long blockSize, int maxNodesPerRack, boolean considerLoad, List<DatanodeStorageInfo> results, boolean avoidStaleNodes, StorageType storageType) { if (isGoodTarget(storage, blockSize, maxNodesPerRack, considerLoad, results, avoidStaleNodes, storageType)) { results.add(storage); // add node and related nodes to excludedNode return addToExcludedNodes(storage.getDatanodeDescriptor(), excludedNodes); } else { return -1; } }
/** * Determine if a storage is a good target. * * @param storage The target storage * @param blockSize Size of block * @param maxTargetPerRack Maximum number of targets per rack. The value of * this parameter depends on the number of racks in * the cluster and total number of replicas for a block * @param considerLoad whether or not to consider load of the target node * @param results A list containing currently chosen nodes. Used to check if * too many nodes has been chosen in the target rack. * @param avoidStaleNodes Whether or not to avoid choosing stale nodes * @return Return true if <i>node</i> has enough space, * does not have too much load, * and the rack does not have too many nodes. */ private boolean isGoodTarget(DatanodeStorageInfo storage, long blockSize, int maxTargetPerRack, boolean considerLoad, List<DatanodeStorageInfo> results, boolean avoidStaleNodes, StorageType requiredStorageType) { if (storage.getStorageType() != requiredStorageType) { logNodeIsNotChosen(storage, "storage types do not match," + " where the required storage type is " + requiredStorageType); return false; } if (storage.getState() == State.READ_ONLY_SHARED) { logNodeIsNotChosen(storage, "storage is read-only"); return false; } if (storage.getState() == State.FAILED) { logNodeIsNotChosen(storage, "storage has failed"); return false; } DatanodeDescriptor node = storage.getDatanodeDescriptor(); // check if the node is (being) decommissioned if (node.isDecommissionInProgress() || node.isDecommissioned()) { logNodeIsNotChosen(storage, "the node is (being) decommissioned "); return false; } if (avoidStaleNodes) { if (node.isStale(this.staleInterval)) { logNodeIsNotChosen(storage, "the node is stale "); return false; } } final long requiredSize = blockSize * HdfsConstants.MIN_BLOCKS_FOR_WRITE; final long scheduledSize = blockSize * node.getBlocksScheduled(storage.getStorageType()); final long remaining = node.getRemaining(storage.getStorageType(), requiredSize); if (requiredSize > 
remaining - scheduledSize) { logNodeIsNotChosen(storage, "the node does not have enough " + storage.getStorageType() + " space" + " (required=" + requiredSize + ", scheduled=" + scheduledSize + ", remaining=" + remaining + ")"); return false; } // check the communication traffic of the target machine if (considerLoad) { final double maxLoad = 2.0 * stats.getInServiceXceiverAverage(); final int nodeLoad = node.getXceiverCount(); if (nodeLoad > maxLoad) { logNodeIsNotChosen(storage, "the node is too busy (load: " + nodeLoad + " > " + maxLoad + ") "); return false; } } // check if the target rack has chosen too many nodes String rackname = node.getNetworkLocation(); int counter=1; for(DatanodeStorageInfo resultStorage : results) { if (rackname.equals( resultStorage.getDatanodeDescriptor().getNetworkLocation())) { counter++; } } if (counter>maxTargetPerRack) { logNodeIsNotChosen(storage, "the rack has too many chosen nodes "); return false; } return true; }
getPipeline方法如下:
/** * Return a pipeline of nodes. * The pipeline is formed finding a shortest path that * starts from the writer and traverses all <i>nodes</i> * This is basically a traveling salesman problem. */ private DatanodeStorageInfo[] getPipeline(Node writer, DatanodeStorageInfo[] storages) { if (storages.length == 0) { return storages; } synchronized(clusterMap) { int index=0; if (writer == null || !clusterMap.contains(writer)) { writer = storages[0].getDatanodeDescriptor(); } for(; index < storages.length; index++) { DatanodeStorageInfo shortestStorage = storages[index]; int shortestDistance = clusterMap.getDistance(writer, shortestStorage.getDatanodeDescriptor()); int shortestIndex = index; for(int i = index + 1; i < storages.length; i++) { int currentDistance = clusterMap.getDistance(writer, storages[i].getDatanodeDescriptor()); if (shortestDistance>currentDistance) { shortestDistance = currentDistance; shortestStorage = storages[i]; shortestIndex = i; } } //switch position index & shortestIndex if (index != shortestIndex) { storages[shortestIndex] = storages[index]; storages[index] = shortestStorage; } writer = shortestStorage.getDatanodeDescriptor(); } } return storages; }