* 为一个Block的副本选择本地存放位置
private DatanodeDescriptor chooseLocalNode(DatanodeDescriptor localMachine, List excludedNodes, long blocksize, int maxNodesPerRack, List results) throws NotEnoughReplicasException {
// if no local machine, randomly choose one node
if (localMachine == null)
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, maxNodesPerRack, results);
// otherwise try local machine first
if (!excludedNodes.contains(localMachine)) {
if (isGoodTarget(localMachine, blocksize, maxNodesPerRack, false, results)) {
return localMachine;
// try a node on local rack
return chooseLocalRack(localMachine, excludedNodes, blocksize, maxNodesPerRack, results);
private boolean isGoodTarget(DatanodeDescriptor node, long blockSize, int maxTargetPerLoc, boolean considerLoad, List results) {
Log logr = FSNamesystem.LOG;
// 节点不可用了
if (node.isDecommissionInProgress() || node.isDecommissioned()) {
logr.debug("Node "+NodeBase.getPath(node)+ " is not chosen because the node is (being) decommissioned");
return false;
long remaining = node.getRemaining() - (node.getBlocksScheduled() * blockSize);
// 节点剩余的容量够不够
if (blockSize* FSConstants.MIN_BLOCKS_FOR_WRITE>remaining) {
logr.debug("Node "+NodeBase.getPath(node)+ " is not chosen because the node does not have enough space");
return false;
// 节点当前的负载情况
if (considerLoad) {
double avgLoad = 0;
int size = clusterMap.getNumOfLeaves();
if (size != 0) {
avgLoad = (double)fs.getTotalLoad()/size;
if (node.getXceiverCount() > (2.0 * avgLoad)) {
logr.debug("Node "+NodeBase.getPath(node)+ " is not chosen because the node is too busy");
return false;
// 该节点坐在的机架被选择存放当前数据块副本的数据节点过多
String rackname = node.getNetworkLocation();
int counter=1;
for(Iterator iter = results.iterator(); iter.hasNext();) {
Node result = iter.next();
if (rackname.equals(result.getNetworkLocation())) {
if (counter>maxTargetPerLoc) {
logr.debug("Node "+NodeBase.getPath(node)+ " is not chosen because the rack has too many chosen nodes");
return false;
return true;
private DatanodeDescriptor chooseLocalRack(DatanodeDescriptor localMachine, List excludedNodes, long blocksize, int maxNodesPerRack, List results)throws NotEnoughReplicasException {
// 如果参考点为空,则从整个集群中随机选择一个合适的数据节点作为此时的本地机架节点
if (localMachine == null) {
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, maxNodesPerRack, results);
try {
return chooseRandom(localMachine.getNetworkLocation(), excludedNodes, blocksize, maxNodesPerRack, results);
} catch (NotEnoughReplicasException e1) {
DatanodeDescriptor newLocal=null;
for(Iterator iter=results.iterator(); iter.hasNext();) {
DatanodeDescriptor nextNode = iter.next();
if (nextNode != localMachine) {
newLocal = nextNode;
if (newLocal != null) {//找到了一个新的参考点
try {
return chooseRandom(newLocal.getNetworkLocation(), excludedNodes, blocksize, maxNodesPerRack, results);
} catch(NotEnoughReplicasException e2) {
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, maxNodesPerRack, results);
} else {
return chooseRandom(NodeBase.ROOT, excludedNodes, blocksize, maxNodesPerRack, results);
3.选择一个远程机架节点private void chooseRemoteRack(int numOfReplicas, DatanodeDescriptor localMachine, List excludedNodes, long blocksize, int maxReplicasPerRack, List results)
throws NotEnoughReplicasException {
int oldNumOfReplicas = results.size();
// randomly choose one node from remote racks
try {
chooseRandom(numOfReplicas, "~"+localMachine.getNetworkLocation(), excludedNodes, blocksize, maxReplicasPerRack, results);
} catch (NotEnoughReplicasException e) {
chooseRandom(numOfReplicas-(results.size()-oldNumOfReplicas), localMachine.getNetworkLocation(), excludedNodes, blocksize, maxReplicasPerRack, results);
// Repeatedly asks the three-argument chooseRandom overload for candidate
// nodes under `nodes` and throws NotEnoughReplicasException when
// numOfReplicas is still positive after the loop.
//
// NOTE(review): this method is incomplete as pasted. The `for` line below is
// truncated (text between a '<' and a later '>' appears to have been lost),
// which removed the per-candidate loop body and the head of the do/while
// condition. Closing braces are missing as well. Restore from the original
// source before relying on this code.
private void chooseRandom(int numOfReplicas, String nodes, List excludedNodes, long blocksize, int maxNodesPerRack, List results) throws NotEnoughReplicasException {
boolean toContinue = true;
do {
DatanodeDescriptor[] selectedNodes = chooseRandom(numOfReplicas, nodes, excludedNodes);
// Fewer candidates than requested were returned; toContinue is cleared,
// presumably so the loop does not go around again (it appears in the
// residue of the loop condition below).
if (selectedNodes.length < numOfReplicas) {
toContinue = false;
// NOTE(review): truncated line; loop body and `while (` head are missing.
for(int i=0; i0 && toContinue);
// Replicas are still outstanding after the loop: report failure to the caller.
if (numOfReplicas>0) {
throw new NotEnoughReplicasException( "Not able to place enough replicas");
// Draws random nodes from clusterMap under `nodes`, checks them against
// excludedNodes, and returns the collected nodes as a DatanodeDescriptor array.
//
// NOTE(review): this method is incomplete as pasted. The `numOfReplicas = (...`
// line is truncated (text between a '<' and a later '>' appears to have been
// lost), the statements that should follow the excludedNodes check are
// absent, and closing braces are missing. Restore from the original source.
private DatanodeDescriptor[] chooseRandom(int numOfReplicas, String nodes, List excludedNodes) {
List results = new ArrayList();
// Number of nodes under `nodes` that are still available given excludedNodes.
int numOfAvailableNodes = clusterMap.countNumOfAvailableNodes(nodes, excludedNodes);
// NOTE(review): truncated line; presumably caps numOfReplicas at
// numOfAvailableNodes and opens a `while (numOfReplicas > 0)` loop. Confirm.
numOfReplicas = (numOfAvailableNodes 0) {
DatanodeDescriptor choosenNode = (DatanodeDescriptor)(clusterMap.chooseRandom(nodes));
// NOTE(review): the body of this check is missing from the paste.
if (!excludedNodes.contains(choosenNode)) {
return (DatanodeDescriptor[])results.toArray(new DatanodeDescriptor[results.size()]);
// Reorders `nodes` in place and returns the same array. An empty array is
// returned unchanged. The reordering runs while holding the clusterMap monitor.
//
// NOTE(review): this method is incomplete as pasted. The `for(;index...` line
// is truncated (text between a '<' and a later '>' appears to have been
// lost), so the declarations of shortestDistance / shortestNode /
// shortestIndex / currentNode / currentDistance and the inner scan are
// missing, as are the closing braces. From the visible remainder it looks
// like each position `index` receives the node with the smallest distance to
// the current `writer`, which then becomes the new `writer`. Confirm against
// the original source.
private DatanodeDescriptor[] getPipeline( DatanodeDescriptor writer, DatanodeDescriptor[] nodes) {
if (nodes.length==0) return nodes;
synchronized(clusterMap) {
int index=0;
// Without a writer known to clusterMap, start from the first node.
if (writer == null || !clusterMap.contains(writer)) {
writer = nodes[0];
// NOTE(review): truncated line; outer loop header and inner scan are missing.
for(;indexcurrentDistance) {
shortestDistance = currentDistance;
shortestNode = currentNode;
shortestIndex = i;
//switch position index & shortestIndex
if (index != shortestIndex) {
nodes[shortestIndex] = nodes[index];
nodes[index] = shortestNode;
// The node just placed becomes the reference for the next position.
writer = shortestNode;
return nodes;
private DatanodeDescriptor chooseTarget(int numOfReplicas, DatanodeDescriptor writer, List excludedNodes, long blocksize, int maxNodesPerRack, List results) {
if (numOfReplicas == 0 || clusterMap.getNumOfLeaves()==0) {
return writer;
int numOfResults = results.size();
boolean newBlock = (numOfResults==0);
if (writer == null && !newBlock) {
writer = (DatanodeDescriptor)results.get(0);
try {
switch(numOfResults) {
case 0:
LOG.debug("Try to choose a local DataNode for a replication of block..");
writer = chooseLocalNode(writer, excludedNodes, blocksize, maxNodesPerRack, results);
if (--numOfReplicas == 0) {
case 1:
LOG.debug("Try to choose a remote DataNode for a replication of block..");
chooseRemoteRack(1, results.get(0), excludedNodes, blocksize, maxNodesPerRack, results);
if (--numOfReplicas == 0) {
case 2:
LOG.debug("Try to choose a local rack DataNode for a replication of block..");
if (clusterMap.isOnSameRack(results.get(0), results.get(1))) {
chooseRemoteRack(1, results.get(0), excludedNodes, blocksize, maxNodesPerRack, results);
} else if (newBlock){
chooseLocalRack(results.get(1), excludedNodes, blocksize, maxNodesPerRack, results);
} else {
chooseLocalRack(writer, excludedNodes, blocksize, maxNodesPerRack, results);
if (--numOfReplicas == 0) {
LOG.debug("Try to randomly choose a local DataNode for a replication of block..");
chooseRandom(numOfReplicas, NodeBase.ROOT, excludedNodes, blocksize, maxNodesPerRack, results);
} catch (NotEnoughReplicasException e) {
FSNamesystem.LOG.warn("Not able to place enough replicas, still in need of " + numOfReplicas);
return writer;