Node 代表网络拓扑中的一个叶子结点(数据结点)或者一个中间结点(数据中心或机架)。
/**
 * Defines a node in a network topology.
 * A node is either a leaf representing a data node, or an inner node
 * representing a datacenter or rack.
 * Each node has a name, and its location in the network is described by
 * a string with syntax similar to a file name.
 * For example, a data node's name is hostname:port#, and if it is located
 * at rack "orange" in datacenter "dog", the string representation of its
 * network location is /dog/orange.
 */
@InterfaceAudience.LimitedPrivate({"HDFS", "MapReduce"})
@InterfaceStability.Unstable
public interface Node {
  /**
   * Returns the network location of this node.
   *
   * @return the string representation of this node's network location
   */
  String getNetworkLocation();

  /**
   * Sets the network location of this node.
   *
   * @param location the location
   */
  void setNetworkLocation(String location);

  /**
   * Returns the name of this node.
   *
   * @return this node's name
   */
  String getName();

  /**
   * Returns the parent of this node.
   *
   * @return this node's parent
   */
  Node getParent();

  /**
   * Sets the parent of this node.
   *
   * @param parent the parent
   */
  void setParent(Node parent);

  /**
   * Returns the depth of this node in the tree.
   * E.g. the root of a tree returns 0 and its children return 1.
   *
   * @return this node's level in the tree
   */
  int getLevel();

  /**
   * Sets the depth of this node in the tree.
   *
   * @param i the level
   */
  void setLevel(int i);
}
DatanodeID 代表一个 Datanode 结点的基本标识。Datanode 由它的联系方式(hostname 和各个端口)加上它的 storageID 共同标识。DatanodeID 里有 hostName 字段和 peerHostName 字段:hostName 是 Datanode 自己上报的主机名;peerHostName 是 Namenode 根据实际连接的 ipAddr 和 /etc/hosts 的配置反向解析出来的主机名。
/** * This class represents the primary identifier for a Datanode. * Datanodes are identified by how they can be contacted (hostname * and ports) and their storage ID, a unique number that associates * the Datanodes blocks with a particular Datanode. * * {@link DatanodeInfo#getName()} should be used to get the network * location (for topology) of a datanode, instead of using * {@link DatanodeID#getXferAddr()} here. Helpers are defined below * for each context in which a DatanodeID is used. */ @InterfaceAudience.Private @InterfaceStability.Evolving public class DatanodeID implements Comparable<DatanodeID> { public static final DatanodeID[] EMPTY_ARRAY = {}; private String ipAddr; // IP address private String hostName; // hostname claimed by datanode private String peerHostName; // hostname from the actual connection private int xferPort; // data streaming port private int infoPort; // info server port private int infoSecurePort; // info server port private int ipcPort; // IPC server port /** * UUID identifying a given datanode. For upgraded Datanodes this is the * same as the StorageID that was previously used by this Datanode. * For newly formatted Datanodes it is a UUID. */ private String datanodeUuid = null;
/** * This class extends the primary identifier of a Datanode with ephemeral * state, eg usage information, current administrative state, and the * network location that is communicated to clients. */ @InterfaceAudience.Private @InterfaceStability.Evolving public class DatanodeInfo extends DatanodeID implements Node { private long capacity; private long dfsUsed; private long remaining; private long blockPoolUsed; private long cacheCapacity; private long cacheUsed; private long lastUpdate; private int xceiverCount; private String location = NetworkTopology.DEFAULT_RACK; private String softwareVersion; private List<String> dependentHostNames = new LinkedList<String>(); protected AdminStates adminState; private transient int level; //which level of the tree the node resides private transient Node parent; //its parent
/** * This class extends the DatanodeInfo class with ephemeral information (eg * health, capacity, what blocks are associated with the Datanode) that is * private to the Namenode, ie this class is not exposed to clients. */
DatanodeDescriptor 有以下内部类。BlockTargetPair 存放 Block 和 DatanodeStorageInfo 的对应关系,通俗地讲,即数据块存储在哪些存储(磁盘)上。
/**
 * Holder that pairs a block with an array of target storages.
 * Both references are fixed at construction time; the array itself is
 * stored as given (it is not copied).
 */
@InterfaceAudience.Private
@InterfaceStability.Evolving
public static class BlockTargetPair {
  /** The block of this pair. */
  public final Block block;

  /** The target storages of this pair. */
  public final DatanodeStorageInfo[] targets;

  /**
   * Creates a pair from the given block and targets.
   *
   * @param blk the block
   * @param storageTargets the target storages for the block
   */
  BlockTargetPair(Block blk, DatanodeStorageInfo[] storageTargets) {
    block = blk;
    targets = storageTargets;
  }
}
以下的内部类是用LinkedList实现的一个BlockQueue
/**
 * A FIFO queue backed by a {@link LinkedList}.
 * Every method synchronizes on this instance, so a queue can be shared
 * between threads without external locking.
 *
 * @param <E> the element type
 */
private static class BlockQueue<E> {
  private final Queue<E> blockq = new LinkedList<E>();

  /**
   * Size of the queue.
   *
   * @return the number of queued elements
   */
  synchronized int size() {
    return blockq.size();
  }

  /**
   * Enqueue.
   *
   * @param e the element to append
   * @return the result of the underlying {@link Queue#offer(Object)}
   */
  synchronized boolean offer(E e) {
    return blockq.offer(e);
  }

  /**
   * Dequeue up to {@code numBlocks} elements in FIFO order.
   *
   * @param numBlocks the maximum number of elements to remove
   * @return the removed elements, or {@code null} (not an empty list) when
   *         {@code numBlocks <= 0} or the queue is empty
   */
  synchronized List<E> poll(int numBlocks) {
    if (numBlocks <= 0 || blockq.isEmpty()) {
      return null;
    }
    List<E> results = new ArrayList<E>();
    for (; !blockq.isEmpty() && numBlocks > 0; numBlocks--) {
      results.add(blockq.poll());
    }
    return results;
  }

  /**
   * Returns {@code true} if the queue contains the specified element.
   * Synchronized like every other accessor: the backing LinkedList is not
   * thread-safe, so scanning it while another thread offers or polls
   * must not happen outside the lock.
   *
   * @param e the element to look for
   * @return whether the element is currently queued
   */
  synchronized boolean contains(E e) {
    return blockq.contains(e);
  }

  /** Removes all elements from the queue. */
  synchronized void clear() {
    blockq.clear();
  }
}
CachedBlocksList
/**
 * A list of CachedBlock objects on this datanode.
 * Each list remembers the datanode it belongs to and which
 * {@link Type} of list it is.
 */
public static class CachedBlocksList
    extends IntrusiveCollection<CachedBlock> {

  /** The kinds of cached-block list. */
  public enum Type {
    PENDING_CACHED,
    CACHED,
    PENDING_UNCACHED
  }

  private final DatanodeDescriptor datanode;
  private final Type type;

  /**
   * Creates a list owned by the given datanode.
   *
   * @param owner the datanode this list belongs to
   * @param listType the kind of list
   */
  CachedBlocksList(DatanodeDescriptor owner, Type listType) {
    datanode = owner;
    type = listType;
  }

  /** @return the datanode this list belongs to */
  public DatanodeDescriptor getDatanode() {
    return this.datanode;
  }

  /** @return the kind of this list */
  public Type getType() {
    return this.type;
  }
}
/** * process datanode heartbeat or stats initialization. */ public void updateHeartbeatState(StorageReport[] reports, long cacheCapacity, long cacheUsed, int xceiverCount, int volFailures) { long totalCapacity = 0; long totalRemaining = 0; long totalBlockPoolUsed = 0; long totalDfsUsed = 0; Set<DatanodeStorageInfo> failedStorageInfos = null; // Decide if we should check for any missing StorageReport and mark it as // failed. There are different scenarios. // 1. When DN is running, a storage failed. Given the current DN // implementation doesn't add recovered storage back to its storage list // until DN restart, we can assume volFailures won't decrease // during the current DN registration session. // When volumeFailures == this.volumeFailures, it implies there is no // state change. No need to check for failed storage. This is an // optimization. // 2. After DN restarts, volFailures might not increase and it is possible // we still have new failed storage. For example, admins reduce // available storages in configuration. Another corner case // is the failed volumes might change after restart; a) there // is one good storage A, one restored good storage B, so there is // one element in storageReports and that is A. b) A failed. c) Before // DN sends HB to NN to indicate A has failed, DN restarts. d) After DN // restarts, storageReports has one element which is B. boolean checkFailedStorages = (volFailures > this.volumeFailures) || !heartbeatedSinceRegistration; if (checkFailedStorages) {//默认每种storage都是失效的,然后如果上报这个storage的信息,再去掉这个storage. 
LOG.info("Number of failed storage changes from " + this.volumeFailures + " to " + volFailures); failedStorageInfos = new HashSet<DatanodeStorageInfo>( storageMap.values()); } setCacheCapacity(cacheCapacity); setCacheUsed(cacheUsed); setXceiverCount(xceiverCount); setLastUpdate(Time.now()); this.volumeFailures = volFailures; for (StorageReport report : reports) { DatanodeStorageInfo storage = updateStorage(report.getStorage()); if (checkFailedStorages) {//每一个上报的storage代表这个storage正常工作。 failedStorageInfos.remove(storage); } storage.receivedHeartbeat(report); totalCapacity += report.getCapacity(); totalRemaining += report.getRemaining(); totalBlockPoolUsed += report.getBlockPoolUsed(); totalDfsUsed += report.getDfsUsed(); } rollBlocksScheduled(getLastUpdate()); // Update total metrics for the node. setCapacity(totalCapacity); setRemaining(totalRemaining); setBlockPoolUsed(totalBlockPoolUsed); setDfsUsed(totalDfsUsed); if (checkFailedStorages) { updateFailedStorage(failedStorageInfos); } }
在上面的方法里有一个循环,循环中调用 DatanodeStorageInfo storage = updateStorage(report.getStorage());
updateStorage方法的代码如下:
DatanodeStorageInfo updateStorage(DatanodeStorage s) { synchronized (storageMap) { DatanodeStorageInfo storage = storageMap.get(s.getStorageID()); if (storage == null) { LOG.info("Adding new storage ID " + s.getStorageID() + " for DN " + getXferAddr()); storage = new DatanodeStorageInfo(this, s); storageMap.put(s.getStorageID(), storage); } else if (storage.getState() != s.getState() || storage.getStorageType() != s.getStorageType()) { // For backwards compatibility, make sure that the type and // state are updated. Some reports from older datanodes do // not include these fields so we may have assumed defaults. // This check can be removed in the next major release after // 2.4. storage.updateFromStorage(s); storageMap.put(storage.getStorageID(), storage); } return storage; } }
然后调用rollBlocksScheduled,默认计算10分钟内该datanode每种存储类型被安排写数据块的次数,代码如下:
/**
 * Rolls the scheduled-block counters once the roll interval has elapsed:
 * the current counters become the previous ones, the current counters are
 * reset, and the roll time is advanced to {@code now}. Does nothing while
 * the interval since the last roll has not been exceeded.
 *
 * @param now the current time, compared against the last roll time
 */
private void rollBlocksScheduled(long now) {
  final long sinceLastRoll = now - lastBlocksScheduledRollTime;
  if (sinceLastRoll <= BLOCKS_SCHEDULED_ROLL_INTERVAL) {
    // Interval not exceeded yet; keep accumulating into the current counters.
    return;
  }
  prevApproxBlocksScheduled.set(currApproxBlocksScheduled);
  currApproxBlocksScheduled.reset();
  lastBlocksScheduledRollTime = now;
}
/*
 * Variables for maintaining the number of blocks scheduled to be written,
 * counted separately per StorageType. This count is approximate and might
 * be slightly bigger in case of errors (e.g. datanode does not report if
 * an error occurs while writing the block).
 * Two generations are kept: rollBlocksScheduled() copies the current
 * counters into the previous ones and resets the current ones each time
 * the roll interval elapses.
 */
private EnumCounters<StorageType> currApproxBlocksScheduled = new EnumCounters<StorageType>(StorageType.class);
private EnumCounters<StorageType> prevApproxBlocksScheduled = new EnumCounters<StorageType>(StorageType.class);
/**
 * Creates an iterator over blocks, built from the storages returned by
 * getStorageInfos(); the returned BlockIterator walks each storage's own
 * block iterator in turn.
 *
 * @return a new iterator over the blocks of those storages
 */
Iterator<BlockInfo> getBlockIterator() { return new BlockIterator(getStorageInfos()); }
BlockIterator类是一个静态内部类,比较好玩:它本身是一个遍历 BlockInfo 的 Iterator,内部又维护了一个 Iterator 列表(每个 storage 对应一个),遍历时依次把各个 storage 的迭代器串联起来。
/**
 * Iterates over the blocks of several storages as one sequence by chaining
 * the block iterator of each storage, in the order the storages were given.
 * Removal is not supported.
 */
private static class BlockIterator implements Iterator<BlockInfo> {
  /** Position in {@link #iterators} of the iterator currently being drained. */
  private int index = 0;
  /** One block iterator per storage; fixed at construction. */
  private final List<Iterator<BlockInfo>> iterators;

  /**
   * Collects the block iterator of every given storage.
   *
   * @param storages the storages whose blocks are iterated
   */
  private BlockIterator(final DatanodeStorageInfo... storages) {
    List<Iterator<BlockInfo>> perStorage =
        new ArrayList<Iterator<BlockInfo>>(storages.length);
    for (DatanodeStorageInfo e : storages) {
      perStorage.add(e.getBlockIterator());
    }
    this.iterators = Collections.unmodifiableList(perStorage);
  }

  @Override
  public boolean hasNext() {
    update();
    return !iterators.isEmpty() && iterators.get(index).hasNext();
  }

  /**
   * Returns the next block.
   *
   * @throws java.util.NoSuchElementException if there are no storages to
   *         iterate over; with at least one storage, exhaustion is reported
   *         by the underlying storage iterator
   */
  @Override
  public BlockInfo next() {
    // With zero storages there is no iterator to delegate to; report
    // exhaustion the way the Iterator contract requires instead of letting
    // iterators.get(0) fail with IndexOutOfBoundsException.
    if (iterators.isEmpty()) {
      throw new java.util.NoSuchElementException();
    }
    update();
    return iterators.get(index).next();
  }

  @Override
  public void remove() {
    throw new UnsupportedOperationException("Remove unsupported.");
  }

  /**
   * Advances {@link #index} past exhausted iterators, stopping at the last
   * one so that {@code index} always stays a valid position.
   */
  private void update() {
    while (index < iterators.size() - 1 && !iterators.get(index).hasNext()) {
      index++;
    }
  }
}
/**
 * Decommissioning status.
 * Holds replication counters and a start time. The counters can only be
 * updated, and the counters and start time only read back, while
 * {@code isDecommissionInProgress()} is true; otherwise updates are ignored
 * and the getters return 0. All methods synchronize on this status object.
 */
public class DecommissioningStatus {
  private int underReplicatedBlocks;
  private int decommissionOnlyReplicas;
  private int underReplicatedInOpenFiles;
  private long startTime;

  /**
   * Stores the three counters; ignored unless decommission is in progress.
   *
   * @param underRep the number of under-replicated blocks
   * @param onlyRep the number of decommission-only replicas
   * @param underConstruction the number of under-replicated blocks in
   *        open files
   */
  synchronized void set(int underRep, int onlyRep, int underConstruction) {
    if (!isDecommissionInProgress()) {
      return;
    }
    underReplicatedBlocks = underRep;
    decommissionOnlyReplicas = onlyRep;
    underReplicatedInOpenFiles = underConstruction;
  }

  /** @return the number of under-replicated blocks */
  public synchronized int getUnderReplicatedBlocks() {
    return isDecommissionInProgress() ? underReplicatedBlocks : 0;
  }

  /** @return the number of decommission-only replicas */
  public synchronized int getDecommissionOnlyReplicas() {
    return isDecommissionInProgress() ? decommissionOnlyReplicas : 0;
  }

  /** @return the number of under-replicated blocks in open files */
  public synchronized int getUnderReplicatedInOpenFiles() {
    return isDecommissionInProgress() ? underReplicatedInOpenFiles : 0;
  }

  /**
   * Set start time. Unlike {@link #set(int, int, int)}, this is stored
   * regardless of whether decommission is in progress.
   *
   * @param time the start time to record
   */
  public synchronized void setStartTime(long time) {
    startTime = time;
  }

  /** @return start time, or 0 when decommission is not in progress */
  public synchronized long getStartTime() {
    return isDecommissionInProgress() ? startTime : 0;
  }
}