main()->Cli.run()->Balancer.run()
// Repeat full passes over all connectors until a pass ends with no balancer
// reporting IN_PROGRESS, or until an error status is returned.
boolean done = false;
for(int iteration = 0; !done; iteration++) {
// Assume this pass is the last one; cleared below if any balancer is still in progress.
done = true;
// Shuffle the connectors so the namespaces are visited in a random order
Collections.shuffle(connectors);
for(NameNodeConnector nnc : connectors) {
// Initialize the balancer: sets the threshold and the thread pool sizes
final Balancer b = new Balancer(nnc, p, conf);
// Main balancing flow for this namespace (iterates over every datanode)
final ReturnStatus r = b.run(iteration, formatter, conf);
// clean all lists
b.resetData(conf);
// Still in progress: schedule another pass. Any status other than
// IN_PROGRESS or SUCCESS ends the whole run with that status code.
if (r == ReturnStatus.IN_PROGRESS) {
done = false;
} else if (r != ReturnStatus.SUCCESS) {
//must be an error status, return.
return r.code;
}
}
// Wait for sleeptime before the next pass when at least one balancer is still in progress
if (!done) {
Thread.sleep(sleeptime);
}
}
/**
* Construct a balancer.
* Initialize balancer. It sets the value of the threshold, and
* builds the communication proxies to
* namenode as a client and a secondary namenode and retry proxies
* when connection fails.
*/
Balancer(NameNodeConnector theblockpool, Parameters p, Configuration conf) {
this.threshold = p.threshold;
this.policy = p.policy;
this.nnc = theblockpool;
cluster = NetworkTopology.getInstance(conf);
// 初始化mover线程池,默认值1000,是真正执行block拷贝的线程
this.moverExecutor = Executors.newFixedThreadPool(
conf.getInt(DFSConfigKeys.DFS_BALANCER_MOVERTHREADS_KEY,
DFSConfigKeys.DFS_BALANCER_MOVERTHREADS_DEFAULT));
// 初始化dispatcher线程池,默认200,是从source的节点选取block的线程
this.dispatcherExecutor = Executors.newFixedThreadPool(
conf.getInt(DFSConfigKeys.DFS_BALANCER_DISPATCHERTHREADS_KEY,
DFSConfigKeys.DFS_BALANCER_DISPATCHERTHREADS_DEFAULT));
// 初始化每个datanode上同时拷贝block的并行数,默认值是5
this.maxConcurrentMovesPerNode =
conf.getInt(DFSConfigKeys.DFS_DATANODE_BALANCE_MAX_NUM_CONCURRENT_MOVES_KEY,
DFSConfigKeys.DFS_DATANODE_BALANCE_MAX_NUM_CONCURRENT_MOVES_DEFAULT);
}
// NOTE(review): the three statements below appear to be excerpted from the
// per-iteration balancing method; its header is not shown here - confirm
// against the full source.
/* Given a data node set, build a network topology and decide
* over-utilized datanodes, above average utilized datanodes,
* below average utilized datanodes, and underutilized datanodes.
* The input data node set is shuffled before the datanodes
* are put into the over-utilized datanodes, above average utilized
* datanodes, below average utilized datanodes, and
* underutilized datanodes lists. This will add some randomness
* to the node matching later on.
*/
final long bytesLeftToMove = initNodes(nnc.client.getDatanodeReport(DatanodeReportType.LIVE));
/* Decide all the nodes that will participate in the block move and
* the number of bytes that need to be moved from one node to another
* in this iteration. Maximum bytes to be moved per node is
* Min(1 Band worth of bytes, MAX_SIZE_TO_MOVE).
*/
final long bytesToMove = chooseNodes();
/* Start a thread to dispatch block moves for each source.
* The thread selects blocks to move & sends request to proxy source to
* initiate block move. The process is flow controlled. Block selection is
* blocked if there are too many un-confirmed block moves.
* Return the total number of bytes successfully moved in this iteration.
*/
// The value returned by dispatchBlockMoves() is handed to the connector's shouldContinue check.
this.nnc.shouldContinue(dispatchBlockMoves());
一、 initNodes() 操作流程图:
通过计算获得上图中上一步的4个List,List中的元素是BalancerDatanode(初始化时已经指定:final private static long MAX_SIZE_TO_MOVE = 10*1024*1024*1024L; //10GB),同时通过overUtilizedDatanodes和underUtilizedDatanodes计算出overLoadedBytes和underLoadedBytes,规则如下:
// Applied to the nodes in overUtilizedDatanodes: accumulate the bytes by which
// each node exceeds (avg + threshold) percent of its capacity
overLoadedBytes += (long)((datanodeS.utilization-avg-threshold)*datanodeS.datanode.getCapacity()/100.0);
// Applied to the nodes in underUtilizedDatanodes: accumulate the bytes by which
// each node falls short of (avg - threshold) percent of its capacity
underLoadedBytes += (long)((avg-threshold-datanodeS.utilization)*datanodeS.datanode.getCapacity()/100.0);
最后返回的bytesLeftToMove就是overLoadedBytes和underLoadedBytes两者中的较大值。
二、 chooseNodes() 操作规则:
优先选取source节点和dest节点同机架:1. overUtilizedDatanodes->underUtilizedDatanodes;2. overUtilizedDatanodes->belowAvgUtilizedDatanodes;3. aboveAvgUtilizedDatanodes->underUtilizedDatanodes(除了aboveAvgUtilizedDatanodes->belowAvgUtilizedDatanodes都有了);
其次选取source节点和dest节点不同机器(无限制):1. overUtilizedDatanodes->underUtilizedDatanodes;2. overUtilizedDatanodes->belowAvgUtilizedDatanodes;3. aboveAvgUtilizedDatanodes->underUtilizedDatanodes(除了aboveAvgUtilizedDatanodes->belowAvgUtilizedDatanodes都有了);
每个source到dest节点拷贝的数据大小加和作为返回值。默认就是每台机器10GB。(也就表示要从source拷贝数据到dest)
三、 dispatchBlockMoves() 操作:
从source中选取block拷贝到dest的具体操作,使用了Balancer初始化时初始化的dispatcherExecutor和moverExecutor两个线程池(含义可看上面代码中注释)。