private def doHandleStateChanges(replicaId: Int, replicas: Seq[PartitionAndReplica], targetState: ReplicaState): Unit = {
  // validReplicas (computed earlier in the method, elided here) is the subset of
  // `replicas` for which the transition to targetState is a legal state change
  targetState match {
    case OfflineReplica =>
      // Send a StopReplicaRequest for each valid replica, without deleting the partition data
      validReplicas.foreach { replica =>
        controllerBrokerRequestBatch.addStopReplicaRequestForBrokers(Seq(replicaId), replica.topicPartition, deletePartition = false)
      }
  }
}
private def doHandleStateChanges(replicaId: Int, replicas: Seq[PartitionAndReplica], targetState: ReplicaState): Unit = {
  targetState match {
    case ReplicaDeletionStarted =>
      validReplicas.foreach { replica =>
        val currentState = controllerContext.replicaState(replica)
        logSuccessfulTransition(replicaId, replica.topicPartition, currentState, ReplicaDeletionStarted)
        controllerContext.putReplicaState(replica, ReplicaDeletionStarted)
        // Same StopReplicaRequest as in the offline case, but this time with deletePartition = true
        controllerBrokerRequestBatch.addStopReplicaRequestForBrokers(Seq(replicaId), replica.topicPartition, deletePartition = true)
      }
  }
}
Comparing the two snippets above, the only difference in the StopReplicaRequest being sent is the flag: taking a replica offline passes deletePartition = false, while deleting a replica passes true. A reasonable guess is that this flag controls whether the log files get deleted.
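Before following the flag to the broker side, here is a plausible, simplified sketch of how the controller's request batch might record it. This is illustrative only: the names stopReplicaRequestMap and StopReplicaRequestInfo stand in for the actual bookkeeping inside Kafka's ControllerBrokerRequestBatch.
// Illustrative sketch, not Kafka source: the batch records the flag per
// (broker, partition) and serializes it into the StopReplicaRequest when sent.
def addStopReplicaRequestForBrokers(brokerIds: Seq[Int],
                                    topicPartition: TopicPartition,
                                    deletePartition: Boolean): Unit = {
  brokerIds.filter(_ >= 0).foreach { brokerId =>
    val forBroker = stopReplicaRequestMap.getOrElseUpdate(brokerId, mutable.Map.empty)
    // If the same partition is queued twice, a deletePartition = true wins
    val alreadyDelete = forBroker.get(topicPartition).exists(_.deletePartition)
    forBroker.put(topicPartition, StopReplicaRequestInfo(topicPartition, deletePartition || alreadyDelete))
  }
}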
Handling a stop-replica request mainly does the following:
1. Validate the epochs (the broker epoch in the handler, the controllerEpoch in stopReplicas).
2. Stop the fetcher threads, i.e. the threads that sync data from the leader.
3. Delete the log files if, and only if, the request asks for it.
def handleStopReplicaRequest(request: RequestChannel.Request): Unit = {
// ensureTopicExists is only for client facing requests
// We can't have the ensureTopicExists check here since the controller sends it as an advisory to all brokers so they
// stop serving data to clients for the topic being deleted
val stopReplicaRequest = request.body[StopReplicaRequest]
authorizeClusterOperation(request, CLUSTER_ACTION)
    // Validate the broker epoch
if (isBrokerEpochStale(stopReplicaRequest.brokerEpoch)) {
// When the broker restarts very quickly, it is possible for this broker to receive request intended
// for its previous generation so the broker should skip the stale request.
info("Received stop replica request with broker epoch " +
s"${stopReplicaRequest.brokerEpoch} smaller than the current broker epoch ${controller.brokerEpoch}")
sendResponseExemptThrottle(request, new StopReplicaResponse(new StopReplicaResponseData().setErrorCode(Errors.STALE_BROKER_EPOCH.code)))
} else {
      // Delegate the actual work to the replica manager
val (result, error) = replicaManager.stopReplicas(stopReplicaRequest)
// Clear the coordinator caches in case we were the leader. In the case of a reassignment, we
// cannot rely on the LeaderAndIsr API for this since it is only sent to active replicas.
result.foreach { case (topicPartition, error) =>
if (error == Errors.NONE && stopReplicaRequest.deletePartitions) {
          // Special-case the internal consumer-offsets topic and the transaction-state topic
if (topicPartition.topic == GROUP_METADATA_TOPIC_NAME) {
groupCoordinator.onResignation(topicPartition.partition)
} else if (topicPartition.topic == TRANSACTION_STATE_TOPIC_NAME) {
// The StopReplica API does not pass through the leader epoch
txnCoordinator.onResignation(topicPartition.partition, coordinatorEpoch = None)
}
}
}
def toStopReplicaPartition(tp: TopicPartition, error: Errors) =
new StopReplicaResponseData.StopReplicaPartitionError()
.setTopicName(tp.topic)
.setPartitionIndex(tp.partition)
.setErrorCode(error.code)
sendResponseExemptThrottle(request, new StopReplicaResponse(new StopReplicaResponseData()
.setErrorCode(error.code)
.setPartitionErrors(result.map { case (tp, error) => toStopReplicaPartition(tp, error) }.toBuffer.asJava)))
}
CoreUtils.swallow(replicaManager.replicaFetcherManager.shutdownIdleFetcherThreads(), this)
}
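Epoch validation shows up in all of these control-plane handlers, so it is worth seeing what isBrokerEpochStale does. Roughly, as a simplified sketch of the check in KafkaApis (the real method also raises an error if the request somehow carries an epoch from the future):
// Simplified sketch of the broker-epoch staleness check.
private def isBrokerEpochStale(brokerEpochInRequest: Long): Boolean = {
  // Controllers that predate KIP-380 send no broker epoch; never treat that as stale.
  if (brokerEpochInRequest == AbstractControlRequest.UNKNOWN_BROKER_EPOCH) false
  else
    // An epoch older than this broker's current registration means the request
    // was aimed at a previous incarnation of the broker and must be skipped.
    brokerEpochInRequest < controller.brokerEpoch
}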
def stopReplicas(stopReplicaRequest: StopReplicaRequest): (mutable.Map[TopicPartition, Errors], Errors) = {
replicaStateChangeLock synchronized {
val responseMap = new collection.mutable.HashMap[TopicPartition, Errors]
if (stopReplicaRequest.controllerEpoch() < controllerEpoch) {
stateChangeLogger.warn("Received stop replica request from an old controller epoch " +
s"${stopReplicaRequest.controllerEpoch}. Latest known controller epoch is $controllerEpoch")
(responseMap, Errors.STALE_CONTROLLER_EPOCH)
} else {
val partitions = stopReplicaRequest.partitions.asScala.toSet
controllerEpoch = stopReplicaRequest.controllerEpoch
// First stop fetchers for all partitions, then stop the corresponding replicas
      // Stop the fetcher threads this broker runs for these topic-partitions,
      // i.e. the threads that sync data for the replicas
replicaFetcherManager.removeFetcherForPartitions(partitions)
replicaAlterLogDirsManager.removeFetcherForPartitions(partitions)
for (topicPartition <- partitions){
try {
          // Per replica, deletePartitions decides whether the log files get deleted
stopReplica(topicPartition, stopReplicaRequest.deletePartitions)
responseMap.put(topicPartition, Errors.NONE)
} catch {
case e: KafkaStorageException =>
stateChangeLogger.error(s"Ignoring stop replica (delete=${stopReplicaRequest.deletePartitions}) for " +
s"partition $topicPartition due to storage exception", e)
responseMap.put(topicPartition, Errors.KAFKA_STORAGE_ERROR)
}
}
(responseMap, Errors.NONE)
}
}
}
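Step 2 above, stopping the fetchers, is handled by the fetcher managers. A simplified sketch of what removeFetcherForPartitions does, condensed from AbstractFetcherManager (shutdown of now-idle threads happens separately, as the shutdownIdleFetcherThreads call in the handler shows):
// Simplified: every fetcher thread drops the given partitions, so no further
// fetch requests are issued for them; idle threads are shut down later.
def removeFetcherForPartitions(partitions: Set[TopicPartition]): Unit = {
  lock synchronized {
    for (fetcher <- fetcherThreadMap.values)
      fetcher.removePartitions(partitions)
  }
}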
def stopReplica(topicPartition: TopicPartition, deletePartition: Boolean) = {
stateChangeLogger.trace(s"Handling stop replica (delete=$deletePartition) for partition $topicPartition")
  // Delete the partition's log files: the state machine passes false here for the
  // offline transition and true for replica deletion
if (deletePartition) {
    // Branch on the hosted state of the partition: the replica may not exist on
    // this broker, or its disk may be broken
getPartition(topicPartition) match {
case HostedPartition.Offline =>
throw new KafkaStorageException(s"Partition $topicPartition is on an offline disk")
case hostedPartition @ HostedPartition.Online(removedPartition) =>
if (allPartitions.remove(topicPartition, hostedPartition)) {
          // Remove the topic's metrics
maybeRemoveTopicMetrics(topicPartition.topic)
// this will delete the local log. This call may throw exception if the log is on offline directory
          // Delete the log files (TODO: the log subsystem deserves a study of its own later)
removedPartition.delete()
}
case HostedPartition.None =>
stateChangeLogger.trace(s"Ignoring stop replica (delete=$deletePartition) for partition " +
s"$topicPartition as replica doesn't exist on broker")
}
// Delete log and corresponding folders in case replica manager doesn't hold them anymore.
// This could happen when topic is being deleted while broker is down and recovers.
    // If the broker went down after the replica was marked for deletion and then
    // came back, this resumes the pending delete
if (logManager.getLog(topicPartition).isDefined)
logManager.asyncDelete(topicPartition)
if (logManager.getLog(topicPartition, isFuture = true).isDefined)
logManager.asyncDelete(topicPartition, isFuture = true)
  }
}
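The three cases matched above come from the HostedPartition states that ReplicaManager keeps per partition. Roughly, simplified from ReplicaManager.scala:
// Simplified: ReplicaManager tracks each partition it knows about as one of three states.
sealed trait HostedPartition
object HostedPartition {
  // This broker holds nothing for the partition.
  final object None extends HostedPartition
  // The partition is live on this broker, backed by a Partition object.
  final case class Online(partition: Partition) extends HostedPartition
  // The partition's log lives on a log directory that has failed.
  final object Offline extends HostedPartition
}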
Updating the metadata involves the following steps:
1. Validate the controllerEpoch.
2. Have replicaManager update the metadata held in kafka.server.MetadataCache.
3. From the update result, determine whether any partitions were deleted; if so, remove their offsets.
4. Update the related quota metrics.
5. Complete delayed operations.
def handleUpdateMetadataRequest(request: RequestChannel.Request): Unit = {
    // Correlation id uniquely identifying the request
val correlationId = request.header.correlationId
val updateMetadataRequest = request.body[UpdateMetadataRequest]
authorizeClusterOperation(request, CLUSTER_ACTION)
    // As before, validate the broker epoch first
if (isBrokerEpochStale(updateMetadataRequest.brokerEpoch)) {
// When the broker restarts very quickly, it is possible for this broker to receive request intended
// for its previous generation so the broker should skip the stale request.
info("Received update metadata request with broker epoch " +
s"${updateMetadataRequest.brokerEpoch} smaller than the current broker epoch ${controller.brokerEpoch}")
sendResponseExemptThrottle(request,
new UpdateMetadataResponse(new UpdateMetadataResponseData().setErrorCode(Errors.STALE_BROKER_EPOCH.code)))
} else {
      // Delegate to the replica manager to update the metadata cache
val deletedPartitions = replicaManager.maybeUpdateMetadataCache(correlationId, updateMetadataRequest)
      // If any partitions were deleted, have the group coordinator remove their offsets
if (deletedPartitions.nonEmpty)
groupCoordinator.handleDeletedPartitions(deletedPartitions)
      // Complete delayed topic operations
if (adminManager.hasDelayedTopicOperations) {
updateMetadataRequest.partitionStates.asScala.foreach { partitionState =>
adminManager.tryCompleteDelayedTopicOperations(partitionState.topicName)
}
}
      // Update the related quota metrics
quotas.clientQuotaCallback.foreach { callback =>
if (callback.updateClusterMetadata(metadataCache.getClusterMetadata(clusterId, request.context.listenerName))) {
quotas.fetch.updateQuotaMetricConfigs()
quotas.produce.updateQuotaMetricConfigs()
quotas.request.updateQuotaMetricConfigs()
}
}
      // Complete delayed election operations
if (replicaManager.hasDelayedElectionOperations) {
updateMetadataRequest.partitionStates.asScala.foreach { partitionState =>
val tp = new TopicPartition(partitionState.topicName, partitionState.partitionIndex)
replicaManager.tryCompleteElection(TopicPartitionOperationKey(tp))
}
}
sendResponseExemptThrottle(request, new UpdateMetadataResponse(
new UpdateMetadataResponseData().setErrorCode(Errors.NONE.code)))
}
}
Let's focus on the cache-update method. Note that it validates the controllerEpoch once more, and then returns the partitions that need to be deleted.
def maybeUpdateMetadataCache(correlationId: Int, updateMetadataRequest: UpdateMetadataRequest) : Seq[TopicPartition] = {
replicaStateChangeLock synchronized {
      // Validate the controllerEpoch again
if(updateMetadataRequest.controllerEpoch < controllerEpoch) {
val stateControllerEpochErrorMessage = s"Received update metadata request with correlation id $correlationId " +
s"from an old controller ${updateMetadataRequest.controllerId} with epoch ${updateMetadataRequest.controllerEpoch}. " +
s"Latest known controller epoch is $controllerEpoch"
stateChangeLogger.warn(stateControllerEpochErrorMessage)
throw new ControllerMovedException(stateChangeLogger.messageWithPrefix(stateControllerEpochErrorMessage))
} else {
        // Delegate to metadataCache to apply the update, then return the partitions that need deleting
val deletedPartitions = metadataCache.updateMetadata(correlationId, updateMetadataRequest)
controllerEpoch = updateMetadataRequest.controllerEpoch
deletedPartitions
}
}
}
Here is the core logic of the metadata update. First, aliveBrokers and aliveNodes are rebuilt from the request data: aliveBrokers holds, per brokerId, the host, port, and security protocol of each endpoint, while aliveNodes holds the cluster Node per listener, so the two describe the brokers along different dimensions (a sketch of their shapes is given right below). Kafka also supports partial metadata updates via an UpdateMetadataRequest: the current partitionStates are copied first and then updated entry by entry from the request; any partition whose incoming leader is marked as deleted is removed and collected into the returned list.
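Since the original figure is not reproduced here, the following illustrative snippet shows the shape of the two maps (example hostnames and ports only, not real cluster data):
// Illustrative shapes only.
// aliveBrokers: brokerId -> Broker(id, endpoints(host, port, listener, security protocol), rack)
val aliveBrokers = mutable.LongMap[Broker](
  0L -> Broker(0, Seq(EndPoint("broker0", 9092, new ListenerName("PLAINTEXT"), SecurityProtocol.PLAINTEXT)), rack = None))
// aliveNodes: brokerId -> (listener -> Node), i.e. one Node per advertised listener
val aliveNodes = mutable.LongMap[collection.Map[ListenerName, Node]](
  0L -> Map(new ListenerName("PLAINTEXT") -> new Node(0, "broker0", 9092)))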
def updateMetadata(correlationId: Int, updateMetadataRequest: UpdateMetadataRequest): Seq[TopicPartition] = {
inWriteLock(partitionMetadataLock) {
    // metadataSnapshot is, as the name suggests, a snapshot of the cached metadata
val aliveBrokers = new mutable.LongMap[Broker](metadataSnapshot.aliveBrokers.size)
val aliveNodes = new mutable.LongMap[collection.Map[ListenerName, Node]](metadataSnapshot.aliveNodes.size)
val controllerId = updateMetadataRequest.controllerId match {
case id if id < 0 => None
case id => Some(id)
}
updateMetadataRequest.liveBrokers.asScala.foreach { broker =>
      // The original comment below explains why java.util.HashMap is chosen here;
      // AnyRefMap may replace it once Scala 2.10 support is dropped
// `aliveNodes` is a hot path for metadata requests for large clusters, so we use java.util.HashMap which
// is a bit faster than scala.collection.mutable.HashMap. When we drop support for Scala 2.10, we could
// move to `AnyRefMap`, which has comparable performance.
val nodes = new java.util.HashMap[ListenerName, Node]
val endPoints = new mutable.ArrayBuffer[EndPoint]
broker.endpoints.asScala.foreach { ep =>
val listenerName = new ListenerName(ep.listener)
endPoints += new EndPoint(ep.host, ep.port, listenerName, SecurityProtocol.forId(ep.securityProtocol))
nodes.put(listenerName, new Node(broker.id, ep.host, ep.port))
}
aliveBrokers(broker.id) = Broker(broker.id, endPoints, Option(broker.rack))
aliveNodes(broker.id) = nodes.asScala
}
aliveNodes.get(brokerId).foreach { listenerMap =>
val listeners = listenerMap.keySet
if (!aliveNodes.values.forall(_.keySet == listeners))
error(s"Listeners are not identical across brokers: $aliveNodes")
}
val deletedPartitions = new mutable.ArrayBuffer[TopicPartition]
if (!updateMetadataRequest.partitionStates.iterator.hasNext) {
metadataSnapshot = MetadataSnapshot(metadataSnapshot.partitionStates, controllerId, aliveBrokers, aliveNodes)
} else {
        // Partial updates are allowed: copy the snapshot first and mutate the copy
        // (copy-on-write, so readers keep seeing a consistent snapshot)
val partitionStates = new mutable.AnyRefMap[String, mutable.LongMap[UpdateMetadataPartitionState]](metadataSnapshot.partitionStates.size)
metadataSnapshot.partitionStates.foreach { case (topic, oldPartitionStates) =>
val copy = new mutable.LongMap[UpdateMetadataPartitionState](oldPartitionStates.size)
copy ++= oldPartitionStates
partitionStates += (topic -> copy)
}
updateMetadataRequest.partitionStates.asScala.foreach { info =>
val controllerId = updateMetadataRequest.controllerId
val controllerEpoch = updateMetadataRequest.controllerEpoch
val tp = new TopicPartition(info.topicName, info.partitionIndex)
if (info.leader == LeaderAndIsr.LeaderDuringDelete) {
removePartitionInfo(partitionStates, tp.topic, tp.partition)
stateChangeLogger.trace(s"Deleted partition $tp from metadata cache in response to UpdateMetadata " +
s"request sent by controller $controllerId epoch $controllerEpoch with correlation id $correlationId")
deletedPartitions += tp
} else {
addOrUpdatePartitionInfo(partitionStates, tp.topic, tp.partition, info)
stateChangeLogger.trace(s"Cached leader info $info for partition $tp in response to " +
s"UpdateMetadata request sent by controller $controllerId epoch $controllerEpoch with correlation id $correlationId")
}
}
metadataSnapshot = MetadataSnapshot(partitionStates, controllerId, aliveBrokers, aliveNodes)
}
deletedPartitions
}
}
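Finally, the snapshot swapped in at the end is just an immutable holder of the four pieces of state. Roughly, simplified from MetadataCache:
// Simplified: the whole cache is replaced by swapping in a new snapshot,
// so concurrent readers never observe a half-applied update.
case class MetadataSnapshot(partitionStates: mutable.AnyRefMap[String, mutable.LongMap[UpdateMetadataPartitionState]],
                            controllerId: Option[Int],
                            aliveBrokers: mutable.LongMap[Broker],
                            aliveNodes: mutable.LongMap[collection.Map[ListenerName, Node]])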