四、Replication Subsystem
/**
 * One replica of a partition, identified by the broker that hosts it.
 *
 * Only part of the class is reproduced in this excerpt; the elided members are marked below.
 *
 * @param brokerId                  id of the broker hosting this replica
 * @param partition                 the partition this replica belongs to
 * @param time                      clock used to seed the log-end-offset update timestamp
 * @param initialHighWatermarkValue offset used to build the initial high watermark metadata
 * @param log                       optional log attached to this replica
 */
class Replica(val brokerId: Int,
              val partition: Partition,
              time: Time = SystemTime,
              initialHighWatermarkValue: Long = 0L,
              val log: Option[Log] = None) extends Logging {

  // the high watermark offset value, in non-leader replicas only its message offsets are kept
  @volatile private[this] var highWatermarkMetadata: LogOffsetMetadata =
    new LogOffsetMetadata(initialHighWatermarkValue)

  // the log end offset value, kept in all replicas;
  // for local replica it is the log's end offset, for remote replicas its value is only updated by follower fetch
  @volatile private[this] var logEndOffsetMetadata: LogOffsetMetadata =
    LogOffsetMetadata.UnknownOffsetMetadata

  // the time when log offset is updated
  private[this] val logEndOffsetUpdateTimeMsValue = new AtomicLong(time.milliseconds)

  val topic = partition.topic
  val partitionId = partition.partitionId

  // …………………………………… (members omitted in this excerpt)

  /** Two replicas are equal when their topic, broker id and partition are all equal. */
  override def equals(that: Any): Boolean = that match {
    case other: Replica =>
      topic.equals(other.topic) && brokerId == other.brokerId && partition.equals(other.partition)
    case _ =>
      false
  }

  /** Hash built from the same three fields that `equals` compares. */
  override def hashCode(): Int = {
    val weightedBrokerId = 17 * brokerId
    31 + topic.hashCode() + weightedBrokerId + partition.hashCode()
  }

  // …………………………………… (members omitted in this excerpt)
}
/**
 * Chooses the new leader, the new isr and the replicas that should receive the LeaderAndIsrRequest
 * for a partition whose leader is offline:
 *  1. When at least one isr broker is alive, the leader is taken from the live isr and the live isr
 *     becomes the new isr.
 *  2. Otherwise, when unclean leader election is disabled for the topic, a NoReplicaOnlineException is thrown.
 *  3. Otherwise, the first live broker of the assigned replica list becomes both the leader and the new isr.
 *  4. When no assigned replica is alive, a NoReplicaOnlineException is thrown.
 * Replicas to receive the LeaderAndIsr request = live assigned replicas.
 * Once the leader is successfully registered in zookeeper, it updates the allLeaders cache.
 */
class OfflinePartitionLeaderSelector(controllerContext: ControllerContext, config: KafkaConfig)
  extends PartitionLeaderSelector with Logging {

  this.logIdent = "[OfflinePartitionLeaderSelector]: "

  /**
   * @param topicAndPartition   the partition that needs a new leader
   * @param currentLeaderAndIsr the leader/isr state currently recorded for that partition
   * @return the new leader/isr state (leader epoch and zk version each incremented by one) together
   *         with the live assigned replicas
   * @throws NoReplicaOnlineException when the partition has no replica assignment, when no isr broker
   *                                  is alive and unclean election is disabled, or when no assigned
   *                                  replica is alive
   */
  def selectLeader(topicAndPartition: TopicAndPartition,
                   currentLeaderAndIsr: LeaderAndIsr): (LeaderAndIsr, Seq[Int]) = {
    // Guard: without a replica assignment there is nothing to elect from.
    val assignedReplicas = controllerContext.partitionReplicaAssignment.get(topicAndPartition).getOrElse(
      throw new NoReplicaOnlineException("Partition %s doesn't have replicas assigned to it".format(topicAndPartition)))

    // liveBrokerIds is read again for every element, exactly as the inline lambdas did.
    def isAlive(brokerId: Int): Boolean = controllerContext.liveBrokerIds.contains(brokerId)

    val liveAssignedReplicas = assignedReplicas.filter(isAlive)
    val liveBrokersInIsr = currentLeaderAndIsr.isr.filter(isAlive)
    val nextLeaderEpoch = currentLeaderAndIsr.leaderEpoch + 1
    val nextZkPathVersion = currentLeaderAndIsr.zkVersion + 1

    val newLeaderAndIsr =
      if (liveBrokersInIsr.nonEmpty) {
        // Clean election: keep the assigned-replica ordering and take the first one still in the live isr.
        val liveReplicasInIsr = liveAssignedReplicas.filter(replica => liveBrokersInIsr.contains(replica))
        val newLeader = liveReplicasInIsr.head
        debug("Some broker in ISR is alive for %s. Select %d from ISR %s to be the leader."
          .format(topicAndPartition, newLeader, liveBrokersInIsr.mkString(",")))
        new LeaderAndIsr(newLeader, nextLeaderEpoch, liveBrokersInIsr.toList, nextZkPathVersion)
      } else {
        // Prior to electing an unclean (i.e. non-ISR) leader, ensure that doing so is not disallowed
        // by the configuration for unclean leader election.
        val topicLogConfig = LogConfig.fromProps(config.props.props,
          AdminUtils.fetchTopicConfig(controllerContext.zkClient, topicAndPartition.topic))
        if (!topicLogConfig.uncleanLeaderElectionEnable) {
          throw new NoReplicaOnlineException(("No broker in ISR for partition " +
            "%s is alive. Live brokers are: [%s],".format(topicAndPartition, controllerContext.liveBrokerIds)) +
            " ISR brokers are: [%s]".format(currentLeaderAndIsr.isr.mkString(",")))
        }

        debug("No broker in ISR is alive for %s. Pick the leader from the alive assigned replicas: %s"
          .format(topicAndPartition, liveAssignedReplicas.mkString(",")))

        if (liveAssignedReplicas.isEmpty) {
          throw new NoReplicaOnlineException(("No replica for partition " +
            "%s is alive. Live brokers are: [%s],".format(topicAndPartition, controllerContext.liveBrokerIds)) +
            " Assigned replicas are: [%s]".format(assignedReplicas))
        } else {
          // Unclean election: record it in the metric before picking the first live assigned replica.
          ControllerStats.uncleanLeaderElectionRate.mark()
          val newLeader = liveAssignedReplicas.head
          warn("No broker in ISR is alive for %s. Elect leader %d from live brokers %s. There's potential data loss."
            .format(topicAndPartition, newLeader, liveAssignedReplicas.mkString(",")))
          new LeaderAndIsr(newLeader, nextLeaderEpoch, List(newLeader), nextZkPathVersion)
        }
      }

    info("Selected new leader and ISR %s for offline partition %s".format(newLeaderAndIsr.toString(), topicAndPartition))
    (newLeaderAndIsr, liveAssignedReplicas)
  }
}
主要代码如下。可以看到,管理 offsets topic 的方法和管理普通 topic 的方法没有多大区别:
/**
 * Look up the cached offset for one group/topic/partition.
 *
 * @param key The requested group-topic-partition
 * @return the cached offset and metadata paired with ErrorMapping.NoError when the key is in the cache;
 *         otherwise OffsetMetadataAndError.NoOffset
 */
private def getOffset(key: GroupTopicPartition) =
  Option(offsetsCache.get(key)) match {
    case Some(cached) => OffsetMetadataAndError(cached.offset, cached.metadata, ErrorMapping.NoError)
    case None => OffsetMetadataAndError.NoOffset
  }

/**
 * Store an (already committed) offset for the given group/topic/partition in the cache.
 *
 * @param key The group-topic-partition
 * @param offsetAndMetadata The offset/metadata to be stored
 */
private def putOffset(key: GroupTopicPartition, offsetAndMetadata: OffsetAndMetadata): Unit = {
  offsetsCache.put(key, offsetAndMetadata)
}

/**
 * Store a batch of offsets for one group in the cache.
 *
 * This method is called _after_ the offsets have been durably appended to the commit log, so there is
 * no need to check for current leadership as is done for the offset fetch.
 *
 * @param group   The consumer group the offsets belong to
 * @param offsets The offset/metadata to store, per topic-partition
 */
def putOffsets(group: String, offsets: Map[TopicAndPartition, OffsetAndMetadata]): Unit = {
  trace("Putting offsets %s for group %s in offsets partition %d.".format(offsets, group, partitionFor(group)))
  for ((topicAndPartition, offsetAndMetadata) <- offsets)
    putOffset(GroupTopicPartition(group, topicAndPartition), offsetAndMetadata)
}

/**
 * The most important guarantee that this API provides is that it should never return a stale offset,
 * i.e. it either returns the current offset or reports an error code for every requested partition.
 *
 * @param group           The consumer group to look up
 * @param topicPartitions The partitions to look up; when empty, every cached partition of the group is returned
 * @return per topic-partition, the cached offset, NoOffset, OffsetsLoading or NotOffsetManagerForGroup
 */
def getOffsets(group: String, topicPartitions: Seq[TopicAndPartition]): Map[TopicAndPartition, OffsetMetadataAndError] = {
  trace("Getting offsets %s for group %s.".format(topicPartitions, group))
  val offsetsPartition = partitionFor(group)

  // Answers every requested partition with one and the same status.
  def answerAllWith(status: OffsetMetadataAndError): Map[TopicAndPartition, OffsetMetadataAndError] =
    topicPartitions.map { requested =>
      (GroupTopicPartition(group, requested).topicPartition, status)
    }.toMap

  // followerTransitionLock protects against fetching from an empty/cleared offset cache (i.e. cleared
  // due to a leader->follower transition): even if leader-is-local is true, a follower transition
  // could occur right after the check and clear the cache, and we would then read from the empty
  // cache and incorrectly return NoOffset.
  followerTransitionLock.synchronized {
    if (!leaderIsLocal(offsetsPartition)) {
      debug("Could not fetch offsets for group %s (not offset coordinator).".format(group))
      answerAllWith(OffsetMetadataAndError.NotOffsetManagerForGroup)
    } else if (loadingPartitions.synchronized { loadingPartitions.contains(offsetsPartition) }) {
      debug("Cannot fetch offsets for group %s due to ongoing offset load.".format(group))
      answerAllWith(OffsetMetadataAndError.OffsetsLoading)
    } else if (topicPartitions.isEmpty) {
      // Return offsets for all partitions owned by this consumer group.
      // (this only applies to consumers that commit offsets to Kafka.)
      offsetsCache
        .filter { case (cachedKey, _) => cachedKey.group == group }
        .map { case (cachedKey, cached) =>
          (cachedKey.topicPartition, OffsetMetadataAndError(cached.offset, cached.metadata, ErrorMapping.NoError))
        }
        .toMap
    } else {
      topicPartitions.map { requested =>
        val cacheKey = GroupTopicPartition(group, requested)
        (cacheKey.topicPartition, getOffset(cacheKey))
      }.toMap
    }
  }
}

/**
 * Hand the load of one offsets-topic partition to the scheduler; the load reads the partition from the
 * offsets topic and populates the cache. Does nothing but log when a load of that partition is already
 * registered in loadingPartitions.
 *
 * @param offsetsPartition The partition of the offsets topic to load
 */
def loadOffsetsFromLog(offsetsPartition: Int): Unit = {
  val topicPartition = TopicAndPartition(OffsetManager.OffsetsTopicName, offsetsPartition)

  // The scheduled task. It always removes the partition from loadingPartitions when it finishes,
  // whether it succeeded, found no log, or failed.
  def loadOffsets(): Unit = {
    info("Loading offsets from " + topicPartition)
    val startMs = SystemTime.milliseconds
    try {
      replicaManager.logManager.getLog(topicPartition) match {
        case None =>
          warn("No log found for " + topicPartition)

        case Some(log) =>
          var nextOffset = log.logSegments.head.baseOffset
          val readBuffer = ByteBuffer.allocate(config.loadBufferSize)
          // loop breaks if leader changes at any time during the load, since getHighWatermark is -1
          while (nextOffset < getHighWatermark(offsetsPartition) && !shuttingDown.get()) {
            readBuffer.clear()
            val fileMessages = log.read(nextOffset, config.loadBufferSize).messageSet.asInstanceOf[FileMessageSet]
            fileMessages.readInto(readBuffer, 0)
            for (entry <- new ByteBufferMessageSet(readBuffer)) {
              require(entry.message.key != null, "Offset entry key should not be null")
              val key = OffsetManager.readMessageKey(entry.message.key)
              if (entry.message.payload != null) {
                val value = OffsetManager.readMessageValue(entry.message.payload)
                putOffset(key, value)
                trace("Loaded offset %s for %s.".format(value, key))
              } else if (offsetsCache.remove(key) != null) {
                // A null payload is a tombstone: drop whatever the cache holds for this key.
                trace("Removed offset for %s due to tombstone entry.".format(key))
              } else {
                trace("Ignoring redundant tombstone for %s.".format(key))
              }
              nextOffset = entry.nextOffset
            }
          }

          if (!shuttingDown.get())
            info("Finished loading offsets from %s in %d milliseconds."
              .format(topicPartition, SystemTime.milliseconds - startMs))
      }
    } catch {
      case t: Throwable =>
        error("Error in loading offsets from " + topicPartition, t)
    } finally {
      loadingPartitions.synchronized { loadingPartitions.remove(offsetsPartition) }
    }
  }

  loadingPartitions.synchronized {
    if (!loadingPartitions.contains(offsetsPartition)) {
      loadingPartitions.add(offsetsPartition)
      scheduler.schedule(topicPartition.toString, loadOffsets)
    } else {
      info("Offset load from %s already in progress.".format(topicPartition))
    }
  }
}
这两个抽象类用于管理 partition 的 fetcher,即 Kafka 的数据消费机制。其中 Manager 的作用是创建 Thread,并将 Thread 绑定到 partition 上(或从 partition 上移除);Thread 的作用是处理 FetchRequest,从指定 partition 的当前 offset 处继续读取不超过 HW(high watermark)的数据。当前读取的 offset 由内存中一个 (topic, partition) -> offset 的 HashMap 进行管理,且对该 HashMap 的读写都是互斥的。