下面的例子只是各拿一个做举例,不是全部场景,不要以为logStartOffset,LogEndOffset,HW,LW只有三个场景可以修改
这里需要针对logStartOffset和LogEndOffset做特殊说明,否则容易让大家混乱:前言之后的章节讲的都是主题分区级别的logStartOffset和LogEndOffset
在Leader副本的ISR集合中,最小的主题分区级别的LogEndOffset即为HW
这个数据是虚线的值,不是实际存储的值,可以参考第五章节证明环节,
但是LW和所有副本(AR)中最小的主题分区级别的logStartOffset是一致的
// Excerpt of the leader/follower transition handler; only the call that starts the
// high-watermark checkpoint task is shown.
// NOTE(review): the body appears abridged - the declared result type is LeaderAndIsrResponse,
// but the visible body ends with a Unit-returning call.
def becomeLeaderOrFollower(correlationId: Int,
leaderAndIsrRequest: LeaderAndIsrRequest,
onLeadershipChange: (Iterable[Partition], Iterable[Partition]) => Unit): LeaderAndIsrResponse = {
// Start the scheduled high-watermark task, whose purpose is to write every partition's high watermark into the high-watermark checkpoint file.
startHighWatermarkCheckPointThread()
}
/**
 * Schedules the recurring high-watermark checkpoint task, at most once.
 *
 * The `compareAndSet(false, true)` on `highWatermarkCheckPointThreadStarted` lets only the
 * first caller through; every later call does nothing.
 */
def startHighWatermarkCheckPointThread(): Unit = {
  val firstStart = highWatermarkCheckPointThreadStarted.compareAndSet(false, true)
  if (firstStart) {
    // The lambda is the task handed to the scheduler; it simply runs checkpointHighWatermarks().
    // NOTE(review): 0L and the configured interval are presumably the initial delay and the
    // period - confirm against the scheduler's signature.
    scheduler.schedule(
      "highwatermark-checkpoint",
      () => checkpointHighWatermarks(),
      0L,
      config.replicaHighWatermarkCheckpointIntervalMs)
  }
}
定时任务checkpointHighWatermarks
/**
 * Collects the current high watermark of every online partition's log (and future log, when
 * present), grouped by the log's parent directory, and writes each group through the
 * checkpoint registered for that directory.
 *
 * A KafkaStorageException raised while writing one directory is logged and does not stop the
 * remaining directories from being processed.
 */
def checkpointHighWatermarks(): Unit = {
  // Per log directory: partition -> high watermark. Pre-sized with allPartitions.size.
  val hwsByLogDir = new mutable.AnyRefMap[String, mutable.AnyRefMap[TopicPartition, Long]](
    allPartitions.size)

  // Stores the high watermark of `log` in the bucket of its parent directory,
  // creating the bucket on first use.
  def recordHw(log: UnifiedLog): Unit = {
    val dirHws = hwsByLogDir.getOrElseUpdate(log.parentDir,
      new mutable.AnyRefMap[TopicPartition, Long]())
    dirHws.put(log.topicPartition, log.highWatermark)
  }

  // Visit every online partition; both the current log and the future log contribute.
  onlinePartitionsIterator.foreach { partition =>
    partition.log.foreach(recordHw)
    partition.futureLog.foreach(recordHw)
  }

  // Write each directory's map through its checkpoint, if one is registered for that directory.
  hwsByLogDir.foreach { case (logDir, hws) =>
    try {
      highWatermarkCheckpoints.get(logDir).foreach(checkpoint => checkpoint.write(hws))
    } catch {
      case e: KafkaStorageException =>
        error(s"Error while writing to highwatermark file in directory $logDir", e)
    }
  }
}
至于为什么是fetchRecords方法,你可以看一下《kafka 3.5 kafka服务端处理消费者客户端拉取数据请求源码》
// Excerpt of the partition fetch path showing only the branch taken for follower fetches.
// Parts of the body are elided (marked "code omitted").
def fetchRecords(
fetchParams: FetchParams,
fetchPartitionData: FetchRequest.PartitionData,
fetchTimeMs: Long,
maxBytes: Int,
minOneMessage: Boolean,
updateFetchState: Boolean
): LogReadInfo = {
// (code omitted)
// Check whether the fetch request comes from a follower.
if (fetchParams.isFromFollower) {
// Check that the request is from a valid replica before doing the read
val (replica, logReadInfo) = inReadLock(leaderIsrUpdateLock) {
// (code omitted)
val logReadInfo = readFromLocalLog(localLog)
(replica, logReadInfo)
}
// After serving the follower's fetch, record its new fetch state - but only when the caller
// asked for it and the read reported no diverging epoch.
if (updateFetchState && !logReadInfo.divergingEpoch.isPresent) {
// The fetch came from a replica syncing from this broker, so update the follower's offsets
// (fetch offset metadata, log start offset, fetch time) as seen by the leader.
updateFollowerFetchState(
replica,
followerFetchOffsetMetadata = logReadInfo.fetchedData.fetchOffsetMetadata,
followerStartOffset = fetchPartitionData.logStartOffset,
followerFetchTimeMs = fetchTimeMs,
leaderEndOffset = logReadInfo.logEndOffset,
fetchParams.replicaEpoch
)
}
logReadInfo
}
// (code omitted)
}
/**
* Update the follower's state in the leader based on the last fetch request. See
* [[Replica.updateFetchState()]] for details.
*
* After the replica's state is updated, this method checks whether the low watermark or the
* high watermark moved and, if either did, tries to complete delayed requests.
*
* This method is visible for performance testing (see `UpdateFollowerFetchStateBenchmark`)
*
* @param replica the follower replica whose state is updated
* @param followerFetchOffsetMetadata fetch offset metadata passed on to the replica; its message offset is logged as the follower's LEO position
* @param followerStartOffset the follower's log start offset
* @param followerFetchTimeMs time of the fetch; also passed to the HW increment check
* @param leaderEndOffset the leader's log end offset passed on to the replica
* @param brokerEpoch the broker epoch passed on to the replica
*/
def updateFollowerFetchState(
replica: Replica,
followerFetchOffsetMetadata: LogOffsetMetadata,
followerStartOffset: Long,
followerFetchTimeMs: Long,
leaderEndOffset: Long,
brokerEpoch: Long
): Unit = {
// No need to calculate low watermark if there is no delayed DeleteRecordsRequest
// The low watermark (lowWatermarkIfLeader) is only computed when at least one delayed delete is pending; otherwise oldLeaderLW is -1.
val oldLeaderLW = if (delayedOperations.numDelayedDelete > 0) lowWatermarkIfLeader else -1L
// Remember the follower's log end offset from before this update.
val prevFollowerEndOffset = replica.stateSnapshot.logEndOffset
// Update the replica's fetch state: follower fetch offset metadata, start offset, fetch time, leader end offset and broker epoch.
replica.updateFetchState(
followerFetchOffsetMetadata,
followerStartOffset,
followerFetchTimeMs,
leaderEndOffset,
brokerEpoch
)
// Evaluate the same condition again after the update; without pending delayed deletes newLeaderLW is -1.
val newLeaderLW = if (delayedOperations.numDelayedDelete > 0) lowWatermarkIfLeader else -1L
// check if the LW of the partition has incremented
// since the replica's logStartOffset may have incremented
// The LW counts as incremented when the new value (newLeaderLW) is greater than the old one (oldLeaderLW).
val leaderLWIncremented = newLeaderLW > oldLeaderLW
// Check if this in-sync replica needs to be added to the ISR.
maybeExpandIsr(replica)
// check if the HW of the partition can now be incremented
// since the replica may already be in the ISR and its LEO has just incremented
// The HW increment is only attempted when the replica's log end offset changed as a result of this update.
val leaderHWIncremented = if (prevFollowerEndOffset != replica.stateSnapshot.logEndOffset) {
// the leader log may be updated by ReplicaAlterLogDirsThread so the following method must be in lock of
// leaderIsrUpdateLock to prevent adding new hw to invalid log.
// Try to advance the HW via maybeIncrementLeaderHW under leaderIsrUpdateLock; yields false when there is no local leader log.
inReadLock(leaderIsrUpdateLock) {
leaderLogIfLocal.exists(leaderLog => maybeIncrementLeaderHW(leaderLog, followerFetchTimeMs))
}
} else {
false
}
// some delayed operations may be unblocked after HW or LW changed
// If either the LW or the HW moved, try to complete delayed requests.
if (leaderLWIncremented || leaderHWIncremented)
tryCompleteDelayedRequests()
debug(s"Recorded replica ${replica.brokerId} log end offset (LEO) position " +
s"${followerFetchOffsetMetadata.messageOffset} and log start offset $followerStartOffset.")
}
/**
* Check and maybe increment the high watermark of the partition. Triggers listed in the original notes:
* 1. Partition ISR changed
* 2. Any replica's LEO changed
*
* The HW is determined by the smallest log end offset among all replicas that are in sync or are considered caught up.
* This way, if a replica is considered caught up but its log end offset is smaller than the HW, we wait for this replica to catch up to the HW before advancing the HW.
* This helps the situation where the ISR only contains the leader replica and a follower is trying to catch up.
* If we did not wait for the follower when advancing the HW, the follower's log end offset could keep falling behind the HW (determined by the leader's log end offset) and it would therefore never be added to the ISR.
* With the addition of AlterPartition, newly added replicas are also treated as part of the ISR when advancing the HW.
* The controller has not yet committed these replicas to the ISR, so we could still revert to the previously committed ISR.
* However, adding additional replicas to the ISR makes it more restrictive and therefore safer. We call this set the "maximal" ISR.
*
* @param leaderLog the leader's log whose high watermark may be advanced
* @param currentTimeMs time used for the caught-up check; defaults to time.milliseconds
* @return true when the log reports that the high watermark was updated; the other outcome is elided in this excerpt
*/
private def maybeIncrementLeaderHW(leaderLog: UnifiedLog, currentTimeMs: Long = time.milliseconds): Boolean = {
// Start from the leader log's log end offset metadata.
val leaderLogEndOffset = leaderLog.logEndOffsetMetadata
// Candidate for the new HW; it is only applied if the log accepts it below.
var newHighWatermark = leaderLogEndOffset
// Walk every replica in remoteReplicasMap and look at the log end offset in its state snapshot.
remoteReplicasMap.values.foreach { replica =>
// Note here we are using the "maximal", see explanation above
val replicaState = replica.stateSnapshot
// Lower the candidate when this replica's LEO is below it and the replica is either caught up to the leader LEO or a member of the maximal ISR.
if (replicaState.logEndOffsetMetadata.messageOffset < newHighWatermark.messageOffset &&
(replicaState.isCaughtUp(leaderLogEndOffset.messageOffset, currentTimeMs, replicaLagTimeMaxMs)
|| partitionState.maximalIsr.contains(replica.brokerId))) {
// The candidate becomes this replica's LEO, so after the loop it is the smallest LEO among the qualifying replicas (and the leader).
newHighWatermark = replicaState.logEndOffsetMetadata
}
}
leaderLog.maybeIncrementHighWatermark(newHighWatermark) match {
case Some(oldHighWatermark) =>
// The log returned the previous HW, meaning the update was applied: log it at debug level and return true.
debug(s"High watermark updated from $oldHighWatermark to $newHighWatermark")
true
// (code omitted)
}
}
leaderLog.maybeIncrementHighWatermark
这个是一个方法,有返回值
/**
* Update the high watermark to a new value if and only if it is larger than the old value. It is an error to update to a value which is larger than the log end offset.
* This method is intended to be used by the leader to update the high watermark after follower fetch offsets have been updated.
* @param newHighWatermark the proposed high watermark metadata
* @return the old high watermark, if updated by the new value
*/
def maybeIncrementHighWatermark(newHighWatermark: LogOffsetMetadata): Option[LogOffsetMetadata] = {
// (code omitted)
lock.synchronized {
val oldHighWatermark = fetchHighWatermarkMetadata
// (code omitted)
// Apply the update when the new offset is strictly greater, or when the offsets are equal
// but onOlderSegment reports the old metadata as being on an older segment than the new one.
if (oldHighWatermark.messageOffset < newHighWatermark.messageOffset ||
(oldHighWatermark.messageOffset == newHighWatermark.messageOffset && oldHighWatermark.onOlderSegment(newHighWatermark))) {
updateHighWatermarkMetadata(newHighWatermark)
Some(oldHighWatermark)
} else {
None
}
}
}
updateHighWatermarkMetadata
修改HW元数据的方法
/**
 * Replaces the cached high watermark metadata and notifies the dependants of the new offset.
 *
 * A value lower than the current one is still applied; it only produces a warning.
 *
 * @param newHighWatermark the high watermark metadata to install
 * @throws IllegalArgumentException if the message offset of `newHighWatermark` is negative
 */
private def updateHighWatermarkMetadata(newHighWatermark: LogOffsetMetadata): Unit = {
  val newOffset = newHighWatermark.messageOffset
  if (newOffset < 0)
    throw new IllegalArgumentException("High watermark offset should be non-negative")

  lock.synchronized {
    if (highWatermarkMetadata.messageOffset > newOffset) {
      warn(s"Non-monotonic update of high watermark from $highWatermarkMetadata to $newHighWatermark")
    }

    // Update the cached HW first, then propagate the new offset to the producer state manager
    // and the offsets listener, and finally re-evaluate the first unstable offset.
    highWatermarkMetadata = newHighWatermark
    producerStateManager.onHighWatermarkUpdated(newOffset)
    logOffsetsListener.onHighWatermarkUpdated(newOffset)
    maybeIncrementFirstUnstableOffset()
  }
  trace(s"Setting high watermark $newHighWatermark")
}
至于生产者推送消息到服务端,可以参考kafka 3.5 kafka服务端接收生产者发送的数据源码
// Excerpt of the log append path; most of the body is not shown.
// NOTE(review): appendInfo and validRecords are not defined in the visible lines and the declared
// LogAppendInfo result is not returned here - the excerpt appears abridged.
private def append(records: MemoryRecords,
origin: AppendOrigin,
interBrokerProtocolVersion: MetadataVersion,
validateAndAssignOffsets: Boolean,
leaderEpoch: Int,
requestLocal: Option[RequestLocal],
ignoreRecordSize: Boolean): LogAppendInfo = {
// Appends the records to the data file, the offset index file and the time index file.
// This call also modifies the LogEndOffset, keeping it at the offset of the latest record + 1.
localLog.append(appendInfo.lastOffset, appendInfo.maxTimestamp, appendInfo.offsetOfMaxTimestamp, validRecords)
// Then refresh the high watermark with respect to the log end offset (helper not shown in this excerpt).
updateHighWatermarkWithLogEndOffset()
}
/**
 * Appends `records` to the active segment and moves the log end offset to `lastOffset + 1`.
 *
 * @param lastOffset the offset of the last record in `records`
 * @param largestTimestamp the largest timestamp, forwarded to the segment
 * @param shallowOffsetOfMaxTimestamp the offset of the record with the largest timestamp, forwarded to the segment
 * @param records the records to append
 */
private[log] def append(lastOffset: Long, largestTimestamp: Long, shallowOffsetOfMaxTimestamp: Long, records: MemoryRecords): Unit = {
  // The active segment writes the data; per the original notes it also decides whether the
  // offset index and time index get a new entry.
  val activeSegment = segments.activeSegment
  activeSegment.append(
    largestOffset = lastOffset,
    largestTimestamp = largestTimestamp,
    shallowOffsetOfMaxTimestamp = shallowOffsetOfMaxTimestamp,
    records = records)

  // The log end offset always points one past the last appended record.
  val nextOffset = lastOffset + 1
  updateLogEndOffset(nextOffset)
}
其中updateLogEndOffset会修改logEndOffset
/**
 * Offset metadata of the next message that will be appended to the log.
 *
 * @return the current value of `nextOffsetMetadata`
 */
private[log] def logEndOffsetMetadata: LogOffsetMetadata = {
  nextOffsetMetadata
}
/**
 * Offset of the next message that will be appended to the log.
 *
 * @return the message offset held by `nextOffsetMetadata`
 */
private[log] def logEndOffset: Long = {
  val next = nextOffsetMetadata
  next.messageOffset
}
/**
 * Update end offset of the log, and update the recoveryPoint if needed.
 *
 * The new end-offset metadata is built from `endOffset` plus the active segment's base offset
 * and size. The recovery point is only touched when it lies beyond the new end offset, in
 * which case it is pulled back to `endOffset`.
 *
 * @param endOffset the new end offset of the log
 */
private[log] def updateLogEndOffset(endOffset: Long): Unit = {
  val activeBaseOffset = segments.activeSegment.baseOffset
  val activeSize = segments.activeSegment.size
  nextOffsetMetadata = new LogOffsetMetadata(endOffset, activeBaseOffset, activeSize)

  // Never leave the recovery point past the end of the log.
  if (endOffset < recoveryPoint)
    updateRecoveryPoint(endOffset)
}
new LogOffsetMetadata 会创建一个新的对象赋给 logEndOffsetMetadata,覆盖掉旧的;此时 logEndOffset 也就变成了新的 logEndOffsetMetadata 中的 messageOffset
/**
 * Creates offset metadata from its three components; each argument is stored in the field of
 * the same name.
 *
 * @param messageOffset the message offset
 * @param segmentBaseOffset the base offset of the segment
 * @param relativePositionInSegment the relative position inside that segment
 */
public LogOffsetMetadata(long messageOffset,
                         long segmentBaseOffset,
                         int relativePositionInSegment) {
    this.relativePositionInSegment = relativePositionInSegment;
    this.segmentBaseOffset = segmentBaseOffset;
    this.messageOffset = messageOffset;
}
执行deleteSegments一般是因为segment过期而触发删除操作,并且都是从时间最久的segment开始删除,所以LogStartOffset才会递增
// Deletes the given segments and then tries to advance the log start offset to the base offset
// of the first remaining segment.
//
// deletable: the segments selected for deletion
// reason: why they are deleted; forwarded to the removal call and included in the warning
// Returns the size of `deletable` (0 when nothing was done), even if one of them was kept
// because it was recreated by the roll below.
private def deleteSegments(deletable: Iterable[LogSegment], reason: SegmentDeletionReason): Int = {
maybeHandleIOException(s"Error while deleting segments for $topicPartition in dir ${dir.getParent}") {
val numToDelete = deletable.size
if (numToDelete > 0) {
// we must always have at least one segment, so if we are going to delete all the segments, create a new one first
var segmentsToDelete = deletable
if (localLog.segments.numberOfSegments == numToDelete) {
val newSegment = roll()
// If the rolled segment has the same base offset as the last deletable one, that last
// entry is dropped from the list handed to the removal call.
if (deletable.last.baseOffset == newSegment.baseOffset) {
warn(s"Empty active segment at ${deletable.last.baseOffset} was deleted and recreated due to $reason")
segmentsToDelete = deletable.dropRight(1)
}
}
localLog.checkIfMemoryMappedBufferClosed()
// remove the segments for lookups
localLog.removeAndDeleteSegments(segmentsToDelete, asyncDelete = true, reason)
// Producer snapshots are deleted for the full `deletable` list, not just segmentsToDelete.
deleteProducerSnapshots(deletable, asyncDelete = true)
// Segments were removed, so the log start offset may now move up to the first remaining segment's base offset.
maybeIncrementLogStartOffset(localLog.segments.firstSegmentBaseOffset.get, LogStartOffsetIncrementReason.SegmentDeletion)
}
numToDelete
}
}
在删除segment时,会调用maybeIncrementLogStartOffset,
该方法会尝试修改LogStartOffset
/**
 * Increment the log start offset if the provided offset is larger.
 *
 * If the log start offset changed, this method also updates a few key offsets such that
 * `logStartOffset <= logStableOffset <= highWatermark`. The leader epoch cache is also
 * updated so that all offsets referenced in that component point to valid offsets in this log.
 *
 * @param newLogStartOffset the proposed log start offset
 * @param reason why the offset is being advanced; only used in the info log line
 * @throws OffsetOutOfRangeException if the log start offset is greater than the high watermark
 * @return true if the log start offset was updated; otherwise false
 */
def maybeIncrementLogStartOffset(newLogStartOffset: Long, reason: LogStartOffsetIncrementReason): Boolean = {
  maybeHandleIOException(s"Exception while increasing log start offset for $topicPartition to $newLogStartOffset in dir ${dir.getParent}") {
    lock.synchronized {
      if (newLogStartOffset > highWatermark)
        throw new OffsetOutOfRangeException(s"Cannot increment the log start offset to $newLogStartOffset of partition $topicPartition " +
          s"since it is larger than the high watermark $highWatermark")

      localLog.checkIfMemoryMappedBufferClosed()

      // Only ever move the log start offset forward.
      val advances = newLogStartOffset > logStartOffset
      if (advances) {
        updateLogStartOffset(newLogStartOffset)
        _localLogStartOffset = newLogStartOffset
        info(s"Incremented log start offset to $newLogStartOffset due to $reason")
        leaderEpochCache.foreach(cache => cache.truncateFromStart(logStartOffset))
        producerStateManager.onLogStartOffsetIncremented(newLogStartOffset)
        maybeIncrementFirstUnstableOffset()
      }
      advances
    }
  }
}
/**
 * Sets the log start offset and drags the high watermark and the local log's recovery point
 * up to it whenever either of them is below the new value.
 *
 * @param offset the new log start offset
 */
private def updateLogStartOffset(offset: Long): Unit = {
  logStartOffset = offset

  // The high watermark must not stay below the log start offset.
  if (offset > highWatermark)
    updateHighWatermark(offset)

  // Likewise for the local log's recovery point.
  if (offset > localLog.recoveryPoint)
    localLog.updateRecoveryPoint(offset)
}
但是HW、logStartOffset、logEndOffset这些值都存在于UnifiedLog类中
// Abridged view of UnifiedLog showing where the three offsets discussed here live:
// logStartOffset is a volatile constructor field, highWatermark is read from
// highWatermarkMetadata, and logEndOffset is delegated to the wrapped LocalLog.
class UnifiedLog(@volatile var logStartOffset: Long,
private val localLog: LocalLog,
brokerTopicStats: BrokerTopicStats,
val producerIdExpirationCheckIntervalMs: Int,
@volatile var leaderEpochCache: Option[LeaderEpochFileCache],
val producerStateManager: ProducerStateManager,
@volatile private var _topicId: Option[Uuid],
val keepPartitionMetadataFile: Boolean,
val remoteStorageSystemEnable: Boolean = false,
remoteLogManager: Option[RemoteLogManager] = None,
@volatile private var logOffsetsListener: LogOffsetsListener = LogOffsetsListener.NO_OP_OFFSETS_LISTENER) extends Logging {
// Local log start offset, initialised from the logStartOffset constructor argument
@volatile private[kafka] var _localLogStartOffset: Long = logStartOffset
// highWatermark: the message offset of highWatermarkMetadata
def highWatermark: Long = highWatermarkMetadata.messageOffset
// logEndOffset: delegated to the local log
def logEndOffset: Long = localLog.logEndOffset
}
基本上获得lowWatermark的地方都是用lowWatermarkIfLeader来获得LW,而该方法内部是通过遍历所有副本、取其中最小的logStartOffset得到的
/**
 * The low watermark offset value, calculated only if the local replica is the partition leader.
 * It is only used by the leader broker to decide when a DeleteRecordsRequest is satisfied.
 * Its value is the minimum logStartOffset over the leader's log, every remote replica whose
 * broker is alive, and the future log when one exists.
 *
 * @throws NotLeaderOrFollowerException if the local replica is not the leader
 * @return the low watermark
 */
def lowWatermarkIfLeader: Long = {
  if (!isLeader)
    throw new NotLeaderOrFollowerException(s"Leader not local for partition $topicPartition on broker $localBrokerId")

  // lowWatermarkIfLeader may be called many times while a DeleteRecordsRequest is outstanding,
  // so the minimum is tracked in a plain variable instead of building intermediate collections.
  var minStartOffset = localLogOrException.logStartOffset
  remoteReplicas.foreach { replica =>
    val replicaStartOffset = replica.stateSnapshot.logStartOffset
    // Replicas on brokers that are not alive do not hold the low watermark back.
    if (metadataCache.hasAliveBroker(replica.brokerId) && replicaStartOffset < minStartOffset)
      minStartOffset = replicaStartOffset
  }

  // A future log, when present, takes part in the minimum as well.
  futureLog.fold(minStartOffset)(future => Math.min(minStartOffset, future.logStartOffset))
}