KafkaServer.startup() brings up each module in turn:
quotaManagers = QuotaFactory.instantiate(config, metrics, time)
kafkaScheduler.startup()
logManager = createLogManager(zkUtils.zkClient, brokerState)
socketServer = new SocketServer(config, metrics, time)
replicaManager = new ReplicaManager(config, metrics, time, zkUtils, kafkaScheduler, logManager,
isShuttingDown, quotaManagers.follower)
kafkaController = new KafkaController(config, zkUtils, brokerState, time, metrics, threadNamePrefix)
groupCoordinator = GroupCoordinator(config, zkUtils, replicaManager, Time.SYSTEM)
apis = new KafkaApis(socketServer.requestChannel, replicaManager, adminManager, groupCoordinator,
kafkaController, zkUtils, config.brokerId, config, metadataCache, metrics, authorizer, quotaManagers,
clusterId, time)
requestHandlerPool = new KafkaRequestHandlerPool(config.brokerId, socketServer.requestChannel, apis, time,
config.numIoThreads)
dynamicConfigManager = new DynamicConfigManager(zkUtils, dynamicConfigHandlers)
kafkaHealthcheck = new KafkaHealthcheck(config.brokerId, listeners, zkUtils, config.rack,
config.interBrokerProtocolVersion)
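The order matters: each component depends only on components started before it (ReplicaManager needs the scheduler and the LogManager, KafkaApis needs almost everything). A minimal sketch of this dependency-ordered startup with reverse-order shutdown, using invented names rather than the real classes:

// Sketch only: start components in dependency order, shut down in reverse.
trait Startable { def startup(): Unit; def shutdown(): Unit }

class Server(components: Seq[Startable]) {
  private var started = List.empty[Startable]

  def startup(): Unit =
    components.foreach { c => c.startup(); started = c :: started }

  // shut down in reverse start order so dependents stop before their dependencies
  def shutdown(): Unit = {
    started.foreach(_.shutdown())
    started = Nil
  }
}

Once the server is running, KafkaApis.handle dispatches each incoming request on its ApiKeys: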
case ApiKeys.PRODUCE => handleProducerRequest(request)
case ApiKeys.FETCH => handleFetchRequest(request)
case ApiKeys.LIST_OFFSETS => handleOffsetRequest(request)
case ApiKeys.METADATA => handleTopicMetadataRequest(request)
case ApiKeys.LEADER_AND_ISR => handleLeaderAndIsrRequest(request)
case ApiKeys.STOP_REPLICA => handleStopReplicaRequest(request)
case ApiKeys.UPDATE_METADATA_KEY => handleUpdateMetadataRequest(request)
case ApiKeys.CONTROLLED_SHUTDOWN_KEY => handleControlledShutdownRequest(request)
case ApiKeys.OFFSET_COMMIT => handleOffsetCommitRequest(request)
case ApiKeys.OFFSET_FETCH => handleOffsetFetchRequest(request)
case ApiKeys.GROUP_COORDINATOR => handleGroupCoordinatorRequest(request)
case ApiKeys.JOIN_GROUP => handleJoinGroupRequest(request)
case ApiKeys.HEARTBEAT => handleHeartbeatRequest(request)
case ApiKeys.LEAVE_GROUP => handleLeaveGroupRequest(request)
case ApiKeys.SYNC_GROUP => handleSyncGroupRequest(request)
case ApiKeys.DESCRIBE_GROUPS => handleDescribeGroupRequest(request)
case ApiKeys.LIST_GROUPS => handleListGroupsRequest(request)
case ApiKeys.SASL_HANDSHAKE => handleSaslHandshakeRequest(request)
case ApiKeys.API_VERSIONS => handleApiVersionsRequest(request)
case ApiKeys.CREATE_TOPICS => handleCreateTopicsRequest(request)
case ApiKeys.DELETE_TOPICS => handleDeleteTopicsRequest(request)
case requestId => throw new KafkaException("Unknown api code " + requestId)
Before digging into KafkaApis, we first need to look at a few classes: FileMessageSet, LogSegment, Log, Replica, Partition and ReplicaManager.
class FileMessageSet private[kafka](@volatile var file: File,
private[log] val channel: FileChannel,
private[log] val start: Int,
private[log] val end: Int,
isSlice: Boolean) extends MessageSet {
...
}
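FileMessageSet is a view over the byte range [start, end) of a segment file on disk; when isSlice is true it exposes only part of the file. As a rough illustration of that bounded-range idea (not the real class), reading such a range through a plain FileChannel looks like:

import java.io.RandomAccessFile
import java.nio.ByteBuffer
import java.nio.channels.FileChannel

// Sketch: read the byte range [start, end) of a file, the region a "slice" would expose.
def readSlice(path: String, start: Int, end: Int): ByteBuffer = {
  val channel: FileChannel = new RandomAccessFile(path, "r").getChannel
  try {
    val buffer = ByteBuffer.allocate(end - start)
    var position = start
    while (buffer.hasRemaining && position < end) {
      val read = channel.read(buffer, position)   // positional read, does not move the channel position
      if (read < 0) throw new java.io.EOFException(s"hit EOF at $position")
      position += read
    }
    buffer.flip()
    buffer
  } finally channel.close()
}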
class Log
class Replica:
class Partition:
class ReplicaManager:
@volatile var controllerEpoch: Int = KafkaController.InitialControllerEpoch - 1
private val localBrokerId = config.brokerId
private val allPartitions = new Pool[(String, Int), Partition](valueFactory = Some { case (t, p) =>
new Partition(t, p, time, this)
})
replicaManager.appendMessages(
produceRequest.timeout.toLong,
produceRequest.acks,
internalTopicsAllowed,
authorizedMessagesPerPartition,
sendResponseCallback)
messagesPerPartition: Map[TopicPartition, MessageSet]
val localProduceResults = appendToLocalLog(internalTopicsAllowed, messagesPerPartition, requiredAcks)
....
case Some(partition) =>
partition.appendMessagesToLeader(messages.asInstanceOf[ByteBufferMessageSet], requiredAcks)
val info = log.append(messages, assignOffsets = true)
....
// maybe roll the log if this segment is full
val segment = maybeRoll(messagesSize = validMessages.sizeInBytes,
maxTimestampInMessages = appendInfo.maxTimestamp)
// now append to the log
segment.append(firstOffset = appendInfo.firstOffset, largestTimestamp = appendInfo.maxTimestamp,
offsetOfLargestTimestamp = appendInfo.offsetOfMaxTimestamp, messages = validMessages)
...
Here the bytes are written to the segment file via NIO, in FileMessageSet.append:
def append(messages: ByteBufferMessageSet) {
val written = messages.writeFullyTo(channel)
_size.getAndAdd(written)
}
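writeFullyTo is not shown here; the important detail is that a single FileChannel.write call may write only part of the buffer, so the write must loop until the buffer is drained. A minimal sketch of such a loop over java.nio (the method name is only illustrative):

import java.nio.ByteBuffer
import java.nio.channels.GatheringByteChannel

// Sketch: keep calling write() until the buffer is drained, returning the total bytes written.
def writeFullyTo(buffer: ByteBuffer, channel: GatheringByteChannel): Int = {
  buffer.mark()                       // remember the starting position
  var written = 0
  while (buffer.hasRemaining)
    written += channel.write(buffer)  // a single write may move fewer bytes than remaining
  buffer.reset()                      // restore the position so the buffer can be reused
  written
}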
Summary of the classes on the write path:
MessageSet
Segment
Log
ByteBufferMessageSet
class FetchRequest:
private final int replicaId;
private final int maxWait;
private final int minBytes;
private final int maxBytes;
private final LinkedHashMap<TopicPartition, PartitionData> fetchData;
val (existingAndAuthorizedForDescribeTopics, nonExistingOrUnauthorizedForDescribeTopics) = fetchRequest.fetchData.asScala.toSeq.partition {
case (tp, _) => authorize(request.session, Describe, new Resource(auth.Topic, tp.topic)) && metadataCache.contains(tp.topic)
}
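partition splits the requested topic-partitions in a single pass into those the session may Describe (and that exist in the metadata cache) and everything else. The same idiom on toy data (names invented):

// Sketch: split requested topic-partitions into authorized/known topics and the rest in one pass.
val allowedTopics = Set("orders", "payments")
val requested = Seq("orders" -> 0, "payments" -> 1, "secrets" -> 0)

val (authorized, rejected) = requested.partition { case (topic, _) => allowedTopics(topic) }
// authorized == Seq(("orders", 0), ("payments", 1)); rejected == Seq(("secrets", 0))

The per-partition results eventually come back to the client wrapped in a FetchResponse: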
public FetchResponse(int version, LinkedHashMap<TopicPartition, PartitionData> responseData, int throttleTime) {
super(new Struct(ProtoUtils.responseSchema(ApiKeys.FETCH.id, version)));
writeStruct(struct, version, responseData, throttleTime);
this.responseData = responseData;
this.throttleTime = throttleTime;
}
public static final class PartitionData {
public final short errorCode;
public final long highWatermark;
public final Records records;
public PartitionData(short errorCode, long highWatermark, Records records) {
this.errorCode = errorCode;
this.highWatermark = highWatermark;
this.records = records;
}
}
public interface Records extends Iterable<LogEntry> {
int SIZE_LENGTH = 4;
int OFFSET_LENGTH = 8;
int LOG_OVERHEAD = SIZE_LENGTH + OFFSET_LENGTH;
}
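Every log entry is framed as an 8-byte offset plus a 4-byte size prefix before the record bytes, so LOG_OVERHEAD is 12 bytes per entry. A small sanity check of that arithmetic:

// Sketch: on-disk size of a batch of entries given the framing constants above.
val SizeLength = 4
val OffsetLength = 8
val LogOverhead = SizeLength + OffsetLength   // 12 bytes of framing per entry

def batchSizeInBytes(recordSizes: Seq[Int]): Int =
  recordSizes.map(LogOverhead + _).sum

// e.g. three 100-byte records occupy 3 * (12 + 100) = 336 bytes
assert(batchSizeInBytes(Seq(100, 100, 100)) == 336)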
....
replicaManager.fetchMessages(
fetchRequest.maxWait.toLong,
fetchRequest.replicaId,
fetchRequest.minBytes,
fetchRequest.maxBytes,
versionId <= 2,
authorizedRequestInfo,
replicationQuota(fetchRequest),
sendResponseCallback)
val logReadResults = readFromLocalLog(
replicaId = replicaId,
fetchOnlyFromLeader = fetchOnlyFromLeader,
readOnlyCommitted = fetchOnlyCommitted,
fetchMaxBytes = fetchMaxBytes,
hardMaxBytesLimit = hardMaxBytesLimit,
readPartitionInfo = fetchInfos,
quota = quota)
...
// if the fetch comes from the follower,
// update its corresponding log end offset
if(Request.isValidBrokerId(replicaId))
updateFollowerLogReadResults(replicaId, logReadResults)
...
def fetchResponseCallback(delayTimeMs: Int) {
trace(s"Sending fetch response to client $clientId of " +
s"${convertedPartitionData.map { case (_, v) => v.records.sizeInBytes }.sum} bytes")
val fetchResponse = if (delayTimeMs > 0) new FetchResponse(versionId, fetchedPartitionData, delayTimeMs) else response
requestChannel.sendResponse(new RequestChannel.Response(request, fetchResponse))
}
public class ListOffsetRequest extends AbstractRequest {
...
private final Map<TopicPartition, PartitionData> offsetData;
private final Map<TopicPartition, Long> partitionTimestamps;
private final Set<TopicPartition> duplicatePartitions;
...
}
val responseBody = new MetadataResponse(
brokers.map(_.getNode(request.securityProtocol)).asJava,
clusterId,
metadataCache.getControllerId.getOrElse(MetadataResponse.NO_CONTROLLER_ID),
completeTopicMetadata.asJava,
requestVersion
)
public MetadataResponse(List<Node> brokers, String clusterId, int controllerId, List<TopicMetadata> topicMetadata, int version) {
super(new Struct(ProtoUtils.responseSchema(ApiKeys.METADATA.id, version)));
this.brokers = brokers;
this.controller = getControllerNode(controllerId, brokers);
this.topicMetadata = topicMetadata;
this.clusterId = clusterId;
...
}
class LeaderAndIsrRequest{
...
private final int controllerId;
private final int controllerEpoch;
private final Map<TopicPartition, PartitionState> partitionStates;
private final Set<Node> liveLeaders;
...
}
public class PartitionState {
public final int controllerEpoch;
public final int leader;
public final int leaderEpoch;
public final List<Integer> isr;
public final int zkVersion;
public final Set<Integer> replicas;
...
}
// If the leader epoch is valid record the epoch of the controller that made the leadership decision.
// This is useful while updating the isr to maintain the decision maker controller's epoch in the zookeeper path
if (partitionLeaderEpoch < stateInfo.leaderEpoch) {
if(stateInfo.replicas.contains(config.brokerId))
partitionState.put(partition, stateInfo)
...}
val partitionsTobeLeader = partitionState.filter { case (_, stateInfo) =>
stateInfo.leader == config.brokerId
}
val partitionsToBeFollower = partitionState -- partitionsTobeLeader.keys
val partitionsBecomeLeader = if (partitionsTobeLeader.nonEmpty)
makeLeaders(controllerId, controllerEpoch, partitionsTobeLeader, correlationId, responseMap)
/*
* Make the current broker to become leader for a given set of partitions by:
*
* 1. Stop fetchers for these partitions
* 2. Update the partition metadata in cache
* 3. Add these partitions to the leader partitions set
*
* If an unexpected error is thrown in this function, it will be propagated to KafkaApis where
* the error message will be set on each partition since we do not know which partition caused it. Otherwise,
* return the set of partitions that are made leader due to this method
*
* TODO: the above may need to be fixed later
*/
private def makeLeaders(controllerId: Int,
epoch: Int,
partitionState: Map[Partition, PartitionState],
correlationId: Int,
responseMap: mutable.Map[TopicPartition, Short]): Set[Partition] = {
private def authorize(session: Session, operation: Operation, resource: Resource): Boolean =
authorizer.forall(_.authorize(session, operation, resource))
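Since authorizer is an Option[Authorizer], forall means: with no authorizer configured every request is allowed, otherwise the configured authorizer decides. A stripped-down illustration of that Option.forall behaviour (SimpleAuthorizer is made up):

// Sketch: Option.forall is true for None, so "no authorizer configured" means "allow everything".
trait SimpleAuthorizer { def authorize(user: String, operation: String, resource: String): Boolean }

def authorize(authorizer: Option[SimpleAuthorizer])(user: String, op: String, res: String): Boolean =
  authorizer.forall(_.authorize(user, op, res))

val denyAll = Some(new SimpleAuthorizer {
  def authorize(user: String, operation: String, resource: String) = false
})

assert(authorize(None)("alice", "Read", "topic-a"))      // no authorizer -> allowed
assert(!authorize(denyAll)("alice", "Read", "topic-a"))  // authorizer says no -> denied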
case class Session(principal: KafkaPrincipal, clientAddress: InetAddress) {
val sanitizedUser = QuotaId.sanitize(principal.getName)
}
public class KafkaPrincipal implements Principal {
public static final String SEPARATOR = ":";
public static final String USER_TYPE = "User";
public final static KafkaPrincipal ANONYMOUS = new KafkaPrincipal(KafkaPrincipal.USER_TYPE, "ANONYMOUS");
private String principalType;
private String name;
...
}
sealed trait Operation extends BaseEnum
case object Read extends Operation { val name = "Read" }
case object Write extends Operation { val name = "Write" }
case object Create extends Operation { val name = "Create" }
case object Delete extends Operation { val name = "Delete" }
case object Alter extends Operation { val name = "Alter" }
case object Describe extends Operation { val name = "Describe" }
case object ClusterAction extends Operation { val name = "ClusterAction" }
case object All extends Operation { val name = "All" }
val (result, error) = replicaManager.stopReplicas(stopReplicaRequest)
// First stop fetchers for all partitions, then stop the corresponding replicas
replicaFetcherManager.removeFetcherForPartitions(partitions)
// then stop each replica, optionally deleting the partition's data
def stopReplica(topicPartition: TopicPartition, deletePartition: Boolean): Short = {
class KafkaController(val config : KafkaConfig, zkUtils: ZkUtils, val brokerState: BrokerState, time: Time, metrics: Metrics, threadNamePrefix: Option[String] = None) extends Logging with KafkaMetricsGroup {
this.logIdent = "[Controller " + config.brokerId + "]: "
private var isRunning = true
private val stateChangeLogger = KafkaController.stateChangeLogger
val controllerContext = new ControllerContext(zkUtils)
val partitionStateMachine = new PartitionStateMachine(this)
val replicaStateMachine = new ReplicaStateMachine(this)
private val controllerElector = new ZookeeperLeaderElector(controllerContext, ZkUtils.ControllerPath, onControllerFailover,
onControllerResignation, config.brokerId, time)
// have a separate scheduler for the controller to be able to start and stop independently of the
// kafka server
private val autoRebalanceScheduler = new KafkaScheduler(1)
var deleteTopicManager: TopicDeletionManager = null
val offlinePartitionSelector = new OfflinePartitionLeaderSelector(controllerContext, config)
private val reassignedPartitionLeaderSelector = new ReassignedPartitionLeaderSelector(controllerContext)
private val preferredReplicaPartitionLeaderSelector = new PreferredReplicaPartitionLeaderSelector(controllerContext)
private val controlledShutdownPartitionLeaderSelector = new ControlledShutdownLeaderSelector(controllerContext)
private val brokerRequestBatch = new ControllerBrokerRequestBatch(this)
private val partitionReassignedListener = new PartitionsReassignedListener(this)
private val preferredReplicaElectionListener = new PreferredReplicaElectionListener(this)
private val isrChangeNotificationListener = new IsrChangeNotificationListener(this)
...
}
def handleControlledShutdownRequest(request: RequestChannel.Request) {
// ensureTopicExists is only for client facing requests
// We can't have the ensureTopicExists check here since the controller sends it as an advisory to all brokers so they
// stop serving data to clients for the topic being deleted
val controlledShutdownRequest = request.requestObj.asInstanceOf[ControlledShutdownRequest]
authorizeClusterAction(request)
val partitionsRemaining = controller.shutdownBroker(controlledShutdownRequest.brokerId)
val controlledShutdownResponse = new ControlledShutdownResponse(controlledShutdownRequest.correlationId,
Errors.NONE.code, partitionsRemaining)
requestChannel.sendResponse(new Response(request, new RequestOrResponseSend(request.connectionId, controlledShutdownResponse)))
}
// reject the request if not authorized to the group
if (!authorize(request.session, Read, new Resource(Group, offsetCommitRequest.groupId))) {
If authorized, the commit is handed off to the GroupCoordinator:
coordinator.handleCommitOffsets(
offsetCommitRequest.groupId,
offsetCommitRequest.memberId,
offsetCommitRequest.generationId,
partitionData,
sendResponseCallback)
/**
* GroupCoordinator handles general group membership and offset management.
*
* Each Kafka server instantiates a coordinator which is responsible for a set of
* groups. Groups are assigned to coordinators based on their group names.
*/
class GroupCoordinator(val brokerId: Int,
val groupConfig: GroupConfig,
val offsetConfig: OffsetConfig,
val groupManager: GroupMetadataManager,
val heartbeatPurgatory: DelayedOperationPurgatory[DelayedHeartbeat],
val joinPurgatory: DelayedOperationPurgatory[DelayedJoin],
time: Time) extends Logging {
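"Groups are assigned to coordinators based on their group names": the group id is hashed onto a partition of the internal offsets topic, and whichever broker leads that partition acts as the group's coordinator. A simplified sketch of that mapping (50 is the default partition count of the offsets topic):

// Sketch: map a group id to an offsets-topic partition; the broker leading that
// partition acts as the group's coordinator.
def partitionFor(groupId: String, offsetsTopicPartitionCount: Int = 50): Int =
  (groupId.hashCode & 0x7fffffff) % offsetsTopicPartitionCount

// Every broker computes the same partition for the same group id,
// so clients can be routed to a single coordinator deterministically.
val p = partitionFor("my-consumer-group")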
/**
* Group contains the following metadata:
*
* Membership metadata:
* 1. Members registered in this group
* 2. Current protocol assigned to the group (e.g. partition assignment strategy for consumers)
* 3. Protocol metadata associated with group members
*
* State metadata:
* 1. group state
* 2. generation id
* 3. leader id
*/
@nonthreadsafe
private[coordinator] class GroupMetadata(val groupId: String, initialState: GroupState = Empty) {
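The group state referred to below (Empty, AwaitingSync, Stable, Dead, plus PreparingRebalance) is a small state machine. A heavily simplified sketch of it, with an illustrative rather than authoritative transition table:

// Sketch: the group state machine, heavily simplified; the transitions are illustrative.
sealed trait GroupState
case object Empty extends GroupState
case object PreparingRebalance extends GroupState
case object AwaitingSync extends GroupState
case object Stable extends GroupState
case object Dead extends GroupState

// Illustrative subset of allowed transitions, not the real validPreviousStates table.
def canTransition(from: GroupState, to: GroupState): Boolean = (from, to) match {
  case (_, Dead)                          => true   // any state can be torn down
  case (Empty, PreparingRebalance)        => true   // first member joins
  case (PreparingRebalance, AwaitingSync) => true   // join phase completes, waiting for the leader's assignment
  case (AwaitingSync, Stable)             => true   // leader's SyncGroup distributes assignments
  case (Stable, PreparingRebalance)       => true   // membership change triggers a new rebalance
  case _                                  => false
}

Back in handleCommitOffsets, once the group is resolved the commit goes to doCommitOffsets: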
case Some(group) =>
doCommitOffsets(group, memberId, generationId, offsetMetadata, responseCallback)
def doCommitOffsets(group: GroupMetadata,
memberId: String,
generationId: Int,
offsetMetadata: immutable.Map[TopicPartition, OffsetAndMetadata],
responseCallback: immutable.Map[TopicPartition, Short] => Unit) {
var delayedOffsetStore: Option[DelayedStore] = None
group synchronized {
if (group.is(Dead)) {
responseCallback(offsetMetadata.mapValues(_ => Errors.UNKNOWN_MEMBER_ID.code))
} else if (generationId < 0 && group.is(Empty)) {
// the group is only using Kafka to store offsets
delayedOffsetStore = groupManager.prepareStoreOffsets(group, memberId, generationId,
offsetMetadata, responseCallback)
} else if (group.is(AwaitingSync)) {
responseCallback(offsetMetadata.mapValues(_ => Errors.REBALANCE_IN_PROGRESS.code))
} else if (!group.has(memberId)) {
responseCallback(offsetMetadata.mapValues(_ => Errors.UNKNOWN_MEMBER_ID.code))
} else if (generationId != group.generationId) {
responseCallback(offsetMetadata.mapValues(_ => Errors.ILLEGAL_GENERATION.code))
} else {
val member = group.get(memberId)
completeAndScheduleNextHeartbeatExpiration(group, member)
delayedOffsetStore = groupManager.prepareStoreOffsets(group, memberId, generationId,
offsetMetadata, responseCallback)
}
}
// store the offsets without holding the group lock
delayedOffsetStore.foreach(groupManager.store)
}
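Note the shape of doCommitOffsets: all validation and the construction of the DelayedStore happen inside group synchronized, but the actual log append (groupManager.store) runs only after the lock is released, so a slow write never blocks other operations on the same group. The same pattern in miniature (names invented):

// Sketch: decide what to write while holding the lock, do the expensive write outside it.
class OffsetCache {
  private val lock = new Object
  private var pending: Map[String, Long] = Map.empty

  def commit(offsets: Map[String, Long], store: Map[String, Long] => Unit): Unit = {
    val toStore = lock.synchronized {
      // validation / bookkeeping under the lock
      pending ++= offsets
      offsets
    }
    store(toStore)   // I/O happens with no lock held
  }
}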
/**
* Store offsets by appending it to the replicated log and then inserting to cache
*/
def prepareStoreOffsets(group: GroupMetadata,
consumerId: String,
generationId: Int,
offsetMetadata: immutable.Map[TopicPartition, OffsetAndMetadata],
responseCallback: immutable.Map[TopicPartition, Short] => Unit): Option[DelayedStore] = {
// first filter out partitions with offset metadata size exceeding limit
...
// construct the message set to append
val magicValueAndTimestampOpt = getMessageFormatVersionAndTimestamp(partitionFor(group.groupId))
magicValueAndTimestampOpt match {
case Some((magicValue, timestamp)) =>
val messages = filteredOffsetMetadata.map { case (topicAndPartition, offsetAndMetadata) =>
new Message(
key = GroupMetadataManager.offsetCommitKey(group.groupId, topicAndPartition.topic, topicAndPartition.partition),
bytes = GroupMetadataManager.offsetCommitValue(offsetAndMetadata),
timestamp = timestamp,
magicValue = magicValue
)
}.toSeq
val offsetTopicPartition = new TopicPartition(Topic.GroupMetadataTopicName, partitionFor(group.groupId))
val offsetsAndMetadataMessageSet = Map(offsetTopicPartition ->
new ByteBufferMessageSet(config.offsetsTopicCompressionCodec, messages:_*))
// set the callback function to insert offsets into cache after log append completed
def putCacheCallback(responseStatus: Map[TopicPartition, PartitionResponse]) {
// the append response should only contain the topics partition
if (responseStatus.size != 1 || ! responseStatus.contains(offsetTopicPartition))
throw new IllegalStateException("Append status %s should only have one partition %s"
.format(responseStatus, offsetTopicPartition))
// construct the commit response status and insert
// the offset and metadata to cache if the append status has no error
val status = responseStatus(offsetTopicPartition)
val statusError = Errors.forCode(status.errorCode)
val responseCode =
group synchronized {
if (statusError == Errors.NONE) {
if (!group.is(Dead)) {
filteredOffsetMetadata.foreach { case (topicAndPartition, offsetAndMetadata) =>
group.completePendingOffsetWrite(topicAndPartition, offsetAndMetadata)
}
}
Errors.NONE.code
} else {
if (!group.is(Dead)) {
filteredOffsetMetadata.foreach { case (topicAndPartition, offsetAndMetadata) =>
group.failPendingOffsetWrite(topicAndPartition, offsetAndMetadata)
}
}
debug(s"Offset commit $filteredOffsetMetadata from group ${group.groupId}, consumer $consumerId " +
s"with generation $generationId failed when appending to log due to ${statusError.exceptionName}")
// transform the log append error code to the corresponding the commit status error code
val responseError = statusError match {
case Errors.UNKNOWN_TOPIC_OR_PARTITION
| Errors.NOT_ENOUGH_REPLICAS
| Errors.NOT_ENOUGH_REPLICAS_AFTER_APPEND =>
Errors.GROUP_COORDINATOR_NOT_AVAILABLE
case Errors.NOT_LEADER_FOR_PARTITION =>
Errors.NOT_COORDINATOR_FOR_GROUP
case Errors.MESSAGE_TOO_LARGE
| Errors.RECORD_LIST_TOO_LARGE
| Errors.INVALID_FETCH_SIZE =>
Errors.INVALID_COMMIT_OFFSET_SIZE
case other => other
}
responseError.code
}
}
// compute the final error codes for the commit response
val commitStatus = offsetMetadata.map { case (topicAndPartition, offsetAndMetadata) =>
if (validateOffsetMetadataLength(offsetAndMetadata.metadata))
(topicAndPartition, responseCode)
else
(topicAndPartition, Errors.OFFSET_METADATA_TOO_LARGE.code)
}
// finally trigger the callback logic passed from the API layer
responseCallback(commitStatus)
}
group synchronized {
group.prepareOffsetCommit(offsetMetadata)
}
Some(DelayedStore(offsetsAndMetadataMessageSet, putCacheCallback))
...
}
} else {
// version 1 reads offsets from Kafka (version 0 read them from ZooKeeper):
val offsets = coordinator.handleFetchOffsets(offsetFetchRequest.groupId, authorizedTopicPartitions).toMap
// Note that we do not need to filter the partitions in the
// metadata cache as the topic partitions will be filtered
// in coordinator's offset manager through the offset cache
new OffsetFetchResponse((offsets ++ unauthorizedStatus).asJava)
}
groupManager.getOffsets(groupId, partitions)
trace("Getting offsets %s for group %s.".format(topicPartitions, groupId))
val group = groupMetadataCache.get(groupId)
if (group == null) {
topicPartitions.map { topicPartition =>
(topicPartition, new OffsetFetchResponse.PartitionData(OffsetFetchResponse.INVALID_OFFSET, "", Errors.NONE.code))
}.toMap
} else {
group synchronized {
if (group.is(Dead)) {
topicPartitions.map { topicPartition =>
(topicPartition, new OffsetFetchResponse.PartitionData(OffsetFetchResponse.INVALID_OFFSET, "", Errors.NONE.code))
}.toMap
} else {
if (topicPartitions.isEmpty) {
// Return offsets for all partitions owned by this consumer group. (this only applies to consumers that commit offsets to Kafka.)
group.allOffsets.map { case (topicPartition, offsetAndMetadata) =>
(topicPartition, new OffsetFetchResponse.PartitionData(offsetAndMetadata.offset, offsetAndMetadata.metadata, Errors.NONE.code))
}
} else {
topicPartitions.map { topicPartition =>
group.offset(topicPartition) match {
case None => (topicPartition, new OffsetFetchResponse.PartitionData(OffsetFetchResponse.INVALID_OFFSET, "", Errors.NONE.code))
case Some(offsetAndMetadata) =>
(topicPartition, new OffsetFetchResponse.PartitionData(offsetAndMetadata.offset, offsetAndMetadata.metadata, Errors.NONE.code))
}
}.toMap
}
}
}
public class CreateTopicsRequest extends AbstractRequest {
private static final Schema CURRENT_SCHEMA = ProtoUtils.currentRequestSchema(ApiKeys.CREATE_TOPICS.id);
private static final String REQUESTS_KEY_NAME = "create_topic_requests";
private static final String TIMEOUT_KEY_NAME = "timeout";
private static final String TOPIC_KEY_NAME = "topic";
private static final String NUM_PARTITIONS_KEY_NAME = "num_partitions";
private static final String REPLICATION_FACTOR_KEY_NAME = "replication_factor";
private static final String REPLICA_ASSIGNMENT_KEY_NAME = "replica_assignment";
private static final String REPLICA_ASSIGNMENT_PARTITION_ID_KEY_NAME = "partition_id";
private static final String REPLICA_ASSIGNMENT_REPLICAS_KEY_NAME = "replicas";
private static final String CONFIG_KEY_KEY_NAME = "config_key";
private static final String CONFIG_VALUE_KEY_NAME = "config_value";
private static final String CONFIGS_KEY_NAME = "configs";
public static final class TopicDetails {
public final int numPartitions;
public final short replicationFactor;
public final Map<Integer, List<Integer>> replicasAssignments;
public final Map<String, String> configs;
...
}
adminManager.createTopics(
createTopicsRequest.timeout.toInt,
validTopics,
sendResponseWithDuplicatesCallback
)
/**
* Create topics and wait until the topics have been completely created.
* The callback function will be triggered either when timeout, error or the topics are created.
*/
def createTopics(timeout: Int,
createInfo: Map[String, TopicDetails],
responseCallback: Map[String, Errors] => Unit) {
else {
AdminUtils.assignReplicasToBrokers(brokers, arguments.numPartitions, arguments.replicationFactor)
}
/**
* There are 3 goals of replica assignment:
*
* 1. Spread the replicas evenly among brokers.
* 2. For partitions assigned to a particular broker, their other replicas are spread over the other brokers.
* 3. If all brokers have rack information, assign the replicas for each partition to different racks if possible
*
* To achieve this goal for replica assignment without considering racks, we:
* 1. Assign the first replica of each partition by round-robin, starting from a random position in the broker list.
* 2. Assign the remaining replicas of each partition with an increasing shift.
*
* Here is an example of assigning
* broker-0 broker-1 broker-2 broker-3 broker-4
* p0 p1 p2 p3 p4 (1st replica)
* p5 p6 p7 p8 p9 (1st replica)
* p4 p0 p1 p2 p3 (2nd replica)
* p8 p9 p5 p6 p7 (2nd replica)
* p3 p4 p0 p1 p2 (3rd replica)
* p7 p8 p9 p5 p6 (3rd replica)
*
* To create rack aware assignment, this API will first create a rack alternated broker list. For example,
* from this brokerID -> rack mapping:
*
* 0 -> "rack1", 1 -> "rack3", 2 -> "rack3", 3 -> "rack2", 4 -> "rack2", 5 -> "rack1"
*
* The rack alternated list will be:
*
* 0, 3, 1, 5, 4, 2
*
* Then an easy round-robin assignment can be applied. Assume 6 partitions with replication factor of 3, the assignment
* will be:
*
* 0 -> 0,3,1
* 1 -> 3,1,5
* 2 -> 1,5,4
* 3 -> 5,4,2
* 4 -> 4,2,0
* 5 -> 2,0,3
*
* Once it has completed the first round-robin, if there are more partitions to assign, the algorithm will start
* shifting the followers. This is to ensure we will not always get the same set of sequences.
* In this case, if there is another partition to assign (partition #6), the assignment will be:
*
* 6 -> 0,4,2 (instead of repeating 0,3,1 as partition 0)
*
* The rack aware assignment always chooses the 1st replica of the partition using round robin on the rack alternated
* broker list. For rest of the replicas, it will be biased towards brokers on racks that do not have
* any replica assignment, until every rack has a replica. Then the assignment will go back to round-robin on
* the broker list.
*
* As the result, if the number of replicas is equal to or greater than the number of racks, it will ensure that
* each rack will get at least one replica. Otherwise, each rack will get at most one replica. In a perfect
* situation where the number of replicas is the same as the number of racks and each rack has the same number of
* brokers, it guarantees that the replica distribution is even across brokers and racks.
*
* @return a Map from partition id to replica ids
* @throws AdminOperationException If rack information is supplied but it is incomplete, or if it is not possible to
* assign each replica to a unique rack.
*
*/
def assignReplicasToBrokers(brokerMetadatas: Seq[BrokerMetadata],
nPartitions: Int,
replicationFactor: Int,
fixedStartIndex: Int = -1,
startPartitionId: Int = -1): Map[Int, Seq[Int]] = {
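A simplified, rack-unaware sketch of the round-robin-with-shift scheme described in the comment above (the real assignReplicasToBrokers also picks a random start index and handles racks):

import scala.collection.mutable

// Sketch: first replica round-robin over the broker list, remaining replicas placed with a
// shift that grows every time the partition index wraps around the broker list.
def assignRackUnaware(brokers: Seq[Int], nPartitions: Int, replicationFactor: Int,
                      startIndex: Int = 0): Map[Int, Seq[Int]] = {
  require(replicationFactor <= brokers.size, "replication factor cannot exceed broker count")
  val n = brokers.size
  val result = mutable.Map[Int, Seq[Int]]()
  var nextReplicaShift = startIndex
  for (p <- 0 until nPartitions) {
    if (p > 0 && p % n == 0) nextReplicaShift += 1        // shift followers on each wrap
    val first = (p + startIndex) % n
    val replicas = mutable.ArrayBuffer(brokers(first))
    for (j <- 0 until replicationFactor - 1) {
      val shift = 1 + (nextReplicaShift + j) % (n - 1)    // keep followers off the leader's broker
      replicas += brokers((first + shift) % n)
    }
    result(p) = replicas.toList
  }
  result.toMap
}

// With 5 brokers, 10 partitions, RF=3 and startIndex 0 this reproduces the example table above:
// partition 0 -> brokers 0,1,2; partition 5 -> brokers 0,2,3; ...

Topic deletion goes through the same AdminManager: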
adminManager.deleteTopics(
deleteTopicRequest.timeout.toInt,
authorizedTopics,
sendResponseCallback
)
/**
* Delete topics and wait until the topics have been completely deleted.
* The callback function will be triggered either when timeout, error or the topics are deleted.
*/
def deleteTopics(timeout: Int,
topics: Set[String],
responseCallback: Map[String, Errors] => Unit) {
// 1. map over topics calling the asynchronous delete
val metadata = topics.map { topic =>
try {
AdminUtils.deleteTopic(zkUtils, topic)
DeleteTopicMetadata(topic, Errors.NONE)
} catch {
case _: TopicAlreadyMarkedForDeletionException =>
// swallow the exception, and still track deletion allowing multiple calls to wait for deletion
DeleteTopicMetadata(topic, Errors.NONE)
case e: Throwable =>
error(s"Error processing delete topic request for topic $topic", e)
DeleteTopicMetadata(topic, Errors.forException(e))
}
}
// 2. if timeout <= 0 or no topics can proceed return immediately
if (timeout <= 0 || !metadata.exists(_.error == Errors.NONE)) {
val results = metadata.map { deleteTopicMetadata =>
// ignore topics that already have errors
if (deleteTopicMetadata.error == Errors.NONE) {
(deleteTopicMetadata.topic, Errors.REQUEST_TIMED_OUT)
} else {
(deleteTopicMetadata.topic, deleteTopicMetadata.error)
}
}.toMap
responseCallback(results)
} else {
// 3. else pass the topics and errors to the delayed operation and set the keys
val delayedDelete = new DelayedDeleteTopics(timeout, metadata.toSeq, this, responseCallback)
val delayedDeleteKeys = topics.map(new TopicKey(_)).toSeq
// try to complete the request immediately, otherwise put it into the purgatory
topicPurgatory.tryCompleteElseWatch(delayedDelete, delayedDeleteKeys)
}
}
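The purgatory usage at the end is a recurring broker pattern: try to complete the operation immediately, and only if that fails park it with a deadline that forces completion. A minimal sketch of that idea with a plain scheduler (this is not Kafka's DelayedOperationPurgatory):

import java.util.concurrent.{Executors, ScheduledExecutorService, TimeUnit}
import java.util.concurrent.atomic.AtomicBoolean

// Sketch of the "try now, otherwise park until the timeout" idea behind the purgatory.
// A real purgatory also re-tries the operation whenever a watched key changes.
class DelayedOp(timer: ScheduledExecutorService,
                timeoutMs: Long,
                checkComplete: () => Boolean,
                onComplete: Boolean => Unit) {
  private val done = new AtomicBoolean(false)

  def tryCompleteElseWatch(): Unit = {
    if (checkComplete()) {
      complete(satisfied = true)                        // completed immediately
    } else {
      timer.schedule(new Runnable {
        def run(): Unit = complete(satisfied = false)   // forced completion at the deadline
      }, timeoutMs, TimeUnit.MILLISECONDS)
    }
  }

  def tryComplete(): Unit =                             // call when a watched event happens
    if (checkComplete()) complete(satisfied = true)

  private def complete(satisfied: Boolean): Unit =
    if (done.compareAndSet(false, true)) onComplete(satisfied)
}

// usage: new DelayedOp(Executors.newSingleThreadScheduledExecutor(), 5000L, () => allDone, ok => respond(ok)).tryCompleteElseWatch()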