版本-7.2
注册主分片处理函数
- TransportReplicationAction.handlePrimaryRequest是处理的入口
/**
 * Creates the replication action and registers its transport request handlers.
 * <p>
 * The supplied collaborators are stored in fields, the primary and replica action names are
 * derived by appending {@code "[p]"} and {@code "[r]"} to {@code actionName}, and three
 * handlers are registered on {@code transportService}: {@code handleOperationRequest} for
 * {@code actionName}, {@code handlePrimaryRequest} for the primary action and
 * {@code handleReplicaRequest} for the replica action.
 *
 * @param settings                           passed to {@code transportOptions(settings)}
 * @param actionName                         base action name; also forwarded to the superclass constructor
 * @param transportService                   used to register the handlers and to obtain the task manager
 * @param requestReader                      reader for the base action's request; wrapped in a
 *                                           {@code ConcreteShardRequest} for the primary action
 * @param replicaRequestReader               reader wrapped in a {@code ConcreteReplicaRequest} for the replica action
 * @param executor                           executor name used when registering the primary and replica handlers
 * @param syncGlobalCheckpointAfterOperation stored unchanged in the field of the same name
 * @param forceExecutionOnPrimary            forwarded as the third argument of the primary handler registration
 */
protected TransportReplicationAction(Settings settings, String actionName, TransportService transportService,
ClusterService clusterService, IndicesService indicesService,
ThreadPool threadPool, ShardStateAction shardStateAction,
ActionFilters actionFilters,
IndexNameExpressionResolver indexNameExpressionResolver, Writeable.Reader requestReader,
Writeable.Reader replicaRequestReader, String executor,
boolean syncGlobalCheckpointAfterOperation, boolean forceExecutionOnPrimary) {
super(actionName, actionFilters, transportService.getTaskManager());
this.threadPool = threadPool;
this.transportService = transportService;
this.clusterService = clusterService;
this.indicesService = indicesService;
this.shardStateAction = shardStateAction;
this.indexNameExpressionResolver = indexNameExpressionResolver;
this.executor = executor;
// Transport-level names of the per-shard actions: "<action>[p]" for the primary, "<action>[r]" for replicas.
this.transportPrimaryAction = actionName + "[p]";
this.transportReplicaAction = actionName + "[r]";
// The base action is registered with the SAME executor name rather than the action's own executor.
transportService.registerRequestHandler(actionName, ThreadPool.Names.SAME, requestReader, this::handleOperationRequest);
// Primary handler: this is the entry point of the primary-shard write path.
transportService.registerRequestHandler(transportPrimaryAction, executor, forceExecutionOnPrimary, true,
in -> new ConcreteShardRequest<>(requestReader, in), this::handlePrimaryRequest);
// we must never reject a request on replicas because of thread pool capacity
// NOTE(review): presumably the first hard-coded `true` below is the force-execution flag — confirm
// against TransportService.registerRequestHandler.
transportService.registerRequestHandler(transportReplicaAction, executor, true, true,
in -> new ConcreteReplicaRequest<>(replicaRequestReader, in), this::handleReplicaRequest);
this.transportOptions = transportOptions(settings);
this.syncGlobalCheckpointAfterOperation = syncGlobalCheckpointAfterOperation;
}
通过AsyncPrimaryAction执行
/**
 * Handles a request received for the primary action.
 * <p>
 * Wraps the channel in a {@code ChannelActionListener}, casts the task to a
 * {@code ReplicationTask}, and runs a new {@code AsyncPrimaryAction} built from them.
 *
 * @param request the concrete shard request received for the primary action
 * @param channel channel the listener is built on
 * @param task    task for this request; must be a {@code ReplicationTask} because it is cast unconditionally
 */
protected void handlePrimaryRequest(final ConcreteShardRequest request, final TransportChannel channel, final Task task) {
new AsyncPrimaryAction(
request, new ChannelActionListener<>(channel, transportPrimaryAction, request), (ReplicationTask) task).run();
}
AsyncPrimaryAction.runWithPrimaryShardReference
- 在拿到主分片的引用(PrimaryShardReference)之后执行
- 通过ReplicationOperation.execute
/**
 * Builds a {@code ReplicationOperation} around the given primary shard reference and executes it.
 * <p>
 * Part of the original method body is elided in this excerpt (the {@code ...} line), so names such
 * as {@code globalCheckpointSyncingListener} and {@code referenceClosingListener} are defined in
 * code that is not shown here.
 *
 * @param primaryShardReference reference to the primary shard the operation runs against
 */
void runWithPrimaryShardReference(final PrimaryShardReference primaryShardReference) {
try {
...
// On success the result responds through globalCheckpointSyncingListener; on failure the
// error is forwarded to referenceClosingListener.
new ReplicationOperation<>(primaryRequest.getRequest(), primaryShardReference,
ActionListener.wrap(result -> result.respond(globalCheckpointSyncingListener), referenceClosingListener::onFailure),
newReplicasProxy(), logger, actionName, primaryRequest.getPrimaryTerm()).execute();
} catch (Exception e) {
// Any exception thrown while setting up or starting the operation is routed to handleException
// together with the shard reference.
handleException(primaryShardReference, e);
}
}
ReplicationOperation.execute
- primary.perform(request, ActionListener.wrap(this::handlePrimaryResult, resultListener::onFailure))
- 这一行需要特别注意:primary.perform负责执行主分片的写入,并传入一个监听器;主分片写入完成后,监听器回调handlePrimaryResult,再把请求写入各个副本分片;如果主分片写入失败,则回调resultListener::onFailure
/**
 * Starts the replication operation on the primary.
 * <p>
 * When the active-shard-count check reports a failure, the operation is finished as failed with
 * an {@code UnavailableShardsException}. Otherwise the shard and pending-action counters are
 * incremented and the request is performed on the primary; a successful primary result is passed
 * to {@code handlePrimaryResult} and a failure goes to {@code resultListener}.
 *
 * @throws Exception declared by the signature; propagated from the calls made here
 */
public void execute() throws Exception {
    final String shardCountFailure = checkActiveShardCount();
    final ShardId shardId = primary.routingEntry().shardId();
    if (shardCountFailure == null) {
        totalShards.incrementAndGet();
        // This pending action is held until all primary coordination has finished.
        pendingActions.incrementAndGet();
        primary.perform(request, ActionListener.wrap(this::handlePrimaryResult, resultListener::onFailure));
    } else {
        finishAsFailed(new UnavailableShardsException(shardId,
            "{} Timeout: [{}], request: [{}]", shardCountFailure, request.timeout(), request));
    }
}
TransportReplicationAction.PrimaryShardReference
- 通过PrimaryShardReference.perform执行
@Override
public void perform(Request request, ActionListener> listener) {
if (Assertions.ENABLED) {
listener = ActionListener.map(listener, result -> {
assert result.replicaRequest() == null || result.finalFailure == null : "a replica request [" + result.replicaRequest()
+ "] with a primary failure [" + result.finalFailure + "]";
return result;
});
}
assert indexShard.getActiveOperationsCount() != 0 : "must perform shard operation under a permit";
shardOperationOnPrimary(request, indexShard, listener);
}
TransportShardBulkAction.shardOperationOnPrimary
- shardOperationOnPrimary在TransportReplicationAction中是一个抽象函数,由TransportShardBulkAction实现;实际的写入逻辑在下面的performOnPrimary中
public static void performOnPrimary(
BulkShardRequest request,
IndexShard primary,
UpdateHelper updateHelper,
LongSupplier nowInMillisSupplier,
MappingUpdatePerformer mappingUpdater,
Consumer> waitForMappingUpdate,
ActionListener> listener,
ThreadPool threadPool) {
new ActionRunnable>(listener) {
private final Executor executor = threadPool.executor(ThreadPool.Names.WRITE);
private final BulkPrimaryExecutionContext context = new BulkPrimaryExecutionContext(request, primary);
@Override
protected void doRun() throws Exception {
//遍历分片,逐个执行
while (context.hasMoreOperationsToExecute()) {
if (executeBulkItemRequest(context, updateHelper, nowInMillisSupplier, mappingUpdater, waitForMappingUpdate,
ActionListener.wrap(v -> executor.execute(this), this::onRejection)) == false) {
// We are waiting for a mapping update on another thread, that will invoke this action again once its done
// so we just break out here.
return;
}
assert context.isInitial(); // either completed and moved to next or reset
}
// We're done, there's no more operations to execute so we resolve the wrapped listener
finishRequest();
}
}.run();
}
TransportShardBulkAction.executeBulkItemRequest
- 通过IndexShard对象执行写操作
// Excerpt from executeBulkItemRequest; `version` and `result` are declared outside this excerpt.
// The index request selected by the execution context is applied to the primary shard. Its
// index, type, id, source, content type and routing are repackaged into a SourceToParse, and the
// conditional seq-no/primary-term, auto-generated timestamp and retry flag are passed alongside.
final IndexRequest request = context.getRequestToExecute();
result = primary.applyIndexOperationOnPrimary(version, request.versionType(), new SourceToParse(
request.index(), request.type(), request.id(), request.source(), request.getContentType(), request.routing()),
request.ifSeqNo(), request.ifPrimaryTerm(), request.getAutoGeneratedTimestamp(), request.isRetry());
IndexShard.applyIndexOperation
- 主要调用prepareIndex和index这两个比较底层的函数
- prepareIndex的作用是生成Engine.Index对象、生成文档id,并没有实际的文件操作
/**
 * Prepares an index operation for the given source and hands it to the engine.
 * <p>
 * The document type is resolved through the mapper service, an {@code Engine.Index} is built via
 * {@code prepareIndex}, and if parsing produced a dynamic mapping update that update is returned
 * in the result instead of indexing. Any exception raised during this preparation is passed to
 * {@code verifyNotClosed} and then returned as a failed {@code Engine.IndexResult} rather than
 * being thrown from here.
 *
 * @param engine                 engine the prepared operation is indexed into
 * @param seqNo                  sequence number passed to {@code prepareIndex} and used in failure results
 * @param opPrimaryTerm          primary term of the operation; asserted not to exceed the shard's operation term
 * @param version                document version passed to {@code prepareIndex} and used in failure results
 * @param versionType            version type passed to {@code prepareIndex}; may be {@code null}
 * @param ifSeqNo                passed to {@code prepareIndex}
 * @param ifPrimaryTerm          passed to {@code prepareIndex}
 * @param autoGeneratedTimeStamp passed to {@code prepareIndex}
 * @param isRetry                passed to {@code prepareIndex}
 * @param origin                 origin of the operation; checked by {@code ensureWriteAllowed}
 * @param sourceToParse          the document source together with its index, type, id and routing
 * @return the engine's result, a mapping-update result, or a failure result
 * @throws IOException declared by the signature; propagated from the calls made here
 */
private Engine.IndexResult applyIndexOperation(Engine engine, long seqNo, long opPrimaryTerm, long version,
                                               @Nullable VersionType versionType, long ifSeqNo, long ifPrimaryTerm,
                                               long autoGeneratedTimeStamp, boolean isRetry, Engine.Operation.Origin origin,
                                               SourceToParse sourceToParse) throws IOException {
    assert opPrimaryTerm <= getOperationPrimaryTerm()
        : "op term [ " + opPrimaryTerm + " ] > shard term [" + getOperationPrimaryTerm() + "]";
    ensureWriteAllowed(origin);
    Engine.Index preparedOp;
    try {
        final String docType = mapperService.resolveDocumentType(sourceToParse.type());
        // Re-wrap the source only when the resolved type differs from the one it was submitted with.
        final SourceToParse typedSource = docType.equals(sourceToParse.type())
            ? sourceToParse
            : new SourceToParse(sourceToParse.index(), docType, sourceToParse.id(),
                sourceToParse.source(), sourceToParse.getXContentType(), sourceToParse.routing());
        preparedOp = prepareIndex(docMapper(docType), indexSettings.getIndexVersionCreated(), typedSource,
            seqNo, opPrimaryTerm, version, versionType, origin, autoGeneratedTimeStamp, isRetry, ifSeqNo, ifPrimaryTerm);
        final Mapping pendingMappingUpdate = preparedOp.parsedDoc().dynamicMappingsUpdate();
        if (pendingMappingUpdate != null) {
            // Nothing is indexed here; the caller receives the required mapping update instead.
            return new Engine.IndexResult(pendingMappingUpdate);
        }
    } catch (Exception e) {
        // Exceptions from parsing or the mapping update are reported as a failure of this document
        // rather than thrown, so they cannot block replication of earlier operations to the
        // replicas. The exception is first handed to verifyNotClosed.
        verifyNotClosed(e);
        return new Engine.IndexResult(e, version, opPrimaryTerm, seqNo);
    }
    return index(engine, preparedOp);
}
InternalEngine.index
/**
 * Applies a single index operation to the engine and returns its result.
 * <p>
 * Under the engine read lock and the per-uid version-map lock, the method computes an indexing
 * plan, assigns (primary origin) or records (other origins) the sequence number, indexes into
 * Lucene when the plan asks for it, appends the operation to the translog unless it originated
 * from the translog, updates the version map, marks the sequence number as completed in the
 * local checkpoint tracker and freezes the result.
 *
 * @param index the index operation to apply
 * @return the frozen result of the operation
 * @throws IOException rethrown after {@code maybeFailEngine("index", e)} has been invoked
 */
public IndexResult index(Index index) throws IOException {
assert Objects.equals(index.uid().field(), IdFieldMapper.NAME) : index.uid().field();
final boolean doThrottle = index.origin().isRecovery() == false;
try (ReleasableLock releasableLock = readLock.acquire()) {
ensureOpen();
assert assertIncomingSequenceNumber(index.origin(), index.seqNo());
// NOTE(review): as written, the no-op Releasable is chosen when doThrottle is true and
// throttle.acquireThrottle() only for recovery-origin operations; this looks inverted relative
// to the variable name — confirm against upstream before relying on it.
try (Releasable ignored = versionMap.acquireLock(index.uid().bytes());
Releasable indexThrottle = doThrottle ? () -> {} : throttle.acquireThrottle()) {
lastWriteNanos = index.startTime();
final IndexingStrategy plan = indexingStrategyForOperation(index);
final IndexResult indexResult;
if (plan.earlyResultOnPreFlightError.isPresent()) {
// The plan already carries a failure result; skip sequence-number handling and Lucene.
indexResult = plan.earlyResultOnPreFlightError.get();
assert indexResult.getResultType() == Result.Type.FAILURE : indexResult.getResultType();
} else {
// generate or register sequence number
if (index.origin() == Operation.Origin.PRIMARY) {
// Rebuild the operation with a freshly generated sequence number; all other fields are copied.
index = new Index(index.uid(), index.parsedDoc(), generateSeqNoForOperationOnPrimary(index), index.primaryTerm(),
index.version(), index.versionType(), index.origin(), index.startTime(), index.getAutoGeneratedIdTimestamp(),
index.isRetry(), index.getIfSeqNo(), index.getIfPrimaryTerm());
final boolean toAppend = plan.indexIntoLucene && plan.useLuceneUpdateDocument == false;
if (toAppend == false) {
advanceMaxSeqNoOfUpdatesOrDeletesOnPrimary(index.seqNo());
}
} else {
markSeqNoAsSeen(index.seqNo());
}
assert index.seqNo() >= 0 : "ops should have an assigned seq no.; origin: " + index.origin();
if (plan.indexIntoLucene || plan.addStaleOpToLucene) {
indexResult = indexIntoLucene(index, plan);
} else {
// Nothing is written to Lucene; the result is built directly from the plan.
indexResult = new IndexResult(
plan.versionForIndexing, getPrimaryTerm(), index.seqNo(), plan.currentNotFoundOrDeleted);
}
}
// Operations that did not come from the translog are recorded in it here.
if (index.origin().isFromTranslog() == false) {
final Translog.Location location;
if (indexResult.getResultType() == Result.Type.SUCCESS) {
location = translog.add(new Translog.Index(index, indexResult));
} else if (indexResult.getSeqNo() != SequenceNumbers.UNASSIGNED_SEQ_NO) {
// if we have document failure, record it as a no-op in the translog and Lucene with the generated seq_no
final NoOp noOp = new NoOp(indexResult.getSeqNo(), index.primaryTerm(), index.origin(),
index.startTime(), indexResult.getFailure().toString());
location = innerNoOp(noOp).getTranslogLocation();
} else {
location = null;
}
indexResult.setTranslogLocation(location);
}
if (plan.indexIntoLucene && indexResult.getResultType() == Result.Type.SUCCESS) {
// The translog location is kept in the version map only while trackTranslogLocation is set.
final Translog.Location translogLocation = trackTranslogLocation.get() ? indexResult.getTranslogLocation() : null;
versionMap.maybePutIndexUnderLock(index.uid().bytes(),
new IndexVersionValue(translogLocation, plan.versionForIndexing, index.seqNo(), index.primaryTerm()));
}
localCheckpointTracker.markSeqNoAsCompleted(indexResult.getSeqNo());
indexResult.setTook(System.nanoTime() - index.startTime());
indexResult.freeze();
return indexResult;
}
} catch (RuntimeException | IOException e) {
try {
maybeFailEngine("index", e);
} catch (Exception inner) {
// Keep the original exception as the primary one; the secondary failure is attached to it.
e.addSuppressed(inner);
}
throw e;
}
}
参考来源
https://cloud.tencent.com/developer/article/1361160
https://zhuanlan.zhihu.com/p/34669354