public DLedgerServer create(String selfId, String peers, String leaderId) {
DLedgerConfig dLedgerConfig = new DLedgerConfig();
dLedgerConfig.setMappedFileSizeForEntryData(1024);
//disable automatic leader election; the test assigns the roles manually below
dLedgerConfig.setEnableLeaderElector(false);
dLedgerConfig.setEnableDiskForceClean(false);
dLedgerConfig.setStoreType(DLedgerConfig.FILE);
dLedgerConfig.setPeers(peers);
dLedgerConfig.setSelfId(selfId);
dLedgerConfig.setGroup("test");
dLedgerConfig.setStoreBaseDir(...);
DLedgerServer dLedgerServer = new DLedgerServer(dLedgerConfig);
MemberState memberState = dLedgerServer.getMemberState();
memberState.setCurrTermForTest(0);
//assign the leader/follower role for term 0 manually instead of running an election
if (selfId.equals(leaderId)) {
memberState.changeToLeader(0);
} else {
memberState.changeToFollower(0, leaderId);
}
dLedgerServer.startup();
return dLedgerServer;
}
private String joinPeers(List<String> nodes, String separator) {
StringBuilder builder = new StringBuilder();
for (String node : nodes) {
builder.append(node).append(separator);
}
builder.deleteCharAt(builder.length() - 1);
return builder.toString();
}
@Test
public void syncTest() throws Exception {
List<String> nodes = Arrays.asList(
"n1-localhost:1001",
"n2-localhost:1002"
);
String peers = joinPeers(nodes, ";");
DLedgerServer server1 = create("n1", peers, "n1");
//delete server2's StoreBaseDir here (before it starts) to simulate lost entry data
DLedgerServer server2 = create("n2", peers, "n1");
//block the test thread so both servers keep running while replication proceeds
CountDownLatch countDownLatch = new CountDownLatch(1);
countDownLatch.await();
}
Deleting server2's StoreBaseDir before it starts simulates lost entry data on the follower and triggers a re-sync from the leader.
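For reference, a minimal sketch of how the follower's store directory could be wiped before create("n2", peers, "n1") runs; the actual path is elided above ("..."), so storeBaseDir here is only a placeholder for whatever was passed to setStoreBaseDir:

import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.Comparator;
import java.util.stream.Stream;

//Sketch only, not part of the original test: recursively delete the follower's
//store directory so n2 starts with an empty ledger. "storeBaseDir" is a placeholder
//for the path configured via setStoreBaseDir.
private void deleteStoreBaseDir(String storeBaseDir) throws IOException {
    Path root = Paths.get(storeBaseDir);
    if (!Files.exists(root)) {
        return;
    }
    //delete children before their parent directories
    try (Stream<Path> paths = Files.walk(root)) {
        paths.sorted(Comparator.reverseOrder())
             .forEach(path -> path.toFile().delete());
    }
}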
The EntryDispatcher (running on the leader, one dispatcher per follower) goes through the following state transitions:
COMPARE ----> TRUNCATE ----> APPEND ----> COMMIT
   ^                                         |
   |----<------<-------<--------<------------|
The EntryDispatcher starts in the COMPARE state.
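A sketch of how that state can be tracked; the field and cursor names match the snippets below, but the body is illustrative rather than the actual DLedger implementation (it needs java.util.concurrent.atomic.AtomicReference):

//Illustrative sketch: the dispatcher keeps its current push type in an AtomicReference
//that starts out as COMPARE, and changeState() resets the relevant cursors before
//flipping it to the target state.
private final AtomicReference<PushEntryRequest.Type> type =
    new AtomicReference<>(PushEntryRequest.Type.COMPARE);

private synchronized void changeState(long index, PushEntryRequest.Type target) {
    switch (target) {
        case APPEND:
            compareIndex = -1;      //comparing is finished
            writeIndex = index + 1; //continue appending right after the truncate point
            break;
        case COMPARE:
            compareIndex = -1;      //-1 makes doCompare() restart from the leader's end index
            pendingMap.clear();     //in-flight appends are no longer trustworthy
            break;
        case TRUNCATE:
            compareIndex = -1;
            break;
        default:
            break;
    }
    type.set(target);
}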
private class EntryDispatcher extends ShutdownAbleThread {
@Override
public void doWork() {
try {
if (!checkAndFreshState()) {
waitForRunning(1);
return;
}
if (type.get() == PushEntryRequest.Type.APPEND) {
if (dLedgerConfig.isEnableBatchPush()) {
doBatchAppend();
} else {
doAppend();
}
} else {
doCompare();
}
waitForRunning(1);
} catch (Throwable t) {
DLedgerEntryPusher.logger.error("[Push-{}]Error in {} writeIndex={} compareIndex={}", peerId, getName(), writeIndex, compareIndex, t);
changeState(-1, PushEntryRequest.Type.COMPARE);
DLedgerUtils.sleep(500);
}
}
}
private void doCompare() throws Exception {
......
//revise the compareIndex
if (compareIndex == -1) {
compareIndex = dLedgerStore.getLedgerEndIndex();
logger.info("[Push-{}][DoCompare] compareIndex=-1 means start to compare", peerId);
} else if (compareIndex > dLedgerStore.getLedgerEndIndex() || compareIndex < dLedgerStore.getLedgerBeginIndex()) {
logger.info("[Push-{}][DoCompare] compareIndex={} out of range {}-{}", peerId, compareIndex, dLedgerStore.getLedgerBeginIndex(), dLedgerStore.getLedgerEndIndex());
compareIndex = dLedgerStore.getLedgerEndIndex();
}
//fetch the entry at compareIndex from the leader's store (initially the leader's last entry)
DLedgerEntry entry = dLedgerStore.get(compareIndex);
PreConditions.check(entry != null, DLedgerResponseCode.INTERNAL_ERROR, "compareIndex=%d", compareIndex);
PushEntryRequest request = buildPushRequest(entry, PushEntryRequest.Type.COMPARE);
//send the COMPARE request to the follower
CompletableFuture<PushEntryResponse> responseFuture = dLedgerRpcService.push(request);
PushEntryResponse response = responseFuture.get(3, TimeUnit.SECONDS);
//the response carries the follower's ledger beginIndex and endIndex
PreConditions.check(response != null, DLedgerResponseCode.INTERNAL_ERROR, "compareIndex=%d", compareIndex);
PreConditions.check(response.getCode() == DLedgerResponseCode.INCONSISTENT_STATE.getCode() || response.getCode() == DLedgerResponseCode.SUCCESS.getCode()
, DLedgerResponseCode.valueOf(response.getCode()), "compareIndex=%d", compareIndex);
......
//compare with the leader's dLedgerStore and compute truncateIndex, the index from which the follower must re-sync
......
if (truncateIndex != -1) {
changeState(truncateIndex, PushEntryRequest.Type.TRUNCATE);
doTruncate(truncateIndex);
break;
}
}
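For the scenario in the test, where server2's store was wiped and its response carries beginIndex = endIndex = -1, the elided comparison reduces to roughly the following; a simplified sketch that ignores the partial-overlap cases the real doCompare also handles:

//Simplified sketch of the truncateIndex decision for the "follower store deleted" case;
//the real doCompare() also walks compareIndex backwards to locate the divergence point
//when the two ledgers partially overlap.
long truncateIndex = -1;
if (response.getCode() == DLedgerResponseCode.SUCCESS.getCode()
        && compareIndex == response.getEndIndex()) {
    //the follower already matches the leader at compareIndex: switch straight to APPEND
    changeState(compareIndex, PushEntryRequest.Type.APPEND);
} else if (response.getEndIndex() < dLedgerStore.getLedgerBeginIndex()
        || response.getBeginIndex() > dLedgerStore.getLedgerEndIndex()) {
    //the two ledgers do not overlap at all (e.g. the follower's store is empty):
    //re-send everything starting from the leader's first entry
    truncateIndex = dLedgerStore.getLedgerBeginIndex();
}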
private void doTruncate(long truncateIndex) throws Exception {
PreConditions.check(type.get() == PushEntryRequest.Type.TRUNCATE, DLedgerResponseCode.UNKNOWN);
DLedgerEntry truncateEntry = dLedgerStore.get(truncateIndex);
PreConditions.check(truncateEntry != null, DLedgerResponseCode.UNKNOWN);
logger.info("[Push-{}]Will push data to truncate truncateIndex={} pos={}", peerId, truncateIndex, truncateEntry.getPos());
PushEntryRequest truncateRequest = buildPushRequest(truncateEntry, PushEntryRequest.Type.TRUNCATE);
//send the TRUNCATE request and wait for the follower's response
PushEntryResponse truncateResponse = dLedgerRpcService.push(truncateRequest).get(3, TimeUnit.SECONDS);
PreConditions.check(truncateResponse != null, DLedgerResponseCode.UNKNOWN, "truncateIndex=%d", truncateIndex);
PreConditions.check(truncateResponse.getCode() == DLedgerResponseCode.SUCCESS.getCode(), DLedgerResponseCode.valueOf(truncateResponse.getCode()), "truncateIndex=%d", truncateIndex);
lastPushCommitTimeMs = System.currentTimeMillis();
//switch the dispatcher state to APPEND
changeState(truncateIndex, PushEntryRequest.Type.APPEND);
}
When EntryDispatcher.doWork runs again, the dispatcher is in the APPEND state and calls doAppend:
private void doAppend() throws Exception {
while (true) {
if (!checkAndFreshState()) {
break;
}
if (type.get() != PushEntryRequest.Type.APPEND) {
break;
}
if (writeIndex > dLedgerStore.getLedgerEndIndex()) {
//everything up to the leader's end index has been pushed: send a commit
doCommit();
doCheckAppendResponse();
break;
}
//periodically drop pending requests the follower has already acknowledged (index below its water mark)
if (pendingMap.size() >= maxPendingSize || DLedgerUtils.elapsed(lastCheckLeakTimeMs) > 1000) {
long peerWaterMark = getPeerWaterMark(term, peerId);
for (Long index : pendingMap.keySet()) {
if (index < peerWaterMark) {
pendingMap.remove(index);
}
}
lastCheckLeakTimeMs = System.currentTimeMillis();
}
if (pendingMap.size() >= maxPendingSize) {
//too many in-flight appends: check for timed-out responses and resend if necessary
doCheckAppendResponse();
break;
}
//push the entry at the current writeIndex to the follower
doAppendInner(writeIndex);
writeIndex++;
}
}
private void doAppendInner(long index) throws Exception {
DLedgerEntry entry = getDLedgerEntryForAppend(index);
if (null == entry) {
return;
}
checkQuotaAndWait(entry);
PushEntryRequest request = buildPushRequest(entry, PushEntryRequest.Type.APPEND);
//push the entry to the follower asynchronously
CompletableFuture<PushEntryResponse> responseFuture = dLedgerRpcService.push(request);
pendingMap.put(index, System.currentTimeMillis());
responseFuture.whenComplete((x, ex) -> {
try {
PreConditions.check(ex == null, DLedgerResponseCode.UNKNOWN);
DLedgerResponseCode responseCode = DLedgerResponseCode.valueOf(x.getCode());
switch (responseCode) {
case SUCCESS:
pendingMap.remove(x.getIndex());
updatePeerWaterMark(x.getTerm(), peerId, x.getIndex());
quorumAckChecker.wakeup();
break;
case INCONSISTENT_STATE:
logger.info("[Push-{}]Get INCONSISTENT_STATE when push index={} term={}", peerId, x.getIndex(), x.getTerm());
changeState(-1, PushEntryRequest.Type.COMPARE);
break;
default:
logger.warn("[Push-{}]Get error response code {} {}", peerId, responseCode, x.baseInfo());
break;
}
} catch (Throwable t) {
logger.error("", t);
}
});
lastPushCommitTimeMs = System.currentTimeMillis();
}
Back in doAppend, once writeIndex has moved past the leader's end index, the commit is triggered:
private void doAppend() throws Exception {
......
if (!checkAndFreshState()) {
break;
}
if (type.get() != PushEntryRequest.Type.APPEND) {
break;
}
if (writeIndex > dLedgerStore.getLedgerEndIndex()) {
//once every entry up to the leader's end index has been pushed, do the commit
doCommit();
doCheckAppendResponse();
break;
}
......
}
private void doCommit() throws Exception {
//send a COMMIT push at most once per second
if (DLedgerUtils.elapsed(lastPushCommitTimeMs) > 1000) {
PushEntryRequest request = buildPushRequest(null, PushEntryRequest.Type.COMMIT);
//Ignore the results
dLedgerRpcService.push(request);
lastPushCommitTimeMs = System.currentTimeMillis();
}
}
private PushEntryRequest buildPushRequest(DLedgerEntry entry, PushEntryRequest.Type target) {
PushEntryRequest request = new PushEntryRequest();
request.setGroup(memberState.getGroup());
request.setRemoteId(peerId);
request.setLeaderId(leaderId);
request.setLocalId(memberState.getSelfId());
request.setTerm(term);
request.setEntry(entry);
request.setType(target);
//every push request carries the leader's committedIndex, propagating it to each follower
request.setCommitIndex(dLedgerStore.getCommittedIndex());
return request;
}
After dLedgerRpcService.push sends the COMMIT request, the follower receives it and calls handleDoCommit:
private CompletableFuture<PushEntryResponse> handleDoCommit(long committedIndex, PushEntryRequest request,
CompletableFuture<PushEntryResponse> future) {
try {
PreConditions.check(committedIndex == request.getCommitIndex(), DLedgerResponseCode.UNKNOWN);
PreConditions.check(request.getType() == PushEntryRequest.Type.COMMIT, DLedgerResponseCode.UNKNOWN);
updateCommittedIndex(request.getTerm(), committedIndex);
future.complete(buildResponse(request, DLedgerResponseCode.SUCCESS.getCode()));
} catch (Throwable t) {
logger.error("[HandleDoCommit] committedIndex={}", request.getCommitIndex(), t);
future.complete(buildResponse(request, DLedgerResponseCode.UNKNOWN.getCode()));
}
return future;
}
@Override
public void updateCommittedIndex(long term, long newCommittedIndex) {
if (newCommittedIndex == -1
|| ledgerEndIndex == -1
|| term < memberState.currTerm()
|| newCommittedIndex == this.committedIndex) {
return;
}
if (newCommittedIndex < this.committedIndex
|| newCommittedIndex < this.ledgerBeginIndex) {
logger.warn("[MONITOR]Skip update committed index for new={} < old={} or new={} < beginIndex={}", newCommittedIndex, this.committedIndex, newCommittedIndex, this.ledgerBeginIndex);
return;
}
long endIndex = ledgerEndIndex;
if (newCommittedIndex > endIndex) {
//If this node falls too far behind, the pushed committedIndex can be larger than its own endIndex; cap it.
newCommittedIndex = endIndex;
}
Pair<Long, Integer> posAndSize = getEntryPosAndSize(newCommittedIndex);
PreConditions.check(posAndSize != null, DLedgerResponseCode.DISK_ERROR);
//update the committed index and the corresponding committed position
this.committedIndex = newCommittedIndex;
this.committedPos = posAndSize.getKey() + posAndSize.getValue();
}
persistCheckPoint is triggered periodically and saves the checkpoint file:
void persistCheckPoint() {
try {
Properties properties = new Properties();
properties.put(END_INDEX_KEY, getLedgerEndIndex());
properties.put(COMMITTED_INDEX_KEY, getCommittedIndex());
String data = IOUtils.properties2String(properties);
IOUtils.string2File(data, dLedgerConfig.getDefaultPath() + File.separator + CHECK_POINT_FILE);
} catch (Throwable t) {
logger.error("Persist checkpoint failed", t);
}
}
Checkpoint file format:
endIndex=115
committedIndex=58
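The file is plain java.util.Properties text, so it can be read back directly; a minimal illustrative sketch (not DLedger's actual recovery code), assuming the file is named "checkpoint" under the store directory:

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.util.Properties;

//Illustrative sketch: parse committedIndex back out of the checkpoint file written
//by persistCheckPoint(). The file name "checkpoint" is an assumption.
static long readCommittedIndex(String storeDir) throws IOException {
    Properties properties = new Properties();
    try (FileInputStream in = new FileInputStream(storeDir + File.separator + "checkpoint")) {
        properties.load(in);
    }
    return Long.parseLong(properties.getProperty("committedIndex", "-1"));
}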
Q: When a client writes a message, does the request return as soon as the append succeeds, or only after the commit?
A: As the code above shows, the commit flow is independent of the write path; an append counts as successful, and the response is returned to the client, as soon as more than half of the nodes have persisted the entry.
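In other words, the acknowledgement a client waits for is a quorum ack driven by the per-peer water marks that doAppendInner updates; the separate COMMIT push only propagates committedIndex to the followers and is not on the client's critical path. A minimal illustrative sketch of that quorum decision (not DLedger's actual QuorumAckChecker):

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.List;

//Illustrative sketch: given each peer's highest persisted index (the water marks
//updated via updatePeerWaterMark() in doAppendInner()), the quorum index is the
//largest index already persisted by a majority of peers, leader included.
static long calcQuorumIndex(Collection<Long> peerWaterMarks) {
    List<Long> sorted = new ArrayList<>(peerWaterMarks);
    Collections.sort(sorted);
    //after sorting ascending, the element at position (n-1)/2 is persisted by
    //n - (n-1)/2 = ceil((n+1)/2) peers, i.e. a majority
    return sorted.get((sorted.size() - 1) / 2);
}

Every append request whose index is at or below this quorum index can then be completed back to the client.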