PostgreSQL 在流复制(streaming replication)模式下运行时,slave 侧的日志中出现以下错误:
record with zero length at XXX
FATAL: terminating walreceiver process due to administrator command
报错代码位于 xlog.c(约第 4069 行):
else if (record->xl_len == 0) | ||||||
{ | ||||||
ereport(emode_for_corrupt_record(emode, *RecPtr), | ||||||
(errmsg("record with zero length at %X/%X", | ||||||
RecPtr->xlogid, RecPtr->xrecoff))); | ||||||
goto next_record_is_invalid; | ||||||
} | ||||||
next_record_is_invalid: | ||||||
failedSources |= readSource; | ||||||
if (readFile >= 0) | ||||||
{ | ||||||
close(readFile); | ||||||
readFile = -1; | ||||||
} | ||||||
/* | ||||||
* If archive recovery was requested, but we were still doing crash | ||||||
* recovery, switch to archive recovery and retry using the offline | ||||||
* archive. We have now replayed all the valid WAL in pg_xlog, so | ||||||
* we are presumably now consistent. | ||||||
* | ||||||
* We require that there's at least some valid WAL present in | ||||||
* pg_xlog, however (!fetch_ckpt). We could recover using the WAL | ||||||
* from the archive, even if pg_xlog is completely empty, but we'd | ||||||
* have no idea how far we'd have to replay to reach consistency. | ||||||
* So err on the safe side and give up. | ||||||
*/ | ||||||
if (!InArchiveRecovery && ArchiveRecoveryRequested && !fetching_ckpt) | ||||||
{ | ||||||
ereport(DEBUG1, | ||||||
(errmsg_internal("reached end of WAL in pg_xlog, entering archive recovery"))); | ||||||
InArchiveRecovery = true; | ||||||
if (StandbyModeRequested) | ||||||
StandbyMode = true; | ||||||
/* initialize minRecoveryPoint to this record */ | ||||||
LWLockAcquire(ControlFileLock, LW_EXCLUSIVE); | ||||||
ControlFile->state = DB_IN_ARCHIVE_RECOVERY; | ||||||
if (XLByteLT(ControlFile->minRecoveryPoint, EndRecPtr)) | ||||||
ControlFile->minRecoveryPoint = EndRecPtr; | ||||||
/* update local copy */ | ||||||
minRecoveryPoint = ControlFile->minRecoveryPoint; | ||||||
UpdateControlFile(); | ||||||
LWLockRelease(ControlFileLock); | ||||||
CheckRecoveryConsistency(); | ||||||
goto retry; | ||||||
} | ||||||
retry: | ||||||
/* See if we need to retrieve more data */ | ||||||
if (readFile < 0 || | ||||||
(readSource == XLOG_FROM_STREAM && !XLByteLT(*RecPtr, receivedUpto))) | ||||||
{ | ||||||
if (StandbyMode) | ||||||
{ | ||||||
/* | ||||||
* In standby mode, wait for the requested record to become | ||||||
* available, either via restore_command succeeding to restore the | ||||||
* segment, or via walreceiver having streamed the record. | ||||||
*/ | ||||||
for (;;) | ||||||
{ | ||||||
if (WalRcvInProgress()) | ||||||
{ | ||||||
bool | ||||||
/* | ||||||
* If we find an invalid record in the WAL streamed from | ||||||
* master, something is seriously wrong. There's little | ||||||
* chance that the problem will just go away, but PANIC is | ||||||
* not good for availability either, especially in hot | ||||||
* standby mode. Disconnect, and retry from | ||||||
* archive/pg_xlog again. The WAL in the archive should be | ||||||
* identical to what was streamed, so it's unlikely that | ||||||
* it helps, but one can hope... | ||||||
*/ | ||||||
if (failedSources & XLOG_FROM_STREAM) | ||||||
{ | ||||||
ShutdownWalRcv(); | ||||||
continue; | ||||||
} | ||||||
/* | ||||||
* Walreceiver is active, so see if new data has arrived. | ||||||
* | ||||||
* We only advance XLogReceiptTime when we obtain fresh | ||||||
* WAL from walreceiver and observe that we had already | ||||||
* processed everything before the most recent "chunk" | ||||||
* that it flushed to disk. In steady state where we are | ||||||
* keeping up with the incoming data, XLogReceiptTime will | ||||||
* be updated on each cycle. When we are behind, | ||||||
* XLogReceiptTime will not advance, so the grace time | ||||||
* allotted to conflicting queries will decrease. | ||||||
*/ | ||||||
if (XLByteLT(*RecPtr, receivedUpto)) | ||||||
havedata = true; | ||||||
else | ||||||
{ | ||||||
XLogRecPtr | ||||||
receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart); | ||||||
if (XLByteLT(*RecPtr, receivedUpto)) | ||||||
{ | ||||||
} | ||||||
else | ||||||
} | ||||||
if (havedata) | ||||||
{ | ||||||
/* | ||||||
* Great, streamed far enough. Open the file if it's | ||||||
* not open already. Use XLOG_FROM_STREAM so that | ||||||
* source info is set correctly and XLogReceiptTime | ||||||
* isn't changed. | ||||||
*/ | ||||||
if (readFile < 0) | ||||||
{ | ||||||
} | ||||||
else | ||||||
{ | ||||||
} | ||||||
break; | ||||||
} | ||||||
/* | ||||||
* Data not here yet, so check for trigger then sleep for | ||||||
* five seconds like in the WAL file polling case below. | ||||||
*/ | ||||||
if (CheckForStandbyTrigger()) | ||||||
goto retry; | ||||||
/* | ||||||
* Wait for more WAL to arrive, or timeout to be reached | ||||||
*/ | ||||||
WaitLatch(&XLogCtl->recoveryWakeupLatch, | ||||||
ResetLatch(&XLogCtl->recoveryWakeupLatch); | ||||||
} | ||||||
else | ||||||
{ | ||||||
int | ||||||
pg_time_t | now; | |||||
/* | ||||||
* Until walreceiver manages to reconnect, poll the | ||||||
* archive. | ||||||
*/ | ||||||
if (readFile >= 0) | ||||||
{ | ||||||
close(readFile); | ||||||
readFile = -1; | ||||||
} | ||||||
/* Reset curFileTLI if random fetch. */ | ||||||
if (randAccess) | ||||||
curFileTLI = 0; | ||||||
/* | ||||||
* Try to restore the file from archive, or read an | ||||||
* existing file from pg_xlog. | ||||||
*/ | ||||||
sources = XLOG_FROM_ARCHIVE | XLOG_FROM_PG_XLOG; | ||||||
if (!(sources & ~failedSources)) | ||||||
{ | ||||||
/* | ||||||
* We've exhausted all options for retrieving the | ||||||
* file. Retry. | ||||||
*/ | ||||||
failedSources = 0; | ||||||
/* | ||||||
* Before we sleep, re-scan for possible new timelines | ||||||
* if we were requested to recover to the latest | ||||||
* timeline. | ||||||
*/ | ||||||
if (recoveryTargetIsLatest) | ||||||
{ | ||||||
} | ||||||
/* | ||||||
* If it hasn't been long since last attempt, sleep to | ||||||
* avoid busy-waiting. | ||||||
*/ | ||||||
now = (pg_time_t) time(NULL); | ||||||
if ((now - last_fail_time) < 5) | ||||||
{ | ||||||
} | ||||||
last_fail_time = now; | ||||||
/* | ||||||
* If primary_conninfo is set, launch walreceiver to | ||||||
* try to stream the missing WAL, before retrying to | ||||||
* restore from archive/pg_xlog. | ||||||
* | ||||||
* If fetching_ckpt is TRUE, RecPtr points to the | ||||||
* initial checkpoint location. In that case, we use | ||||||
* RedoStartLSN as the streaming start position | ||||||
* instead of RecPtr, so that when we later jump | ||||||
* backwards to start redo at RedoStartLSN, we will | ||||||
* have the logs streamed already. | ||||||
*/ | ||||||
if (PrimaryConnInfo) | ||||||
{ | ||||||
} | ||||||
} | ||||||
/* Don't try to read from a source that just failed */ | ||||||
sources &= ~failedSources; | ||||||
readFile = XLogFileReadAnyTLI(readId, readSeg, DEBUG2, | ||||||
switched_segment = true; | ||||||
if (readFile >= 0) | ||||||
break; | ||||||
/* | ||||||
* Nope, not found in archive and/or pg_xlog. | ||||||
*/ | ||||||
failedSources |= sources; | ||||||
/* | ||||||
* Check to see if the trigger file exists. Note that we | ||||||
* do this only after failure, so when you create the | ||||||
* trigger file, we still finish replaying as much as we | ||||||
* can from archive and pg_xlog before failover. | ||||||
*/ | ||||||
if (CheckForStandbyTrigger()) | ||||||
goto triggered; | ||||||
} | ||||||
/* | ||||||
* This possibly-long loop needs to handle interrupts of | ||||||
* startup process. | ||||||
*/ | ||||||
HandleStartupProcInterrupts(); | ||||||
} | ||||||
} | ||||||
else | ||||||
{ | ||||||
/* In archive or crash recovery. */ | ||||||
if (readFile < 0) | ||||||
{ | ||||||
int | ||||||
/* Reset curFileTLI if random fetch. */ | ||||||
if (randAccess) | ||||||
curFileTLI = 0; | ||||||
sources = XLOG_FROM_PG_XLOG; | ||||||
if (InArchiveRecovery) | ||||||
sources |= XLOG_FROM_ARCHIVE; | ||||||
readFile = XLogFileReadAnyTLI(readId, readSeg, emode, | ||||||
switched_segment = true; | ||||||
if (readFile < 0) | ||||||
return false; | ||||||
} | ||||||
} | ||||||
} | ||||||
/* | ||||||
* At this point, we have the right segment open and if we're streaming we | ||||||
* know the requested record is in it. | ||||||
*/ | ||||||
Assert(readFile != -1); | ||||||
/* | ||||||
* If the current segment is being streamed from master, calculate how | ||||||
* much of the current page we have received already. We know the | ||||||
* requested record has been received, but this is for the benefit of | ||||||
* future calls, to allow quick exit at the top of this function. | ||||||
*/ | ||||||
if (readSource == XLOG_FROM_STREAM) | ||||||
{ | ||||||
if (RecPtr->xlogid != receivedUpto.xlogid || | ||||||
(RecPtr->xrecoff / XLOG_BLCKSZ) != (receivedUpto.xrecoff / XLOG_BLCKSZ)) | ||||||
{ | ||||||
readLen = XLOG_BLCKSZ; | ||||||
} | ||||||
else | ||||||
readLen = receivedUpto.xrecoff % XLogSegSize - targetPageOff; | ||||||
} | ||||||
else | ||||||
readLen = XLOG_BLCKSZ; | ||||||
if (switched_segment && targetPageOff != 0) | ||||||
{ | ||||||
/* | ||||||
* Whenever switching to a new WAL segment, we read the first page of | ||||||
* the file and validate its header, even if that's not where the | ||||||
* target record is. This is so that we can check the additional | ||||||
* identification info that is present in the first page's "long" | ||||||
* header. | ||||||
*/ | ||||||
readOff = 0; | ||||||
if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ) | ||||||
{ | ||||||
ereport(emode_for_corrupt_record(emode, *RecPtr), | ||||||
(errcode_for_file_access(), | ||||||
errmsg("could not read from log file %u, segment %u, offset %u: %m", | ||||||
goto next_record_is_invalid; | ||||||
} | ||||||
if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode, true)) | ||||||
goto next_record_is_invalid; | ||||||
} | ||||||
/* Read the requested page */ | ||||||
readOff = targetPageOff; | ||||||
if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0) | ||||||
{ | ||||||
ereport(emode_for_corrupt_record(emode, *RecPtr), | ||||||
(errcode_for_file_access(), | ||||||
errmsg("could not seek in log file %u, segment %u to offset %u: %m", | ||||||
readId, readSeg, readOff))); | ||||||
goto next_record_is_invalid; | ||||||
} | ||||||
if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ) | ||||||
{ | ||||||
ereport(emode_for_corrupt_record(emode, *RecPtr), | ||||||
(errcode_for_file_access(), | ||||||
errmsg("could not read from log file %u, segment %u, offset %u: %m", | ||||||
readId, readSeg, readOff))); | ||||||
goto next_record_is_invalid; | ||||||
} | ||||||
if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode, false)) | ||||||
goto next_record_is_invalid; | ||||||
Assert(targetId == readId); | ||||||
Assert(targetSeg == readSeg); | ||||||
Assert(targetPageOff == readOff); | ||||||
Assert(targetRecOff < readLen); | ||||||
return true; | ||||||
next_record_is_invalid: | ||||||
failedSources |= readSource; | ||||||
if (readFile >= 0) | ||||||
close(readFile); | ||||||
readFile = -1; | ||||||
readLen = 0; | ||||||
readSource = 0; | ||||||
/* In standby-mode, keep trying */ | ||||||
if (StandbyMode) | ||||||
goto retry; | ||||||
else | ||||||
return false; | ||||||
triggered: | ||||||
if (readFile >= 0) | ||||||
close(readFile); | ||||||
readFile = -1; | ||||||
readLen = 0; | ||||||
readSource = 0; | ||||||
return false; | ||||||
} | ||||||