1
0
mirror of https://github.com/pgbackrest/pgbackrest.git synced 2025-04-17 11:46:39 +02:00

Loop while waiting for checkpoint LSN to reach replay LSN.

It is possible for the checkpoint LSN to lag slightly behind the replay LSN until pg_control has been updated.

Add a loop to keep checking rather than failing when the checkpoint LSN has not been updated.
This commit is contained in:
Stefan Fercot 2021-07-15 13:45:06 +02:00 committed by GitHub
parent a6691c6f61
commit 8fac1ac50d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 60 additions and 31 deletions

View File

@ -72,6 +72,19 @@
<p>Change level of <cmd>backup</cmd>/<cmd>restore</cmd> copied file logging to detail.</p>
</release-item>
<release-item>
<github-issue id="1460"/>
<github-pull-request id="1450"/>
<release-item-contributor-list>
<release-item-ideator id="fatih.mencutekin"/>
<release-item-contributor id="stefan.fercot"/>
<release-item-reviewer id="david.steele"/>
</release-item-contributor-list>
<p>Loop while waiting for checkpoint LSN to reach replay LSN.</p>
</release-item>
</release-improvement-list>
<release-development-list>
@ -9999,6 +10012,11 @@
<contributor-id type="github">evanbenoit</contributor-id>
</contributor>
<contributor id="fatih.mencutekin">
<contributor-name-display>Fatih Mencutekin</contributor-name-display>
<contributor-id type="github">fatihmtekin</contributor-id>
</contributor>
<contributor id="floris.van.nee">
<contributor-name-display>Floris van Nee</contributor-name-display>
<contributor-id type="github">fvannee</contributor-id>

View File

@ -567,30 +567,37 @@ dbReplayWait(Db *this, const String *targetLsn, TimeMSec timeout)
// Perform a checkpoint
dbExec(this, STRDEF("checkpoint"));
// On PostgreSQL >= 9.6 the checkpoint location can be verified
//
// ??? We have seen one instance where this check failed. Is there any chance that the replayed position could be ahead of
// the checkpoint recorded in pg_control? It seems possible since it would be safer if the checkpoint in pg_control was
// behind rather than ahead, so add a loop to keep checking until the checkpoint has been recorded or timeout.
// On PostgreSQL >= 9.6 the checkpoint location can be verified so loop until lsn has been reached or timeout
if (dbPgVersion(this) >= PG_VERSION_96)
{
// Build the query
const String *query = strNewFmt(
"select (checkpoint_%s > '%s')::bool as targetReached,\n"
" checkpoint_%s::text as checkpointLsn\n"
" from pg_catalog.pg_control_checkpoint()",
lsnName, strZ(targetLsn), lsnName);
wait = waitNew(timeout);
targetReached = false;
const String *checkpointLsn = NULL;
// Execute query
VariantList *row = dbQueryRow(this, query);
do
{
// Build the query
const String *query = strNewFmt(
"select (checkpoint_%s > '%s')::bool as targetReached,\n"
" checkpoint_%s::text as checkpointLsn\n"
" from pg_catalog.pg_control_checkpoint()",
lsnName, strZ(targetLsn), lsnName);
// Verify target was reached
if (!varBool(varLstGet(row, 0)))
// Execute the query and get checkpointLsn
VariantList *row = dbQueryRow(this, query);
targetReached = varBool(varLstGet(row, 0));
checkpointLsn = varStr(varLstGet(row, 1));
protocolKeepAlive();
}
while (!targetReached && waitMore(wait));
// Error if a timeout occurred before the target lsn was reached
if (!targetReached)
{
THROW_FMT(
ArchiveTimeoutError,
"the checkpoint lsn %s is less than the target lsn %s even though the replay lsn is %s",
strZ(varStr(varLstGet(row, 1))), strZ(targetLsn), strZ(replayLsn));
ArchiveTimeoutError, "timeout before standby checkpoint lsn reached %s - only reached %s", strZ(targetLsn),
strZ(checkpointLsn));
}
}
}

View File

@ -452,14 +452,15 @@ Macros for defining groups of functions that implement various queries and comma
#define HRNPQ_MACRO_REPLAY_TARGET_REACHED_GE_10(sessionParam, targetLsnParam, targetReachedParam, reachedLsnParam) \
HRNPQ_MACRO_REPLAY_TARGET_REACHED(sessionParam, "wal", "lsn", targetLsnParam, targetReachedParam, reachedLsnParam)
#define HRNPQ_MACRO_CHECKPOINT_TARGET_REACHED(sessionParam, lsnNameParam, targetLsnParam, targetReachedParam, checkpointLsnParam) \
#define HRNPQ_MACRO_CHECKPOINT_TARGET_REACHED( \
sessionParam, lsnNameParam, targetLsnParam, targetReachedParam, checkpointLsnParam, sleepParam) \
{.session = sessionParam, \
.function = HRNPQ_SENDQUERY, \
.param = strZ(strNewFmt( \
"[\"select (checkpoint_" lsnNameParam " > '%s')::bool as targetReached,\\n" \
" checkpoint_" lsnNameParam "::text as checkpointLsn\\n" \
" from pg_catalog.pg_control_checkpoint()\"]", targetLsnParam)), \
.resultInt = 1}, \
.resultInt = 1, .sleep = sleepParam}, \
{.session = sessionParam, .function = HRNPQ_CONSUMEINPUT}, \
{.session = sessionParam, .function = HRNPQ_ISBUSY}, \
{.session = sessionParam, .function = HRNPQ_GETRESULT}, \
@ -473,11 +474,13 @@ Macros for defining groups of functions that implement various queries and comma
{.session = sessionParam, .function = HRNPQ_CLEAR}, \
{.session = sessionParam, .function = HRNPQ_GETRESULT, .resultNull = true}
#define HRNPQ_MACRO_CHECKPOINT_TARGET_REACHED_96(sessionParam, targetLsnParam, targetReachedParam, checkpointLsnParam) \
HRNPQ_MACRO_CHECKPOINT_TARGET_REACHED(sessionParam, "location", targetLsnParam, targetReachedParam, checkpointLsnParam)
#define HRNPQ_MACRO_CHECKPOINT_TARGET_REACHED_96(sessionParam, targetLsnParam, targetReachedParam, checkpointLsnParam, sleepParam) \
HRNPQ_MACRO_CHECKPOINT_TARGET_REACHED( \
sessionParam, "location", targetLsnParam, targetReachedParam, checkpointLsnParam, sleepParam)
#define HRNPQ_MACRO_CHECKPOINT_TARGET_REACHED_GE_10(sessionParam, targetLsnParam, targetReachedParam, checkpointLsnParam) \
HRNPQ_MACRO_CHECKPOINT_TARGET_REACHED(sessionParam, "lsn", targetLsnParam, targetReachedParam, checkpointLsnParam)
#define HRNPQ_MACRO_CHECKPOINT_TARGET_REACHED_GE_10( \
sessionParam, targetLsnParam, targetReachedParam, checkpointLsnParam, sleepParam) \
HRNPQ_MACRO_CHECKPOINT_TARGET_REACHED(sessionParam, "lsn", targetLsnParam, targetReachedParam, checkpointLsnParam, sleepParam)
#define HRNPQ_MACRO_REPLAY_WAIT_LE_95(sessionParam, targetLsnParam) \
HRNPQ_MACRO_REPLAY_TARGET_REACHED_LE_96(sessionParam, targetLsnParam, true, "X/X"), \
@ -486,12 +489,12 @@ Macros for defining groups of functions that implement various queries and comma
#define HRNPQ_MACRO_REPLAY_WAIT_96(sessionParam, targetLsnParam) \
HRNPQ_MACRO_REPLAY_TARGET_REACHED_LE_96(sessionParam, targetLsnParam, true, "X/X"), \
HRNPQ_MACRO_CHECKPOINT(sessionParam), \
HRNPQ_MACRO_CHECKPOINT_TARGET_REACHED_96(sessionParam, targetLsnParam, true, "X/X")
HRNPQ_MACRO_CHECKPOINT_TARGET_REACHED_96(sessionParam, targetLsnParam, true, "X/X", 0)
#define HRNPQ_MACRO_REPLAY_WAIT_GE_10(sessionParam, targetLsnParam) \
HRNPQ_MACRO_REPLAY_TARGET_REACHED_GE_10(sessionParam, targetLsnParam, true, "X/X"), \
HRNPQ_MACRO_CHECKPOINT(sessionParam), \
HRNPQ_MACRO_CHECKPOINT_TARGET_REACHED_GE_10(sessionParam, targetLsnParam, true, "X/X")
HRNPQ_MACRO_CHECKPOINT_TARGET_REACHED_GE_10(sessionParam, targetLsnParam, true, "X/X", 0)
#define HRNPQ_MACRO_CLOSE(sessionParam) \
{.session = sessionParam, .function = HRNPQ_FINISH}

View File

@ -470,10 +470,11 @@ testRun(void)
HRNPQ_MACRO_REPLAY_TARGET_REACHED_PROGRESS_GE_10(2, "5/5", false, "5/3", "5/3", false, 250),
HRNPQ_MACRO_REPLAY_TARGET_REACHED_PROGRESS_GE_10(2, "5/5", false, "5/3", "5/3", false, 0),
// Checkpoint target not reached
// Checkpoint target timeout waiting for sync
HRNPQ_MACRO_REPLAY_TARGET_REACHED_GE_10(2, "5/5", true, "5/5"),
HRNPQ_MACRO_CHECKPOINT(2),
HRNPQ_MACRO_CHECKPOINT_TARGET_REACHED_GE_10(2, "5/5", false, "5/4"),
HRNPQ_MACRO_CHECKPOINT_TARGET_REACHED_GE_10(2, "5/5", false, "5/4", 250),
HRNPQ_MACRO_CHECKPOINT_TARGET_REACHED_GE_10(2, "5/5", false, "5/4", 0),
// Wait for standby to sync
HRNPQ_MACRO_REPLAY_TARGET_REACHED_GE_10(2, "5/5", false, "5/3"),
@ -481,7 +482,7 @@ testRun(void)
HRNPQ_MACRO_REPLAY_TARGET_REACHED_PROGRESS_GE_10(2, "5/5", false, "5/4", "5/3", true, 0),
HRNPQ_MACRO_REPLAY_TARGET_REACHED_PROGRESS_GE_10(2, "5/5", true, "5/5", "5/4", true, 0),
HRNPQ_MACRO_CHECKPOINT(2),
HRNPQ_MACRO_CHECKPOINT_TARGET_REACHED_GE_10(2, "5/5", true, "X/X"),
HRNPQ_MACRO_CHECKPOINT_TARGET_REACHED_GE_10(2, "5/5", true, "X/X", 0),
// Close standby
HRNPQ_MACRO_CLOSE(2),
@ -509,8 +510,8 @@ testRun(void)
"timeout before standby replayed to 5/5 - only reached 5/3");
TEST_ERROR(
dbReplayWait(db.standby, STRDEF("5/5"), 1000), ArchiveTimeoutError,
"the checkpoint lsn 5/4 is less than the target lsn 5/5 even though the replay lsn is 5/5");
dbReplayWait(db.standby, STRDEF("5/5"), 200), ArchiveTimeoutError,
"timeout before standby checkpoint lsn reached 5/5 - only reached 5/4");
TEST_RESULT_VOID(dbReplayWait(db.standby, STRDEF("5/5"), 1000), "sync standby");