1
0
mirror of https://github.com/pgbackrest/pgbackrest.git synced 2024-12-12 10:04:14 +02:00

Fix issues with leftover spool files from a prior restore.

If an ok file (which indicates the WAL segment was not found) is present on the first iteration of the loop then remove it and spawn the async process to retry. This action also resets the queue.

Also error if no response is received from the async process rather than returning not found. PostgreSQL will respond the same either way, but this allows us to determine when something is going wrong with the async process.

Update archiveAsyncStatus() to allow warnings to be suppressed. It is better to retry if no WAL segment was found before warning because the warning might be stale.
This commit is contained in:
David Steele 2021-05-13 17:51:39 -04:00 committed by GitHub
parent 34dd6636b8
commit f45e76fa2e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 127 additions and 40 deletions

View File

@ -15,6 +15,22 @@
<release date="XXXX-XX-XX" version="2.34dev" title="UNDER DEVELOPMENT">
<release-core-list>
<release-bug-list>
<release-item>
<github-issue id="1357"/>
<github-pull-request id="1371"/>
<release-item-contributor-list>
<release-item-ideator id="floris.van.nee"/>
<release-item-contributor id="david.steele"/>
<release-item-reviewer id="cynthia.shang"/>
<release-item-reviewer id="stefan.fercot"/>
<!-- Actually tester, but we don't have a tag for that yet -->
<release-item-reviewer id="floris.van.nee"/>
</release-item-contributor-list>
<p>Fix issues with leftover spool files from a prior <cmd>restore</cmd>.</p>
</release-item>
<release-item>
<github-issue id="1368"/>
<github-pull-request id="1374"/>

View File

@ -71,12 +71,13 @@ archiveAsyncErrorClear(ArchiveMode archiveMode, const String *archiveFile)
/**********************************************************************************************************************************/
bool
archiveAsyncStatus(ArchiveMode archiveMode, const String *walSegment, bool throwOnError)
archiveAsyncStatus(ArchiveMode archiveMode, const String *walSegment, bool throwOnError, bool warnOnOk)
{
FUNCTION_LOG_BEGIN(logLevelDebug);
FUNCTION_LOG_PARAM(STRING_ID, archiveMode);
FUNCTION_LOG_PARAM(STRING, walSegment);
FUNCTION_LOG_PARAM(BOOL, throwOnError);
FUNCTION_LOG_PARAM(BOOL, warnOnOk);
FUNCTION_LOG_END();
ASSERT(walSegment != NULL);
@ -143,7 +144,7 @@ archiveAsyncStatus(ArchiveMode archiveMode, const String *walSegment, bool throw
if (okFileExists)
{
// If there is content in the status file it is a warning
if (strSize(content) != 0)
if (strSize(content) != 0 && warnOnOk)
{
// If error code is not success, then this was a renamed error file
if (code != 0)

View File

@ -69,8 +69,9 @@ Functions
// old error may be reported rather than waiting for the async process to succeed or fail.
void archiveAsyncErrorClear(ArchiveMode archiveMode, const String *archiveFile);
// Check for ok/error status files in the spool in/out directory
bool archiveAsyncStatus(ArchiveMode archiveMode, const String *walSegment, bool throwOnError);
// Check for ok/error status files in the spool in/out directory. throwOnError determines whether an error will be thrown when an
// error file is found. warnOnOk determines whether a warning will be output when found in an ok file.
bool archiveAsyncStatus(ArchiveMode archiveMode, const String *walSegment, bool throwOnError, bool warnOnOk);
// Write an ok status file
void archiveAsyncStatusOkWrite(ArchiveMode archiveMode, const String *walSegment, const String *warning);

View File

@ -631,10 +631,11 @@ cmdArchiveGet(void)
// Async get can only be performed on WAL segments, history or other files must use synchronous mode
if (cfgOptionBool(cfgOptArchiveAsync) && walIsSegment(walSegment))
{
bool first = true; // Is this the first time the loop has run?
bool found = false; // Has the WAL segment been found yet?
bool foundOk = false; // Was an OK file found which confirms the file was missing?
bool queueFull = false; // Is the queue half or more full?
bool forked = false; // Has the async process been forked yet?
bool throwOnError = false; // Should we throw errors?
// Loop and wait for the WAL segment to be pushed
Wait *wait = waitNew(cfgOptionUInt64(cfgOptArchiveTimeout));
@ -645,15 +646,25 @@ cmdArchiveGet(void)
found = storageExistsP(storageSpool(), strNewFmt(STORAGE_SPOOL_ARCHIVE_IN "/%s", strZ(walSegment)));
// Check for errors or missing files. For archive-get ok indicates that the process succeeded but there is no WAL
// file to download, or that there was a warning.
if (archiveAsyncStatus(archiveModeGet, walSegment, throwOnError))
// file to download, or that there was a warning. Do not error on the first run so the async process can be spawned
// to correct any errors from a previous run. Do not warn on the first run if the segment was not found so the async
// process can be spawned to check for the file again.
if (archiveAsyncStatus(archiveModeGet, walSegment, !first, found || !first))
{
storageRemoveP(
storageSpoolWrite(), strNewFmt(STORAGE_SPOOL_ARCHIVE_IN "/%s" STATUS_EXT_OK, strZ(walSegment)),
.errorOnMissing = true);
if (!found)
// Break if an ok file was found but no segment exists, which means the segment was missing. However, don't
// break if this is the first time through the loop since this means the ok file was written by an async process
// spawned by a prior archive-get execution, which means we should spawn the async process again to see if the
// file exists now. This also prevents spool files from a previous recovery interfering with the current
// recovery.
if (!found && !first)
{
foundOk = true;
break;
}
}
// If found then move the WAL segment to the destination directory
@ -746,14 +757,26 @@ cmdArchiveGet(void)
if (found)
break;
// Now that the async process has been launched, throw any errors that are found
throwOnError = true;
// No longer the first run, so errors will be thrown and missing files will be reported
first = false;
}
while (waitMore(wait));
// Log that the file was not found
if (result == 1)
LOG_INFO_FMT(UNABLE_TO_FIND_IN_ARCHIVE_MSG " asynchronously", strZ(walSegment));
// If the WAL segment was not found
if (!found)
{
// If no ok file was found then something may be wrong with the async process. It's better to throw an error here
// than report not found for debugging purposes. Either way PostgreSQL will halt if it has not reached consistency.
if (!foundOk)
{
THROW_FMT(
ArchiveTimeoutError, "unable to get WAL file '%s' from the archive asynchronously after %s second(s)",
strZ(walSegment), strZ(strNewDbl((double)cfgOptionInt64(cfgOptArchiveTimeout) / MSEC_PER_SEC)));
}
// Else report that the WAL segment could not be found
else
LOG_INFO_FMT(UNABLE_TO_FIND_IN_ARCHIVE_MSG " asynchronously", strZ(walSegment));
}
}
// Else perform synchronous get
else
@ -982,7 +1005,9 @@ cmdArchiveGetAsync(void)
archiveAsyncStatusErrorWrite(
archiveModeGet, checkResult.errorFile, errorTypeCode(checkResult.errorType), message);
}
// Else log a warning if any files were missing
// If any files were missing write an ok file for the first missing file and add any warnings. It is important that this
// happen right before the async process exits so the main process can immediately respawn the async process to retry
// missing files.
else if (archiveFileMissing != NULL)
{
LOG_DETAIL_FMT(UNABLE_TO_FIND_IN_ARCHIVE_MSG, strZ(archiveFileMissing));

View File

@ -345,7 +345,7 @@ cmdArchivePush(void)
{
// Check if the WAL segment has been pushed. Errors will not be thrown on the first try to allow the async process
// a chance to fix them.
pushed = archiveAsyncStatus(archiveModePush, archiveFile, throwOnError);
pushed = archiveAsyncStatus(archiveModePush, archiveFile, throwOnError, true);
// If the WAL segment has not already been pushed then start the async process to push it. There's no point in
// forking the async process off more than once so track that as well. Use an archive lock to prevent more than

View File

@ -8,6 +8,7 @@ Test Archive Common
#include "common/harnessConfig.h"
#include "common/harnessFork.h"
#include "common/harnessStorage.h"
/***********************************************************************************************************************************
Test Run
@ -33,15 +34,15 @@ testRun(void)
// -------------------------------------------------------------------------------------------------------------------------
String *segment = strNew("000000010000000100000001");
TEST_RESULT_BOOL(archiveAsyncStatus(archiveModePush, segment, false), false, "directory and status file not present");
TEST_RESULT_BOOL(archiveAsyncStatus(archiveModeGet, segment, false), false, "directory and status file not present");
TEST_RESULT_BOOL(archiveAsyncStatus(archiveModePush, segment, false, true), false, "directory and status file not present");
TEST_RESULT_BOOL(archiveAsyncStatus(archiveModeGet, segment, false, true), false, "directory and status file not present");
// -------------------------------------------------------------------------------------------------------------------------
mkdir(strZ(strNewFmt("%s/archive", testPath())), 0750);
mkdir(strZ(strNewFmt("%s/archive/db", testPath())), 0750);
mkdir(strZ(strNewFmt("%s/archive/db/out", testPath())), 0750);
TEST_RESULT_BOOL(archiveAsyncStatus(archiveModePush, segment, false), false, "status file not present");
TEST_RESULT_BOOL(archiveAsyncStatus(archiveModePush, segment, false, true), false, "status file not present");
// -------------------------------------------------------------------------------------------------------------------------
TEST_TITLE("clear archive file errors");
@ -62,34 +63,44 @@ testRun(void)
storageNewWriteP(storageSpoolWrite(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.ok", strZ(segment))),
BUFSTRDEF(BOGUS_STR));
TEST_ERROR(
archiveAsyncStatus(archiveModePush, segment, false), FormatError,
archiveAsyncStatus(archiveModePush, segment, false, true), FormatError,
"000000010000000100000001.ok content must have at least two lines");
storagePutP(
storageNewWriteP(storageSpoolWrite(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.ok", strZ(segment))),
BUFSTRDEF(BOGUS_STR "\n"));
TEST_ERROR(
archiveAsyncStatus(archiveModePush, segment, false), FormatError, "000000010000000100000001.ok message must be > 0");
archiveAsyncStatus(archiveModePush, segment, false, true), FormatError,
"000000010000000100000001.ok message must be > 0");
storagePutP(
storageNewWriteP(storageSpoolWrite(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.ok", strZ(segment))),
BUFSTRDEF(BOGUS_STR "\nmessage"));
TEST_ERROR(
archiveAsyncStatus(archiveModePush, segment, false), FormatError, "unable to convert base 10 string 'BOGUS' to int");
archiveAsyncStatus(archiveModePush, segment, false, true),
FormatError, "unable to convert base 10 string 'BOGUS' to int");
storagePutP(storageNewWriteP(storageSpoolWrite(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.ok", strZ(segment))), NULL);
TEST_RESULT_BOOL(archiveAsyncStatus(archiveModePush, segment, false), true, "ok file");
TEST_RESULT_BOOL(archiveAsyncStatus(archiveModePush, segment, false, true), true, "ok file");
storagePutP(
storageNewWriteP(storageSpoolWrite(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.ok", strZ(segment))),
BUFSTRDEF("0\nwarning"));
TEST_RESULT_BOOL(archiveAsyncStatus(archiveModePush, segment, false), true, "ok file with warning");
TEST_RESULT_BOOL(archiveAsyncStatus(archiveModePush, segment, false, true), true, "ok file with warning");
harnessLogResult("P00 WARN: warning");
// -------------------------------------------------------------------------------------------------------------------------
TEST_TITLE("ignore ok file warning");
HRN_STORAGE_PUT_Z(storageSpoolWrite(), STORAGE_SPOOL_ARCHIVE_OUT "/000000010000000100000001.ok", "0\nwarning 2");
TEST_RESULT_BOOL(archiveAsyncStatus(archiveModePush, segment, false, false), true, "check status");
TEST_RESULT_LOG("");
// -------------------------------------------------------------------------------------------------------------------------
storagePutP(
storageNewWriteP(storageSpoolWrite(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.ok", strZ(segment))),
BUFSTRDEF("25\nerror"));
TEST_RESULT_BOOL(archiveAsyncStatus(archiveModePush, segment, false), true, "error status renamed to ok");
TEST_RESULT_BOOL(archiveAsyncStatus(archiveModePush, segment, false, true), true, "error status renamed to ok");
harnessLogResult(
"P00 WARN: WAL segment '000000010000000100000001' was not pushed due to error [25] and was manually skipped: error");
TEST_RESULT_VOID(
@ -101,22 +112,22 @@ testRun(void)
storagePutP(
storageNewWriteP(storageSpoolWrite(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.error", strZ(segment))), bufNew(0));
TEST_ERROR(
archiveAsyncStatus(archiveModePush, segment, true), AssertError,
archiveAsyncStatus(archiveModePush, segment, true, true), AssertError,
"status file '000000010000000100000001.error' has no content");
storagePutP(
storageNewWriteP(storageSpoolWrite(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.error", strZ(segment))),
BUFSTRDEF("25\nmessage"));
TEST_ERROR(archiveAsyncStatus(archiveModePush, segment, true), AssertError, "message");
TEST_ERROR(archiveAsyncStatus(archiveModePush, segment, true, true), AssertError, "message");
TEST_RESULT_BOOL(archiveAsyncStatus(archiveModePush, segment, false), false, "suppress error");
TEST_RESULT_BOOL(archiveAsyncStatus(archiveModePush, segment, false, true), false, "suppress error");
// -------------------------------------------------------------------------------------------------------------------------
storagePutP(
storageNewWriteP(storageSpoolWrite(), strNew(STORAGE_SPOOL_ARCHIVE_OUT "/global.error")),
BUFSTRDEF("102\nexecute error"));
TEST_ERROR(archiveAsyncStatus(archiveModePush, strNew("anyfile"), true), ExecuteError, "execute error");
TEST_ERROR(archiveAsyncStatus(archiveModePush, strNew("anyfile"), true, true), ExecuteError, "execute error");
}
// *****************************************************************************************************************************

View File

@ -654,17 +654,17 @@ testRun(void)
THROW_ON_SYS_ERROR(chdir(strZ(cfgOptionStr(cfgOptPgPath))) != 0, PathMissingError, "unable to chdir()");
TEST_RESULT_INT(cmdArchiveGet(), 1, "timeout getting WAL segment");
harnessLogResult("P00 INFO: unable to find 000000010000000100000001 in the archive asynchronously");
TEST_ERROR(
cmdArchiveGet(), ArchiveTimeoutError,
"unable to get WAL file '000000010000000100000001' from the archive asynchronously after 1 second(s)");
// Check for missing WAL
// -------------------------------------------------------------------------------------------------------------------------
HRN_STORAGE_PUT_EMPTY(storageSpoolWrite(), STORAGE_SPOOL_ARCHIVE_IN "/000000010000000100000001.ok");
TEST_RESULT_INT(cmdArchiveGet(), 1, "successful get of missing WAL");
harnessLogResult("P00 INFO: unable to find 000000010000000100000001 in the archive asynchronously");
TEST_ERROR(
cmdArchiveGet(), ArchiveTimeoutError,
"unable to get WAL file '000000010000000100000001' from the archive asynchronously after 1 second(s)");
TEST_RESULT_BOOL(
storageExistsP(storageSpool(), STRDEF(STORAGE_SPOOL_ARCHIVE_IN "/000000010000000100000001.ok")), false,
@ -712,11 +712,11 @@ testRun(void)
IoWrite *write = ioFdWriteNew(strNew("child write"), HARNESS_FORK_CHILD_WRITE(), 2000);
ioWriteOpen(write);
TEST_RESULT_VOID(
lockAcquire(
TEST_RESULT_VOID(
lockAcquire(
cfgOptionStr(cfgOptLockPath), cfgOptionStr(cfgOptStanza), STRDEF("999-dededede"), cfgLockType(), 30000,
true),
"acquire lock");
"acquire lock");
// Let the parent know the lock has been acquired and wait for the parent to allow lock release
ioWriteStrLine(write, strNew(""));
@ -737,9 +737,9 @@ testRun(void)
// Wait for the child to acquire the lock
ioReadLine(read);
TEST_RESULT_INT(cmdArchiveGet(), 1, "timeout waiting for lock");
harnessLogResult("P00 INFO: unable to find 000000010000000100000001 in the archive asynchronously");
TEST_ERROR(
cmdArchiveGet(), ArchiveTimeoutError,
"unable to get WAL file '000000010000000100000001' from the archive asynchronously after 1 second(s)");
// Notify the child to release the lock
ioWriteLine(write, bufNew(0));
@ -1109,6 +1109,39 @@ testRun(void)
.remove = true);
bufUsedSet(serverWrite, 0);
// -------------------------------------------------------------------------------------------------------------------------
TEST_TITLE("no segments to find with existing ok file");
argList = strLstNew();
hrnCfgArgRawZ(argList, cfgOptPgPath, TEST_PATH_PG);
hrnCfgArgRawZ(argList, cfgOptRepoPath, TEST_PATH_REPO);
hrnCfgArgRawZ(argList, cfgOptStanza, "test1");
hrnCfgArgRawZ(argList, cfgOptArchiveTimeout, "10");
hrnCfgArgRawZ(argList, cfgOptSpoolPath, TEST_PATH_SPOOL);
hrnCfgArgRawBool(argList, cfgOptArchiveAsync, true);
strLstAddZ(argList, "000000010000000100000001");
strLstAddZ(argList, "pg_wal/RECOVERYXLOG");
harnessCfgLoad(cfgCmdArchiveGet, argList);
HRN_INFO_PUT(
storageRepoIdxWrite(0), INFO_ARCHIVE_PATH_FILE,
"[db]\n"
"db-id=2\n"
"\n"
"[db:history]\n"
"2={\"db-id\":18072658121562454734,\"db-version\":\"10\"}");
// Put a warning in the file to show that it was read and later overwritten
HRN_STORAGE_PUT_Z(storageSpoolWrite(), STORAGE_SPOOL_ARCHIVE_IN "/000000010000000100000001.ok", "0\nshould not be output");
TEST_RESULT_VOID(cmdArchiveGet(), "get async");
harnessLogResult("P00 INFO: unable to find 000000010000000100000001 in the archive asynchronously");
// Check that the ok file is missing since it should have been removed on the first loop and removed again on a subsequent
// loop once the async process discovered that the file was missing and wrote the ok file again.
TEST_STORAGE_LIST_EMPTY(storageSpool(), STORAGE_SPOOL_ARCHIVE_IN);
}
FUNCTION_HARNESS_RETURN_VOID();