From 10038db9c9f2b4753c3821e0b246fa8626527184 Mon Sep 17 00:00:00 2001 From: David Steele Date: Wed, 23 Feb 2022 09:14:27 -0600 Subject: [PATCH] Add archive-missing-retry option. Retry a WAL segment that was previously reported as missing by the archive-get command. This prevents notifications in the spool path from a prior restore from being used and possibly causing a recovery failure if consistency has not been reached. Disabling this option allows PostgreSQL to more reliably recognize when the end of the WAL in the archive has been reached, which permits it to switch over to streaming from the primary. With retries enabled, a steady stream of WAL being archived will cause PostgreSQL to continue getting WAL from the archive rather than switch to streaming. When disabling this option it is important to ensure that the spool path for the stanza is empty. The restore command does this automatically if the spool path is configured at restore time. Otherwise, it is up to the user to ensure the spool path is empty. --- doc/xml/release.xml | 11 ++++++++ src/build/config/config.yaml | 9 ++++++ src/build/help/help.xml | 15 ++++++++++ src/command/archive/get/get.c | 8 ++++-- src/config/config.auto.h | 4 ++- src/config/parse.auto.c | 28 +++++++++++++++++++ test/src/module/command/archiveGetTest.c | 35 ++++++++++++++++++++++++ 7 files changed, 107 insertions(+), 3 deletions(-) diff --git a/doc/xml/release.xml b/doc/xml/release.xml index 8d39d51c7..0ff8b8d0f 100644 --- a/doc/xml/release.xml +++ b/doc/xml/release.xml @@ -50,6 +50,17 @@

Add support for AWS S3 server-side encryption using KMS.

+ + + + + + + + +

Add archive-missing-retry option.

+
+ diff --git a/src/build/config/config.yaml b/src/build/config/config.yaml index a0be08355..f47e41793 100644 --- a/src/build/config/config.yaml +++ b/src/build/config/config.yaml @@ -1040,6 +1040,15 @@ option: main: {} async: {} + archive-missing-retry: + section: global + type: boolean + default: true + command: + archive-get: {} + command-role: + main: {} + archive-push-queue-max: section: global type: size diff --git a/src/build/help/help.xml b/src/build/help/help.xml index 49dce5de5..59cae4825 100644 --- a/src/build/help/help.xml +++ b/src/build/help/help.xml @@ -1291,6 +1291,21 @@ n + + + Retry missing WAL segment + + +

Retry a WAL segment that was previously reported as missing by the archive-get command when in asynchronous mode. This prevents notifications in the spool path from a prior restore from being used and possibly causing a recovery failure if consistency has not been reached.

+ +

Disabling this option allows PostgreSQL to more reliably recognize when the end of the WAL in the archive has been reached, which permits it to switch over to streaming from the primary. With retries enabled, a steady stream of WAL being archived will cause PostgreSQL to continue getting WAL from the archive rather than switch to streaming.

+ +

When disabling this option it is important to ensure that the spool path for the stanza is empty. The restore command does this automatically if the spool path is configured at restore time. Otherwise, it is up to the user to ensure the spool path is empty.

+
+ + n +
+ Maximum size of the archive queue. diff --git a/src/command/archive/get/get.c b/src/command/archive/get/get.c index 2339f1c91..99e206414 100644 --- a/src/command/archive/get/get.c +++ b/src/command/archive/get/get.c @@ -647,11 +647,15 @@ cmdArchiveGet(void) // Check if the WAL segment is already in the queue found = storageExistsP(storageSpool(), strNewFmt(STORAGE_SPOOL_ARCHIVE_IN "/%s", strZ(walSegment))); + // Determine whether a missing WAL segment will be retried. Retrying is safer, but not retrying lets PostgreSQL + // know that there are probably no more WAL segments in the archive which means it can switch to streaming. + const bool missingRetry = first && cfgOptionBool(cfgOptArchiveMissingRetry); + // Check for errors or missing files. For archive-get ok indicates that the process succeeded but there is no WAL // file to download, or that there was a warning. Do not error on the first run so the async process can be spawned // to correct any errors from a previous run. Do not warn on the first run if the segment was not found so the async // process can be spawned to check for the file again. - if (archiveAsyncStatus(archiveModeGet, walSegment, !first, found || !first)) + if (archiveAsyncStatus(archiveModeGet, walSegment, !first, found || !missingRetry)) { storageRemoveP( storageSpoolWrite(), strNewFmt(STORAGE_SPOOL_ARCHIVE_IN "/%s" STATUS_EXT_OK, strZ(walSegment)), @@ -662,7 +666,7 @@ cmdArchiveGet(void) // spawned by a prior archive-get execution, which means we should spawn the async process again to see if the // file exists now. This also prevents spool files from a previous recovery interfering with the current // recovery. 
- if (!found && !first) + if (!found && !missingRetry) { foundOk = true; break; diff --git a/src/config/config.auto.h b/src/config/config.auto.h index 9c53a00bf..b4d69aa2c 100644 --- a/src/config/config.auto.h +++ b/src/config/config.auto.h @@ -47,6 +47,7 @@ Option constants #define CFGOPT_ARCHIVE_COPY "archive-copy" #define CFGOPT_ARCHIVE_GET_QUEUE_MAX "archive-get-queue-max" #define CFGOPT_ARCHIVE_HEADER_CHECK "archive-header-check" +#define CFGOPT_ARCHIVE_MISSING_RETRY "archive-missing-retry" #define CFGOPT_ARCHIVE_MODE "archive-mode" #define CFGOPT_ARCHIVE_MODE_CHECK "archive-mode-check" #define CFGOPT_ARCHIVE_PUSH_QUEUE_MAX "archive-push-queue-max" @@ -129,7 +130,7 @@ Option constants #define CFGOPT_TLS_SERVER_PORT "tls-server-port" #define CFGOPT_TYPE "type" -#define CFG_OPTION_TOTAL 153 +#define CFG_OPTION_TOTAL 154 /*********************************************************************************************************************************** Option value constants @@ -360,6 +361,7 @@ typedef enum cfgOptArchiveCopy, cfgOptArchiveGetQueueMax, cfgOptArchiveHeaderCheck, + cfgOptArchiveMissingRetry, cfgOptArchiveMode, cfgOptArchiveModeCheck, cfgOptArchivePushQueueMax, diff --git a/src/config/parse.auto.c b/src/config/parse.auto.c index ad61c2ff2..ba5a485d4 100644 --- a/src/config/parse.auto.c +++ b/src/config/parse.auto.c @@ -896,6 +896,33 @@ static const ParseRuleOption parseRuleOption[CFG_OPTION_TOTAL] = ), ), + // ----------------------------------------------------------------------------------------------------------------------------- + PARSE_RULE_OPTION + ( + PARSE_RULE_OPTION_NAME("archive-missing-retry"), + PARSE_RULE_OPTION_TYPE(cfgOptTypeBoolean), + PARSE_RULE_OPTION_NEGATE(true), + PARSE_RULE_OPTION_RESET(true), + PARSE_RULE_OPTION_REQUIRED(true), + PARSE_RULE_OPTION_SECTION(cfgSectionGlobal), + + PARSE_RULE_OPTION_COMMAND_ROLE_MAIN_VALID_LIST + ( + PARSE_RULE_OPTION_COMMAND(cfgCmdArchiveGet) + ), + + PARSE_RULE_OPTIONAL + ( + 
PARSE_RULE_OPTIONAL_GROUP + ( + PARSE_RULE_OPTIONAL_DEFAULT + ( + PARSE_RULE_VAL_BOOL_TRUE, + ), + ), + ), + ), + // ----------------------------------------------------------------------------------------------------------------------------- PARSE_RULE_OPTION ( @@ -9197,6 +9224,7 @@ static const ConfigOption optionResolveOrder[] = cfgOptArchiveAsync, cfgOptArchiveGetQueueMax, cfgOptArchiveHeaderCheck, + cfgOptArchiveMissingRetry, cfgOptArchiveMode, cfgOptArchivePushQueueMax, cfgOptArchiveTimeout, diff --git a/test/src/module/command/archiveGetTest.c b/test/src/module/command/archiveGetTest.c index 2ab529d25..82c02495b 100644 --- a/test/src/module/command/archiveGetTest.c +++ b/test/src/module/command/archiveGetTest.c @@ -1119,6 +1119,41 @@ testRun(void) // Check that the ok file is missing since it should have been removed on the first loop and removed again on a subsequent // loop once the async process discovered that the file was missing and wrote the ok file again. TEST_STORAGE_LIST_EMPTY(storageSpool(), STORAGE_SPOOL_ARCHIVE_IN); + + // ------------------------------------------------------------------------------------------------------------------------- + TEST_TITLE("do not retry missing segment"); + + argList = strLstNew(); + hrnCfgArgRawZ(argList, cfgOptPgPath, TEST_PATH "/pg"); + hrnCfgArgRawZ(argList, cfgOptRepoPath, TEST_PATH "/repo"); + hrnCfgArgRawZ(argList, cfgOptStanza, "test1"); + hrnCfgArgRawZ(argList, cfgOptArchiveTimeout, "10"); + hrnCfgArgRawZ(argList, cfgOptSpoolPath, TEST_PATH "/spool"); + hrnCfgArgRawBool(argList, cfgOptArchiveAsync, true); + hrnCfgArgRawBool(argList, cfgOptArchiveMissingRetry, false); + strLstAddZ(argList, "000000010000000100000001"); + strLstAddZ(argList, "pg_wal/RECOVERYXLOG"); + HRN_CFG_LOAD(cfgCmdArchiveGet, argList); + + // Make sure that a WAL segment is found when the ok file is missing + HRN_STORAGE_PUT_EMPTY( + storageRepoWrite(), STORAGE_REPO_ARCHIVE 
"/10-2/000000010000000100000001-abcdabcdabcdabcdabcdabcdabcdabcdabcdabcd"); + + TEST_RESULT_VOID(cmdArchiveGet(), "get async"); + TEST_RESULT_LOG("P00 INFO: found 000000010000000100000001 in the archive asynchronously"); + + // Remove the ok file created by the async process + TEST_STORAGE_LIST(storageSpoolWrite(), STORAGE_SPOOL_ARCHIVE_IN, "000000010000000100000002.ok\n", .remove = true); + + // Write an ok file + HRN_STORAGE_PUT_EMPTY(storageSpoolWrite(), STORAGE_SPOOL_ARCHIVE_IN "/000000010000000100000001.ok"); + + // Missing should be returned since archive-missing-retry=n + TEST_RESULT_VOID(cmdArchiveGet(), "get async"); + TEST_RESULT_LOG("P00 INFO: unable to find 000000010000000100000001 in the archive asynchronously"); + + // Check that the ok file was removed + TEST_STORAGE_LIST_EMPTY(storageSpool(), STORAGE_SPOOL_ARCHIVE_IN); } FUNCTION_HARNESS_RETURN_VOID();