diff --git a/doc/xml/release.xml b/doc/xml/release.xml index c4ad16743..2c1296689 100644 --- a/doc/xml/release.xml +++ b/doc/xml/release.xml @@ -57,6 +57,21 @@

Add recovery start time to online backup restore log.

+ + + + + + + + + + + + +

Report original error and retries on local job failure.

+
+ diff --git a/src/protocol/server.c b/src/protocol/server.c index 44a991a60..03364b569 100644 --- a/src/protocol/server.c +++ b/src/protocol/server.c @@ -182,9 +182,16 @@ protocolServerProcess( // Send the command to the handler MEM_CONTEXT_TEMP_BEGIN() { + // Variables to store first error message and retry messages + const ErrorType *errType = NULL; + String *errMessage = NULL; + const String *errMessageFirst = NULL; + const String *errStackTrace = NULL; + // Initialize retries in case of command failure bool retry = false; unsigned int retryRemaining = retryInterval != NULL ? varLstSize(retryInterval) : 0; + TimeMSec retrySleepMs = 0; // Handler retry loop do @@ -197,12 +204,33 @@ protocolServerProcess( } CATCH_ANY() { + // On first error record the error details. Only the first error will contain a stack trace since + // the first error is most likely to contain valuable information. + if (errType == NULL) + { + errType = errorType(); + errMessage = strNewZ(errorMessage()); + errMessageFirst = strNewZ(errorMessage()); + errStackTrace = strNewZ(errorStackTrace()); + } + // Else on a retry error only record the error type and message. Retry errors are less likely to + // contain valuable information but may be helpful for debugging. + else + { + strCatFmt( + errMessage, "\n[%s] on retry after %" PRIu64 "ms", errorTypeName(errorType()), + retrySleepMs); + + // Only append the message if it differs from the first message + if (!strEqZ(errMessageFirst, errorMessage())) + strCatFmt(errMessage, ": %s", errorMessage()); + } + // Are there retries remaining? if (retryRemaining > 0) { // Get the sleep interval for this retry - TimeMSec retrySleepMs = varUInt64( - varLstGet(retryInterval, varLstSize(retryInterval) - retryRemaining)); + retrySleepMs = varUInt64(varLstGet(retryInterval, varLstSize(retryInterval) - retryRemaining)); // Log the retry LOG_DEBUG_FMT( @@ -222,7 +250,7 @@ protocolServerProcess( } // Else report error to the client else - protocolServerError(this, errorCode(), STR(errorMessage()), STR(errorStackTrace())); + protocolServerError(this, errorTypeCode(errType), errMessage, errStackTrace); } TRY_END(); } diff --git a/test/src/module/command/archiveGetTest.c b/test/src/module/command/archiveGetTest.c index 34b067018..f60e76ade 100644 --- a/test/src/module/command/archiveGetTest.c +++ b/test/src/module/command/archiveGetTest.c @@ -533,7 +533,8 @@ testRun(void) " repo1: 10-1/0000000100000002/000000010000000200000000-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa.gz" " [FormatError] unexpected eof in compressed data\n" " repo2: 10-1/0000000100000002/000000010000000200000000-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa.gz" - " [FormatError] unexpected eof in compressed data"); + " [FormatError] unexpected eof in compressed data\n" + " [FileReadError] on retry after 0ms"); TEST_STORAGE_GET( storageSpoolWrite(), STORAGE_SPOOL_ARCHIVE_IN "/000000010000000200000000.error", @@ -543,6 +544,7 @@ testRun(void) " [FormatError] unexpected eof in compressed data\n" "repo2: 10-1/0000000100000002/000000010000000200000000-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa.gz" " [FormatError] unexpected eof in compressed data\n" + "[FileReadError] on retry after 0ms\n" "repo3: [ArchiveMismatchError] unable to retrieve the archive id for database version '10' and system-id" " '18072658121562454734'", .remove = true); diff --git a/test/src/module/command/archivePushTest.c b/test/src/module/command/archivePushTest.c index 6605b6ed2..54bf2ff86 100644 --- a/test/src/module/command/archivePushTest.c +++ b/test/src/module/command/archivePushTest.c @@ -835,7 +835,8 @@ testRun(void) " HINT: this is valid in some recovery scenarios but may also indicate a problem.\n" "P01 DETAIL: pushed WAL file '000000010000000100000001' to the archive\n" "P01 WARN: could not push WAL file '000000010000000100000002' to the archive (will be retried): " - "[55] raised from local-1 shim protocol: " STORAGE_ERROR_READ_MISSING, + "[55] raised from local-1 shim protocol: " STORAGE_ERROR_READ_MISSING "\n" + " [FileMissingError] on retry after 0ms", TEST_PATH "/pg/pg_xlog/000000010000000100000002"); TEST_STORAGE_EXISTS( diff --git a/test/src/module/command/restoreTest.c b/test/src/module/command/restoreTest.c index 31966364c..0bda9e168 100644 --- a/test/src/module/command/restoreTest.c +++ b/test/src/module/command/restoreTest.c @@ -2743,7 +2743,8 @@ testRun(void) TEST_ERROR( cmdRestore(), FileMissingError, "raised from local-1 shim protocol: unable to open missing file" - " '" TEST_PATH "/repo/backup/test1/20161219-212741F_20161219-212918I/pg_data/global/pg_control' for read"); + " '" TEST_PATH "/repo/backup/test1/20161219-212741F_20161219-212918I/pg_data/global/pg_control' for read\n" + "[FileMissingError] on retry after 0ms"); // Free local processes that were not freed because of the error protocolFree(); diff --git a/test/src/module/command/verifyTest.c b/test/src/module/command/verifyTest.c index ac0e89af3..39ac79ac7 100644 --- a/test/src/module/command/verifyTest.c +++ b/test/src/module/command/verifyTest.c @@ -1174,6 +1174,7 @@ testRun(void) "local-1 shim protocol: unable to open file '" TEST_PATH "/repo/archive/db/" "11-2/0000000200000008/000000020000000800000003-656817043007aa2100c44c712bcb456db705dab9' for read:" " [13] Permission denied\n" + " [FileOpenError] on retry after 0ms\n" "P00 WARN: unable to open missing file '" TEST_PATH "/repo/backup/db/20181119-152800F/backup.manifest' for read\n" "P00 WARN: unable to open missing file '" TEST_PATH "/repo/backup/db/20181119-152800F/backup.manifest.copy'" " for read\n" @@ -1188,6 +1189,7 @@ testRun(void) "P01 ERROR: [039]: invalid result UNPROCESSEDBACKUP/pg_data/testother: [41] raised from local-1 shim protocol:" " unable to open file '" TEST_PATH "/repo/backup/db/UNPROCESSEDBACKUP/pg_data/testother' for read: [13]" " Permission denied\n" + " [FileOpenError] on retry after 0ms\n" "P00 DETAIL: archiveId: 11-2, wal start: 000000020000000700000FFD, wal stop: 000000020000000800000000\n" "P00 DETAIL: archiveId: 11-2, wal start: 000000020000000800000002, wal stop: 000000020000000800000003\n" "P00 DETAIL: archiveId: 11-2, wal start: 000000030000000000000000, wal stop: 000000030000000000000001\n" diff --git a/test/src/module/protocol/protocolTest.c b/test/src/module/protocol/protocolTest.c index c28bbc39a..f0c8ff98a 100644 --- a/test/src/module/protocol/protocolTest.c +++ b/test/src/module/protocol/protocolTest.c @@ -20,6 +20,8 @@ Test protocol server command handlers ***********************************************************************************************************************************/ #define TEST_PROTOCOL_COMMAND_ASSERT STRID5("assert", 0x2922ce610) +static unsigned int testCommandAssertProtocolTotal = 0; + __attribute__((__noreturn__)) static void testCommandAssertProtocol(PackRead *const param, ProtocolServer *const server) { @@ -31,7 +33,8 @@ testCommandAssertProtocol(PackRead *const param, ProtocolServer *const server) ASSERT(param == NULL); ASSERT(server != NULL); - hrnErrorThrowP(); + testCommandAssertProtocolTotal++; + hrnErrorThrowP(.message = testCommandAssertProtocolTotal <= 3 ? NULL : "ERR_MESSAGE_RETRY"); // No FUNCTION_HARNESS_RETURN_VOID() because the function does not return } @@ -525,6 +528,7 @@ testRun(void) VariantList *retryList = varLstNew(); varLstAdd(retryList, varNewUInt64(0)); + varLstAdd(retryList, varNewUInt64(500)); TEST_ASSIGN( server, protocolServerNew(STRDEF("test server"), STRDEF("test"), HRN_FORK_CHILD_READ(), HRN_FORK_CHILD_WRITE()), @@ -666,6 +670,15 @@ testRun(void) pckReadBoolP(protocolClientExecute(client, protocolCommandNew(TEST_PROTOCOL_COMMAND_RETRY), true)), true, "execute"); + // ----------------------------------------------------------------------------------------------------------------- + TEST_TITLE("command throws assert with retry messages"); + + TEST_ERROR( + protocolClientExecute(client, protocolCommandNew(TEST_PROTOCOL_COMMAND_ASSERT), false), AssertError, + "raised from test client: ERR_MESSAGE\n" + "[AssertError] on retry after 0ms\n" + "[AssertError] on retry after 500ms: ERR_MESSAGE_RETRY"); + // ----------------------------------------------------------------------------------------------------------------- TEST_TITLE("free client");