diff --git a/doc/xml/release.xml b/doc/xml/release.xml
index c4ad16743..2c1296689 100644
--- a/doc/xml/release.xml
+++ b/doc/xml/release.xml
@@ -57,6 +57,21 @@
Add recovery start time to online backup restore log.
+
+
+
+
+
+
+
+
+
+
+
+
+ Report original error and retries on local job failure.
+
+
diff --git a/src/protocol/server.c b/src/protocol/server.c
index 44a991a60..03364b569 100644
--- a/src/protocol/server.c
+++ b/src/protocol/server.c
@@ -182,9 +182,16 @@ protocolServerProcess(
// Send the command to the handler
MEM_CONTEXT_TEMP_BEGIN()
{
+ // Variables to store first error message and retry messages
+ const ErrorType *errType = NULL;
+ String *errMessage = NULL;
+ const String *errMessageFirst = NULL;
+ const String *errStackTrace = NULL;
+
// Initialize retries in case of command failure
bool retry = false;
unsigned int retryRemaining = retryInterval != NULL ? varLstSize(retryInterval) : 0;
+ TimeMSec retrySleepMs = 0;
// Handler retry loop
do
@@ -197,12 +204,33 @@ protocolServerProcess(
}
CATCH_ANY()
{
+ // On first error record the error details. Only the first error will contain a stack trace since
+ // the first error is most likely to contain valuable information.
+ if (errType == NULL)
+ {
+ errType = errorType();
+ errMessage = strNewZ(errorMessage());
+ errMessageFirst = strNewZ(errorMessage());
+ errStackTrace = strNewZ(errorStackTrace());
+ }
+ // Else on a retry error only record the error type and message. Retry errors are less likely to
+ // contain valuable information but may be helpful for debugging.
+ else
+ {
+ strCatFmt(
+ errMessage, "\n[%s] on retry after %" PRIu64 "ms", errorTypeName(errorType()),
+ retrySleepMs);
+
+ // Only append the message if it differs from the first message
+ if (!strEqZ(errMessageFirst, errorMessage()))
+ strCatFmt(errMessage, ": %s", errorMessage());
+ }
+
// Are there retries remaining?
if (retryRemaining > 0)
{
// Get the sleep interval for this retry
- TimeMSec retrySleepMs = varUInt64(
- varLstGet(retryInterval, varLstSize(retryInterval) - retryRemaining));
+ retrySleepMs = varUInt64(varLstGet(retryInterval, varLstSize(retryInterval) - retryRemaining));
// Log the retry
LOG_DEBUG_FMT(
@@ -222,7 +250,7 @@ protocolServerProcess(
}
// Else report error to the client
else
- protocolServerError(this, errorCode(), STR(errorMessage()), STR(errorStackTrace()));
+ protocolServerError(this, errorTypeCode(errType), errMessage, errStackTrace);
}
TRY_END();
}
diff --git a/test/src/module/command/archiveGetTest.c b/test/src/module/command/archiveGetTest.c
index 34b067018..f60e76ade 100644
--- a/test/src/module/command/archiveGetTest.c
+++ b/test/src/module/command/archiveGetTest.c
@@ -533,7 +533,8 @@ testRun(void)
" repo1: 10-1/0000000100000002/000000010000000200000000-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa.gz"
" [FormatError] unexpected eof in compressed data\n"
" repo2: 10-1/0000000100000002/000000010000000200000000-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa.gz"
- " [FormatError] unexpected eof in compressed data");
+ " [FormatError] unexpected eof in compressed data\n"
+ " [FileReadError] on retry after 0ms");
TEST_STORAGE_GET(
storageSpoolWrite(), STORAGE_SPOOL_ARCHIVE_IN "/000000010000000200000000.error",
@@ -543,6 +544,7 @@ testRun(void)
" [FormatError] unexpected eof in compressed data\n"
"repo2: 10-1/0000000100000002/000000010000000200000000-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa.gz"
" [FormatError] unexpected eof in compressed data\n"
+ "[FileReadError] on retry after 0ms\n"
"repo3: [ArchiveMismatchError] unable to retrieve the archive id for database version '10' and system-id"
" '18072658121562454734'",
.remove = true);
diff --git a/test/src/module/command/archivePushTest.c b/test/src/module/command/archivePushTest.c
index 6605b6ed2..54bf2ff86 100644
--- a/test/src/module/command/archivePushTest.c
+++ b/test/src/module/command/archivePushTest.c
@@ -835,7 +835,8 @@ testRun(void)
" HINT: this is valid in some recovery scenarios but may also indicate a problem.\n"
"P01 DETAIL: pushed WAL file '000000010000000100000001' to the archive\n"
"P01 WARN: could not push WAL file '000000010000000100000002' to the archive (will be retried): "
- "[55] raised from local-1 shim protocol: " STORAGE_ERROR_READ_MISSING,
+ "[55] raised from local-1 shim protocol: " STORAGE_ERROR_READ_MISSING "\n"
+ " [FileMissingError] on retry after 0ms",
TEST_PATH "/pg/pg_xlog/000000010000000100000002");
TEST_STORAGE_EXISTS(
diff --git a/test/src/module/command/restoreTest.c b/test/src/module/command/restoreTest.c
index 31966364c..0bda9e168 100644
--- a/test/src/module/command/restoreTest.c
+++ b/test/src/module/command/restoreTest.c
@@ -2743,7 +2743,8 @@ testRun(void)
TEST_ERROR(
cmdRestore(), FileMissingError,
"raised from local-1 shim protocol: unable to open missing file"
- " '" TEST_PATH "/repo/backup/test1/20161219-212741F_20161219-212918I/pg_data/global/pg_control' for read");
+ " '" TEST_PATH "/repo/backup/test1/20161219-212741F_20161219-212918I/pg_data/global/pg_control' for read\n"
+ "[FileMissingError] on retry after 0ms");
// Free local processes that were not freed because of the error
protocolFree();
diff --git a/test/src/module/command/verifyTest.c b/test/src/module/command/verifyTest.c
index ac0e89af3..39ac79ac7 100644
--- a/test/src/module/command/verifyTest.c
+++ b/test/src/module/command/verifyTest.c
@@ -1174,6 +1174,7 @@ testRun(void)
"local-1 shim protocol: unable to open file '" TEST_PATH "/repo/archive/db/"
"11-2/0000000200000008/000000020000000800000003-656817043007aa2100c44c712bcb456db705dab9' for read:"
" [13] Permission denied\n"
+ " [FileOpenError] on retry after 0ms\n"
"P00 WARN: unable to open missing file '" TEST_PATH "/repo/backup/db/20181119-152800F/backup.manifest' for read\n"
"P00 WARN: unable to open missing file '" TEST_PATH "/repo/backup/db/20181119-152800F/backup.manifest.copy'"
" for read\n"
@@ -1188,6 +1189,7 @@ testRun(void)
"P01 ERROR: [039]: invalid result UNPROCESSEDBACKUP/pg_data/testother: [41] raised from local-1 shim protocol:"
" unable to open file '" TEST_PATH "/repo/backup/db/UNPROCESSEDBACKUP/pg_data/testother' for read: [13]"
" Permission denied\n"
+ " [FileOpenError] on retry after 0ms\n"
"P00 DETAIL: archiveId: 11-2, wal start: 000000020000000700000FFD, wal stop: 000000020000000800000000\n"
"P00 DETAIL: archiveId: 11-2, wal start: 000000020000000800000002, wal stop: 000000020000000800000003\n"
"P00 DETAIL: archiveId: 11-2, wal start: 000000030000000000000000, wal stop: 000000030000000000000001\n"
diff --git a/test/src/module/protocol/protocolTest.c b/test/src/module/protocol/protocolTest.c
index c28bbc39a..f0c8ff98a 100644
--- a/test/src/module/protocol/protocolTest.c
+++ b/test/src/module/protocol/protocolTest.c
@@ -20,6 +20,8 @@ Test protocol server command handlers
***********************************************************************************************************************************/
#define TEST_PROTOCOL_COMMAND_ASSERT STRID5("assert", 0x2922ce610)
+static unsigned int testCommandAssertProtocolTotal = 0;
+
__attribute__((__noreturn__)) static void
testCommandAssertProtocol(PackRead *const param, ProtocolServer *const server)
{
@@ -31,7 +33,8 @@ testCommandAssertProtocol(PackRead *const param, ProtocolServer *const server)
ASSERT(param == NULL);
ASSERT(server != NULL);
- hrnErrorThrowP();
+ testCommandAssertProtocolTotal++;
+ hrnErrorThrowP(.message = testCommandAssertProtocolTotal <= 3 ? NULL : "ERR_MESSAGE_RETRY");
// No FUNCTION_HARNESS_RETURN_VOID() because the function does not return
}
@@ -525,6 +528,7 @@ testRun(void)
VariantList *retryList = varLstNew();
varLstAdd(retryList, varNewUInt64(0));
+ varLstAdd(retryList, varNewUInt64(500));
TEST_ASSIGN(
server, protocolServerNew(STRDEF("test server"), STRDEF("test"), HRN_FORK_CHILD_READ(), HRN_FORK_CHILD_WRITE()),
@@ -666,6 +670,15 @@ testRun(void)
pckReadBoolP(protocolClientExecute(client, protocolCommandNew(TEST_PROTOCOL_COMMAND_RETRY), true)), true,
"execute");
+ // -----------------------------------------------------------------------------------------------------------------
+ TEST_TITLE("command throws assert with retry messages");
+
+ TEST_ERROR(
+ protocolClientExecute(client, protocolCommandNew(TEST_PROTOCOL_COMMAND_ASSERT), false), AssertError,
+ "raised from test client: ERR_MESSAGE\n"
+ "[AssertError] on retry after 0ms\n"
+ "[AssertError] on retry after 500ms: ERR_MESSAGE_RETRY");
+
// -----------------------------------------------------------------------------------------------------------------
TEST_TITLE("free client");