1
0
mirror of https://github.com/pgbackrest/pgbackrest.git synced 2025-01-18 04:58:51 +02:00

Report original error and retries on local job failure.

The local process will retry jobs (e.g. backup file) but after a certain number of failures gives up. Previously, the last error was reported but generally the first error is far more valuable. The last error is likely to be a cascade failure such as the protocol being out of sync.

Report the first error (and stack trace) and append the retry errors to the first error without stack trace information.
This commit is contained in:
David Steele 2021-10-05 09:00:16 -04:00 committed by GitHub
parent 34f7873432
commit 6af827cbb1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 69 additions and 7 deletions

View File

@ -57,6 +57,21 @@
<p>Add recovery start time to online backup restore log.</p>
</release-item>
<release-item>
<commit subject="Allow stack trace to be specified for errorInternalThrow()."/>
<commit subject="Add jobRetry parameter to HRN_CFG_LOAD()."/>
<commit subject="Report original error and retries on local job failure.">
<github-pull-request id="1521"/>
</commit>
<release-item-contributor-list>
<release-item-contributor id="david.steele"/>
<release-item-reviewer id="stefan.fercot"/>
</release-item-contributor-list>
<p>Report original error and retries on local job failure.</p>
</release-item>
<release-item>
<github-pull-request id="1519"/>

View File

@ -182,9 +182,16 @@ protocolServerProcess(
// Send the command to the handler
MEM_CONTEXT_TEMP_BEGIN()
{
// Variables to store first error message and retry messages
const ErrorType *errType = NULL;
String *errMessage = NULL;
const String *errMessageFirst = NULL;
const String *errStackTrace = NULL;
// Initialize retries in case of command failure
bool retry = false;
unsigned int retryRemaining = retryInterval != NULL ? varLstSize(retryInterval) : 0;
TimeMSec retrySleepMs = 0;
// Handler retry loop
do
@ -197,12 +204,33 @@ protocolServerProcess(
}
CATCH_ANY()
{
// On first error record the error details. Only the first error will contain a stack trace since
// the first error is most likely to contain valuable information.
if (errType == NULL)
{
errType = errorType();
errMessage = strNewZ(errorMessage());
errMessageFirst = strNewZ(errorMessage());
errStackTrace = strNewZ(errorStackTrace());
}
// Else on a retry error only record the error type and message. Retry errors are less likely to
// contain valuable information but may be helpful for debugging.
else
{
strCatFmt(
errMessage, "\n[%s] on retry after %" PRIu64 "ms", errorTypeName(errorType()),
retrySleepMs);
// Only append the message if it differs from the first message
if (!strEqZ(errMessageFirst, errorMessage()))
strCatFmt(errMessage, ": %s", errorMessage());
}
// Are there retries remaining?
if (retryRemaining > 0)
{
// Get the sleep interval for this retry
TimeMSec retrySleepMs = varUInt64(
varLstGet(retryInterval, varLstSize(retryInterval) - retryRemaining));
retrySleepMs = varUInt64(varLstGet(retryInterval, varLstSize(retryInterval) - retryRemaining));
// Log the retry
LOG_DEBUG_FMT(
@ -222,7 +250,7 @@ protocolServerProcess(
}
// Else report error to the client
else
protocolServerError(this, errorCode(), STR(errorMessage()), STR(errorStackTrace()));
protocolServerError(this, errorTypeCode(errType), errMessage, errStackTrace);
}
TRY_END();
}

View File

@ -533,7 +533,8 @@ testRun(void)
" repo1: 10-1/0000000100000002/000000010000000200000000-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa.gz"
" [FormatError] unexpected eof in compressed data\n"
" repo2: 10-1/0000000100000002/000000010000000200000000-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa.gz"
" [FormatError] unexpected eof in compressed data");
" [FormatError] unexpected eof in compressed data\n"
" [FileReadError] on retry after 0ms");
TEST_STORAGE_GET(
storageSpoolWrite(), STORAGE_SPOOL_ARCHIVE_IN "/000000010000000200000000.error",
@ -543,6 +544,7 @@ testRun(void)
" [FormatError] unexpected eof in compressed data\n"
"repo2: 10-1/0000000100000002/000000010000000200000000-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa.gz"
" [FormatError] unexpected eof in compressed data\n"
"[FileReadError] on retry after 0ms\n"
"repo3: [ArchiveMismatchError] unable to retrieve the archive id for database version '10' and system-id"
" '18072658121562454734'",
.remove = true);

View File

@ -835,7 +835,8 @@ testRun(void)
" HINT: this is valid in some recovery scenarios but may also indicate a problem.\n"
"P01 DETAIL: pushed WAL file '000000010000000100000001' to the archive\n"
"P01 WARN: could not push WAL file '000000010000000100000002' to the archive (will be retried): "
"[55] raised from local-1 shim protocol: " STORAGE_ERROR_READ_MISSING,
"[55] raised from local-1 shim protocol: " STORAGE_ERROR_READ_MISSING "\n"
" [FileMissingError] on retry after 0ms",
TEST_PATH "/pg/pg_xlog/000000010000000100000002");
TEST_STORAGE_EXISTS(

View File

@ -2743,7 +2743,8 @@ testRun(void)
TEST_ERROR(
cmdRestore(), FileMissingError,
"raised from local-1 shim protocol: unable to open missing file"
" '" TEST_PATH "/repo/backup/test1/20161219-212741F_20161219-212918I/pg_data/global/pg_control' for read");
" '" TEST_PATH "/repo/backup/test1/20161219-212741F_20161219-212918I/pg_data/global/pg_control' for read\n"
"[FileMissingError] on retry after 0ms");
// Free local processes that were not freed because of the error
protocolFree();

View File

@ -1174,6 +1174,7 @@ testRun(void)
"local-1 shim protocol: unable to open file '" TEST_PATH "/repo/archive/db/"
"11-2/0000000200000008/000000020000000800000003-656817043007aa2100c44c712bcb456db705dab9' for read:"
" [13] Permission denied\n"
" [FileOpenError] on retry after 0ms\n"
"P00 WARN: unable to open missing file '" TEST_PATH "/repo/backup/db/20181119-152800F/backup.manifest' for read\n"
"P00 WARN: unable to open missing file '" TEST_PATH "/repo/backup/db/20181119-152800F/backup.manifest.copy'"
" for read\n"
@ -1188,6 +1189,7 @@ testRun(void)
"P01 ERROR: [039]: invalid result UNPROCESSEDBACKUP/pg_data/testother: [41] raised from local-1 shim protocol:"
" unable to open file '" TEST_PATH "/repo/backup/db/UNPROCESSEDBACKUP/pg_data/testother' for read: [13]"
" Permission denied\n"
" [FileOpenError] on retry after 0ms\n"
"P00 DETAIL: archiveId: 11-2, wal start: 000000020000000700000FFD, wal stop: 000000020000000800000000\n"
"P00 DETAIL: archiveId: 11-2, wal start: 000000020000000800000002, wal stop: 000000020000000800000003\n"
"P00 DETAIL: archiveId: 11-2, wal start: 000000030000000000000000, wal stop: 000000030000000000000001\n"

View File

@ -20,6 +20,8 @@ Test protocol server command handlers
***********************************************************************************************************************************/
#define TEST_PROTOCOL_COMMAND_ASSERT STRID5("assert", 0x2922ce610)
static unsigned int testCommandAssertProtocolTotal = 0;
__attribute__((__noreturn__)) static void
testCommandAssertProtocol(PackRead *const param, ProtocolServer *const server)
{
@ -31,7 +33,8 @@ testCommandAssertProtocol(PackRead *const param, ProtocolServer *const server)
ASSERT(param == NULL);
ASSERT(server != NULL);
hrnErrorThrowP();
testCommandAssertProtocolTotal++;
hrnErrorThrowP(.message = testCommandAssertProtocolTotal <= 3 ? NULL : "ERR_MESSAGE_RETRY");
// No FUNCTION_HARNESS_RETURN_VOID() because the function does not return
}
@ -525,6 +528,7 @@ testRun(void)
VariantList *retryList = varLstNew();
varLstAdd(retryList, varNewUInt64(0));
varLstAdd(retryList, varNewUInt64(500));
TEST_ASSIGN(
server, protocolServerNew(STRDEF("test server"), STRDEF("test"), HRN_FORK_CHILD_READ(), HRN_FORK_CHILD_WRITE()),
@ -666,6 +670,15 @@ testRun(void)
pckReadBoolP(protocolClientExecute(client, protocolCommandNew(TEST_PROTOCOL_COMMAND_RETRY), true)), true,
"execute");
// -----------------------------------------------------------------------------------------------------------------
TEST_TITLE("command throws assert with retry messages");
TEST_ERROR(
protocolClientExecute(client, protocolCommandNew(TEST_PROTOCOL_COMMAND_ASSERT), false), AssertError,
"raised from test client: ERR_MESSAGE\n"
"[AssertError] on retry after 0ms\n"
"[AssertError] on retry after 500ms: ERR_MESSAGE_RETRY");
// -----------------------------------------------------------------------------------------------------------------
TEST_TITLE("free client");