mirror of
https://github.com/pgbackrest/pgbackrest.git
synced 2025-01-18 04:58:51 +02:00
Report original error and retries on local job failure.
The local process will retry jobs (e.g. backup file) but after a certain number of failures gives up. Previously, the last error was reported but generally the first error is far more valuable. The last error is likely to be a cascade failure such as the protocol being out of sync. Report the first error (and stack trace) and append the retry errors to the first error without stack trace information.
This commit is contained in:
parent
34f7873432
commit
6af827cbb1
@ -57,6 +57,21 @@
|
||||
<p>Add recovery start time to online backup restore log.</p>
|
||||
</release-item>
|
||||
|
||||
<release-item>
|
||||
<commit subject="Allow stack trace to be specified for errorInternalThrow()."/>
|
||||
<commit subject="Add jobRetry parameter to HRN_CFG_LOAD()."/>
|
||||
<commit subject="Report original error and retries on local job failure.">
|
||||
<github-pull-request id="1521"/>
|
||||
</commit>
|
||||
|
||||
<release-item-contributor-list>
|
||||
<release-item-contributor id="david.steele"/>
|
||||
<release-item-reviewer id="stefan.fercot"/>
|
||||
</release-item-contributor-list>
|
||||
|
||||
<p>Report original error and retries on local job failure.</p>
|
||||
</release-item>
|
||||
|
||||
<release-item>
|
||||
<github-pull-request id="1519"/>
|
||||
|
||||
|
@ -182,9 +182,16 @@ protocolServerProcess(
|
||||
// Send the command to the handler
|
||||
MEM_CONTEXT_TEMP_BEGIN()
|
||||
{
|
||||
// Variables to store first error message and retry messages
|
||||
const ErrorType *errType = NULL;
|
||||
String *errMessage = NULL;
|
||||
const String *errMessageFirst = NULL;
|
||||
const String *errStackTrace = NULL;
|
||||
|
||||
// Initialize retries in case of command failure
|
||||
bool retry = false;
|
||||
unsigned int retryRemaining = retryInterval != NULL ? varLstSize(retryInterval) : 0;
|
||||
TimeMSec retrySleepMs = 0;
|
||||
|
||||
// Handler retry loop
|
||||
do
|
||||
@ -197,12 +204,33 @@ protocolServerProcess(
|
||||
}
|
||||
CATCH_ANY()
|
||||
{
|
||||
// On first error record the error details. Only the first error will contain a stack trace since
|
||||
// the first error is most likely to contain valuable information.
|
||||
if (errType == NULL)
|
||||
{
|
||||
errType = errorType();
|
||||
errMessage = strNewZ(errorMessage());
|
||||
errMessageFirst = strNewZ(errorMessage());
|
||||
errStackTrace = strNewZ(errorStackTrace());
|
||||
}
|
||||
// Else on a retry error only record the error type and message. Retry errors are less likely to
|
||||
// contain valuable information but may be helpful for debugging.
|
||||
else
|
||||
{
|
||||
strCatFmt(
|
||||
errMessage, "\n[%s] on retry after %" PRIu64 "ms", errorTypeName(errorType()),
|
||||
retrySleepMs);
|
||||
|
||||
// Only append the message if it differs from the first message
|
||||
if (!strEqZ(errMessageFirst, errorMessage()))
|
||||
strCatFmt(errMessage, ": %s", errorMessage());
|
||||
}
|
||||
|
||||
// Are there retries remaining?
|
||||
if (retryRemaining > 0)
|
||||
{
|
||||
// Get the sleep interval for this retry
|
||||
TimeMSec retrySleepMs = varUInt64(
|
||||
varLstGet(retryInterval, varLstSize(retryInterval) - retryRemaining));
|
||||
retrySleepMs = varUInt64(varLstGet(retryInterval, varLstSize(retryInterval) - retryRemaining));
|
||||
|
||||
// Log the retry
|
||||
LOG_DEBUG_FMT(
|
||||
@ -222,7 +250,7 @@ protocolServerProcess(
|
||||
}
|
||||
// Else report error to the client
|
||||
else
|
||||
protocolServerError(this, errorCode(), STR(errorMessage()), STR(errorStackTrace()));
|
||||
protocolServerError(this, errorTypeCode(errType), errMessage, errStackTrace);
|
||||
}
|
||||
TRY_END();
|
||||
}
|
||||
|
@ -533,7 +533,8 @@ testRun(void)
|
||||
" repo1: 10-1/0000000100000002/000000010000000200000000-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa.gz"
|
||||
" [FormatError] unexpected eof in compressed data\n"
|
||||
" repo2: 10-1/0000000100000002/000000010000000200000000-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa.gz"
|
||||
" [FormatError] unexpected eof in compressed data");
|
||||
" [FormatError] unexpected eof in compressed data\n"
|
||||
" [FileReadError] on retry after 0ms");
|
||||
|
||||
TEST_STORAGE_GET(
|
||||
storageSpoolWrite(), STORAGE_SPOOL_ARCHIVE_IN "/000000010000000200000000.error",
|
||||
@ -543,6 +544,7 @@ testRun(void)
|
||||
" [FormatError] unexpected eof in compressed data\n"
|
||||
"repo2: 10-1/0000000100000002/000000010000000200000000-aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa.gz"
|
||||
" [FormatError] unexpected eof in compressed data\n"
|
||||
"[FileReadError] on retry after 0ms\n"
|
||||
"repo3: [ArchiveMismatchError] unable to retrieve the archive id for database version '10' and system-id"
|
||||
" '18072658121562454734'",
|
||||
.remove = true);
|
||||
|
@ -835,7 +835,8 @@ testRun(void)
|
||||
" HINT: this is valid in some recovery scenarios but may also indicate a problem.\n"
|
||||
"P01 DETAIL: pushed WAL file '000000010000000100000001' to the archive\n"
|
||||
"P01 WARN: could not push WAL file '000000010000000100000002' to the archive (will be retried): "
|
||||
"[55] raised from local-1 shim protocol: " STORAGE_ERROR_READ_MISSING,
|
||||
"[55] raised from local-1 shim protocol: " STORAGE_ERROR_READ_MISSING "\n"
|
||||
" [FileMissingError] on retry after 0ms",
|
||||
TEST_PATH "/pg/pg_xlog/000000010000000100000002");
|
||||
|
||||
TEST_STORAGE_EXISTS(
|
||||
|
@ -2743,7 +2743,8 @@ testRun(void)
|
||||
TEST_ERROR(
|
||||
cmdRestore(), FileMissingError,
|
||||
"raised from local-1 shim protocol: unable to open missing file"
|
||||
" '" TEST_PATH "/repo/backup/test1/20161219-212741F_20161219-212918I/pg_data/global/pg_control' for read");
|
||||
" '" TEST_PATH "/repo/backup/test1/20161219-212741F_20161219-212918I/pg_data/global/pg_control' for read\n"
|
||||
"[FileMissingError] on retry after 0ms");
|
||||
|
||||
// Free local processes that were not freed because of the error
|
||||
protocolFree();
|
||||
|
@ -1174,6 +1174,7 @@ testRun(void)
|
||||
"local-1 shim protocol: unable to open file '" TEST_PATH "/repo/archive/db/"
|
||||
"11-2/0000000200000008/000000020000000800000003-656817043007aa2100c44c712bcb456db705dab9' for read:"
|
||||
" [13] Permission denied\n"
|
||||
" [FileOpenError] on retry after 0ms\n"
|
||||
"P00 WARN: unable to open missing file '" TEST_PATH "/repo/backup/db/20181119-152800F/backup.manifest' for read\n"
|
||||
"P00 WARN: unable to open missing file '" TEST_PATH "/repo/backup/db/20181119-152800F/backup.manifest.copy'"
|
||||
" for read\n"
|
||||
@ -1188,6 +1189,7 @@ testRun(void)
|
||||
"P01 ERROR: [039]: invalid result UNPROCESSEDBACKUP/pg_data/testother: [41] raised from local-1 shim protocol:"
|
||||
" unable to open file '" TEST_PATH "/repo/backup/db/UNPROCESSEDBACKUP/pg_data/testother' for read: [13]"
|
||||
" Permission denied\n"
|
||||
" [FileOpenError] on retry after 0ms\n"
|
||||
"P00 DETAIL: archiveId: 11-2, wal start: 000000020000000700000FFD, wal stop: 000000020000000800000000\n"
|
||||
"P00 DETAIL: archiveId: 11-2, wal start: 000000020000000800000002, wal stop: 000000020000000800000003\n"
|
||||
"P00 DETAIL: archiveId: 11-2, wal start: 000000030000000000000000, wal stop: 000000030000000000000001\n"
|
||||
|
@ -20,6 +20,8 @@ Test protocol server command handlers
|
||||
***********************************************************************************************************************************/
|
||||
#define TEST_PROTOCOL_COMMAND_ASSERT STRID5("assert", 0x2922ce610)
|
||||
|
||||
static unsigned int testCommandAssertProtocolTotal = 0;
|
||||
|
||||
__attribute__((__noreturn__)) static void
|
||||
testCommandAssertProtocol(PackRead *const param, ProtocolServer *const server)
|
||||
{
|
||||
@ -31,7 +33,8 @@ testCommandAssertProtocol(PackRead *const param, ProtocolServer *const server)
|
||||
ASSERT(param == NULL);
|
||||
ASSERT(server != NULL);
|
||||
|
||||
hrnErrorThrowP();
|
||||
testCommandAssertProtocolTotal++;
|
||||
hrnErrorThrowP(.message = testCommandAssertProtocolTotal <= 3 ? NULL : "ERR_MESSAGE_RETRY");
|
||||
|
||||
// No FUNCTION_HARNESS_RETURN_VOID() because the function does not return
|
||||
}
|
||||
@ -525,6 +528,7 @@ testRun(void)
|
||||
|
||||
VariantList *retryList = varLstNew();
|
||||
varLstAdd(retryList, varNewUInt64(0));
|
||||
varLstAdd(retryList, varNewUInt64(500));
|
||||
|
||||
TEST_ASSIGN(
|
||||
server, protocolServerNew(STRDEF("test server"), STRDEF("test"), HRN_FORK_CHILD_READ(), HRN_FORK_CHILD_WRITE()),
|
||||
@ -666,6 +670,15 @@ testRun(void)
|
||||
pckReadBoolP(protocolClientExecute(client, protocolCommandNew(TEST_PROTOCOL_COMMAND_RETRY), true)), true,
|
||||
"execute");
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
TEST_TITLE("command throws assert with retry messages");
|
||||
|
||||
TEST_ERROR(
|
||||
protocolClientExecute(client, protocolCommandNew(TEST_PROTOCOL_COMMAND_ASSERT), false), AssertError,
|
||||
"raised from test client: ERR_MESSAGE\n"
|
||||
"[AssertError] on retry after 0ms\n"
|
||||
"[AssertError] on retry after 500ms: ERR_MESSAGE_RETRY");
|
||||
|
||||
// -----------------------------------------------------------------------------------------------------------------
|
||||
TEST_TITLE("free client");
|
||||
|
||||
|
Loading…
x
Reference in New Issue
Block a user