mirror of
https://github.com/pgbackrest/pgbackrest.git
synced 2024-12-12 10:04:14 +02:00
Fix issue with multiple async status files causing a hard error.
Multiple status files were being created by asynchronous archiving if a high-level error occurred after one or more WAL segments had already been transferred successfully. Error files were being written for every file in the queue regardless of whether it had already succeeded. To fix this, add an option to skip writing error files when an ok file already exists. There are other situations where both files might exist (various fsync and filesystem error scenarios) so it seems best to retry in the case that multiple status files are found rather than throwing a hard error (which then means that archiving is completely stuck). In the case of multiple status files, a warning will be logged to alert the user that something unusual is happening and the command will be retried. Reported by fpa-postgres, Joe Ayers, Douglas J Hunley.
This commit is contained in:
parent
f3ae3c4f9d
commit
8f6d324b2c
@ -14,6 +14,18 @@
|
||||
<release-list>
|
||||
<release date="XXXX-XX-XX" version="2.09dev" title="UNDER DEVELOPMENT">
|
||||
<release-core-list>
|
||||
<release-bug-list>
|
||||
<release-item>
|
||||
<release-item-contributor-list>
|
||||
<release-item-ideator id="fpa-postgres"/>
|
||||
<release-item-ideator id="joe.ayers"/>
|
||||
<release-item-ideator id="douglas.j.hunley"/>
|
||||
</release-item-contributor-list>
|
||||
|
||||
<p>Fix issue with multiple async status files causing a hard error.</p>
|
||||
</release-item>
|
||||
</release-bug-list>
|
||||
|
||||
<release-improvement-list>
|
||||
<release-item>
|
||||
<p>The <cmd>info</cmd> command is implemented entirely in C.</p>
|
||||
@ -6233,6 +6245,11 @@
|
||||
<contributor-id type="github">evanbenoit</contributor-id>
|
||||
</contributor>
|
||||
|
||||
<contributor id="fpa-postgres">
|
||||
<contributor-name-display>fpa-postgres</contributor-name-display>
|
||||
<contributor-id type="github">fpa-postgres</contributor-id>
|
||||
</contributor>
|
||||
|
||||
<contributor id="greg.smith">
|
||||
<contributor-name-display>Greg Smith</contributor-name-display>
|
||||
<contributor-id type="github">gregscds</contributor-id>
|
||||
@ -6277,6 +6294,10 @@
|
||||
<contributor-id type="github">Underhunden</contributor-id>
|
||||
</contributor>
|
||||
|
||||
<contributor id="joe.ayers">
|
||||
<contributor-name-display>Joe Ayers</contributor-name-display>
|
||||
</contributor>
|
||||
|
||||
<contributor id="john.harvey">
|
||||
<contributor-name-display>John Harvey</contributor-name-display>
|
||||
<contributor-id type="github">crunchyjohn</contributor-id>
|
||||
|
@ -452,6 +452,7 @@ sub archiveAsyncStatusWrite
|
||||
$strWalFile,
|
||||
$iCode,
|
||||
$strMessage,
|
||||
$bIgnoreErrorOnOk,
|
||||
) =
|
||||
logDebugParam
|
||||
(
|
||||
@ -461,6 +462,7 @@ sub archiveAsyncStatusWrite
|
||||
{name => 'strWalFile'},
|
||||
{name => 'iCode', required => false},
|
||||
{name => 'strMessage', required => false},
|
||||
{name => 'bIgnoreErrorOnOk', required => false, default => false},
|
||||
);
|
||||
|
||||
# Remove any error file exists unless a new one will be written
|
||||
@ -470,25 +472,33 @@ sub archiveAsyncStatusWrite
|
||||
storageLocal()->remove("${strSpoolPath}/${strWalFile}.error", {bIgnoreMissing => true});
|
||||
}
|
||||
|
||||
# Write the status file
|
||||
my $strStatus;
|
||||
|
||||
if (defined($iCode))
|
||||
# If an error will be written but an ok file already exists this may be expected and will be indicated by bIgnoreErrorOnOk set
|
||||
# to true. In this case just return without writing the error file.
|
||||
if (!($strType eq WAL_STATUS_ERROR && $bIgnoreErrorOnOk && storageLocal()->exists("${strSpoolPath}/${strWalFile}.ok")))
|
||||
{
|
||||
if (!defined($strMessage))
|
||||
# Write the status file
|
||||
my $strStatus;
|
||||
|
||||
if (defined($iCode))
|
||||
{
|
||||
confess &log(ASSERT, 'strMessage must be set when iCode is set');
|
||||
if (!defined($strMessage))
|
||||
{
|
||||
confess &log(ASSERT, 'strMessage must be set when iCode is set');
|
||||
}
|
||||
|
||||
$strStatus = "${iCode}\n${strMessage}";
|
||||
}
|
||||
elsif ($strType eq WAL_STATUS_ERROR)
|
||||
{
|
||||
confess &log(ASSERT, 'error status must have iCode and strMessage set');
|
||||
}
|
||||
|
||||
$strStatus = "${iCode}\n${strMessage}";
|
||||
}
|
||||
elsif ($strType eq WAL_STATUS_ERROR)
|
||||
{
|
||||
confess &log(ASSERT, 'error status must have iCode and strMessage set');
|
||||
storageLocal()->put(
|
||||
storageLocal()->openWrite("${strSpoolPath}/${strWalFile}.${strType}", {bAtomic => true}), $strStatus);
|
||||
}
|
||||
|
||||
storageLocal()->put(
|
||||
storageLocal()->openWrite("${strSpoolPath}/${strWalFile}.${strType}", {bAtomic => true}), $strStatus);
|
||||
# Return from function and log return values if any
|
||||
return logDebugReturn($strOperation);
|
||||
}
|
||||
|
||||
push @EXPORT, qw(archiveAsyncStatusWrite);
|
||||
|
@ -216,11 +216,11 @@ sub processQueue
|
||||
my $iCode = exceptionCode($EVAL_ERROR);
|
||||
my $strMessage = exceptionMessage($EVAL_ERROR);
|
||||
|
||||
# Error all queued jobs
|
||||
# Error all queued jobs except for the ones that have already succeeded
|
||||
foreach my $strWalFile (@{$stryWalFile})
|
||||
{
|
||||
archiveAsyncStatusWrite(
|
||||
WAL_STATUS_ERROR, $self->{strSpoolPath}, $strWalFile, $iCode, $strMessage);
|
||||
WAL_STATUS_ERROR, $self->{strSpoolPath}, $strWalFile, $iCode, $strMessage, true);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
@ -46,12 +46,20 @@ archiveAsyncStatus(ArchiveMode archiveMode, const String *walSegment, bool confe
|
||||
bool okFileExists = storageExistsNP(storageSpool(), strNewFmt("%s/%s", strPtr(spoolQueue), strPtr(okFile)));
|
||||
bool errorFileExists = storageExistsNP(storageSpool(), strNewFmt("%s/%s", strPtr(spoolQueue), strPtr(errorFile)));
|
||||
|
||||
// If both status files are found then assert - this could be a bug in the async process
|
||||
// If both status files are found then warn, remove the files, and return false so the segment will be retried. This may be
|
||||
// a bug in the async process but it may also be a failed fsync or other filesystem issue. In any case, a hard failure here
|
||||
// would mean that archiving is completely stuck so it is better to attempt a retry.
|
||||
if (okFileExists && errorFileExists)
|
||||
{
|
||||
THROW_FMT(
|
||||
AssertError, "multiple status files found in '%s' for WAL segment '%s'",
|
||||
LOG_WARN(
|
||||
"multiple status files found in '%s' for WAL segment '%s' will be removed and the command retried",
|
||||
strPtr(storagePath(storageSpool(), spoolQueue)), strPtr(walSegment));
|
||||
|
||||
storageRemoveNP(storageSpoolWrite(), strNewFmt("%s/%s", strPtr(spoolQueue), strPtr(okFile)));
|
||||
okFileExists = false;
|
||||
|
||||
storageRemoveNP(storageSpoolWrite(), strNewFmt("%s/%s", strPtr(spoolQueue), strPtr(errorFile)));
|
||||
errorFileExists = false;
|
||||
}
|
||||
|
||||
// If either of them exists then check what happened and report back
|
||||
|
@ -431,6 +431,7 @@ static const EmbeddedModule embeddedModule[] =
|
||||
"$strWalFile,\n"
|
||||
"$iCode,\n"
|
||||
"$strMessage,\n"
|
||||
"$bIgnoreErrorOnOk,\n"
|
||||
") =\n"
|
||||
"logDebugParam\n"
|
||||
"(\n"
|
||||
@ -440,6 +441,7 @@ static const EmbeddedModule embeddedModule[] =
|
||||
"{name => 'strWalFile'},\n"
|
||||
"{name => 'iCode', required => false},\n"
|
||||
"{name => 'strMessage', required => false},\n"
|
||||
"{name => 'bIgnoreErrorOnOk', required => false, default => false},\n"
|
||||
");\n"
|
||||
"\n\n"
|
||||
"if ($strType ne WAL_STATUS_ERROR)\n"
|
||||
@ -447,7 +449,10 @@ static const EmbeddedModule embeddedModule[] =
|
||||
"\n"
|
||||
"storageLocal()->remove(\"${strSpoolPath}/${strWalFile}.error\", {bIgnoreMissing => true});\n"
|
||||
"}\n"
|
||||
"\n\n"
|
||||
"\n\n\n"
|
||||
"if (!($strType eq WAL_STATUS_ERROR && $bIgnoreErrorOnOk && storageLocal()->exists(\"${strSpoolPath}/${strWalFile}.ok\")))\n"
|
||||
"{\n"
|
||||
"\n"
|
||||
"my $strStatus;\n"
|
||||
"\n"
|
||||
"if (defined($iCode))\n"
|
||||
@ -467,6 +472,9 @@ static const EmbeddedModule embeddedModule[] =
|
||||
"storageLocal()->put(\n"
|
||||
"storageLocal()->openWrite(\"${strSpoolPath}/${strWalFile}.${strType}\", {bAtomic => true}), $strStatus);\n"
|
||||
"}\n"
|
||||
"\n\n"
|
||||
"return logDebugReturn($strOperation);\n"
|
||||
"}\n"
|
||||
"\n"
|
||||
"push @EXPORT, qw(archiveAsyncStatusWrite);\n"
|
||||
"\n"
|
||||
@ -1603,7 +1611,7 @@ static const EmbeddedModule embeddedModule[] =
|
||||
"foreach my $strWalFile (@{$stryWalFile})\n"
|
||||
"{\n"
|
||||
"archiveAsyncStatusWrite(\n"
|
||||
"WAL_STATUS_ERROR, $self->{strSpoolPath}, $strWalFile, $iCode, $strMessage);\n"
|
||||
"WAL_STATUS_ERROR, $self->{strSpoolPath}, $strWalFile, $iCode, $strMessage, true);\n"
|
||||
"}\n"
|
||||
"}\n"
|
||||
"}\n"
|
||||
|
@ -199,6 +199,15 @@ sub run
|
||||
# Generate a valid warning ok
|
||||
archiveAsyncStatusWrite(WAL_STATUS_OK, $strSpoolPath, $strSegment, 0, 'Test Warning');
|
||||
|
||||
# Skip error when an ok file already exists
|
||||
#---------------------------------------------------------------------------------------------------------------------------
|
||||
archiveAsyncStatusWrite(
|
||||
WAL_STATUS_ERROR, $strSpoolPath, $strSegment, ERROR_ARCHIVE_DUPLICATE,
|
||||
"WAL segment ${strSegment} already exists in the archive", true);
|
||||
|
||||
$self->testResult(
|
||||
$self->storageTest()->exists("${strSpoolPath}/${strSegment}.error"), false, "error file should not exist");
|
||||
|
||||
#---------------------------------------------------------------------------------------------------------------------------
|
||||
# Generate an invalid error
|
||||
$self->testException(
|
||||
|
@ -84,13 +84,23 @@ testRun(void)
|
||||
// -------------------------------------------------------------------------------------------------------------------------
|
||||
storagePutNP(
|
||||
storageNewWriteNP(storageSpoolWrite(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.error", strPtr(segment))), bufNew(0));
|
||||
TEST_ERROR(
|
||||
archiveAsyncStatus(archiveModePush, segment, false), AssertError,
|
||||
TEST_RESULT_BOOL(archiveAsyncStatus(archiveModePush, segment, false), false, "multiple status files returns false");
|
||||
|
||||
TEST_RESULT_BOOL(
|
||||
storageExistsNP(storageSpool(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.error", strPtr(segment))), false,
|
||||
".error file was deleted");
|
||||
TEST_RESULT_BOOL(
|
||||
storageExistsNP(storageSpool(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.ok", strPtr(segment))), false,
|
||||
".ok file was deleted");
|
||||
|
||||
harnessLogResult(
|
||||
strPtr(
|
||||
strNewFmt(
|
||||
"multiple status files found in '%s/archive/db/out' for WAL segment '000000010000000100000001'", testPath())));
|
||||
"P00 WARN: multiple status files found in '%s/archive/db/out' for WAL segment '000000010000000100000001'"
|
||||
" will be removed and the command retried", testPath())));
|
||||
|
||||
unlink(strPtr(storagePathNP(storageSpool(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.ok", strPtr(segment)))));
|
||||
storagePutNP(
|
||||
storageNewWriteNP(storageSpoolWrite(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.error", strPtr(segment))), bufNew(0));
|
||||
TEST_ERROR(
|
||||
archiveAsyncStatus(archiveModePush, segment, true), AssertError,
|
||||
"status file '000000010000000100000001.error' has no content");
|
||||
|
Loading…
Reference in New Issue
Block a user