1
0
mirror of https://github.com/pgbackrest/pgbackrest.git synced 2024-12-12 10:04:14 +02:00

Fix issue with multiple async status files causing a hard error.

Multiple status files were being created by asynchronous archiving if a high-level error occurred after one or more WAL segments had already been transferred successfully.  Error files were being written for every file in the queue regardless of whether it had already succeeded.  To fix this, add an option to skip writing error files when an ok file already exists.

There are other situations where both files might exist (various fsync and filesystem error scenarios) so it seems best to retry in the case that multiple status files are found rather than throwing a hard error (which then means that archiving is completely stuck).  In the case of multiple status files, a warning will be logged to alert the user that something unusual is happening and the command will be retried.

Reported by fpa-postgres, Joe Ayers, Douglas J Hunley.
This commit is contained in:
David Steele 2019-01-26 16:59:54 +02:00
parent f3ae3c4f9d
commit 8f6d324b2c
7 changed files with 90 additions and 24 deletions

View File

@ -14,6 +14,18 @@
<release-list> <release-list>
<release date="XXXX-XX-XX" version="2.09dev" title="UNDER DEVELOPMENT"> <release date="XXXX-XX-XX" version="2.09dev" title="UNDER DEVELOPMENT">
<release-core-list> <release-core-list>
<release-bug-list>
<release-item>
<release-item-contributor-list>
<release-item-ideator id="fpa-postgres"/>
<release-item-ideator id="joe.ayers"/>
<release-item-ideator id="douglas.j.hunley"/>
</release-item-contributor-list>
<p>Fix issue with multiple async status files causing a hard error.</p>
</release-item>
</release-bug-list>
<release-improvement-list> <release-improvement-list>
<release-item> <release-item>
<p>The <cmd>info</cmd> command is implemented entirely in C.</p> <p>The <cmd>info</cmd> command is implemented entirely in C.</p>
@ -6233,6 +6245,11 @@
<contributor-id type="github">evanbenoit</contributor-id> <contributor-id type="github">evanbenoit</contributor-id>
</contributor> </contributor>
<contributor id="fpa-postgres">
<contributor-name-display>fpa-postgres</contributor-name-display>
<contributor-id type="github">fpa-postgres</contributor-id>
</contributor>
<contributor id="greg.smith"> <contributor id="greg.smith">
<contributor-name-display>Greg Smith</contributor-name-display> <contributor-name-display>Greg Smith</contributor-name-display>
<contributor-id type="github">gregscds</contributor-id> <contributor-id type="github">gregscds</contributor-id>
@ -6277,6 +6294,10 @@
<contributor-id type="github">Underhunden</contributor-id> <contributor-id type="github">Underhunden</contributor-id>
</contributor> </contributor>
<contributor id="joe.ayers">
<contributor-name-display>Joe Ayers</contributor-name-display>
</contributor>
<contributor id="john.harvey"> <contributor id="john.harvey">
<contributor-name-display>John Harvey</contributor-name-display> <contributor-name-display>John Harvey</contributor-name-display>
<contributor-id type="github">crunchyjohn</contributor-id> <contributor-id type="github">crunchyjohn</contributor-id>

View File

@ -452,6 +452,7 @@ sub archiveAsyncStatusWrite
$strWalFile, $strWalFile,
$iCode, $iCode,
$strMessage, $strMessage,
$bIgnoreErrorOnOk,
) = ) =
logDebugParam logDebugParam
( (
@ -461,6 +462,7 @@ sub archiveAsyncStatusWrite
{name => 'strWalFile'}, {name => 'strWalFile'},
{name => 'iCode', required => false}, {name => 'iCode', required => false},
{name => 'strMessage', required => false}, {name => 'strMessage', required => false},
{name => 'bIgnoreErrorOnOk', required => false, default => false},
); );
# Remove any error file exists unless a new one will be written # Remove any error file exists unless a new one will be written
@ -470,25 +472,33 @@ sub archiveAsyncStatusWrite
storageLocal()->remove("${strSpoolPath}/${strWalFile}.error", {bIgnoreMissing => true}); storageLocal()->remove("${strSpoolPath}/${strWalFile}.error", {bIgnoreMissing => true});
} }
# Write the status file # If an error will be written but an ok file already exists this may be expected and will be indicated by bIgnoreErrorOnOk set
my $strStatus; # to true. In this case just return without writing the error file.
if (!($strType eq WAL_STATUS_ERROR && $bIgnoreErrorOnOk && storageLocal()->exists("${strSpoolPath}/${strWalFile}.ok")))
if (defined($iCode))
{ {
if (!defined($strMessage)) # Write the status file
my $strStatus;
if (defined($iCode))
{ {
confess &log(ASSERT, 'strMessage must be set when iCode is set'); if (!defined($strMessage))
{
confess &log(ASSERT, 'strMessage must be set when iCode is set');
}
$strStatus = "${iCode}\n${strMessage}";
}
elsif ($strType eq WAL_STATUS_ERROR)
{
confess &log(ASSERT, 'error status must have iCode and strMessage set');
} }
$strStatus = "${iCode}\n${strMessage}"; storageLocal()->put(
} storageLocal()->openWrite("${strSpoolPath}/${strWalFile}.${strType}", {bAtomic => true}), $strStatus);
elsif ($strType eq WAL_STATUS_ERROR)
{
confess &log(ASSERT, 'error status must have iCode and strMessage set');
} }
storageLocal()->put( # Return from function and log return values if any
storageLocal()->openWrite("${strSpoolPath}/${strWalFile}.${strType}", {bAtomic => true}), $strStatus); return logDebugReturn($strOperation);
} }
push @EXPORT, qw(archiveAsyncStatusWrite); push @EXPORT, qw(archiveAsyncStatusWrite);

View File

@ -216,11 +216,11 @@ sub processQueue
my $iCode = exceptionCode($EVAL_ERROR); my $iCode = exceptionCode($EVAL_ERROR);
my $strMessage = exceptionMessage($EVAL_ERROR); my $strMessage = exceptionMessage($EVAL_ERROR);
# Error all queued jobs # Error all queued jobs except for the ones that have already succeeded
foreach my $strWalFile (@{$stryWalFile}) foreach my $strWalFile (@{$stryWalFile})
{ {
archiveAsyncStatusWrite( archiveAsyncStatusWrite(
WAL_STATUS_ERROR, $self->{strSpoolPath}, $strWalFile, $iCode, $strMessage); WAL_STATUS_ERROR, $self->{strSpoolPath}, $strWalFile, $iCode, $strMessage, true);
} }
} }
} }

View File

@ -46,12 +46,20 @@ archiveAsyncStatus(ArchiveMode archiveMode, const String *walSegment, bool confe
bool okFileExists = storageExistsNP(storageSpool(), strNewFmt("%s/%s", strPtr(spoolQueue), strPtr(okFile))); bool okFileExists = storageExistsNP(storageSpool(), strNewFmt("%s/%s", strPtr(spoolQueue), strPtr(okFile)));
bool errorFileExists = storageExistsNP(storageSpool(), strNewFmt("%s/%s", strPtr(spoolQueue), strPtr(errorFile))); bool errorFileExists = storageExistsNP(storageSpool(), strNewFmt("%s/%s", strPtr(spoolQueue), strPtr(errorFile)));
// If both status files are found then assert - this could be a bug in the async process // If both status files are found then warn, remove the files, and return false so the segment will be retried. This may be
// a bug in the async process but it may also be a failed fsync or other filesystem issue. In any case, a hard failure here
// would mean that archiving is completely stuck so it is better to attempt a retry.
if (okFileExists && errorFileExists) if (okFileExists && errorFileExists)
{ {
THROW_FMT( LOG_WARN(
AssertError, "multiple status files found in '%s' for WAL segment '%s'", "multiple status files found in '%s' for WAL segment '%s' will be removed and the command retried",
strPtr(storagePath(storageSpool(), spoolQueue)), strPtr(walSegment)); strPtr(storagePath(storageSpool(), spoolQueue)), strPtr(walSegment));
storageRemoveNP(storageSpoolWrite(), strNewFmt("%s/%s", strPtr(spoolQueue), strPtr(okFile)));
okFileExists = false;
storageRemoveNP(storageSpoolWrite(), strNewFmt("%s/%s", strPtr(spoolQueue), strPtr(errorFile)));
errorFileExists = false;
} }
// If either of them exists then check what happened and report back // If either of them exists then check what happened and report back

View File

@ -431,6 +431,7 @@ static const EmbeddedModule embeddedModule[] =
"$strWalFile,\n" "$strWalFile,\n"
"$iCode,\n" "$iCode,\n"
"$strMessage,\n" "$strMessage,\n"
"$bIgnoreErrorOnOk,\n"
") =\n" ") =\n"
"logDebugParam\n" "logDebugParam\n"
"(\n" "(\n"
@ -440,6 +441,7 @@ static const EmbeddedModule embeddedModule[] =
"{name => 'strWalFile'},\n" "{name => 'strWalFile'},\n"
"{name => 'iCode', required => false},\n" "{name => 'iCode', required => false},\n"
"{name => 'strMessage', required => false},\n" "{name => 'strMessage', required => false},\n"
"{name => 'bIgnoreErrorOnOk', required => false, default => false},\n"
");\n" ");\n"
"\n\n" "\n\n"
"if ($strType ne WAL_STATUS_ERROR)\n" "if ($strType ne WAL_STATUS_ERROR)\n"
@ -447,7 +449,10 @@ static const EmbeddedModule embeddedModule[] =
"\n" "\n"
"storageLocal()->remove(\"${strSpoolPath}/${strWalFile}.error\", {bIgnoreMissing => true});\n" "storageLocal()->remove(\"${strSpoolPath}/${strWalFile}.error\", {bIgnoreMissing => true});\n"
"}\n" "}\n"
"\n\n" "\n\n\n"
"if (!($strType eq WAL_STATUS_ERROR && $bIgnoreErrorOnOk && storageLocal()->exists(\"${strSpoolPath}/${strWalFile}.ok\")))\n"
"{\n"
"\n"
"my $strStatus;\n" "my $strStatus;\n"
"\n" "\n"
"if (defined($iCode))\n" "if (defined($iCode))\n"
@ -467,6 +472,9 @@ static const EmbeddedModule embeddedModule[] =
"storageLocal()->put(\n" "storageLocal()->put(\n"
"storageLocal()->openWrite(\"${strSpoolPath}/${strWalFile}.${strType}\", {bAtomic => true}), $strStatus);\n" "storageLocal()->openWrite(\"${strSpoolPath}/${strWalFile}.${strType}\", {bAtomic => true}), $strStatus);\n"
"}\n" "}\n"
"\n\n"
"return logDebugReturn($strOperation);\n"
"}\n"
"\n" "\n"
"push @EXPORT, qw(archiveAsyncStatusWrite);\n" "push @EXPORT, qw(archiveAsyncStatusWrite);\n"
"\n" "\n"
@ -1603,7 +1611,7 @@ static const EmbeddedModule embeddedModule[] =
"foreach my $strWalFile (@{$stryWalFile})\n" "foreach my $strWalFile (@{$stryWalFile})\n"
"{\n" "{\n"
"archiveAsyncStatusWrite(\n" "archiveAsyncStatusWrite(\n"
"WAL_STATUS_ERROR, $self->{strSpoolPath}, $strWalFile, $iCode, $strMessage);\n" "WAL_STATUS_ERROR, $self->{strSpoolPath}, $strWalFile, $iCode, $strMessage, true);\n"
"}\n" "}\n"
"}\n" "}\n"
"}\n" "}\n"

View File

@ -199,6 +199,15 @@ sub run
# Generate a valid warning ok # Generate a valid warning ok
archiveAsyncStatusWrite(WAL_STATUS_OK, $strSpoolPath, $strSegment, 0, 'Test Warning'); archiveAsyncStatusWrite(WAL_STATUS_OK, $strSpoolPath, $strSegment, 0, 'Test Warning');
# Skip error when an ok file already exists
#---------------------------------------------------------------------------------------------------------------------------
archiveAsyncStatusWrite(
WAL_STATUS_ERROR, $strSpoolPath, $strSegment, ERROR_ARCHIVE_DUPLICATE,
"WAL segment ${strSegment} already exists in the archive", true);
$self->testResult(
$self->storageTest()->exists("${strSpoolPath}/${strSegment}.error"), false, "error file should not exist");
#--------------------------------------------------------------------------------------------------------------------------- #---------------------------------------------------------------------------------------------------------------------------
# Generate an invalid error # Generate an invalid error
$self->testException( $self->testException(

View File

@ -84,13 +84,23 @@ testRun(void)
// ------------------------------------------------------------------------------------------------------------------------- // -------------------------------------------------------------------------------------------------------------------------
storagePutNP( storagePutNP(
storageNewWriteNP(storageSpoolWrite(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.error", strPtr(segment))), bufNew(0)); storageNewWriteNP(storageSpoolWrite(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.error", strPtr(segment))), bufNew(0));
TEST_ERROR( TEST_RESULT_BOOL(archiveAsyncStatus(archiveModePush, segment, false), false, "multiple status files returns false");
archiveAsyncStatus(archiveModePush, segment, false), AssertError,
TEST_RESULT_BOOL(
storageExistsNP(storageSpool(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.error", strPtr(segment))), false,
".error file was deleted");
TEST_RESULT_BOOL(
storageExistsNP(storageSpool(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.ok", strPtr(segment))), false,
".ok file was deleted");
harnessLogResult(
strPtr( strPtr(
strNewFmt( strNewFmt(
"multiple status files found in '%s/archive/db/out' for WAL segment '000000010000000100000001'", testPath()))); "P00 WARN: multiple status files found in '%s/archive/db/out' for WAL segment '000000010000000100000001'"
" will be removed and the command retried", testPath())));
unlink(strPtr(storagePathNP(storageSpool(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.ok", strPtr(segment))))); storagePutNP(
storageNewWriteNP(storageSpoolWrite(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.error", strPtr(segment))), bufNew(0));
TEST_ERROR( TEST_ERROR(
archiveAsyncStatus(archiveModePush, segment, true), AssertError, archiveAsyncStatus(archiveModePush, segment, true), AssertError,
"status file '000000010000000100000001.error' has no content"); "status file '000000010000000100000001.error' has no content");