From 8f6d324b2c2576456bd0ba7f691297df1f5aba1e Mon Sep 17 00:00:00 2001 From: David Steele Date: Sat, 26 Jan 2019 16:59:54 +0200 Subject: [PATCH] Fix issue with multiple async status files causing a hard error. Multiple status files were being created by asynchronous archiving if a high-level error occurred after one or more WAL segments had already been transferred successfully. Error files were being written for every file in the queue regardless of whether it had already succeeded. To fix this, add an option to skip writing error files when an ok file already exists. There are other situations where both files might exist (various fsync and filesystem error scenarios) so it seems best to retry in the case that multiple status files are found rather than throwing a hard error (which then means that archiving is completely stuck). In the case of multiple status files, a warning will be logged to alert the user that something unusual is happening and the command will be retried. Reported by fpa-postgres, Joe Ayers, Douglas J Hunley. --- doc/xml/release.xml | 21 +++++++++++ lib/pgBackRest/Archive/Common.pm | 36 ++++++++++++------- lib/pgBackRest/Archive/Push/Async.pm | 4 +-- src/command/archive/common.c | 14 ++++++-- src/perl/embed.auto.c | 12 +++++-- .../Module/Archive/ArchiveCommonPerlTest.pm | 9 +++++ test/src/module/archive/commonTest.c | 18 +++++++--- 7 files changed, 90 insertions(+), 24 deletions(-) diff --git a/doc/xml/release.xml b/doc/xml/release.xml index 093c0ebc1..cf57142b3 100644 --- a/doc/xml/release.xml +++ b/doc/xml/release.xml @@ -14,6 +14,18 @@ + + + + + + + + +

Fix issue with multiple async status files causing a hard error.

+
+
+

The info command is implemented entirely in C.

@@ -6233,6 +6245,11 @@ evanbenoit + + fpa-postgres + fpa-postgres + + Greg Smith gregscds @@ -6277,6 +6294,10 @@ Underhunden + + Joe Ayers + + John Harvey crunchyjohn diff --git a/lib/pgBackRest/Archive/Common.pm b/lib/pgBackRest/Archive/Common.pm index fe6de439b..b42053339 100644 --- a/lib/pgBackRest/Archive/Common.pm +++ b/lib/pgBackRest/Archive/Common.pm @@ -452,6 +452,7 @@ sub archiveAsyncStatusWrite $strWalFile, $iCode, $strMessage, + $bIgnoreErrorOnOk, ) = logDebugParam ( @@ -461,6 +462,7 @@ sub archiveAsyncStatusWrite {name => 'strWalFile'}, {name => 'iCode', required => false}, {name => 'strMessage', required => false}, + {name => 'bIgnoreErrorOnOk', required => false, default => false}, ); # Remove any error file exists unless a new one will be written @@ -470,25 +472,33 @@ sub archiveAsyncStatusWrite storageLocal()->remove("${strSpoolPath}/${strWalFile}.error", {bIgnoreMissing => true}); } - # Write the status file - my $strStatus; - - if (defined($iCode)) + # If an error will be written but an ok file already exists this may be expected and will be indicated by bIgnoreErrorOnOk set + # to true. In this case just return without writing the error file. + if (!($strType eq WAL_STATUS_ERROR && $bIgnoreErrorOnOk && storageLocal()->exists("${strSpoolPath}/${strWalFile}.ok"))) { - if (!defined($strMessage)) + # Write the status file + my $strStatus; + + if (defined($iCode)) { - confess &log(ASSERT, 'strMessage must be set when iCode is set'); + if (!defined($strMessage)) + { + confess &log(ASSERT, 'strMessage must be set when iCode is set'); + } + + $strStatus = "${iCode}\n${strMessage}"; + } + elsif ($strType eq WAL_STATUS_ERROR) + { + confess &log(ASSERT, 'error status must have iCode and strMessage set'); } - $strStatus = "${iCode}\n${strMessage}"; - } - elsif ($strType eq WAL_STATUS_ERROR) - { - confess &log(ASSERT, 'error status must have iCode and strMessage set'); + storageLocal()->put( + storageLocal()->openWrite("${strSpoolPath}/${strWalFile}.${strType}", {bAtomic => true}), $strStatus); } - storageLocal()->put( - storageLocal()->openWrite("${strSpoolPath}/${strWalFile}.${strType}", {bAtomic => true}), $strStatus); + # Return from function and log return values if any + return logDebugReturn($strOperation); } push @EXPORT, qw(archiveAsyncStatusWrite); diff --git a/lib/pgBackRest/Archive/Push/Async.pm b/lib/pgBackRest/Archive/Push/Async.pm index 2a4a51c53..20734b7f5 100644 --- a/lib/pgBackRest/Archive/Push/Async.pm +++ b/lib/pgBackRest/Archive/Push/Async.pm @@ -216,11 +216,11 @@ sub processQueue my $iCode = exceptionCode($EVAL_ERROR); my $strMessage = exceptionMessage($EVAL_ERROR); - # Error all queued jobs + # Error all queued jobs except for the ones that have already succeeded foreach my $strWalFile (@{$stryWalFile}) { archiveAsyncStatusWrite( - WAL_STATUS_ERROR, $self->{strSpoolPath}, $strWalFile, $iCode, $strMessage); + WAL_STATUS_ERROR, $self->{strSpoolPath}, $strWalFile, $iCode, $strMessage, true); } } } diff --git a/src/command/archive/common.c b/src/command/archive/common.c index b19a84044..6d1a6d31f 100644 --- a/src/command/archive/common.c +++ b/src/command/archive/common.c @@ -46,12 +46,20 @@ archiveAsyncStatus(ArchiveMode archiveMode, const String *walSegment, bool confe bool okFileExists = storageExistsNP(storageSpool(), strNewFmt("%s/%s", strPtr(spoolQueue), strPtr(okFile))); bool errorFileExists = storageExistsNP(storageSpool(), strNewFmt("%s/%s", strPtr(spoolQueue), strPtr(errorFile))); - // If both status files are found then assert - this could be a bug in the async process + // If both status files are found then warn, remove the files, and return false so the segment will be retried. This may be + // a bug in the async process but it may also be a failed fsync or other filesystem issue. In any case, a hard failure here + // would mean that archiving is completely stuck so it is better to attempt a retry. if (okFileExists && errorFileExists) { - THROW_FMT( - AssertError, "multiple status files found in '%s' for WAL segment '%s'", + LOG_WARN( + "multiple status files found in '%s' for WAL segment '%s' will be removed and the command retried", strPtr(storagePath(storageSpool(), spoolQueue)), strPtr(walSegment)); + + storageRemoveNP(storageSpoolWrite(), strNewFmt("%s/%s", strPtr(spoolQueue), strPtr(okFile))); + okFileExists = false; + + storageRemoveNP(storageSpoolWrite(), strNewFmt("%s/%s", strPtr(spoolQueue), strPtr(errorFile))); + errorFileExists = false; } // If either of them exists then check what happened and report back diff --git a/src/perl/embed.auto.c b/src/perl/embed.auto.c index 4992e9556..f88387ead 100644 --- a/src/perl/embed.auto.c +++ b/src/perl/embed.auto.c @@ -431,6 +431,7 @@ static const EmbeddedModule embeddedModule[] = "$strWalFile,\n" "$iCode,\n" "$strMessage,\n" + "$bIgnoreErrorOnOk,\n" ") =\n" "logDebugParam\n" "(\n" @@ -440,6 +441,7 @@ static const EmbeddedModule embeddedModule[] = "{name => 'strWalFile'},\n" "{name => 'iCode', required => false},\n" "{name => 'strMessage', required => false},\n" + "{name => 'bIgnoreErrorOnOk', required => false, default => false},\n" ");\n" "\n\n" "if ($strType ne WAL_STATUS_ERROR)\n" @@ -447,7 +449,10 @@ static const EmbeddedModule embeddedModule[] = "\n" "storageLocal()->remove(\"${strSpoolPath}/${strWalFile}.error\", {bIgnoreMissing => true});\n" "}\n" - "\n\n" + "\n\n\n" + "if (!($strType eq WAL_STATUS_ERROR && $bIgnoreErrorOnOk && storageLocal()->exists(\"${strSpoolPath}/${strWalFile}.ok\")))\n" + "{\n" + "\n" "my $strStatus;\n" "\n" "if (defined($iCode))\n" @@ -467,6 +472,9 @@ static const EmbeddedModule embeddedModule[] = "storageLocal()->put(\n" "storageLocal()->openWrite(\"${strSpoolPath}/${strWalFile}.${strType}\", {bAtomic => true}), $strStatus);\n" "}\n" + "\n\n" + "return logDebugReturn($strOperation);\n" + "}\n" "\n" "push @EXPORT, qw(archiveAsyncStatusWrite);\n" "\n" @@ -1603,7 +1611,7 @@ static const EmbeddedModule embeddedModule[] = "foreach my $strWalFile (@{$stryWalFile})\n" "{\n" "archiveAsyncStatusWrite(\n" - "WAL_STATUS_ERROR, $self->{strSpoolPath}, $strWalFile, $iCode, $strMessage);\n" + "WAL_STATUS_ERROR, $self->{strSpoolPath}, $strWalFile, $iCode, $strMessage, true);\n" "}\n" "}\n" "}\n" diff --git a/test/lib/pgBackRestTest/Module/Archive/ArchiveCommonPerlTest.pm b/test/lib/pgBackRestTest/Module/Archive/ArchiveCommonPerlTest.pm index c4b4ba0ec..84d77cef9 100644 --- a/test/lib/pgBackRestTest/Module/Archive/ArchiveCommonPerlTest.pm +++ b/test/lib/pgBackRestTest/Module/Archive/ArchiveCommonPerlTest.pm @@ -199,6 +199,15 @@ sub run # Generate a valid warning ok archiveAsyncStatusWrite(WAL_STATUS_OK, $strSpoolPath, $strSegment, 0, 'Test Warning'); + # Skip error when an ok file already exists + #--------------------------------------------------------------------------------------------------------------------------- + archiveAsyncStatusWrite( + WAL_STATUS_ERROR, $strSpoolPath, $strSegment, ERROR_ARCHIVE_DUPLICATE, + "WAL segment ${strSegment} already exists in the archive", true); + + $self->testResult( + $self->storageTest()->exists("${strSpoolPath}/${strSegment}.error"), false, "error file should not exist"); + #--------------------------------------------------------------------------------------------------------------------------- # Generate an invalid error $self->testException( diff --git a/test/src/module/archive/commonTest.c b/test/src/module/archive/commonTest.c index 17c1d4a71..292a6304f 100644 --- a/test/src/module/archive/commonTest.c +++ b/test/src/module/archive/commonTest.c @@ -84,13 +84,23 @@ testRun(void) // ------------------------------------------------------------------------------------------------------------------------- storagePutNP( storageNewWriteNP(storageSpoolWrite(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.error", strPtr(segment))), bufNew(0)); - TEST_ERROR( - archiveAsyncStatus(archiveModePush, segment, false), AssertError, + TEST_RESULT_BOOL(archiveAsyncStatus(archiveModePush, segment, false), false, "multiple status files returns false"); + + TEST_RESULT_BOOL( + storageExistsNP(storageSpool(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.error", strPtr(segment))), false, + ".error file was deleted"); + TEST_RESULT_BOOL( + storageExistsNP(storageSpool(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.ok", strPtr(segment))), false, + ".ok file was deleted"); + + harnessLogResult( strPtr( strNewFmt( - "multiple status files found in '%s/archive/db/out' for WAL segment '000000010000000100000001'", testPath()))); + "P00 WARN: multiple status files found in '%s/archive/db/out' for WAL segment '000000010000000100000001'" + " will be removed and the command retried", testPath()))); - unlink(strPtr(storagePathNP(storageSpool(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.ok", strPtr(segment))))); + storagePutNP( + storageNewWriteNP(storageSpoolWrite(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.error", strPtr(segment))), bufNew(0)); TEST_ERROR( archiveAsyncStatus(archiveModePush, segment, true), AssertError, "status file '000000010000000100000001.error' has no content");