diff --git a/doc/xml/release.xml b/doc/xml/release.xml index 093c0ebc1..cf57142b3 100644 --- a/doc/xml/release.xml +++ b/doc/xml/release.xml @@ -14,6 +14,18 @@ + + + + + + + + +

Fix issue with multiple async status files causing a hard error.

+
+
+

The info command is implemented entirely in C.

@@ -6233,6 +6245,11 @@ evanbenoit + + fpa-postgres + fpa-postgres + + Greg Smith gregscds @@ -6277,6 +6294,10 @@ Underhunden + + Joe Ayers + + John Harvey crunchyjohn diff --git a/lib/pgBackRest/Archive/Common.pm b/lib/pgBackRest/Archive/Common.pm index fe6de439b..b42053339 100644 --- a/lib/pgBackRest/Archive/Common.pm +++ b/lib/pgBackRest/Archive/Common.pm @@ -452,6 +452,7 @@ sub archiveAsyncStatusWrite $strWalFile, $iCode, $strMessage, + $bIgnoreErrorOnOk, ) = logDebugParam ( @@ -461,6 +462,7 @@ sub archiveAsyncStatusWrite {name => 'strWalFile'}, {name => 'iCode', required => false}, {name => 'strMessage', required => false}, + {name => 'bIgnoreErrorOnOk', required => false, default => false}, ); # Remove any error file exists unless a new one will be written @@ -470,25 +472,33 @@ sub archiveAsyncStatusWrite storageLocal()->remove("${strSpoolPath}/${strWalFile}.error", {bIgnoreMissing => true}); } - # Write the status file - my $strStatus; - - if (defined($iCode)) + # If an error will be written but an ok file already exists this may be expected and will be indicated by bIgnoreErrorOnOk set + # to true. In this case just return without writing the error file. + if (!($strType eq WAL_STATUS_ERROR && $bIgnoreErrorOnOk && storageLocal()->exists("${strSpoolPath}/${strWalFile}.ok"))) { - if (!defined($strMessage)) + # Write the status file + my $strStatus; + + if (defined($iCode)) { - confess &log(ASSERT, 'strMessage must be set when iCode is set'); + if (!defined($strMessage)) + { + confess &log(ASSERT, 'strMessage must be set when iCode is set'); + } + + $strStatus = "${iCode}\n${strMessage}"; + } + elsif ($strType eq WAL_STATUS_ERROR) + { + confess &log(ASSERT, 'error status must have iCode and strMessage set'); } - $strStatus = "${iCode}\n${strMessage}"; - } - elsif ($strType eq WAL_STATUS_ERROR) - { - confess &log(ASSERT, 'error status must have iCode and strMessage set'); + storageLocal()->put( + storageLocal()->openWrite("${strSpoolPath}/${strWalFile}.${strType}", {bAtomic => true}), $strStatus); } - storageLocal()->put( - storageLocal()->openWrite("${strSpoolPath}/${strWalFile}.${strType}", {bAtomic => true}), $strStatus); + # Return from function and log return values if any + return logDebugReturn($strOperation); } push @EXPORT, qw(archiveAsyncStatusWrite); diff --git a/lib/pgBackRest/Archive/Push/Async.pm b/lib/pgBackRest/Archive/Push/Async.pm index 2a4a51c53..20734b7f5 100644 --- a/lib/pgBackRest/Archive/Push/Async.pm +++ b/lib/pgBackRest/Archive/Push/Async.pm @@ -216,11 +216,11 @@ sub processQueue my $iCode = exceptionCode($EVAL_ERROR); my $strMessage = exceptionMessage($EVAL_ERROR); - # Error all queued jobs + # Error all queued jobs except for the ones that have already succeeded foreach my $strWalFile (@{$stryWalFile}) { archiveAsyncStatusWrite( - WAL_STATUS_ERROR, $self->{strSpoolPath}, $strWalFile, $iCode, $strMessage); + WAL_STATUS_ERROR, $self->{strSpoolPath}, $strWalFile, $iCode, $strMessage, true); } } } diff --git a/src/command/archive/common.c b/src/command/archive/common.c index b19a84044..6d1a6d31f 100644 --- a/src/command/archive/common.c +++ b/src/command/archive/common.c @@ -46,12 +46,20 @@ archiveAsyncStatus(ArchiveMode archiveMode, const String *walSegment, bool confe bool okFileExists = storageExistsNP(storageSpool(), strNewFmt("%s/%s", strPtr(spoolQueue), strPtr(okFile))); bool errorFileExists = storageExistsNP(storageSpool(), strNewFmt("%s/%s", strPtr(spoolQueue), strPtr(errorFile))); - // If both status files are found then assert - this could be a bug in the async process + // If both status files are found then warn, remove the files, and return false so the segment will be retried. This may be + // a bug in the async process but it may also be a failed fsync or other filesystem issue. In any case, a hard failure here + // would mean that archiving is completely stuck so it is better to attempt a retry. if (okFileExists && errorFileExists) { - THROW_FMT( - AssertError, "multiple status files found in '%s' for WAL segment '%s'", + LOG_WARN( + "multiple status files found in '%s' for WAL segment '%s' will be removed and the command retried", strPtr(storagePath(storageSpool(), spoolQueue)), strPtr(walSegment)); + + storageRemoveNP(storageSpoolWrite(), strNewFmt("%s/%s", strPtr(spoolQueue), strPtr(okFile))); + okFileExists = false; + + storageRemoveNP(storageSpoolWrite(), strNewFmt("%s/%s", strPtr(spoolQueue), strPtr(errorFile))); + errorFileExists = false; } // If either of them exists then check what happened and report back diff --git a/src/perl/embed.auto.c b/src/perl/embed.auto.c index 4992e9556..f88387ead 100644 --- a/src/perl/embed.auto.c +++ b/src/perl/embed.auto.c @@ -431,6 +431,7 @@ static const EmbeddedModule embeddedModule[] = "$strWalFile,\n" "$iCode,\n" "$strMessage,\n" + "$bIgnoreErrorOnOk,\n" ") =\n" "logDebugParam\n" "(\n" @@ -440,6 +441,7 @@ static const EmbeddedModule embeddedModule[] = "{name => 'strWalFile'},\n" "{name => 'iCode', required => false},\n" "{name => 'strMessage', required => false},\n" + "{name => 'bIgnoreErrorOnOk', required => false, default => false},\n" ");\n" "\n\n" "if ($strType ne WAL_STATUS_ERROR)\n" @@ -447,7 +449,10 @@ static const EmbeddedModule embeddedModule[] = "\n" "storageLocal()->remove(\"${strSpoolPath}/${strWalFile}.error\", {bIgnoreMissing => true});\n" "}\n" - "\n\n" + "\n\n\n" + "if (!($strType eq WAL_STATUS_ERROR && $bIgnoreErrorOnOk && storageLocal()->exists(\"${strSpoolPath}/${strWalFile}.ok\")))\n" + "{\n" + "\n" "my $strStatus;\n" "\n" "if (defined($iCode))\n" @@ -467,6 +472,9 @@ static const EmbeddedModule embeddedModule[] = "storageLocal()->put(\n" "storageLocal()->openWrite(\"${strSpoolPath}/${strWalFile}.${strType}\", {bAtomic => true}), $strStatus);\n" "}\n" + "\n\n" + "return logDebugReturn($strOperation);\n" + "}\n" "\n" "push @EXPORT, qw(archiveAsyncStatusWrite);\n" "\n" @@ -1603,7 +1611,7 @@ static const EmbeddedModule embeddedModule[] = "foreach my $strWalFile (@{$stryWalFile})\n" "{\n" "archiveAsyncStatusWrite(\n" - "WAL_STATUS_ERROR, $self->{strSpoolPath}, $strWalFile, $iCode, $strMessage);\n" + "WAL_STATUS_ERROR, $self->{strSpoolPath}, $strWalFile, $iCode, $strMessage, true);\n" "}\n" "}\n" "}\n" diff --git a/test/lib/pgBackRestTest/Module/Archive/ArchiveCommonPerlTest.pm b/test/lib/pgBackRestTest/Module/Archive/ArchiveCommonPerlTest.pm index c4b4ba0ec..84d77cef9 100644 --- a/test/lib/pgBackRestTest/Module/Archive/ArchiveCommonPerlTest.pm +++ b/test/lib/pgBackRestTest/Module/Archive/ArchiveCommonPerlTest.pm @@ -199,6 +199,15 @@ sub run # Generate a valid warning ok archiveAsyncStatusWrite(WAL_STATUS_OK, $strSpoolPath, $strSegment, 0, 'Test Warning'); + # Skip error when an ok file already exists + #--------------------------------------------------------------------------------------------------------------------------- + archiveAsyncStatusWrite( + WAL_STATUS_ERROR, $strSpoolPath, $strSegment, ERROR_ARCHIVE_DUPLICATE, + "WAL segment ${strSegment} already exists in the archive", true); + + $self->testResult( + $self->storageTest()->exists("${strSpoolPath}/${strSegment}.error"), false, "error file should not exist"); + #--------------------------------------------------------------------------------------------------------------------------- # Generate an invalid error $self->testException( diff --git a/test/src/module/archive/commonTest.c b/test/src/module/archive/commonTest.c index 17c1d4a71..292a6304f 100644 --- a/test/src/module/archive/commonTest.c +++ b/test/src/module/archive/commonTest.c @@ -84,13 +84,23 @@ testRun(void) // ------------------------------------------------------------------------------------------------------------------------- storagePutNP( storageNewWriteNP(storageSpoolWrite(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.error", strPtr(segment))), bufNew(0)); - TEST_ERROR( - archiveAsyncStatus(archiveModePush, segment, false), AssertError, + TEST_RESULT_BOOL(archiveAsyncStatus(archiveModePush, segment, false), false, "multiple status files returns false"); + + TEST_RESULT_BOOL( + storageExistsNP(storageSpool(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.error", strPtr(segment))), false, + ".error file was deleted"); + TEST_RESULT_BOOL( + storageExistsNP(storageSpool(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.ok", strPtr(segment))), false, + ".ok file was deleted"); + + harnessLogResult( strPtr( strNewFmt( - "multiple status files found in '%s/archive/db/out' for WAL segment '000000010000000100000001'", testPath()))); + "P00 WARN: multiple status files found in '%s/archive/db/out' for WAL segment '000000010000000100000001'" + " will be removed and the command retried", testPath()))); - unlink(strPtr(storagePathNP(storageSpool(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.ok", strPtr(segment))))); + storagePutNP( + storageNewWriteNP(storageSpoolWrite(), strNewFmt(STORAGE_SPOOL_ARCHIVE_OUT "/%s.error", strPtr(segment))), bufNew(0)); TEST_ERROR( archiveAsyncStatus(archiveModePush, segment, true), AssertError, "status file '000000010000000100000001.error' has no content");