1
0
mirror of https://github.com/pgbackrest/pgbackrest.git synced 2024-12-12 10:04:14 +02:00

Fix regression in retries.

5314dbf aimed to make nested Wait objects more accurate with regard to wait time but it also got rid of the "bonus" retry that was implicit in the prior implementation. This meant that if an operation used up the entire allotted timeout, it would not be retried. Object stores especially are noisy places and some amount of retry should always be attempted. So even though removing the "bonus" retry was intended, it turned out not to be a good idea.

Instead of an implicit retry, formalize two retries in the Wait object even if the wait time has expired. Any number of retries are allowed during the wait period. Also remove waitRemaining() since it is no longer needed.

Adjust tests as needed to account for the extra timeouts.

Note that there may still be an underlying issue here that is simply being masked by retries. That is, the issue expressing was that waiting for a socket to be writable was timing out and without a retry that caused a hard error. This patch does nothing to address the source of the write timeout and perhaps there is nothing we can do about it. It does seem similar to the write issue we had with our blocking TLS implementation, but it was never clear if that was a problem with TLS, the kernel, or a bug in pgBackRest itself. It cropped up after a kernel update and we switched to non-blocking TLS to address the issue (c88684e).
This commit is contained in:
David Steele 2023-11-09 12:04:25 -03:00 committed by GitHub
parent 3c116e1829
commit fa5b2d44ad
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
11 changed files with 103 additions and 46 deletions

View File

@ -1,4 +1,28 @@
<release date="XXXX-XX-XX" version="2.49dev" title="UNDER DEVELOPMENT">
<release-core-list>
<release-bug-list>
<release-item>
<commit subject="Remove unused header."/>
<commit subject="Fix regression in retries.">
<github-issue id="2200"/>
<github-issue id="2214"/>
<github-pull-request id="2221"/>
</commit>
<release-item-contributor-list>
<release-item-ideator id="norman.adkins"/>
<release-item-ideator id="tanel.suurhans"/>
<release-item-ideator id="jordan.english"/>
<release-item-ideator id="timothee.peignier"/>
<release-item-contributor id="david.steele"/>
<release-item-reviewer id="stephen.frost"/>
</release-item-contributor-list>
<p>Fix regression in retries.</p>
</release-item>
</release-bug-list>
</release-core-list>
<release-doc-list>
<release-feature-list>
<release-item>

View File

@ -525,6 +525,11 @@
<contributor-id type="github">precision-software</contributor-id>
</contributor>
<contributor id="jordan.english">
<contributor-name-display>Jordan English</contributor-name-display>
<contributor-id type="github">jordanenglish</contributor-id>
</contributor>
<contributor id="julian.zhang">
<contributor-name-display>Julian Zhang</contributor-name-display>
<contributor-id type="github">julianzhang98</contributor-id>
@ -760,6 +765,11 @@
<contributor-id type="github">nj3110</contributor-id>
</contributor>
<contributor id="norman.adkins">
<contributor-name-display>Norman Adkins</contributor-name-display>
<contributor-id type="github">adkinsn</contributor-id>
</contributor>
<contributor id="nuno.pires">
<contributor-name-display>Nuno Pires</contributor-name-display>
<contributor-id type="github">nunopi</contributor-id>
@ -905,6 +915,11 @@
<contributor-id type="github">Tiago-Anastacio</contributor-id>
</contributor>
<contributor id="tanel.suurhans">
<contributor-name-display>Tanel Suurhans</contributor-name-display>
<contributor-id type="github">tanelsuurhans</contributor-id>
</contributor>
<contributor id="tim.garton">
<contributor-name-display>Tim Garton</contributor-name-display>
<contributor-id type="github">ralfthewise</contributor-id>

View File

@ -97,7 +97,7 @@ sckClientOpen(THIS_VOID)
THROW_ON_SYS_ERROR(fd == -1, HostConnectError, "unable to create socket");
sckOptionSet(fd);
sckConnect(fd, this->host, this->port, addressFound, waitRemaining(wait));
sckConnect(fd, this->host, this->port, addressFound, this->timeoutConnect);
// Create the session
MEM_CONTEXT_PRIOR_BEGIN()
@ -119,8 +119,7 @@ sckClientOpen(THIS_VOID)
errRetryAdd(errRetry);
// Increment address info index and reset if the end has been reached. When the end has been reached sleep for a bit
// to hopefully have better chance at succeeding, otherwise continue right to the next address as long as there is
// some time left.
// to hopefully have better chance at succeeding, otherwise continue right to the next address.
addrInfoIdx++;
if (addrInfoIdx >= addrInfoSize(addrInfo))
@ -129,7 +128,7 @@ sckClientOpen(THIS_VOID)
retry = waitMore(wait);
}
else
retry = waitRemaining(wait) > 0;
retry = true;
// Error when no retries remain
if (!retry)

View File

@ -16,6 +16,7 @@ struct Wait
TimeMSec sleepTime; // Next sleep time (in usec)
TimeMSec sleepPrevTime; // Previous time slept (in usec)
TimeMSec beginTime; // Time the wait began (in epoch usec)
unsigned int retry; // Retries remaining
};
/**********************************************************************************************************************************/
@ -33,6 +34,7 @@ waitNew(const TimeMSec waitTime)
*this = (Wait)
{
.waitTime = waitTime,
.retry = 2,
};
// Calculate first sleep time -- start with 1/10th of a second for anything >= 1 second
@ -52,31 +54,6 @@ waitNew(const TimeMSec waitTime)
FUNCTION_LOG_RETURN(WAIT, this);
}
/**********************************************************************************************************************************/
FN_EXTERN TimeMSec
waitRemaining(Wait *const this)
{
FUNCTION_TEST_BEGIN();
FUNCTION_TEST_PARAM(WAIT, this);
FUNCTION_TEST_END();
TimeMSec result = 0;
// If any wait time remains
if (this->sleepTime > 0)
{
// Returning remaining time, if any, else set sleepTime to 0 so next call to waitMore will return false
const TimeMSec elapsedTime = timeMSec() - this->beginTime;
if (elapsedTime < this->waitTime)
result = this->waitTime - elapsedTime;
else
this->sleepTime = 0;
}
FUNCTION_TEST_RETURN(TIME_MSEC, result);
}
/**********************************************************************************************************************************/
FN_EXTERN bool
waitMore(Wait *const this)
@ -114,13 +91,26 @@ waitMore(Wait *const this)
// Store new sleep times
this->sleepPrevTime = this->sleepTime;
this->sleepTime = sleepTime;
// Need to wait more
result = true;
}
// Else set sleep to zero so next call will return false
// Else are there retries left?
else if (this->retry != 0)
{
// Sleep using the last calculated time
sleepMSec(this->sleepTime);
}
// Else set sleep to zero so call will return false
else
this->sleepTime = 0;
// Caller can continue processing
if (this->sleepTime > 0)
{
// Decrement retries
if (this->retry != 0)
this->retry--;
result = true;
}
}
FUNCTION_LOG_RETURN(BOOL, result);

View File

@ -1,5 +1,10 @@
/***********************************************************************************************************************************
Wait Handler
Used for operations that may fail due to an error or some unexpected condition such as file missing. When waitMore() is called it
will wait (based on a Fibonacci backoff) before returning to give the error or condition time to clear. Even when the wait time has
expired before waitMore() is called, there will still be two retries to compensate for operations that use up the entire time limit.
Any number of retries are allowed within the time limit.
***********************************************************************************************************************************/
#ifndef COMMON_WAIT_H
#define COMMON_WAIT_H
@ -17,16 +22,10 @@ Constructors
***********************************************************************************************************************************/
FN_EXTERN Wait *waitNew(TimeMSec waitTime);
/***********************************************************************************************************************************
Getters/Setters
***********************************************************************************************************************************/
// How much time is remaining? Recalculated each time waitMore() is called.
FN_EXTERN TimeMSec waitRemaining(Wait *this);
/***********************************************************************************************************************************
Functions
***********************************************************************************************************************************/
// Wait and return whether the caller has more time left
// Wait and return true if the caller has more time/retries left
FN_EXTERN bool waitMore(Wait *this);
/***********************************************************************************************************************************

View File

@ -312,6 +312,7 @@ testRun(void)
TEST_ERROR_FMT(
httpRequestResponse(httpRequestNewP(client, STRDEF("GET"), STRDEF("/")), false), HostConnectError,
"unable to connect to 'localhost:%u (127.0.0.1)': [111] Connection refused\n"
"[RETRY DETAIL OMITTED]\n"
"[RETRY DETAIL OMITTED]",
hrnServerPort(0));

View File

@ -375,7 +375,11 @@ testRun(void)
// This address should not be in use in a test environment -- if it is the test will fail
TEST_ASSIGN(client, sckClientNew(STRDEF("172.31.255.255"), hrnServerPort(0), 100, 100), "new client");
TEST_ERROR_FMT(ioClientOpen(client), HostConnectError, "timeout connecting to '172.31.255.255:%u'", hrnServerPort(0));
TEST_ERROR_FMT(
ioClientOpen(client), HostConnectError,
"timeout connecting to '172.31.255.255:%u'\n"
"[RETRY DETAIL OMITTED]",
hrnServerPort(0));
// -------------------------------------------------------------------------------------------------------------------------
TEST_TITLE("sckServerAccept() returns NULL on interrupt");

View File

@ -23,18 +23,20 @@ testRun(void)
TimeMSec begin = timeMSec();
TEST_ASSIGN(wait, waitNew(0), "new wait");
TEST_RESULT_UINT(waitRemaining(wait), 0, " check remaining time");
TEST_RESULT_UINT(wait->waitTime, 0, " check wait time");
TEST_RESULT_UINT(wait->sleepTime, 0, " check sleep time");
TEST_RESULT_UINT(wait->sleepPrevTime, 0, " check sleep prev time");
TEST_RESULT_BOOL(waitMore(wait), false, " no wait more");
// -------------------------------------------------------------------------------------------------------------------------
TEST_TITLE("100ms using only waitRemaining()");
TEST_TITLE("100ms with retries after time expired");
TEST_ASSIGN(wait, waitNew(100), "new wait");
sleepMSec(100);
TEST_RESULT_UINT(waitRemaining(wait), 0, " check remaining time");
TEST_RESULT_BOOL(waitMore(wait), true, " time expired, first retry");
TEST_RESULT_BOOL(waitMore(wait), true, " time expired, second retry");
TEST_RESULT_BOOL(waitMore(wait), false, " time expired, retries expired");
// -------------------------------------------------------------------------------------------------------------------------
TEST_TITLE("200ms wait");
@ -42,13 +44,19 @@ testRun(void)
begin = timeMSec();
TEST_ASSIGN(wait, waitNew(200), "new wait = 0.2 sec");
TEST_RESULT_BOOL(waitRemaining(wait) <= 200, true, " check remaining time upper range");
TEST_RESULT_BOOL(waitRemaining(wait) >= 150, true, " check remaining time lowe range");
TEST_RESULT_UINT(wait->waitTime, 200, " check wait time");
TEST_RESULT_UINT(wait->sleepTime, 20, " check sleep time");
TEST_RESULT_UINT(wait->sleepPrevTime, 0, " check sleep prev time");
TEST_RESULT_BOOL(wait->beginTime > (TimeMSec)1483228800000, true, " check begin time");
TEST_RESULT_BOOL(waitMore(wait), true, " first retry");
TEST_RESULT_UINT(wait->retry, 1, " check retry");
TEST_RESULT_BOOL(waitMore(wait), true, " second retry");
TEST_RESULT_UINT(wait->retry, 0, " check retry");
TEST_RESULT_BOOL(waitMore(wait), true, " still going because of time");
while (waitMore(wait));
TimeMSec end = timeMSec();

View File

@ -525,6 +525,7 @@ testRun(void)
HRN_PQ_SCRIPT_CHECKPOINT(2),
HRN_PQ_SCRIPT_CHECKPOINT_TARGET_REACHED_GE_10(2, "5/5", false, "5/4", 100),
HRN_PQ_SCRIPT_CHECKPOINT_TARGET_REACHED_GE_10(2, "5/5", false, "5/4", 100),
HRN_PQ_SCRIPT_CHECKPOINT_TARGET_REACHED_GE_10(2, "5/5", false, "5/4", 100),
// Wait for standby to sync
HRN_PQ_SCRIPT_REPLAY_TARGET_REACHED_GE_10(2, "5/5", false, "5/3"),

View File

@ -153,6 +153,10 @@ testRun(void)
{.function = HRN_PQ_SENDQUERY, .param = "[\"" TEST_QUERY "\"]", .resultInt = 1},
{.function = HRN_PQ_CONSUMEINPUT, .sleep = 600},
{.function = HRN_PQ_ISBUSY, .resultInt = 1},
{.function = HRN_PQ_CONSUMEINPUT},
{.function = HRN_PQ_ISBUSY, .resultInt = 1},
{.function = HRN_PQ_CONSUMEINPUT},
{.function = HRN_PQ_ISBUSY, .resultInt = 1},
{.function = HRN_PQ_GETCANCEL},
{.function = HRN_PQ_CANCEL, .resultInt = 1},
{.function = HRN_PQ_FREECANCEL},
@ -180,6 +184,8 @@ testRun(void)
{.function = HRN_PQ_ISBUSY, .resultInt = 1},
{.function = HRN_PQ_CONSUMEINPUT, .sleep = 300},
{.function = HRN_PQ_ISBUSY, .resultInt = 1},
{.function = HRN_PQ_CONSUMEINPUT},
{.function = HRN_PQ_ISBUSY, .resultInt = 1},
{.function = HRN_PQ_GETCANCEL},
{.function = HRN_PQ_CANCEL, .resultInt = 0, .resultZ = TEST_PQ_ERROR},
{.function = HRN_PQ_FREECANCEL});
@ -202,6 +208,10 @@ testRun(void)
{.function = HRN_PQ_SENDQUERY, .param = "[\"" TEST_QUERY "\"]", .resultInt = 1},
{.function = HRN_PQ_CONSUMEINPUT, .sleep = 600},
{.function = HRN_PQ_ISBUSY, .resultInt = 1},
{.function = HRN_PQ_CONSUMEINPUT},
{.function = HRN_PQ_ISBUSY, .resultInt = 1},
{.function = HRN_PQ_CONSUMEINPUT},
{.function = HRN_PQ_ISBUSY, .resultInt = 1},
{.function = HRN_PQ_GETCANCEL, .resultNull = true});
TEST_ERROR(

View File

@ -2051,6 +2051,12 @@ testRun(void)
{.function = HRNLIBSSH2_SFTP_STAT_EX, .param = "[\"" TEST_PATH "/missing\",0]",
.resultInt = LIBSSH2_ERROR_SFTP_PROTOCOL},
{.function = HRNLIBSSH2_SFTP_LAST_ERROR, .resultUInt = LIBSSH2_FX_NO_SUCH_FILE},
{.function = HRNLIBSSH2_SFTP_STAT_EX, .param = "[\"" TEST_PATH "/missing\",0]",
.resultInt = LIBSSH2_ERROR_SFTP_PROTOCOL},
{.function = HRNLIBSSH2_SFTP_LAST_ERROR, .resultUInt = LIBSSH2_FX_NO_SUCH_FILE},
{.function = HRNLIBSSH2_SFTP_STAT_EX, .param = "[\"" TEST_PATH "/missing\",0]",
.resultInt = LIBSSH2_ERROR_SFTP_PROTOCOL},
{.function = HRNLIBSSH2_SFTP_LAST_ERROR, .resultUInt = LIBSSH2_FX_NO_SUCH_FILE},
// Path missing
{.function = HRNLIBSSH2_SFTP_STAT_EX, .param = "[\"" TEST_PATH "\",0]",
.attrPerms = LIBSSH2_SFTP_S_IFDIR, .resultInt = LIBSSH2_ERROR_NONE},