mirror of
https://github.com/pgbackrest/pgbackrest.git
synced 2024-12-12 10:04:14 +02:00
Fix regression in retries.
5314dbf
aimed to make nested Wait objects more accurate with regard to wait time but it also got rid of the "bonus" retry that was implicit in the prior implementation. This meant that if an operation used up the entire allotted timeout, it would not be retried. Object stores especially are noisy places and some amount of retry should always be attempted. So even though removing the "bonus" retry was intended, it turned out not to be a good idea. Instead of an implicit retry, formalize two retries in the Wait object even if the wait time has expired. Any number of retries are allowed during the wait period. Also remove waitRemaining() since it is no longer needed. Adjust tests as needed to account for the extra timeouts. Note that there may still be an underlying issue here that is simply being masked by retries. That is, the issue expressing was that waiting for a socket to be writable was timing out and without a retry that caused a hard error. This patch does nothing to address the source of the write timeout and perhaps there is nothing we can do about it. It does seem similar to the write issue we had with our blocking TLS implementation, but it was never clear if that was a problem with TLS, the kernel, or a bug in pgBackRest itself. It cropped up after a kernel update and we switched to non-blocking TLS to address the issue (c88684e
).
This commit is contained in:
parent
3c116e1829
commit
fa5b2d44ad
@ -1,4 +1,28 @@
|
||||
<release date="XXXX-XX-XX" version="2.49dev" title="UNDER DEVELOPMENT">
|
||||
<release-core-list>
|
||||
<release-bug-list>
|
||||
<release-item>
|
||||
<commit subject="Remove unused header."/>
|
||||
<commit subject="Fix regression in retries.">
|
||||
<github-issue id="2200"/>
|
||||
<github-issue id="2214"/>
|
||||
<github-pull-request id="2221"/>
|
||||
</commit>
|
||||
|
||||
<release-item-contributor-list>
|
||||
<release-item-ideator id="norman.adkins"/>
|
||||
<release-item-ideator id="tanel.suurhans"/>
|
||||
<release-item-ideator id="jordan.english"/>
|
||||
<release-item-ideator id="timothee.peignier"/>
|
||||
<release-item-contributor id="david.steele"/>
|
||||
<release-item-reviewer id="stephen.frost"/>
|
||||
</release-item-contributor-list>
|
||||
|
||||
<p>Fix regression in retries.</p>
|
||||
</release-item>
|
||||
</release-bug-list>
|
||||
</release-core-list>
|
||||
|
||||
<release-doc-list>
|
||||
<release-feature-list>
|
||||
<release-item>
|
||||
|
@ -525,6 +525,11 @@
|
||||
<contributor-id type="github">precision-software</contributor-id>
|
||||
</contributor>
|
||||
|
||||
<contributor id="jordan.english">
|
||||
<contributor-name-display>Jordan English</contributor-name-display>
|
||||
<contributor-id type="github">jordanenglish</contributor-id>
|
||||
</contributor>
|
||||
|
||||
<contributor id="julian.zhang">
|
||||
<contributor-name-display>Julian Zhang</contributor-name-display>
|
||||
<contributor-id type="github">julianzhang98</contributor-id>
|
||||
@ -760,6 +765,11 @@
|
||||
<contributor-id type="github">nj3110</contributor-id>
|
||||
</contributor>
|
||||
|
||||
<contributor id="norman.adkins">
|
||||
<contributor-name-display>Norman Adkins</contributor-name-display>
|
||||
<contributor-id type="github">adkinsn</contributor-id>
|
||||
</contributor>
|
||||
|
||||
<contributor id="nuno.pires">
|
||||
<contributor-name-display>Nuno Pires</contributor-name-display>
|
||||
<contributor-id type="github">nunopi</contributor-id>
|
||||
@ -905,6 +915,11 @@
|
||||
<contributor-id type="github">Tiago-Anastacio</contributor-id>
|
||||
</contributor>
|
||||
|
||||
<contributor id="tanel.suurhans">
|
||||
<contributor-name-display>Tanel Suurhans</contributor-name-display>
|
||||
<contributor-id type="github">tanelsuurhans</contributor-id>
|
||||
</contributor>
|
||||
|
||||
<contributor id="tim.garton">
|
||||
<contributor-name-display>Tim Garton</contributor-name-display>
|
||||
<contributor-id type="github">ralfthewise</contributor-id>
|
||||
|
@ -97,7 +97,7 @@ sckClientOpen(THIS_VOID)
|
||||
THROW_ON_SYS_ERROR(fd == -1, HostConnectError, "unable to create socket");
|
||||
|
||||
sckOptionSet(fd);
|
||||
sckConnect(fd, this->host, this->port, addressFound, waitRemaining(wait));
|
||||
sckConnect(fd, this->host, this->port, addressFound, this->timeoutConnect);
|
||||
|
||||
// Create the session
|
||||
MEM_CONTEXT_PRIOR_BEGIN()
|
||||
@ -119,8 +119,7 @@ sckClientOpen(THIS_VOID)
|
||||
errRetryAdd(errRetry);
|
||||
|
||||
// Increment address info index and reset if the end has been reached. When the end has been reached sleep for a bit
|
||||
// to hopefully have better chance at succeeding, otherwise continue right to the next address as long as there is
|
||||
// some time left.
|
||||
// to hopefully have better chance at succeeding, otherwise continue right to the next address.
|
||||
addrInfoIdx++;
|
||||
|
||||
if (addrInfoIdx >= addrInfoSize(addrInfo))
|
||||
@ -129,7 +128,7 @@ sckClientOpen(THIS_VOID)
|
||||
retry = waitMore(wait);
|
||||
}
|
||||
else
|
||||
retry = waitRemaining(wait) > 0;
|
||||
retry = true;
|
||||
|
||||
// Error when no retries remain
|
||||
if (!retry)
|
||||
|
@ -16,6 +16,7 @@ struct Wait
|
||||
TimeMSec sleepTime; // Next sleep time (in usec)
|
||||
TimeMSec sleepPrevTime; // Previous time slept (in usec)
|
||||
TimeMSec beginTime; // Time the wait began (in epoch usec)
|
||||
unsigned int retry; // Retries remaining
|
||||
};
|
||||
|
||||
/**********************************************************************************************************************************/
|
||||
@ -33,6 +34,7 @@ waitNew(const TimeMSec waitTime)
|
||||
*this = (Wait)
|
||||
{
|
||||
.waitTime = waitTime,
|
||||
.retry = 2,
|
||||
};
|
||||
|
||||
// Calculate first sleep time -- start with 1/10th of a second for anything >= 1 second
|
||||
@ -52,31 +54,6 @@ waitNew(const TimeMSec waitTime)
|
||||
FUNCTION_LOG_RETURN(WAIT, this);
|
||||
}
|
||||
|
||||
/**********************************************************************************************************************************/
|
||||
FN_EXTERN TimeMSec
|
||||
waitRemaining(Wait *const this)
|
||||
{
|
||||
FUNCTION_TEST_BEGIN();
|
||||
FUNCTION_TEST_PARAM(WAIT, this);
|
||||
FUNCTION_TEST_END();
|
||||
|
||||
TimeMSec result = 0;
|
||||
|
||||
// If any wait time remains
|
||||
if (this->sleepTime > 0)
|
||||
{
|
||||
// Returning remaining time, if any, else set sleepTime to 0 so next call to waitMore will return false
|
||||
const TimeMSec elapsedTime = timeMSec() - this->beginTime;
|
||||
|
||||
if (elapsedTime < this->waitTime)
|
||||
result = this->waitTime - elapsedTime;
|
||||
else
|
||||
this->sleepTime = 0;
|
||||
}
|
||||
|
||||
FUNCTION_TEST_RETURN(TIME_MSEC, result);
|
||||
}
|
||||
|
||||
/**********************************************************************************************************************************/
|
||||
FN_EXTERN bool
|
||||
waitMore(Wait *const this)
|
||||
@ -114,13 +91,26 @@ waitMore(Wait *const this)
|
||||
// Store new sleep times
|
||||
this->sleepPrevTime = this->sleepTime;
|
||||
this->sleepTime = sleepTime;
|
||||
|
||||
// Need to wait more
|
||||
result = true;
|
||||
}
|
||||
// Else set sleep to zero so next call will return false
|
||||
// Else are there retries left?
|
||||
else if (this->retry != 0)
|
||||
{
|
||||
// Sleep using the last calculated time
|
||||
sleepMSec(this->sleepTime);
|
||||
}
|
||||
// Else set sleep to zero so call will return false
|
||||
else
|
||||
this->sleepTime = 0;
|
||||
|
||||
// Caller can continue processing
|
||||
if (this->sleepTime > 0)
|
||||
{
|
||||
// Decrement retries
|
||||
if (this->retry != 0)
|
||||
this->retry--;
|
||||
|
||||
result = true;
|
||||
}
|
||||
}
|
||||
|
||||
FUNCTION_LOG_RETURN(BOOL, result);
|
||||
|
@ -1,5 +1,10 @@
|
||||
/***********************************************************************************************************************************
|
||||
Wait Handler
|
||||
|
||||
Used for operations that may fail due to an error or some unexpected condition such as file missing. When waitMore() is called it
|
||||
will wait (based on a Fibonacci backoff) before returning to give the error or condition time to clear. Even when the wait time has
|
||||
expired before waitMore() is called, there will still be two retries to compensate for operations that use up the entire time limit.
|
||||
Any number of retries are allowed within the time limit.
|
||||
***********************************************************************************************************************************/
|
||||
#ifndef COMMON_WAIT_H
|
||||
#define COMMON_WAIT_H
|
||||
@ -17,16 +22,10 @@ Constructors
|
||||
***********************************************************************************************************************************/
|
||||
FN_EXTERN Wait *waitNew(TimeMSec waitTime);
|
||||
|
||||
/***********************************************************************************************************************************
|
||||
Getters/Setters
|
||||
***********************************************************************************************************************************/
|
||||
// How much time is remaining? Recalculated each time waitMore() is called.
|
||||
FN_EXTERN TimeMSec waitRemaining(Wait *this);
|
||||
|
||||
/***********************************************************************************************************************************
|
||||
Functions
|
||||
***********************************************************************************************************************************/
|
||||
// Wait and return whether the caller has more time left
|
||||
// Wait and return true if the caller has more time/retries left
|
||||
FN_EXTERN bool waitMore(Wait *this);
|
||||
|
||||
/***********************************************************************************************************************************
|
||||
|
@ -312,6 +312,7 @@ testRun(void)
|
||||
TEST_ERROR_FMT(
|
||||
httpRequestResponse(httpRequestNewP(client, STRDEF("GET"), STRDEF("/")), false), HostConnectError,
|
||||
"unable to connect to 'localhost:%u (127.0.0.1)': [111] Connection refused\n"
|
||||
"[RETRY DETAIL OMITTED]\n"
|
||||
"[RETRY DETAIL OMITTED]",
|
||||
hrnServerPort(0));
|
||||
|
||||
|
@ -375,7 +375,11 @@ testRun(void)
|
||||
|
||||
// This address should not be in use in a test environment -- if it is the test will fail
|
||||
TEST_ASSIGN(client, sckClientNew(STRDEF("172.31.255.255"), hrnServerPort(0), 100, 100), "new client");
|
||||
TEST_ERROR_FMT(ioClientOpen(client), HostConnectError, "timeout connecting to '172.31.255.255:%u'", hrnServerPort(0));
|
||||
TEST_ERROR_FMT(
|
||||
ioClientOpen(client), HostConnectError,
|
||||
"timeout connecting to '172.31.255.255:%u'\n"
|
||||
"[RETRY DETAIL OMITTED]",
|
||||
hrnServerPort(0));
|
||||
|
||||
// -------------------------------------------------------------------------------------------------------------------------
|
||||
TEST_TITLE("sckServerAccept() returns NULL on interrupt");
|
||||
|
@ -23,18 +23,20 @@ testRun(void)
|
||||
TimeMSec begin = timeMSec();
|
||||
|
||||
TEST_ASSIGN(wait, waitNew(0), "new wait");
|
||||
TEST_RESULT_UINT(waitRemaining(wait), 0, " check remaining time");
|
||||
TEST_RESULT_UINT(wait->waitTime, 0, " check wait time");
|
||||
TEST_RESULT_UINT(wait->sleepTime, 0, " check sleep time");
|
||||
TEST_RESULT_UINT(wait->sleepPrevTime, 0, " check sleep prev time");
|
||||
TEST_RESULT_BOOL(waitMore(wait), false, " no wait more");
|
||||
|
||||
// -------------------------------------------------------------------------------------------------------------------------
|
||||
TEST_TITLE("100ms using only waitRemaining()");
|
||||
TEST_TITLE("100ms with retries after time expired");
|
||||
|
||||
TEST_ASSIGN(wait, waitNew(100), "new wait");
|
||||
sleepMSec(100);
|
||||
TEST_RESULT_UINT(waitRemaining(wait), 0, " check remaining time");
|
||||
|
||||
TEST_RESULT_BOOL(waitMore(wait), true, " time expired, first retry");
|
||||
TEST_RESULT_BOOL(waitMore(wait), true, " time expired, second retry");
|
||||
TEST_RESULT_BOOL(waitMore(wait), false, " time expired, retries expired");
|
||||
|
||||
// -------------------------------------------------------------------------------------------------------------------------
|
||||
TEST_TITLE("200ms wait");
|
||||
@ -42,13 +44,19 @@ testRun(void)
|
||||
begin = timeMSec();
|
||||
|
||||
TEST_ASSIGN(wait, waitNew(200), "new wait = 0.2 sec");
|
||||
TEST_RESULT_BOOL(waitRemaining(wait) <= 200, true, " check remaining time upper range");
|
||||
TEST_RESULT_BOOL(waitRemaining(wait) >= 150, true, " check remaining time lowe range");
|
||||
TEST_RESULT_UINT(wait->waitTime, 200, " check wait time");
|
||||
TEST_RESULT_UINT(wait->sleepTime, 20, " check sleep time");
|
||||
TEST_RESULT_UINT(wait->sleepPrevTime, 0, " check sleep prev time");
|
||||
TEST_RESULT_BOOL(wait->beginTime > (TimeMSec)1483228800000, true, " check begin time");
|
||||
|
||||
TEST_RESULT_BOOL(waitMore(wait), true, " first retry");
|
||||
TEST_RESULT_UINT(wait->retry, 1, " check retry");
|
||||
|
||||
TEST_RESULT_BOOL(waitMore(wait), true, " second retry");
|
||||
TEST_RESULT_UINT(wait->retry, 0, " check retry");
|
||||
|
||||
TEST_RESULT_BOOL(waitMore(wait), true, " still going because of time");
|
||||
|
||||
while (waitMore(wait));
|
||||
TimeMSec end = timeMSec();
|
||||
|
||||
|
@ -525,6 +525,7 @@ testRun(void)
|
||||
HRN_PQ_SCRIPT_CHECKPOINT(2),
|
||||
HRN_PQ_SCRIPT_CHECKPOINT_TARGET_REACHED_GE_10(2, "5/5", false, "5/4", 100),
|
||||
HRN_PQ_SCRIPT_CHECKPOINT_TARGET_REACHED_GE_10(2, "5/5", false, "5/4", 100),
|
||||
HRN_PQ_SCRIPT_CHECKPOINT_TARGET_REACHED_GE_10(2, "5/5", false, "5/4", 100),
|
||||
|
||||
// Wait for standby to sync
|
||||
HRN_PQ_SCRIPT_REPLAY_TARGET_REACHED_GE_10(2, "5/5", false, "5/3"),
|
||||
|
@ -153,6 +153,10 @@ testRun(void)
|
||||
{.function = HRN_PQ_SENDQUERY, .param = "[\"" TEST_QUERY "\"]", .resultInt = 1},
|
||||
{.function = HRN_PQ_CONSUMEINPUT, .sleep = 600},
|
||||
{.function = HRN_PQ_ISBUSY, .resultInt = 1},
|
||||
{.function = HRN_PQ_CONSUMEINPUT},
|
||||
{.function = HRN_PQ_ISBUSY, .resultInt = 1},
|
||||
{.function = HRN_PQ_CONSUMEINPUT},
|
||||
{.function = HRN_PQ_ISBUSY, .resultInt = 1},
|
||||
{.function = HRN_PQ_GETCANCEL},
|
||||
{.function = HRN_PQ_CANCEL, .resultInt = 1},
|
||||
{.function = HRN_PQ_FREECANCEL},
|
||||
@ -180,6 +184,8 @@ testRun(void)
|
||||
{.function = HRN_PQ_ISBUSY, .resultInt = 1},
|
||||
{.function = HRN_PQ_CONSUMEINPUT, .sleep = 300},
|
||||
{.function = HRN_PQ_ISBUSY, .resultInt = 1},
|
||||
{.function = HRN_PQ_CONSUMEINPUT},
|
||||
{.function = HRN_PQ_ISBUSY, .resultInt = 1},
|
||||
{.function = HRN_PQ_GETCANCEL},
|
||||
{.function = HRN_PQ_CANCEL, .resultInt = 0, .resultZ = TEST_PQ_ERROR},
|
||||
{.function = HRN_PQ_FREECANCEL});
|
||||
@ -202,6 +208,10 @@ testRun(void)
|
||||
{.function = HRN_PQ_SENDQUERY, .param = "[\"" TEST_QUERY "\"]", .resultInt = 1},
|
||||
{.function = HRN_PQ_CONSUMEINPUT, .sleep = 600},
|
||||
{.function = HRN_PQ_ISBUSY, .resultInt = 1},
|
||||
{.function = HRN_PQ_CONSUMEINPUT},
|
||||
{.function = HRN_PQ_ISBUSY, .resultInt = 1},
|
||||
{.function = HRN_PQ_CONSUMEINPUT},
|
||||
{.function = HRN_PQ_ISBUSY, .resultInt = 1},
|
||||
{.function = HRN_PQ_GETCANCEL, .resultNull = true});
|
||||
|
||||
TEST_ERROR(
|
||||
|
@ -2051,6 +2051,12 @@ testRun(void)
|
||||
{.function = HRNLIBSSH2_SFTP_STAT_EX, .param = "[\"" TEST_PATH "/missing\",0]",
|
||||
.resultInt = LIBSSH2_ERROR_SFTP_PROTOCOL},
|
||||
{.function = HRNLIBSSH2_SFTP_LAST_ERROR, .resultUInt = LIBSSH2_FX_NO_SUCH_FILE},
|
||||
{.function = HRNLIBSSH2_SFTP_STAT_EX, .param = "[\"" TEST_PATH "/missing\",0]",
|
||||
.resultInt = LIBSSH2_ERROR_SFTP_PROTOCOL},
|
||||
{.function = HRNLIBSSH2_SFTP_LAST_ERROR, .resultUInt = LIBSSH2_FX_NO_SUCH_FILE},
|
||||
{.function = HRNLIBSSH2_SFTP_STAT_EX, .param = "[\"" TEST_PATH "/missing\",0]",
|
||||
.resultInt = LIBSSH2_ERROR_SFTP_PROTOCOL},
|
||||
{.function = HRNLIBSSH2_SFTP_LAST_ERROR, .resultUInt = LIBSSH2_FX_NO_SUCH_FILE},
|
||||
// Path missing
|
||||
{.function = HRNLIBSSH2_SFTP_STAT_EX, .param = "[\"" TEST_PATH "\",0]",
|
||||
.attrPerms = LIBSSH2_SFTP_S_IFDIR, .resultInt = LIBSSH2_ERROR_NONE},
|
||||
|
Loading…
Reference in New Issue
Block a user