1
0
mirror of https://github.com/pgbackrest/pgbackrest.git synced 2025-11-06 08:49:29 +02:00

Fix regression in retries.

5314dbf aimed to make nested Wait objects more accurate with regard to wait time but it also got rid of the "bonus" retry that was implicit in the prior implementation. This meant that if an operation used up the entire allotted timeout, it would not be retried. Object stores especially are noisy places and some amount of retry should always be attempted. So even though removing the "bonus" retry was intended, it turned out not to be a good idea.

Instead of an implicit retry, formalize two retries in the Wait object even if the wait time has expired. Any number of retries are allowed during the wait period. Also remove waitRemaining() since it is no longer needed.

Adjust tests as needed to account for the extra timeouts.

Note that there may still be an underlying issue here that is simply being masked by retries. That is, the issue being expressed was that waiting for a socket to be writable was timing out, and without a retry that caused a hard error. This patch does nothing to address the source of the write timeout and perhaps there is nothing we can do about it. It does seem similar to the write issue we had with our blocking TLS implementation, but it was never clear if that was a problem with TLS, the kernel, or a bug in pgBackRest itself. It cropped up after a kernel update and we switched to non-blocking TLS to address the issue (c88684e).
This commit is contained in:
David Steele
2023-11-09 12:04:25 -03:00
committed by GitHub
parent 3c116e1829
commit fa5b2d44ad
11 changed files with 103 additions and 46 deletions

View File

@@ -97,7 +97,7 @@ sckClientOpen(THIS_VOID)
THROW_ON_SYS_ERROR(fd == -1, HostConnectError, "unable to create socket");
sckOptionSet(fd);
sckConnect(fd, this->host, this->port, addressFound, waitRemaining(wait));
sckConnect(fd, this->host, this->port, addressFound, this->timeoutConnect);
// Create the session
MEM_CONTEXT_PRIOR_BEGIN()
@@ -119,8 +119,7 @@ sckClientOpen(THIS_VOID)
errRetryAdd(errRetry);
// Increment address info index and reset if the end has been reached. When the end has been reached sleep for a bit
// to hopefully have better chance at succeeding, otherwise continue right to the next address as long as there is
// some time left.
// to hopefully have better chance at succeeding, otherwise continue right to the next address.
addrInfoIdx++;
if (addrInfoIdx >= addrInfoSize(addrInfo))
@@ -129,7 +128,7 @@ sckClientOpen(THIS_VOID)
retry = waitMore(wait);
}
else
retry = waitRemaining(wait) > 0;
retry = true;
// Error when no retries remain
if (!retry)

View File

@@ -16,6 +16,7 @@ struct Wait
TimeMSec sleepTime; // Next sleep time (in usec)
TimeMSec sleepPrevTime; // Previous time slept (in usec)
TimeMSec beginTime; // Time the wait began (in epoch usec)
unsigned int retry; // Retries remaining
};
/**********************************************************************************************************************************/
@@ -33,6 +34,7 @@ waitNew(const TimeMSec waitTime)
*this = (Wait)
{
.waitTime = waitTime,
.retry = 2,
};
// Calculate first sleep time -- start with 1/10th of a second for anything >= 1 second
@@ -52,31 +54,6 @@ waitNew(const TimeMSec waitTime)
FUNCTION_LOG_RETURN(WAIT, this);
}
/**********************************************************************************************************************************/
FN_EXTERN TimeMSec
waitRemaining(Wait *const this)
{
FUNCTION_TEST_BEGIN();
FUNCTION_TEST_PARAM(WAIT, this);
FUNCTION_TEST_END();
TimeMSec result = 0;
// If any wait time remains
if (this->sleepTime > 0)
{
// Returning remaining time, if any, else set sleepTime to 0 so next call to waitMore will return false
const TimeMSec elapsedTime = timeMSec() - this->beginTime;
if (elapsedTime < this->waitTime)
result = this->waitTime - elapsedTime;
else
this->sleepTime = 0;
}
FUNCTION_TEST_RETURN(TIME_MSEC, result);
}
/**********************************************************************************************************************************/
FN_EXTERN bool
waitMore(Wait *const this)
@@ -114,13 +91,26 @@ waitMore(Wait *const this)
// Store new sleep times
this->sleepPrevTime = this->sleepTime;
this->sleepTime = sleepTime;
// Need to wait more
result = true;
}
// Else set sleep to zero so next call will return false
// Else are there retries left?
else if (this->retry != 0)
{
// Sleep using the last calculated time
sleepMSec(this->sleepTime);
}
// Else set sleep to zero so call will return false
else
this->sleepTime = 0;
// Caller can continue processing
if (this->sleepTime > 0)
{
// Decrement retries
if (this->retry != 0)
this->retry--;
result = true;
}
}
FUNCTION_LOG_RETURN(BOOL, result);

View File

@@ -1,5 +1,10 @@
/***********************************************************************************************************************************
Wait Handler
Used for operations that may fail due to an error or some unexpected condition such as file missing. When waitMore() is called it
will wait (based on a Fibonacci backoff) before returning to give the error or condition time to clear. Even when the wait time has
expired before waitMore() is called, there will still be two retries to compensate for operations that use up the entire time limit.
Any number of retries are allowed within the time limit.
***********************************************************************************************************************************/
#ifndef COMMON_WAIT_H
#define COMMON_WAIT_H
@@ -17,16 +22,10 @@ Constructors
***********************************************************************************************************************************/
FN_EXTERN Wait *waitNew(TimeMSec waitTime);
/***********************************************************************************************************************************
Getters/Setters
***********************************************************************************************************************************/
// How much time is remaining? Recalculated each time waitMore() is called.
FN_EXTERN TimeMSec waitRemaining(Wait *this);
/***********************************************************************************************************************************
Functions
***********************************************************************************************************************************/
// Wait and return whether the caller has more time left
// Wait and return true if the caller has more time/retries left
FN_EXTERN bool waitMore(Wait *this);
/***********************************************************************************************************************************