You've already forked pgbackrest
mirror of
https://github.com/pgbackrest/pgbackrest.git
synced 2025-11-06 08:49:29 +02:00
Fix regression in retries.
5314dbf aimed to make nested Wait objects more accurate with regard to wait time but it also got rid of the "bonus" retry that was implicit in the prior implementation. This meant that if an operation used up the entire allotted timeout, it would not be retried. Object stores especially are noisy places and some amount of retry should always be attempted. So even though removing the "bonus" retry was intended, it turned out not to be a good idea. Instead of an implicit retry, formalize two retries in the Wait object even if the wait time has expired. Any number of retries are allowed during the wait period. Also remove waitRemaining() since it is no longer needed. Adjust tests as needed to account for the extra timeouts. Note that there may still be an underlying issue here that is simply being masked by retries. That is, the issue as expressed was that waiting for a socket to be writable was timing out, and without a retry that caused a hard error. This patch does nothing to address the source of the write timeout and perhaps there is nothing we can do about it. It does seem similar to the write issue we had with our blocking TLS implementation, but it was never clear if that was a problem with TLS, the kernel, or a bug in pgBackRest itself. It cropped up after a kernel update and we switched to non-blocking TLS to address the issue (c88684e).
This commit is contained in:
@@ -97,7 +97,7 @@ sckClientOpen(THIS_VOID)
|
||||
THROW_ON_SYS_ERROR(fd == -1, HostConnectError, "unable to create socket");
|
||||
|
||||
sckOptionSet(fd);
|
||||
sckConnect(fd, this->host, this->port, addressFound, waitRemaining(wait));
|
||||
sckConnect(fd, this->host, this->port, addressFound, this->timeoutConnect);
|
||||
|
||||
// Create the session
|
||||
MEM_CONTEXT_PRIOR_BEGIN()
|
||||
@@ -119,8 +119,7 @@ sckClientOpen(THIS_VOID)
|
||||
errRetryAdd(errRetry);
|
||||
|
||||
// Increment address info index and reset if the end has been reached. When the end has been reached sleep for a bit
|
||||
// to hopefully have better chance at succeeding, otherwise continue right to the next address as long as there is
|
||||
// some time left.
|
||||
// to hopefully have better chance at succeeding, otherwise continue right to the next address.
|
||||
addrInfoIdx++;
|
||||
|
||||
if (addrInfoIdx >= addrInfoSize(addrInfo))
|
||||
@@ -129,7 +128,7 @@ sckClientOpen(THIS_VOID)
|
||||
retry = waitMore(wait);
|
||||
}
|
||||
else
|
||||
retry = waitRemaining(wait) > 0;
|
||||
retry = true;
|
||||
|
||||
// Error when no retries remain
|
||||
if (!retry)
|
||||
|
||||
@@ -16,6 +16,7 @@ struct Wait
|
||||
TimeMSec sleepTime; // Next sleep time (in usec)
|
||||
TimeMSec sleepPrevTime; // Previous time slept (in usec)
|
||||
TimeMSec beginTime; // Time the wait began (in epoch usec)
|
||||
unsigned int retry; // Retries remaining
|
||||
};
|
||||
|
||||
/**********************************************************************************************************************************/
|
||||
@@ -33,6 +34,7 @@ waitNew(const TimeMSec waitTime)
|
||||
*this = (Wait)
|
||||
{
|
||||
.waitTime = waitTime,
|
||||
.retry = 2,
|
||||
};
|
||||
|
||||
// Calculate first sleep time -- start with 1/10th of a second for anything >= 1 second
|
||||
@@ -52,31 +54,6 @@ waitNew(const TimeMSec waitTime)
|
||||
FUNCTION_LOG_RETURN(WAIT, this);
|
||||
}
|
||||
|
||||
/**********************************************************************************************************************************/
|
||||
FN_EXTERN TimeMSec
|
||||
waitRemaining(Wait *const this)
|
||||
{
|
||||
FUNCTION_TEST_BEGIN();
|
||||
FUNCTION_TEST_PARAM(WAIT, this);
|
||||
FUNCTION_TEST_END();
|
||||
|
||||
TimeMSec result = 0;
|
||||
|
||||
// If any wait time remains
|
||||
if (this->sleepTime > 0)
|
||||
{
|
||||
// Returning remaining time, if any, else set sleepTime to 0 so next call to waitMore will return false
|
||||
const TimeMSec elapsedTime = timeMSec() - this->beginTime;
|
||||
|
||||
if (elapsedTime < this->waitTime)
|
||||
result = this->waitTime - elapsedTime;
|
||||
else
|
||||
this->sleepTime = 0;
|
||||
}
|
||||
|
||||
FUNCTION_TEST_RETURN(TIME_MSEC, result);
|
||||
}
|
||||
|
||||
/**********************************************************************************************************************************/
|
||||
FN_EXTERN bool
|
||||
waitMore(Wait *const this)
|
||||
@@ -114,13 +91,26 @@ waitMore(Wait *const this)
|
||||
// Store new sleep times
|
||||
this->sleepPrevTime = this->sleepTime;
|
||||
this->sleepTime = sleepTime;
|
||||
|
||||
// Need to wait more
|
||||
result = true;
|
||||
}
|
||||
// Else set sleep to zero so next call will return false
|
||||
// Else are there retries left?
|
||||
else if (this->retry != 0)
|
||||
{
|
||||
// Sleep using the last calculated time
|
||||
sleepMSec(this->sleepTime);
|
||||
}
|
||||
// Else set sleep to zero so call will return false
|
||||
else
|
||||
this->sleepTime = 0;
|
||||
|
||||
// Caller can continue processing
|
||||
if (this->sleepTime > 0)
|
||||
{
|
||||
// Decrement retries
|
||||
if (this->retry != 0)
|
||||
this->retry--;
|
||||
|
||||
result = true;
|
||||
}
|
||||
}
|
||||
|
||||
FUNCTION_LOG_RETURN(BOOL, result);
|
||||
|
||||
@@ -1,5 +1,10 @@
|
||||
/***********************************************************************************************************************************
|
||||
Wait Handler
|
||||
|
||||
Used for operations that may fail due to an error or some unexpected condition such as file missing. When waitMore() is called it
|
||||
will wait (based on a Fibonacci backoff) before returning to give the error or condition time to clear. Even when the wait time has
|
||||
expired before waitMore() is called, there will still be two retries to compensate for operations that use up the entire time limit.
|
||||
Any number of retries are allowed within the time limit.
|
||||
***********************************************************************************************************************************/
|
||||
#ifndef COMMON_WAIT_H
|
||||
#define COMMON_WAIT_H
|
||||
@@ -17,16 +22,10 @@ Constructors
|
||||
***********************************************************************************************************************************/
|
||||
FN_EXTERN Wait *waitNew(TimeMSec waitTime);
|
||||
|
||||
/***********************************************************************************************************************************
|
||||
Getters/Setters
|
||||
***********************************************************************************************************************************/
|
||||
// How much time is remaining? Recalculated each time waitMore() is called.
|
||||
FN_EXTERN TimeMSec waitRemaining(Wait *this);
|
||||
|
||||
/***********************************************************************************************************************************
|
||||
Functions
|
||||
***********************************************************************************************************************************/
|
||||
// Wait and return whether the caller has more time left
|
||||
// Wait and return true if the caller has more time/retries left
|
||||
FN_EXTERN bool waitMore(Wait *this);
|
||||
|
||||
/***********************************************************************************************************************************
|
||||
|
||||
Reference in New Issue
Block a user