
Merge pull request #309 from postgrespro/issue_308

[Issue #308] Wait on empty exclusive lock file
Authored by gsmol on 2021-02-02 14:43:28 -08:00; committed by GitHub
4 changed files with 105 additions and 59 deletions


@@ -166,6 +166,9 @@ write_backup_status(pgBackup *backup, BackupStatus status,
*
* TODO: lock-timeout as parameter
* TODO: we must think about more fine grain unlock mechanism - separate unlock_backup() function.
* TODO: more accurate naming
* -> exclusive lock -> acquire HW_LATCH and wait until all LW_LATCH`es are clear
* -> shared lock -> acquire HW_LATCH, acquire LW_LATCH, release HW_LATCH
*/
bool
lock_backup(pgBackup *backup, bool strict, bool exclusive)
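The TODO above sketches the intended naming: an "exclusive" lock means taking the heavy-weight latch and waiting until every light-weight latch is released, while a "shared" lock takes the heavy-weight latch only long enough to register a light-weight one and then drops it. A minimal illustration of that scheme follows; it is not part of this commit, and the latch helpers are hypothetical stand-ins for the lock-file manipulation done by lock_backup_exclusive() and the read-only lock code.

    /* Illustration only: hypothetical helpers standing in for the lock files */
    typedef struct pgBackup pgBackup;
    extern bool take_hw_latch(pgBackup *backup);               /* hypothetical */
    extern void release_hw_latch(pgBackup *backup);            /* hypothetical */
    extern void register_lw_latch(pgBackup *backup);           /* hypothetical */
    extern void wait_until_lw_latches_clear(pgBackup *backup); /* hypothetical */

    /* exclusive lock: acquire HW_LATCH and wait until all LW_LATCHes are clear */
    static bool
    acquire_exclusive(pgBackup *backup)
    {
        if (!take_hw_latch(backup))
            return false;
        wait_until_lw_latches_clear(backup);
        return true;
    }

    /* shared lock: acquire HW_LATCH, acquire LW_LATCH, release HW_LATCH */
    static bool
    acquire_shared(pgBackup *backup)
    {
        if (!take_hw_latch(backup))
            return false;
        register_lw_latch(backup);
        release_hw_latch(backup);
        return true;
    }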
@@ -205,7 +208,7 @@ lock_backup(pgBackup *backup, bool strict, bool exclusive)
{
/* release exclusive lock */
if (fio_unlink(lock_file, FIO_BACKUP_HOST) < 0)
elog(ERROR, "Could not remove old lock file \"%s\": %s",
elog(ERROR, "Could not remove exclusive lock file \"%s\": %s",
lock_file, strerror(errno));
/* we are done */
@@ -261,48 +264,16 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
int fd = 0;
char buffer[MAXPGPATH * 2 + 256];
int ntries = LOCK_TIMEOUT;
int log_freq = ntries / 5;
int empty_tries = LOCK_STALE_TIMEOUT;
int len;
int encoded_pid;
pid_t my_p_pid;
join_path_components(lock_file, backup->root_dir, BACKUP_LOCK_FILE);
/*
* TODO: is this stuff with ppid below is relevant for us ?
*
* If the PID in the lockfile is our own PID or our parent's or
* grandparent's PID, then the file must be stale (probably left over from
* a previous system boot cycle). We need to check this because of the
* likelihood that a reboot will assign exactly the same PID as we had in
* the previous reboot, or one that's only one or two counts larger and
* hence the lockfile's PID now refers to an ancestor shell process. We
* allow pg_ctl to pass down its parent shell PID (our grandparent PID)
* via the environment variable PG_GRANDPARENT_PID; this is so that
* launching the postmaster via pg_ctl can be just as reliable as
* launching it directly. There is no provision for detecting
* further-removed ancestor processes, but if the init script is written
* carefully then all but the immediate parent shell will be root-owned
* processes and so the kill test will fail with EPERM. Note that we
* cannot get a false negative this way, because an existing postmaster
* would surely never launch a competing postmaster or pg_ctl process
* directly.
*/
#ifndef WIN32
my_p_pid = getppid();
#else
/*
* Windows hasn't got getppid(), but doesn't need it since it's not using
* real kill() either...
*/
my_p_pid = 0;
#endif
/*
* We need a loop here because of race conditions. But don't loop forever
* (for example, a non-writable $backup_instance_path directory might cause a failure
* that won't go away). 100 tries seems like plenty.
* that won't go away).
*/
do
{
@@ -351,13 +322,38 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
fclose(fp_out);
/*
* It should be possible only as a result of system crash,
* so its hypothetical owner should be dead by now
* There are several possible reasons for lock file
* to be empty:
* - system crash
* - process crash
* - race between writer and reader
*
* Consider empty file to be stale after LOCK_STALE_TIMEOUT attempts.
*
* TODO: alternatively we can write into temp file (lock_file_%pid),
* rename it and then re-read the lock file to make sure
* that we have successfully acquired the lock.
*/
if (len == 0)
{
elog(WARNING, "Lock file \"%s\" is empty", lock_file);
goto grab_lock;
if (empty_tries == 0)
{
elog(WARNING, "Lock file \"%s\" is empty", lock_file);
goto grab_lock;
}
if ((empty_tries % LOG_FREQ) == 0)
elog(WARNING, "Waiting %u seconds on empty exclusive lock for backup %s",
empty_tries, base36enc(backup->start_time));
sleep(1);
/*
* waiting on empty lock file should not affect
* the timer for concurrent lockers (ntries).
*/
empty_tries--;
ntries++;
continue;
}
encoded_pid = atoi(buffer);
@@ -371,24 +367,23 @@ lock_backup_exclusive(pgBackup *backup, bool strict)
/*
* Check to see if the other process still exists
*
* Per discussion above, my_pid, my_p_pid can be
* ignored as false matches.
*
* Normally kill() will fail with ESRCH if the given PID doesn't
* exist.
*/
if (encoded_pid != my_pid && encoded_pid != my_p_pid)
if (encoded_pid == my_pid)
return 0;
else
{
if (kill(encoded_pid, 0) == 0)
{
/* complain every fifth interval */
if ((ntries % log_freq) == 0)
if ((ntries % LOG_FREQ) == 0)
{
elog(WARNING, "Process %d is using backup %s, and is still running",
encoded_pid, base36enc(backup->start_time));
elog(WARNING, "Waiting %u seconds on lock for backup %s", ntries, base36enc(backup->start_time));
elog(WARNING, "Waiting %u seconds on exclusive lock for backup %s",
ntries, base36enc(backup->start_time));
}
sleep(1);
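The TODO earlier in this hunk points at an alternative to waiting on an empty file: write the PID into a per-process temporary file, rename() it over the lock file (rename is atomic on POSIX filesystems, so readers can never observe a half-written lock file), and then re-read the lock file to confirm ownership, since a concurrent rename may win. A rough, self-contained sketch of that idea, assuming a local filesystem and simplified error handling; this is not the code path added by this commit:

    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    /* Sketch of the TODO: write our PID to "<lock>.<pid>", rename() it over the
     * lock file, then re-read the lock file to check that our PID won. */
    static int
    lock_via_rename(const char *lock_path)
    {
        char  tmp_path[4096];
        char  buf[64];
        FILE *fp;
        long  owner;

        snprintf(tmp_path, sizeof(tmp_path), "%s.%d", lock_path, (int) getpid());

        fp = fopen(tmp_path, "w");
        if (fp == NULL)
            return -1;
        fprintf(fp, "%d\n", (int) getpid());
        fclose(fp);

        /* atomic replacement: the lock file is never seen empty or truncated */
        if (rename(tmp_path, lock_path) != 0)
        {
            unlink(tmp_path);
            return -1;
        }

        /* re-read to verify that a concurrent locker did not overwrite us */
        fp = fopen(lock_path, "r");
        if (fp == NULL || fgets(buf, sizeof(buf), fp) == NULL)
        {
            if (fp)
                fclose(fp);
            return -1;
        }
        fclose(fp);

        owner = strtol(buf, NULL, 10);
        return (owner == (long) getpid()) ? 0 : -1;
    }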
@@ -435,7 +430,7 @@ grab_lock:
errno = 0;
if (fio_write(fd, buffer, strlen(buffer)) != strlen(buffer))
{
int save_errno = errno;
fio_close(fd);
fio_unlink(lock_file, FIO_BACKUP_HOST);
@@ -453,7 +448,7 @@ grab_lock:
if (fio_flush(fd) != 0)
{
int save_errno = errno;
fio_close(fd);
fio_unlink(lock_file, FIO_BACKUP_HOST);
@@ -471,7 +466,7 @@ grab_lock:
if (fio_close(fd) != 0)
{
int save_errno = errno;
fio_unlink(lock_file, FIO_BACKUP_HOST);
@@ -482,6 +477,10 @@ grab_lock:
lock_file, strerror(save_errno));
}
// elog(LOG, "Acquired exclusive lock for backup %s after %ds",
// base36enc(backup->start_time),
// LOCK_TIMEOUT - ntries + LOCK_STALE_TIMEOUT - empty_tries);
return 0;
}
@@ -493,7 +492,6 @@ wait_read_only_owners(pgBackup *backup)
char buffer[256];
pid_t encoded_pid;
int ntries = LOCK_TIMEOUT;
int log_freq = ntries / 5;
char lock_file[MAXPGPATH];
join_path_components(lock_file, backup->root_dir, BACKUP_RO_LOCK_FILE);
@@ -523,7 +521,7 @@ wait_read_only_owners(pgBackup *backup)
{
if (kill(encoded_pid, 0) == 0)
{
if ((ntries % log_freq) == 0)
if ((ntries % LOG_FREQ) == 0)
{
elog(WARNING, "Process %d is using backup %s in read only mode, and is still running",
encoded_pid, base36enc(backup->start_time));


@@ -79,7 +79,9 @@ extern const char *PROGRAM_EMAIL;
/* Timeout defaults */
#define ARCHIVE_TIMEOUT_DEFAULT 300
#define REPLICA_TIMEOUT_DEFAULT 300
#define LOCK_TIMEOUT 30
#define LOCK_TIMEOUT 60
#define LOCK_STALE_TIMEOUT 30
#define LOG_FREQ 10
/* Directory/File permission */
#define DIR_PERMISSION (0700)
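With the new defaults the budgets combine roughly as follows: a live concurrent locker is retried for LOCK_TIMEOUT one-second attempts, an empty lock file gets LOCK_STALE_TIMEOUT additional attempts (the loop in lock_backup_exclusive() increments ntries back, so the empty-file wait does not consume the live-locker budget), and a progress warning is printed about every LOG_FREQ attempts. A tiny self-contained program spelling out that arithmetic; the one-second-per-attempt assumption comes from the sleep(1) calls in the lock loop:

    #include <stdio.h>

    #define LOCK_TIMEOUT       60
    #define LOCK_STALE_TIMEOUT 30
    #define LOG_FREQ           10

    int main(void)
    {
        /* each retry sleeps one second, so attempts map directly to seconds */
        printf("max wait on a live lock owner : %d s\n", LOCK_TIMEOUT);
        printf("max wait on an empty lock file: %d s\n", LOCK_STALE_TIMEOUT);
        printf("progress warning every        : %d s\n", LOG_FREQ);
        printf("combined worst case           : about %d s\n",
               LOCK_TIMEOUT + LOCK_STALE_TIMEOUT);
        return 0;
    }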


@@ -757,7 +757,7 @@ class ProbackupTest(object):
return GDBobj([binary_path] + command, self.verbose)
if asynchronous:
return subprocess.Popen(
self.cmd,
[binary_path] + command,
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
env=env
@@ -1133,8 +1133,8 @@ class ProbackupTest(object):
exit(1)
def validate_pb(
self, backup_dir, instance=None,
backup_id=None, options=[], old_binary=False, gdb=False
self, backup_dir, instance=None, backup_id=None,
options=[], old_binary=False, gdb=False, asynchronous=False
):
cmd_list = [
@@ -1146,11 +1146,11 @@ class ProbackupTest(object):
if backup_id:
cmd_list += ['-i', backup_id]
return self.run_pb(cmd_list + options, old_binary=old_binary, gdb=gdb)
return self.run_pb(cmd_list + options, old_binary=old_binary, gdb=gdb, asynchronous=asynchronous)
def delete_pb(
self, backup_dir, instance,
backup_id=None, options=[], old_binary=False, gdb=False):
self, backup_dir, instance, backup_id=None,
options=[], old_binary=False, gdb=False, asynchronous=False):
cmd_list = [
'delete',
'-B', backup_dir
@@ -1160,7 +1160,7 @@ class ProbackupTest(object):
if backup_id:
cmd_list += ['-i', backup_id]
return self.run_pb(cmd_list + options, old_binary=old_binary, gdb=gdb)
return self.run_pb(cmd_list + options, old_binary=old_binary, gdb=gdb, asynchronous=asynchronous)
def delete_expired(
self, backup_dir, instance, options=[], old_binary=False):


@@ -535,6 +535,52 @@ class LockingTest(ProbackupTest, unittest.TestCase):
# Clean after yourself
self.del_test_dir(module_name, fname)
def test_empty_lock_file(self):
"""
https://github.com/postgrespro/pg_probackup/issues/308
"""
fname = self.id().split('.')[3]
node = self.make_simple_node(
base_dir=os.path.join(module_name, fname, 'node'),
initdb_params=['--data-checksums'])
backup_dir = os.path.join(self.tmp_path, module_name, fname, 'backup')
self.init_pb(backup_dir)
self.add_instance(backup_dir, 'node', node)
self.set_archiving(backup_dir, 'node', node)
node.slow_start()
# Fill with data
node.pgbench_init(scale=100)
# FULL
backup_id = self.backup_node(backup_dir, 'node', node)
lockfile = os.path.join(backup_dir, 'backups', 'node', backup_id, 'backup.pid')
with open(lockfile, "w+") as f:
f.truncate()
out = self.validate_pb(backup_dir, 'node', backup_id)
self.assertIn(
"Waiting 30 seconds on empty exclusive lock for backup", out)
# lockfile = os.path.join(backup_dir, 'backups', 'node', backup_id, 'backup.pid')
# with open(lockfile, "w+") as f:
# f.truncate()
#
# p1 = self.validate_pb(backup_dir, 'node', backup_id, asynchronous=True,
# options=['--log-level-file=LOG', '--log-filename=validate.log'])
# sleep(3)
# p2 = self.delete_pb(backup_dir, 'node', backup_id, asynchronous=True,
# options=['--log-level-file=LOG', '--log-filename=delete.log'])
#
# p1.wait()
# p2.wait()
# Clean after yourself
self.del_test_dir(module_name, fname)
# TODO:
# test that concurrent validation and restore are not locking each other
# check that quick exclusive lock, when taking RO-lock, is really quick