1
0
mirror of https://github.com/postgrespro/pg_probackup.git synced 2025-01-07 13:40:17 +02:00

PGPRO-1918: Consider lock_backup()'s result

This commit is contained in:
Arthur Zakirov 2019-02-18 16:55:46 +03:00
parent 3b91712e1a
commit 8960f2aae2
9 changed files with 81 additions and 54 deletions

View File

@ -976,8 +976,9 @@ do_backup(time_t start_time)
/* Create backup directory and BACKUP_CONTROL_FILE */
if (pgBackupCreateDir(&current))
elog(ERROR, "cannot create backup directory");
lock_backup(&current);
elog(ERROR, "Cannot create backup directory");
if (!lock_backup(&current))
elog(ERROR, "Cannot lock backup directory");
write_backup(&current);
elog(LOG, "Backup destination is initialized");

View File

@ -69,12 +69,13 @@ read_backup(time_t timestamp)
* status.
*/
void
write_backup_status(pgBackup *backup)
write_backup_status(pgBackup *backup, BackupStatus status)
{
pgBackup *tmp;
tmp = read_backup(backup->start_time);
backup->status = status;
tmp->status = backup->status;
write_backup(tmp);
@ -84,7 +85,7 @@ write_backup_status(pgBackup *backup)
/*
* Create exclusive lockfile in the backup's directory.
*/
void
bool
lock_backup(pgBackup *backup)
{
char lock_file[MAXPGPATH];
@ -149,7 +150,7 @@ lock_backup(pgBackup *backup)
* Couldn't create the pid file. Probably it already exists.
*/
if ((errno != EEXIST && errno != EACCES) || ntries > 100)
elog(ERROR, "could not create lock file \"%s\": %s",
elog(ERROR, "Could not create lock file \"%s\": %s",
lock_file, strerror(errno));
/*
@ -161,22 +162,22 @@ lock_backup(pgBackup *backup)
{
if (errno == ENOENT)
continue; /* race condition; try again */
elog(ERROR, "could not open lock file \"%s\": %s",
elog(ERROR, "Could not open lock file \"%s\": %s",
lock_file, strerror(errno));
}
if ((len = read(fd, buffer, sizeof(buffer) - 1)) < 0)
elog(ERROR, "could not read lock file \"%s\": %s",
elog(ERROR, "Could not read lock file \"%s\": %s",
lock_file, strerror(errno));
close(fd);
if (len == 0)
elog(ERROR, "lock file \"%s\" is empty", lock_file);
elog(ERROR, "Lock file \"%s\" is empty", lock_file);
buffer[len] = '\0';
encoded_pid = atoi(buffer);
if (encoded_pid <= 0)
elog(ERROR, "bogus data in lock file \"%s\": \"%s\"",
elog(ERROR, "Bogus data in lock file \"%s\": \"%s\"",
lock_file, buffer);
/*
@ -190,9 +191,21 @@ lock_backup(pgBackup *backup)
*/
if (encoded_pid != my_pid && encoded_pid != my_p_pid)
{
if (kill(encoded_pid, 0) == 0 ||
(errno != ESRCH && errno != EPERM))
elog(ERROR, "lock file \"%s\" already exists", lock_file);
if (kill(encoded_pid, 0) == 0)
{
elog(WARNING, "Process %d is using backup %s and still is running",
encoded_pid, base36enc(backup->start_time));
return false;
}
else
{
if (errno == ESRCH)
elog(WARNING, "Process %d which used backup %s no longer exists",
encoded_pid, base36enc(backup->start_time));
else
elog(ERROR, "Failed to send signal 0 to a process %d: %s",
encoded_pid, strerror(errno));
}
}
/*
@ -201,7 +214,7 @@ lock_backup(pgBackup *backup)
* would-be creators.
*/
if (unlink(lock_file) < 0)
elog(ERROR, "could not remove old lock file \"%s\": %s",
elog(ERROR, "Could not remove old lock file \"%s\": %s",
lock_file, strerror(errno));
}
@ -219,7 +232,7 @@ lock_backup(pgBackup *backup)
unlink(lock_file);
/* if write didn't set errno, assume problem is no disk space */
errno = save_errno ? save_errno : ENOSPC;
elog(ERROR, "could not write lock file \"%s\": %s",
elog(ERROR, "Could not write lock file \"%s\": %s",
lock_file, strerror(errno));
}
if (fsync(fd) != 0)
@ -229,7 +242,7 @@ lock_backup(pgBackup *backup)
close(fd);
unlink(lock_file);
errno = save_errno;
elog(ERROR, "could not write lock file \"%s\": %s",
elog(ERROR, "Could not write lock file \"%s\": %s",
lock_file, strerror(errno));
}
if (close(fd) != 0)
@ -238,7 +251,7 @@ lock_backup(pgBackup *backup)
unlink(lock_file);
errno = save_errno;
elog(ERROR, "could not write lock file \"%s\": %s",
elog(ERROR, "Culd not write lock file \"%s\": %s",
lock_file, strerror(errno));
}
@ -255,6 +268,8 @@ lock_backup(pgBackup *backup)
if (lock_files == NULL)
lock_files = parray_new();
parray_append(lock_files, pgut_strdup(lock_file));
return true;
}
/*
@ -418,7 +433,8 @@ catalog_lock_backup_list(parray *backup_list, int from_idx, int to_idx)
end_idx = Min(from_idx, to_idx);
for (i = start_idx; i >= end_idx; i--)
lock_backup((pgBackup *) parray_get(backup_list, i));
if (!lock_backup((pgBackup *) parray_get(backup_list, i)))
elog(ERROR, "Cannot lock backup directory");
}
/*

View File

@ -202,7 +202,12 @@ do_retention_purge(void)
continue;
}
lock_backup(backup);
/*
* If the backup is still in use, do not interrupt the purge; go to the
* next backup.
*/
if (!lock_backup(backup))
continue;
/* Delete backup and update status to DELETED */
delete_backup_files(backup);
@ -238,7 +243,7 @@ do_retention_purge(void)
if (backup_deleted)
elog(INFO, "Purging finished");
else
elog(INFO, "Nothing to delete by retention policy");
elog(INFO, "There are no backups to delete by retention policy");
return 0;
}
@ -275,8 +280,7 @@ delete_backup_files(pgBackup *backup)
* Update STATUS to BACKUP_STATUS_DELETING in preparation for the case which
* the error occurs before deleting all backup files.
*/
backup->status = BACKUP_STATUS_DELETING;
write_backup_status(backup);
write_backup_status(backup, BACKUP_STATUS_DELETING);
/* list files to be deleted */
files = parray_new();

View File

@ -227,11 +227,8 @@ merge_backups(pgBackup *to_backup, pgBackup *from_backup)
if (from_backup->status == BACKUP_STATUS_DELETING)
goto delete_source_backup;
to_backup->status = BACKUP_STATUS_MERGING;
write_backup_status(to_backup);
from_backup->status = BACKUP_STATUS_MERGING;
write_backup_status(from_backup);
write_backup_status(to_backup, BACKUP_STATUS_MERGING);
write_backup_status(from_backup, BACKUP_STATUS_MERGING);
create_data_directories(to_database_path, from_backup_path, false);

View File

@ -484,8 +484,7 @@ validate_backup_wal_from_start_to_stop(pgBackup *backup,
* If we don't have WAL between start_lsn and stop_lsn,
* the backup is definitely corrupted. Update its status.
*/
backup->status = BACKUP_STATUS_CORRUPT;
write_backup_status(backup);
write_backup_status(backup, BACKUP_STATUS_CORRUPT);
elog(WARNING, "There are not enough WAL records to consistenly restore "
"backup %s from START LSN: %X/%X to STOP LSN: %X/%X",

View File

@ -459,8 +459,8 @@ extern int do_validate_all(void);
/* in catalog.c */
extern pgBackup *read_backup(time_t timestamp);
extern void write_backup(pgBackup *backup);
extern void write_backup_status(pgBackup *backup);
extern void lock_backup(pgBackup *backup);
extern void write_backup_status(pgBackup *backup, BackupStatus status);
extern bool lock_backup(pgBackup *backup);
extern const char *pgBackupGetBackupMode(pgBackup *backup);

View File

@ -209,8 +209,7 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
{
if (backup->status == BACKUP_STATUS_OK)
{
backup->status = BACKUP_STATUS_ORPHAN;
write_backup_status(backup);
write_backup_status(backup, BACKUP_STATUS_ORPHAN);
elog(WARNING, "Backup %s is orphaned because his parent %s is missing",
base36enc(backup->start_time), missing_backup_id);
@ -242,8 +241,7 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
{
if (backup->status == BACKUP_STATUS_OK)
{
backup->status = BACKUP_STATUS_ORPHAN;
write_backup_status(backup);
write_backup_status(backup, BACKUP_STATUS_ORPHAN);
elog(WARNING,
"Backup %s is orphaned because his parent %s has status: %s",
@ -317,7 +315,10 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
{
tmp_backup = (pgBackup *) parray_get(parent_chain, i);
lock_backup(tmp_backup);
/* Do not interrupt, validate the next backup */
if (!lock_backup(tmp_backup))
continue;
pgBackupValidate(tmp_backup);
/* Maybe we should be more paranoid and check for !BACKUP_STATUS_OK? */
if (tmp_backup->status == BACKUP_STATUS_CORRUPT)
@ -360,8 +361,7 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
{
if (backup->status == BACKUP_STATUS_OK)
{
backup->status = BACKUP_STATUS_ORPHAN;
write_backup_status(backup);
write_backup_status(backup, BACKUP_STATUS_ORPHAN);
elog(WARNING, "Backup %s is orphaned because his parent %s has status: %s",
base36enc(backup->start_time),
@ -409,8 +409,8 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
* Backup was locked during validation if no-validate wasn't
* specified.
*/
if (rt->restore_no_validate)
lock_backup(backup);
if (rt->restore_no_validate && !lock_backup(backup))
elog(ERROR, "Cannot lock backup directory");
restore_backup(backup);
}

View File

@ -59,6 +59,15 @@ pgBackupValidate(pgBackup *backup)
"Please upgrade pg_probackup binary.",
PROGRAM_VERSION, base36enc(backup->start_time), backup->program_version);
if (backup->status == BACKUP_STATUS_RUNNING)
{
elog(WARNING, "Backup %s has status %s, change it to ERROR and skip validation",
base36enc(backup->start_time), status2str(backup->status));
write_backup_status(backup, BACKUP_STATUS_ERROR);
corrupted_backup_found = true;
return;
}
/* Revalidation is attempted for DONE, ORPHAN and CORRUPT backups */
if (backup->status != BACKUP_STATUS_OK &&
backup->status != BACKUP_STATUS_DONE &&
@ -143,8 +152,8 @@ pgBackupValidate(pgBackup *backup)
parray_free(files);
/* Update backup status */
backup->status = corrupted ? BACKUP_STATUS_CORRUPT : BACKUP_STATUS_OK;
write_backup_status(backup);
write_backup_status(backup, corrupted ? BACKUP_STATUS_CORRUPT :
BACKUP_STATUS_OK);
if (corrupted)
elog(WARNING, "Backup %s data files are corrupted", base36enc(backup->start_time));
@ -385,8 +394,7 @@ do_validate_instance(void)
/* orphanize current_backup */
if (current_backup->status == BACKUP_STATUS_OK)
{
current_backup->status = BACKUP_STATUS_ORPHAN;
write_backup_status(current_backup);
write_backup_status(current_backup, BACKUP_STATUS_ORPHAN);
elog(WARNING, "Backup %s is orphaned because his parent %s is missing",
base36enc(current_backup->start_time),
parent_backup_id);
@ -410,8 +418,7 @@ do_validate_instance(void)
/* orphanize current_backup */
if (current_backup->status == BACKUP_STATUS_OK)
{
current_backup->status = BACKUP_STATUS_ORPHAN;
write_backup_status(current_backup);
write_backup_status(current_backup, BACKUP_STATUS_ORPHAN);
elog(WARNING, "Backup %s is orphaned because his parent %s has status: %s",
base36enc(current_backup->start_time), parent_backup_id,
status2str(tmp_backup->status));
@ -435,7 +442,9 @@ do_validate_instance(void)
else
base_full_backup = current_backup;
lock_backup(current_backup);
/* Do not interrupt, validate the next backup */
if (!lock_backup(current_backup))
continue;
/* Validate backup files */
pgBackupValidate(current_backup);
@ -469,8 +478,7 @@ do_validate_instance(void)
{
if (backup->status == BACKUP_STATUS_OK)
{
backup->status = BACKUP_STATUS_ORPHAN;
write_backup_status(backup);
write_backup_status(backup, BACKUP_STATUS_ORPHAN);
elog(WARNING, "Backup %s is orphaned because his parent %s has status: %s",
base36enc(backup->start_time),
@ -522,7 +530,9 @@ do_validate_instance(void)
if (backup->status == BACKUP_STATUS_ORPHAN)
{
lock_backup(backup);
/* Do not interrupt, validate the next backup */
if (!lock_backup(backup))
continue;
/* Revalidate backup files */
pgBackupValidate(backup);

View File

@ -12,7 +12,7 @@ class LockingTest(ProbackupTest, unittest.TestCase):
# @unittest.skip("skip")
# @unittest.expectedFailure
def test_locking_running_1(self):
"""
"""
make node, take full backup, stop it in the middle
run validate, expect it to be successfully executed,
concurrent RUNNING backup with pid file and active process is legal
@ -46,7 +46,7 @@ class LockingTest(ProbackupTest, unittest.TestCase):
self.assertEqual(
'RUNNING', self.show_pb(backup_dir, 'node')[1]['status'])
self.validate_pb(backup_dir)
self.validate_pb(backup_dir, options=['--log-level-file=VERBOSE'])
self.assertEqual(
'OK', self.show_pb(backup_dir, 'node')[0]['status'])
@ -58,7 +58,7 @@ class LockingTest(ProbackupTest, unittest.TestCase):
self.del_test_dir(module_name, fname)
def test_locking_running_2(self):
"""
"""
make node, take full backup, stop it in the middle,
kill process so no cleanup is done - pid file is in place,
run validate, expect it to not be successfully executed,
@ -112,7 +112,7 @@ class LockingTest(ProbackupTest, unittest.TestCase):
self.del_test_dir(module_name, fname)
def test_locking_running_3(self):
"""
"""
make node, take full backup, stop it in the middle,
terminate process, delete pid file,
run validate, expect it to not be successfully executed,
@ -168,4 +168,4 @@ class LockingTest(ProbackupTest, unittest.TestCase):
'ERROR', self.show_pb(backup_dir, 'node')[1]['status'])
# Clean after yourself
self.del_test_dir(module_name, fname)
self.del_test_dir(module_name, fname)