mirror of
https://github.com/postgrespro/pg_probackup.git
synced 2025-01-07 13:40:17 +02:00
PGPRO-1918: Consider lock_backup()'s result
This commit is contained in:
parent
3b91712e1a
commit
8960f2aae2
@ -976,8 +976,9 @@ do_backup(time_t start_time)
|
||||
|
||||
/* Create backup directory and BACKUP_CONTROL_FILE */
|
||||
if (pgBackupCreateDir(&current))
|
||||
elog(ERROR, "cannot create backup directory");
|
||||
lock_backup(&current);
|
||||
elog(ERROR, "Cannot create backup directory");
|
||||
if (!lock_backup(&current))
|
||||
elog(ERROR, "Cannot lock backup directory");
|
||||
write_backup(&current);
|
||||
|
||||
elog(LOG, "Backup destination is initialized");
|
||||
|
@ -69,12 +69,13 @@ read_backup(time_t timestamp)
|
||||
* status.
|
||||
*/
|
||||
void
|
||||
write_backup_status(pgBackup *backup)
|
||||
write_backup_status(pgBackup *backup, BackupStatus status)
|
||||
{
|
||||
pgBackup *tmp;
|
||||
|
||||
tmp = read_backup(backup->start_time);
|
||||
|
||||
backup->status = status;
|
||||
tmp->status = backup->status;
|
||||
write_backup(tmp);
|
||||
|
||||
@ -84,7 +85,7 @@ write_backup_status(pgBackup *backup)
|
||||
/*
|
||||
* Create exclusive lockfile in the backup's directory.
|
||||
*/
|
||||
void
|
||||
bool
|
||||
lock_backup(pgBackup *backup)
|
||||
{
|
||||
char lock_file[MAXPGPATH];
|
||||
@ -149,7 +150,7 @@ lock_backup(pgBackup *backup)
|
||||
* Couldn't create the pid file. Probably it already exists.
|
||||
*/
|
||||
if ((errno != EEXIST && errno != EACCES) || ntries > 100)
|
||||
elog(ERROR, "could not create lock file \"%s\": %s",
|
||||
elog(ERROR, "Could not create lock file \"%s\": %s",
|
||||
lock_file, strerror(errno));
|
||||
|
||||
/*
|
||||
@ -161,22 +162,22 @@ lock_backup(pgBackup *backup)
|
||||
{
|
||||
if (errno == ENOENT)
|
||||
continue; /* race condition; try again */
|
||||
elog(ERROR, "could not open lock file \"%s\": %s",
|
||||
elog(ERROR, "Could not open lock file \"%s\": %s",
|
||||
lock_file, strerror(errno));
|
||||
}
|
||||
if ((len = read(fd, buffer, sizeof(buffer) - 1)) < 0)
|
||||
elog(ERROR, "could not read lock file \"%s\": %s",
|
||||
elog(ERROR, "Could not read lock file \"%s\": %s",
|
||||
lock_file, strerror(errno));
|
||||
close(fd);
|
||||
|
||||
if (len == 0)
|
||||
elog(ERROR, "lock file \"%s\" is empty", lock_file);
|
||||
elog(ERROR, "Lock file \"%s\" is empty", lock_file);
|
||||
|
||||
buffer[len] = '\0';
|
||||
encoded_pid = atoi(buffer);
|
||||
|
||||
if (encoded_pid <= 0)
|
||||
elog(ERROR, "bogus data in lock file \"%s\": \"%s\"",
|
||||
elog(ERROR, "Bogus data in lock file \"%s\": \"%s\"",
|
||||
lock_file, buffer);
|
||||
|
||||
/*
|
||||
@ -190,9 +191,21 @@ lock_backup(pgBackup *backup)
|
||||
*/
|
||||
if (encoded_pid != my_pid && encoded_pid != my_p_pid)
|
||||
{
|
||||
if (kill(encoded_pid, 0) == 0 ||
|
||||
(errno != ESRCH && errno != EPERM))
|
||||
elog(ERROR, "lock file \"%s\" already exists", lock_file);
|
||||
if (kill(encoded_pid, 0) == 0)
|
||||
{
|
||||
elog(WARNING, "Process %d is using backup %s and still is running",
|
||||
encoded_pid, base36enc(backup->start_time));
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (errno == ESRCH)
|
||||
elog(WARNING, "Process %d which used backup %s no longer exists",
|
||||
encoded_pid, base36enc(backup->start_time));
|
||||
else
|
||||
elog(ERROR, "Failed to send signal 0 to a process %d: %s",
|
||||
encoded_pid, strerror(errno));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@ -201,7 +214,7 @@ lock_backup(pgBackup *backup)
|
||||
* would-be creators.
|
||||
*/
|
||||
if (unlink(lock_file) < 0)
|
||||
elog(ERROR, "could not remove old lock file \"%s\": %s",
|
||||
elog(ERROR, "Could not remove old lock file \"%s\": %s",
|
||||
lock_file, strerror(errno));
|
||||
}
|
||||
|
||||
@ -219,7 +232,7 @@ lock_backup(pgBackup *backup)
|
||||
unlink(lock_file);
|
||||
/* if write didn't set errno, assume problem is no disk space */
|
||||
errno = save_errno ? save_errno : ENOSPC;
|
||||
elog(ERROR, "could not write lock file \"%s\": %s",
|
||||
elog(ERROR, "Could not write lock file \"%s\": %s",
|
||||
lock_file, strerror(errno));
|
||||
}
|
||||
if (fsync(fd) != 0)
|
||||
@ -229,7 +242,7 @@ lock_backup(pgBackup *backup)
|
||||
close(fd);
|
||||
unlink(lock_file);
|
||||
errno = save_errno;
|
||||
elog(ERROR, "could not write lock file \"%s\": %s",
|
||||
elog(ERROR, "Could not write lock file \"%s\": %s",
|
||||
lock_file, strerror(errno));
|
||||
}
|
||||
if (close(fd) != 0)
|
||||
@ -238,7 +251,7 @@ lock_backup(pgBackup *backup)
|
||||
|
||||
unlink(lock_file);
|
||||
errno = save_errno;
|
||||
elog(ERROR, "could not write lock file \"%s\": %s",
|
||||
elog(ERROR, "Culd not write lock file \"%s\": %s",
|
||||
lock_file, strerror(errno));
|
||||
}
|
||||
|
||||
@ -255,6 +268,8 @@ lock_backup(pgBackup *backup)
|
||||
if (lock_files == NULL)
|
||||
lock_files = parray_new();
|
||||
parray_append(lock_files, pgut_strdup(lock_file));
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -418,7 +433,8 @@ catalog_lock_backup_list(parray *backup_list, int from_idx, int to_idx)
|
||||
end_idx = Min(from_idx, to_idx);
|
||||
|
||||
for (i = start_idx; i >= end_idx; i--)
|
||||
lock_backup((pgBackup *) parray_get(backup_list, i));
|
||||
if (!lock_backup((pgBackup *) parray_get(backup_list, i)))
|
||||
elog(ERROR, "Cannot lock backup directory");
|
||||
}
|
||||
|
||||
/*
|
||||
|
12
src/delete.c
12
src/delete.c
@ -202,7 +202,12 @@ do_retention_purge(void)
|
||||
continue;
|
||||
}
|
||||
|
||||
lock_backup(backup);
|
||||
/*
|
||||
* If the backup is still in use, do not interrupt; go to the next
|
||||
* backup.
|
||||
*/
|
||||
if (!lock_backup(backup))
|
||||
continue;
|
||||
|
||||
/* Delete backup and update status to DELETED */
|
||||
delete_backup_files(backup);
|
||||
@ -238,7 +243,7 @@ do_retention_purge(void)
|
||||
if (backup_deleted)
|
||||
elog(INFO, "Purging finished");
|
||||
else
|
||||
elog(INFO, "Nothing to delete by retention policy");
|
||||
elog(INFO, "There are no backups to delete by retention policy");
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -275,8 +280,7 @@ delete_backup_files(pgBackup *backup)
|
||||
* Update STATUS to BACKUP_STATUS_DELETING in preparation for the case which
|
||||
* the error occurs before deleting all backup files.
|
||||
*/
|
||||
backup->status = BACKUP_STATUS_DELETING;
|
||||
write_backup_status(backup);
|
||||
write_backup_status(backup, BACKUP_STATUS_DELETING);
|
||||
|
||||
/* list files to be deleted */
|
||||
files = parray_new();
|
||||
|
@ -227,11 +227,8 @@ merge_backups(pgBackup *to_backup, pgBackup *from_backup)
|
||||
if (from_backup->status == BACKUP_STATUS_DELETING)
|
||||
goto delete_source_backup;
|
||||
|
||||
to_backup->status = BACKUP_STATUS_MERGING;
|
||||
write_backup_status(to_backup);
|
||||
|
||||
from_backup->status = BACKUP_STATUS_MERGING;
|
||||
write_backup_status(from_backup);
|
||||
write_backup_status(to_backup, BACKUP_STATUS_MERGING);
|
||||
write_backup_status(from_backup, BACKUP_STATUS_MERGING);
|
||||
|
||||
create_data_directories(to_database_path, from_backup_path, false);
|
||||
|
||||
|
@ -484,8 +484,7 @@ validate_backup_wal_from_start_to_stop(pgBackup *backup,
|
||||
* If we don't have WAL between start_lsn and stop_lsn,
|
||||
* the backup is definitely corrupted. Update its status.
|
||||
*/
|
||||
backup->status = BACKUP_STATUS_CORRUPT;
|
||||
write_backup_status(backup);
|
||||
write_backup_status(backup, BACKUP_STATUS_CORRUPT);
|
||||
|
||||
elog(WARNING, "There are not enough WAL records to consistenly restore "
|
||||
"backup %s from START LSN: %X/%X to STOP LSN: %X/%X",
|
||||
|
@ -459,8 +459,8 @@ extern int do_validate_all(void);
|
||||
/* in catalog.c */
|
||||
extern pgBackup *read_backup(time_t timestamp);
|
||||
extern void write_backup(pgBackup *backup);
|
||||
extern void write_backup_status(pgBackup *backup);
|
||||
extern void lock_backup(pgBackup *backup);
|
||||
extern void write_backup_status(pgBackup *backup, BackupStatus status);
|
||||
extern bool lock_backup(pgBackup *backup);
|
||||
|
||||
extern const char *pgBackupGetBackupMode(pgBackup *backup);
|
||||
|
||||
|
@ -209,8 +209,7 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
|
||||
{
|
||||
if (backup->status == BACKUP_STATUS_OK)
|
||||
{
|
||||
backup->status = BACKUP_STATUS_ORPHAN;
|
||||
write_backup_status(backup);
|
||||
write_backup_status(backup, BACKUP_STATUS_ORPHAN);
|
||||
|
||||
elog(WARNING, "Backup %s is orphaned because his parent %s is missing",
|
||||
base36enc(backup->start_time), missing_backup_id);
|
||||
@ -242,8 +241,7 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
|
||||
{
|
||||
if (backup->status == BACKUP_STATUS_OK)
|
||||
{
|
||||
backup->status = BACKUP_STATUS_ORPHAN;
|
||||
write_backup_status(backup);
|
||||
write_backup_status(backup, BACKUP_STATUS_ORPHAN);
|
||||
|
||||
elog(WARNING,
|
||||
"Backup %s is orphaned because his parent %s has status: %s",
|
||||
@ -317,7 +315,10 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
|
||||
{
|
||||
tmp_backup = (pgBackup *) parray_get(parent_chain, i);
|
||||
|
||||
lock_backup(tmp_backup);
|
||||
/* Do not interrupt, validate the next backup */
|
||||
if (!lock_backup(tmp_backup))
|
||||
continue;
|
||||
|
||||
pgBackupValidate(tmp_backup);
|
||||
/* Maybe we should be more paranoid and check for !BACKUP_STATUS_OK? */
|
||||
if (tmp_backup->status == BACKUP_STATUS_CORRUPT)
|
||||
@ -360,8 +361,7 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
|
||||
{
|
||||
if (backup->status == BACKUP_STATUS_OK)
|
||||
{
|
||||
backup->status = BACKUP_STATUS_ORPHAN;
|
||||
write_backup_status(backup);
|
||||
write_backup_status(backup, BACKUP_STATUS_ORPHAN);
|
||||
|
||||
elog(WARNING, "Backup %s is orphaned because his parent %s has status: %s",
|
||||
base36enc(backup->start_time),
|
||||
@ -409,8 +409,8 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
|
||||
* Backup was locked during validation if no-validate wasn't
|
||||
* specified.
|
||||
*/
|
||||
if (rt->restore_no_validate)
|
||||
lock_backup(backup);
|
||||
if (rt->restore_no_validate && !lock_backup(backup))
|
||||
elog(ERROR, "Cannot lock backup directory");
|
||||
|
||||
restore_backup(backup);
|
||||
}
|
||||
|
@ -59,6 +59,15 @@ pgBackupValidate(pgBackup *backup)
|
||||
"Please upgrade pg_probackup binary.",
|
||||
PROGRAM_VERSION, base36enc(backup->start_time), backup->program_version);
|
||||
|
||||
if (backup->status == BACKUP_STATUS_RUNNING)
|
||||
{
|
||||
elog(WARNING, "Backup %s has status %s, change it to ERROR and skip validation",
|
||||
base36enc(backup->start_time), status2str(backup->status));
|
||||
write_backup_status(backup, BACKUP_STATUS_ERROR);
|
||||
corrupted_backup_found = true;
|
||||
return;
|
||||
}
|
||||
|
||||
/* Revalidation is attempted for DONE, ORPHAN and CORRUPT backups */
|
||||
if (backup->status != BACKUP_STATUS_OK &&
|
||||
backup->status != BACKUP_STATUS_DONE &&
|
||||
@ -143,8 +152,8 @@ pgBackupValidate(pgBackup *backup)
|
||||
parray_free(files);
|
||||
|
||||
/* Update backup status */
|
||||
backup->status = corrupted ? BACKUP_STATUS_CORRUPT : BACKUP_STATUS_OK;
|
||||
write_backup_status(backup);
|
||||
write_backup_status(backup, corrupted ? BACKUP_STATUS_CORRUPT :
|
||||
BACKUP_STATUS_OK);
|
||||
|
||||
if (corrupted)
|
||||
elog(WARNING, "Backup %s data files are corrupted", base36enc(backup->start_time));
|
||||
@ -385,8 +394,7 @@ do_validate_instance(void)
|
||||
/* orphanize current_backup */
|
||||
if (current_backup->status == BACKUP_STATUS_OK)
|
||||
{
|
||||
current_backup->status = BACKUP_STATUS_ORPHAN;
|
||||
write_backup_status(current_backup);
|
||||
write_backup_status(current_backup, BACKUP_STATUS_ORPHAN);
|
||||
elog(WARNING, "Backup %s is orphaned because his parent %s is missing",
|
||||
base36enc(current_backup->start_time),
|
||||
parent_backup_id);
|
||||
@ -410,8 +418,7 @@ do_validate_instance(void)
|
||||
/* orphanize current_backup */
|
||||
if (current_backup->status == BACKUP_STATUS_OK)
|
||||
{
|
||||
current_backup->status = BACKUP_STATUS_ORPHAN;
|
||||
write_backup_status(current_backup);
|
||||
write_backup_status(current_backup, BACKUP_STATUS_ORPHAN);
|
||||
elog(WARNING, "Backup %s is orphaned because his parent %s has status: %s",
|
||||
base36enc(current_backup->start_time), parent_backup_id,
|
||||
status2str(tmp_backup->status));
|
||||
@ -435,7 +442,9 @@ do_validate_instance(void)
|
||||
else
|
||||
base_full_backup = current_backup;
|
||||
|
||||
lock_backup(current_backup);
|
||||
/* Do not interrupt, validate the next backup */
|
||||
if (!lock_backup(current_backup))
|
||||
continue;
|
||||
/* Validate backup files */
|
||||
pgBackupValidate(current_backup);
|
||||
|
||||
@ -469,8 +478,7 @@ do_validate_instance(void)
|
||||
{
|
||||
if (backup->status == BACKUP_STATUS_OK)
|
||||
{
|
||||
backup->status = BACKUP_STATUS_ORPHAN;
|
||||
write_backup_status(backup);
|
||||
write_backup_status(backup, BACKUP_STATUS_ORPHAN);
|
||||
|
||||
elog(WARNING, "Backup %s is orphaned because his parent %s has status: %s",
|
||||
base36enc(backup->start_time),
|
||||
@ -522,7 +530,9 @@ do_validate_instance(void)
|
||||
|
||||
if (backup->status == BACKUP_STATUS_ORPHAN)
|
||||
{
|
||||
lock_backup(backup);
|
||||
/* Do not interrupt, validate the next backup */
|
||||
if (!lock_backup(backup))
|
||||
continue;
|
||||
/* Revalidate backup files */
|
||||
pgBackupValidate(backup);
|
||||
|
||||
|
@ -12,7 +12,7 @@ class LockingTest(ProbackupTest, unittest.TestCase):
|
||||
# @unittest.skip("skip")
|
||||
# @unittest.expectedFailure
|
||||
def test_locking_running_1(self):
|
||||
"""
|
||||
"""
|
||||
make node, take full backup, stop it in the middle
|
||||
run validate, expect it to successfully executed,
|
||||
concurrent RUNNING backup with pid file and active process is legal
|
||||
@ -46,7 +46,7 @@ class LockingTest(ProbackupTest, unittest.TestCase):
|
||||
self.assertEqual(
|
||||
'RUNNING', self.show_pb(backup_dir, 'node')[1]['status'])
|
||||
|
||||
self.validate_pb(backup_dir)
|
||||
self.validate_pb(backup_dir, options=['--log-level-file=VERBOSE'])
|
||||
|
||||
self.assertEqual(
|
||||
'OK', self.show_pb(backup_dir, 'node')[0]['status'])
|
||||
@ -58,7 +58,7 @@ class LockingTest(ProbackupTest, unittest.TestCase):
|
||||
self.del_test_dir(module_name, fname)
|
||||
|
||||
def test_locking_running_2(self):
|
||||
"""
|
||||
"""
|
||||
make node, take full backup, stop it in the middle,
|
||||
kill process so no cleanup is done - pid file is in place,
|
||||
run validate, expect it to not successfully executed,
|
||||
@ -112,7 +112,7 @@ class LockingTest(ProbackupTest, unittest.TestCase):
|
||||
self.del_test_dir(module_name, fname)
|
||||
|
||||
def test_locking_running_3(self):
|
||||
"""
|
||||
"""
|
||||
make node, take full backup, stop it in the middle,
|
||||
terminate process, delete pid file,
|
||||
run validate, expect it to not successfully executed,
|
||||
@ -168,4 +168,4 @@ class LockingTest(ProbackupTest, unittest.TestCase):
|
||||
'ERROR', self.show_pb(backup_dir, 'node')[1]['status'])
|
||||
|
||||
# Clean after yourself
|
||||
self.del_test_dir(module_name, fname)
|
||||
self.del_test_dir(module_name, fname)
|
||||
|
Loading…
Reference in New Issue
Block a user