1
0
mirror of https://github.com/postgrespro/pg_probackup.git synced 2025-01-07 13:40:17 +02:00

Merge branch 'pgpro-1918'

This commit is contained in:
Grigory Smolkin 2019-02-22 19:33:41 +03:00
commit 9243a8a399
11 changed files with 830 additions and 123 deletions

View File

@ -959,9 +959,6 @@ do_backup(time_t start_time)
instance_config.master_user);
}
/* Get exclusive lock of backup catalog */
catalog_lock();
/*
* Ensure that backup directory was initialized for the same PostgreSQL
* instance we opened connection to. And that target backup database PGDATA
@ -971,7 +968,6 @@ do_backup(time_t start_time)
if (!is_remote_backup)
check_system_identifiers();
/* Start backup. Update backup status. */
current.status = BACKUP_STATUS_RUNNING;
current.start_time = start_time;
@ -980,7 +976,10 @@ do_backup(time_t start_time)
/* Create backup directory and BACKUP_CONTROL_FILE */
if (pgBackupCreateDir(&current))
elog(ERROR, "cannot create backup directory");
elog(ERROR, "Cannot create backup directory");
if (!lock_backup(&current))
elog(ERROR, "Cannot lock backup %s directory",
base36enc(current.start_time));
write_backup(&current);
elog(LOG, "Backup destination is initialized");

View File

@ -21,23 +21,74 @@ static const char *backupModes[] = {"", "PAGE", "PTRACK", "DELTA", "FULL"};
static pgBackup *readBackupControlFile(const char *path);
static bool exit_hook_registered = false;
static char lock_file[MAXPGPATH];
static parray *lock_files = NULL;
/*
 * atexit() hook: remove every lock file recorded in lock_files and
 * release the list itself.
 *
 * Does nothing when no lock file was ever registered.  A lock file that
 * has already disappeared (ENOENT) is not reported; any other unlink()
 * failure is logged as a WARNING and the loop goes on to the next file.
 */
static void
unlink_lock_atexit(void)
{
	int			i;

	if (lock_files == NULL)
		return;

	for (i = 0; i < parray_num(lock_files); i++)
	{
		char	   *path = (char *) parray_get(lock_files, i);

		/*
		 * unlink() only returns 0 or -1; the failure reason is in errno,
		 * so ENOENT has to be tested against errno, not the return value.
		 */
		if (unlink(path) != 0 && errno != ENOENT)
			elog(WARNING, "%s: %s", path, strerror(errno));
	}

	/* Free the stored path strings, then the array, and reset the list. */
	parray_walk(lock_files, pfree);
	parray_free(lock_files);
	lock_files = NULL;
}
/*
* Create a lockfile.
* Read backup meta information from BACKUP_CONTROL_FILE.
* If no backup matches, return NULL.
*/
pgBackup *
read_backup(time_t timestamp)
{
	char		control_path[MAXPGPATH];
	pgBackup	probe;

	/*
	 * Build the path of the backup's control file.  pgBackupGetPath() is
	 * handed a stack pgBackup in which only start_time has been set.
	 */
	probe.start_time = timestamp;
	pgBackupGetPath(&probe, control_path, lengthof(control_path),
					BACKUP_CONTROL_FILE);

	/* Parse the control file and hand the result straight to the caller. */
	return readBackupControlFile(control_path);
}
/*
* Save the backup status into BACKUP_CONTROL_FILE.
*
* We need to reread the backup using its ID and save it changing only its
* status.
*/
void
catalog_lock(void)
write_backup_status(pgBackup *backup, BackupStatus status)
{
pgBackup *tmp;
tmp = read_backup(backup->start_time);
backup->status = status;
tmp->status = backup->status;
write_backup(tmp);
pgBackupFree(tmp);
}
/*
* Create exclusive lockfile in the backup's directory.
*/
bool
lock_backup(pgBackup *backup)
{
char lock_file[MAXPGPATH];
int fd;
char buffer[MAXPGPATH * 2 + 256];
int ntries;
@ -46,7 +97,7 @@ catalog_lock(void)
pid_t my_pid,
my_p_pid;
join_path_components(lock_file, backup_instance_path, BACKUP_CATALOG_PID);
pgBackupGetPath(backup, lock_file, lengthof(lock_file), BACKUP_CATALOG_PID);
/*
* If the PID in the lockfile is our own PID or our parent's or
@ -99,7 +150,7 @@ catalog_lock(void)
* Couldn't create the pid file. Probably it already exists.
*/
if ((errno != EEXIST && errno != EACCES) || ntries > 100)
elog(ERROR, "could not create lock file \"%s\": %s",
elog(ERROR, "Could not create lock file \"%s\": %s",
lock_file, strerror(errno));
/*
@ -111,22 +162,22 @@ catalog_lock(void)
{
if (errno == ENOENT)
continue; /* race condition; try again */
elog(ERROR, "could not open lock file \"%s\": %s",
elog(ERROR, "Could not open lock file \"%s\": %s",
lock_file, strerror(errno));
}
if ((len = read(fd, buffer, sizeof(buffer) - 1)) < 0)
elog(ERROR, "could not read lock file \"%s\": %s",
elog(ERROR, "Could not read lock file \"%s\": %s",
lock_file, strerror(errno));
close(fd);
if (len == 0)
elog(ERROR, "lock file \"%s\" is empty", lock_file);
elog(ERROR, "Lock file \"%s\" is empty", lock_file);
buffer[len] = '\0';
encoded_pid = atoi(buffer);
if (encoded_pid <= 0)
elog(ERROR, "bogus data in lock file \"%s\": \"%s\"",
elog(ERROR, "Bogus data in lock file \"%s\": \"%s\"",
lock_file, buffer);
/*
@ -140,9 +191,21 @@ catalog_lock(void)
*/
if (encoded_pid != my_pid && encoded_pid != my_p_pid)
{
if (kill(encoded_pid, 0) == 0 ||
(errno != ESRCH && errno != EPERM))
elog(ERROR, "lock file \"%s\" already exists", lock_file);
if (kill(encoded_pid, 0) == 0)
{
elog(WARNING, "Process %d is using backup %s and still is running",
encoded_pid, base36enc(backup->start_time));
return false;
}
else
{
if (errno == ESRCH)
elog(WARNING, "Process %d which used backup %s no longer exists",
encoded_pid, base36enc(backup->start_time));
else
elog(ERROR, "Failed to send signal 0 to a process %d: %s",
encoded_pid, strerror(errno));
}
}
/*
@ -151,7 +214,7 @@ catalog_lock(void)
* would-be creators.
*/
if (unlink(lock_file) < 0)
elog(ERROR, "could not remove old lock file \"%s\": %s",
elog(ERROR, "Could not remove old lock file \"%s\": %s",
lock_file, strerror(errno));
}
@ -169,7 +232,7 @@ catalog_lock(void)
unlink(lock_file);
/* if write didn't set errno, assume problem is no disk space */
errno = save_errno ? save_errno : ENOSPC;
elog(ERROR, "could not write lock file \"%s\": %s",
elog(ERROR, "Could not write lock file \"%s\": %s",
lock_file, strerror(errno));
}
if (fsync(fd) != 0)
@ -179,7 +242,7 @@ catalog_lock(void)
close(fd);
unlink(lock_file);
errno = save_errno;
elog(ERROR, "could not write lock file \"%s\": %s",
elog(ERROR, "Could not write lock file \"%s\": %s",
lock_file, strerror(errno));
}
if (close(fd) != 0)
@ -188,7 +251,7 @@ catalog_lock(void)
unlink(lock_file);
errno = save_errno;
elog(ERROR, "could not write lock file \"%s\": %s",
elog(ERROR, "Culd not write lock file \"%s\": %s",
lock_file, strerror(errno));
}
@ -200,41 +263,13 @@ catalog_lock(void)
atexit(unlink_lock_atexit);
exit_hook_registered = true;
}
}
/*
* Read backup meta information from BACKUP_CONTROL_FILE.
* If no backup matches, return NULL.
*/
pgBackup *
read_backup(time_t timestamp)
{
pgBackup tmp;
char conf_path[MAXPGPATH];
/* Use parray so that the lock files are unlinked in a loop */
if (lock_files == NULL)
lock_files = parray_new();
parray_append(lock_files, pgut_strdup(lock_file));
tmp.start_time = timestamp;
pgBackupGetPath(&tmp, conf_path, lengthof(conf_path), BACKUP_CONTROL_FILE);
return readBackupControlFile(conf_path);
}
/*
* Save the backup status into BACKUP_CONTROL_FILE.
*
* We need to reread the backup using its ID and save it changing only its
* status.
*/
void
write_backup_status(pgBackup *backup)
{
pgBackup *tmp;
tmp = read_backup(backup->start_time);
tmp->status = backup->status;
write_backup(tmp);
pgBackupFree(tmp);
return true;
}
/*
@ -381,6 +416,31 @@ err_proc:
return NULL;
}
/*
 * Lock every backup of backup_list whose index lies between from_idx and
 * to_idx (both inclusive, given in either order).
 *
 * The range is walked from the higher index down to the lower one.  An
 * empty list is a no-op.  If any single backup cannot be locked, an ERROR
 * is raised via elog().
 */
void
catalog_lock_backup_list(parray *backup_list, int from_idx, int to_idx)
{
	int			high_idx;
	int			low_idx;
	int			idx;

	if (parray_num(backup_list) == 0)
		return;

	/* Normalize the bounds so callers may pass them in any order. */
	high_idx = Max(from_idx, to_idx);
	low_idx = Min(from_idx, to_idx);

	idx = high_idx;
	while (idx >= low_idx)
	{
		pgBackup   *entry = (pgBackup *) parray_get(backup_list, idx);

		if (!lock_backup(entry))
			elog(ERROR, "Cannot lock backup %s directory",
				 base36enc(entry->start_time));

		idx--;
	}
}
/*
* Find the last completed backup on given timeline
*/

View File

@ -28,9 +28,6 @@ do_delete(time_t backup_id)
XLogRecPtr oldest_lsn = InvalidXLogRecPtr;
TimeLineID oldest_tli = 0;
/* Get exclusive lock of backup catalog */
catalog_lock();
/* Get complete list of backups */
backup_list = catalog_get_backup_list(INVALID_BACKUP_ID);
@ -76,6 +73,8 @@ do_delete(time_t backup_id)
if (parray_num(delete_list) == 0)
elog(ERROR, "no backup found, cannot delete");
catalog_lock_backup_list(delete_list, parray_num(delete_list) - 1, 0);
/* Delete backups from the end of list */
for (i = (int) parray_num(delete_list) - 1; i >= 0; i--)
{
@ -146,9 +145,6 @@ do_retention_purge(void)
}
}
/* Get exclusive lock of backup catalog */
catalog_lock();
/* Get a complete list of backups. */
backup_list = catalog_get_backup_list(INVALID_BACKUP_ID);
if (parray_num(backup_list) == 0)
@ -206,6 +202,17 @@ do_retention_purge(void)
continue;
}
/*
* If the backup still is used do not interrupt go to the next
* backup.
*/
if (!lock_backup(backup))
{
elog(WARNING, "Cannot lock backup %s directory, skip purging",
base36enc(backup->start_time));
continue;
}
/* Delete backup and update status to DELETED */
delete_backup_files(backup);
backup_deleted = true;
@ -240,7 +247,7 @@ do_retention_purge(void)
if (backup_deleted)
elog(INFO, "Purging finished");
else
elog(INFO, "Nothing to delete by retention policy");
elog(INFO, "There are no backups to delete by retention policy");
return 0;
}
@ -277,8 +284,7 @@ delete_backup_files(pgBackup *backup)
* Update STATUS to BACKUP_STATUS_DELETING in preparation for the case which
* the error occurs before deleting all backup files.
*/
backup->status = BACKUP_STATUS_DELETING;
write_backup_status(backup);
write_backup_status(backup, BACKUP_STATUS_DELETING);
/* list files to be deleted */
files = parray_new();
@ -430,6 +436,8 @@ do_delete_instance(void)
/* Delete all backups. */
backup_list = catalog_get_backup_list(INVALID_BACKUP_ID);
catalog_lock_backup_list(backup_list, 0, parray_num(backup_list) - 1);
for (i = 0; i < parray_num(backup_list); i++)
{
pgBackup *backup = (pgBackup *) parray_get(backup_list, i);

View File

@ -61,8 +61,6 @@ do_merge(time_t backup_id)
elog(INFO, "Merge started");
catalog_lock();
/* Get list of all backups sorted in order of descending start time */
backups = catalog_get_backup_list(INVALID_BACKUP_ID);
@ -125,6 +123,8 @@ do_merge(time_t backup_id)
Assert(full_backup_idx != dest_backup_idx);
catalog_lock_backup_list(backups, full_backup_idx, dest_backup_idx);
/*
* Found target and full backups, merge them and intermediate backups
*/
@ -227,11 +227,8 @@ merge_backups(pgBackup *to_backup, pgBackup *from_backup)
if (from_backup->status == BACKUP_STATUS_DELETING)
goto delete_source_backup;
to_backup->status = BACKUP_STATUS_MERGING;
write_backup_status(to_backup);
from_backup->status = BACKUP_STATUS_MERGING;
write_backup_status(from_backup);
write_backup_status(to_backup, BACKUP_STATUS_MERGING);
write_backup_status(from_backup, BACKUP_STATUS_MERGING);
create_data_directories(to_database_path, from_backup_path, false);

View File

@ -484,8 +484,7 @@ validate_backup_wal_from_start_to_stop(pgBackup *backup,
* If we don't have WAL between start_lsn and stop_lsn,
* the backup is definitely corrupted. Update its status.
*/
backup->status = BACKUP_STATUS_CORRUPT;
write_backup_status(backup);
write_backup_status(backup, BACKUP_STATUS_CORRUPT);
elog(WARNING, "There are not enough WAL records to consistenly restore "
"backup %s from START LSN: %X/%X to STOP LSN: %X/%X",

View File

@ -43,7 +43,7 @@
#define PG_GLOBAL_DIR "global"
#define BACKUP_CONTROL_FILE "backup.control"
#define BACKUP_CATALOG_CONF_FILE "pg_probackup.conf"
#define BACKUP_CATALOG_PID "pg_probackup.pid"
#define BACKUP_CATALOG_PID "backup.pid"
#define DATABASE_FILE_LIST "backup_content.control"
#define PG_BACKUP_LABEL_FILE "backup_label"
#define PG_BLACK_LIST "black_list"
@ -459,14 +459,16 @@ extern int do_validate_all(void);
/* in catalog.c */
extern pgBackup *read_backup(time_t timestamp);
extern void write_backup(pgBackup *backup);
extern void write_backup_status(pgBackup *backup);
extern void write_backup_status(pgBackup *backup, BackupStatus status);
extern bool lock_backup(pgBackup *backup);
extern const char *pgBackupGetBackupMode(pgBackup *backup);
extern parray *catalog_get_backup_list(time_t requested_backup_id);
extern void catalog_lock_backup_list(parray *backup_list, int from_idx,
int to_idx);
extern pgBackup *catalog_get_last_data_backup(parray *backup_list,
TimeLineID tli);
extern void catalog_lock(void);
extern void pgBackupWriteControl(FILE *out, pgBackup *backup);
extern void write_backup_filelist(pgBackup *backup, parray *files,
const char *root);

View File

@ -75,8 +75,6 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
elog(LOG, "%s begin.", action);
/* Get exclusive lock of backup catalog */
catalog_lock();
/* Get list of all backups sorted in order of descending start time */
backups = catalog_get_backup_list(INVALID_BACKUP_ID);
@ -126,7 +124,8 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
{
if ((current_backup->status == BACKUP_STATUS_DONE ||
current_backup->status == BACKUP_STATUS_ORPHAN ||
current_backup->status == BACKUP_STATUS_CORRUPT)
current_backup->status == BACKUP_STATUS_CORRUPT ||
current_backup->status == BACKUP_STATUS_RUNNING)
&& !rt->restore_no_validate)
elog(WARNING, "Backup %s has status: %s",
base36enc(current_backup->start_time), status2str(current_backup->status));
@ -211,8 +210,7 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
{
if (backup->status == BACKUP_STATUS_OK)
{
backup->status = BACKUP_STATUS_ORPHAN;
write_backup_status(backup);
write_backup_status(backup, BACKUP_STATUS_ORPHAN);
elog(WARNING, "Backup %s is orphaned because his parent %s is missing",
base36enc(backup->start_time), missing_backup_id);
@ -244,8 +242,7 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
{
if (backup->status == BACKUP_STATUS_OK)
{
backup->status = BACKUP_STATUS_ORPHAN;
write_backup_status(backup);
write_backup_status(backup, BACKUP_STATUS_ORPHAN);
elog(WARNING,
"Backup %s is orphaned because his parent %s has status: %s",
@ -319,9 +316,27 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
{
tmp_backup = (pgBackup *) parray_get(parent_chain, i);
/* Do not interrupt, validate the next backup */
if (!lock_backup(tmp_backup))
{
if (is_restore)
elog(ERROR, "Cannot lock backup %s directory",
base36enc(tmp_backup->start_time));
else
{
elog(WARNING, "Cannot lock backup %s directory, skip validation",
base36enc(tmp_backup->start_time));
continue;
}
}
pgBackupValidate(tmp_backup);
/* Maybe we should be more paranoid and check for !BACKUP_STATUS_OK? */
if (tmp_backup->status == BACKUP_STATUS_CORRUPT)
/* After pgBackupValidate() only following backup
* states are possible: ERROR, RUNNING, CORRUPT and OK.
* Validate WAL only for OK, because there is no point
* in WAL validation for corrupted, errored or running backups.
*/
if (tmp_backup->status != BACKUP_STATUS_OK)
{
corrupted_backup = tmp_backup;
/* we need corrupted backup index from 'backups' not parent_chain
@ -361,8 +376,7 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
{
if (backup->status == BACKUP_STATUS_OK)
{
backup->status = BACKUP_STATUS_ORPHAN;
write_backup_status(backup);
write_backup_status(backup, BACKUP_STATUS_ORPHAN);
elog(WARNING, "Backup %s is orphaned because his parent %s has status: %s",
base36enc(backup->start_time),
@ -406,6 +420,13 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
elog(ERROR, "Backup %s was created for version %s which doesn't support recovery_target_lsn",
base36enc(dest_backup->start_time), dest_backup->server_version);
/*
* Backup was locked during validation if no-validate wasn't
* specified.
*/
if (rt->restore_no_validate && !lock_backup(backup))
elog(ERROR, "Cannot lock backup directory");
restore_backup(backup);
}

View File

@ -19,6 +19,7 @@ static void *pgBackupValidateFiles(void *arg);
static void do_validate_instance(void);
static bool corrupted_backup_found = false;
static bool skipped_due_to_lock = false;
typedef struct
{
@ -59,6 +60,15 @@ pgBackupValidate(pgBackup *backup)
"Please upgrade pg_probackup binary.",
PROGRAM_VERSION, base36enc(backup->start_time), backup->program_version);
if (backup->status == BACKUP_STATUS_RUNNING)
{
elog(WARNING, "Backup %s has status %s, change it to ERROR and skip validation",
base36enc(backup->start_time), status2str(backup->status));
write_backup_status(backup, BACKUP_STATUS_ERROR);
corrupted_backup_found = true;
return;
}
/* Revalidation is attempted for DONE, ORPHAN and CORRUPT backups */
if (backup->status != BACKUP_STATUS_OK &&
backup->status != BACKUP_STATUS_DONE &&
@ -143,8 +153,8 @@ pgBackupValidate(pgBackup *backup)
parray_free(files);
/* Update backup status */
backup->status = corrupted ? BACKUP_STATUS_CORRUPT : BACKUP_STATUS_OK;
write_backup_status(backup);
write_backup_status(backup, corrupted ? BACKUP_STATUS_CORRUPT :
BACKUP_STATUS_OK);
if (corrupted)
elog(WARNING, "Backup %s data files are corrupted", base36enc(backup->start_time));
@ -278,6 +288,9 @@ pgBackupValidateFiles(void *arg)
int
do_validate_all(void)
{
corrupted_backup_found = false;
skipped_due_to_lock = false;
if (instance_name == NULL)
{
/* Show list of instances */
@ -330,12 +343,16 @@ do_validate_all(void)
do_validate_instance();
}
if (skipped_due_to_lock)
elog(WARNING, "Some backups weren't locked and they were skipped");
if (corrupted_backup_found)
{
elog(WARNING, "Some backups are not valid");
return 1;
}
else
if (!skipped_due_to_lock && !corrupted_backup_found)
elog(INFO, "All backups are valid");
return 0;
@ -355,9 +372,6 @@ do_validate_instance(void)
elog(INFO, "Validate backups of the instance '%s'", instance_name);
/* Get exclusive lock of backup catalog */
catalog_lock();
/* Get list of all backups sorted in order of descending start time */
backups = catalog_get_backup_list(INVALID_BACKUP_ID);
@ -388,8 +402,7 @@ do_validate_instance(void)
/* orphanize current_backup */
if (current_backup->status == BACKUP_STATUS_OK)
{
current_backup->status = BACKUP_STATUS_ORPHAN;
write_backup_status(current_backup);
write_backup_status(current_backup, BACKUP_STATUS_ORPHAN);
elog(WARNING, "Backup %s is orphaned because his parent %s is missing",
base36enc(current_backup->start_time),
parent_backup_id);
@ -413,8 +426,7 @@ do_validate_instance(void)
/* orphanize current_backup */
if (current_backup->status == BACKUP_STATUS_OK)
{
current_backup->status = BACKUP_STATUS_ORPHAN;
write_backup_status(current_backup);
write_backup_status(current_backup, BACKUP_STATUS_ORPHAN);
elog(WARNING, "Backup %s is orphaned because his parent %s has status: %s",
base36enc(current_backup->start_time), parent_backup_id,
status2str(tmp_backup->status));
@ -438,6 +450,14 @@ do_validate_instance(void)
else
base_full_backup = current_backup;
/* Do not interrupt, validate the next backup */
if (!lock_backup(current_backup))
{
elog(WARNING, "Cannot lock backup %s directory, skip validation",
base36enc(current_backup->start_time));
skipped_due_to_lock = true;
continue;
}
/* Valiate backup files*/
pgBackupValidate(current_backup);
@ -450,14 +470,14 @@ do_validate_instance(void)
/*
* Mark every descendant of corrupted backup as orphan
*/
if (current_backup->status == BACKUP_STATUS_CORRUPT)
if (current_backup->status != BACKUP_STATUS_OK)
{
/* This is ridiculous but legal.
* PAGE1_2b <- OK
* PAGE1_2a <- OK
* PAGE1_1b <- ORPHAN
* PAGE1_1a <- CORRUPT
* FULL1 <- OK
* PAGE_b2 <- OK
* PAGE_a2 <- OK
* PAGE_b1 <- ORPHAN
* PAGE_a1 <- CORRUPT
* FULL <- OK
*/
corrupted_backup_found = true;
@ -471,8 +491,7 @@ do_validate_instance(void)
{
if (backup->status == BACKUP_STATUS_OK)
{
backup->status = BACKUP_STATUS_ORPHAN;
write_backup_status(backup);
write_backup_status(backup, BACKUP_STATUS_ORPHAN);
elog(WARNING, "Backup %s is orphaned because his parent %s has status: %s",
base36enc(backup->start_time),
@ -497,14 +516,14 @@ do_validate_instance(void)
pgBackup *tmp_backup = NULL;
int result;
//PAGE3b ORPHAN
//PAGE2b ORPHAN -----
//PAGE6a ORPHAN |
//PAGE5a CORRUPT |
//PAGE4a missing |
//PAGE3a missing |
//PAGE2a ORPHAN |
//PAGE1a OK <- we are here <-|
//PAGE_b2 ORPHAN
//PAGE_b1 ORPHAN -----
//PAGE_a5 ORPHAN |
//PAGE_a4 CORRUPT |
//PAGE_a3 missing |
//PAGE_a2 missing |
//PAGE_a1 ORPHAN |
//PAGE OK <- we are here<-|
//FULL OK
if (is_parent(current_backup->start_time, backup, false))
@ -524,6 +543,14 @@ do_validate_instance(void)
if (backup->status == BACKUP_STATUS_ORPHAN)
{
/* Do not interrupt, validate the next backup */
if (!lock_backup(backup))
{
elog(WARNING, "Cannot lock backup %s directory, skip validation",
base36enc(backup->start_time));
skipped_due_to_lock = true;
continue;
}
/* Revaliate backup files*/
pgBackupValidate(backup);

View File

@ -806,7 +806,7 @@ class ProbackupTest(object):
def validate_pb(
self, backup_dir, instance=None,
backup_id=None, options=[], old_binary=False
backup_id=None, options=[], old_binary=False, gdb=False
):
cmd_list = [
@ -818,7 +818,7 @@ class ProbackupTest(object):
if backup_id:
cmd_list += ['-i', backup_id]
return self.run_pb(cmd_list + options, old_binary=old_binary)
return self.run_pb(cmd_list + options, old_binary=old_binary, gdb=gdb)
def delete_pb(
self, backup_dir, instance,
@ -1383,7 +1383,7 @@ class GDBobj(ProbackupTest):
if line.startswith('*stopped,reason="breakpoint-hit"'):
continue
if (
line.startswith('*stopped,reason="exited-normally"') or
line.startswith('*stopped,reason="exited"') or
line == '*stopped\n'
):
return
@ -1391,6 +1391,18 @@ class GDBobj(ProbackupTest):
'Failed to continue execution until exit.\n'
)
def continue_execution_until_error(self):
    """
    Send 'continue' to gdb and return once the output contains either
    an '^error' record or a '*stopped,reason="exited...' record.
    Raise GdbException if neither shows up in the output.
    """
    terminal_prefixes = ('^error', '*stopped,reason="exited')
    output = self._execute('continue', False)

    if any(entry.startswith(terminal_prefixes) for entry in output):
        return

    raise GdbException(
        'Failed to continue execution until error.\n')
def continue_execution_until_break(self, ignore_count=0):
if ignore_count > 0:
result = self._execute(
@ -1436,6 +1448,9 @@ class GDBobj(ProbackupTest):
print(repr(line))
if line.startswith('^done') or line.startswith('*stopped'):
break
if running and line.startswith('*running'):
if line.startswith('^error'):
break
if running and (line.startswith('*running') or line.startswith('^running')):
# if running and line.startswith('*running'):
break
return output

View File

@ -11,8 +11,68 @@ class LockingTest(ProbackupTest, unittest.TestCase):
# @unittest.skip("skip")
# @unittest.expectedFailure
def test_locking_simple(self):
""""""
def test_locking_running_validate_1(self):
    """
    make node, take full backup, start a second backup and stop it
    in the middle (gdb breakpoint in copy_file),
    run validate, expect it to be successfully executed:
    concurrent RUNNING backup with pid file and active process is legal,
    so it keeps its RUNNING status
    """
    fname = self.id().split('.')[3]
    node = self.make_simple_node(
        base_dir=os.path.join(module_name, fname, 'node'),
        initdb_params=['--data-checksums'],
        pg_options={'wal_level': 'replica'})

    backup_dir = os.path.join(self.tmp_path, module_name, fname, 'backup')
    self.init_pb(backup_dir)
    self.add_instance(backup_dir, 'node', node)
    self.set_archiving(backup_dir, 'node', node)
    node.slow_start()

    self.backup_node(backup_dir, 'node', node)

    # second backup is driven through gdb and left stopped at a breakpoint
    gdb = self.backup_node(
        backup_dir, 'node', node, gdb=True)

    gdb.set_breakpoint('copy_file')
    gdb.run_until_break()

    if gdb.continue_execution_until_break(20) != 'breakpoint-hit':
        # TestCase provides assertTrue; AssertTrue raises AttributeError
        self.assertTrue(False, 'Failed to hit breakpoint')

    self.assertEqual(
        'OK', self.show_pb(backup_dir, 'node')[0]['status'])

    self.assertEqual(
        'RUNNING', self.show_pb(backup_dir, 'node')[1]['status'])

    validate_output = self.validate_pb(
        backup_dir, options=['--log-level-console=LOG'])

    backup_id = self.show_pb(backup_dir, 'node')[1]['id']

    # validate must report that the backup is held by a live process
    self.assertIn(
        "is using backup {0} and still is running".format(backup_id),
        validate_output,
        '\n Unexpected Validate Output: {0}\n'.format(repr(validate_output)))

    # statuses must be unchanged by the validate run
    self.assertEqual(
        'OK', self.show_pb(backup_dir, 'node')[0]['status'])

    self.assertEqual(
        'RUNNING', self.show_pb(backup_dir, 'node')[1]['status'])

    # Clean after yourself
    # self.del_test_dir(module_name, fname)
def test_locking_running_validate_2(self):
"""
make node, take full backup, stop it in the middle,
kill process so no cleanup is done - pid file is in place,
run validate, expect it to not successfully executed,
RUNNING backup with pid file AND without active pid is legal,
but his status must be changed to ERROR and pid file is deleted
"""
fname = self.id().split('.')[3]
node = self.make_simple_node(
base_dir=os.path.join(module_name, fname, 'node'),
@ -37,6 +97,34 @@ class LockingTest(ProbackupTest, unittest.TestCase):
self.AssertTrue(False, 'Failed to hit breakpoint')
gdb._execute('signal SIGKILL')
gdb.continue_execution_until_error()
self.assertEqual(
'OK', self.show_pb(backup_dir, 'node')[0]['status'])
self.assertEqual(
'RUNNING', self.show_pb(backup_dir, 'node')[1]['status'])
backup_id = self.show_pb(backup_dir, 'node')[1]['id']
try:
self.validate_pb(backup_dir)
self.assertEqual(
1, 0,
"Expecting Error because RUNNING backup is no longer active.\n "
"Output: {0} \n CMD: {1}".format(
repr(self.output), self.cmd))
except ProbackupException as e:
self.assertTrue(
"which used backup {0} no longer exists".format(
backup_id) in e.message and
"Backup {0} has status RUNNING, change it "
"to ERROR and skip validation".format(
backup_id) in e.message and
"WARNING: Some backups are not valid" in
e.message,
'\n Unexpected Error Message: {0}\n CMD: {1}'.format(
repr(e.message), self.cmd))
self.assertEqual(
'OK', self.show_pb(backup_dir, 'node')[0]['status'])
@ -45,4 +133,321 @@ class LockingTest(ProbackupTest, unittest.TestCase):
'ERROR', self.show_pb(backup_dir, 'node')[1]['status'])
# Clean after yourself
self.del_test_dir(module_name, fname)
self.del_test_dir(module_name, fname)
def test_locking_running_validate_2_specific_id(self):
    """
    make node, take full backup, start a second backup and stop it
    in the middle, kill the process so no cleanup is done - pid file
    is in place,
    run validate on this specific backup,
    expect it to fail:
    RUNNING backup with pid file AND without active pid is legal,
    but its status must be changed to ERROR.
    Repeated validation (by id and of the whole catalog) must keep
    failing on the ERROR status
    """
    fname = self.id().split('.')[3]
    node = self.make_simple_node(
        base_dir=os.path.join(module_name, fname, 'node'),
        initdb_params=['--data-checksums'],
        pg_options={'wal_level': 'replica'})

    backup_dir = os.path.join(self.tmp_path, module_name, fname, 'backup')
    self.init_pb(backup_dir)
    self.add_instance(backup_dir, 'node', node)
    self.set_archiving(backup_dir, 'node', node)
    node.slow_start()

    self.backup_node(backup_dir, 'node', node)

    gdb = self.backup_node(
        backup_dir, 'node', node, gdb=True)

    gdb.set_breakpoint('copy_file')
    gdb.run_until_break()

    if gdb.continue_execution_until_break(20) != 'breakpoint-hit':
        # TestCase provides assertTrue; AssertTrue raises AttributeError
        self.assertTrue(False, 'Failed to hit breakpoint')

    # kill the backup process so that it cannot clean up after itself
    gdb._execute('signal SIGKILL')
    gdb.continue_execution_until_error()

    self.assertEqual(
        'OK', self.show_pb(backup_dir, 'node')[0]['status'])

    self.assertEqual(
        'RUNNING', self.show_pb(backup_dir, 'node')[1]['status'])

    backup_id = self.show_pb(backup_dir, 'node')[1]['id']

    # first validate by id: RUNNING -> ERROR
    try:
        self.validate_pb(backup_dir, 'node', backup_id)
        self.assertEqual(
            1, 0,
            "Expecting Error because RUNNING backup is no longer active.\n "
            "Output: {0} \n CMD: {1}".format(
                repr(self.output), self.cmd))
    except ProbackupException as e:
        self.assertTrue(
            "which used backup {0} no longer exists".format(
                backup_id) in e.message and
            "Backup {0} has status RUNNING, change it "
            "to ERROR and skip validation".format(
                backup_id) in e.message and
            "ERROR: Backup {0} has status: ERROR".format(backup_id) in
            e.message,
            '\n Unexpected Error Message: {0}\n CMD: {1}'.format(
                repr(e.message), self.cmd))

    self.assertEqual(
        'OK', self.show_pb(backup_dir, 'node')[0]['status'])

    self.assertEqual(
        'ERROR', self.show_pb(backup_dir, 'node')[1]['status'])

    # second validate by id: backup is already in ERROR state
    try:
        self.validate_pb(backup_dir, 'node', backup_id)
        self.assertEqual(
            1, 0,
            "Expecting Error because backup has status ERROR.\n "
            "Output: {0} \n CMD: {1}".format(
                repr(self.output), self.cmd))
    except ProbackupException as e:
        self.assertIn(
            "ERROR: Backup {0} has status: ERROR".format(backup_id),
            e.message,
            '\n Unexpected Error Message: {0}\n CMD: {1}'.format(
                repr(e.message), self.cmd))

    # validate of the whole catalog: ERROR backup is skipped with a warning
    try:
        self.validate_pb(backup_dir)
        self.assertEqual(
            1, 0,
            "Expecting Error because backup has status ERROR.\n "
            "Output: {0} \n CMD: {1}".format(
                repr(self.output), self.cmd))
    except ProbackupException as e:
        self.assertTrue(
            "WARNING: Backup {0} has status ERROR. Skip validation".format(
                backup_id) in e.message and
            "WARNING: Some backups are not valid" in e.message,
            '\n Unexpected Error Message: {0}\n CMD: {1}'.format(
                repr(e.message), self.cmd))

    # Clean after yourself
    self.del_test_dir(module_name, fname)
def test_locking_running_3(self):
    """
    make node, take full backup, start a second backup and stop it
    in the middle, kill the process, delete pid file,
    run validate, expect it to fail:
    RUNNING backup without pid file AND without active pid is legal,
    its status must be changed to ERROR
    """
    fname = self.id().split('.')[3]
    node = self.make_simple_node(
        base_dir=os.path.join(module_name, fname, 'node'),
        initdb_params=['--data-checksums'],
        pg_options={'wal_level': 'replica'})

    backup_dir = os.path.join(self.tmp_path, module_name, fname, 'backup')
    self.init_pb(backup_dir)
    self.add_instance(backup_dir, 'node', node)
    self.set_archiving(backup_dir, 'node', node)
    node.slow_start()

    self.backup_node(backup_dir, 'node', node)

    gdb = self.backup_node(
        backup_dir, 'node', node, gdb=True)

    gdb.set_breakpoint('copy_file')
    gdb.run_until_break()

    if gdb.continue_execution_until_break(20) != 'breakpoint-hit':
        # TestCase provides assertTrue; AssertTrue raises AttributeError
        self.assertTrue(False, 'Failed to hit breakpoint')

    # kill the backup process so that it cannot clean up after itself
    gdb._execute('signal SIGKILL')
    gdb.continue_execution_until_error()

    self.assertEqual(
        'OK', self.show_pb(backup_dir, 'node')[0]['status'])

    self.assertEqual(
        'RUNNING', self.show_pb(backup_dir, 'node')[1]['status'])

    backup_id = self.show_pb(backup_dir, 'node')[1]['id']

    # remove the per-backup lock file left behind by the killed process
    os.remove(
        os.path.join(backup_dir, 'backups', 'node', backup_id, 'backup.pid'))

    try:
        self.validate_pb(backup_dir)
        self.assertEqual(
            1, 0,
            "Expecting Error because RUNNING backup is no longer active.\n "
            "Output: {0} \n CMD: {1}".format(
                repr(self.output), self.cmd))
    except ProbackupException as e:
        self.assertTrue(
            "Backup {0} has status RUNNING, change it "
            "to ERROR and skip validation".format(
                backup_id) in e.message and
            "WARNING: Some backups are not valid" in
            e.message,
            '\n Unexpected Error Message: {0}\n CMD: {1}'.format(
                repr(e.message), self.cmd))

    self.assertEqual(
        'OK', self.show_pb(backup_dir, 'node')[0]['status'])

    self.assertEqual(
        'ERROR', self.show_pb(backup_dir, 'node')[1]['status'])

    # Clean after yourself
    self.del_test_dir(module_name, fname)
def test_locking_restore_locked(self):
"""
make node, take full backup, take two page backups,
launch validate on PAGE1 under gdb and stop it at the
pgBackupValidate breakpoint,
launch restore (no backup id is passed to restore_node).
Expect restore to fail with
"Cannot lock backup <FULL id> directory":
the FULL backup cannot be locked while the stopped validate
is still alive
"""
fname = self.id().split('.')[3]
node = self.make_simple_node(
base_dir=os.path.join(module_name, fname, 'node'),
initdb_params=['--data-checksums'],
pg_options={'wal_level': 'replica'})
backup_dir = os.path.join(self.tmp_path, module_name, fname, 'backup')
self.init_pb(backup_dir)
self.add_instance(backup_dir, 'node', node)
self.set_archiving(backup_dir, 'node', node)
node.slow_start()
# FULL
full_id = self.backup_node(backup_dir, 'node', node)
# PAGE1
backup_id = self.backup_node(backup_dir, 'node', node, backup_type='page')
# PAGE2
self.backup_node(backup_dir, 'node', node, backup_type='page')
# validate of PAGE1 runs under gdb and is left stopped at the breakpoint
gdb = self.validate_pb(
backup_dir, 'node', backup_id=backup_id, gdb=True)
gdb.set_breakpoint('pgBackupValidate')
gdb.run_until_break()
node.cleanup()
# restore is expected to raise ProbackupException; reaching the
# assertEqual(1, 0, ...) line means it unexpectedly succeeded
try:
self.restore_node(backup_dir, 'node', node)
self.assertEqual(
1, 0,
"Expecting Error because restore without whole chain validation "
"is prohibited unless --no-validate provided.\n "
"Output: {0} \n CMD: {1}".format(
repr(self.output), self.cmd))
except ProbackupException as e:
self.assertTrue(
"ERROR: Cannot lock backup {0} directory\n".format(full_id) in e.message,
'\n Unexpected Error Message: {0}\n CMD: {1}'.format(
repr(e.message), self.cmd))
# Clean after yourself
self.del_test_dir(module_name, fname)
def test_locking_restore_locked_without_validation(self):
    """
    Restore with --no-validate must fail while the parent backup is
    locked by a running validate.

    Scenario: take FULL and PAGE backups, start validation of FULL
    under gdb and stop it at the pgBackupValidate breakpoint, then
    launch restore with --no-validate.
    Expected: restore reports that the PAGE backup is used without
    validation, that another process is still using the FULL backup,
    and fails with "ERROR: Cannot lock backup directory".
    """
    fname = self.id().split('.')[3]
    node_dir = os.path.join(module_name, fname, 'node')
    node = self.make_simple_node(
        base_dir=node_dir,
        initdb_params=['--data-checksums'],
        pg_options={'wal_level': 'replica'})

    backup_dir = os.path.join(self.tmp_path, module_name, fname, 'backup')
    self.init_pb(backup_dir)
    self.add_instance(backup_dir, 'node', node)
    self.set_archiving(backup_dir, 'node', node)
    node.slow_start()

    # FULL: the backup whose validation is going to be suspended
    backup_id = self.backup_node(backup_dir, 'node', node)

    # PAGE1: the backup the error message is expected to name as used
    # without validation
    restore_id = self.backup_node(
        backup_dir, 'node', node, backup_type='page')

    # Start validation of FULL and leave it stopped at the breakpoint
    gdb = self.validate_pb(
        backup_dir, 'node', backup_id=backup_id, gdb=True)
    gdb.set_breakpoint('pgBackupValidate')
    gdb.run_until_break()

    node.cleanup()

    try:
        self.restore_node(
            backup_dir, 'node', node, options=['--no-validate'])
        # Reaching this line means restore unexpectedly succeeded
        self.assertEqual(
            1, 0,
            "Expecting Error because restore without whole chain validation "
            "is prohibited unless --no-validate provided.\n "
            "Output: {0} \n CMD: {1}".format(
                repr(self.output), self.cmd))
    except ProbackupException as e:
        expected_fragments = (
            "Backup {0} is used without validation".format(restore_id),
            'is using backup {0} and still is running'.format(backup_id),
            'ERROR: Cannot lock backup directory',
        )
        self.assertTrue(
            all(fragment in e.message for fragment in expected_fragments),
            '\n Unexpected Error Message: {0}\n CMD: {1}'.format(
                repr(e.message), self.cmd))

    # Clean after yourself
    self.del_test_dir(module_name, fname)
def test_locking_concurrent_vaidate_and_backup(self):
    """
    A new backup must be possible while another backup is being validated.

    Scenario: take FULL and PAGE backups, start validation of the PAGE
    backup under gdb and stop it at the pgBackupValidate breakpoint,
    then take one more PAGE backup.
    Expected: the second PAGE backup completes successfully.
    """
    fname = self.id().split('.')[3]
    node_dir = os.path.join(module_name, fname, 'node')
    node = self.make_simple_node(
        base_dir=node_dir,
        initdb_params=['--data-checksums'],
        pg_options={'wal_level': 'replica'})

    backup_dir = os.path.join(self.tmp_path, module_name, fname, 'backup')
    self.init_pb(backup_dir)
    self.add_instance(backup_dir, 'node', node)
    self.set_archiving(backup_dir, 'node', node)
    node.slow_start()

    # FULL
    self.backup_node(backup_dir, 'node', node)

    # PAGE1: the backup whose validation is going to be suspended
    validated_id = self.backup_node(
        backup_dir, 'node', node, backup_type='page')

    # Start validation of PAGE1 and leave it stopped at the breakpoint
    gdb = self.validate_pb(
        backup_dir, 'node', backup_id=validated_id, gdb=True)
    gdb.set_breakpoint('pgBackupValidate')
    gdb.run_until_break()

    # This PAGE backup is expected to be successful
    self.backup_node(backup_dir, 'node', node, backup_type='page')

    # Clean after yourself
    self.del_test_dir(module_name, fname)

View File

@ -411,6 +411,180 @@ class ValidateTest(ProbackupTest, unittest.TestCase):
# Clean after yourself
self.del_test_dir(module_name, fname)
# @unittest.skip("skip")
def test_validate_specific_error_intermediate_backups(self):
    """
    make archive node, take FULL, PAGE1, PAGE2 backups,
    change backup status of FULL to ERROR,
    run validate on PAGE1
    purpose of this test is to be sure that not only
    CORRUPT backup descendants can be orphanized

    Expected: validate fails, FULL keeps status ERROR,
    PAGE1 and PAGE2 become ORPHAN.
    """
    fname = self.id().split('.')[3]
    node = self.make_simple_node(
        base_dir=os.path.join(module_name, fname, 'node'),
        initdb_params=['--data-checksums'],
        pg_options={'wal_level': 'replica'}
    )
    backup_dir = os.path.join(self.tmp_path, module_name, fname, 'backup')
    self.init_pb(backup_dir)
    self.add_instance(backup_dir, 'node', node)
    self.set_archiving(backup_dir, 'node', node)
    node.slow_start()

    # FULL
    backup_id_1 = self.backup_node(backup_dir, 'node', node)

    # PAGE1
    backup_id_2 = self.backup_node(
        backup_dir, 'node', node, backup_type='page')

    # PAGE2
    backup_id_3 = self.backup_node(
        backup_dir, 'node', node, backup_type='page')

    # Change FULL backup status to ERROR by rewriting its control file
    control_path = os.path.join(
        backup_dir, 'backups', 'node', backup_id_1, 'backup.control')

    with open(control_path, 'r') as f:
        actual_control = f.read()

    # Every line is re-terminated with '\n', exactly as before
    new_control_file = ''.join(
        line.replace('status = OK', 'status = ERROR') + '\n'
        for line in actual_control.splitlines())

    # The with-block flushes and closes the file on exit
    with open(control_path, 'wt') as f:
        f.write(new_control_file)

    # Validate PAGE1
    try:
        self.validate_pb(
            backup_dir, 'node', backup_id=backup_id_2)
        # Reaching this line means validate unexpectedly succeeded
        self.assertEqual(
            1, 0,
            "Expecting Error because backup has status ERROR.\n "
            "Output: {0} \n CMD: {1}".format(
                repr(self.output), self.cmd))
    except ProbackupException as e:
        # Each fragment must be checked with `in e.message`; a bare
        # non-empty string inside an `and` chain is always truthy and
        # would silently disable the check.
        expected_fragments = (
            'WARNING: Backup {0} is orphaned because '
            'his parent {1} has status: ERROR'.format(
                backup_id_2, backup_id_1),
            'INFO: Validating parents for backup {0}'.format(
                backup_id_2),
            'WARNING: Backup {0} has status ERROR. Skip validation.'.format(
                backup_id_1),
            'ERROR: Backup {0} is orphan.'.format(backup_id_2),
        )
        self.assertTrue(
            all(fragment in e.message for fragment in expected_fragments),
            '\n Unexpected Error Message: {0}\n '
            'CMD: {1}'.format(
                repr(e.message), self.cmd))

    # FULL stays ERROR, both descendants must be orphaned
    self.assertEqual(
        'ERROR',
        self.show_pb(backup_dir, 'node', backup_id_1)['status'],
        'Backup STATUS should be "ERROR"')
    self.assertEqual(
        'ORPHAN',
        self.show_pb(backup_dir, 'node', backup_id_2)['status'],
        'Backup STATUS should be "ORPHAN"')
    self.assertEqual(
        'ORPHAN',
        self.show_pb(backup_dir, 'node', backup_id_3)['status'],
        'Backup STATUS should be "ORPHAN"')

    # Clean after yourself
    self.del_test_dir(module_name, fname)
# @unittest.skip("skip")
def test_validate_error_intermediate_backups(self):
    """
    make archive node, take FULL, PAGE1, PAGE2 backups,
    change backup status of FULL to ERROR,
    run validate without specifying a backup id
    purpose of this test is to be sure that not only
    CORRUPT backup descendants can be orphanized

    Expected: validate fails, FULL keeps status ERROR,
    PAGE1 and PAGE2 become ORPHAN.
    """
    fname = self.id().split('.')[3]
    node = self.make_simple_node(
        base_dir=os.path.join(module_name, fname, 'node'),
        initdb_params=['--data-checksums'],
        pg_options={'wal_level': 'replica'}
    )
    backup_dir = os.path.join(self.tmp_path, module_name, fname, 'backup')
    self.init_pb(backup_dir)
    self.add_instance(backup_dir, 'node', node)
    self.set_archiving(backup_dir, 'node', node)
    node.slow_start()

    # FULL
    backup_id_1 = self.backup_node(backup_dir, 'node', node)

    # PAGE1
    backup_id_2 = self.backup_node(
        backup_dir, 'node', node, backup_type='page')

    # PAGE2
    backup_id_3 = self.backup_node(
        backup_dir, 'node', node, backup_type='page')

    # Change FULL backup status to ERROR by rewriting its control file
    control_path = os.path.join(
        backup_dir, 'backups', 'node', backup_id_1, 'backup.control')

    with open(control_path, 'r') as f:
        actual_control = f.read()

    # Every line is re-terminated with '\n', exactly as before
    new_control_file = ''.join(
        line.replace('status = OK', 'status = ERROR') + '\n'
        for line in actual_control.splitlines())

    # The with-block flushes and closes the file on exit
    with open(control_path, 'wt') as f:
        f.write(new_control_file)

    # Validate without a backup id
    try:
        self.validate_pb(backup_dir)
        # Reaching this line means validate unexpectedly succeeded
        self.assertEqual(
            1, 0,
            "Expecting Error because backup has status ERROR.\n "
            "Output: {0} \n CMD: {1}".format(
                repr(self.output), self.cmd))
    except ProbackupException as e:
        expected_fragments = (
            "WARNING: Backup {0} is orphaned because "
            "his parent {1} has status: ERROR".format(
                backup_id_2, backup_id_1),
            'WARNING: Backup {0} has status ERROR. Skip validation'.format(
                backup_id_1),
            "WARNING: Some backups are not valid",
        )
        self.assertTrue(
            all(fragment in e.message for fragment in expected_fragments),
            '\n Unexpected Error Message: {0}\n CMD: {1}'.format(
                repr(e.message), self.cmd))

    # FULL stays ERROR, both descendants must be orphaned
    self.assertEqual(
        'ERROR',
        self.show_pb(backup_dir, 'node', backup_id_1)['status'],
        'Backup STATUS should be "ERROR"')
    self.assertEqual(
        'ORPHAN',
        self.show_pb(backup_dir, 'node', backup_id_2)['status'],
        'Backup STATUS should be "ORPHAN"')
    self.assertEqual(
        'ORPHAN',
        self.show_pb(backup_dir, 'node', backup_id_3)['status'],
        'Backup STATUS should be "ORPHAN"')

    # Clean after yourself
    self.del_test_dir(module_name, fname)
# @unittest.skip("skip")
def test_validate_corrupted_intermediate_backups_1(self):
"""