mirror of
https://github.com/postgrespro/pg_probackup.git
synced 2024-11-28 09:33:54 +02:00
Merge branch 'pgpro-1918'
This commit is contained in:
commit
9243a8a399
@ -959,9 +959,6 @@ do_backup(time_t start_time)
|
||||
instance_config.master_user);
|
||||
}
|
||||
|
||||
/* Get exclusive lock of backup catalog */
|
||||
catalog_lock();
|
||||
|
||||
/*
|
||||
* Ensure that backup directory was initialized for the same PostgreSQL
|
||||
* instance we opened connection to. And that target backup database PGDATA
|
||||
@ -971,7 +968,6 @@ do_backup(time_t start_time)
|
||||
if (!is_remote_backup)
|
||||
check_system_identifiers();
|
||||
|
||||
|
||||
/* Start backup. Update backup status. */
|
||||
current.status = BACKUP_STATUS_RUNNING;
|
||||
current.start_time = start_time;
|
||||
@ -980,7 +976,10 @@ do_backup(time_t start_time)
|
||||
|
||||
/* Create backup directory and BACKUP_CONTROL_FILE */
|
||||
if (pgBackupCreateDir(¤t))
|
||||
elog(ERROR, "cannot create backup directory");
|
||||
elog(ERROR, "Cannot create backup directory");
|
||||
if (!lock_backup(¤t))
|
||||
elog(ERROR, "Cannot lock backup %s directory",
|
||||
base36enc(current.start_time));
|
||||
write_backup(¤t);
|
||||
|
||||
elog(LOG, "Backup destination is initialized");
|
||||
|
166
src/catalog.c
166
src/catalog.c
@ -21,23 +21,74 @@ static const char *backupModes[] = {"", "PAGE", "PTRACK", "DELTA", "FULL"};
|
||||
static pgBackup *readBackupControlFile(const char *path);
|
||||
|
||||
static bool exit_hook_registered = false;
|
||||
static char lock_file[MAXPGPATH];
|
||||
static parray *lock_files = NULL;
|
||||
|
||||
static void
|
||||
unlink_lock_atexit(void)
|
||||
{
|
||||
int res;
|
||||
res = unlink(lock_file);
|
||||
if (res != 0 && res != ENOENT)
|
||||
elog(WARNING, "%s: %s", lock_file, strerror(errno));
|
||||
int i;
|
||||
|
||||
if (lock_files == NULL)
|
||||
return;
|
||||
|
||||
for (i = 0; i < parray_num(lock_files); i++)
|
||||
{
|
||||
char *lock_file = (char *) parray_get(lock_files, i);
|
||||
int res;
|
||||
|
||||
res = unlink(lock_file);
|
||||
if (res != 0 && res != ENOENT)
|
||||
elog(WARNING, "%s: %s", lock_file, strerror(errno));
|
||||
}
|
||||
|
||||
parray_walk(lock_files, pfree);
|
||||
parray_free(lock_files);
|
||||
lock_files = NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Create a lockfile.
|
||||
* Read backup meta information from BACKUP_CONTROL_FILE.
|
||||
* If no backup matches, return NULL.
|
||||
*/
|
||||
pgBackup *
|
||||
read_backup(time_t timestamp)
|
||||
{
|
||||
pgBackup tmp;
|
||||
char conf_path[MAXPGPATH];
|
||||
|
||||
tmp.start_time = timestamp;
|
||||
pgBackupGetPath(&tmp, conf_path, lengthof(conf_path), BACKUP_CONTROL_FILE);
|
||||
|
||||
return readBackupControlFile(conf_path);
|
||||
}
|
||||
|
||||
/*
|
||||
* Save the backup status into BACKUP_CONTROL_FILE.
|
||||
*
|
||||
* We need to reread the backup using its ID and save it changing only its
|
||||
* status.
|
||||
*/
|
||||
void
|
||||
catalog_lock(void)
|
||||
write_backup_status(pgBackup *backup, BackupStatus status)
|
||||
{
|
||||
pgBackup *tmp;
|
||||
|
||||
tmp = read_backup(backup->start_time);
|
||||
|
||||
backup->status = status;
|
||||
tmp->status = backup->status;
|
||||
write_backup(tmp);
|
||||
|
||||
pgBackupFree(tmp);
|
||||
}
|
||||
|
||||
/*
|
||||
* Create exclusive lockfile in the backup's directory.
|
||||
*/
|
||||
bool
|
||||
lock_backup(pgBackup *backup)
|
||||
{
|
||||
char lock_file[MAXPGPATH];
|
||||
int fd;
|
||||
char buffer[MAXPGPATH * 2 + 256];
|
||||
int ntries;
|
||||
@ -46,7 +97,7 @@ catalog_lock(void)
|
||||
pid_t my_pid,
|
||||
my_p_pid;
|
||||
|
||||
join_path_components(lock_file, backup_instance_path, BACKUP_CATALOG_PID);
|
||||
pgBackupGetPath(backup, lock_file, lengthof(lock_file), BACKUP_CATALOG_PID);
|
||||
|
||||
/*
|
||||
* If the PID in the lockfile is our own PID or our parent's or
|
||||
@ -99,7 +150,7 @@ catalog_lock(void)
|
||||
* Couldn't create the pid file. Probably it already exists.
|
||||
*/
|
||||
if ((errno != EEXIST && errno != EACCES) || ntries > 100)
|
||||
elog(ERROR, "could not create lock file \"%s\": %s",
|
||||
elog(ERROR, "Could not create lock file \"%s\": %s",
|
||||
lock_file, strerror(errno));
|
||||
|
||||
/*
|
||||
@ -111,22 +162,22 @@ catalog_lock(void)
|
||||
{
|
||||
if (errno == ENOENT)
|
||||
continue; /* race condition; try again */
|
||||
elog(ERROR, "could not open lock file \"%s\": %s",
|
||||
elog(ERROR, "Could not open lock file \"%s\": %s",
|
||||
lock_file, strerror(errno));
|
||||
}
|
||||
if ((len = read(fd, buffer, sizeof(buffer) - 1)) < 0)
|
||||
elog(ERROR, "could not read lock file \"%s\": %s",
|
||||
elog(ERROR, "Could not read lock file \"%s\": %s",
|
||||
lock_file, strerror(errno));
|
||||
close(fd);
|
||||
|
||||
if (len == 0)
|
||||
elog(ERROR, "lock file \"%s\" is empty", lock_file);
|
||||
elog(ERROR, "Lock file \"%s\" is empty", lock_file);
|
||||
|
||||
buffer[len] = '\0';
|
||||
encoded_pid = atoi(buffer);
|
||||
|
||||
if (encoded_pid <= 0)
|
||||
elog(ERROR, "bogus data in lock file \"%s\": \"%s\"",
|
||||
elog(ERROR, "Bogus data in lock file \"%s\": \"%s\"",
|
||||
lock_file, buffer);
|
||||
|
||||
/*
|
||||
@ -140,9 +191,21 @@ catalog_lock(void)
|
||||
*/
|
||||
if (encoded_pid != my_pid && encoded_pid != my_p_pid)
|
||||
{
|
||||
if (kill(encoded_pid, 0) == 0 ||
|
||||
(errno != ESRCH && errno != EPERM))
|
||||
elog(ERROR, "lock file \"%s\" already exists", lock_file);
|
||||
if (kill(encoded_pid, 0) == 0)
|
||||
{
|
||||
elog(WARNING, "Process %d is using backup %s and still is running",
|
||||
encoded_pid, base36enc(backup->start_time));
|
||||
return false;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (errno == ESRCH)
|
||||
elog(WARNING, "Process %d which used backup %s no longer exists",
|
||||
encoded_pid, base36enc(backup->start_time));
|
||||
else
|
||||
elog(ERROR, "Failed to send signal 0 to a process %d: %s",
|
||||
encoded_pid, strerror(errno));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
@ -151,7 +214,7 @@ catalog_lock(void)
|
||||
* would-be creators.
|
||||
*/
|
||||
if (unlink(lock_file) < 0)
|
||||
elog(ERROR, "could not remove old lock file \"%s\": %s",
|
||||
elog(ERROR, "Could not remove old lock file \"%s\": %s",
|
||||
lock_file, strerror(errno));
|
||||
}
|
||||
|
||||
@ -169,7 +232,7 @@ catalog_lock(void)
|
||||
unlink(lock_file);
|
||||
/* if write didn't set errno, assume problem is no disk space */
|
||||
errno = save_errno ? save_errno : ENOSPC;
|
||||
elog(ERROR, "could not write lock file \"%s\": %s",
|
||||
elog(ERROR, "Could not write lock file \"%s\": %s",
|
||||
lock_file, strerror(errno));
|
||||
}
|
||||
if (fsync(fd) != 0)
|
||||
@ -179,7 +242,7 @@ catalog_lock(void)
|
||||
close(fd);
|
||||
unlink(lock_file);
|
||||
errno = save_errno;
|
||||
elog(ERROR, "could not write lock file \"%s\": %s",
|
||||
elog(ERROR, "Could not write lock file \"%s\": %s",
|
||||
lock_file, strerror(errno));
|
||||
}
|
||||
if (close(fd) != 0)
|
||||
@ -188,7 +251,7 @@ catalog_lock(void)
|
||||
|
||||
unlink(lock_file);
|
||||
errno = save_errno;
|
||||
elog(ERROR, "could not write lock file \"%s\": %s",
|
||||
elog(ERROR, "Could not write lock file \"%s\": %s",
|
||||
lock_file, strerror(errno));
|
||||
}
|
||||
|
||||
@ -200,41 +263,13 @@ catalog_lock(void)
|
||||
atexit(unlink_lock_atexit);
|
||||
exit_hook_registered = true;
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Read backup meta information from BACKUP_CONTROL_FILE.
|
||||
* If no backup matches, return NULL.
|
||||
*/
|
||||
pgBackup *
|
||||
read_backup(time_t timestamp)
|
||||
{
|
||||
pgBackup tmp;
|
||||
char conf_path[MAXPGPATH];
|
||||
/* Use parray so that the lock files are unlinked in a loop */
|
||||
if (lock_files == NULL)
|
||||
lock_files = parray_new();
|
||||
parray_append(lock_files, pgut_strdup(lock_file));
|
||||
|
||||
tmp.start_time = timestamp;
|
||||
pgBackupGetPath(&tmp, conf_path, lengthof(conf_path), BACKUP_CONTROL_FILE);
|
||||
|
||||
return readBackupControlFile(conf_path);
|
||||
}
|
||||
|
||||
/*
|
||||
* Save the backup status into BACKUP_CONTROL_FILE.
|
||||
*
|
||||
* We need to reread the backup using its ID and save it changing only its
|
||||
* status.
|
||||
*/
|
||||
void
|
||||
write_backup_status(pgBackup *backup)
|
||||
{
|
||||
pgBackup *tmp;
|
||||
|
||||
tmp = read_backup(backup->start_time);
|
||||
|
||||
tmp->status = backup->status;
|
||||
write_backup(tmp);
|
||||
|
||||
pgBackupFree(tmp);
|
||||
return true;
|
||||
}
|
||||
|
||||
/*
|
||||
@ -381,6 +416,31 @@ err_proc:
|
||||
return NULL;
|
||||
}
|
||||
|
||||
/*
|
||||
* Lock list of backups. Function goes in backward direction.
|
||||
*/
|
||||
void
|
||||
catalog_lock_backup_list(parray *backup_list, int from_idx, int to_idx)
|
||||
{
|
||||
int start_idx,
|
||||
end_idx;
|
||||
int i;
|
||||
|
||||
if (parray_num(backup_list) == 0)
|
||||
return;
|
||||
|
||||
start_idx = Max(from_idx, to_idx);
|
||||
end_idx = Min(from_idx, to_idx);
|
||||
|
||||
for (i = start_idx; i >= end_idx; i--)
|
||||
{
|
||||
pgBackup *backup = (pgBackup *) parray_get(backup_list, i);
|
||||
if (!lock_backup(backup))
|
||||
elog(ERROR, "Cannot lock backup %s directory",
|
||||
base36enc(backup->start_time));
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Find the last completed backup on given timeline
|
||||
*/
|
||||
|
26
src/delete.c
26
src/delete.c
@ -28,9 +28,6 @@ do_delete(time_t backup_id)
|
||||
XLogRecPtr oldest_lsn = InvalidXLogRecPtr;
|
||||
TimeLineID oldest_tli = 0;
|
||||
|
||||
/* Get exclusive lock of backup catalog */
|
||||
catalog_lock();
|
||||
|
||||
/* Get complete list of backups */
|
||||
backup_list = catalog_get_backup_list(INVALID_BACKUP_ID);
|
||||
|
||||
@ -76,6 +73,8 @@ do_delete(time_t backup_id)
|
||||
if (parray_num(delete_list) == 0)
|
||||
elog(ERROR, "no backup found, cannot delete");
|
||||
|
||||
catalog_lock_backup_list(delete_list, parray_num(delete_list) - 1, 0);
|
||||
|
||||
/* Delete backups from the end of list */
|
||||
for (i = (int) parray_num(delete_list) - 1; i >= 0; i--)
|
||||
{
|
||||
@ -146,9 +145,6 @@ do_retention_purge(void)
|
||||
}
|
||||
}
|
||||
|
||||
/* Get exclusive lock of backup catalog */
|
||||
catalog_lock();
|
||||
|
||||
/* Get a complete list of backups. */
|
||||
backup_list = catalog_get_backup_list(INVALID_BACKUP_ID);
|
||||
if (parray_num(backup_list) == 0)
|
||||
@ -206,6 +202,17 @@ do_retention_purge(void)
|
||||
continue;
|
||||
}
|
||||
|
||||
/*
|
||||
* If the backup is still in use, do not interrupt; go on to the next
|
||||
* backup.
|
||||
*/
|
||||
if (!lock_backup(backup))
|
||||
{
|
||||
elog(WARNING, "Cannot lock backup %s directory, skip purging",
|
||||
base36enc(backup->start_time));
|
||||
continue;
|
||||
}
|
||||
|
||||
/* Delete backup and update status to DELETED */
|
||||
delete_backup_files(backup);
|
||||
backup_deleted = true;
|
||||
@ -240,7 +247,7 @@ do_retention_purge(void)
|
||||
if (backup_deleted)
|
||||
elog(INFO, "Purging finished");
|
||||
else
|
||||
elog(INFO, "Nothing to delete by retention policy");
|
||||
elog(INFO, "There are no backups to delete by retention policy");
|
||||
|
||||
return 0;
|
||||
}
|
||||
@ -277,8 +284,7 @@ delete_backup_files(pgBackup *backup)
|
||||
* Update STATUS to BACKUP_STATUS_DELETING in preparation for the case which
|
||||
* the error occurs before deleting all backup files.
|
||||
*/
|
||||
backup->status = BACKUP_STATUS_DELETING;
|
||||
write_backup_status(backup);
|
||||
write_backup_status(backup, BACKUP_STATUS_DELETING);
|
||||
|
||||
/* list files to be deleted */
|
||||
files = parray_new();
|
||||
@ -430,6 +436,8 @@ do_delete_instance(void)
|
||||
/* Delete all backups. */
|
||||
backup_list = catalog_get_backup_list(INVALID_BACKUP_ID);
|
||||
|
||||
catalog_lock_backup_list(backup_list, 0, parray_num(backup_list) - 1);
|
||||
|
||||
for (i = 0; i < parray_num(backup_list); i++)
|
||||
{
|
||||
pgBackup *backup = (pgBackup *) parray_get(backup_list, i);
|
||||
|
11
src/merge.c
11
src/merge.c
@ -61,8 +61,6 @@ do_merge(time_t backup_id)
|
||||
|
||||
elog(INFO, "Merge started");
|
||||
|
||||
catalog_lock();
|
||||
|
||||
/* Get list of all backups sorted in order of descending start time */
|
||||
backups = catalog_get_backup_list(INVALID_BACKUP_ID);
|
||||
|
||||
@ -125,6 +123,8 @@ do_merge(time_t backup_id)
|
||||
|
||||
Assert(full_backup_idx != dest_backup_idx);
|
||||
|
||||
catalog_lock_backup_list(backups, full_backup_idx, dest_backup_idx);
|
||||
|
||||
/*
|
||||
* Found target and full backups, merge them and intermediate backups
|
||||
*/
|
||||
@ -227,11 +227,8 @@ merge_backups(pgBackup *to_backup, pgBackup *from_backup)
|
||||
if (from_backup->status == BACKUP_STATUS_DELETING)
|
||||
goto delete_source_backup;
|
||||
|
||||
to_backup->status = BACKUP_STATUS_MERGING;
|
||||
write_backup_status(to_backup);
|
||||
|
||||
from_backup->status = BACKUP_STATUS_MERGING;
|
||||
write_backup_status(from_backup);
|
||||
write_backup_status(to_backup, BACKUP_STATUS_MERGING);
|
||||
write_backup_status(from_backup, BACKUP_STATUS_MERGING);
|
||||
|
||||
create_data_directories(to_database_path, from_backup_path, false);
|
||||
|
||||
|
@ -484,8 +484,7 @@ validate_backup_wal_from_start_to_stop(pgBackup *backup,
|
||||
* If we don't have WAL between start_lsn and stop_lsn,
|
||||
* the backup is definitely corrupted. Update its status.
|
||||
*/
|
||||
backup->status = BACKUP_STATUS_CORRUPT;
|
||||
write_backup_status(backup);
|
||||
write_backup_status(backup, BACKUP_STATUS_CORRUPT);
|
||||
|
||||
elog(WARNING, "There are not enough WAL records to consistenly restore "
|
||||
"backup %s from START LSN: %X/%X to STOP LSN: %X/%X",
|
||||
|
@ -43,7 +43,7 @@
|
||||
#define PG_GLOBAL_DIR "global"
|
||||
#define BACKUP_CONTROL_FILE "backup.control"
|
||||
#define BACKUP_CATALOG_CONF_FILE "pg_probackup.conf"
|
||||
#define BACKUP_CATALOG_PID "pg_probackup.pid"
|
||||
#define BACKUP_CATALOG_PID "backup.pid"
|
||||
#define DATABASE_FILE_LIST "backup_content.control"
|
||||
#define PG_BACKUP_LABEL_FILE "backup_label"
|
||||
#define PG_BLACK_LIST "black_list"
|
||||
@ -459,14 +459,16 @@ extern int do_validate_all(void);
|
||||
/* in catalog.c */
|
||||
extern pgBackup *read_backup(time_t timestamp);
|
||||
extern void write_backup(pgBackup *backup);
|
||||
extern void write_backup_status(pgBackup *backup);
|
||||
extern void write_backup_status(pgBackup *backup, BackupStatus status);
|
||||
extern bool lock_backup(pgBackup *backup);
|
||||
|
||||
extern const char *pgBackupGetBackupMode(pgBackup *backup);
|
||||
|
||||
extern parray *catalog_get_backup_list(time_t requested_backup_id);
|
||||
extern void catalog_lock_backup_list(parray *backup_list, int from_idx,
|
||||
int to_idx);
|
||||
extern pgBackup *catalog_get_last_data_backup(parray *backup_list,
|
||||
TimeLineID tli);
|
||||
extern void catalog_lock(void);
|
||||
extern void pgBackupWriteControl(FILE *out, pgBackup *backup);
|
||||
extern void write_backup_filelist(pgBackup *backup, parray *files,
|
||||
const char *root);
|
||||
|
@ -75,8 +75,6 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
|
||||
|
||||
elog(LOG, "%s begin.", action);
|
||||
|
||||
/* Get exclusive lock of backup catalog */
|
||||
catalog_lock();
|
||||
/* Get list of all backups sorted in order of descending start time */
|
||||
backups = catalog_get_backup_list(INVALID_BACKUP_ID);
|
||||
|
||||
@ -126,7 +124,8 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
|
||||
{
|
||||
if ((current_backup->status == BACKUP_STATUS_DONE ||
|
||||
current_backup->status == BACKUP_STATUS_ORPHAN ||
|
||||
current_backup->status == BACKUP_STATUS_CORRUPT)
|
||||
current_backup->status == BACKUP_STATUS_CORRUPT ||
|
||||
current_backup->status == BACKUP_STATUS_RUNNING)
|
||||
&& !rt->restore_no_validate)
|
||||
elog(WARNING, "Backup %s has status: %s",
|
||||
base36enc(current_backup->start_time), status2str(current_backup->status));
|
||||
@ -211,8 +210,7 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
|
||||
{
|
||||
if (backup->status == BACKUP_STATUS_OK)
|
||||
{
|
||||
backup->status = BACKUP_STATUS_ORPHAN;
|
||||
write_backup_status(backup);
|
||||
write_backup_status(backup, BACKUP_STATUS_ORPHAN);
|
||||
|
||||
elog(WARNING, "Backup %s is orphaned because his parent %s is missing",
|
||||
base36enc(backup->start_time), missing_backup_id);
|
||||
@ -244,8 +242,7 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
|
||||
{
|
||||
if (backup->status == BACKUP_STATUS_OK)
|
||||
{
|
||||
backup->status = BACKUP_STATUS_ORPHAN;
|
||||
write_backup_status(backup);
|
||||
write_backup_status(backup, BACKUP_STATUS_ORPHAN);
|
||||
|
||||
elog(WARNING,
|
||||
"Backup %s is orphaned because his parent %s has status: %s",
|
||||
@ -319,9 +316,27 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
|
||||
{
|
||||
tmp_backup = (pgBackup *) parray_get(parent_chain, i);
|
||||
|
||||
/* Do not interrupt, validate the next backup */
|
||||
if (!lock_backup(tmp_backup))
|
||||
{
|
||||
if (is_restore)
|
||||
elog(ERROR, "Cannot lock backup %s directory",
|
||||
base36enc(tmp_backup->start_time));
|
||||
else
|
||||
{
|
||||
elog(WARNING, "Cannot lock backup %s directory, skip validation",
|
||||
base36enc(tmp_backup->start_time));
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
pgBackupValidate(tmp_backup);
|
||||
/* Maybe we should be more paranoid and check for !BACKUP_STATUS_OK? */
|
||||
if (tmp_backup->status == BACKUP_STATUS_CORRUPT)
|
||||
/* After pgBackupValidate() only following backup
|
||||
* states are possible: ERROR, RUNNING, CORRUPT and OK.
|
||||
* Validate WAL only for OK, because there is no point
|
||||
* in WAL validation for corrupted, errored or running backups.
|
||||
*/
|
||||
if (tmp_backup->status != BACKUP_STATUS_OK)
|
||||
{
|
||||
corrupted_backup = tmp_backup;
|
||||
/* we need corrupted backup index from 'backups' not parent_chain
|
||||
@ -361,8 +376,7 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
|
||||
{
|
||||
if (backup->status == BACKUP_STATUS_OK)
|
||||
{
|
||||
backup->status = BACKUP_STATUS_ORPHAN;
|
||||
write_backup_status(backup);
|
||||
write_backup_status(backup, BACKUP_STATUS_ORPHAN);
|
||||
|
||||
elog(WARNING, "Backup %s is orphaned because his parent %s has status: %s",
|
||||
base36enc(backup->start_time),
|
||||
@ -406,6 +420,13 @@ do_restore_or_validate(time_t target_backup_id, pgRecoveryTarget *rt,
|
||||
elog(ERROR, "Backup %s was created for version %s which doesn't support recovery_target_lsn",
|
||||
base36enc(dest_backup->start_time), dest_backup->server_version);
|
||||
|
||||
/*
|
||||
* Backup was locked during validation if no-validate wasn't
|
||||
* specified.
|
||||
*/
|
||||
if (rt->restore_no_validate && !lock_backup(backup))
|
||||
elog(ERROR, "Cannot lock backup directory");
|
||||
|
||||
restore_backup(backup);
|
||||
}
|
||||
|
||||
|
@ -19,6 +19,7 @@ static void *pgBackupValidateFiles(void *arg);
|
||||
static void do_validate_instance(void);
|
||||
|
||||
static bool corrupted_backup_found = false;
|
||||
static bool skipped_due_to_lock = false;
|
||||
|
||||
typedef struct
|
||||
{
|
||||
@ -59,6 +60,15 @@ pgBackupValidate(pgBackup *backup)
|
||||
"Please upgrade pg_probackup binary.",
|
||||
PROGRAM_VERSION, base36enc(backup->start_time), backup->program_version);
|
||||
|
||||
if (backup->status == BACKUP_STATUS_RUNNING)
|
||||
{
|
||||
elog(WARNING, "Backup %s has status %s, change it to ERROR and skip validation",
|
||||
base36enc(backup->start_time), status2str(backup->status));
|
||||
write_backup_status(backup, BACKUP_STATUS_ERROR);
|
||||
corrupted_backup_found = true;
|
||||
return;
|
||||
}
|
||||
|
||||
/* Revalidation is attempted for DONE, ORPHAN and CORRUPT backups */
|
||||
if (backup->status != BACKUP_STATUS_OK &&
|
||||
backup->status != BACKUP_STATUS_DONE &&
|
||||
@ -143,8 +153,8 @@ pgBackupValidate(pgBackup *backup)
|
||||
parray_free(files);
|
||||
|
||||
/* Update backup status */
|
||||
backup->status = corrupted ? BACKUP_STATUS_CORRUPT : BACKUP_STATUS_OK;
|
||||
write_backup_status(backup);
|
||||
write_backup_status(backup, corrupted ? BACKUP_STATUS_CORRUPT :
|
||||
BACKUP_STATUS_OK);
|
||||
|
||||
if (corrupted)
|
||||
elog(WARNING, "Backup %s data files are corrupted", base36enc(backup->start_time));
|
||||
@ -278,6 +288,9 @@ pgBackupValidateFiles(void *arg)
|
||||
int
|
||||
do_validate_all(void)
|
||||
{
|
||||
corrupted_backup_found = false;
|
||||
skipped_due_to_lock = false;
|
||||
|
||||
if (instance_name == NULL)
|
||||
{
|
||||
/* Show list of instances */
|
||||
@ -330,12 +343,16 @@ do_validate_all(void)
|
||||
do_validate_instance();
|
||||
}
|
||||
|
||||
if (skipped_due_to_lock)
|
||||
elog(WARNING, "Some backups weren't locked and they were skipped");
|
||||
|
||||
if (corrupted_backup_found)
|
||||
{
|
||||
elog(WARNING, "Some backups are not valid");
|
||||
return 1;
|
||||
}
|
||||
else
|
||||
|
||||
if (!skipped_due_to_lock && !corrupted_backup_found)
|
||||
elog(INFO, "All backups are valid");
|
||||
|
||||
return 0;
|
||||
@ -355,9 +372,6 @@ do_validate_instance(void)
|
||||
|
||||
elog(INFO, "Validate backups of the instance '%s'", instance_name);
|
||||
|
||||
/* Get exclusive lock of backup catalog */
|
||||
catalog_lock();
|
||||
|
||||
/* Get list of all backups sorted in order of descending start time */
|
||||
backups = catalog_get_backup_list(INVALID_BACKUP_ID);
|
||||
|
||||
@ -388,8 +402,7 @@ do_validate_instance(void)
|
||||
/* orphanize current_backup */
|
||||
if (current_backup->status == BACKUP_STATUS_OK)
|
||||
{
|
||||
current_backup->status = BACKUP_STATUS_ORPHAN;
|
||||
write_backup_status(current_backup);
|
||||
write_backup_status(current_backup, BACKUP_STATUS_ORPHAN);
|
||||
elog(WARNING, "Backup %s is orphaned because his parent %s is missing",
|
||||
base36enc(current_backup->start_time),
|
||||
parent_backup_id);
|
||||
@ -413,8 +426,7 @@ do_validate_instance(void)
|
||||
/* orphanize current_backup */
|
||||
if (current_backup->status == BACKUP_STATUS_OK)
|
||||
{
|
||||
current_backup->status = BACKUP_STATUS_ORPHAN;
|
||||
write_backup_status(current_backup);
|
||||
write_backup_status(current_backup, BACKUP_STATUS_ORPHAN);
|
||||
elog(WARNING, "Backup %s is orphaned because his parent %s has status: %s",
|
||||
base36enc(current_backup->start_time), parent_backup_id,
|
||||
status2str(tmp_backup->status));
|
||||
@ -438,6 +450,14 @@ do_validate_instance(void)
|
||||
else
|
||||
base_full_backup = current_backup;
|
||||
|
||||
/* Do not interrupt, validate the next backup */
|
||||
if (!lock_backup(current_backup))
|
||||
{
|
||||
elog(WARNING, "Cannot lock backup %s directory, skip validation",
|
||||
base36enc(current_backup->start_time));
|
||||
skipped_due_to_lock = true;
|
||||
continue;
|
||||
}
|
||||
/* Validate backup files */
|
||||
pgBackupValidate(current_backup);
|
||||
|
||||
@ -450,14 +470,14 @@ do_validate_instance(void)
|
||||
/*
|
||||
* Mark every descendant of corrupted backup as orphan
|
||||
*/
|
||||
if (current_backup->status == BACKUP_STATUS_CORRUPT)
|
||||
if (current_backup->status != BACKUP_STATUS_OK)
|
||||
{
|
||||
/* This is ridiculous but legal.
|
||||
* PAGE1_2b <- OK
|
||||
* PAGE1_2a <- OK
|
||||
* PAGE1_1b <- ORPHAN
|
||||
* PAGE1_1a <- CORRUPT
|
||||
* FULL1 <- OK
|
||||
* PAGE_b2 <- OK
|
||||
* PAGE_a2 <- OK
|
||||
* PAGE_b1 <- ORPHAN
|
||||
* PAGE_a1 <- CORRUPT
|
||||
* FULL <- OK
|
||||
*/
|
||||
|
||||
corrupted_backup_found = true;
|
||||
@ -471,8 +491,7 @@ do_validate_instance(void)
|
||||
{
|
||||
if (backup->status == BACKUP_STATUS_OK)
|
||||
{
|
||||
backup->status = BACKUP_STATUS_ORPHAN;
|
||||
write_backup_status(backup);
|
||||
write_backup_status(backup, BACKUP_STATUS_ORPHAN);
|
||||
|
||||
elog(WARNING, "Backup %s is orphaned because his parent %s has status: %s",
|
||||
base36enc(backup->start_time),
|
||||
@ -497,14 +516,14 @@ do_validate_instance(void)
|
||||
pgBackup *tmp_backup = NULL;
|
||||
int result;
|
||||
|
||||
//PAGE3b ORPHAN
|
||||
//PAGE2b ORPHAN -----
|
||||
//PAGE6a ORPHAN |
|
||||
//PAGE5a CORRUPT |
|
||||
//PAGE4a missing |
|
||||
//PAGE3a missing |
|
||||
//PAGE2a ORPHAN |
|
||||
//PAGE1a OK <- we are here <-|
|
||||
//PAGE_b2 ORPHAN
|
||||
//PAGE_b1 ORPHAN -----
|
||||
//PAGE_a5 ORPHAN |
|
||||
//PAGE_a4 CORRUPT |
|
||||
//PAGE_a3 missing |
|
||||
//PAGE_a2 missing |
|
||||
//PAGE_a1 ORPHAN |
|
||||
//PAGE OK <- we are here<-|
|
||||
//FULL OK
|
||||
|
||||
if (is_parent(current_backup->start_time, backup, false))
|
||||
@ -524,6 +543,14 @@ do_validate_instance(void)
|
||||
|
||||
if (backup->status == BACKUP_STATUS_ORPHAN)
|
||||
{
|
||||
/* Do not interrupt, validate the next backup */
|
||||
if (!lock_backup(backup))
|
||||
{
|
||||
elog(WARNING, "Cannot lock backup %s directory, skip validation",
|
||||
base36enc(backup->start_time));
|
||||
skipped_due_to_lock = true;
|
||||
continue;
|
||||
}
|
||||
/* Revalidate backup files */
|
||||
pgBackupValidate(backup);
|
||||
|
||||
|
@ -806,7 +806,7 @@ class ProbackupTest(object):
|
||||
|
||||
def validate_pb(
|
||||
self, backup_dir, instance=None,
|
||||
backup_id=None, options=[], old_binary=False
|
||||
backup_id=None, options=[], old_binary=False, gdb=False
|
||||
):
|
||||
|
||||
cmd_list = [
|
||||
@ -818,7 +818,7 @@ class ProbackupTest(object):
|
||||
if backup_id:
|
||||
cmd_list += ['-i', backup_id]
|
||||
|
||||
return self.run_pb(cmd_list + options, old_binary=old_binary)
|
||||
return self.run_pb(cmd_list + options, old_binary=old_binary, gdb=gdb)
|
||||
|
||||
def delete_pb(
|
||||
self, backup_dir, instance,
|
||||
@ -1383,7 +1383,7 @@ class GDBobj(ProbackupTest):
|
||||
if line.startswith('*stopped,reason="breakpoint-hit"'):
|
||||
continue
|
||||
if (
|
||||
line.startswith('*stopped,reason="exited-normally"') or
|
||||
line.startswith('*stopped,reason="exited"') or
|
||||
line == '*stopped\n'
|
||||
):
|
||||
return
|
||||
@ -1391,6 +1391,18 @@ class GDBobj(ProbackupTest):
|
||||
'Failed to continue execution until exit.\n'
|
||||
)
|
||||
|
||||
def continue_execution_until_error(self):
    """
    Send 'continue' to gdb and scan the returned lines.

    Returns None as soon as a line starts with '^error' or with
    '*stopped,reason="exited'; raises GdbException when no such
    line is found in the output.
    """
    terminal_prefixes = ('^error', '*stopped,reason="exited')

    for line in self._execute('continue', False):
        # str.startswith accepts a tuple: either prefix ends the wait
        if line.startswith(terminal_prefixes):
            return

    raise GdbException(
        'Failed to continue execution until error.\n')
|
||||
|
||||
def continue_execution_until_break(self, ignore_count=0):
|
||||
if ignore_count > 0:
|
||||
result = self._execute(
|
||||
@ -1436,6 +1448,9 @@ class GDBobj(ProbackupTest):
|
||||
print(repr(line))
|
||||
if line.startswith('^done') or line.startswith('*stopped'):
|
||||
break
|
||||
if running and line.startswith('*running'):
|
||||
if line.startswith('^error'):
|
||||
break
|
||||
if running and (line.startswith('*running') or line.startswith('^running')):
|
||||
# if running and line.startswith('*running'):
|
||||
break
|
||||
return output
|
||||
|
411
tests/locking.py
411
tests/locking.py
@ -11,8 +11,68 @@ class LockingTest(ProbackupTest, unittest.TestCase):
|
||||
|
||||
# @unittest.skip("skip")
|
||||
# @unittest.expectedFailure
|
||||
def test_locking_simple(self):
|
||||
""""""
|
||||
def test_locking_running_validate_1(self):
    """
    make node, take full backup, stop the second backup in the middle,
    run validate, expect it to be executed successfully:
    a concurrent RUNNING backup with a pid file and an active process
    is legal
    """
    fname = self.id().split('.')[3]
    node = self.make_simple_node(
        base_dir=os.path.join(module_name, fname, 'node'),
        initdb_params=['--data-checksums'],
        pg_options={'wal_level': 'replica'})

    backup_dir = os.path.join(self.tmp_path, module_name, fname, 'backup')
    self.init_pb(backup_dir)
    self.add_instance(backup_dir, 'node', node)
    self.set_archiving(backup_dir, 'node', node)
    node.slow_start()

    # First backup completes normally
    self.backup_node(backup_dir, 'node', node)

    # Second backup is run under gdb and paused while copying files
    gdb = self.backup_node(
        backup_dir, 'node', node, gdb=True)

    gdb.set_breakpoint('copy_file')
    gdb.run_until_break()

    if gdb.continue_execution_until_break(20) != 'breakpoint-hit':
        self.fail('Failed to hit breakpoint')

    self.assertEqual(
        'OK', self.show_pb(backup_dir, 'node')[0]['status'])

    self.assertEqual(
        'RUNNING', self.show_pb(backup_dir, 'node')[1]['status'])

    validate_output = self.validate_pb(
        backup_dir, options=['--log-level-console=LOG'])

    backup_id = self.show_pb(backup_dir, 'node')[1]['id']

    # Validation must notice the live owner of the second backup
    self.assertIn(
        "is using backup {0} and still is running".format(backup_id),
        validate_output,
        '\n Unexpected Validate Output: {0}\n'.format(repr(validate_output)))

    # Neither backup may change status because of the validate run
    self.assertEqual(
        'OK', self.show_pb(backup_dir, 'node')[0]['status'])

    self.assertEqual(
        'RUNNING', self.show_pb(backup_dir, 'node')[1]['status'])

    # Clean after yourself
    # NOTE(review): cleanup is disabled here - presumably because the
    # paused backup process is still attached to gdb; confirm.
    # self.del_test_dir(module_name, fname)
|
||||
|
||||
def test_locking_running_validate_2(self):
|
||||
"""
|
||||
make node, take full backup, stop it in the middle,
|
||||
kill process so no cleanup is done - pid file is in place,
|
||||
run validate, expect it to exit with an error,
|
||||
RUNNING backup with pid file AND without active pid is legal,
|
||||
but its status must be changed to ERROR and the pid file deleted
|
||||
"""
|
||||
fname = self.id().split('.')[3]
|
||||
node = self.make_simple_node(
|
||||
base_dir=os.path.join(module_name, fname, 'node'),
|
||||
@ -37,6 +97,34 @@ class LockingTest(ProbackupTest, unittest.TestCase):
|
||||
self.AssertTrue(False, 'Failed to hit breakpoint')
|
||||
|
||||
gdb._execute('signal SIGKILL')
|
||||
gdb.continue_execution_until_error()
|
||||
|
||||
self.assertEqual(
|
||||
'OK', self.show_pb(backup_dir, 'node')[0]['status'])
|
||||
|
||||
self.assertEqual(
|
||||
'RUNNING', self.show_pb(backup_dir, 'node')[1]['status'])
|
||||
|
||||
backup_id = self.show_pb(backup_dir, 'node')[1]['id']
|
||||
|
||||
try:
|
||||
self.validate_pb(backup_dir)
|
||||
self.assertEqual(
|
||||
1, 0,
|
||||
"Expecting Error because RUNNING backup is no longer active.\n "
|
||||
"Output: {0} \n CMD: {1}".format(
|
||||
repr(self.output), self.cmd))
|
||||
except ProbackupException as e:
|
||||
self.assertTrue(
|
||||
"which used backup {0} no longer exists".format(
|
||||
backup_id) in e.message and
|
||||
"Backup {0} has status RUNNING, change it "
|
||||
"to ERROR and skip validation".format(
|
||||
backup_id) in e.message and
|
||||
"WARNING: Some backups are not valid" in
|
||||
e.message,
|
||||
'\n Unexpected Error Message: {0}\n CMD: {1}'.format(
|
||||
repr(e.message), self.cmd))
|
||||
|
||||
self.assertEqual(
|
||||
'OK', self.show_pb(backup_dir, 'node')[0]['status'])
|
||||
@ -45,4 +133,321 @@ class LockingTest(ProbackupTest, unittest.TestCase):
|
||||
'ERROR', self.show_pb(backup_dir, 'node')[1]['status'])
|
||||
|
||||
# Clean after yourself
|
||||
self.del_test_dir(module_name, fname)
|
||||
self.del_test_dir(module_name, fname)
|
||||
|
||||
def test_locking_running_validate_2_specific_id(self):
    """
    Validate one specific backup that was left RUNNING by a killed process.

    Scenario:
      1. Take a FULL backup that completes normally.
      2. Start a second backup under gdb, stop it at 'copy_file' and
         send it SIGKILL, so the process does no cleanup and the backup
         stays in RUNNING status.
      3. Validate that specific backup: validation must fail and the
         backup status must be changed from RUNNING to ERROR.
      4. Validate the same backup again, then the whole catalog: both
         runs must fail because the backup now has status ERROR.
    """
    fname = self.id().split('.')[3]
    node = self.make_simple_node(
        base_dir=os.path.join(module_name, fname, 'node'),
        initdb_params=['--data-checksums'],
        pg_options={'wal_level': 'replica'})

    backup_dir = os.path.join(self.tmp_path, module_name, fname, 'backup')
    self.init_pb(backup_dir)
    self.add_instance(backup_dir, 'node', node)
    self.set_archiving(backup_dir, 'node', node)
    node.slow_start()

    # First backup completes normally and must stay OK to the end.
    self.backup_node(backup_dir, 'node', node)

    # Second backup runs under gdb so it can be killed mid-flight.
    gdb = self.backup_node(
        backup_dir, 'node', node, gdb=True)

    gdb.set_breakpoint('copy_file')
    gdb.run_until_break()

    if gdb.continue_execution_until_break(20) != 'breakpoint-hit':
        # unittest.TestCase has no AssertTrue(); fail() reports the
        # intended message instead of raising AttributeError.
        self.fail('Failed to hit breakpoint')

    # SIGKILL cannot be handled, so the backup process exits
    # without running any cleanup.
    gdb._execute('signal SIGKILL')
    gdb.continue_execution_until_error()

    self.assertEqual(
        'OK', self.show_pb(backup_dir, 'node')[0]['status'])

    self.assertEqual(
        'RUNNING', self.show_pb(backup_dir, 'node')[1]['status'])

    backup_id = self.show_pb(backup_dir, 'node')[1]['id']

    # First validation of the killed backup: RUNNING -> ERROR.
    try:
        self.validate_pb(backup_dir, 'node', backup_id)
        self.assertEqual(
            1, 0,
            "Expecting Error because RUNNING backup is no longer active.\n "
            "Output: {0} \n CMD: {1}".format(
                repr(self.output), self.cmd))
    except ProbackupException as e:
        self.assertTrue(
            "which used backup {0} no longer exists".format(
                backup_id) in e.message and
            "Backup {0} has status RUNNING, change it "
            "to ERROR and skip validation".format(
                backup_id) in e.message and
            "ERROR: Backup {0} has status: ERROR".format(backup_id) in
            e.message,
            '\n Unexpected Error Message: {0}\n CMD: {1}'.format(
                repr(e.message), self.cmd))

    self.assertEqual(
        'OK', self.show_pb(backup_dir, 'node')[0]['status'])

    self.assertEqual(
        'ERROR', self.show_pb(backup_dir, 'node')[1]['status'])

    # Second validation of the same backup: it is already ERROR.
    try:
        self.validate_pb(backup_dir, 'node', backup_id)
        self.assertEqual(
            1, 0,
            "Expecting Error because backup has status ERROR.\n "
            "Output: {0} \n CMD: {1}".format(
                repr(self.output), self.cmd))
    except ProbackupException as e:
        self.assertIn(
            "ERROR: Backup {0} has status: ERROR".format(backup_id),
            e.message,
            '\n Unexpected Error Message: {0}\n CMD: {1}'.format(
                repr(e.message), self.cmd))

    # Validation of the whole catalog must skip the ERROR backup
    # and report that some backups are not valid.
    try:
        self.validate_pb(backup_dir)
        self.assertEqual(
            1, 0,
            "Expecting Error because backup has status ERROR.\n "
            "Output: {0} \n CMD: {1}".format(
                repr(self.output), self.cmd))
    except ProbackupException as e:
        self.assertTrue(
            "WARNING: Backup {0} has status ERROR. Skip validation".format(
                backup_id) in e.message and
            "WARNING: Some backups are not valid" in e.message,
            '\n Unexpected Error Message: {0}\n CMD: {1}'.format(
                repr(e.message), self.cmd))

    # Clean after yourself
    self.del_test_dir(module_name, fname)
|
||||
|
||||
def test_locking_running_3(self):
    """
    Validate the catalog after a killed backup whose pid file was removed.

    Scenario:
      1. Take a FULL backup that completes normally.
      2. Start a second backup under gdb, stop it at 'copy_file' and
         send it SIGKILL, leaving the backup in RUNNING status.
      3. Delete the 'backup.pid' file of the killed backup.
      4. Validate the whole catalog: validation must fail, and the
         RUNNING backup (no pid file, no live process) must be changed
         to ERROR while the first backup stays OK.
    """
    fname = self.id().split('.')[3]
    node = self.make_simple_node(
        base_dir=os.path.join(module_name, fname, 'node'),
        initdb_params=['--data-checksums'],
        pg_options={'wal_level': 'replica'})

    backup_dir = os.path.join(self.tmp_path, module_name, fname, 'backup')
    self.init_pb(backup_dir)
    self.add_instance(backup_dir, 'node', node)
    self.set_archiving(backup_dir, 'node', node)
    node.slow_start()

    # First backup completes normally and must stay OK to the end.
    self.backup_node(backup_dir, 'node', node)

    # Second backup runs under gdb so it can be killed mid-flight.
    gdb = self.backup_node(
        backup_dir, 'node', node, gdb=True)

    gdb.set_breakpoint('copy_file')
    gdb.run_until_break()

    if gdb.continue_execution_until_break(20) != 'breakpoint-hit':
        # unittest.TestCase has no AssertTrue(); fail() reports the
        # intended message instead of raising AttributeError.
        self.fail('Failed to hit breakpoint')

    # SIGKILL cannot be handled, so the backup process exits
    # without running any cleanup.
    gdb._execute('signal SIGKILL')
    gdb.continue_execution_until_error()

    self.assertEqual(
        'OK', self.show_pb(backup_dir, 'node')[0]['status'])

    self.assertEqual(
        'RUNNING', self.show_pb(backup_dir, 'node')[1]['status'])

    backup_id = self.show_pb(backup_dir, 'node')[1]['id']

    # Remove the pid file by hand: this test covers the
    # "RUNNING without pid file" case.
    os.remove(
        os.path.join(backup_dir, 'backups', 'node', backup_id, 'backup.pid'))

    try:
        self.validate_pb(backup_dir)
        self.assertEqual(
            1, 0,
            "Expecting Error because RUNNING backup is no longer active.\n "
            "Output: {0} \n CMD: {1}".format(
                repr(self.output), self.cmd))
    except ProbackupException as e:
        self.assertTrue(
            "Backup {0} has status RUNNING, change it "
            "to ERROR and skip validation".format(
                backup_id) in e.message and
            "WARNING: Some backups are not valid" in
            e.message,
            '\n Unexpected Error Message: {0}\n CMD: {1}'.format(
                repr(e.message), self.cmd))

    self.assertEqual(
        'OK', self.show_pb(backup_dir, 'node')[0]['status'])

    self.assertEqual(
        'ERROR', self.show_pb(backup_dir, 'node')[1]['status'])

    # Clean after yourself
    self.del_test_dir(module_name, fname)
|
||||
|
||||
def test_locking_restore_locked(self):
    """
    Restore must fail while a validate process is stopped in the middle.

    Takes FULL, PAGE1 and PAGE2 backups, starts validation of PAGE1
    under gdb and leaves it stopped at the 'pgBackupValidate'
    breakpoint, then runs restore without naming a backup. The restore
    is expected to fail with "Cannot lock backup <FULL id> directory".
    """
    test_name = self.id().split('.')[3]
    node_dir = os.path.join(module_name, test_name, 'node')
    pg_node = self.make_simple_node(
        base_dir=node_dir,
        initdb_params=['--data-checksums'],
        pg_options={'wal_level': 'replica'})

    catalog = os.path.join(self.tmp_path, module_name, test_name, 'backup')
    self.init_pb(catalog)
    self.add_instance(catalog, 'node', pg_node)
    self.set_archiving(catalog, 'node', pg_node)
    pg_node.slow_start()

    # Backup chain: FULL <- PAGE1 <- PAGE2
    full_id = self.backup_node(catalog, 'node', pg_node)
    page1_id = self.backup_node(
        catalog, 'node', pg_node, backup_type='page')
    self.backup_node(catalog, 'node', pg_node, backup_type='page')

    # Start validation of PAGE1 and leave it stopped at the breakpoint.
    debugger = self.validate_pb(
        catalog, 'node', backup_id=page1_id, gdb=True)
    debugger.set_breakpoint('pgBackupValidate')
    debugger.run_until_break()

    pg_node.cleanup()

    try:
        self.restore_node(catalog, 'node', pg_node)
    except ProbackupException as e:
        lock_error = "ERROR: Cannot lock backup {0} directory\n".format(
            full_id)
        self.assertTrue(
            lock_error in e.message,
            '\n Unexpected Error Message: {0}\n CMD: {1}'.format(
                repr(e.message), self.cmd))
    else:
        # Restore succeeded although it was expected to fail.
        self.assertEqual(
            1, 0,
            "Expecting Error because restore without whole chain validation "
            "is prohibited unless --no-validate provided.\n "
            "Output: {0} \n CMD: {1}".format(
                repr(self.output), self.cmd))

    # Remove the test directory
    self.del_test_dir(module_name, test_name)
|
||||
|
||||
def test_locking_restore_locked_without_validation(self):
    """
    Restore with --no-validate must fail while FULL is being validated.

    Takes FULL and PAGE backups, starts validation of FULL under gdb
    and leaves it stopped at the 'pgBackupValidate' breakpoint, then
    runs restore with '--no-validate'. The restore is expected to fail,
    reporting that the PAGE backup is used without validation, that
    another process is still using the FULL backup, and that the
    backup directory cannot be locked.
    """
    test_name = self.id().split('.')[3]
    node_dir = os.path.join(module_name, test_name, 'node')
    pg_node = self.make_simple_node(
        base_dir=node_dir,
        initdb_params=['--data-checksums'],
        pg_options={'wal_level': 'replica'})

    catalog = os.path.join(self.tmp_path, module_name, test_name, 'backup')
    self.init_pb(catalog)
    self.add_instance(catalog, 'node', pg_node)
    self.set_archiving(catalog, 'node', pg_node)
    pg_node.slow_start()

    # Backup chain: FULL <- PAGE
    backup_id = self.backup_node(catalog, 'node', pg_node)
    restore_id = self.backup_node(
        catalog, 'node', pg_node, backup_type='page')

    # Start validation of FULL and leave it stopped at the breakpoint.
    debugger = self.validate_pb(
        catalog, 'node', backup_id=backup_id, gdb=True)
    debugger.set_breakpoint('pgBackupValidate')
    debugger.run_until_break()

    pg_node.cleanup()

    try:
        self.restore_node(
            catalog, 'node', pg_node, options=['--no-validate'])
    except ProbackupException as e:
        # Every fragment must be present in the error output.
        expected_fragments = (
            "Backup {0} is used without validation".format(restore_id),
            'is using backup {0} and still is running'.format(backup_id),
            'ERROR: Cannot lock backup directory',
        )
        self.assertTrue(
            all(fragment in e.message for fragment in expected_fragments),
            '\n Unexpected Error Message: {0}\n CMD: {1}'.format(
                repr(e.message), self.cmd))
    else:
        # Restore succeeded although it was expected to fail.
        self.assertEqual(
            1, 0,
            "Expecting Error because restore without whole chain validation "
            "is prohibited unless --no-validate provided.\n "
            "Output: {0} \n CMD: {1}".format(
                repr(self.output), self.cmd))

    # Remove the test directory
    self.del_test_dir(module_name, test_name)
|
||||
|
||||
def test_locking_concurrent_vaidate_and_backup(self):
    """
    A new backup must succeed while an older backup is being validated.

    Takes FULL and PAGE backups, starts validation of the PAGE backup
    under gdb and leaves it stopped at the 'pgBackupValidate'
    breakpoint, then takes one more PAGE backup, which is expected to
    complete without error.
    """
    test_name = self.id().split('.')[3]
    node_dir = os.path.join(module_name, test_name, 'node')
    pg_node = self.make_simple_node(
        base_dir=node_dir,
        initdb_params=['--data-checksums'],
        pg_options={'wal_level': 'replica'})

    catalog = os.path.join(self.tmp_path, module_name, test_name, 'backup')
    self.init_pb(catalog)
    self.add_instance(catalog, 'node', pg_node)
    self.set_archiving(catalog, 'node', pg_node)
    pg_node.slow_start()

    # FULL backup, then the first PAGE backup (the one to be validated)
    self.backup_node(catalog, 'node', pg_node)
    page_id = self.backup_node(
        catalog, 'node', pg_node, backup_type='page')

    # Start validation of the PAGE backup and leave it stopped
    # at the breakpoint.
    debugger = self.validate_pb(
        catalog, 'node', backup_id=page_id, gdb=True)
    debugger.set_breakpoint('pgBackupValidate')
    debugger.run_until_break()

    # This PAGE backup is expected to be successful
    self.backup_node(catalog, 'node', pg_node, backup_type='page')

    # Remove the test directory
    self.del_test_dir(module_name, test_name)
|
||||
|
@ -411,6 +411,180 @@ class ValidateTest(ProbackupTest, unittest.TestCase):
|
||||
# Clean after yourself
|
||||
self.del_test_dir(module_name, fname)
|
||||
|
||||
# @unittest.skip("skip")
|
||||
def test_validate_specific_error_intermediate_backups(self):
    """
    Validate PAGE1 when its FULL parent has status ERROR.

    Makes an archive node, takes FULL, PAGE1 and PAGE2 backups, rewrites
    the FULL backup's control file so its status becomes ERROR, and runs
    validate on PAGE1. Validation must fail, FULL must stay ERROR, and
    both PAGE1 and PAGE2 must become ORPHAN.

    The purpose of this test is to be sure that not only descendants
    of CORRUPT backups can be orphanized.
    """
    fname = self.id().split('.')[3]
    node = self.make_simple_node(
        base_dir=os.path.join(module_name, fname, 'node'),
        initdb_params=['--data-checksums'],
        pg_options={'wal_level': 'replica'}
    )
    backup_dir = os.path.join(self.tmp_path, module_name, fname, 'backup')
    self.init_pb(backup_dir)
    self.add_instance(backup_dir, 'node', node)
    self.set_archiving(backup_dir, 'node', node)
    node.slow_start()

    # FULL
    backup_id_1 = self.backup_node(backup_dir, 'node', node)

    # PAGE1
    backup_id_2 = self.backup_node(
        backup_dir, 'node', node, backup_type='page')

    # PAGE2
    backup_id_3 = self.backup_node(
        backup_dir, 'node', node, backup_type='page')

    # Change FULL backup status to ERROR
    control_path = os.path.join(
        backup_dir, 'backups', 'node', backup_id_1, 'backup.control')

    with open(control_path, 'r') as f:
        actual_control = f.read()

    new_control_file = ''
    for line in actual_control.splitlines():
        new_control_file += line.replace(
            'status = OK', 'status = ERROR')
        new_control_file += '\n'

    # The with-statement flushes and closes the file on exit.
    with open(control_path, 'wt') as f:
        f.write(new_control_file)

    # Validate PAGE1
    try:
        self.validate_pb(
            backup_dir, 'node', backup_id=backup_id_2)
        self.assertEqual(
            1, 0,
            "Expecting Error because backup has status ERROR.\n "
            "Output: {0} \n CMD: {1}".format(
                repr(self.output), self.cmd))
    except ProbackupException as e:
        # Every expected message is checked against e.message; a bare
        # string in the 'and' chain would always be truthy.
        self.assertTrue(
            'WARNING: Backup {0} is orphaned because '
            'his parent {1} has status: ERROR'.format(
                backup_id_2, backup_id_1) in e.message and
            'INFO: Validating parents for backup {0}'.format(
                backup_id_2) in e.message and
            'WARNING: Backup {0} has status ERROR. Skip validation.'.format(
                backup_id_1) in e.message and
            'ERROR: Backup {0} is orphan.'.format(backup_id_2) in e.message,
            '\n Unexpected Error Message: {0}\n '
            'CMD: {1}'.format(
                repr(e.message), self.cmd))

    self.assertEqual(
        'ERROR',
        self.show_pb(backup_dir, 'node', backup_id_1)['status'],
        'Backup STATUS should be "ERROR"')
    self.assertEqual(
        'ORPHAN',
        self.show_pb(backup_dir, 'node', backup_id_2)['status'],
        'Backup STATUS should be "ORPHAN"')
    self.assertEqual(
        'ORPHAN',
        self.show_pb(backup_dir, 'node', backup_id_3)['status'],
        'Backup STATUS should be "ORPHAN"')

    # Clean after yourself
    self.del_test_dir(module_name, fname)
|
||||
|
||||
# @unittest.skip("skip")
|
||||
def test_validate_error_intermediate_backups(self):
    """
    Validate the whole catalog when the FULL backup has status ERROR.

    Makes an archive node, takes FULL, PAGE1 and PAGE2 backups, rewrites
    the FULL backup's control file so its status becomes ERROR, and runs
    validate without naming an instance or backup. Validation must fail,
    FULL must stay ERROR, and both PAGE1 and PAGE2 must become ORPHAN.

    The purpose of this test is to be sure that not only descendants
    of CORRUPT backups can be orphanized.
    """
    test_name = self.id().split('.')[3]
    node_dir = os.path.join(module_name, test_name, 'node')
    pg_node = self.make_simple_node(
        base_dir=node_dir,
        initdb_params=['--data-checksums'],
        pg_options={'wal_level': 'replica'}
    )
    catalog = os.path.join(self.tmp_path, module_name, test_name, 'backup')
    self.init_pb(catalog)
    self.add_instance(catalog, 'node', pg_node)
    self.set_archiving(catalog, 'node', pg_node)
    pg_node.slow_start()

    # Backup chain: FULL <- PAGE1 <- PAGE2
    full_id = self.backup_node(catalog, 'node', pg_node)
    page1_id = self.backup_node(
        catalog, 'node', pg_node, backup_type='page')
    page2_id = self.backup_node(
        catalog, 'node', pg_node, backup_type='page')

    # Rewrite the FULL backup's control file with status ERROR
    control_file = os.path.join(
        catalog, 'backups', 'node', full_id, 'backup.control')

    with open(control_file, 'r') as src:
        original_text = src.read()

    patched_text = ''.join(
        entry.replace('status = OK', 'status = ERROR') + '\n'
        for entry in original_text.splitlines())

    with open(control_file, 'wt') as dst:
        dst.write(patched_text)
        dst.flush()
        dst.close()

    # Validate the whole catalog
    try:
        self.validate_pb(catalog)
    except ProbackupException as e:
        # Every fragment must be present in the error output.
        expected_fragments = (
            "WARNING: Backup {0} is orphaned because "
            "his parent {1} has status: ERROR".format(page1_id, full_id),
            'WARNING: Backup {0} has status ERROR. Skip validation'.format(
                full_id),
            "WARNING: Some backups are not valid",
        )
        self.assertTrue(
            all(fragment in e.message for fragment in expected_fragments),
            '\n Unexpected Error Message: {0}\n CMD: {1}'.format(
                repr(e.message), self.cmd))
    else:
        # Validation succeeded although it was expected to fail.
        self.assertEqual(
            1, 0,
            "Expecting Error because backup has status ERROR.\n "
            "Output: {0} \n CMD: {1}".format(
                repr(self.output), self.cmd))

    # FULL keeps ERROR; both of its descendants become ORPHAN.
    self.assertEqual(
        'ERROR',
        self.show_pb(catalog, 'node', full_id)['status'],
        'Backup STATUS should be "ERROR"')
    self.assertEqual(
        'ORPHAN',
        self.show_pb(catalog, 'node', page1_id)['status'],
        'Backup STATUS should be "ORPHAN"')
    self.assertEqual(
        'ORPHAN',
        self.show_pb(catalog, 'node', page2_id)['status'],
        'Backup STATUS should be "ORPHAN"')

    # Remove the test directory
    self.del_test_dir(module_name, test_name)
|
||||
|
||||
# @unittest.skip("skip")
|
||||
def test_validate_corrupted_intermediate_backups_1(self):
|
||||
"""
|
||||
|
Loading…
Reference in New Issue
Block a user