From 93e85cb86e5cc5ba9282fe7d2f2e899682fa0f41 Mon Sep 17 00:00:00 2001 From: Aleksandr Parfenov Date: Mon, 4 Jun 2018 11:27:00 +0300 Subject: [PATCH 1/5] Add --no-validate option for restore command --- src/pg_probackup.c | 4 +- src/pg_probackup.h | 3 +- src/restore.c | 113 ++++++++++++++++++++++------------------- tests/validate_test.py | 87 +++++++++++++++++++++++++++++++ 4 files changed, 154 insertions(+), 53 deletions(-) diff --git a/src/pg_probackup.c b/src/pg_probackup.c index 5d464171..598abdec 100644 --- a/src/pg_probackup.c +++ b/src/pg_probackup.c @@ -69,6 +69,7 @@ static char *target_action = NULL;; static pgRecoveryTarget *recovery_target_options = NULL; bool restore_as_replica = false; +bool restore_no_validate = false; /* delete options */ bool delete_wal = false; @@ -143,6 +144,7 @@ static pgut_option options[] = { 's', 25, "recovery-target-name", &target_name, SOURCE_CMDLINE }, { 's', 26, "recovery-target-action", &target_action, SOURCE_CMDLINE }, { 'b', 'R', "restore-as-replica", &restore_as_replica, SOURCE_CMDLINE }, + { 'b', 27, "no-validate", &restore_no_validate, SOURCE_CMDLINE }, /* delete options */ { 'b', 130, "wal", &delete_wal, SOURCE_CMDLINE }, { 'b', 131, "expired", &delete_expired, SOURCE_CMDLINE }, @@ -431,7 +433,7 @@ main(int argc, char *argv[]) /* parse all recovery target options into recovery_target_options structure */ recovery_target_options = parseRecoveryTargetOptions(target_time, target_xid, target_inclusive, target_tli, target_immediate, - target_name, target_action); + target_name, target_action, restore_no_validate); } if (num_threads < 1) diff --git a/src/pg_probackup.h b/src/pg_probackup.h index 30df34ce..76d5f3c9 100644 --- a/src/pg_probackup.h +++ b/src/pg_probackup.h @@ -247,6 +247,7 @@ typedef struct pgRecoveryTarget bool recovery_target_immediate; const char *recovery_target_name; const char *recovery_target_action; + bool restore_no_validate; } pgRecoveryTarget; /* Union to ease operations on relation pages */ @@ -378,7 +379,7 @@ extern parray * readTimeLineHistory_probackup(TimeLineID targetTLI); extern pgRecoveryTarget *parseRecoveryTargetOptions( const char *target_time, const char *target_xid, const char *target_inclusive, TimeLineID target_tli, bool target_immediate, - const char *target_name, const char *target_action); + const char *target_name, const char *target_action, bool restore_no_validate); extern void opt_tablespace_map(pgut_option *opt, const char *arg); diff --git a/src/restore.c b/src/restore.c index 69bac841..a5420a6c 100644 --- a/src/restore.c +++ b/src/restore.c @@ -243,66 +243,69 @@ do_restore_or_validate(time_t target_backup_id, if (is_restore) check_tablespace_mapping(dest_backup); - if (dest_backup->backup_mode != BACKUP_MODE_FULL) - elog(INFO, "Validating parents for backup %s", base36enc(dest_backup->start_time)); - - /* - * Validate backups from base_full_backup to dest_backup. - */ - for (i = base_full_backup_index; i >= dest_backup_index; i--) + if (!is_restore || !rt->restore_no_validate) { - pgBackup *backup = (pgBackup *) parray_get(backups, i); - pgBackupValidate(backup); - /* Maybe we should be more paranoid and check for !BACKUP_STATUS_OK? */ - if (backup->status == BACKUP_STATUS_CORRUPT) - { - corrupted_backup = backup; - corrupted_backup_index = i; - break; - } - /* We do not validate WAL files of intermediate backups - * It`s done to speed up restore - */ - } - /* There is no point in wal validation - * if there is corrupted backup between base_backup and dest_backup - */ - if (!corrupted_backup) + if (dest_backup->backup_mode != BACKUP_MODE_FULL) + elog(INFO, "Validating parents for backup %s", base36enc(dest_backup->start_time)); + /* - * Validate corresponding WAL files. - * We pass base_full_backup timeline as last argument to this function, - * because it's needed to form the name of xlog file. + * Validate backups from base_full_backup to dest_backup. */ - validate_wal(dest_backup, arclog_path, rt->recovery_target_time, - rt->recovery_target_xid, base_full_backup->tli); - - /* Set every incremental backup between corrupted backup and nearest FULL backup as orphans */ - if (corrupted_backup) - { - for (i = corrupted_backup_index - 1; i >= 0; i--) + for (i = base_full_backup_index; i >= dest_backup_index; i--) { pgBackup *backup = (pgBackup *) parray_get(backups, i); - /* Mark incremental OK backup as orphan */ - if (backup->backup_mode == BACKUP_MODE_FULL) - break; - if (backup->status != BACKUP_STATUS_OK) - continue; - else + pgBackupValidate(backup); + /* Maybe we should be more paranoid and check for !BACKUP_STATUS_OK? */ + if (backup->status == BACKUP_STATUS_CORRUPT) { - char *backup_id, - *corrupted_backup_id; + corrupted_backup = backup; + corrupted_backup_index = i; + break; + } + /* We do not validate WAL files of intermediate backups + * It`s done to speed up restore + */ + } + /* There is no point in wal validation + * if there is corrupted backup between base_backup and dest_backup + */ + if (!corrupted_backup) + /* + * Validate corresponding WAL files. + * We pass base_full_backup timeline as last argument to this function, + * because it's needed to form the name of xlog file. + */ + validate_wal(dest_backup, arclog_path, rt->recovery_target_time, + rt->recovery_target_xid, base_full_backup->tli); - backup->status = BACKUP_STATUS_ORPHAN; - pgBackupWriteBackupControlFile(backup); + /* Set every incremental backup between corrupted backup and nearest FULL backup as orphans */ + if (corrupted_backup) + { + for (i = corrupted_backup_index - 1; i >= 0; i--) + { + pgBackup *backup = (pgBackup *) parray_get(backups, i); + /* Mark incremental OK backup as orphan */ + if (backup->backup_mode == BACKUP_MODE_FULL) + break; + if (backup->status != BACKUP_STATUS_OK) + continue; + else + { + char *backup_id, + *corrupted_backup_id; - backup_id = base36enc_dup(backup->start_time); - corrupted_backup_id = base36enc_dup(corrupted_backup->start_time); + backup->status = BACKUP_STATUS_ORPHAN; + pgBackupWriteBackupControlFile(backup); - elog(WARNING, "Backup %s is orphaned because his parent %s is corrupted", - backup_id, corrupted_backup_id); + backup_id = base36enc_dup(backup->start_time); + corrupted_backup_id = base36enc_dup(corrupted_backup->start_time); - free(backup_id); - free(corrupted_backup_id); + elog(WARNING, "Backup %s is orphaned because his parent %s is corrupted", + backup_id, corrupted_backup_id); + + free(backup_id); + free(corrupted_backup_id); + } } } } @@ -1001,7 +1004,8 @@ parseRecoveryTargetOptions(const char *target_time, TimeLineID target_tli, bool target_immediate, const char *target_name, - const char *target_action) + const char *target_action, + bool restore_no_validate) { time_t dummy_time; TransactionId dummy_xid; @@ -1026,6 +1030,7 @@ parseRecoveryTargetOptions(const char *target_time, rt->recovery_target_immediate = false; rt->recovery_target_name = NULL; rt->recovery_target_action = NULL; + rt->restore_no_validate = false; /* parse given options */ if (target_time) @@ -1072,6 +1077,12 @@ parseRecoveryTargetOptions(const char *target_time, rt->recovery_target_immediate = target_immediate; } + if (restore_no_validate) + { + recovery_target_specified++; + rt->restore_no_validate = restore_no_validate; + } + if (target_name) { recovery_target_specified++; diff --git a/tests/validate_test.py b/tests/validate_test.py index 06ea1ea3..2cebcc33 100644 --- a/tests/validate_test.py +++ b/tests/validate_test.py @@ -748,6 +748,93 @@ class ValidateTest(ProbackupTest, unittest.TestCase): # Clean after yourself self.del_test_dir(module_name, fname) + # @unittest.skip("skip") + def test_validate_instance_with_corrupted_full_and_try_restore(self): + """make archive node, take FULL, PAGE1, PAGE2, FULL2, PAGE3 backups, + corrupt file in FULL backup and run validate on instance, + expect FULL to gain status CORRUPT, PAGE1 and PAGE2 to gain status ORPHAN, + try to restore backup with --no-validation option""" + fname = self.id().split('.')[3] + node = self.make_simple_node(base_dir="{0}/{1}/node".format(module_name, fname), + initdb_params=['--data-checksums'], + pg_options={'wal_level': 'replica'} + ) + backup_dir = os.path.join(self.tmp_path, module_name, fname, 'backup') + self.init_pb(backup_dir) + self.add_instance(backup_dir, 'node', node) + self.set_archiving(backup_dir, 'node', node) + node.start() + + node.safe_psql( + "postgres", + "create table t_heap as select i as id, md5(i::text) as text, md5(repeat(i::text,10))::tsvector as tsvector from generate_series(0,10000) i") + file_path_t_heap = node.safe_psql( + "postgres", + "select pg_relation_filepath('t_heap')").rstrip() + # FULL1 + backup_id_1 = self.backup_node(backup_dir, 'node', node) + + node.safe_psql( + "postgres", + "insert into t_heap select i as id, md5(i::text) as text, md5(repeat(i::text,10))::tsvector as tsvector from generate_series(0,10000) i") + # PAGE1 + backup_id_2 = self.backup_node(backup_dir, 'node', node, backup_type='page') + + # PAGE2 + node.safe_psql( + "postgres", + "insert into t_heap select i as id, md5(i::text) as text, md5(repeat(i::text,10))::tsvector as tsvector from generate_series(20000,30000) i") + backup_id_3 = self.backup_node(backup_dir, 'node', node, backup_type='page') + + # FULL1 + backup_id_4 = self.backup_node(backup_dir, 'node', node) + + # PAGE3 + node.safe_psql( + "postgres", + "insert into t_heap select i as id, md5(i::text) as text, md5(repeat(i::text,10))::tsvector as tsvector from generate_series(30000,40000) i") + backup_id_5 = self.backup_node(backup_dir, 'node', node, backup_type='page') + + # Corrupt some file in FULL backup + file_full = os.path.join(backup_dir, 'backups/node', backup_id_1, 'database', file_path_t_heap) + with open(file_full, "rb+", 0) as f: + f.seek(84) + f.write(b"blah") + f.flush() + f.close + + # Validate Instance + try: + self.validate_pb(backup_dir, 'node', options=['--log-level-file=verbose']) + self.assertEqual(1, 0, "Expecting Error because of data files corruption.\n Output: {0} \n CMD: {1}".format( + repr(self.output), self.cmd)) + except ProbackupException as e: + self.assertTrue( + 'INFO: Validating backup {0}'.format(backup_id_1) in e.message + and "INFO: Validate backups of the instance 'node'" in e.message + and 'WARNING: Invalid CRC of backup file "{0}"'.format(file_full) in e.message + and 'WARNING: Backup {0} data files are corrupted'.format(backup_id_1) in e.message, + '\n Unexpected Error Message: {0}\n CMD: {1}'.format(repr(e.message), self.cmd)) + + self.assertEqual('CORRUPT', self.show_pb(backup_dir, 'node', backup_id_1)['status'], 'Backup STATUS should be "CORRUPT"') + self.assertEqual('ORPHAN', self.show_pb(backup_dir, 'node', backup_id_2)['status'], 'Backup STATUS should be "ORPHAN"') + self.assertEqual('ORPHAN', self.show_pb(backup_dir, 'node', backup_id_3)['status'], 'Backup STATUS should be "ORPHAN"') + self.assertEqual('OK', self.show_pb(backup_dir, 'node', backup_id_4)['status'], 'Backup STATUS should be "OK"') + self.assertEqual('OK', self.show_pb(backup_dir, 'node', backup_id_5)['status'], 'Backup STATUS should be "OK"') + + node.cleanup() + restore_out = self.restore_node( + backup_dir, 'node', node, + options=["--no-validate"]) + self.assertIn( + "INFO: Restore of backup {0} completed.".format(backup_id_5), + restore_out, + '\n Unexpected Error Message: {0}\n CMD: {1}'.format( + repr(self.output), self.cmd)) + + # Clean after yourself + self.del_test_dir(module_name, fname) + # @unittest.skip("skip") def test_validate_instance_with_corrupted_full(self): """make archive node, take FULL, PAGE1, PAGE2, FULL2, PAGE3 backups, From 02fd1878bb60fb87baa219f6a07f8c71d89041e8 Mon Sep 17 00:00:00 2001 From: Aleksandr Parfenov Date: Tue, 5 Jun 2018 16:06:22 +0300 Subject: [PATCH 2/5] Add help message for --no-validate --- src/help.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/help.c b/src/help.c index 2f84b225..70d8b4d5 100644 --- a/src/help.c +++ b/src/help.c @@ -265,7 +265,7 @@ help_restore(void) printf(_(" [--timeline=timeline] [-T OLDDIR=NEWDIR]\n")); printf(_(" [--immediate] [--recovery-target-name=target-name]\n")); printf(_(" [--recovery-target-action=pause|promote|shutdown]\n")); - printf(_(" [--restore-as-replica]\n\n")); + printf(_(" [--restore-as-replica] [--no-validate]\n\n")); printf(_(" -B, --backup-path=backup-path location of the backup storage area\n")); printf(_(" --instance=instance_name name of the instance\n")); @@ -282,6 +282,7 @@ help_restore(void) printf(_(" relocate the tablespace from directory OLDDIR to NEWDIR\n")); printf(_(" --immediate end recovery as soon as a consistent state is reached\n")); + printf(_(" --no-validate disable backup validation during recovery\n")); printf(_(" --recovery-target-name=target-name\n")); printf(_(" the named restore point to which recovery will proceed\n")); printf(_(" --recovery-target-action=pause|promote|shutdown\n")); From 29639f71d60d5c799e23b8029599730a596b223a Mon Sep 17 00:00:00 2001 From: Aleksandr Parfenov Date: Thu, 5 Jul 2018 16:47:56 +0300 Subject: [PATCH 3/5] Fix --no-validate counting as recovery target --- src/restore.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/restore.c b/src/restore.c index a5420a6c..8765a9e5 100644 --- a/src/restore.c +++ b/src/restore.c @@ -1079,7 +1079,6 @@ parseRecoveryTargetOptions(const char *target_time, if (restore_no_validate) { - recovery_target_specified++; rt->restore_no_validate = restore_no_validate; } From c81a61208c5ee1f8caf40d6a2b724d1242977167 Mon Sep 17 00:00:00 2001 From: Ilya Skvortsov Date: Mon, 9 Jul 2018 13:57:10 +0300 Subject: [PATCH 4/5] new no-validate test added --- tests/validate_test.py | 59 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 59 insertions(+) diff --git a/tests/validate_test.py b/tests/validate_test.py index 2cebcc33..1c9a3cc5 100644 --- a/tests/validate_test.py +++ b/tests/validate_test.py @@ -1659,3 +1659,62 @@ class ValidateTest(ProbackupTest, unittest.TestCase): # Clean after yourself self.del_test_dir(module_name, fname) + + def test_file_size_corruption_no_validate(self): + + fname = self.id().split('.')[3] + node = self.make_simple_node( + base_dir="{0}/{1}/node".format(module_name, fname), + # initdb_params=['--data-checksums'], + pg_options={'wal_level': 'replica'} + ) + + backup_dir = os.path.join(self.tmp_path, module_name, fname, 'backup') + + self.init_pb(backup_dir) + self.add_instance(backup_dir, 'node', node) + self.set_archiving(backup_dir, 'node', node) + + node.start() + + node.safe_psql( + "postgres", + "create table t_heap as select 1 as id, md5(i::text) as text, " + "md5(repeat(i::text,10))::tsvector as tsvector " + "from generate_series(0,1000) i") + node.safe_psql( + "postgres", + "CHECKPOINT;") + + heap_path = node.safe_psql( + "postgres", + "select pg_relation_filepath('t_heap')").rstrip() + heap_size = node.safe_psql( + "postgres", + "select pg_relation_size('t_heap')") + + backup_id = self.backup_node( + backup_dir, 'node', node, backup_type="full", + options=["-j", "4"], async=False, gdb=False) + + node.stop() + node.cleanup() + + # Let`s do file corruption + with open(os.path.join(backup_dir, "backups", 'node', backup_id, "database", heap_path), "rb+", 0) as f: + f.truncate(int(heap_size) - 4096) + f.flush() + f.close + + node.cleanup() + + try: + self.restore_node( + backup_dir, 'node', node, + options=["--no-validate"]) + except ProbackupException as e: + self.assertTrue("ERROR: Data files restoring failed" in e.message, repr(e.message)) + print "\nExpected error: \n" + e.message + + # Clean after yourself + self.del_test_dir(module_name, fname) From e947cac1428c8b8e0beb1210ab0b8366a7341604 Mon Sep 17 00:00:00 2001 From: Aleksandr Parfenov Date: Mon, 9 Jul 2018 20:35:59 +0700 Subject: [PATCH 5/5] Change INFO message in --no-validate mode --- src/restore.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/restore.c b/src/restore.c index 8765a9e5..228882aa 100644 --- a/src/restore.c +++ b/src/restore.c @@ -315,7 +315,12 @@ do_restore_or_validate(time_t target_backup_id, * produce corresponding error message */ if (dest_backup->status == BACKUP_STATUS_OK) - elog(INFO, "Backup %s is valid.", base36enc(dest_backup->start_time)); + { + if (rt->restore_no_validate) + elog(INFO, "Backup %s is used without validation.", base36enc(dest_backup->start_time)); + else + elog(INFO, "Backup %s is valid.", base36enc(dest_backup->start_time)); + } else if (dest_backup->status == BACKUP_STATUS_CORRUPT) elog(ERROR, "Backup %s is corrupt.", base36enc(dest_backup->start_time)); else if (dest_backup->status == BACKUP_STATUS_ORPHAN)