diff --git a/src/backup.c b/src/backup.c index 68fee494..140f9ee5 100644 --- a/src/backup.c +++ b/src/backup.c @@ -940,7 +940,7 @@ do_block_validation(void) /* arrays with meta info for multi threaded backup */ pthread_t *threads; backup_files_arg *threads_args; - bool backup_isok = true; + bool check_isok = true; pgBackupGetPath(¤t, database_path, lengthof(database_path), DATABASE_DIR); @@ -1017,14 +1017,14 @@ do_block_validation(void) { pthread_join(threads[i], NULL); if (threads_args[i].ret > 0) - backup_isok = false; + check_isok = false; } /* TODO write better info message */ - if (backup_isok) - elog(INFO, "Data files are checked"); + if (check_isok) + elog(INFO, "Data files are valid"); else - elog(ERROR, "Data files checking failed"); + elog(ERROR, "Data files are corrupted"); if (backup_files_list) { @@ -1042,7 +1042,7 @@ do_amcheck(void) /* arrays with meta info for multi threaded backup */ pthread_t *threads; backup_files_arg *threads_args; - bool backup_isok = true; + bool check_isok = true; PGresult *res_db; int n_databases = 0; bool first_db_with_amcheck = true; @@ -1118,13 +1118,13 @@ do_amcheck(void) { pthread_join(threads[j], NULL); if (threads_args[j].ret == 1) - backup_isok = false; + check_isok = false; } pgut_disconnect(db_conn); } /* TODO write better info message */ - if (backup_isok) + if (check_isok) elog(INFO, "Indexes are checked"); else elog(ERROR, "Indexs checking failed"); @@ -1188,6 +1188,17 @@ pgdata_basic_setup(void) is_checksum_enabled = pg_checksum_enable(); + + /* + * Ensure that backup directory was initialized for the same PostgreSQL + * instance we opened connection to. And that target backup database PGDATA + * belogns to the same instance. + */ + /* TODO fix it for remote backup */ + + if (!is_remote_backup) + check_system_identifiers(); + if (is_checksum_enabled) elog(LOG, "This PostgreSQL instance was initialized with data block checksums. " "Data block corruption will be detected"); @@ -1225,10 +1236,6 @@ do_backup(time_t start_time) current.compress_alg = instance_config.compress_alg; current.compress_level = instance_config.compress_level; - /* TODO fix it for remote backup*/ - if (!is_remote_backup) - current.checksum_version = get_data_checksum_version(true); - current.stream = stream_wal; is_ptrack_support = pg_ptrack_support(); @@ -1261,15 +1268,6 @@ do_backup(time_t start_time) instance_config.master_user); } - /* - * Ensure that backup directory was initialized for the same PostgreSQL - * instance we opened connection to. And that target backup database PGDATA - * belogns to the same instance. - */ - /* TODO fix it for remote backup */ - if (!is_remote_backup) - check_system_identifiers(); - /* Start backup. Update backup status. */ current.status = BACKUP_STATUS_RUNNING; current.start_time = start_time; @@ -1415,10 +1413,24 @@ check_system_identifiers(void) system_id_pgdata = get_system_identifier(instance_config.pgdata); system_id_conn = get_remote_system_identifier(backup_conn); + /* for checkdb check only system_id_pgdata and system_id_conn */ + if (current.backup_mode == BACKUP_MODE_INVALID) + { + if (system_id_conn != system_id_pgdata) + { + elog(ERROR, "Data directory initialized with system id " UINT64_FORMAT ", " + "but connected instance system id is " UINT64_FORMAT, + system_id_pgdata, system_id_conn); + } + return; + } + + if (system_id_conn != instance_config.system_identifier) elog(ERROR, "Backup data directory was initialized for system id " UINT64_FORMAT ", " "but connected instance system id is " UINT64_FORMAT, instance_config.system_identifier, system_id_conn); + if (system_id_pgdata != instance_config.system_identifier) elog(ERROR, "Backup data directory was initialized for system id " UINT64_FORMAT ", " "but target backup directory system id is " UINT64_FORMAT, @@ -2562,7 +2574,7 @@ check_files(void *arg) elog(VERBOSE, "Checking file: \"%s\" ", file->path); /* check for interrupt */ - if (interrupted) + if (interrupted || thread_interrupted) elog(ERROR, "interrupted during checkdb"); if (progress) @@ -2579,7 +2591,6 @@ check_files(void *arg) * If file is not found, this is not en error. * It could have been deleted by concurrent postgres transaction. */ - file->write_size = BYTES_INVALID; elog(LOG, "File \"%s\" is not found", file->path); continue; } @@ -2597,7 +2608,7 @@ check_files(void *arg) if (S_ISREG(buf.st_mode)) { - /* check only uncompressed datafiles */ + /* check only uncompressed by cfs datafiles */ if (file->is_datafile && !file->is_cfs) { char to_path[MAXPGPATH]; @@ -2639,7 +2650,7 @@ check_indexes(void *arg) continue; /* check for interrupt */ - if (interrupted) + if (interrupted || thread_interrupted) elog(ERROR, "interrupted during checkdb --amcheck"); elog(VERBOSE, "Checking index number %d of %d : \"%s\" ", i,n_indexes, ind->name); diff --git a/src/data.c b/src/data.c index 58338de5..275184cc 100644 --- a/src/data.c +++ b/src/data.c @@ -181,6 +181,7 @@ typedef struct BackupPageHeader /* Special value for compressed_size field */ #define PageIsTruncated -2 #define SkipCurrentPage -3 +#define PageIsCorrupted -4 /* used by checkdb */ /* Verify page's header */ static bool @@ -278,7 +279,7 @@ read_page_from_file(pgFile *file, BlockNumber blknum, if (pg_checksum_page(page, file->segno * RELSEG_SIZE + blknum) != ((PageHeader) page)->pd_checksum) { - elog(WARNING, "File: %s blknum %u have wrong checksum, try again", + elog(LOG, "File: %s blknum %u have wrong checksum, try again", file->path, blknum); return -1; } @@ -304,6 +305,7 @@ read_page_from_file(pgFile *file, BlockNumber blknum, * Returns 0 if page was successfully retrieved * SkipCurrentPage(-3) if we need to skip this page * PageIsTruncated(-2) if the page was truncated + * PageIsCorrupted(-4) if the page check mismatch */ static int32 prepare_page(backup_files_arg *arguments, @@ -311,7 +313,8 @@ prepare_page(backup_files_arg *arguments, BlockNumber blknum, BlockNumber nblocks, FILE *in, int *n_skipped, BackupMode backup_mode, - Page page) + Page page, + bool strict) { XLogRecPtr page_lsn = 0; int try_again = 100; @@ -354,7 +357,7 @@ prepare_page(backup_files_arg *arguments, */ //elog(WARNING, "Checksum_Version: %i", current.checksum_version ? 1 : 0); - if (result == -1 && is_ptrack_support) + if (result == -1 && is_ptrack_support && strict) { elog(WARNING, "File %s, block %u, try to fetch via SQL", file->path, blknum); @@ -365,8 +368,24 @@ prepare_page(backup_files_arg *arguments, * If page is not valid after 100 attempts to read it * throw an error. */ - if(!page_is_valid && !is_ptrack_support) - elog(ERROR, "Data file checksum mismatch. Canceling backup"); + + if (!page_is_valid) + { + elog(WARNING, "CORRUPTION in file %s, block %u", + file->path, blknum); + + if (!is_ptrack_support && strict) + elog(ERROR, "Data file corruption. Canceling backup"); + } + + /* Checkdb not going futher */ + if (!strict) + { + if (page_is_valid) + return 0; + else + return PageIsCorrupted; + } } if (backup_mode == BACKUP_MODE_DIFF_PTRACK || (!page_is_valid && is_ptrack_support)) @@ -507,67 +526,6 @@ compress_and_backup_page(pgFile *file, BlockNumber blknum, file->write_size += write_buffer_size; } -bool -check_data_file(backup_files_arg* arguments, - pgFile *file) -{ - FILE *in; - BlockNumber blknum = 0; - BlockNumber nblocks = 0; - int n_blocks_skipped = 0; - int page_state; - char curr_page[BLCKSZ]; - - /* reset size summary */ - file->read_size = 0; - file->write_size = 0; - - /* open backup mode file for read */ - in = fopen(file->path, PG_BINARY_R); - if (in == NULL) - { - FIN_FILE_CRC32(true, file->crc); - - /* - * If file is not found, this is not en error. - * It could have been deleted by concurrent postgres transaction. - */ - if (errno == ENOENT) - { - elog(LOG, "File \"%s\" is not found", file->path); - return false; - } - - elog(ERROR, "cannot open file \"%s\": %s", - file->path, strerror(errno)); - } - - if (file->size % BLCKSZ != 0) - { - fclose(in); - elog(ERROR, "File: %s, invalid file size %zu", file->path, file->size); - } - - /* - * Compute expected number of blocks in the file. - * NOTE This is a normal situation, if the file size has changed - * since the moment we computed it. - */ - nblocks = file->size/BLCKSZ; - - for (blknum = 0; blknum < nblocks; blknum++) - { - page_state = prepare_page(arguments, file, 0, //0 = InvalidXLogRecPtr - blknum, nblocks, in, &n_blocks_skipped, - BACKUP_MODE_FULL, curr_page); - if (page_state == PageIsTruncated) - break; - } - - return true; - fclose(in); -} - /* * Backup data file in the from_root directory to the to_root directory with * same relative path. If prev_backup_start_lsn is not NULL, only pages with @@ -672,7 +630,7 @@ backup_data_file(backup_files_arg* arguments, { page_state = prepare_page(arguments, file, prev_backup_start_lsn, blknum, nblocks, in, &n_blocks_skipped, - backup_mode, curr_page); + backup_mode, curr_page, true); compress_and_backup_page(file, blknum, in, out, &(file->crc), page_state, curr_page, calg, clevel); n_blocks_read++; @@ -695,7 +653,7 @@ backup_data_file(backup_files_arg* arguments, { page_state = prepare_page(arguments, file, prev_backup_start_lsn, blknum, nblocks, in, &n_blocks_skipped, - backup_mode, curr_page); + backup_mode, curr_page, true); compress_and_backup_page(file, blknum, in, out, &(file->crc), page_state, curr_page, calg, clevel); n_blocks_read++; @@ -1580,10 +1538,7 @@ validate_one_page(Page page, pgFile *file, /* Verify checksum */ if (checksum_version) { - /* - * If checksum is wrong, sleep a bit and then try again - * several times. If it didn't help, throw error - */ + /* Checksums are enabled, so check it. */ if (pg_checksum_page(page, file->segno * RELSEG_SIZE + blknum) == ((PageHeader) page)->pd_checksum) { @@ -1597,19 +1552,25 @@ validate_one_page(Page page, pgFile *file, } else { - /* Get lsn from page header. Ensure that page is from our time */ - lsn = PageXLogRecPtrGet(phdr->pd_lsn); + if (stop_lsn > 0) + { + /* Get lsn from page header. Ensure that page is from our time. + * This is dangerous move, because we cannot be sure that + * lsn from page header is not a garbage. + */ + lsn = PageXLogRecPtrGet(phdr->pd_lsn); - if (lsn > stop_lsn) - elog(WARNING, "File: %s, block %u, checksum is not enabled. " - "Page is from future: pageLSN %X/%X stopLSN %X/%X", - file->path, blknum, (uint32) (lsn >> 32), (uint32) lsn, - (uint32) (stop_lsn >> 32), (uint32) stop_lsn); - else - return PAGE_IS_FOUND_AND_VALID; + if (lsn > stop_lsn) + elog(WARNING, "File: %s, block %u, checksum is not enabled. " + "Page is from future: pageLSN %X/%X stopLSN %X/%X", + file->path, blknum, (uint32) (lsn >> 32), (uint32) lsn, + (uint32) (stop_lsn >> 32), (uint32) stop_lsn); + else + return PAGE_IS_FOUND_AND_VALID; + } } - if (checksum_is_ok) + if (checksum_is_ok && stop_lsn > 0) { /* Get lsn from page header. Ensure that page is from our time */ lsn = PageXLogRecPtrGet(phdr->pd_lsn); @@ -1627,6 +1588,87 @@ validate_one_page(Page page, pgFile *file, return PAGE_IS_FOUND_AND_NOT_VALID; } +bool +check_data_file(backup_files_arg* arguments, + pgFile *file) +{ + FILE *in; + BlockNumber blknum = 0; + BlockNumber nblocks = 0; + int n_blocks_skipped = 0; + int page_state; + char curr_page[BLCKSZ]; + + bool is_valid = true; + + /* reset size summary */ + file->read_size = 0; + file->write_size = 0; + + /* open backup mode file for read */ + in = fopen(file->path, PG_BINARY_R); + if (in == NULL) + { + /* + * If file is not found, this is not en error. + * It could have been deleted by concurrent postgres transaction. + */ + if (errno == ENOENT) + { + elog(LOG, "File \"%s\" is not found", file->path); + return true; + } + + elog(ERROR, "cannot open file \"%s\": %s", + file->path, strerror(errno)); + } + + if (file->size % BLCKSZ != 0) + { + fclose(in); + elog(ERROR, "File: %s, invalid file size %zu", file->path, file->size); + } + + /* + * Compute expected number of blocks in the file. + * NOTE This is a normal situation, if the file size has changed + * since the moment we computed it. + */ + nblocks = file->size/BLCKSZ; + + for (blknum = 0; blknum < nblocks; blknum++) + { + page_state = prepare_page(arguments, file, InvalidXLogRecPtr, //0 = InvalidXLogRecPtr + blknum, nblocks, in, &n_blocks_skipped, + BACKUP_MODE_FULL, curr_page, false); + + if (page_state == PageIsTruncated) + break; + + if (page_state == PageIsCorrupted) + { + /* Page is corrupted */ + //elog(WARNING, "File %s, block %u is CORRUPTED.", + // file->path, blknum); + is_valid = false; + continue; + } + + /* Page is found and this point, but it may not be 'sane' */ + if (validate_one_page(curr_page, file, blknum, + InvalidXLogRecPtr, + 0) == PAGE_IS_FOUND_AND_NOT_VALID) + { + /* Page is corrupted */ + is_valid = false; + } + + } + + fclose(in); + return is_valid; +} + /* Valiate pages of datafile in backup one by one */ bool check_file_pages(pgFile *file, XLogRecPtr stop_lsn, uint32 checksum_version, diff --git a/src/pg_probackup.c b/src/pg_probackup.c index 4c15f99e..c24997f2 100644 --- a/src/pg_probackup.c +++ b/src/pg_probackup.c @@ -327,14 +327,14 @@ main(int argc, char *argv[]) help_command(command_name); /* backup_path is required for all pg_probackup commands except help and checkdb */ - if (backup_path == NULL && backup_subcmd != CHECKDB_CMD) + if (backup_path == NULL) { /* * If command line argument is not set, try to read BACKUP_PATH * from environment variable */ backup_path = getenv("BACKUP_PATH"); - if (backup_path == NULL) + if (backup_path == NULL && backup_subcmd != CHECKDB_CMD) elog(ERROR, "required parameter not specified: BACKUP_PATH (-B, --backup-path)"); } @@ -353,10 +353,10 @@ main(int argc, char *argv[]) } /* Option --instance is required for all commands except init and show */ - if (backup_subcmd != INIT_CMD && backup_subcmd != SHOW_CMD && - backup_subcmd != VALIDATE_CMD && backup_subcmd != CHECKDB_CMD) + if (instance_name == NULL) { - if (instance_name == NULL) + if (backup_subcmd != INIT_CMD && backup_subcmd != SHOW_CMD && + backup_subcmd != VALIDATE_CMD && backup_subcmd != CHECKDB_CMD) elog(ERROR, "required parameter not specified: --instance"); } @@ -399,7 +399,11 @@ main(int argc, char *argv[]) { join_path_components(path, backup_instance_path, BACKUP_CATALOG_CONF_FILE); - config_read_opt(path, instance_options, ERROR, true, false); + + if (backup_subcmd == CHECKDB_CMD) + config_read_opt(path, instance_options, ERROR, true, true); + else + config_read_opt(path, instance_options, ERROR, true, false); } }