You've already forked pgbackrest
mirror of
https://github.com/pgbackrest/pgbackrest.git
synced 2025-07-13 01:00:23 +02:00
Fix missing files corrupting the manifest.
If a file was removed by PostgreSQL during the backup (or was missing from the standby) then the next file might not be copied and updated in the manifest. If this happened then the backup would error when restored. The issue was that removing files from the manifest invalidated the pointers stored in the processing queues. When a file was removed, all the pointers shifted to the next file in the list, causing a file to be unprocessed. Since the unprocessed file was still in the manifest it would be saved with no checksum, causing a failure on restore. When process-max was > 1 then the bug would often not express since the file had already been pulled from the queue and updates to the manifest are done by name rather than by pointer.
This commit is contained in:
@ -14,6 +14,20 @@
|
|||||||
<release-list>
|
<release-list>
|
||||||
<release date="XXXX-XX-XX" version="2.23dev" title="UNDER DEVELOPMENT">
|
<release date="XXXX-XX-XX" version="2.23dev" title="UNDER DEVELOPMENT">
|
||||||
<release-core-list>
|
<release-core-list>
|
||||||
|
<release-bug-list>
|
||||||
|
<release-item>
|
||||||
|
<release-item-contributor-list>
|
||||||
|
<release-item-ideator id="vitaliy.kukharik"/>
|
||||||
|
<release-item-contributor id="david.steele"/>
|
||||||
|
<release-item-reviewer id="cynthia.shang"/>
|
||||||
|
</release-item-contributor-list>
|
||||||
|
|
||||||
|
<p>Fix missing files corrupting the manifest.</p>
|
||||||
|
|
||||||
|
<p>If a file was removed by <postgres/> during the backup (or was missing from the standby) then the next file might not be copied and updated in the manifest. If this happened then the backup would error when restored.</p>
|
||||||
|
</release-item>
|
||||||
|
</release-bug-list>
|
||||||
|
|
||||||
<release-improvement-list>
|
<release-improvement-list>
|
||||||
<release-item>
|
<release-item>
|
||||||
<release-item-contributor-list>
|
<release-item-contributor-list>
|
||||||
|
@ -1042,13 +1042,14 @@ Log the results of a job and throw errors
|
|||||||
***********************************************************************************************************************************/
|
***********************************************************************************************************************************/
|
||||||
static uint64_t
|
static uint64_t
|
||||||
backupJobResult(
|
backupJobResult(
|
||||||
Manifest *manifest, const String *host, const String *const fileName, ProtocolParallelJob *const job, const uint64_t sizeTotal,
|
Manifest *manifest, const String *host, const String *const fileName, StringList *fileRemove, ProtocolParallelJob *const job,
|
||||||
uint64_t sizeCopied, unsigned int pageSize)
|
const uint64_t sizeTotal, uint64_t sizeCopied, unsigned int pageSize)
|
||||||
{
|
{
|
||||||
FUNCTION_LOG_BEGIN(logLevelDebug);
|
FUNCTION_LOG_BEGIN(logLevelDebug);
|
||||||
FUNCTION_LOG_PARAM(MANIFEST, manifest);
|
FUNCTION_LOG_PARAM(MANIFEST, manifest);
|
||||||
FUNCTION_LOG_PARAM(STRING, host);
|
FUNCTION_LOG_PARAM(STRING, host);
|
||||||
FUNCTION_LOG_PARAM(STRING, fileName);
|
FUNCTION_LOG_PARAM(STRING, fileName);
|
||||||
|
FUNCTION_LOG_PARAM(STRING_LIST, fileRemove);
|
||||||
FUNCTION_LOG_PARAM(PROTOCOL_PARALLEL_JOB, job);
|
FUNCTION_LOG_PARAM(PROTOCOL_PARALLEL_JOB, job);
|
||||||
FUNCTION_LOG_PARAM(UINT64, sizeTotal);
|
FUNCTION_LOG_PARAM(UINT64, sizeTotal);
|
||||||
FUNCTION_LOG_PARAM(UINT64, sizeCopied);
|
FUNCTION_LOG_PARAM(UINT64, sizeCopied);
|
||||||
@ -1057,6 +1058,7 @@ backupJobResult(
|
|||||||
|
|
||||||
ASSERT(manifest != NULL);
|
ASSERT(manifest != NULL);
|
||||||
ASSERT(fileName != NULL);
|
ASSERT(fileName != NULL);
|
||||||
|
ASSERT(fileRemove != NULL);
|
||||||
ASSERT(job != NULL);
|
ASSERT(job != NULL);
|
||||||
|
|
||||||
// The job was successful
|
// The job was successful
|
||||||
@ -1098,11 +1100,13 @@ backupJobResult(
|
|||||||
LOG_DETAIL_PID_FMT(
|
LOG_DETAIL_PID_FMT(
|
||||||
processId, "checksum resumed file %s (%s)%s", strPtr(fileLog), strPtr(logProgress), strPtr(logChecksum));
|
processId, "checksum resumed file %s (%s)%s", strPtr(fileLog), strPtr(logProgress), strPtr(logChecksum));
|
||||||
}
|
}
|
||||||
// Else if the file was removed during backup then remove from manifest
|
// Else if the file was removed during backup add it to the list of files to be removed from the manifest when the
|
||||||
|
// backup is complete. It can't be removed right now because that will invalidate the pointers that are being used for
|
||||||
|
// processing.
|
||||||
else if (copyResult == backupCopyResultSkip)
|
else if (copyResult == backupCopyResultSkip)
|
||||||
{
|
{
|
||||||
LOG_DETAIL_PID_FMT(processId, "skip file removed by database %s", strPtr(fileLog));
|
LOG_DETAIL_PID_FMT(processId, "skip file removed by database %s", strPtr(fileLog));
|
||||||
manifestFileRemove(manifest, file->name);
|
strLstAdd(fileRemove, file->name);
|
||||||
}
|
}
|
||||||
// Else file was copied so update manifest
|
// Else file was copied so update manifest
|
||||||
else
|
else
|
||||||
@ -1587,6 +1591,9 @@ backupProcess(BackupData *backupData, Manifest *manifest, const String *lsnStart
|
|||||||
for (unsigned int processIdx = 2; processIdx <= processMax; processIdx++)
|
for (unsigned int processIdx = 2; processIdx <= processMax; processIdx++)
|
||||||
protocolParallelClientAdd(parallelExec, protocolLocalGet(protocolStorageTypePg, pgId, processIdx));
|
protocolParallelClientAdd(parallelExec, protocolLocalGet(protocolStorageTypePg, pgId, processIdx));
|
||||||
|
|
||||||
|
// Maintain a list of files that need to be removed from the manifest when the backup is complete
|
||||||
|
StringList *fileRemove = strLstNew();
|
||||||
|
|
||||||
// Determine how often the manifest will be saved (every one percent or threshold size, whichever is greater)
|
// Determine how often the manifest will be saved (every one percent or threshold size, whichever is greater)
|
||||||
uint64_t manifestSaveLast = 0;
|
uint64_t manifestSaveLast = 0;
|
||||||
uint64_t manifestSaveSize = sizeTotal / 100;
|
uint64_t manifestSaveSize = sizeTotal / 100;
|
||||||
@ -1613,7 +1620,7 @@ backupProcess(BackupData *backupData, Manifest *manifest, const String *lsnStart
|
|||||||
storagePathP(
|
storagePathP(
|
||||||
protocolParallelJobProcessId(job) > 1 ? storagePgId(pgId) : backupData->storagePrimary,
|
protocolParallelJobProcessId(job) > 1 ? storagePgId(pgId) : backupData->storagePrimary,
|
||||||
manifestPathPg(manifestFileFind(manifest, varStr(protocolParallelJobKey(job)))->name)),
|
manifestPathPg(manifestFileFind(manifest, varStr(protocolParallelJobKey(job)))->name)),
|
||||||
job, sizeTotal, sizeCopied, backupData->pageSize);
|
fileRemove, job, sizeTotal, sizeCopied, backupData->pageSize);
|
||||||
}
|
}
|
||||||
|
|
||||||
// A keep-alive is required here for the remote holding open the backup connection
|
// A keep-alive is required here for the remote holding open the backup connection
|
||||||
@ -1633,6 +1640,17 @@ backupProcess(BackupData *backupData, Manifest *manifest, const String *lsnStart
|
|||||||
}
|
}
|
||||||
MEM_CONTEXT_TEMP_END();
|
MEM_CONTEXT_TEMP_END();
|
||||||
|
|
||||||
|
#ifdef DEBUG
|
||||||
|
// Ensure that all processing queues are empty
|
||||||
|
for (unsigned int queueIdx = 0; queueIdx < lstSize(jobData.queueList); queueIdx++)
|
||||||
|
ASSERT(lstSize(*(List **)lstGet(jobData.queueList, queueIdx)) == 0);
|
||||||
|
#endif
|
||||||
|
|
||||||
|
// Remove files from the manifest that were removed during the backup. This must happen after processing to avoid
|
||||||
|
// invalidating pointers by deleting items from the list.
|
||||||
|
for (unsigned int fileRemoveIdx = 0; fileRemoveIdx < strLstSize(fileRemove); fileRemoveIdx++)
|
||||||
|
manifestFileRemove(manifest, strLstGet(fileRemove, fileRemoveIdx));
|
||||||
|
|
||||||
// Log references or create hardlinks for all files
|
// Log references or create hardlinks for all files
|
||||||
const char *const compressExt = jobData.compress ? "." GZIP_EXT : "";
|
const char *const compressExt = jobData.compress ? "." GZIP_EXT : "";
|
||||||
|
|
||||||
|
@ -1297,21 +1297,21 @@ testRun(void)
|
|||||||
harnessLogLevelSet(logLevelDetail);
|
harnessLogLevelSet(logLevelDetail);
|
||||||
|
|
||||||
// -------------------------------------------------------------------------------------------------------------------------
|
// -------------------------------------------------------------------------------------------------------------------------
|
||||||
TEST_TITLE("error when postmaster.pid exists");
|
TEST_TITLE("report job error");
|
||||||
|
|
||||||
ProtocolParallelJob *job = protocolParallelJobNew(VARSTRDEF("key"), protocolCommandNew(STRDEF("command")));
|
ProtocolParallelJob *job = protocolParallelJobNew(VARSTRDEF("key"), protocolCommandNew(STRDEF("command")));
|
||||||
protocolParallelJobErrorSet(job, errorTypeCode(&AssertError), STRDEF("error message"));
|
protocolParallelJobErrorSet(job, errorTypeCode(&AssertError), STRDEF("error message"));
|
||||||
|
|
||||||
TEST_ERROR(backupJobResult((Manifest *)1, NULL, STRDEF("log"), job, 0, 0, 0), AssertError, "error message");
|
TEST_ERROR(backupJobResult((Manifest *)1, NULL, STRDEF("log"), strLstNew(), job, 0, 0, 0), AssertError, "error message");
|
||||||
|
|
||||||
// -------------------------------------------------------------------------------------------------------------------------
|
// -------------------------------------------------------------------------------------------------------------------------
|
||||||
TEST_TITLE("remove skipped file");
|
TEST_TITLE("report host/100% progress on noop result");
|
||||||
|
|
||||||
// Create job that skips file
|
// Create job that skips file
|
||||||
job = protocolParallelJobNew(VARSTRDEF("pg_data/test"), protocolCommandNew(STRDEF("command")));
|
job = protocolParallelJobNew(VARSTRDEF("pg_data/test"), protocolCommandNew(STRDEF("command")));
|
||||||
|
|
||||||
VariantList *result = varLstNew();
|
VariantList *result = varLstNew();
|
||||||
varLstAdd(result, varNewUInt64(backupCopyResultSkip));
|
varLstAdd(result, varNewUInt64(backupCopyResultNoOp));
|
||||||
varLstAdd(result, varNewUInt64(0));
|
varLstAdd(result, varNewUInt64(0));
|
||||||
varLstAdd(result, varNewUInt64(0));
|
varLstAdd(result, varNewUInt64(0));
|
||||||
varLstAdd(result, NULL);
|
varLstAdd(result, NULL);
|
||||||
@ -1323,9 +1323,10 @@ testRun(void)
|
|||||||
Manifest *manifest = manifestNewInternal();
|
Manifest *manifest = manifestNewInternal();
|
||||||
manifestFileAdd(manifest, &(ManifestFile){.name = STRDEF("pg_data/test")});
|
manifestFileAdd(manifest, &(ManifestFile){.name = STRDEF("pg_data/test")});
|
||||||
|
|
||||||
TEST_RESULT_UINT(backupJobResult(manifest, STRDEF("host"), STRDEF("log-test"), job, 0, 0, 0), 0, "log skip result");
|
TEST_RESULT_UINT(
|
||||||
|
backupJobResult(manifest, STRDEF("host"), STRDEF("log-test"), strLstNew(), job, 0, 0, 0), 0, "log noop result");
|
||||||
|
|
||||||
TEST_RESULT_LOG("P00 DETAIL: skip file removed by database host:log-test");
|
TEST_RESULT_LOG("P00 DETAIL: match file from prior backup host:log-test (0B, 100%)");
|
||||||
}
|
}
|
||||||
|
|
||||||
// Offline tests should only be used to test offline functionality and errors easily tested in offline mode
|
// Offline tests should only be used to test offline functionality and errors easily tested in offline mode
|
||||||
@ -1900,7 +1901,7 @@ testRun(void)
|
|||||||
}
|
}
|
||||||
|
|
||||||
// -------------------------------------------------------------------------------------------------------------------------
|
// -------------------------------------------------------------------------------------------------------------------------
|
||||||
TEST_TITLE("online 9.6 back-standby full backup");
|
TEST_TITLE("online 9.6 backup-standby full backup");
|
||||||
|
|
||||||
backupTimeStart = BACKUP_EPOCH + 1200000;
|
backupTimeStart = BACKUP_EPOCH + 1200000;
|
||||||
|
|
||||||
@ -1945,6 +1946,15 @@ testRun(void)
|
|||||||
// that they were copied from the right place.
|
// that they were copied from the right place.
|
||||||
storagePutP(storageNewWriteP(storagePgIdWrite(1), STRDEF(PG_PATH_BASE "/1/1"), .timeModified = backupTimeStart), NULL);
|
storagePutP(storageNewWriteP(storagePgIdWrite(1), STRDEF(PG_PATH_BASE "/1/1"), .timeModified = backupTimeStart), NULL);
|
||||||
storagePutP(storageNewWriteP(storagePgIdWrite(2), STRDEF(PG_PATH_BASE "/1/1")), BUFSTRDEF("DATA"));
|
storagePutP(storageNewWriteP(storagePgIdWrite(2), STRDEF(PG_PATH_BASE "/1/1")), BUFSTRDEF("DATA"));
|
||||||
|
storagePutP(
|
||||||
|
storageNewWriteP(storagePgIdWrite(1), STRDEF(PG_PATH_BASE "/1/2"), .timeModified = backupTimeStart),
|
||||||
|
BUFSTRDEF("D"));
|
||||||
|
storagePutP(storageNewWriteP(storagePgIdWrite(2), STRDEF(PG_PATH_BASE "/1/2")), BUFSTRDEF("DATA"));
|
||||||
|
|
||||||
|
// Create a file on the primary that does not exist on the standby to test that the file is removed from the manifest
|
||||||
|
storagePutP(
|
||||||
|
storageNewWriteP(storagePgIdWrite(1), STRDEF(PG_PATH_BASE "/1/0"), .timeModified = backupTimeStart),
|
||||||
|
BUFSTRDEF("DATA"));
|
||||||
|
|
||||||
// Set log level to warn because the following test uses multiple processes so the log order will not be deterministic
|
// Set log level to warn because the following test uses multiple processes so the log order will not be deterministic
|
||||||
harnessLogLevelSet(logLevelWarn);
|
harnessLogLevelSet(logLevelWarn);
|
||||||
@ -1979,6 +1989,7 @@ testRun(void)
|
|||||||
"pg_data/base {path}\n"
|
"pg_data/base {path}\n"
|
||||||
"pg_data/base/1 {path}\n"
|
"pg_data/base/1 {path}\n"
|
||||||
"pg_data/base/1/1 {file, s=4}\n"
|
"pg_data/base/1/1 {file, s=4}\n"
|
||||||
|
"pg_data/base/1/2 {file, s=4}\n"
|
||||||
"pg_data/global {path}\n"
|
"pg_data/global {path}\n"
|
||||||
"pg_data/global/pg_control {file, s=8192}\n"
|
"pg_data/global/pg_control {file, s=8192}\n"
|
||||||
"pg_data/pg_xlog {path}\n"
|
"pg_data/pg_xlog {path}\n"
|
||||||
|
Reference in New Issue
Block a user