1
0
mirror of https://github.com/pgbackrest/pgbackrest.git synced 2024-12-14 10:13:05 +02:00

More work on error-handling.

This commit is contained in:
David Steele 2014-05-13 11:23:15 -04:00
parent a2b28623d4
commit d234aeea64
4 changed files with 57 additions and 44 deletions

View File

@ -4,18 +4,10 @@ Simple Postgres Backup and Restore
## planned for next release
* Default restore.conf is written to each backup.
* Able to set timeout on ssh connection in config file.
* Fix bug where .backup files written into old directories can cause the archive process to error.
* Add configurable sleep to archiver process to reduce ssh connections.
* Capture STDERR in file functions - start with file_list_get() - IN PROGRESS.
## feature backlog
* Capture STDERR in file functions - start with file_list_get().
* Move backups to be removed to temp before deleting.
* Async archive-get.
@ -26,8 +18,31 @@ Simple Postgres Backup and Restore
* Threading for archive-get and archive-put.
* Add configurable sleep to archiver process to reduce ssh connections.
* Fix bug where .backup files written into old directories can cause the archive process to error.
* Default restore.conf is written to each backup.
* Able to set timeout on ssh connection in config file.
## required perl modules
Config::IniFiles
Moose
IPC::System::Simple
Net::OpenSSH
JSON
IPC::Open3
## release notes
### v0.19: Improved error reporting/handling
* Working on improving error handling in the file object. This is not complete, but works well enough to find a few errors that have been causing us problems (notably, `find` occasionally fails while building the archive async manifest when the system is under load).
* Found and squashed a nasty bug where file_copy was defaulted to ignore errors. There was also an issue in file_exists that was causing the test to fail when the file actually did exist. Together they could have resulted in a corrupt backup with no errors, though it is very unlikely.
### v0.18: Return soft error from archive-get when file is missing
* The archive-get function returns a 1 when the archive file is missing to differentiate from hard errors (ssh connection failure, file copy error, etc.). This lets Postgres know that the archive stream has terminated normally. However, this does not take into account possible holes in the archive stream.

View File

@ -542,12 +542,6 @@ if ($strOperation eq OP_BACKUP)
config_load(CONFIG_SECTION_BACKUP, CONFIG_KEY_START_FAST, true, "n") eq "y" ? true : false);
$strOperation = OP_EXPIRE;
# my %hash = $oFile->manifest_get(PATH_DB_ABSOLUTE, "/Users/dsteele/pg_backrest");
# print "hash " . %hash . "\n";
# lock_file_remove();
# exit 0;
}
####################################################################################################################################

View File

@ -1191,17 +1191,22 @@ sub backup_file_thread
{
&log(DEBUG, "thread ${iThreadIdx} unable to copy file: " . $oFileCopyMap{$strFile}{db_file});
# If the copy fails then see if the file still exists on the database
if (!$oFileThread->file_exists(PATH_DB_ABSOLUTE, $oFileCopyMap{$strFile}{db_file}))
# If the copy fails then check if the file exists. The database frequently removes files so it is normal for
# files to be missing after the manifest is built. However, if the file exists then it means there was some other
# sort of fatal copy error and an abort is required to prevent a corrupted backup
if ($oFileThread->file_exists(PATH_DB_ABSOLUTE, $oFileCopyMap{$strFile}{db_file}))
{
# If it is missing then the database must have removed it (or is now corrupt)
&log(INFO, "thread ${iThreadIdx} skipped file removed by database: " . $oFileCopyMap{$strFile}{db_file});
# Remove the destination file and the temp file just in case they had already been written
$oFileThread->file_remove(PATH_BACKUP_TMP, $oFileCopyMap{$strFile}{backup_file}, true);
$oFileThread->file_remove(PATH_BACKUP_TMP, $oFileCopyMap{$strFile}{backup_file});
# !!! Improve this error when able to retrieve error text from the File object
confess &log(ERROR, "unable to copy file $oFileCopyMap{$strFile}{db_file}");
}
# If file is missing assume the database removed it (else corruption and nothing we can do!)
&log(INFO, "thread ${iThreadIdx} skipped file removed by database: " . $oFileCopyMap{$strFile}{db_file});
# Remove the destination file and the temp file just in case they had already been written
$oFileThread->file_remove(PATH_BACKUP_TMP, $oFileCopyMap{$strFile}{backup_file}, true);
$oFileThread->file_remove(PATH_BACKUP_TMP, $oFileCopyMap{$strFile}{backup_file});
# Write a message into the master queue to have the file removed from the manifest
$oMasterQueue[$iThreadIdx]->enqueue("remove|$oFileCopyMap{$strFile}{file_section}|$oFileCopyMap{$strFile}{file}");
@ -1209,7 +1214,7 @@ sub backup_file_thread
next;
}
# Generate checksum for file if requested
# Generate checksum for file if configured
if ($bChecksum && $lSize != 0)
{
# Generate the checksum

View File

@ -103,7 +103,6 @@ sub BUILD
&log(TRACE, "connecting to backup ssh host " . $self->{strBackupHost});
$self->{oBackupSSH} = Net::OpenSSH->new($self->{strBackupHost}, timeout => 300, user => $self->{strBackupUser},
# master_stderr_file => $self->path_get(PATH_LOCK_ERR, "file"),
default_stderr_file => $self->path_get(PATH_LOCK_ERR, "file"),
master_opts => [-o => $strOptionSSHCompression, -o => $strOptionSSHRequestTTY]);
$self->{oBackupSSH}->error and confess &log(ERROR, "unable to connect to $self->{strBackupHost}: " . $self->{oBackupSSH}->error);
@ -115,7 +114,6 @@ sub BUILD
&log(TRACE, "connecting to database ssh host $self->{strDbHost}");
$self->{oDbSSH} = Net::OpenSSH->new($self->{strDbHost}, timeout => 300, user => $self->{strDbUser},
# master_stderr_file => $self->path_get(PATH_LOCK_ERR, "file"),
default_stderr_file => $self->path_get(PATH_LOCK_ERR, "file"),
master_opts => [-o => $strOptionSSHCompression, -o => $strOptionSSHRequestTTY]);
$self->{oDbSSH}->error and confess &log(ERROR, "unable to connect to $self->{strDbHost}: " . $self->{oDbSSH}->error);
@ -123,17 +121,6 @@ sub BUILD
}
}
####################################################################################################################################
# LOCK_PATH_SET
####################################################################################################################################
#sub lock_path_set
#{
# my $self = shift;
# my $strLockPathParam = shift;
#
# $self->{strLockPath} = $strLockPathParam;
#}
####################################################################################################################################
# CLONE
####################################################################################################################################
@ -257,7 +244,7 @@ sub path_get
confess &log(ASSERT, "\$strStanza not yet defined");
}
# Get the backup tmp path
# Get the lock error path
if ($strType eq PATH_LOCK_ERR)
{
my $strTempPath = "$self->{strLockPath}";
@ -266,7 +253,7 @@ sub path_get
(defined($self->{iThreadIdx}) ? ".$self->{iThreadIdx}" : "") . ".err" : "");
}
# Get the backup tmp error path
# Get the backup tmp path
if ($strType eq PATH_BACKUP_TMP)
{
my $strTempPath = "$self->{strBackupPath}/temp/$self->{strStanza}.tmp";
@ -574,7 +561,7 @@ sub file_copy
# if bPathCreate is not defined, default to true
$bPathCreate = defined($bPathCreate) ? $bPathCreate : true;
$bConfessCopyError = defined($bConfessCopyError) ? $bConfessCopyError : false;
$bConfessCopyError = defined($bConfessCopyError) ? $bConfessCopyError : true;
&log(TRACE, "file_copy: ${strSourcePathType}: " . (defined($strSourceFile) ? ":${strSourceFile}" : "") .
" to ${strDestinationPathType}" . (defined($strDestinationFile) ? ":${strDestinationFile}" : ""));
@ -886,7 +873,7 @@ sub file_exists
my $strPathExists = $self->path_get($strPathType, $strPath);
# Builds the exists command
my $strCommand = "ls ${strPathExists} 2> /dev/null";
my $strCommand = "ls ${strPathExists}";
# Run the file exists command
my $strExists = "";
@ -897,15 +884,22 @@ sub file_exists
&log(TRACE, "file_exists: remote ${strPathType}:${strPathExists}");
my $oSSH = $self->remote_get($strPathType);
$strExists = $oSSH->capture($strCommand);
$strExists = trim($oSSH->capture($strCommand));
if ($oSSH->error)
{
confess &log(ERROR, "unable to execute file exists (${strCommand}): " . $self->error_get());
}
}
# Run locally
else
{
&log(TRACE, "file_exists: local ${strPathType}:${strPathExists}");
$strExists = capture($strCommand);
$strExists = trim(capture($strCommand));
}
&log(TRACE, "file_exists: search = ${strPathExists}, result = ${strExists}");
# If the return from ls eq strPathExists then true
return ($strExists eq $strPathExists);
}
@ -939,11 +933,16 @@ sub file_remove
my $oSSH = $self->remote_get($strPathType);
$oSSH->system($strCommand) or $bErrorIfNotExists ? confess &log(ERROR, "unable to remove remote ${strPathType}:${strPathRemove}") : true;
if ($oSSH->error)
{
confess &log(ERROR, "unable to execute file_remove (${strCommand}): " . $self->error_get());
}
}
# Run locally
else
{
&log(TRACE, "file_exists: local ${strPathType}:${strPathRemove}");
&log(TRACE, "file_remove: local ${strPathType}:${strPathRemove}");
system($strCommand) == 0 or $bErrorIfNotExists ? confess &log(ERROR, "unable to remove local ${strPathType}:${strPathRemove}") : true;
}
}