1
0
mirror of https://github.com/pgbackrest/pgbackrest.git synced 2024-12-14 10:13:05 +02:00

v0.19: Improved error reporting/handling

* Working on improving error handling in the file object.  This is not complete, but works well enough to find a few errors that have been causing us problems (notably, find is occasionally failing building the archive async manifest when system is under load).

* Found and squashed a nasty bug where file_copy was defaulted to ignore errors.  There was also an issue in file_exists that was causing the test to fail when the file actually did exist.  Together they could have resulted in a corrupt backup with no errors, though it is very unlikely.
This commit is contained in:
David Steele 2014-05-13 11:47:14 -04:00
parent 9acbba173e
commit 1fa8dbb778
6 changed files with 170 additions and 67 deletions

2
.gitignore vendored
View File

@ -1,4 +1,4 @@
**/*~
*~
*.swp
test/test

View File

@ -4,16 +4,45 @@ Simple Postgres Backup and Restore
## planned for next release
* Capture SDTERR in file functions - start with file_list_get() - IN PROGRESS.
## feature backlog
* Move backups to be removed to temp before deleting.
* Async archive-get.
* Database restore.
* --version param (with VERSION file written to directory).
* Threading for archive-get and archive-put.
* Add configurable sleep to archiver process to reduce ssh connections.
* Fix bug where .backup files written into old directories can cause the archive process to error.
* Default restore.conf is written to each backup.
* Able to set timeout on ssh connection in config file.
* Fix bug where .backup files written into old directories can cause the archive process to error.
## required perl modules
* Add configurable sleep to archiver process to reduce ssh connections.
Config::IniFiles
Moose
IPC::System::Simple
Net::OpenSSH
JSON
IPC::Open3
## release notes
### v0.19: Improved error reporting/handling
* Working on improving error handling in the file object. This is not complete, but works well enough to find a few errors that have been causing us problems (notably, find is occasionally failing building the archive async manifest when system is under load).
* Found and squashed a nasty bug where file_copy was defaulted to ignore errors. There was also an issue in file_exists that was causing the test to fail when the file actually did exist. Together they could have resulted in a corrupt backup with no errors, though it is very unlikely.
### v0.18: Return soft error from archive-get when file is missing
* The archive-get function returns a 1 when the archive file is missing to differentiate from hard errors (ssh connection failure, file copy error, etc.) This lets Postgres know that that the archive stream has terminated normally. However, this does not take into account possible holes in the archive stream.

View File

@ -352,9 +352,9 @@ if ($strOperation eq OP_ARCHIVE_PUSH || $strOperation eq OP_ARCHIVE_PULL)
}
# Create a lock file to make sure archive-pull does not run more than once
my $strLockFile = "${strArchivePath}/lock/${strStanza}-archive.lock";
my $strLockPath = "${strArchivePath}/lock/${strStanza}-archive.lock";
if (!lock_file_create($strLockFile))
if (!lock_file_create($strLockPath))
{
&log(DEBUG, "archive-pull process is already running - exiting");
exit 0
@ -381,7 +381,8 @@ if ($strOperation eq OP_ARCHIVE_PUSH || $strOperation eq OP_ARCHIVE_PULL)
strCommandChecksum => config_load(CONFIG_SECTION_COMMAND, CONFIG_KEY_CHECKSUM, $bChecksum),
strCommandCompress => config_load(CONFIG_SECTION_COMMAND, CONFIG_KEY_COMPRESS, $bCompress),
strCommandDecompress => config_load(CONFIG_SECTION_COMMAND, CONFIG_KEY_DECOMPRESS, $bCompress),
strCommandManifest => config_load(CONFIG_SECTION_COMMAND, CONFIG_KEY_MANIFEST)
strCommandManifest => config_load(CONFIG_SECTION_COMMAND, CONFIG_KEY_MANIFEST),
strLockPath => $strLockPath
);
backup_init
@ -484,6 +485,15 @@ elsif ($strType ne "full" && $strType ne "differential" && $strType ne "incremen
my $bCompress = config_load(CONFIG_SECTION_BACKUP, CONFIG_KEY_COMPRESS, true, "y") eq "y" ? true : false;
my $bChecksum = config_load(CONFIG_SECTION_BACKUP, CONFIG_KEY_CHECKSUM, true, "y") eq "y" ? true : false;
# Set the lock path
my $strLockPath = config_load(CONFIG_SECTION_BACKUP, CONFIG_KEY_PATH, true) . "/lock/${strStanza}-${strOperation}.lock";
if (!lock_file_create($strLockPath))
{
&log(ERROR, "backup process is already running for stanza ${strStanza} - exiting");
exit 0
}
# Run file_init_archive - the rest of the file config required for backup and restore
my $oFile = pg_backrest_file->new
(
@ -498,7 +508,8 @@ my $oFile = pg_backrest_file->new
strCommandCompress => config_load(CONFIG_SECTION_COMMAND, CONFIG_KEY_COMPRESS, $bCompress),
strCommandDecompress => config_load(CONFIG_SECTION_COMMAND, CONFIG_KEY_DECOMPRESS, $bCompress),
strCommandManifest => config_load(CONFIG_SECTION_COMMAND, CONFIG_KEY_MANIFEST),
strCommandPsql => config_load(CONFIG_SECTION_COMMAND, CONFIG_KEY_PSQL)
strCommandPsql => config_load(CONFIG_SECTION_COMMAND, CONFIG_KEY_PSQL),
strLockPath => $strLockPath
);
my $oDb = pg_backrest_db->new
@ -527,20 +538,10 @@ backup_init
####################################################################################################################################
if ($strOperation eq OP_BACKUP)
{
my $strLockFile = $oFile->path_get(PATH_BACKUP, "lock/${strStanza}-backup.lock");
if (!lock_file_create($strLockFile))
{
&log(ERROR, "backup process is already running for stanza ${strStanza} - exiting");
exit 0
}
backup(config_load(CONFIG_SECTION_STANZA, CONFIG_KEY_PATH),
config_load(CONFIG_SECTION_BACKUP, CONFIG_KEY_START_FAST, true, "n") eq "y" ? true : false);
$strOperation = OP_EXPIRE;
lock_file_remove();
}
####################################################################################################################################
@ -548,14 +549,6 @@ if ($strOperation eq OP_BACKUP)
####################################################################################################################################
if ($strOperation eq OP_EXPIRE)
{
my $strLockFile = $oFile->path_get(PATH_BACKUP, "lock/${strStanza}-expire.lock");
if (!lock_file_create($strLockFile))
{
&log(ERROR, "expire process is already running for stanza ${strStanza} - exiting");
exit 0
}
backup_expire
(
$oFile->path_get(PATH_BACKUP_CLUSTER),

View File

@ -1191,17 +1191,22 @@ sub backup_file_thread
{
&log(DEBUG, "thread ${iThreadIdx} unable to copy file: " . $oFileCopyMap{$strFile}{db_file});
# If the copy fails then see if the file still exists on the database
if (!$oFileThread->file_exists(PATH_DB_ABSOLUTE, $oFileCopyMap{$strFile}{db_file}))
# If the copy fails then then check if the file exists. The database frequently removes files so it is normal for
# files to be missing after the manifest is built. However, if the file exists then it means there was some other
# sort of fatal copy error and an abort is required to prevent a corrupted backup
if ($oFileThread->file_exists(PATH_DB_ABSOLUTE, $oFileCopyMap{$strFile}{db_file}))
{
# If it is missing then the database must have removed it (or is now corrupt)
&log(INFO, "thread ${iThreadIdx} skipped file removed by database: " . $oFileCopyMap{$strFile}{db_file});
# Remove the destination file and the temp file just in case they had already been written
$oFileThread->file_remove(PATH_BACKUP_TMP, $oFileCopyMap{$strFile}{backup_file}, true);
$oFileThread->file_remove(PATH_BACKUP_TMP, $oFileCopyMap{$strFile}{backup_file});
# !!! Improve this error when able to retrieve error text from the File object
confess &log(ERROR, "unable to copy file $oFileCopyMap{$strFile}{db_file}");
}
# If file is missing assume the database removed it (else corruption and nothing we can do!)
&log(INFO, "thread ${iThreadIdx} skipped file removed by database: " . $oFileCopyMap{$strFile}{db_file});
# Remove the destination file and the temp file just in case they had already been written
$oFileThread->file_remove(PATH_BACKUP_TMP, $oFileCopyMap{$strFile}{backup_file}, true);
$oFileThread->file_remove(PATH_BACKUP_TMP, $oFileCopyMap{$strFile}{backup_file});
# Write a message into the master queue to have the file removed from the manifest
$oMasterQueue[$iThreadIdx]->enqueue("remove|$oFileCopyMap{$strFile}{file_section}|$oFileCopyMap{$strFile}{file}");
@ -1209,7 +1214,7 @@ sub backup_file_thread
next;
}
# Generate checksum for file if requested
# Generate checksum for file if configured
if ($bChecksum && $lSize != 0)
{
# Generate the checksum

View File

@ -32,6 +32,13 @@ has strCommandDecompress => (is => 'bare');
has strCommandCat => (is => 'bare', default => 'cat %file%');
has strCommandManifest => (is => 'bare');
# Lock path
has strLockPath => (is => 'bare');
# Files to hold stderr
#has strBackupStdErrFile => (is => 'bare');
#has strDbStdErrFile => (is => 'bare');
# Module variables
has strDbUser => (is => 'bare'); # Database user
has strDbHost => (is => 'bare'); # Database host
@ -48,6 +55,21 @@ has bNoCompression => (is => 'bare');
has strStanza => (is => 'bare');
has iThreadIdx => (is => 'bare');
####################################################################################################################################
# PATH_GET Constants
####################################################################################################################################
use constant
{
PATH_DB => 'db',
PATH_DB_ABSOLUTE => 'db:absolute',
PATH_BACKUP => 'backup',
PATH_BACKUP_ABSOLUTE => 'backup:absolute',
PATH_BACKUP_CLUSTER => 'backup:cluster',
PATH_BACKUP_TMP => 'backup:tmp',
PATH_BACKUP_ARCHIVE => 'backup:archive',
PATH_LOCK_ERR => 'lock:err'
};
####################################################################################################################################
# CONSTRUCTOR
####################################################################################################################################
@ -79,9 +101,9 @@ sub BUILD
if (!defined($self->{oBackupSSH}) && defined($self->{strBackupHost}))
{
&log(TRACE, "connecting to backup ssh host " . $self->{strBackupHost});
# !!! This could be improved by redirecting stderr to a file to get a better error message
$self->{oBackupSSH} = Net::OpenSSH->new($self->{strBackupHost}, timeout => 300, user => $self->{strBackupUser},
default_stderr_file => $self->path_get(PATH_LOCK_ERR, "file"),
master_opts => [-o => $strOptionSSHCompression, -o => $strOptionSSHRequestTTY]);
$self->{oBackupSSH}->error and confess &log(ERROR, "unable to connect to $self->{strBackupHost}: " . $self->{oBackupSSH}->error);
}
@ -91,8 +113,8 @@ sub BUILD
{
&log(TRACE, "connecting to database ssh host $self->{strDbHost}");
# !!! This could be improved by redirecting stderr to a file to get a better error message
$self->{oDbSSH} = Net::OpenSSH->new($self->{strDbHost}, timeout => 300, user => $self->{strDbUser},
default_stderr_file => $self->path_get(PATH_LOCK_ERR, "file"),
master_opts => [-o => $strOptionSSHCompression, -o => $strOptionSSHRequestTTY]);
$self->{oDbSSH}->error and confess &log(ERROR, "unable to connect to $self->{strDbHost}: " . $self->{oDbSSH}->error);
}
@ -127,24 +149,31 @@ sub clone
strBackupClusterPath => $self->{strBackupClusterPath},
bNoCompression => $self->{bNoCompression},
strStanza => $self->{strStanza},
iThreadIdx => $iThreadIdx
iThreadIdx => $iThreadIdx,
strLockPath => $self->{strLockPath}
);
}
####################################################################################################################################
# ERROR_GET
####################################################################################################################################
sub error_get
{
my $self = shift;
my $strErrorFile = $self->path_get(PATH_LOCK_ERR, "file");
open my $hFile, '<', $strErrorFile or return "error opening ${strErrorFile} to read STDERR output";
my $strError = do {local $/; <$hFile>};
close $hFile;
return trim($strError);
}
####################################################################################################################################
# PATH_GET
####################################################################################################################################
use constant
{
PATH_DB => 'db',
PATH_DB_ABSOLUTE => 'db:absolute',
PATH_BACKUP => 'backup',
PATH_BACKUP_ABSOLUTE => 'backup:absolute',
PATH_BACKUP_CLUSTER => 'backup:cluster',
PATH_BACKUP_TMP => 'backup:tmp',
PATH_BACKUP_ARCHIVE => 'backup:archive'
};
sub path_type_get
{
my $self = shift;
@ -215,6 +244,15 @@ sub path_get
confess &log(ASSERT, "\$strStanza not yet defined");
}
# Get the lock error path
if ($strType eq PATH_LOCK_ERR)
{
my $strTempPath = "$self->{strLockPath}";
return ${strTempPath} . (defined($strFile) ? "/${strFile}" .
(defined($self->{iThreadIdx}) ? ".$self->{iThreadIdx}" : "") . ".err" : "");
}
# Get the backup tmp path
if ($strType eq PATH_BACKUP_TMP)
{
@ -523,7 +561,7 @@ sub file_copy
# if bPathCreate is not defined, default to true
$bPathCreate = defined($bPathCreate) ? $bPathCreate : true;
$bConfessCopyError = defined($bConfessCopyError) ? $bConfessCopyError : false;
$bConfessCopyError = defined($bConfessCopyError) ? $bConfessCopyError : true;
&log(TRACE, "file_copy: ${strSourcePathType}: " . (defined($strSourceFile) ? ":${strSourceFile}" : "") .
" to ${strDestinationPathType}" . (defined($strDestinationFile) ? ":${strDestinationFile}" : ""));
@ -774,8 +812,8 @@ sub file_list_get
my $strPathList = $self->path_get($strPathType, $strPath);
# Builds the file list command
# my $strCommand = "ls ${strPathList} | egrep \"$strExpression\" 2> /dev/null";
my $strCommand = "ls -1 ${strPathList} 2> /dev/null";
# my $strCommand = "ls ${strPathList} | egrep \"$strExpression\"";
my $strCommand = "ls -1 ${strPathList}";
# Run the file list command
my $strFileList = "";
@ -787,16 +825,30 @@ sub file_list_get
my $oSSH = $self->remote_get($strPathType);
$strFileList = $oSSH->capture($strCommand);
if ($oSSH->error)
{
confess &log(ERROR, "unable to execute file list (${strCommand}): " . $self->error_get());
}
}
# Run locally
else
{
&log(TRACE, "file_list_get: local ${strPathType}:${strPathList} ${strCommand}");
$strFileList = capture($strCommand) or confess("error in ${strCommand}");
$strFileList = capture($strCommand);
}
# Split the files into an array
my @stryFileList = grep(/$strExpression/i, split(/\n/, $strFileList));
my @stryFileList;
if (defined($strExpression))
{
@stryFileList = grep(/$strExpression/i, split(/\n/, $strFileList));
}
else
{
@stryFileList = split(/\n/, $strFileList);
}
# Return the array in reverse order if specified
if (defined($strSortOrder) && $strSortOrder eq "reverse")
@ -821,7 +873,7 @@ sub file_exists
my $strPathExists = $self->path_get($strPathType, $strPath);
# Builds the exists command
my $strCommand = "ls ${strPathExists} 2> /dev/null";
my $strCommand = "ls ${strPathExists}";
# Run the file exists command
my $strExists = "";
@ -832,15 +884,22 @@ sub file_exists
&log(TRACE, "file_exists: remote ${strPathType}:${strPathExists}");
my $oSSH = $self->remote_get($strPathType);
$strExists = $oSSH->capture($strCommand);
$strExists = trim($oSSH->capture($strCommand));
if ($oSSH->error)
{
confess &log(ERROR, "unable to execute file exists (${strCommand}): " . $self->error_get());
}
}
# Run locally
else
{
&log(TRACE, "file_exists: local ${strPathType}:${strPathExists}");
$strExists = capture($strCommand);
$strExists = trim(capture($strCommand));
}
&log(TRACE, "file_exists: search = ${strPathExists}, result = ${strExists}");
# If the return from ls eq strPathExists then true
return ($strExists eq $strPathExists);
}
@ -874,11 +933,16 @@ sub file_remove
my $oSSH = $self->remote_get($strPathType);
$oSSH->system($strCommand) or $bErrorIfNotExists ? confess &log(ERROR, "unable to remove remote ${strPathType}:${strPathRemove}") : true;
if ($oSSH->error)
{
confess &log(ERROR, "unable to execute file_remove (${strCommand}): " . $self->error_get());
}
}
# Run locally
else
{
&log(TRACE, "file_exists: local ${strPathType}:${strPathRemove}");
&log(TRACE, "file_remove: local ${strPathType}:${strPathRemove}");
system($strCommand) == 0 or $bErrorIfNotExists ? confess &log(ERROR, "unable to remove local ${strPathType}:${strPathRemove}") : true;
}
}
@ -903,7 +967,6 @@ sub manifest_get
# Builds the manifest command
my $strCommand = $self->{strCommandManifest};
$strCommand =~ s/\%path\%/${strPathManifest}/g;
$strCommand .= " 2> /dev/null";
# Run the manifest command
my $strManifest;
@ -914,7 +977,8 @@ sub manifest_get
&log(TRACE, "manifest_get: remote ${strPathType}:${strPathManifest}");
my $oSSH = $self->remote_get($strPathType);
$strManifest = $oSSH->capture($strCommand) or confess &log(ERROR, "unable to execute remote command '${strCommand}'");
$strManifest = $oSSH->capture($strCommand) or
confess &log(ERROR, "unable to execute remote manifest (${strCommand}): " . $self->error_get());
}
# Run locally
else

View File

@ -10,6 +10,7 @@ use warnings;
use Carp;
use IPC::System::Simple qw(capture);
use Fcntl qw(:DEFAULT :flock);
use File::Path qw(remove_tree);
use Exporter qw(import);
@ -41,7 +42,7 @@ my $strLogLevelFile = ERROR;
my $strLogLevelConsole = ERROR;
my %oLogLevelRank;
my $strLockFile;
my $strLockPath;
my $hLockFile;
$oLogLevelRank{TRACE}{rank} = 6;
@ -57,13 +58,23 @@ $oLogLevelRank{OFF}{rank} = 0;
####################################################################################################################################
sub lock_file_create
{
my $strLockFileParam = shift;
$strLockFile = $strLockFileParam;
my $strLockPathParam = shift;
my $strLockFile = $strLockPathParam . "/process.lock";
if (defined($hLockFile))
{
confess &lock(ASSERT, "${strLockFile} lock is already held, cannot create lock ${strLockFile}");
confess &lock(ASSERT, "${strLockFile} lock is already held");
}
$strLockPath = $strLockPathParam;
unless (-e $strLockPath)
{
if (system("mkdir -p ${strLockPath}") != 0)
{
confess &log(ERROR, "Unable to create lock path ${strLockPath}");
}
}
sysopen($hLockFile, $strLockFile, O_WRONLY | O_CREAT)
@ -86,10 +97,11 @@ sub lock_file_remove
if (defined($hLockFile))
{
close($hLockFile);
unlink($strLockFile) or confess &log(ERROR, "unable to remove lock file ${strLockFile}");
remove_tree($strLockPath) or confess &log(ERROR, "unable to delete lock path ${strLockPath}");
$hLockFile = undef;
$strLockFile = undef;
$strLockPath = undef;
}
else
{