v0.19: Improved error reporting/handling

* Working on improving error handling in the file object. This is not complete, but works well enough to find a few errors that have been causing us problems (notably, find is occasionally failing building the archive async manifest when system is under load). * Found and squashed a nasty bug where file_copy was defaulted to ignore errors. There was also an issue in file_exists that was causing the test to fail when the file actually did exist. Together they could have resulted in a corrupt backup with no errors, though it is very unlikely.
2025-11-06 08:49:29 +02:00 · 2014-05-13 11:47:14 -04:00
parent 9acbba173e
commit 1fa8dbb778
6 changed files with 170 additions and 67 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,4 @@
 **/*~
 *~
-
 *.swp
+test/test
--- a/README.md
+++ b/README.md
@@ -4,16 +4,45 @@ Simple Postgres Backup and Restore

 ## planned for next release

+* Capture SDTERR in file functions - start with file_list_get() - IN PROGRESS.
+
+## feature backlog
+
+* Move backups to be removed to temp before deleting.
+
+* Async archive-get.
+
+* Database restore.
+
+* --version param (with VERSION file written to directory).
+
+* Threading for archive-get and archive-put.
+
+* Add configurable sleep to archiver process to reduce ssh connections.
+
+* Fix bug where .backup files written into old directories can cause the archive process to error.
+
 * Default restore.conf is written to each backup.

 * Able to set timeout on ssh connection in config file.

-* Fix bug where .backup files written into old directories can cause the archive process to error.
+## required perl modules

-* Add configurable sleep to archiver process to reduce ssh connections.
+Config::IniFiles
+Moose
+IPC::System::Simple
+Net::OpenSSH
+JSON
+IPC::Open3

 ## release notes

+### v0.19: Improved error reporting/handling
+
+* Working on improving error handling in the file object.  This is not complete, but works well enough to find a few errors that have been causing us problems (notably, find is occasionally failing building the archive async manifest when system is under load).
+
+* Found and squashed a nasty bug where file_copy was defaulted to ignore errors.  There was also an issue in file_exists that was causing the test to fail when the file actually did exist.  Together they could have resulted in a corrupt backup with no errors, though it is very unlikely.
+
 ### v0.18: Return soft error from archive-get when file is missing

 * The archive-get function returns a 1 when the archive file is missing to differentiate from hard errors (ssh connection failure, file copy error, etc.)  This lets Postgres know that that the archive stream has terminated normally.  However, this does not take into account possible holes in the archive stream.
--- a/pg_backrest.pl
+++ b/pg_backrest.pl
@@ -352,9 +352,9 @@ if ($strOperation eq OP_ARCHIVE_PUSH || $strOperation eq OP_ARCHIVE_PULL)
        }
        
        # Create a lock file to make sure archive-pull does not run more than once
-        my $strLockFile = "${strArchivePath}/lock/${strStanza}-archive.lock";
+        my $strLockPath = "${strArchivePath}/lock/${strStanza}-archive.lock";

-        if (!lock_file_create($strLockFile))
+        if (!lock_file_create($strLockPath))
        {
            &log(DEBUG, "archive-pull process is already running - exiting");
            exit 0
@@ -381,7 +381,8 @@ if ($strOperation eq OP_ARCHIVE_PUSH || $strOperation eq OP_ARCHIVE_PULL)
                strCommandChecksum => config_load(CONFIG_SECTION_COMMAND, CONFIG_KEY_CHECKSUM, $bChecksum),
                strCommandCompress => config_load(CONFIG_SECTION_COMMAND, CONFIG_KEY_COMPRESS, $bCompress),
                strCommandDecompress => config_load(CONFIG_SECTION_COMMAND, CONFIG_KEY_DECOMPRESS, $bCompress),
-                strCommandManifest => config_load(CONFIG_SECTION_COMMAND, CONFIG_KEY_MANIFEST)
+                strCommandManifest => config_load(CONFIG_SECTION_COMMAND, CONFIG_KEY_MANIFEST),
+                strLockPath => $strLockPath
            );

            backup_init
@@ -484,6 +485,15 @@ elsif ($strType ne "full" && $strType ne "differential" && $strType ne "incremen
 my $bCompress = config_load(CONFIG_SECTION_BACKUP, CONFIG_KEY_COMPRESS, true, "y") eq "y" ? true : false;
 my $bChecksum = config_load(CONFIG_SECTION_BACKUP, CONFIG_KEY_CHECKSUM, true, "y") eq "y" ? true : false;

+# Set the lock path
+my $strLockPath = config_load(CONFIG_SECTION_BACKUP, CONFIG_KEY_PATH, true) .  "/lock/${strStanza}-${strOperation}.lock";
+
+if (!lock_file_create($strLockPath))
+{
+    &log(ERROR, "backup process is already running for stanza ${strStanza} - exiting");
+    exit 0
+}
+
 # Run file_init_archive - the rest of the file config required for backup and restore
 my $oFile = pg_backrest_file->new
 (
@@ -498,7 +508,8 @@ my $oFile = pg_backrest_file->new
    strCommandCompress => config_load(CONFIG_SECTION_COMMAND, CONFIG_KEY_COMPRESS, $bCompress),
    strCommandDecompress => config_load(CONFIG_SECTION_COMMAND, CONFIG_KEY_DECOMPRESS, $bCompress),
    strCommandManifest => config_load(CONFIG_SECTION_COMMAND, CONFIG_KEY_MANIFEST),
-    strCommandPsql => config_load(CONFIG_SECTION_COMMAND, CONFIG_KEY_PSQL)
+    strCommandPsql => config_load(CONFIG_SECTION_COMMAND, CONFIG_KEY_PSQL),
+    strLockPath => $strLockPath
 );

 my $oDb = pg_backrest_db->new
@@ -527,20 +538,10 @@ backup_init
 ####################################################################################################################################
 if ($strOperation eq OP_BACKUP)
 {
-    my $strLockFile = $oFile->path_get(PATH_BACKUP, "lock/${strStanza}-backup.lock");
-
-    if (!lock_file_create($strLockFile))
-    {
-        &log(ERROR, "backup process is already running for stanza ${strStanza} - exiting");
-        exit 0
-    }
-
    backup(config_load(CONFIG_SECTION_STANZA, CONFIG_KEY_PATH),
           config_load(CONFIG_SECTION_BACKUP, CONFIG_KEY_START_FAST, true, "n") eq "y" ? true : false);

    $strOperation = OP_EXPIRE;
-
-    lock_file_remove();
 }

 ####################################################################################################################################
@@ -548,14 +549,6 @@ if ($strOperation eq OP_BACKUP)
 ####################################################################################################################################
 if ($strOperation eq OP_EXPIRE)
 {
-    my $strLockFile = $oFile->path_get(PATH_BACKUP, "lock/${strStanza}-expire.lock");
-
-    if (!lock_file_create($strLockFile))
-    {
-        &log(ERROR, "expire process is already running for stanza ${strStanza} - exiting");
-        exit 0
-    }
-
    backup_expire
    (
        $oFile->path_get(PATH_BACKUP_CLUSTER),
--- a/pg_backrest_backup.pm
+++ b/pg_backrest_backup.pm
@@ -1191,17 +1191,22 @@ sub backup_file_thread
        {
            &log(DEBUG, "thread ${iThreadIdx} unable to copy file: " . $oFileCopyMap{$strFile}{db_file});

-            # If the copy fails then see if the file still exists on the database
-            if (!$oFileThread->file_exists(PATH_DB_ABSOLUTE, $oFileCopyMap{$strFile}{db_file}))
+            # If the copy fails then then check if the file exists.  The database frequently removes files so it is normal for
+            # files to be missing after the manifest is built.  However, if the file exists then it means there was some other
+            # sort of fatal copy error and an abort is required to prevent a corrupted backup
+            if ($oFileThread->file_exists(PATH_DB_ABSOLUTE, $oFileCopyMap{$strFile}{db_file}))
            {
-                # If it is missing then the database must have removed it (or is now corrupt)
-                &log(INFO, "thread ${iThreadIdx} skipped file removed by database: " . $oFileCopyMap{$strFile}{db_file});
-
-                # Remove the destination file and the temp file just in case they had already been written
-                $oFileThread->file_remove(PATH_BACKUP_TMP, $oFileCopyMap{$strFile}{backup_file}, true);
-                $oFileThread->file_remove(PATH_BACKUP_TMP, $oFileCopyMap{$strFile}{backup_file});
+                # !!! Improve this error when able to retrieve error text from the File object
+                confess &log(ERROR, "unable to copy file $oFileCopyMap{$strFile}{db_file}");
            }

+            # If file is missing assume the database removed it (else corruption and nothing we can do!)
+            &log(INFO, "thread ${iThreadIdx} skipped file removed by database: " . $oFileCopyMap{$strFile}{db_file});
+
+            # Remove the destination file and the temp file just in case they had already been written
+            $oFileThread->file_remove(PATH_BACKUP_TMP, $oFileCopyMap{$strFile}{backup_file}, true);
+            $oFileThread->file_remove(PATH_BACKUP_TMP, $oFileCopyMap{$strFile}{backup_file});
+
            # Write a message into the master queue to have the file removed from the manifest
            $oMasterQueue[$iThreadIdx]->enqueue("remove|$oFileCopyMap{$strFile}{file_section}|$oFileCopyMap{$strFile}{file}");

@@ -1209,7 +1214,7 @@ sub backup_file_thread
            next;
        }

-        # Generate checksum for file if requested
+        # Generate checksum for file if configured
        if ($bChecksum && $lSize != 0)
        {
            # Generate the checksum
--- a/pg_backrest_file.pm
+++ b/pg_backrest_file.pm
@@ -32,6 +32,13 @@ has strCommandDecompress => (is => 'bare');
 has strCommandCat => (is => 'bare', default => 'cat %file%');
 has strCommandManifest => (is => 'bare');

+# Lock path
+has strLockPath => (is => 'bare');
+
+# Files to hold stderr
+#has strBackupStdErrFile => (is => 'bare');
+#has strDbStdErrFile => (is => 'bare');
+
 # Module variables
 has strDbUser => (is => 'bare');                # Database user
 has strDbHost => (is => 'bare');                # Database host
@@ -48,6 +55,21 @@ has bNoCompression => (is => 'bare');
 has strStanza => (is => 'bare');
 has iThreadIdx => (is => 'bare');

+####################################################################################################################################
+# PATH_GET Constants
+####################################################################################################################################
+use constant
+{
+    PATH_DB              => 'db',
+    PATH_DB_ABSOLUTE     => 'db:absolute',
+    PATH_BACKUP          => 'backup',
+    PATH_BACKUP_ABSOLUTE => 'backup:absolute',
+    PATH_BACKUP_CLUSTER  => 'backup:cluster',
+    PATH_BACKUP_TMP      => 'backup:tmp',
+    PATH_BACKUP_ARCHIVE  => 'backup:archive',
+    PATH_LOCK_ERR        => 'lock:err'
+};
+
 ####################################################################################################################################
 # CONSTRUCTOR
 ####################################################################################################################################
@@ -79,9 +101,9 @@ sub BUILD
        if (!defined($self->{oBackupSSH}) && defined($self->{strBackupHost}))
        {
            &log(TRACE, "connecting to backup ssh host " . $self->{strBackupHost});
-
-            # !!! This could be improved by redirecting stderr to a file to get a better error message
+            
            $self->{oBackupSSH} = Net::OpenSSH->new($self->{strBackupHost}, timeout => 300, user => $self->{strBackupUser},
+                                      default_stderr_file => $self->path_get(PATH_LOCK_ERR, "file"),
                                      master_opts => [-o => $strOptionSSHCompression, -o => $strOptionSSHRequestTTY]);
            $self->{oBackupSSH}->error and confess &log(ERROR, "unable to connect to $self->{strBackupHost}: " . $self->{oBackupSSH}->error);
        }
@@ -91,8 +113,8 @@ sub BUILD
        {
            &log(TRACE, "connecting to database ssh host $self->{strDbHost}");

-            # !!! This could be improved by redirecting stderr to a file to get a better error message
            $self->{oDbSSH} = Net::OpenSSH->new($self->{strDbHost}, timeout => 300, user => $self->{strDbUser},
+                                  default_stderr_file => $self->path_get(PATH_LOCK_ERR, "file"),
                                  master_opts => [-o => $strOptionSSHCompression, -o => $strOptionSSHRequestTTY]);
            $self->{oDbSSH}->error and confess &log(ERROR, "unable to connect to $self->{strDbHost}: " . $self->{oDbSSH}->error);
        }
@@ -127,24 +149,31 @@ sub clone
        strBackupClusterPath => $self->{strBackupClusterPath},
        bNoCompression => $self->{bNoCompression},
        strStanza => $self->{strStanza},
-        iThreadIdx => $iThreadIdx
+        iThreadIdx => $iThreadIdx,
+        strLockPath => $self->{strLockPath}
    );
 }

+####################################################################################################################################
+# ERROR_GET
+####################################################################################################################################
+sub error_get
+{
+    my $self = shift;
+    
+    my $strErrorFile = $self->path_get(PATH_LOCK_ERR, "file");
+    
+    open my $hFile, '<', $strErrorFile or return "error opening ${strErrorFile} to read STDERR output";
+    
+    my $strError = do {local $/; <$hFile>};
+    close $hFile;
+    
+    return trim($strError);
+}
+
 ####################################################################################################################################
 # PATH_GET
 ####################################################################################################################################
-use constant 
-{
-    PATH_DB              => 'db',
-    PATH_DB_ABSOLUTE     => 'db:absolute',
-    PATH_BACKUP          => 'backup',
-    PATH_BACKUP_ABSOLUTE => 'backup:absolute',
-    PATH_BACKUP_CLUSTER  => 'backup:cluster',
-    PATH_BACKUP_TMP      => 'backup:tmp',
-    PATH_BACKUP_ARCHIVE  => 'backup:archive'
-};
-
 sub path_type_get
 {
    my $self = shift;
@@ -215,6 +244,15 @@ sub path_get
        confess &log(ASSERT, "\$strStanza not yet defined");
    }

+    # Get the lock error path
+    if ($strType eq PATH_LOCK_ERR)
+    {
+        my $strTempPath = "$self->{strLockPath}";
+
+        return ${strTempPath} . (defined($strFile) ? "/${strFile}" .
+                                (defined($self->{iThreadIdx}) ? ".$self->{iThreadIdx}" : "") . ".err" : "");
+    }
+
    # Get the backup tmp path
    if ($strType eq PATH_BACKUP_TMP)
    {
@@ -523,7 +561,7 @@ sub file_copy

    # if bPathCreate is not defined, default to true
    $bPathCreate = defined($bPathCreate) ? $bPathCreate : true;
-    $bConfessCopyError = defined($bConfessCopyError) ? $bConfessCopyError : false;
+    $bConfessCopyError = defined($bConfessCopyError) ? $bConfessCopyError : true;

    &log(TRACE, "file_copy: ${strSourcePathType}: " . (defined($strSourceFile) ? ":${strSourceFile}" : "") .
                " to ${strDestinationPathType}" . (defined($strDestinationFile) ? ":${strDestinationFile}" : ""));
@@ -774,8 +812,8 @@ sub file_list_get
    my $strPathList = $self->path_get($strPathType, $strPath);

    # Builds the file list command
-#    my $strCommand = "ls ${strPathList} | egrep \"$strExpression\" 2> /dev/null";
-    my $strCommand = "ls -1 ${strPathList} 2> /dev/null";
+#    my $strCommand = "ls ${strPathList} | egrep \"$strExpression\"";
+    my $strCommand = "ls -1 ${strPathList}";
    
    # Run the file list command
    my $strFileList = "";
@@ -787,16 +825,30 @@ sub file_list_get

        my $oSSH = $self->remote_get($strPathType);
        $strFileList = $oSSH->capture($strCommand);
+        
+        if ($oSSH->error)
+        {
+            confess &log(ERROR, "unable to execute file list (${strCommand}): " . $self->error_get());
+        }
    }
    # Run locally
    else
    {
        &log(TRACE, "file_list_get: local ${strPathType}:${strPathList} ${strCommand}");
-        $strFileList = capture($strCommand) or confess("error in ${strCommand}");
+        $strFileList = capture($strCommand);
    }

    # Split the files into an array
-    my @stryFileList = grep(/$strExpression/i, split(/\n/, $strFileList));
+    my @stryFileList;
+    
+    if (defined($strExpression))
+    {
+        @stryFileList = grep(/$strExpression/i, split(/\n/, $strFileList));
+    }
+    else
+    {
+        @stryFileList = split(/\n/, $strFileList);
+    }

    # Return the array in reverse order if specified
    if (defined($strSortOrder) && $strSortOrder eq "reverse")
@@ -821,7 +873,7 @@ sub file_exists
    my $strPathExists = $self->path_get($strPathType, $strPath);

    # Builds the exists command
-    my $strCommand = "ls ${strPathExists} 2> /dev/null";
+    my $strCommand = "ls ${strPathExists}";
    
    # Run the file exists command
    my $strExists = "";
@@ -832,15 +884,22 @@ sub file_exists
        &log(TRACE, "file_exists: remote ${strPathType}:${strPathExists}");

        my $oSSH = $self->remote_get($strPathType);
-        $strExists = $oSSH->capture($strCommand);
+        $strExists = trim($oSSH->capture($strCommand));
+        
+        if ($oSSH->error)
+        {
+            confess &log(ERROR, "unable to execute file exists (${strCommand}): " . $self->error_get());
+        }
    }
    # Run locally
    else
    {
        &log(TRACE, "file_exists: local ${strPathType}:${strPathExists}");
-        $strExists = capture($strCommand);
+        $strExists = trim(capture($strCommand));
    }

+    &log(TRACE, "file_exists: search = ${strPathExists}, result = ${strExists}");
+
    # If the return from ls eq strPathExists then true
    return ($strExists eq $strPathExists);
 }
@@ -874,11 +933,16 @@ sub file_remove

        my $oSSH = $self->remote_get($strPathType);
        $oSSH->system($strCommand) or $bErrorIfNotExists ? confess &log(ERROR, "unable to remove remote ${strPathType}:${strPathRemove}") : true;
+        
+        if ($oSSH->error)
+        {
+            confess &log(ERROR, "unable to execute file_remove (${strCommand}): " . $self->error_get());
+        }
    }
    # Run locally
    else
    {
-        &log(TRACE, "file_exists: local ${strPathType}:${strPathRemove}");
+        &log(TRACE, "file_remove: local ${strPathType}:${strPathRemove}");
        system($strCommand) == 0 or $bErrorIfNotExists ? confess &log(ERROR, "unable to remove local ${strPathType}:${strPathRemove}") : true;
    }
 }
@@ -903,7 +967,6 @@ sub manifest_get
    # Builds the manifest command
    my $strCommand = $self->{strCommandManifest};
    $strCommand =~ s/\%path\%/${strPathManifest}/g;
-    $strCommand .= " 2> /dev/null";
    
    # Run the manifest command
    my $strManifest;
@@ -914,7 +977,8 @@ sub manifest_get
        &log(TRACE, "manifest_get: remote ${strPathType}:${strPathManifest}");

        my $oSSH = $self->remote_get($strPathType);
-        $strManifest = $oSSH->capture($strCommand) or confess &log(ERROR, "unable to execute remote command '${strCommand}'");
+        $strManifest = $oSSH->capture($strCommand) or
+            confess &log(ERROR, "unable to execute remote manifest (${strCommand}): " . $self->error_get());
    }
    # Run locally
    else
--- a/pg_backrest_utility.pm
+++ b/pg_backrest_utility.pm
@@ -10,6 +10,7 @@ use warnings;
 use Carp;
 use IPC::System::Simple qw(capture);
 use Fcntl qw(:DEFAULT :flock);
+use File::Path qw(remove_tree);

 use Exporter qw(import);

@@ -41,7 +42,7 @@ my $strLogLevelFile = ERROR;
 my $strLogLevelConsole = ERROR;
 my %oLogLevelRank;

-my $strLockFile;
+my $strLockPath;
 my $hLockFile;

 $oLogLevelRank{TRACE}{rank} = 6;
@@ -57,13 +58,23 @@ $oLogLevelRank{OFF}{rank} = 0;
 ####################################################################################################################################
 sub lock_file_create
 {
-    my $strLockFileParam = shift;
-    
-    $strLockFile = $strLockFileParam;
+    my $strLockPathParam = shift;
+
+    my $strLockFile = $strLockPathParam . "/process.lock";

    if (defined($hLockFile))
    {
-        confess &lock(ASSERT, "${strLockFile} lock is already held, cannot create lock ${strLockFile}");
+        confess &lock(ASSERT, "${strLockFile} lock is already held");
+    }
+    
+    $strLockPath = $strLockPathParam;
+    
+    unless (-e $strLockPath)
+    {
+        if (system("mkdir -p ${strLockPath}") != 0)
+        {
+            confess &log(ERROR, "Unable to create lock path ${strLockPath}");
+        }
    }

    sysopen($hLockFile, $strLockFile, O_WRONLY | O_CREAT)
@@ -86,10 +97,11 @@ sub lock_file_remove
    if (defined($hLockFile))
    {
        close($hLockFile);
-        unlink($strLockFile) or confess &log(ERROR, "unable to remove lock file ${strLockFile}");
+        
+        remove_tree($strLockPath) or confess &log(ERROR, "unable to delete lock path ${strLockPath}");
        
        $hLockFile = undef;
-        $strLockFile = undef;
+        $strLockPath = undef;
    }
    else
    {