More work on error-handling.

2024-12-14 10:13:05 +02:00 · 2014-05-13 11:23:15 -04:00 · 2014-05-13 11:23:15 -04:00 · d234aeea64
commit d234aeea64
parent a2b28623d4
4 changed files with 57 additions and 44 deletions
--- a/README.md
+++ b/README.md
@ -4,18 +4,10 @@ Simple Postgres Backup and Restore

 ## planned for next release

-* Default restore.conf is written to each backup.
-
-* Able to set timeout on ssh connection in config file.
-
-* Fix bug where .backup files written into old directories can cause the archive process to error.
-
-* Add configurable sleep to archiver process to reduce ssh connections.
+* Capture STDERR in file functions - start with file_list_get() - IN PROGRESS.

 ## feature backlog

-* Capture SDTERR in file functions - start with file_list_get().
-
 * Move backups to be removed to temp before deleting.

 * Async archive-get.
@ -26,8 +18,31 @@ Simple Postgres Backup and Restore

 * Threading for archive-get and archive-put.

+* Add configurable sleep to archiver process to reduce ssh connections.
+
+* Fix bug where .backup files written into old directories can cause the archive process to error.
+
+* Default restore.conf is written to each backup.
+
+* Able to set timeout on ssh connection in config file.
+
+## required perl modules
+
+Config::IniFiles
+Moose
+IPC::System::Simple
+Net::OpenSSH
+JSON
+IPC::Open3
+
 ## release notes

+### v0.19: Improved error reporting/handling
+
+* Working on improving error handling in the file object.  This is not complete, but works well enough to find a few errors that have been causing us problems (notably, `find` occasionally fails while building the archive async manifest when the system is under load).
+
+* Found and squashed a nasty bug where file_copy was defaulted to ignore errors.  There was also an issue in file_exists that was causing the test to fail when the file actually did exist.  Together they could have resulted in a corrupt backup with no errors, though it is very unlikely.
+
 ### v0.18: Return soft error from archive-get when file is missing

 * The archive-get function returns a 1 when the archive file is missing to differentiate from hard errors (ssh connection failure, file copy error, etc.)  This lets Postgres know that the archive stream has terminated normally.  However, this does not take into account possible holes in the archive stream.
--- a/pg_backrest.pl
+++ b/pg_backrest.pl
@ -542,12 +542,6 @@ if ($strOperation eq OP_BACKUP)
           config_load(CONFIG_SECTION_BACKUP, CONFIG_KEY_START_FAST, true, "n") eq "y" ? true : false);

    $strOperation = OP_EXPIRE;
-
-#    my %hash = $oFile->manifest_get(PATH_DB_ABSOLUTE, "/Users/dsteele/pg_backrest");
-#    print "hash " . %hash . "\n";
-
-#    lock_file_remove();
-#    exit 0;
 }

 ####################################################################################################################################
--- a/pg_backrest_backup.pm
+++ b/pg_backrest_backup.pm
@ -1191,17 +1191,22 @@ sub backup_file_thread
        {
            &log(DEBUG, "thread ${iThreadIdx} unable to copy file: " . $oFileCopyMap{$strFile}{db_file});

-            # If the copy fails then see if the file still exists on the database
-            if (!$oFileThread->file_exists(PATH_DB_ABSOLUTE, $oFileCopyMap{$strFile}{db_file}))
+            # If the copy fails then check if the file exists.  The database frequently removes files so it is normal for
+            # files to be missing after the manifest is built.  However, if the file exists then it means there was some other
+            # sort of fatal copy error and an abort is required to prevent a corrupted backup
+            if ($oFileThread->file_exists(PATH_DB_ABSOLUTE, $oFileCopyMap{$strFile}{db_file}))
            {
-                # If it is missing then the database must have removed it (or is now corrupt)
-                &log(INFO, "thread ${iThreadIdx} skipped file removed by database: " . $oFileCopyMap{$strFile}{db_file});
-
-                # Remove the destination file and the temp file just in case they had already been written
-                $oFileThread->file_remove(PATH_BACKUP_TMP, $oFileCopyMap{$strFile}{backup_file}, true);
-                $oFileThread->file_remove(PATH_BACKUP_TMP, $oFileCopyMap{$strFile}{backup_file});
+                # !!! Improve this error when able to retrieve error text from the File object
+                confess &log(ERROR, "unable to copy file $oFileCopyMap{$strFile}{db_file}");
            }

+            # If file is missing assume the database removed it (else corruption and nothing we can do!)
+            &log(INFO, "thread ${iThreadIdx} skipped file removed by database: " . $oFileCopyMap{$strFile}{db_file});
+
+            # Remove the destination file and the temp file just in case they had already been written
+            $oFileThread->file_remove(PATH_BACKUP_TMP, $oFileCopyMap{$strFile}{backup_file}, true);
+            $oFileThread->file_remove(PATH_BACKUP_TMP, $oFileCopyMap{$strFile}{backup_file});
+
            # Write a message into the master queue to have the file removed from the manifest
            $oMasterQueue[$iThreadIdx]->enqueue("remove|$oFileCopyMap{$strFile}{file_section}|$oFileCopyMap{$strFile}{file}");

@ -1209,7 +1214,7 @@ sub backup_file_thread
            next;
        }

-        # Generate checksum for file if requested
+        # Generate checksum for file if configured
        if ($bChecksum && $lSize != 0)
        {
            # Generate the checksum
--- a/pg_backrest_file.pm
+++ b/pg_backrest_file.pm
@ -103,7 +103,6 @@ sub BUILD
            &log(TRACE, "connecting to backup ssh host " . $self->{strBackupHost});
            
            $self->{oBackupSSH} = Net::OpenSSH->new($self->{strBackupHost}, timeout => 300, user => $self->{strBackupUser},
-#                                      master_stderr_file => $self->path_get(PATH_LOCK_ERR, "file"),
                                      default_stderr_file => $self->path_get(PATH_LOCK_ERR, "file"),
                                      master_opts => [-o => $strOptionSSHCompression, -o => $strOptionSSHRequestTTY]);
            $self->{oBackupSSH}->error and confess &log(ERROR, "unable to connect to $self->{strBackupHost}: " . $self->{oBackupSSH}->error);
@ -115,7 +114,6 @@ sub BUILD
            &log(TRACE, "connecting to database ssh host $self->{strDbHost}");

            $self->{oDbSSH} = Net::OpenSSH->new($self->{strDbHost}, timeout => 300, user => $self->{strDbUser},
-#                                  master_stderr_file => $self->path_get(PATH_LOCK_ERR, "file"),
                                  default_stderr_file => $self->path_get(PATH_LOCK_ERR, "file"),
                                  master_opts => [-o => $strOptionSSHCompression, -o => $strOptionSSHRequestTTY]);
            $self->{oDbSSH}->error and confess &log(ERROR, "unable to connect to $self->{strDbHost}: " . $self->{oDbSSH}->error);
@ -123,17 +121,6 @@ sub BUILD
    }
 }

-####################################################################################################################################
-# LOCK_PATH_SET
-####################################################################################################################################
-#sub lock_path_set
-#{
-#    my $self = shift;
-#    my $strLockPathParam = shift;
-#    
-#    $self->{strLockPath} = $strLockPathParam;
-#}
-
 ####################################################################################################################################
 # CLONE
 ####################################################################################################################################
@ -257,7 +244,7 @@ sub path_get
        confess &log(ASSERT, "\$strStanza not yet defined");
    }

-    # Get the backup tmp path
+    # Get the lock error path
    if ($strType eq PATH_LOCK_ERR)
    {
        my $strTempPath = "$self->{strLockPath}";
@ -266,7 +253,7 @@ sub path_get
                                (defined($self->{iThreadIdx}) ? ".$self->{iThreadIdx}" : "") . ".err" : "");
    }

-    # Get the backup tmp error path
+    # Get the backup tmp path
    if ($strType eq PATH_BACKUP_TMP)
    {
        my $strTempPath = "$self->{strBackupPath}/temp/$self->{strStanza}.tmp";
@ -574,7 +561,7 @@ sub file_copy

    # if bPathCreate is not defined, default to true
    $bPathCreate = defined($bPathCreate) ? $bPathCreate : true;
-    $bConfessCopyError = defined($bConfessCopyError) ? $bConfessCopyError : false;
+    $bConfessCopyError = defined($bConfessCopyError) ? $bConfessCopyError : true;

    &log(TRACE, "file_copy: ${strSourcePathType}: " . (defined($strSourceFile) ? ":${strSourceFile}" : "") .
                " to ${strDestinationPathType}" . (defined($strDestinationFile) ? ":${strDestinationFile}" : ""));
@ -886,7 +873,7 @@ sub file_exists
    my $strPathExists = $self->path_get($strPathType, $strPath);

    # Builds the exists command
-    my $strCommand = "ls ${strPathExists} 2> /dev/null";
+    my $strCommand = "ls ${strPathExists}";
    
    # Run the file exists command
    my $strExists = "";
@ -897,15 +884,22 @@ sub file_exists
        &log(TRACE, "file_exists: remote ${strPathType}:${strPathExists}");

        my $oSSH = $self->remote_get($strPathType);
-        $strExists = $oSSH->capture($strCommand);
+        $strExists = trim($oSSH->capture($strCommand));
+        
+        if ($oSSH->error)
+        {
+            confess &log(ERROR, "unable to execute file exists (${strCommand}): " . $self->error_get());
+        }
    }
    # Run locally
    else
    {
        &log(TRACE, "file_exists: local ${strPathType}:${strPathExists}");
-        $strExists = capture($strCommand);
+        $strExists = trim(capture($strCommand));
    }

+    &log(TRACE, "file_exists: search = ${strPathExists}, result = ${strExists}");
+
    # If the return from ls eq strPathExists then true
    return ($strExists eq $strPathExists);
 }
@ -939,11 +933,16 @@ sub file_remove

        my $oSSH = $self->remote_get($strPathType);
        $oSSH->system($strCommand) or $bErrorIfNotExists ? confess &log(ERROR, "unable to remove remote ${strPathType}:${strPathRemove}") : true;
+        
+        if ($oSSH->error)
+        {
+            confess &log(ERROR, "unable to execute file_remove (${strCommand}): " . $self->error_get());
+        }
    }
    # Run locally
    else
    {
-        &log(TRACE, "file_exists: local ${strPathType}:${strPathRemove}");
+        &log(TRACE, "file_remove: local ${strPathType}:${strPathRemove}");
        system($strCommand) == 0 or $bErrorIfNotExists ? confess &log(ERROR, "unable to remove local ${strPathType}:${strPathRemove}") : true;
    }
 }