From e5bde423035406051468987b11777fcc86b7d4f7 Mon Sep 17 00:00:00 2001 From: nielash Date: Tue, 11 Jul 2023 06:57:49 -0400 Subject: [PATCH] bisync: Add experimental --resilient mode to allow recovery from self-correctable errors https://forum.rclone.org/t/bisync-bugs-and-feature-requests/37636#:~:text=2.%20Bisync%20should%20be%20more%20resilient%20to%20self%2Dcorrectable%20errors --- cmd/bisync/cmd.go | 2 ++ cmd/bisync/deltas.go | 1 + cmd/bisync/listing.go | 1 + cmd/bisync/operations.go | 28 +++++++++++++++++++++------- docs/content/bisync.md | 29 +++++++++++++++++++++++++++++ 5 files changed, 54 insertions(+), 7 deletions(-) diff --git a/cmd/bisync/cmd.go b/cmd/bisync/cmd.go index 5c4fddf48..cc8a2d925 100644 --- a/cmd/bisync/cmd.go +++ b/cmd/bisync/cmd.go @@ -40,6 +40,7 @@ type Options struct { NoCleanup bool SaveQueues bool // save extra debugging files (test only flag) IgnoreListingChecksum bool + Resilient bool } // Default values @@ -110,6 +111,7 @@ func init() { flags.BoolVarP(cmdFlags, &tzLocal, "localtime", "", tzLocal, "Use local time in listings (default: UTC)", "") flags.BoolVarP(cmdFlags, &Opt.NoCleanup, "no-cleanup", "", Opt.NoCleanup, "Retain working files (useful for troubleshooting and testing).", "") flags.BoolVarP(cmdFlags, &Opt.IgnoreListingChecksum, "ignore-listing-checksum", "", Opt.IgnoreListingChecksum, "Do not use checksums for listings (add --ignore-checksum to additionally skip post-copy checksum checks)", "") + flags.BoolVarP(cmdFlags, &Opt.Resilient, "resilient", "", Opt.Resilient, "Allow future runs to retry after certain less-serious errors, instead of requiring --resync. Use at your own risk!", "") } // bisync command definition diff --git a/cmd/bisync/deltas.go b/cmd/bisync/deltas.go index c79f62483..bb7e56615 100644 --- a/cmd/bisync/deltas.go +++ b/cmd/bisync/deltas.go @@ -104,6 +104,7 @@ func (b *bisyncRun) checkconflicts(ctxCheck context.Context, filterCheck *filter opt, close, checkopterr := check.GetCheckOpt(b.fs1, b.fs2) if checkopterr != nil { b.critical = true + b.retryable = true fs.Debugf(nil, "GetCheckOpt error: %v", checkopterr) return matches, checkopterr } diff --git a/cmd/bisync/listing.go b/cmd/bisync/listing.go index 557e44d5d..520548bd7 100644 --- a/cmd/bisync/listing.go +++ b/cmd/bisync/listing.go @@ -301,5 +301,6 @@ func (b *bisyncRun) checkListing(ls *fileList, listing, msg string) error { } fs.Errorf(nil, "Empty %s listing. Cannot sync to an empty directory: %s", msg, listing) b.critical = true + b.retryable = true return fmt.Errorf("empty %s listing: %s", msg, listing) } diff --git a/cmd/bisync/operations.go b/cmd/bisync/operations.go index 5ac46d3c2..25c53cca2 100644 --- a/cmd/bisync/operations.go +++ b/cmd/bisync/operations.go @@ -25,13 +25,14 @@ var ErrBisyncAborted = errors.New("bisync aborted") // bisyncRun keeps bisync runtime state type bisyncRun struct { - fs1 fs.Fs - fs2 fs.Fs - abort bool - critical bool - basePath string - workDir string - opt *Options + fs1 fs.Fs + fs2 fs.Fs + abort bool + critical bool + retryable bool + basePath string + workDir string + opt *Options } // Bisync handles lock file, performs bisync run and checks exit status @@ -123,6 +124,10 @@ func Bisync(ctx context.Context, fs1, fs2 fs.Fs, optArg *Options) (err error) { } if b.critical { + if b.retryable && b.opt.Resilient { + fs.Errorf(nil, "Bisync critical error: %v", err) + fs.Errorf(nil, "Bisync aborted. Error is retryable without --resync due to --resilient mode.") + } else { if bilib.FileExists(listing1) { _ = os.Rename(listing1, listing1+"-err") } @@ -131,6 +136,7 @@ func Bisync(ctx context.Context, fs1, fs2 fs.Fs, optArg *Options) (err error) { } fs.Errorf(nil, "Bisync critical error: %v", err) fs.Errorf(nil, "Bisync aborted. Must run --resync to recover.") + } return ErrBisyncAborted } if b.abort { @@ -152,6 +158,7 @@ func (b *bisyncRun) runLocked(octx context.Context, listing1, listing2 string) ( fs.Infof(nil, "Validating listings for Path1 %s vs Path2 %s", quotePath(path1), quotePath(path2)) if err = b.checkSync(listing1, listing2); err != nil { b.critical = true + b.retryable = true } return err } @@ -176,6 +183,7 @@ func (b *bisyncRun) runLocked(octx context.Context, listing1, listing2 string) ( var fctx context.Context if fctx, err = b.opt.applyFilters(octx); err != nil { b.critical = true + b.retryable = true return } @@ -188,6 +196,7 @@ func (b *bisyncRun) runLocked(octx context.Context, listing1, listing2 string) ( if !bilib.FileExists(listing1) || !bilib.FileExists(listing2) { // On prior critical error abort, the prior listings are renamed to .lst-err to lock out further runs b.critical = true + b.retryable = true return errors.New("cannot find prior Path1 or Path2 listings, likely due to critical error on prior run") } @@ -215,6 +224,7 @@ func (b *bisyncRun) runLocked(octx context.Context, listing1, listing2 string) ( err = b.checkAccess(ds1.checkFiles, ds2.checkFiles) if err != nil { b.critical = true + b.retryable = true return } } @@ -255,6 +265,7 @@ func (b *bisyncRun) runLocked(octx context.Context, listing1, listing2 string) ( changes1, changes2, err = b.applyDeltas(octx, ds1, ds2) if err != nil { b.critical = true + // b.retryable = true // not sure about this one return err } } @@ -283,6 +294,7 @@ func (b *bisyncRun) runLocked(octx context.Context, listing1, listing2 string) ( } if err != nil { b.critical = true + b.retryable = true return err } @@ -310,6 +322,7 @@ func (b *bisyncRun) runLocked(octx context.Context, listing1, listing2 string) ( } if err != nil { b.critical = true + b.retryable = true return err } } @@ -369,6 +382,7 @@ func (b *bisyncRun) resync(octx, fctx context.Context, listing1, listing2 string err = b.checkAccess(ds1.checkFiles, ds2.checkFiles) if err != nil { b.critical = true + b.retryable = true return err } } diff --git a/docs/content/bisync.md b/docs/content/bisync.md index ec2851762..a3a714d9b 100644 --- a/docs/content/bisync.md +++ b/docs/content/bisync.md @@ -97,6 +97,8 @@ Optional Flags: Consider using `--verbose` or `--dry-run` first. --ignore-listing-checksum Do not use checksums for listings (add --ignore-checksum to additionally skip post-copy checksum checks) + --resilient Allow future runs to retry after certain less-serious errors, + instead of requiring --resync. Use at your own risk! --localtime Use local time in listings (default: UTC) --no-cleanup Retain working files (useful for troubleshooting and testing). --workdir PATH Use custom working directory (useful for testing). @@ -285,6 +287,30 @@ were generated on a run using `--ignore-listing-checksum`. For a more robust int consider using [`check`](commands/rclone_check/) (or [`cryptcheck`](/commands/rclone_cryptcheck/), if at least one path is a `crypt` remote.) +#### --resilient + +***Caution: this is an experimental feature. Use at your own risk!*** + +By default, most errors or interruptions will cause bisync to abort and +require [`--resync`](#resync) to recover. This is a safety feature, +to prevent bisync from running again until a user checks things out. +However, in some cases, bisync can go too far and enforce a lockout when one isn't actually necessary, +like for certain less-serious errors that might resolve themselves on the next run. +When `--resilient` is specified, bisync tries its best to recover and self-correct, +and only requires `--resync` as a last resort when a human's involvement is absolutely necessary. +The intended use case is for running bisync as a background process (such as via scheduled [cron](#cron)). + +When using `--resilient` mode, bisync will still report the error and abort, +however it will not lock out future runs -- allowing the possibility of retrying at the next normally scheduled time, +without requiring a `--resync` first. Examples of such retryable errors include +access test failures, missing listing files, and filter change detections. +These safety features will still prevent the *current* run from proceeding -- +the difference is that if conditions have improved by the time of the *next* run, +that next run will be allowed to proceed. +Certain more serious errors will still enforce a `--resync` lockout, even in `--resilient` mode, to prevent data loss. + +Behavior of `--resilient` may change in a future version. + ## Operation ### Runtime flow details @@ -394,6 +420,8 @@ typically at `${HOME}/.cache/rclone/bisync/` on Linux. Some errors are considered temporary and re-running the bisync is not blocked. The _critical return_ blocks further bisync runs. +See also: [`--resilient`](#resilient) + ### Lock file When bisync is running, a lock file is created in the bisync working directory, @@ -1164,5 +1192,6 @@ causing dry runs to inadvertently commit filter changes causing bisync to consider more files than necessary due to overbroad filters during delete operations * [Improved detection of false positive change conflicts](https://forum.rclone.org/t/bisync-bugs-and-feature-requests/37636#:~:text=1.%20Identical%20files%20should%20be%20left%20alone%2C%20even%20if%20new/newer/changed%20on%20both%20sides) (identical files are now left alone instead of renamed) +* Added experimental `--resilient` mode to allow [recovery from self-correctable errors](https://forum.rclone.org/t/bisync-bugs-and-feature-requests/37636#:~:text=2.%20Bisync%20should%20be%20more%20resilient%20to%20self%2Dcorrectable%20errors) * Added [new `--ignore-listing-checksum` flag](https://forum.rclone.org/t/bisync-bugs-and-feature-requests/37636#:~:text=6.%20%2D%2Dignore%2Dchecksum%20should%20be%20split%20into%20two%20flags%20for%20separate%20purposes) to distinguish from `--ignore-checksum`