2024-03-15 11:42:22 +01:00
// Copyright 2024 - offen.software <hioffen@posteo.de>
2024-02-16 15:35:42 +01:00
// SPDX-License-Identifier: MPL-2.0
2024-01-31 12:17:41 +01:00
package main
import (
"context"
"errors"
"fmt"
"io"
"os"
"sync"
"time"
"github.com/docker/cli/cli/command/service/progress"
2025-11-20 17:16:08 +01:00
"github.com/moby/moby/api/types/swarm"
"github.com/moby/moby/client"
2024-02-16 15:35:42 +01:00
"github.com/offen/docker-volume-backup/internal/errwrap"
2024-01-31 12:17:41 +01:00
)
func scaleService ( cli * client . Client , serviceID string , replicas uint64 ) ( [ ] string , error ) {
2025-11-20 17:16:08 +01:00
result , err := cli . ServiceInspect ( context . Background ( ) , serviceID , client . ServiceInspectOptions { } )
2024-01-31 12:17:41 +01:00
if err != nil {
2024-02-16 15:35:42 +01:00
return nil , errwrap . Wrap ( err , fmt . Sprintf ( "error inspecting service %s" , serviceID ) )
2024-01-31 12:17:41 +01:00
}
2025-11-20 17:16:08 +01:00
service := result . Service
2024-01-31 12:17:41 +01:00
serviceMode := & service . Spec . Mode
switch {
case serviceMode . Replicated != nil :
serviceMode . Replicated . Replicas = & replicas
default :
2024-02-16 15:35:42 +01:00
return nil , errwrap . Wrap ( nil , fmt . Sprintf ( "service to be scaled %s has to be in replicated mode" , service . Spec . Name ) )
2024-01-31 12:17:41 +01:00
}
2025-11-20 17:16:08 +01:00
response , err := cli . ServiceUpdate ( context . Background ( ) , service . ID , client . ServiceUpdateOptions { Version : service . Version , Spec : service . Spec } )
2024-01-31 12:17:41 +01:00
if err != nil {
2024-02-16 15:35:42 +01:00
return nil , errwrap . Wrap ( err , "error updating service" )
2024-01-31 12:17:41 +01:00
}
discardWriter := & noopWriteCloser { io . Discard }
if err := progress . ServiceProgress ( context . Background ( ) , cli , service . ID , discardWriter ) ; err != nil {
return nil , err
}
return response . Warnings , nil
}
func awaitContainerCountForService ( cli * client . Client , serviceID string , count int , timeoutAfter time . Duration ) error {
poll := time . NewTicker ( time . Second )
timeout := time . NewTimer ( timeoutAfter )
defer timeout . Stop ( )
defer poll . Stop ( )
for {
select {
case <- timeout . C :
2024-02-16 15:35:42 +01:00
return errwrap . Wrap (
nil ,
fmt . Sprintf (
"timed out after waiting %s for service %s to reach desired container count of %d" ,
timeoutAfter ,
serviceID ,
count ,
) ,
2024-01-31 12:17:41 +01:00
)
case <- poll . C :
2025-11-20 17:16:08 +01:00
containers , err := cli . ContainerList ( context . Background ( ) , client . ContainerListOptions {
Filters : client . Filters { } . Add ( "label" , fmt . Sprintf ( "com.docker.swarm.service.id=%s" , serviceID ) ) ,
2024-01-31 12:17:41 +01:00
} )
if err != nil {
2024-02-16 15:35:42 +01:00
return errwrap . Wrap ( err , "error listing containers" )
2024-01-31 12:17:41 +01:00
}
2025-11-20 17:16:08 +01:00
if len ( containers . Items ) == count {
2024-01-31 12:17:41 +01:00
return nil
}
}
}
}
2024-02-27 22:12:36 +01:00
func isSwarm ( c interface {
2025-11-20 17:16:08 +01:00
Info ( context . Context , client . InfoOptions ) ( client . SystemInfoResult , error )
2024-02-27 22:12:36 +01:00
} ) ( bool , error ) {
2025-11-20 17:16:08 +01:00
result , err := c . Info ( context . Background ( ) , client . InfoOptions { } )
2024-02-27 22:12:36 +01:00
if err != nil {
return false , errwrap . Wrap ( err , "error getting docker info" )
}
2025-11-20 17:16:08 +01:00
return result . Info . Swarm . LocalNodeState != "" && result . Info . Swarm . LocalNodeState != swarm . LocalNodeStateInactive && result . Info . Swarm . ControlAvailable , nil
2024-02-27 22:12:36 +01:00
}
2025-11-01 10:52:56 +01:00
// hasLabel reports whether labels contains key and the value stored under it
// equals value. A missing key never matches, not even an empty value.
func hasLabel(labels map[string]string, key, value string) bool {
	if actual, found := labels[key]; found {
		return actual == value
	}
	return false
}
// checkStopLabels inspects labels for the two stop markers. The first result
// reports whether "docker-volume-backup.stop-during-backup" is set to
// stopDuringBackupLabelValue, the second whether
// "docker-volume-backup.stop-during-backup-no-restart" is set to
// stopDuringBackupNoRestartLabelValue. If both match, both flags are still
// returned together with a non-nil error, as the combination is not allowed.
func checkStopLabels(labels map[string]string, stopDuringBackupLabelValue string, stopDuringBackupNoRestartLabelValue string) (bool, bool, error) {
	stop := hasLabel(labels, "docker-volume-backup.stop-during-backup", stopDuringBackupLabelValue)
	stopNoRestart := hasLabel(labels, "docker-volume-backup.stop-during-backup-no-restart", stopDuringBackupNoRestartLabelValue)

	var conflict error
	if stop && stopNoRestart {
		conflict = errwrap.Wrap(nil, "both docker-volume-backup.stop-during-backup and docker-volume-backup.stop-during-backup-no-restart have been set, cannot continue")
	}
	return stop, stopNoRestart, conflict
}
2024-01-31 12:17:41 +01:00
// stopContainersAndServices stops all Docker containers that are marked as to being
// stopped during the backup and returns a function that can be called to
// restart everything that has been stopped.
func ( s * script ) stopContainersAndServices ( ) ( func ( ) error , error ) {
if s . cli == nil {
return noop , nil
}
2024-02-27 22:12:36 +01:00
isDockerSwarm , err := isSwarm ( s . cli )
2024-01-31 12:17:41 +01:00
if err != nil {
2024-02-27 22:12:36 +01:00
return noop , errwrap . Wrap ( err , "error determining swarm state" )
2024-01-31 12:17:41 +01:00
}
labelValue := s . c . BackupStopDuringBackupLabel
if s . c . BackupStopContainerLabel != "" {
s . logger . Warn (
"Using BACKUP_STOP_CONTAINER_LABEL has been deprecated and will be removed in the next major version." ,
)
s . logger . Warn (
"Please use BACKUP_STOP_DURING_BACKUP_LABEL instead. Refer to the docs for an upgrade guide." ,
)
if _ , ok := os . LookupEnv ( "BACKUP_STOP_DURING_BACKUP_LABEL" ) ; ok {
2024-02-16 15:35:42 +01:00
return noop , errwrap . Wrap ( nil , "both BACKUP_STOP_DURING_BACKUP_LABEL and BACKUP_STOP_CONTAINER_LABEL have been set, cannot continue" )
2024-01-31 12:17:41 +01:00
}
labelValue = s . c . BackupStopContainerLabel
}
2025-11-01 10:52:56 +01:00
stopDuringBackupLabel := fmt . Sprintf (
2024-01-31 12:17:41 +01:00
"docker-volume-backup.stop-during-backup=%s" ,
labelValue ,
)
2025-11-01 10:52:56 +01:00
stopDuringBackupNoRestartLabel := fmt . Sprintf (
"docker-volume-backup.stop-during-backup-no-restart=%s" ,
s . c . BackupStopDuringBackupNoRestartLabel ,
)
2025-11-20 17:16:08 +01:00
allContainers , err := s . cli . ContainerList ( context . Background ( ) , client . ContainerListOptions { } )
2024-01-31 12:17:41 +01:00
if err != nil {
2024-02-16 15:35:42 +01:00
return noop , errwrap . Wrap ( err , "error querying for containers" )
2024-01-31 12:17:41 +01:00
}
2025-11-01 10:52:56 +01:00
var containersToStop [ ] handledContainer
2025-11-20 17:16:08 +01:00
for _ , c := range allContainers . Items {
2025-11-01 10:52:56 +01:00
hasStopDuringBackupLabel , hasStopDuringBackupNoRestartLabel , err := checkStopLabels ( c . Labels , labelValue , s . c . BackupStopDuringBackupNoRestartLabel )
if err != nil {
return noop , errwrap . Wrap ( err , "error querying for containers to stop" )
}
if ! hasStopDuringBackupLabel && ! hasStopDuringBackupNoRestartLabel {
continue
}
containersToStop = append ( containersToStop , handledContainer {
summary : c ,
restart : ! hasStopDuringBackupNoRestartLabel ,
} )
2024-01-31 12:17:41 +01:00
}
var allServices [ ] swarm . Service
var servicesToScaleDown [ ] handledSwarmService
if isDockerSwarm {
2025-11-20 17:16:08 +01:00
result , err := s . cli . ServiceList ( context . Background ( ) , client . ServiceListOptions { Status : true } )
allServices := result . Items
2024-01-31 12:17:41 +01:00
if err != nil {
2024-02-16 15:35:42 +01:00
return noop , errwrap . Wrap ( err , "error querying for services" )
2024-01-31 12:17:41 +01:00
}
2025-11-01 10:52:56 +01:00
for _ , service := range allServices {
hasStopDuringBackupLabel , hasStopDuringBackupNoRestartLabel , err := checkStopLabels ( service . Spec . Labels , labelValue , s . c . BackupStopDuringBackupNoRestartLabel )
if err != nil {
return noop , errwrap . Wrap ( err , "error querying for services to scale down" )
}
if ! hasStopDuringBackupLabel && ! hasStopDuringBackupNoRestartLabel {
continue
}
if service . Spec . Mode . Replicated == nil {
2024-04-15 15:08:37 +02:00
return noop , errwrap . Wrap (
nil ,
2025-11-01 10:52:56 +01:00
fmt . Sprintf ( "only replicated services can be restarted, but found a label on service %s" , service . Spec . Name ) ,
2024-04-15 15:08:37 +02:00
)
}
2025-11-01 10:52:56 +01:00
2024-01-31 12:17:41 +01:00
servicesToScaleDown = append ( servicesToScaleDown , handledSwarmService {
2025-11-01 10:52:56 +01:00
serviceID : service . ID ,
initialReplicaCount : * service . Spec . Mode . Replicated . Replicas ,
restart : ! hasStopDuringBackupNoRestartLabel ,
2024-01-31 12:17:41 +01:00
} )
}
}
if len ( containersToStop ) == 0 && len ( servicesToScaleDown ) == 0 {
return noop , nil
}
if isDockerSwarm {
for _ , container := range containersToStop {
2025-11-01 10:52:56 +01:00
if swarmServiceID , ok := container . summary . Labels [ "com.docker.swarm.service.id" ] ; ok {
2025-11-20 17:16:08 +01:00
parentService , err := s . cli . ServiceInspect ( context . Background ( ) , swarmServiceID , client . ServiceInspectOptions { } )
2024-01-31 12:17:41 +01:00
if err != nil {
2024-02-16 15:35:42 +01:00
return noop , errwrap . Wrap ( err , fmt . Sprintf ( "error querying for parent service with ID %s" , swarmServiceID ) )
2024-01-31 12:17:41 +01:00
}
2025-11-20 17:16:08 +01:00
for label := range parentService . Service . Spec . Labels {
2024-01-31 12:17:41 +01:00
if label == "docker-volume-backup.stop-during-backup" {
2024-02-16 15:35:42 +01:00
return noop , errwrap . Wrap (
nil ,
fmt . Sprintf (
"container %s is labeled to stop but has parent service %s which is also labeled, cannot continue" ,
2025-11-01 10:52:56 +01:00
container . summary . Names [ 0 ] ,
2025-11-20 17:16:08 +01:00
parentService . Service . Spec . Name ,
2024-02-16 15:35:42 +01:00
) ,
2024-01-31 12:17:41 +01:00
)
}
}
}
}
}
s . logger . Info (
fmt . Sprintf (
2025-11-01 10:52:56 +01:00
"Stopping %d out of %d running container(s) as they were labeled %s or %s." ,
2024-01-31 12:17:41 +01:00
len ( containersToStop ) ,
2025-11-20 17:16:08 +01:00
len ( allContainers . Items ) ,
2025-11-01 10:52:56 +01:00
stopDuringBackupLabel ,
stopDuringBackupNoRestartLabel ,
2024-01-31 12:17:41 +01:00
) ,
)
if isDockerSwarm {
s . logger . Info (
fmt . Sprintf (
2025-11-01 10:52:56 +01:00
"Scaling down %d out of %d active service(s) as they were labeled %s or %s." ,
2024-01-31 12:17:41 +01:00
len ( servicesToScaleDown ) ,
len ( allServices ) ,
2025-11-01 10:52:56 +01:00
stopDuringBackupLabel ,
stopDuringBackupNoRestartLabel ,
2024-01-31 12:17:41 +01:00
) ,
)
}
2025-11-01 10:52:56 +01:00
var stoppedContainers [ ] handledContainer
2024-01-31 12:17:41 +01:00
var stopErrors [ ] error
for _ , container := range containersToStop {
2025-11-20 17:16:08 +01:00
if _ , err := s . cli . ContainerStop ( context . Background ( ) , container . summary . ID , client . ContainerStopOptions { } ) ; err != nil {
2024-01-31 12:17:41 +01:00
stopErrors = append ( stopErrors , err )
} else {
stoppedContainers = append ( stoppedContainers , container )
}
}
var scaledDownServices [ ] handledSwarmService
var scaleDownErrors concurrentSlice [ error ]
if isDockerSwarm {
wg := sync . WaitGroup { }
for _ , svc := range servicesToScaleDown {
wg . Add ( 1 )
go func ( svc handledSwarmService ) {
defer wg . Done ( )
warnings , err := scaleService ( s . cli , svc . serviceID , 0 )
if err != nil {
scaleDownErrors . append ( err )
2024-02-05 14:27:06 +01:00
return
2024-01-31 12:17:41 +01:00
}
2024-02-05 14:27:06 +01:00
scaledDownServices = append ( scaledDownServices , svc )
2024-01-31 12:17:41 +01:00
for _ , warning := range warnings {
s . logger . Warn (
fmt . Sprintf ( "The Docker API returned a warning when scaling down service %s: %s" , svc . serviceID , warning ) ,
)
}
// progress.ServiceProgress returns too early, so we need to manually check
// whether all containers belonging to the service have actually been removed
if err := awaitContainerCountForService ( s . cli , svc . serviceID , 0 , s . c . BackupStopServiceTimeout ) ; err != nil {
scaleDownErrors . append ( err )
}
} ( svc )
}
wg . Wait ( )
}
s . stats . Containers = ContainersStats {
2025-11-20 17:16:08 +01:00
All : uint ( len ( allContainers . Items ) ) ,
2024-01-31 12:17:41 +01:00
ToStop : uint ( len ( containersToStop ) ) ,
Stopped : uint ( len ( stoppedContainers ) ) ,
StopErrors : uint ( len ( stopErrors ) ) ,
}
s . stats . Services = ServicesStats {
All : uint ( len ( allServices ) ) ,
ToScaleDown : uint ( len ( servicesToScaleDown ) ) ,
ScaledDown : uint ( len ( scaledDownServices ) ) ,
ScaleDownErrors : uint ( len ( scaleDownErrors . value ( ) ) ) ,
}
var initialErr error
allErrors := append ( stopErrors , scaleDownErrors . value ( ) ... )
if len ( allErrors ) != 0 {
2024-02-16 15:35:42 +01:00
initialErr = errwrap . Wrap (
2024-01-31 12:17:41 +01:00
errors . Join ( allErrors ... ) ,
2024-02-16 15:35:42 +01:00
fmt . Sprintf (
"%d error(s) stopping containers" ,
len ( allErrors ) ,
) ,
2024-01-31 12:17:41 +01:00
)
}
return func ( ) error {
var restartErrors [ ] error
2025-11-01 10:52:56 +01:00
var restartedContainers [ ] handledContainer
2024-01-31 12:17:41 +01:00
matchedServices := map [ string ] bool { }
for _ , container := range stoppedContainers {
2025-11-01 10:52:56 +01:00
if ! container . restart {
continue
}
if swarmServiceID , ok := container . summary . Labels [ "com.docker.swarm.service.id" ] ; ok && isDockerSwarm {
2024-01-31 12:17:41 +01:00
if _ , ok := matchedServices [ swarmServiceID ] ; ok {
continue
}
matchedServices [ swarmServiceID ] = true
// in case a container was part of a swarm service, the service requires to
// be force updated instead of restarting the container as it would otherwise
// remain in a "completed" state
2025-11-20 17:16:08 +01:00
result , err := s . cli . ServiceInspect ( context . Background ( ) , swarmServiceID , client . ServiceInspectOptions { } )
service := result . Service
2024-01-31 12:17:41 +01:00
if err != nil {
restartErrors = append (
restartErrors ,
2024-02-16 15:35:42 +01:00
errwrap . Wrap ( err , "error looking up parent service" ) ,
2024-01-31 12:17:41 +01:00
)
continue
}
service . Spec . TaskTemplate . ForceUpdate += 1
if _ , err := s . cli . ServiceUpdate (
context . Background ( ) , service . ID ,
2025-11-20 17:16:08 +01:00
client . ServiceUpdateOptions { Spec : service . Spec , Version : service . Version } ,
2024-01-31 12:17:41 +01:00
) ; err != nil {
restartErrors = append ( restartErrors , err )
}
continue
}
2025-11-20 17:16:08 +01:00
if _ , err := s . cli . ContainerStart ( context . Background ( ) , container . summary . ID , client . ContainerStartOptions { } ) ; err != nil {
2024-01-31 12:17:41 +01:00
restartErrors = append ( restartErrors , err )
2025-11-01 10:52:56 +01:00
} else {
restartedContainers = append ( restartedContainers , container )
2024-01-31 12:17:41 +01:00
}
}
var scaleUpErrors concurrentSlice [ error ]
2025-11-01 10:52:56 +01:00
var scaledUpServices [ ] handledSwarmService
2024-01-31 12:17:41 +01:00
if isDockerSwarm {
wg := & sync . WaitGroup { }
for _ , svc := range servicesToScaleDown {
2025-11-01 10:52:56 +01:00
if ! svc . restart {
continue
}
2024-01-31 12:17:41 +01:00
wg . Add ( 1 )
go func ( svc handledSwarmService ) {
defer wg . Done ( )
warnings , err := scaleService ( s . cli , svc . serviceID , svc . initialReplicaCount )
if err != nil {
scaleDownErrors . append ( err )
return
}
2025-11-01 10:52:56 +01:00
scaledUpServices = append ( scaledUpServices , svc )
2024-01-31 12:17:41 +01:00
for _ , warning := range warnings {
s . logger . Warn (
fmt . Sprintf ( "The Docker API returned a warning when scaling up service %s: %s" , svc . serviceID , warning ) ,
)
}
} ( svc )
}
wg . Wait ( )
}
allErrors := append ( restartErrors , scaleUpErrors . value ( ) ... )
if len ( allErrors ) != 0 {
2024-02-16 15:35:42 +01:00
return errwrap . Wrap (
2024-01-31 12:17:41 +01:00
errors . Join ( allErrors ... ) ,
2024-02-16 15:35:42 +01:00
fmt . Sprintf (
"%d error(s) restarting containers and services" ,
len ( allErrors ) ,
) ,
2024-01-31 12:17:41 +01:00
)
}
s . logger . Info (
fmt . Sprintf (
2025-11-01 10:52:56 +01:00
"Restarted %d out of %d stopped container(s)." ,
len ( restartedContainers ) ,
2024-01-31 12:17:41 +01:00
len ( stoppedContainers ) ,
) ,
)
if isDockerSwarm {
s . logger . Info (
fmt . Sprintf (
2025-11-01 10:52:56 +01:00
"Scaled %d out of %d scaled down service(s) back up." ,
len ( scaledUpServices ) ,
2024-01-31 12:17:41 +01:00
len ( scaledDownServices ) ,
) ,
)
}
return nil
} , initialErr
}