Fix workflow being skipped and marked as failed when agent starts before server (#6361)

When the agent started before the server was available, it retried the connection as expected. However, once the server came up and a workflow was picked up, the pipeline would immediately fail without running any steps — the agent logs showed `workflow context done` firing instantly after `received execution`. The root cause was a package-level `shutdownCtx` shared across retry iterations. On each failed attempt, `stopAgentFunc` replaced it with a context carrying a 5-second timeout, which started the clock immediately. By the time the agent successfully connected and received a workflow, `workflowCtx` was derived from this already-expired context, so execution failed before Docker even started a container. The fix removes the global mutable shutdown context and the `stopAgentFunc` indirection. `runner.Run()` no longer accepts a `shutdownCtx` parameter; instead, it creates a fresh one locally, only when needed for the `Done()` fallback call. The healthcheck server shutdown does the same. This makes the lifetime of each shutdown window explicit and local.
2026-06-03 16:35:37 +02:00 · 2026-03-31 19:56:39 +02:00
parent 2de59621fb
commit a7739a2bb7
2 changed files with 20 additions and 27 deletions
@@ -54,33 +54,14 @@ const (
 	authInterceptorRefreshInterval = time.Minute * 30
 )

-const (
-	shutdownTimeout = time.Second * 5
-)
-
-var (
-	stopAgentFunc      context.CancelCauseFunc = func(error) {}
-	shutdownCancelFunc context.CancelFunc      = func() {}
-	shutdownCtx                                = context.Background()
-)
-
 func run(ctx context.Context, c *cli.Command, backends []types.Backend) error {
 	log.Info().Str("version", version.String()).Msg("Starting Woodpecker agent")

 	agentCtx, ctxCancel := context.WithCancelCause(ctx)
-	stopAgentFunc = func(err error) {
-		msg := "shutdown of whole agent"
-		if err != nil {
-			log.Error().Err(err).Msg(msg)
-		} else {
-			log.Info().Msg(msg)
-		}
-		stopAgentFunc = func(error) {}
-		shutdownCtx, shutdownCancelFunc = context.WithTimeout(shutdownCtx, shutdownTimeout)
-		ctxCancel(err)
-	}
-	defer stopAgentFunc(nil)
-	defer shutdownCancelFunc()
+	defer func() {
+		log.Info().Msg("shutdown of whole agent")
+		ctxCancel(nil)
+	}()

 	serviceWaitingGroup := errgroup.Group{}

@@ -107,6 +88,10 @@ func run(ctx context.Context, c *cli.Command, backends []types.Backend) error {
 				go func() {
 					<-agentCtx.Done()
 					log.Info().Msg("shutdown healthcheck server ...")
+
+					shutdownCtx, shutdownCtxCancel := agent.GetShutdownContext()
+					defer shutdownCtxCancel()
+
 					if err := server.Shutdown(shutdownCtx); err != nil { //nolint:contextcheck
 						log.Error().Err(err).Msg("shutdown healthcheck server failed")
 					} else {
@@ -302,7 +287,7 @@ func run(ctx context.Context, c *cli.Command, backends []types.Backend) error {
 				}

 				log.Debug().Msg("polling new workflow")
-				if err := runner.Run(agentCtx, shutdownCtx); err != nil {
+				if err := runner.Run(agentCtx); err != nil {
 					if singleWorkflow {
 						log.Error().Err(err).Msg("runner done with error")
 						ctxCancel(nil)