1
0
mirror of https://github.com/woodpecker-ci/woodpecker.git synced 2026-06-03 16:35:37 +02:00

fix(kubernetes): retry WaitStep when container terminated state not yet finalized (#6672)

## Problem

Kubelet sets `pod.Status.Phase = Succeeded` before finalizing `containerStatuses[0].state.terminated`. When the informer sees the phase change and `WaitStep` calls `Get()`, the container status may still show `Terminated == nil`, causing a hard error:

```
no terminated state found for container wp-XXX/wp-XXX
```

This is a known race in the eventually consistent status reporting between the kubelet and the Kubernetes API server. The window is normally a few milliseconds but can widen to seconds under load (API server latency spikes, ResourceQuota admission storms, node pressure).

## Fix

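Wrap the post-informer `Get()` and the `Terminated == nil` check in `backoff.Retry` with exponential backoff (200ms initial, 5s max interval, 15s total budget). This mirrors the retry pattern already used for `TailStep` log stream recovery (#5550). Note: the diff below passes an unconfigured `backoff.NewExponentialBackOff()` and bounds only the total time via `maxRetryDuration`, so the initial and max interval values stated here should be confirmed against the library defaults.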
This commit is contained in:
Simon C. Kemper
2026-05-30 12:35:53 +02:00
committed by GitHub
parent 3ed910ac3b
commit a765cb885a
+26 -11
View File
@@ -353,7 +353,32 @@ func (e *kube) WaitStep(ctx context.Context, step *types.Step, taskUUID string)
return nil, ctx.Err()
}
pod, err := e.client.CoreV1().Pods(e.config.GetNamespace(step.OrgID)).Get(ctx, podName, kube_meta_v1.GetOptions{})
// After the informer signals completion, kubelet may not have finalized
// containerStatuses yet (phase=Succeeded before state.terminated is set).
// Retry with backoff to allow kubelet to catch up.
pod, err := backoff.Retry(ctx,
func() (*kube_core_v1.Pod, error) {
p, err := e.client.CoreV1().Pods(e.config.GetNamespace(step.OrgID)).Get(ctx, podName, kube_meta_v1.GetOptions{})
if err != nil {
if kube_errors.IsNotFound(err) {
return nil, backoff.Permanent(err)
}
return nil, err
}
if len(p.Status.ContainerStatuses) == 0 {
return nil, fmt.Errorf("no container statuses found for pod %s", podName)
}
if p.Status.ContainerStatuses[0].State.Terminated == nil {
return nil, fmt.Errorf("container %s/%s terminated state not yet finalized", podName, p.Status.ContainerStatuses[0].Name)
}
return p, nil
},
backoff.WithBackOff(backoff.NewExponentialBackOff()),
backoff.WithMaxElapsedTime(maxRetryDuration),
backoff.WithNotify(func(err error, delay time.Duration) {
log.Warn().Err(err).Str("pod", podName).Dur("backoff", delay).Msg("waiting for container terminated state, retrying with backoff")
}),
)
if err != nil {
if kube_errors.IsNotFound(err) {
return &types.State{ExitCode: 0, Exited: true}, nil
@@ -365,18 +390,8 @@ func (e *kube) WaitStep(ctx context.Context, step *types.Step, taskUUID string)
return nil, fmt.Errorf("could not pull image for pod %s", podName)
}
if len(pod.Status.ContainerStatuses) == 0 {
return nil, fmt.Errorf("no container statuses found for pod %s", podName)
}
cs := pod.Status.ContainerStatuses[0]
if cs.State.Terminated == nil {
err := fmt.Errorf("no terminated state found for container %s/%s", podName, cs.Name)
log.Error().Str("taskUUID", taskUUID).Str("pod", podName).Str("container", cs.Name).Interface("state", cs.State).Msg(err.Error())
return nil, err
}
bs := &types.State{
ExitCode: int(cs.State.Terminated.ExitCode),
Exited: true,