You've already forked woodpecker
mirror of
https://github.com/woodpecker-ci/woodpecker.git
synced 2026-06-03 16:35:37 +02:00
fix(kubernetes): retry WaitStep when container terminated state not yet finalized (#6672)
## Problem

Kubelet sets `pod.Status.Phase = Succeeded` before finalizing `containerStatuses[0].state.terminated`. When the informer sees the phase change and `WaitStep` calls `Get()`, the container status may still show `Terminated == nil`, causing a hard error:

```
no terminated state found for container wp-XXX/wp-XXX
```

This is a known race in the Kubernetes API server/kubelet eventually-consistent model. The window is normally milliseconds but widens to seconds under load (apiserver latency spikes, ResourceQuota admission storms, node pressure).

## Fix

Wrap the post-informer `Get()` + `Terminated == nil` check in `backoff.Retry` with exponential backoff (200ms initial, 5s max interval, 15s total budget). This mirrors the retry pattern already used for `TailStep` log stream recovery (#5550).
This commit is contained in:
@@ -353,7 +353,32 @@ func (e *kube) WaitStep(ctx context.Context, step *types.Step, taskUUID string)
|
||||
return nil, ctx.Err()
|
||||
}
|
||||
|
||||
pod, err := e.client.CoreV1().Pods(e.config.GetNamespace(step.OrgID)).Get(ctx, podName, kube_meta_v1.GetOptions{})
|
||||
// After the informer signals completion, kubelet may not have finalized
|
||||
// containerStatuses yet (phase=Succeeded before state.terminated is set).
|
||||
// Retry with backoff to allow kubelet to catch up.
|
||||
pod, err := backoff.Retry(ctx,
|
||||
func() (*kube_core_v1.Pod, error) {
|
||||
p, err := e.client.CoreV1().Pods(e.config.GetNamespace(step.OrgID)).Get(ctx, podName, kube_meta_v1.GetOptions{})
|
||||
if err != nil {
|
||||
if kube_errors.IsNotFound(err) {
|
||||
return nil, backoff.Permanent(err)
|
||||
}
|
||||
return nil, err
|
||||
}
|
||||
if len(p.Status.ContainerStatuses) == 0 {
|
||||
return nil, fmt.Errorf("no container statuses found for pod %s", podName)
|
||||
}
|
||||
if p.Status.ContainerStatuses[0].State.Terminated == nil {
|
||||
return nil, fmt.Errorf("container %s/%s terminated state not yet finalized", podName, p.Status.ContainerStatuses[0].Name)
|
||||
}
|
||||
return p, nil
|
||||
},
|
||||
backoff.WithBackOff(backoff.NewExponentialBackOff()),
|
||||
backoff.WithMaxElapsedTime(maxRetryDuration),
|
||||
backoff.WithNotify(func(err error, delay time.Duration) {
|
||||
log.Warn().Err(err).Str("pod", podName).Dur("backoff", delay).Msg("waiting for container terminated state, retrying with backoff")
|
||||
}),
|
||||
)
|
||||
if err != nil {
|
||||
if kube_errors.IsNotFound(err) {
|
||||
return &types.State{ExitCode: 0, Exited: true}, nil
|
||||
@@ -365,18 +390,8 @@ func (e *kube) WaitStep(ctx context.Context, step *types.Step, taskUUID string)
|
||||
return nil, fmt.Errorf("could not pull image for pod %s", podName)
|
||||
}
|
||||
|
||||
if len(pod.Status.ContainerStatuses) == 0 {
|
||||
return nil, fmt.Errorf("no container statuses found for pod %s", podName)
|
||||
}
|
||||
|
||||
cs := pod.Status.ContainerStatuses[0]
|
||||
|
||||
if cs.State.Terminated == nil {
|
||||
err := fmt.Errorf("no terminated state found for container %s/%s", podName, cs.Name)
|
||||
log.Error().Str("taskUUID", taskUUID).Str("pod", podName).Str("container", cs.Name).Interface("state", cs.State).Msg(err.Error())
|
||||
return nil, err
|
||||
}
|
||||
|
||||
bs := &types.State{
|
||||
ExitCode: int(cs.State.Terminated.ExitCode),
|
||||
Exited: true,
|
||||
|
||||
Reference in New Issue
Block a user