kubelet: dockershim ExecSync should return context.DeadlineExceeded on timeout #96495
`pkg/kubelet/dockershim/exec.go`

```diff
@@ -19,26 +19,22 @@ limitations under the License.
 package dockershim
 
 import (
+	"context"
 	"fmt"
 	"io"
-	"strings"
 	"time"
 
 	dockertypes "github.com/docker/docker/api/types"
+	"k8s.io/klog/v2"
 
-	utilfeature "k8s.io/apiserver/pkg/util/feature"
 	"k8s.io/client-go/tools/remotecommand"
-	"k8s.io/kubernetes/pkg/features"
-	"k8s.io/klog/v2"
 	kubecontainer "k8s.io/kubernetes/pkg/kubelet/container"
-	"k8s.io/kubernetes/pkg/probe/exec"
 
 	"k8s.io/kubernetes/pkg/kubelet/dockershim/libdocker"
 )
 
 // ExecHandler knows how to execute a command in a running Docker container.
 type ExecHandler interface {
-	ExecInContainer(client libdocker.Interface, container *dockertypes.ContainerJSON, cmd []string, stdin io.Reader, stdout, stderr io.WriteCloser, tty bool, resize <-chan remotecommand.TerminalSize, timeout time.Duration) error
+	ExecInContainer(ctx context.Context, client libdocker.Interface, container *dockertypes.ContainerJSON, cmd []string, stdin io.Reader, stdout, stderr io.WriteCloser, tty bool, resize <-chan remotecommand.TerminalSize, timeout time.Duration) error
 }
 
 type dockerExitError struct {
```
```diff
@@ -65,7 +61,7 @@ func (d *dockerExitError) ExitStatus() int {
 type NativeExecHandler struct{}
 
 // ExecInContainer executes the cmd in container using the Docker's exec API
-func (*NativeExecHandler) ExecInContainer(client libdocker.Interface, container *dockertypes.ContainerJSON, cmd []string, stdin io.Reader, stdout, stderr io.WriteCloser, tty bool, resize <-chan remotecommand.TerminalSize, timeout time.Duration) error {
+func (*NativeExecHandler) ExecInContainer(ctx context.Context, client libdocker.Interface, container *dockertypes.ContainerJSON, cmd []string, stdin io.Reader, stdout, stderr io.WriteCloser, tty bool, resize <-chan remotecommand.TerminalSize, timeout time.Duration) error {
 	done := make(chan struct{})
 	defer close(done)
```
```diff
@@ -109,53 +105,55 @@ func (*NativeExecHandler) ExecInContainer(client libdocker.Interface, container
 		RawTerminal:  tty,
 		ExecStarted:  execStarted,
 	}
-	err = client.StartExec(execObj.ID, startOpts, streamOpts)
-	if err != nil {
-		return err
+
+	if timeout > 0 {
+		var cancel context.CancelFunc
+		ctx, cancel = context.WithTimeout(ctx, timeout)
+		defer cancel()
 	}
 
-	// if ExecProbeTimeout feature gate is disabled, preserve existing behavior to ignore exec timeouts
-	var execTimeout <-chan time.Time
-	if timeout > 0 && utilfeature.DefaultFeatureGate.Enabled(features.ExecProbeTimeout) {
-		execTimeout = time.After(timeout)
-	} else {
-		// skip exec timeout if provided timeout is 0
-		execTimeout = nil
-	}
+	// StartExec is a blocking call, so we need to run it concurrently and catch
+	// its error in a channel
+	execErr := make(chan error, 1)
+	go func() {
+		execErr <- client.StartExec(execObj.ID, startOpts, streamOpts)
+	}()
+
+	select {
+	case <-ctx.Done():
+		return ctx.Err()
+	case err := <-execErr:
+		if err != nil {
+			return err
+		}
+	}
 
 	// InspectExec may not always return latest state of exec, so call it a few times until
 	// it returns an exec inspect that shows that the process is no longer running.
+	retries := 0
+	maxRetries := 5
 	ticker := time.NewTicker(2 * time.Second)
 	defer ticker.Stop()
-	count := 0
 	for {
-		select {
-		case <-execTimeout:
-			return exec.NewTimeoutError(fmt.Errorf("command %q timed out", strings.Join(cmd, " ")), timeout)
-		// need to use "default" here instead of <-ticker.C, otherwise we delay the initial InspectExec by 2 seconds.
-		default:
-			inspect, inspectErr := client.InspectExec(execObj.ID)
-			if inspectErr != nil {
-				return inspectErr
-			}
-
-			if !inspect.Running {
-				if inspect.ExitCode != 0 {
-					return &dockerExitError{inspect}
-				}
-
-				return nil
-			}
-
-			// Only limit the amount of InspectExec calls if the exec timeout was not set.
-			// When a timeout is not set, we stop polling the exec session after 5 attempts and allow the process to continue running.
-			if execTimeout == nil {
-				count++
-				if count == 5 {
-					klog.Errorf("Exec session %s in container %s terminated but process still running!", execObj.ID, container.ID)
-					return nil
-				}
-			}
-
-			<-ticker.C
-		}
+		inspect, err := client.InspectExec(execObj.ID)
+		if err != nil {
+			return err
+		}
+
+		if !inspect.Running {
+			if inspect.ExitCode != 0 {
+				return &dockerExitError{inspect}
+			}
+
+			return nil
+		}
+
+		retries++
+		if retries == maxRetries {
+			klog.Errorf("Exec session %s in container %s terminated but process still running!", execObj.ID, container.ID)
+			return nil
+		}
+
+		<-ticker.C
 	}
 }
```
Review thread on the new `select` / `ctx.Err()` handling:

> **Reviewer:** nothing bad will happen, but maybe we need to call `cancel()` here.
>
> **Author:** `cancel` is called with `defer` on line 112.
>
> **Reviewer:** yes, totally. My comment is that we can cancel the timeout in case the exec has already finished =)
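To make the point concrete, here is a minimal, self-contained sketch of the pattern under discussion (the `runWithTimeout` helper and `blockingCall` stand-in are illustrative names, not from the PR): race a blocking call against a context, with a `cancel` that could also be invoked explicitly as soon as the call returns.

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// runWithTimeout mirrors the PR's approach: run a blocking call in a
// goroutine and race it against a (possibly deadline-bounded) context.
func runWithTimeout(ctx context.Context, timeout time.Duration, blockingCall func() error) error {
	if timeout > 0 {
		var cancel context.CancelFunc
		ctx, cancel = context.WithTimeout(ctx, timeout)
		// The deferred cancel fires on return; the reviewer's suggestion is
		// that it could also be called explicitly once blockingCall finishes,
		// releasing the timer without waiting for function exit.
		defer cancel()
	}

	errCh := make(chan error, 1) // buffered so the goroutine can't leak
	go func() { errCh <- blockingCall() }()

	select {
	case <-ctx.Done():
		return ctx.Err() // context.DeadlineExceeded on timeout
	case err := <-errCh:
		return err
	}
}

func main() {
	err := runWithTimeout(context.Background(), 100*time.Millisecond, func() error {
		time.Sleep(time.Second) // simulate a hung exec
		return nil
	})
	fmt.Println(errors.Is(err, context.DeadlineExceeded)) // true
}
```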
Review thread on the removed `case <-execTimeout:` branch:

> **Author:** This timeout check was incorrect because `StartExec` is a blocking call, so the timeout channel was only consulted after the exec had already returned.
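A tiny standalone demo (hypothetical, not the dockershim code) of the failure mode described above: a `time.After` channel that is only checked after a blocking call returns can neither interrupt a hung call nor distinguish a slow-but-successful one from a real timeout.

```go
package main

import (
	"fmt"
	"time"
)

func main() {
	timeout := 100 * time.Millisecond
	execTimeout := time.After(timeout)

	// Stand-in for the blocking client.StartExec call: nothing reads
	// execTimeout while this runs, so the timeout cannot fire "during" it.
	time.Sleep(500 * time.Millisecond)

	// The timer has long expired by now, so a command that actually
	// completed is misreported as timed out.
	select {
	case <-execTimeout:
		fmt.Println("reported a timeout, but the command already finished")
	default:
		fmt.Println("no timeout observed")
	}
}
```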
`test/e2e/common/container_probe.go`
@@ -123,6 +123,7 @@ var _ = framework.KubeDescribe("Probing container", func() { | |
livenessProbe := &v1.Probe{ | ||
Handler: execHandler([]string{"cat", "/tmp/health"}), | ||
InitialDelaySeconds: 15, | ||
TimeoutSeconds: 5, // default 1s can be pretty aggressive in CI environments with low resources | ||
FailureThreshold: 1, | ||
} | ||
pod := busyBoxPodSpec(nil, livenessProbe, cmd) | ||
|
```diff
@@ -139,6 +140,7 @@ var _ = framework.KubeDescribe("Probing container", func() {
 		livenessProbe := &v1.Probe{
 			Handler:             execHandler([]string{"cat", "/tmp/health"}),
 			InitialDelaySeconds: 15,
+			TimeoutSeconds:      5, // default 1s can be pretty aggressive in CI environments with low resources
 			FailureThreshold:    1,
 		}
 		pod := busyBoxPodSpec(nil, livenessProbe, cmd)
```
Review thread on the added `TimeoutSeconds`:

> **Author:** This test started to flake once dockershim was updated to correctly return a DeadlineExceeded gRPC status error. It seems like the default 1s timeout was too aggressive here, my guess is because of CI environments with low resources. Increasing the timeout here seems safe: this test really expects the probe's exit code to be 0, so it should allow the probe to take longer than the default 1s, as long as it doesn't return a non-zero exit code.
>
> **Reviewer:** should the same be on line 124?
>
> **Author:** hmmm.. so that test wasn't failing because the test is expecting a restart, but now that you mention it, it may be failing due to a timeout and not a non-zero exit code, so I think it makes sense to set a similar timeout there.
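For context, a sketch of the probe the test builds, assuming `execHandler` wraps the command in a `v1.Handler` with an `ExecAction` (as the e2e helpers do); this uses the core/v1 API of that era, where the field was still named `Handler` rather than `ProbeHandler`:

```go
package probes

import (
	v1 "k8s.io/api/core/v1"
)

// livenessProbe reproduces the probe shape from the diff. TimeoutSeconds
// bounds a single probe execution, while FailureThreshold: 1 restarts the
// container on the first failed probe — which is why a spurious 1s timeout
// is enough to flake the test.
func livenessProbe() *v1.Probe {
	return &v1.Probe{
		Handler: v1.Handler{
			Exec: &v1.ExecAction{Command: []string{"cat", "/tmp/health"}},
		},
		InitialDelaySeconds: 15,
		TimeoutSeconds:      5, // default 1s is aggressive on slow CI nodes
		FailureThreshold:    1,
	}
}
```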
Review thread on the `select` returning `ctx.Err()`:

> **Reviewer:** remote_runtime.go expects a gRPC error with status code `DeadlineExceeded` if exec timed out: `kubernetes/pkg/kubelet/cri/remote/remote_runtime.go`, lines 394 to 397 at 4bb30c3.
>
> **Reviewer:** maybe this comment can be added into the code; it is not very obvious when you're simply looking at this method.
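Roughly, the kubelet-side expectation the comment points at looks like the following (a paraphrased sketch of the cited remote_runtime.go behavior, not a verbatim copy; `interpretExecSyncErr` is an illustrative name):

```go
package remote

import (
	"fmt"
	"time"

	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"
	"k8s.io/kubernetes/pkg/probe/exec"
)

// interpretExecSyncErr mirrors how the CRI client treats an ExecSync error:
// a gRPC DeadlineExceeded status is converted into the probe timeout error
// type, which is why the shim must surface ctx.Err() (DeadlineExceeded) on
// timeout rather than constructing its own timeout error.
func interpretExecSyncErr(err error, cmd string, timeout time.Duration) error {
	if status.Code(err) == codes.DeadlineExceeded {
		return exec.NewTimeoutError(fmt.Errorf("command %q timed out", cmd), timeout)
	}
	return err
}
```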