From b842a7dd1584afb627e2f77bbedf87c86a6f1f20 Mon Sep 17 00:00:00 2001 From: Clayton Coleman Date: Tue, 2 Jun 2015 20:36:58 -0400 Subject: [PATCH 1/8] Revert "Revert "Gracefully delete pods from the Kubelet"" This reverts commit 98115facfdfc206278a38a586bcc1fbc62a46bfd. --- .../saltbase/salt/fluentd-es/fluentd-es.yaml | 1 + .../salt/fluentd-gcp/fluentd-gcp.yaml | 1 + cmd/integration/integration.go | 10 +- .../for-tests/network-tester/slow-pod.json | 28 ++ contrib/for-tests/network-tester/slow-rc.json | 42 +++ hack/lib/test.sh | 2 +- hack/test-cmd.sh | 21 +- pkg/api/deep_copy_generated.go | 6 + pkg/api/rest/create.go | 2 + pkg/api/rest/delete.go | 27 +- pkg/api/rest/resttest/resttest.go | 122 ++++++++- pkg/api/serialization_test.go | 3 + pkg/api/testing/fuzzer.go | 9 + pkg/api/types.go | 4 + pkg/api/v1/conversion_generated.go | 12 + pkg/api/v1/deep_copy_generated.go | 6 + pkg/api/v1/defaults.go | 4 + pkg/api/v1/types.go | 8 +- pkg/api/validation/validation.go | 10 + pkg/controller/controller_utils.go | 3 +- .../endpoint/endpoints_controller.go | 6 +- .../replication/replication_controller.go | 15 ++ pkg/kubectl/cmd/get_test.go | 27 +- pkg/kubectl/cmd/util/helpers_test.go | 11 +- pkg/kubectl/describe.go | 7 +- pkg/kubectl/resource/builder_test.go | 11 +- pkg/kubectl/resource/helper_test.go | 12 +- pkg/kubectl/resource_printer.go | 3 + pkg/kubectl/rolling_updater_test.go | 6 +- pkg/kubelet/config/common_test.go | 12 +- pkg/kubelet/config/config.go | 23 +- pkg/kubelet/config/file_test.go | 8 +- pkg/kubelet/config/http_test.go | 25 +- pkg/kubelet/container/fake_runtime.go | 6 +- pkg/kubelet/container/runtime.go | 4 +- pkg/kubelet/dockertools/manager.go | 240 ++++++++++++------ pkg/kubelet/dockertools/manager_test.go | 20 +- pkg/kubelet/kubelet.go | 8 +- pkg/kubelet/rkt/rkt.go | 8 +- pkg/kubelet/status_manager.go | 27 +- pkg/registry/controller/etcd/etcd_test.go | 2 +- pkg/registry/event/etcd/etcd_test.go | 1 + .../persistentvolume/etcd/etcd_test.go | 2 +- .../persistentvolumeclaim/etcd/etcd_test.go | 2 +- pkg/registry/pod/etcd/etcd_test.go | 14 +- pkg/registry/pod/rest.go | 23 +- pkg/registry/resourcequota/etcd/etcd_test.go | 2 +- pkg/storage/etcd/etcd_helper.go | 9 +- pkg/storage/etcd/etcd_helper_test.go | 54 ++-- pkg/storage/etcd/etcd_watcher.go | 3 +- pkg/tools/fake_etcd_client.go | 9 + plugin/pkg/scheduler/factory/factory_test.go | 6 +- test/e2e/kubectl.go | 2 +- test/e2e/pd.go | 20 +- test/e2e/pods.go | 28 +- test/e2e/util.go | 12 +- test/images/network-tester/webserver.go | 22 +- test/integration/auth_test.go | 2 +- test/integration/etcd_tools_test.go | 52 ++++ test/integration/scheduler_test.go | 2 +- 60 files changed, 831 insertions(+), 236 deletions(-) create mode 100644 contrib/for-tests/network-tester/slow-pod.json create mode 100644 contrib/for-tests/network-tester/slow-rc.json diff --git a/cluster/saltbase/salt/fluentd-es/fluentd-es.yaml b/cluster/saltbase/salt/fluentd-es/fluentd-es.yaml index a2075fd92784a..d5419146f4a69 100644 --- a/cluster/saltbase/salt/fluentd-es/fluentd-es.yaml +++ b/cluster/saltbase/salt/fluentd-es/fluentd-es.yaml @@ -18,6 +18,7 @@ spec: mountPath: /varlog - name: containers mountPath: /var/lib/docker/containers + terminationGracePeriodSeconds: 30 volumes: - name: varlog hostPath: diff --git a/cluster/saltbase/salt/fluentd-gcp/fluentd-gcp.yaml b/cluster/saltbase/salt/fluentd-gcp/fluentd-gcp.yaml index c6b1547af3aa4..31ba54acb7a5f 100644 --- a/cluster/saltbase/salt/fluentd-gcp/fluentd-gcp.yaml +++ b/cluster/saltbase/salt/fluentd-gcp/fluentd-gcp.yaml @@ -19,6 
+19,7 @@ spec: mountPath: /varlog - name: containers mountPath: /var/lib/docker/containers + terminationGracePeriodSeconds: 30 volumes: - name: varlog hostPath: diff --git a/cmd/integration/integration.go b/cmd/integration/integration.go index 6f3246666d6ea..3e7d694edb2b2 100644 --- a/cmd/integration/integration.go +++ b/cmd/integration/integration.go @@ -872,18 +872,24 @@ func runSchedulerNoPhantomPodsTest(client *client.Client) { // Delete a pod to free up room. glog.Infof("Deleting pod %v", bar.Name) - err = client.Pods(api.NamespaceDefault).Delete(bar.Name, nil) + err = client.Pods(api.NamespaceDefault).Delete(bar.Name, api.NewDeleteOptions(1)) if err != nil { glog.Fatalf("FAILED: couldn't delete pod %q: %v", bar.Name, err) } + time.Sleep(2 * time.Second) + pod.ObjectMeta.Name = "phantom.baz" baz, err := client.Pods(api.NamespaceDefault).Create(pod) if err != nil { glog.Fatalf("Failed to create pod: %v, %v", pod, err) } if err := wait.Poll(time.Second, time.Second*60, podRunning(client, baz.Namespace, baz.Name)); err != nil { - glog.Fatalf("FAILED: (Scheduler probably didn't process deletion of 'phantom.bar') Pod never started running: %v", err) + if pod, perr := client.Pods(api.NamespaceDefault).Get("phantom.bar"); perr == nil { + glog.Fatalf("FAILED: 'phantom.bar' was never deleted: %#v", pod) + } else { + glog.Fatalf("FAILED: (Scheduler probably didn't process deletion of 'phantom.bar') Pod never started running: %v", err) + } } glog.Info("Scheduler doesn't make phantom pods: test passed.") diff --git a/contrib/for-tests/network-tester/slow-pod.json b/contrib/for-tests/network-tester/slow-pod.json new file mode 100644 index 0000000000000..8fce984aed046 --- /dev/null +++ b/contrib/for-tests/network-tester/slow-pod.json @@ -0,0 +1,28 @@ +{ + "kind": "Pod", + "apiVersion": "v1", + "metadata": { + "name": "slow-pod", + "labels": { + "name": "nettest" + } + }, + "spec": { + "containers": [ + { + "name": "webserver", + "image": "gcr.io/google_containers/nettest:1.5", + "args": [ + "-service=nettest", + "-delay-shutdown=10" + ], + "ports": [ + { + "containerPort": 8080, + "protocol": "TCP" + } + ] + } + ] + } +} diff --git a/contrib/for-tests/network-tester/slow-rc.json b/contrib/for-tests/network-tester/slow-rc.json new file mode 100644 index 0000000000000..d70a145555c73 --- /dev/null +++ b/contrib/for-tests/network-tester/slow-rc.json @@ -0,0 +1,42 @@ +{ + "kind": "ReplicationController", + "apiVersion": "v1", + "metadata": { + "name": "slow-rc", + "labels": { + "name": "nettest" + } + }, + "spec": { + "replicas": 8, + "selector": { + "name": "nettest" + }, + "template": { + "metadata": { + "labels": { + "name": "nettest" + } + }, + "spec": { + "terminationGracePeriodSeconds": 5, + "containers": [ + { + "name": "webserver", + "image": "gcr.io/google_containers/nettest:1.5", + "args": [ + "-service=nettest", + "-delay-shutdown=10" + ], + "ports": [ + { + "containerPort": 8080, + "protocol": "TCP" + } + ] + } + ] + } + } + } +} diff --git a/hack/lib/test.sh b/hack/lib/test.sh index 02a4c8ad9e474..c48337763a687 100644 --- a/hack/lib/test.sh +++ b/hack/lib/test.sh @@ -23,7 +23,7 @@ readonly red=$(tput setaf 1) readonly green=$(tput setaf 2) kube::test::clear_all() { - kubectl delete "${kube_flags[@]}" rc,pods --all + kubectl delete "${kube_flags[@]}" rc,pods --all --grace-period=0 } kube::test::get_object_assert() { diff --git a/hack/test-cmd.sh b/hack/test-cmd.sh index 4e1635ca7a645..c650bf06af66d 100755 --- a/hack/test-cmd.sh +++ b/hack/test-cmd.sh @@ -247,6 +247,11 @@ runTests() { 
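// The slow-pod and slow-rc fixtures above pass -delay-shutdown=10 to the nettest
// image so its containers keep running for a while after SIGTERM, which is what
// makes the Terminating state observable in these tests. A minimal sketch of that
// behavior; the flag name comes from the fixtures, the rest of the wiring is
// illustrative rather than the exact webserver.go code:
package main

import (
	"flag"
	"log"
	"os"
	"os/signal"
	"syscall"
	"time"
)

func main() {
	delay := flag.Int("delay-shutdown", 0, "seconds to delay shutdown after receiving SIGTERM")
	flag.Parse()

	sig := make(chan os.Signal, 1)
	signal.Notify(sig, syscall.SIGTERM)
	go func() {
		<-sig
		log.Printf("received SIGTERM; exiting in %d seconds", *delay)
		time.Sleep(time.Duration(*delay) * time.Second)
		os.Exit(0)
	}()

	select {} // stand-in for the HTTP server loop
}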
kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" 'valid-pod:' # Command kubectl delete pod valid-pod "${kube_flags[@]}" + # Post-condition: pod is still there, in terminating + kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" 'valid-pod:' + [[ "$(kubectl get pods "${kube_flags[@]}" | grep Terminating)" ]] + # Command + kubectl delete pod valid-pod "${kube_flags[@]}" --grace-period=0 # Post-condition: no POD is running kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" '' @@ -262,7 +267,7 @@ runTests() { # Pre-condition: valid-pod POD is running kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" 'valid-pod:' # Command - kubectl delete -f docs/admin/limitrange/valid-pod.yaml "${kube_flags[@]}" + kubectl delete -f docs/admin/limitrange/valid-pod.yaml "${kube_flags[@]}" --grace-period=0 # Post-condition: no POD is running kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" '' @@ -278,7 +283,7 @@ runTests() { # Pre-condition: valid-pod POD is running kube::test::get_object_assert "pods -l'name in (valid-pod)'" '{{range.items}}{{$id_field}}:{{end}}' 'valid-pod:' # Command - kubectl delete pods -l'name in (valid-pod)' "${kube_flags[@]}" + kubectl delete pods -l'name in (valid-pod)' "${kube_flags[@]}" --grace-period=0 # Post-condition: no POD is running kube::test::get_object_assert "pods -l'name in (valid-pod)'" '{{range.items}}{{$id_field}}:{{end}}' '' @@ -310,7 +315,7 @@ runTests() { # Pre-condition: valid-pod POD is running kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" 'valid-pod:' # Command - kubectl delete --all pods "${kube_flags[@]}" # --all remove all the pods + kubectl delete --all pods "${kube_flags[@]}" --grace-period=0 # --all remove all the pods # Post-condition: no POD is running kube::test::get_object_assert "pods -l'name in (valid-pod)'" '{{range.items}}{{$id_field}}:{{end}}' '' @@ -327,7 +332,7 @@ runTests() { # Pre-condition: valid-pod and redis-proxy PODs are running kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" 'redis-proxy:valid-pod:' # Command - kubectl delete pods valid-pod redis-proxy "${kube_flags[@]}" # delete multiple pods at once + kubectl delete pods valid-pod redis-proxy "${kube_flags[@]}" --grace-period=0 # delete multiple pods at once # Post-condition: no POD is running kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" '' @@ -344,7 +349,7 @@ runTests() { # Pre-condition: valid-pod and redis-proxy PODs are running kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" 'redis-proxy:valid-pod:' # Command - kubectl stop pods valid-pod redis-proxy "${kube_flags[@]}" # stop multiple pods at once + kubectl stop pods valid-pod redis-proxy "${kube_flags[@]}" --grace-period=0 # stop multiple pods at once # Post-condition: no POD is running kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" '' @@ -368,7 +373,7 @@ runTests() { # Pre-condition: valid-pod POD is running kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" 'valid-pod:' # Command - kubectl delete pods -lnew-name=new-valid-pod "${kube_flags[@]}" + kubectl delete pods -lnew-name=new-valid-pod --grace-period=0 "${kube_flags[@]}" # Post-condition: no POD is running kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" '' @@ -419,7 +424,7 @@ runTests() { # Pre-condition: valid-pod POD is running kube::test::get_object_assert pods 
"{{range.items}}{{$id_field}}:{{end}}" 'valid-pod:' # Command - kubectl delete pods -l'name in (valid-pod-super-sayan)' "${kube_flags[@]}" + kubectl delete pods -l'name in (valid-pod-super-sayan)' --grace-period=0 "${kube_flags[@]}" # Post-condition: no POD is running kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" '' @@ -455,7 +460,7 @@ runTests() { # Pre-condition: valid-pod POD is running kube::test::get_object_assert 'pods --namespace=other' "{{range.items}}{{$id_field}}:{{end}}" 'valid-pod:' # Command - kubectl delete "${kube_flags[@]}" pod --namespace=other valid-pod + kubectl delete "${kube_flags[@]}" pod --namespace=other valid-pod --grace-period=0 # Post-condition: no POD is running kube::test::get_object_assert 'pods --namespace=other' "{{range.items}}{{$id_field}}:{{end}}" '' diff --git a/pkg/api/deep_copy_generated.go b/pkg/api/deep_copy_generated.go index 2a88c4cda3637..f4acae42ea9c7 100644 --- a/pkg/api/deep_copy_generated.go +++ b/pkg/api/deep_copy_generated.go @@ -1011,6 +1011,12 @@ func deepCopy_api_ObjectMeta(in ObjectMeta, out *ObjectMeta, c *conversion.Clone } else { out.DeletionTimestamp = nil } + if in.DeletionGracePeriodSeconds != nil { + out.DeletionGracePeriodSeconds = new(int64) + *out.DeletionGracePeriodSeconds = *in.DeletionGracePeriodSeconds + } else { + out.DeletionGracePeriodSeconds = nil + } if in.Labels != nil { out.Labels = make(map[string]string) for key, val := range in.Labels { diff --git a/pkg/api/rest/create.go b/pkg/api/rest/create.go index 3d51ded7e7671..e653a6b74ab85 100644 --- a/pkg/api/rest/create.go +++ b/pkg/api/rest/create.go @@ -59,6 +59,8 @@ func BeforeCreate(strategy RESTCreateStrategy, ctx api.Context, obj runtime.Obje } else { objectMeta.Namespace = api.NamespaceNone } + objectMeta.DeletionTimestamp = nil + objectMeta.DeletionGracePeriodSeconds = nil strategy.PrepareForCreate(obj) api.FillObjectMetaSystemFields(ctx, objectMeta) api.GenerateName(strategy, objectMeta) diff --git a/pkg/api/rest/delete.go b/pkg/api/rest/delete.go index 9d433f80fefa9..d1b04e6dded35 100644 --- a/pkg/api/rest/delete.go +++ b/pkg/api/rest/delete.go @@ -40,12 +40,37 @@ func BeforeDelete(strategy RESTDeleteStrategy, ctx api.Context, obj runtime.Obje if strategy == nil { return false, false, nil } - _, _, kerr := objectMetaAndKind(strategy, obj) + objectMeta, _, kerr := objectMetaAndKind(strategy, obj) if kerr != nil { return false, false, kerr } + + // if the object is already being deleted + if objectMeta.DeletionTimestamp != nil { + // if we are already being deleted, we may only shorten the deletion grace period + // this means the object was gracefully deleted previously but deletionGracePeriodSeconds was not set, + // so we force deletion immediately + if objectMeta.DeletionGracePeriodSeconds == nil { + return false, false, nil + } + // only a shorter grace period may be provided by a user + if options.GracePeriodSeconds != nil { + period := int64(*options.GracePeriodSeconds) + if period > *objectMeta.DeletionGracePeriodSeconds { + return false, true, nil + } + objectMeta.DeletionGracePeriodSeconds = &period + options.GracePeriodSeconds = &period + return true, false, nil + } + // graceful deletion is pending, do nothing + options.GracePeriodSeconds = objectMeta.DeletionGracePeriodSeconds + return false, true, nil + } + if !strategy.CheckGracefulDelete(obj, options) { return false, false, nil } + objectMeta.DeletionGracePeriodSeconds = options.GracePeriodSeconds return true, false, nil } diff --git a/pkg/api/rest/resttest/resttest.go 
b/pkg/api/rest/resttest/resttest.go index f8c1716ef6ab9..5576692c09389 100644 --- a/pkg/api/rest/resttest/resttest.go +++ b/pkg/api/rest/resttest/resttest.go @@ -298,6 +298,22 @@ func (t *Tester) testUpdateFailsOnVersion(older runtime.Object) { // ============================================================================= // Deletion tests. +func (t *Tester) TestDeleteNoGraceful(createFn func() runtime.Object, wasGracefulFn func() bool) { + existing := createFn() + objectMeta := t.getObjectMetaOrFail(existing) + ctx := api.WithNamespace(t.TestContext(), objectMeta.Namespace) + _, err := t.storage.(rest.GracefulDeleter).Delete(ctx, objectMeta.Name, api.NewDeleteOptions(10)) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if _, err := t.storage.(rest.Getter).Get(ctx, objectMeta.Name); !errors.IsNotFound(err) { + t.Errorf("unexpected error, object should not exist: %v", err) + } + if wasGracefulFn() { + t.Errorf("resource should not support graceful delete") + } +} + func (t *Tester) testDeleteInvokesValidation(invalid ...runtime.Object) { for i, obj := range invalid { objectMeta := t.getObjectMetaOrFail(obj) @@ -322,38 +338,116 @@ func (t *Tester) testDeleteNonExist(createFn func() runtime.Object) { }) } -func (t *Tester) testDeleteNoGraceful(createFn func() runtime.Object, wasGracefulFn func() bool) { - existing := createFn() +// ============================================================================= +// Graceful Deletion tests. + +func (t *Tester) TestDeleteGraceful(createFn func() runtime.Object, expectedGrace int64, wasGracefulFn func() bool) { + t.TestDeleteGracefulHasDefault(createFn(), expectedGrace, wasGracefulFn) + t.TestDeleteGracefulWithValue(createFn(), expectedGrace, wasGracefulFn) + t.TestDeleteGracefulUsesZeroOnNil(createFn(), 0) + t.TestDeleteGracefulExtend(createFn(), expectedGrace, wasGracefulFn) +} + +func (t *Tester) testDeleteGracefulHasDefault(existing runtime.Object, expectedGrace int64, wasGracefulFn func() bool) { objectMeta := t.getObjectMetaOrFail(existing) ctx := api.WithNamespace(t.TestContext(), objectMeta.Namespace) - _, err := t.storage.(rest.GracefulDeleter).Delete(ctx, objectMeta.Name, api.NewDeleteOptions(10)) + _, err := t.storage.(rest.GracefulDeleter).Delete(ctx, objectMeta.Name, &api.DeleteOptions{}) if err != nil { t.Errorf("unexpected error: %v", err) } - if _, err := t.storage.(rest.Getter).Get(ctx, objectMeta.Name); !errors.IsNotFound(err) { - t.Errorf("unexpected error, object should not exist: %v", err) + if !wasGracefulFn() { + t.Errorf("did not gracefully delete resource") + return } - if wasGracefulFn() { - t.Errorf("resource should not support graceful delete") + object, err := t.storage.(rest.Getter).Get(ctx, objectMeta.Name) + if err != nil { + t.Errorf("unexpected error, object should exist: %v", err) + return + } + objectMeta, err = api.ObjectMetaFor(object) + if err != nil { + t.Fatalf("object does not have ObjectMeta: %v\n%#v", err, object) + } + if objectMeta.DeletionTimestamp == nil { + t.Errorf("did not set deletion timestamp") + } + if objectMeta.DeletionGracePeriodSeconds == nil { + t.Fatalf("did not set deletion grace period seconds") + } + if *objectMeta.DeletionGracePeriodSeconds != expectedGrace { + t.Errorf("actual grace period does not match expected: %d", *objectMeta.DeletionGracePeriodSeconds) } } -// ============================================================================= -// Graceful Deletion tests. 
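// A sketch of how a registry's etcd test might drive these helpers once its
// storage implements rest.GracefulDeleter. Everything here except the
// TestDeleteGraceful signature shown in this hunk is hypothetical wiring:
func TestPodStorageGracefulDelete(t *testing.T) {
	storage, fakeClient := newStorage(t) // hypothetical setup helper
	tester := newTester(t, storage)      // hypothetical Tester constructor
	tester.TestDeleteGraceful(
		func() runtime.Object { return createValidPod(t, fakeClient) }, // hypothetical
		30, // expectedGrace: the v1 default termination grace period
		func() bool { return storedPodHasDeletionTimestamp(fakeClient) }, // hypothetical check
	)
}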
+func (t *Tester) TestDeleteGracefulWithValue(existing runtime.Object, expectedGrace int64, wasGracefulFn func() bool) { + objectMeta, err := api.ObjectMetaFor(existing) + if err != nil { + t.Fatalf("object does not have ObjectMeta: %v\n%#v", err, existing) + } -func (t *Tester) testDeleteGracefulHasDefault(existing runtime.Object, expectedGrace int64, wasGracefulFn func() bool) { - objectMeta := t.getObjectMetaOrFail(existing) ctx := api.WithNamespace(t.TestContext(), objectMeta.Namespace) - _, err := t.storage.(rest.GracefulDeleter).Delete(ctx, objectMeta.Name, &api.DeleteOptions{}) + _, err = t.storage.(rest.GracefulDeleter).Delete(ctx, objectMeta.Name, api.NewDeleteOptions(expectedGrace+2)) if err != nil { t.Errorf("unexpected error: %v", err) } - if _, err := t.storage.(rest.Getter).Get(ctx, objectMeta.Name); err != nil { + if !wasGracefulFn() { + t.Errorf("did not gracefully delete resource") + } + object, err := t.storage.(rest.Getter).Get(ctx, objectMeta.Name) + if err != nil { t.Errorf("unexpected error, object should exist: %v", err) } + objectMeta, err = api.ObjectMetaFor(object) + if err != nil { + t.Fatalf("object does not have ObjectMeta: %v\n%#v", err, object) + } + if objectMeta.DeletionTimestamp == nil { + t.Errorf("did not set deletion timestamp") + } + if objectMeta.DeletionGracePeriodSeconds == nil { + t.Fatalf("did not set deletion grace period seconds") + } + if *objectMeta.DeletionGracePeriodSeconds != expectedGrace+2 { + t.Errorf("actual grace period does not match expected: %d", *objectMeta.DeletionGracePeriodSeconds) + } +} + +func (t *Tester) TestDeleteGracefulExtend(existing runtime.Object, expectedGrace int64, wasGracefulFn func() bool) { + objectMeta, err := api.ObjectMetaFor(existing) + if err != nil { + t.Fatalf("object does not have ObjectMeta: %v\n%#v", err, existing) + } + + ctx := api.WithNamespace(t.TestContext(), objectMeta.Namespace) + _, err = t.storage.(rest.GracefulDeleter).Delete(ctx, objectMeta.Name, api.NewDeleteOptions(expectedGrace)) + if err != nil { + t.Errorf("unexpected error: %v", err) + } if !wasGracefulFn() { t.Errorf("did not gracefully delete resource") } + // second delete duration is ignored + _, err = t.storage.(rest.GracefulDeleter).Delete(ctx, objectMeta.Name, api.NewDeleteOptions(expectedGrace+2)) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + object, err := t.storage.(rest.Getter).Get(ctx, objectMeta.Name) + if err != nil { + t.Errorf("unexpected error, object should exist: %v", err) + } + objectMeta, err = api.ObjectMetaFor(object) + if err != nil { + t.Fatalf("object does not have ObjectMeta: %v\n%#v", err, object) + } + if objectMeta.DeletionTimestamp == nil { + t.Errorf("did not set deletion timestamp") + } + if objectMeta.DeletionGracePeriodSeconds == nil { + t.Fatalf("did not set deletion grace period seconds") + } + if *objectMeta.DeletionGracePeriodSeconds != expectedGrace { + t.Errorf("actual grace period does not match expected: %d", *objectMeta.DeletionGracePeriodSeconds) + } } func (t *Tester) testDeleteGracefulUsesZeroOnNil(existing runtime.Object, expectedGrace int64) { @@ -364,7 +458,7 @@ func (t *Tester) testDeleteGracefulUsesZeroOnNil(existing runtime.Object, expect t.Errorf("unexpected error: %v", err) } if _, err := t.storage.(rest.Getter).Get(ctx, objectMeta.Name); !errors.IsNotFound(err) { - t.Errorf("unexpected error, object should exist: %v", err) + t.Errorf("unexpected error, object should not exist: %v", err) } } diff --git a/pkg/api/serialization_test.go b/pkg/api/serialization_test.go 
index ba3b0b86bc925..cc19b4d1d1c3d 100644 --- a/pkg/api/serialization_test.go +++ b/pkg/api/serialization_test.go @@ -151,6 +151,7 @@ func TestRoundTripTypes(t *testing.T) { } func TestEncode_Ptr(t *testing.T) { + grace := int64(30) pod := &api.Pod{ ObjectMeta: api.ObjectMeta{ Labels: map[string]string{"name": "foo"}, @@ -158,6 +159,8 @@ func TestEncode_Ptr(t *testing.T) { Spec: api.PodSpec{ RestartPolicy: api.RestartPolicyAlways, DNSPolicy: api.DNSClusterFirst, + + TerminationGracePeriodSeconds: &grace, }, } obj := runtime.Object(pod) diff --git a/pkg/api/testing/fuzzer.go b/pkg/api/testing/fuzzer.go index a88202f8e4bc7..2ea5c8fc461cc 100644 --- a/pkg/api/testing/fuzzer.go +++ b/pkg/api/testing/fuzzer.go @@ -89,6 +89,15 @@ func FuzzerFor(t *testing.T, version string, src rand.Source) *fuzz.Fuzzer { j.LabelSelector, _ = labels.Parse("a=b") j.FieldSelector, _ = fields.ParseSelector("a=b") }, + func(j *api.PodSpec, c fuzz.Continue) { + c.FuzzNoCustom(j) + // has a default value + ttl := int64(30) + if c.RandBool() { + ttl = int64(c.Uint32()) + } + j.TerminationGracePeriodSeconds = &ttl + }, func(j *api.PodPhase, c fuzz.Continue) { statuses := []api.PodPhase{api.PodPending, api.PodRunning, api.PodFailed, api.PodUnknown} *j = statuses[c.Rand.Intn(len(statuses))] diff --git a/pkg/api/types.go b/pkg/api/types.go index 1698cda38728c..028d61a467ff7 100644 --- a/pkg/api/types.go +++ b/pkg/api/types.go @@ -143,6 +143,10 @@ type ObjectMeta struct { // will send a hard termination signal to the container. DeletionTimestamp *util.Time `json:"deletionTimestamp,omitempty"` + // DeletionGracePeriodSeconds records the graceful deletion value set when graceful deletion + // was requested. Represents the most recent grace period, and may only be shortened once set. + DeletionGracePeriodSeconds *int64 `json:"deletionGracePeriodSeconds,omitempty"` + // Labels are key value pairs that may be used to scope and select individual resources. 
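// Taken together, the two ObjectMeta fields above let any client distinguish the
// pod lifecycle states without extra API calls. A minimal sketch:
func terminationState(meta api.ObjectMeta) string {
	if meta.DeletionTimestamp == nil {
		return "active" // graceful deletion has not been requested
	}
	if meta.DeletionGracePeriodSeconds != nil {
		return fmt.Sprintf("terminating (up to %ds of grace)", *meta.DeletionGracePeriodSeconds)
	}
	return "terminating" // deletion requested before grace periods were recorded
}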
// Label keys are of the form: // label-key ::= prefixed-name | name diff --git a/pkg/api/v1/conversion_generated.go b/pkg/api/v1/conversion_generated.go index c0736192e242e..a1bdb14cd026d 100644 --- a/pkg/api/v1/conversion_generated.go +++ b/pkg/api/v1/conversion_generated.go @@ -1176,6 +1176,12 @@ func convert_api_ObjectMeta_To_v1_ObjectMeta(in *api.ObjectMeta, out *ObjectMeta } else { out.DeletionTimestamp = nil } + if in.DeletionGracePeriodSeconds != nil { + out.DeletionGracePeriodSeconds = new(int64) + *out.DeletionGracePeriodSeconds = *in.DeletionGracePeriodSeconds + } else { + out.DeletionGracePeriodSeconds = nil + } if in.Labels != nil { out.Labels = make(map[string]string) for key, val := range in.Labels { @@ -3591,6 +3597,12 @@ func convert_v1_ObjectMeta_To_api_ObjectMeta(in *ObjectMeta, out *api.ObjectMeta } else { out.DeletionTimestamp = nil } + if in.DeletionGracePeriodSeconds != nil { + out.DeletionGracePeriodSeconds = new(int64) + *out.DeletionGracePeriodSeconds = *in.DeletionGracePeriodSeconds + } else { + out.DeletionGracePeriodSeconds = nil + } if in.Labels != nil { out.Labels = make(map[string]string) for key, val := range in.Labels { diff --git a/pkg/api/v1/deep_copy_generated.go b/pkg/api/v1/deep_copy_generated.go index 24a3acdd9c5c8..cc285d83925b0 100644 --- a/pkg/api/v1/deep_copy_generated.go +++ b/pkg/api/v1/deep_copy_generated.go @@ -1010,6 +1010,12 @@ func deepCopy_v1_ObjectMeta(in ObjectMeta, out *ObjectMeta, c *conversion.Cloner } else { out.DeletionTimestamp = nil } + if in.DeletionGracePeriodSeconds != nil { + out.DeletionGracePeriodSeconds = new(int64) + *out.DeletionGracePeriodSeconds = *in.DeletionGracePeriodSeconds + } else { + out.DeletionGracePeriodSeconds = nil + } if in.Labels != nil { out.Labels = make(map[string]string) for key, val := range in.Labels { diff --git a/pkg/api/v1/defaults.go b/pkg/api/v1/defaults.go index 903d6f42b6598..eee2a647c0c19 100644 --- a/pkg/api/v1/defaults.go +++ b/pkg/api/v1/defaults.go @@ -113,6 +113,10 @@ func addDefaultingFuncs() { if obj.HostNetwork { defaultHostNetworkPorts(&obj.Containers) } + if obj.TerminationGracePeriodSeconds == nil { + period := int64(DefaultTerminationGracePeriodSeconds) + obj.TerminationGracePeriodSeconds = &period + } }, func(obj *Probe) { if obj.TimeoutSeconds == 0 { diff --git a/pkg/api/v1/types.go b/pkg/api/v1/types.go index 097b7b8125285..ea0749f584b22 100644 --- a/pkg/api/v1/types.go +++ b/pkg/api/v1/types.go @@ -141,6 +141,10 @@ type ObjectMeta struct { // will send a hard termination signal to the container. DeletionTimestamp *util.Time `json:"deletionTimestamp,omitempty" description:"RFC 3339 date and time at which the object will be deleted; populated by the system when a graceful deletion is requested, read-only; if not set, graceful deletion of the object has not been requested; see http://releases.k8s.io/HEAD/docs/devel/api-conventions.md#metadata"` + // DeletionGracePeriodSeconds records the graceful deletion value set when graceful deletion + // was requested. Represents the most recent grace period, and may only be shortened once set. + DeletionGracePeriodSeconds *int64 `json:"deletionGracePeriodSeconds,omitempty" description:"number of seconds allowed for this object to gracefully terminate before it will be removed from the system; only set when deletionTimestamp is also set, read-only; may only be shortened"` + // Labels are key value pairs that may be used to scope and select individual resources. 
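// The defaults.go hunk above guarantees that every v1 PodSpec reaching the
// system carries a concrete grace period. The same rule as a standalone sketch,
// using the DefaultTerminationGracePeriodSeconds constant (30) introduced below:
func defaultTerminationGracePeriod(spec *PodSpec) {
	if spec.TerminationGracePeriodSeconds == nil {
		period := int64(DefaultTerminationGracePeriodSeconds)
		spec.TerminationGracePeriodSeconds = &period
	}
}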
// TODO: replace map[string]string with labels.LabelSet type Labels map[string]string `json:"labels,omitempty" description:"map of string keys and values that can be used to organize and categorize objects; may match selectors of replication controllers and services; see http://releases.k8s.io/HEAD/docs/user-guide/labels.md"` @@ -858,6 +862,8 @@ const ( // DNSDefault indicates that the pod should use the default (as // determined by kubelet) DNS settings. DNSDefault DNSPolicy = "Default" + + DefaultTerminationGracePeriodSeconds = 30 ) // PodSpec is a description of a pod @@ -872,7 +878,7 @@ type PodSpec struct { // The grace period is the duration in seconds after the processes running in the pod are sent // a termination signal and the time when the processes are forcibly halted with a kill signal. // Set this value longer than the expected cleanup time for your process. - TerminationGracePeriodSeconds *int64 `json:"terminationGracePeriodSeconds,omitempty" description:"optional duration in seconds the pod needs to terminate gracefully; may be decreased in delete request; value must be non-negative integer; the value zero indicates delete immediately; if this value is not set, the default grace period will be used instead; the grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal; set this value longer than the expected cleanup time for your process"` + TerminationGracePeriodSeconds *int64 `json:"terminationGracePeriodSeconds,omitempty" description:"optional duration in seconds the pod needs to terminate gracefully; may be decreased in delete request; value must be non-negative integer; the value zero indicates delete immediately; if this value is not set, the default grace period will be used instead; the grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal; set this value longer than the expected cleanup time for your process; defaults to 30 seconds"` ActiveDeadlineSeconds *int64 `json:"activeDeadlineSeconds,omitempty" description:"optional duration in seconds the pod may be active on the node relative to StartTime before the system will actively try to mark it failed and kill associated containers; value must be a positive integer"` // Optional: Set DNS policy. 
Defaults to "ClusterFirst" DNSPolicy DNSPolicy `json:"dnsPolicy,omitempty" description:"DNS policy for containers within the pod; one of 'ClusterFirst' or 'Default'"` diff --git a/pkg/api/validation/validation.go b/pkg/api/validation/validation.go index 9d42e5b1427ed..ed9578ebe8b92 100644 --- a/pkg/api/validation/validation.go +++ b/pkg/api/validation/validation.go @@ -265,6 +265,16 @@ func ValidateObjectMetaUpdate(new, old *api.ObjectMeta) errs.ValidationErrorList } else { new.CreationTimestamp = old.CreationTimestamp } + // an object can never remove a deletion timestamp or clear/change grace period seconds + if !old.DeletionTimestamp.IsZero() { + new.DeletionTimestamp = old.DeletionTimestamp + } + if old.DeletionGracePeriodSeconds != nil && new.DeletionGracePeriodSeconds == nil { + new.DeletionGracePeriodSeconds = old.DeletionGracePeriodSeconds + } + if new.DeletionGracePeriodSeconds != nil && (old.DeletionGracePeriodSeconds == nil || *new.DeletionGracePeriodSeconds != *old.DeletionGracePeriodSeconds) { + allErrs = append(allErrs, errs.NewFieldInvalid("deletionGracePeriodSeconds", new.DeletionGracePeriodSeconds, "field is immutable; may only be changed via deletion")) + } // Reject updates that don't specify a resource version if new.ResourceVersion == "" { diff --git a/pkg/controller/controller_utils.go b/pkg/controller/controller_utils.go index 2438c8148ad7d..77419e155ed83 100644 --- a/pkg/controller/controller_utils.go +++ b/pkg/controller/controller_utils.go @@ -322,7 +322,8 @@ func FilterActivePods(pods []api.Pod) []*api.Pod { var result []*api.Pod for i := range pods { if api.PodSucceeded != pods[i].Status.Phase && - api.PodFailed != pods[i].Status.Phase { + api.PodFailed != pods[i].Status.Phase && + pods[i].DeletionTimestamp == nil { result = append(result, &pods[i]) } } diff --git a/pkg/controller/endpoint/endpoints_controller.go b/pkg/controller/endpoint/endpoints_controller.go index d4962cf1df704..88345f9dea98c 100644 --- a/pkg/controller/endpoint/endpoints_controller.go +++ b/pkg/controller/endpoint/endpoints_controller.go @@ -310,7 +310,11 @@ func (e *EndpointController) syncService(key string) { continue } if len(pod.Status.PodIP) == 0 { - glog.V(4).Infof("Failed to find an IP for pod %s/%s", pod.Namespace, pod.Name) + glog.V(5).Infof("Failed to find an IP for pod %s/%s", pod.Namespace, pod.Name) continue } + if pod.DeletionTimestamp != nil { + glog.V(5).Infof("Pod is being deleted %s/%s", pod.Namespace, pod.Name) + continue + } diff --git a/pkg/controller/replication/replication_controller.go b/pkg/controller/replication/replication_controller.go index b212647c45c65..726a8f735ac04 100644 --- a/pkg/controller/replication/replication_controller.go +++ b/pkg/controller/replication/replication_controller.go @@ -213,6 +213,12 @@ func (rm *ReplicationManager) getPodController(pod *api.Pod) *api.ReplicationCon // When a pod is created, enqueue the controller that manages it and update its expectations. func (rm *ReplicationManager) addPod(obj interface{}) { pod := obj.(*api.Pod) + if pod.DeletionTimestamp != nil { + // on a restart of the controller manager, it's possible a new pod shows up in a state that + // is already pending deletion. Prevent the pod from being a creation observation.
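// The three controller hunks above all apply the same rule: once
// DeletionTimestamp is set, a pod no longer counts as active, so controllers
// route around it immediately rather than waiting out the kubelet's graceful
// shutdown. The shared predicate, as a sketch:
func isActive(p *api.Pod) bool {
	return p.Status.Phase != api.PodSucceeded &&
		p.Status.Phase != api.PodFailed &&
		p.DeletionTimestamp == nil
}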
+ rm.deletePod(pod) + return + } if rc := rm.getPodController(pod); rc != nil { rcKey, err := controller.KeyFunc(rc) if err != nil { @@ -234,6 +240,15 @@ func (rm *ReplicationManager) updatePod(old, cur interface{}) { } // TODO: Write a unittest for this case curPod := cur.(*api.Pod) + if curPod.DeletionTimestamp != nil { + // when a pod is deleted gracefully its deletion timestamp is first modified to reflect a grace period, + // and after such time has passed, the kubelet actually deletes it from the store. We receive an update + // for modification of the deletion timestamp and expect an rc to create more replicas asap, not wait + // until the kubelet actually deletes the pod. This is different from the Phase of a pod changing, because + // an rc never initiates a phase change, and so is never asleep waiting for the same. + rm.deletePod(curPod) + return + } if rc := rm.getPodController(curPod); rc != nil { rm.enqueueController(rc) } diff --git a/pkg/kubectl/cmd/get_test.go b/pkg/kubectl/cmd/get_test.go index bb49d4c47626c..7d7267d5a4888 100644 --- a/pkg/kubectl/cmd/get_test.go +++ b/pkg/kubectl/cmd/get_test.go @@ -38,6 +38,7 @@ import ( ) func testData() (*api.PodList, *api.ServiceList, *api.ReplicationControllerList) { + grace := int64(30) pods := &api.PodList{ ListMeta: api.ListMeta{ ResourceVersion: "15", @@ -46,15 +47,17 @@ func testData() (*api.PodList, *api.ServiceList, *api.ReplicationControllerList) { ObjectMeta: api.ObjectMeta{Name: "foo", Namespace: "test", ResourceVersion: "10"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, { ObjectMeta: api.ObjectMeta{Name: "bar", Namespace: "test", ResourceVersion: "11"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, }, @@ -536,6 +539,7 @@ func TestGetMultipleTypeObjectsWithDirectReference(t *testing.T) { } } func watchTestData() ([]api.Pod, []watch.Event) { + grace := int64(30) pods := []api.Pod{ { ObjectMeta: api.ObjectMeta{ @@ -544,8 +548,9 @@ ResourceVersion: "10", }, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, } @@ -559,8 +564,9 @@ ResourceVersion: "11", }, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, }, @@ -573,8 +579,9 @@ ResourceVersion: "12", }, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, }, diff --git a/pkg/kubectl/cmd/util/helpers_test.go b/pkg/kubectl/cmd/util/helpers_test.go index c870531160a43..0a9cd7459a0d4 100644 --- a/pkg/kubectl/cmd/util/helpers_test.go +++ b/pkg/kubectl/cmd/util/helpers_test.go @@ -34,6 +34,7 @@ import ( ) func TestMerge(t *testing.T) { + grace := int64(30) tests := []struct { obj runtime.Object fragment
string @@ -54,8 +55,9 @@ func TestMerge(t *testing.T) { Name: "foo", }, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, }, @@ -122,8 +124,9 @@ func TestMerge(t *testing.T) { VolumeSource: api.VolumeSource{EmptyDir: &api.EmptyDirVolumeSource{}}, }, }, - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, }, diff --git a/pkg/kubectl/describe.go b/pkg/kubectl/describe.go index bb1ec67d12e85..107abc65f3b0f 100644 --- a/pkg/kubectl/describe.go +++ b/pkg/kubectl/describe.go @@ -417,7 +417,12 @@ func describePod(pod *api.Pod, rcs []api.ReplicationController, events *api.Even fmt.Fprintf(out, "Image(s):\t%s\n", makeImageList(&pod.Spec)) fmt.Fprintf(out, "Node:\t%s\n", pod.Spec.NodeName+"/"+pod.Status.HostIP) fmt.Fprintf(out, "Labels:\t%s\n", formatLabels(pod.Labels)) - fmt.Fprintf(out, "Status:\t%s\n", string(pod.Status.Phase)) + if pod.DeletionTimestamp != nil { + fmt.Fprintf(out, "Status:\tTerminating (expires %s)\n", pod.DeletionTimestamp.Time.Format(time.RFC1123Z)) + fmt.Fprintf(out, "Termination Grace Period:\t%ds\n", *pod.DeletionGracePeriodSeconds) + } else { + fmt.Fprintf(out, "Status:\t%s\n", string(pod.Status.Phase)) + } fmt.Fprintf(out, "Reason:\t%s\n", pod.Status.Reason) fmt.Fprintf(out, "Message:\t%s\n", pod.Status.Message) fmt.Fprintf(out, "IP:\t%s\n", pod.Status.PodIP) diff --git a/pkg/kubectl/resource/builder_test.go b/pkg/kubectl/resource/builder_test.go index ebe480c3f339e..7b96f74656ad6 100644 --- a/pkg/kubectl/resource/builder_test.go +++ b/pkg/kubectl/resource/builder_test.go @@ -83,6 +83,7 @@ func fakeClientWith(testName string, t *testing.T, data map[string]string) Clien } func testData() (*api.PodList, *api.ServiceList) { + grace := int64(30) pods := &api.PodList{ ListMeta: api.ListMeta{ ResourceVersion: "15", @@ -91,15 +92,17 @@ func testData() (*api.PodList, *api.ServiceList) { { ObjectMeta: api.ObjectMeta{Name: "foo", Namespace: "test", ResourceVersion: "10"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, { ObjectMeta: api.ObjectMeta{Name: "bar", Namespace: "test", ResourceVersion: "11"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, }, diff --git a/pkg/kubectl/resource/helper_test.go b/pkg/kubectl/resource/helper_test.go index d40da3d2710ba..51f81a7925d4e 100644 --- a/pkg/kubectl/resource/helper_test.go +++ b/pkg/kubectl/resource/helper_test.go @@ -128,6 +128,7 @@ func TestHelperCreate(t *testing.T) { return true } + grace := int64(30) tests := []struct { Resp *http.Response RespFunc client.HTTPClientFunc @@ -172,8 +173,9 @@ func TestHelperCreate(t *testing.T) { ExpectObject: &api.Pod{ ObjectMeta: api.ObjectMeta{Name: "foo"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, Resp: &http.Response{StatusCode: http.StatusOK, Body: objBody(&api.Status{Status:
api.StatusSuccess})}, @@ -381,6 +383,7 @@ func TestHelperReplace(t *testing.T) { return true } + grace := int64(30) tests := []struct { Resp *http.Response RespFunc client.HTTPClientFunc @@ -418,8 +421,9 @@ func TestHelperReplace(t *testing.T) { ExpectObject: &api.Pod{ ObjectMeta: api.ObjectMeta{Name: "foo", ResourceVersion: "10"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, Overwrite: true, diff --git a/pkg/kubectl/resource_printer.go b/pkg/kubectl/resource_printer.go index 0db84a8c3399d..09ae64128c7bd 100644 --- a/pkg/kubectl/resource_printer.go +++ b/pkg/kubectl/resource_printer.go @@ -410,6 +410,9 @@ func printPod(pod *api.Pod, w io.Writer, withNamespace bool, wide bool, columnLa readyContainers++ } } + if pod.DeletionTimestamp != nil { + reason = "Terminating" + } if withNamespace { if _, err := fmt.Fprintf(w, "%s\t", namespace); err != nil { diff --git a/pkg/kubectl/rolling_updater_test.go b/pkg/kubectl/rolling_updater_test.go index 827b40777e9b4..67b50955d8642 100644 --- a/pkg/kubectl/rolling_updater_test.go +++ b/pkg/kubectl/rolling_updater_test.go @@ -880,6 +880,7 @@ func TestUpdateExistingReplicationController(t *testing.T) { func TestUpdateWithRetries(t *testing.T) { codec := testapi.Codec() + grace := int64(30) rc := &api.ReplicationController{ ObjectMeta: api.ObjectMeta{Name: "rc", Labels: map[string]string{ @@ -897,8 +898,9 @@ func TestUpdateWithRetries(t *testing.T) { }, }, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, }, diff --git a/pkg/kubelet/config/common_test.go b/pkg/kubelet/config/common_test.go index 9ec02de37ce49..fcf16781bab8b 100644 --- a/pkg/kubelet/config/common_test.go +++ b/pkg/kubelet/config/common_test.go @@ -31,6 +31,7 @@ import ( func noDefault(*api.Pod) error { return nil } func TestDecodeSinglePod(t *testing.T) { + grace := int64(30) pod := &api.Pod{ TypeMeta: api.TypeMeta{ APIVersion: "", @@ -41,8 +42,9 @@ func TestDecodeSinglePod(t *testing.T) { Namespace: "mynamespace", }, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, Containers: []api.Container{{ Name: "image", Image: "test/image", @@ -91,6 +93,7 @@ func TestDecodeSinglePod(t *testing.T) { } func TestDecodePodList(t *testing.T) { + grace := int64(30) pod := &api.Pod{ TypeMeta: api.TypeMeta{ APIVersion: "", @@ -101,8 +104,9 @@ func TestDecodePodList(t *testing.T) { Namespace: "mynamespace", }, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, Containers: []api.Container{{ Name: "image", Image: "test/image", diff --git a/pkg/kubelet/config/config.go b/pkg/kubelet/config/config.go index cbb8ef760cd8c..f8f16fa65e2e1 100644 --- a/pkg/kubelet/config/config.go +++ b/pkg/kubelet/config/config.go @@ -217,9 +217,8 @@ func (s *podStorage) merge(source string, change interface{}) (adds, updates, de for _, ref := range filtered { name := kubecontainer.GetPodFullName(ref) if existing, found := pods[name]; found { - if !reflect.DeepEqual(existing.Spec, 
ref.Spec) { + if checkAndUpdatePod(existing, ref) { // this is an update - existing.Spec = ref.Spec updates.Pods = append(updates.Pods, existing) continue } @@ -261,9 +260,8 @@ func (s *podStorage) merge(source string, change interface{}) (adds, updates, de name := kubecontainer.GetPodFullName(ref) if existing, found := oldPods[name]; found { pods[name] = existing - if !reflect.DeepEqual(existing.Spec, ref.Spec) { + if checkAndUpdatePod(existing, ref) { // this is an update - existing.Spec = ref.Spec updates.Pods = append(updates.Pods, existing) continue } @@ -335,6 +333,23 @@ func filterInvalidPods(pods []*api.Pod, source string, recorder record.EventReco return } +// checkAndUpdatePod updates existing if ref makes a meaningful change and returns true, or +// returns false if there was no update. +func checkAndUpdatePod(existing, ref *api.Pod) bool { + // TODO: it would be better to update the whole object and only preserve certain things + // like the source annotation or the UID (to ensure safety) + if reflect.DeepEqual(existing.Spec, ref.Spec) && + reflect.DeepEqual(existing.DeletionTimestamp, ref.DeletionTimestamp) && + reflect.DeepEqual(existing.DeletionGracePeriodSeconds, ref.DeletionGracePeriodSeconds) { + return false + } + // this is an update + existing.Spec = ref.Spec + existing.DeletionTimestamp = ref.DeletionTimestamp + existing.DeletionGracePeriodSeconds = ref.DeletionGracePeriodSeconds + return true +} + // Sync sends a copy of the current state through the update channel. func (s *podStorage) Sync() { s.updateLock.Lock() diff --git a/pkg/kubelet/config/file_test.go b/pkg/kubelet/config/file_test.go index 440761e32632e..66ca28ba97e91 100644 --- a/pkg/kubelet/config/file_test.go +++ b/pkg/kubelet/config/file_test.go @@ -69,6 +69,7 @@ func writeTestFile(t *testing.T, dir, name string, contents string) *os.File { func TestReadPodsFromFile(t *testing.T) { hostname := "random-test-hostname" + grace := int64(30) var testCases = []struct { desc string pod runtime.Object @@ -98,9 +99,10 @@ func TestReadPodsFromFile(t *testing.T) { SelfLink: getSelfLink("test-"+hostname, "mynamespace"), }, Spec: api.PodSpec{ - NodeName: hostname, - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + NodeName: hostname, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, Containers: []api.Container{{ Name: "image", Image: "test/image", diff --git a/pkg/kubelet/config/http_test.go b/pkg/kubelet/config/http_test.go index 08fafe92880c8..4279609414ad7 100644 --- a/pkg/kubelet/config/http_test.go +++ b/pkg/kubelet/config/http_test.go @@ -123,6 +123,7 @@ func TestExtractInvalidPods(t *testing.T) { func TestExtractPodsFromHTTP(t *testing.T) { hostname := "different-value" + grace := int64(30) var testCases = []struct { desc string pods runtime.Object @@ -156,9 +157,11 @@ func TestExtractPodsFromHTTP(t *testing.T) { SelfLink: getSelfLink("foo-"+hostname, "mynamespace"), }, Spec: api.PodSpec{ - NodeName: hostname, - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + NodeName: hostname, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, + Containers: []api.Container{{ Name: "1", Image: "foo", @@ -209,9 +212,11 @@ func TestExtractPodsFromHTTP(t *testing.T) { SelfLink: getSelfLink("foo-"+hostname, kubelet.NamespaceDefault), }, Spec: api.PodSpec{ - NodeName: hostname, - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: 
api.DNSClusterFirst, + NodeName: hostname, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, + Containers: []api.Container{{ Name: "1", Image: "foo", @@ -229,9 +234,11 @@ func TestExtractPodsFromHTTP(t *testing.T) { SelfLink: getSelfLink("bar-"+hostname, kubelet.NamespaceDefault), }, Spec: api.PodSpec{ - NodeName: hostname, - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + NodeName: hostname, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, + Containers: []api.Container{{ Name: "2", Image: "bar", diff --git a/pkg/kubelet/container/fake_runtime.go b/pkg/kubelet/container/fake_runtime.go index 03d65e50a3394..324d7ae6bf3ad 100644 --- a/pkg/kubelet/container/fake_runtime.go +++ b/pkg/kubelet/container/fake_runtime.go @@ -163,13 +163,13 @@ func (f *FakeRuntime) SyncPod(pod *api.Pod, _ Pod, _ api.PodStatus, _ []api.Secr return f.Err } -func (f *FakeRuntime) KillPod(pod Pod) error { +func (f *FakeRuntime) KillPod(pod *api.Pod, runningPod Pod) error { f.Lock() defer f.Unlock() f.CalledFunctions = append(f.CalledFunctions, "KillPod") - f.KilledPods = append(f.KilledPods, string(pod.ID)) - for _, c := range pod.Containers { + f.KilledPods = append(f.KilledPods, string(runningPod.ID)) + for _, c := range runningPod.Containers { f.KilledContainers = append(f.KilledContainers, c.Name) } return f.Err diff --git a/pkg/kubelet/container/runtime.go b/pkg/kubelet/container/runtime.go index d50f021f6a5d5..8d476f9b16889 100644 --- a/pkg/kubelet/container/runtime.go +++ b/pkg/kubelet/container/runtime.go @@ -54,8 +54,8 @@ type Runtime interface { GetPods(all bool) ([]*Pod, error) // Syncs the running pod into the desired pod. SyncPod(pod *api.Pod, runningPod Pod, podStatus api.PodStatus, pullSecrets []api.Secret) error - // KillPod kills all the containers of a pod. - KillPod(pod Pod) error + // KillPod kills all the containers of a pod. Pod may be nil, running pod must not be. + KillPod(pod *api.Pod, runningPod Pod) error // GetPodStatus retrieves the status of the pod, including the information of // all containers in the pod. Clients of this interface assume the containers // statuses in a pod always have a deterministic ordering (eg: sorted by name). diff --git a/pkg/kubelet/dockertools/manager.go b/pkg/kubelet/dockertools/manager.go index b845191a8248b..3de60eb192613 100644 --- a/pkg/kubelet/dockertools/manager.go +++ b/pkg/kubelet/dockertools/manager.go @@ -56,12 +56,21 @@ import ( const ( maxReasonCacheEntries = 200 - kubernetesPodLabel = "io.kubernetes.pod.data" - kubernetesContainerLabel = "io.kubernetes.container.name" // ndots specifies the minimum number of dots that a domain name must contain for the resolver to consider it as FQDN (fully-qualified) // we want to able to consider SRV lookup names like _dns._udp.kube-dns.default.svc to be considered relative. // hence, setting ndots to be 5. ndotsDNSOption = "options ndots:5\n" + // In order to avoid unnecessary SIGKILLs, give every container a minimum grace + // period after SIGTERM. Docker will guarantee the termination, but SIGTERM is + // potentially dangerous. + // TODO: evaluate whether there are scenarios in which SIGKILL is preferable to + // SIGTERM for certain process types, which may justify setting this to 0. 
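// The minimum-grace-period constant introduced here feeds Docker's stop
// timeout. docker stop sends SIGTERM immediately and SIGKILL only after the
// timeout elapses, so the 2-second floor means even a --grace-period=0 delete
// gives the process a brief window to exit cleanly. Illustrative
// go-dockerclient call (client and containerID are assumed to exist):
// SIGTERM now; SIGKILL after 2 seconds if the process is still running.
if err := client.StopContainer(containerID, 2); err != nil {
	log.Printf("stop failed: %v", err)
}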
+ minimumGracePeriodInSeconds = 2 + + kubernetesNameLabel = "io.kubernetes.pod.name" + kubernetesPodLabel = "io.kubernetes.pod.data" + kubernetesTerminationGracePeriodLabel = "io.kubernetes.pod.terminationGracePeriod" + kubernetesContainerLabel = "io.kubernetes.container.name" ) // DockerManager implements the Runtime interface. @@ -588,12 +597,19 @@ func (dm *DockerManager) runContainer( if len(containerHostname) > hostnameMaxLen { containerHostname = containerHostname[:hostnameMaxLen] } + + // Pod information is recorded on the container as labels to preserve it in the event the pod is deleted + // while the Kubelet is down and there is no information available to recover the pod. This includes + // termination information like the termination grace period and the pre stop hooks. + // TODO: keep these labels up to date if the pod changes namespacedName := types.NamespacedName{Namespace: pod.Namespace, Name: pod.Name} labels := map[string]string{ - "io.kubernetes.pod.name": namespacedName.String(), + kubernetesNameLabel: namespacedName.String(), + } + if pod.Spec.TerminationGracePeriodSeconds != nil { + labels[kubernetesTerminationGracePeriodLabel] = strconv.FormatInt(*pod.Spec.TerminationGracePeriodSeconds, 10) } if container.Lifecycle != nil && container.Lifecycle.PreStop != nil { - glog.V(1).Infof("Setting preStop hook") // TODO: This is kind of hacky, we should really just encode the bits we need. data, err := latest.Codec.Encode(pod) if err != nil { @@ -1104,40 +1120,56 @@ func (dm *DockerManager) PortForward(pod *kubecontainer.Pod, port uint16, stream } // Kills all containers in the specified pod -func (dm *DockerManager) KillPod(pod kubecontainer.Pod) error { +func (dm *DockerManager) KillPod(pod *api.Pod, runningPod kubecontainer.Pod) error { // Send the kills in parallel since they may take a long time. Len + 1 since there // can be Len errors + the networkPlugin teardown error. - errs := make(chan error, len(pod.Containers)+1) + errs := make(chan error, len(runningPod.Containers)+1) wg := sync.WaitGroup{} - var networkID types.UID - for _, container := range pod.Containers { + var ( + networkContainer *kubecontainer.Container + networkSpec *api.Container + ) + for _, container := range runningPod.Containers { wg.Add(1) go func(container *kubecontainer.Container) { defer util.HandleCrash() defer wg.Done() + var containerSpec *api.Container + if pod != nil { + for i, c := range pod.Spec.Containers { + if c.Name == container.Name { + containerSpec = &pod.Spec.Containers[i] + break + } + } + } + // TODO: Handle this without signaling the pod infra container to // adapt to the generic container runtime. if container.Name == PodInfraContainerName { // Store the container runtime for later deletion. // We do this so that PreStop handlers can run in the network namespace. 
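// The labels recorded in runContainer above are the kubelet's crash-safety net:
// if the kubelet restarts and the API copy of the pod is gone, the termination
// policy can still be recovered from the container itself. A sketch of the
// round trip using the label key from this hunk:
labels := map[string]string{}
if pod.Spec.TerminationGracePeriodSeconds != nil {
	labels[kubernetesTerminationGracePeriodLabel] = strconv.FormatInt(*pod.Spec.TerminationGracePeriodSeconds, 10)
}
// ...later, on recovery (see containerAndPodFromLabels further down):
if v, ok := labels[kubernetesTerminationGracePeriodLabel]; ok {
	if seconds, err := strconv.ParseInt(v, 10, 64); err == nil {
		stub := &api.Pod{}
		stub.DeletionGracePeriodSeconds = &seconds // used as the kill grace period
	}
}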
- networkID = container.ID + networkContainer = container + networkSpec = containerSpec return } - if err := dm.killContainer(container.ID); err != nil { - glog.Errorf("Failed to delete container: %v; Skipping pod %q", err, pod.ID) + + err := dm.killContainer(container.ID, containerSpec, pod) + if err != nil { + glog.Errorf("Failed to delete container: %v; Skipping pod %q", err, runningPod.ID) errs <- err } }(container) } wg.Wait() - if len(networkID) > 0 { - if err := dm.networkPlugin.TearDownPod(pod.Namespace, pod.Name, kubeletTypes.DockerID(networkID)); err != nil { + if networkContainer != nil { + if err := dm.networkPlugin.TearDownPod(runningPod.Namespace, runningPod.Name, kubeletTypes.DockerID(networkContainer.ID)); err != nil { glog.Errorf("Failed tearing down the infra container: %v", err) errs <- err } - if err := dm.killContainer(networkID); err != nil { - glog.Errorf("Failed to delete container: %v; Skipping pod %q", err, pod.ID) + if err := dm.killContainer(networkContainer.ID, networkSpec, pod); err != nil { + glog.Errorf("Failed to delete container: %v; Skipping pod %q", err, runningPod.ID) errs <- err } } @@ -1152,70 +1184,81 @@ func (dm *DockerManager) KillPod(pod kubecontainer.Pod) error { return nil } -// KillContainerInPod kills a container in the pod. -func (dm *DockerManager) KillContainerInPod(container api.Container, pod *api.Pod) error { - // Locate the container. - pods, err := dm.GetPods(false) - if err != nil { - return err - } - targetPod := kubecontainer.Pods(pods).FindPod(kubecontainer.GetPodFullName(pod), pod.UID) - targetContainer := targetPod.FindContainerByName(container.Name) - if targetContainer == nil { - return fmt.Errorf("unable to find container %q in pod %q", container.Name, targetPod.Name) - } - return dm.killContainer(targetContainer.ID) -} +// KillContainerInPod kills a container in the pod. It must be passed either a container ID or a container and pod, +// and will attempt to lookup the other information if missing. +func (dm *DockerManager) KillContainerInPod(containerID types.UID, container *api.Container, pod *api.Pod) error { + switch { + case len(containerID) == 0: + // Locate the container. + pods, err := dm.GetPods(false) + if err != nil { + return err + } + targetPod := kubecontainer.Pods(pods).FindPod(kubecontainer.GetPodFullName(pod), pod.UID) + targetContainer := targetPod.FindContainerByName(container.Name) + if targetContainer == nil { + return fmt.Errorf("unable to find container %q in pod %q", container.Name, targetPod.Name) + } + containerID = targetContainer.ID -// TODO(vmarmol): Unexport this as it is no longer used externally. -// KillContainer kills a container identified by containerID. -// Internally, it invokes docker's StopContainer API with a timeout of 10s. -// TODO: Deprecate this function in favor of KillContainerInPod. 
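// KillPod above preserves a specific shutdown order: user containers are
// stopped first (in parallel), while the pod infra container is held back so
// PreStop hooks can still reach the pod's network namespace; only then is the
// network torn down and the infra container killed. A simplified sequential
// sketch of that ordering (the real code fans out with a WaitGroup; matchingSpec
// is a hypothetical lookup of the api.Container by name):
var infra *kubecontainer.Container
for _, c := range runningPod.Containers {
	if c.Name == PodInfraContainerName {
		infra = c // defer: PreStop hooks may still need the network
		continue
	}
	_ = dm.killContainer(c.ID, matchingSpec(pod, c.Name), pod)
}
if infra != nil {
	_ = dm.networkPlugin.TearDownPod(runningPod.Namespace, runningPod.Name, kubeletTypes.DockerID(infra.ID))
	_ = dm.killContainer(infra.ID, nil, pod)
}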
-func (dm *DockerManager) KillContainer(containerID types.UID) error { - return dm.killContainer(containerID) + case container == nil || pod == nil: + // Read information about the container from labels + inspect, err := dm.client.InspectContainer(string(containerID)) + if err != nil { + return err + } + storedPod, storedContainer, cerr := containerAndPodFromLabels(inspect) + if cerr != nil { + glog.Errorf("unable to access pod data from container: %v", cerr) + } + if container == nil { + container = storedContainer + } + if pod == nil { + pod = storedPod + } + } + return dm.killContainer(containerID, container, pod) } -func (dm *DockerManager) killContainer(containerID types.UID) error { +// killContainer accepts a containerID and an optional container or pod containing shutdown policies. Invoke +// KillContainerInPod if information must be retrieved first. +func (dm *DockerManager) killContainer(containerID types.UID, container *api.Container, pod *api.Pod) error { ID := string(containerID) - glog.V(2).Infof("Killing container with id %q", ID) - inspect, err := dm.client.InspectContainer(ID) - if err != nil { - return err + name := ID + if container != nil { + name = fmt.Sprintf("%s %s", name, container.Name) } - var found bool - var preStop string - if inspect != nil && inspect.Config != nil && inspect.Config.Labels != nil { - preStop, found = inspect.Config.Labels[kubernetesPodLabel] + if pod != nil { + name = fmt.Sprintf("%s %s/%s", name, pod.Namespace, pod.Name) } - if found { - var pod api.Pod - err := latest.Codec.DecodeInto([]byte(preStop), &pod) - if err != nil { - glog.Errorf("Failed to decode prestop: %s, %s", preStop, ID) - } else { - name := inspect.Config.Labels[kubernetesContainerLabel] - var container *api.Container - for ix := range pod.Spec.Containers { - if pod.Spec.Containers[ix].Name == name { - container = &pod.Spec.Containers[ix] - break - } - } - if container != nil { - glog.V(1).Infof("Running preStop hook") - if err := dm.runner.Run(ID, &pod, container, container.Lifecycle.PreStop); err != nil { - glog.Errorf("failed to run preStop hook: %v", err) - } - } else { - glog.Errorf("unable to find container %v, %s", pod, name) - } + + gracePeriod := int64(minimumGracePeriodInSeconds) + if pod != nil && pod.DeletionGracePeriodSeconds != nil { + gracePeriod = *pod.DeletionGracePeriodSeconds + } + glog.V(2).Infof("Killing container %q with %d second grace period", name, gracePeriod) + + if pod != nil && container != nil && container.Lifecycle != nil && container.Lifecycle.PreStop != nil { + glog.V(4).Infof("Running preStop hook for container %q", name) + start := util.Now() + // TODO: timebox PreStop execution to at most gracePeriod + if err := dm.runner.Run(ID, pod, container, container.Lifecycle.PreStop); err != nil { + glog.Errorf("preStop hook for container %q failed: %v", name, err) } + gracePeriod -= int64(util.Now().Sub(start.Time).Seconds()) } + dm.readinessManager.RemoveReadiness(ID) - err = dm.client.StopContainer(ID, 10) + + // always give containers a minimal shutdown window to avoid unnecessary SIGKILLs + if gracePeriod < minimumGracePeriodInSeconds { + gracePeriod = minimumGracePeriodInSeconds + } + err := dm.client.StopContainer(ID, uint(gracePeriod)) ref, ok := dm.containerRefManager.GetRef(ID) if !ok { - glog.Warningf("No ref for pod '%v'", ID) + glog.Warningf("No ref for pod %q", name) } else { // TODO: pass reason down here, and state, or move this call up the stack.
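// The arithmetic in killContainer above treats the grace period as a single
// budget: the PreStop hook spends from the same window that docker stop will
// use for SIGTERM-to-SIGKILL, and the remainder is clamped to the 2-second
// floor. Worked example: a pod with a 30s grace period whose PreStop hook takes
// 12s leaves StopContainer an 18s timeout; a hook that takes 29s leaves 1s,
// which the floor raises back to 2. Sketch (client/containerID assumed):
gracePeriod := int64(30)
start := time.Now()
runPreStopHook() // hypothetical stand-in for dm.runner.Run(...)
gracePeriod -= int64(time.Since(start).Seconds())
if gracePeriod < minimumGracePeriodInSeconds {
	gracePeriod = minimumGracePeriodInSeconds
}
err := client.StopContainer(containerID, uint(gracePeriod))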
@@ -1223,6 +1266,48 @@ func (dm *DockerManager) killContainer(containerID types.UID) error {
 	return err
 }
 
+var errNoPodOnContainer = fmt.Errorf("no pod information labels on Docker container")
+
+// containerAndPodFromLabels tries to load the appropriate container info off of a Docker container's labels
+func containerAndPodFromLabels(inspect *docker.Container) (pod *api.Pod, container *api.Container, err error) {
+	if inspect == nil || inspect.Config == nil || inspect.Config.Labels == nil {
+		return nil, nil, errNoPodOnContainer
+	}
+	labels := inspect.Config.Labels
+
+	// the pod data may not be set
+	if body, found := labels[kubernetesPodLabel]; found {
+		pod = &api.Pod{}
+		if err = latest.Codec.DecodeInto([]byte(body), pod); err == nil {
+			name := labels[kubernetesContainerLabel]
+			for ix := range pod.Spec.Containers {
+				if pod.Spec.Containers[ix].Name == name {
+					container = &pod.Spec.Containers[ix]
+					break
+				}
+			}
+			if container == nil {
+				err = fmt.Errorf("unable to find container %s in pod %v", name, pod)
+			}
+		} else {
+			pod = nil
+		}
+	}
+
+	// attempt to find the default grace period if we couldn't decode a full pod
+	// but the generic metadata label (the one used by kill) was set
+	if pod == nil {
+		if period, ok := labels[kubernetesTerminationGracePeriodLabel]; ok {
+			if seconds, err := strconv.ParseInt(period, 10, 64); err == nil {
+				pod = &api.Pod{}
+				pod.DeletionGracePeriodSeconds = &seconds
+			}
+		}
+	}
+
+	return
+}
+
 // Run a single container from a pod. Returns the docker container ID
 func (dm *DockerManager) runContainerInPod(pod *api.Pod, container *api.Container, netMode, ipcMode string) (kubeletTypes.DockerID, error) {
 	start := time.Now()
@@ -1253,7 +1338,7 @@ func (dm *DockerManager) runContainerInPod(pod *api.Pod, container *api.Containe
 	if container.Lifecycle != nil && container.Lifecycle.PostStart != nil {
 		handlerErr := dm.runner.Run(id, pod, container, container.Lifecycle.PostStart)
 		if handlerErr != nil {
-			dm.killContainer(types.UID(id))
+			dm.killContainer(types.UID(id), container, pod)
 			return kubeletTypes.DockerID(""), fmt.Errorf("failed to call event handler: %v", handlerErr)
 		}
 	}
@@ -1413,6 +1498,11 @@ func (dm *DockerManager) computePodContainerChanges(pod *api.Pod, runningPod kub
 	containersToKeep := make(map[kubeletTypes.DockerID]int)
 	createPodInfraContainer := false
 
+	if pod.DeletionTimestamp != nil {
+		glog.V(4).Infof("Pod is terminating %q", podFullName)
+		return PodContainerChangesSpec{}, nil
+	}
+
 	var err error
 	var podInfraContainerID kubeletTypes.DockerID
 	var changed bool
@@ -1547,7 +1637,7 @@ func (dm *DockerManager) SyncPod(pod *api.Pod, runningPod kubecontainer.Pod, pod
 	}
 
 	// Killing phase: if we want to start new infra container, or nothing is running kill everything (including infra container)
-	err = dm.KillPod(runningPod)
+	err = dm.KillPod(pod, runningPod)
 	if err != nil {
 		return err
 	}
@@ -1557,7 +1647,15 @@ func (dm *DockerManager) SyncPod(pod *api.Pod, runningPod kubecontainer.Pod, pod
 		_, keep := containerChanges.ContainersToKeep[kubeletTypes.DockerID(container.ID)]
 		if !keep {
 			glog.V(3).Infof("Killing unwanted container %+v", container)
-			err = dm.KillContainer(container.ID)
+			// attempt to find the appropriate container policy
+			var podContainer *api.Container
+			for i, c := range pod.Spec.Containers {
+				if c.Name == container.Name {
+					podContainer = &pod.Spec.Containers[i]
+					break
+				}
+			}
+			err = dm.KillContainerInPod(container.ID, podContainer, pod)
 			if err != nil {
 				glog.Errorf("Error killing container: %v", err)
 			}
diff --git a/pkg/kubelet/dockertools/manager_test.go b/pkg/kubelet/dockertools/manager_test.go
index 137988c0ae586..0da1ca938b16e 100644
--- a/pkg/kubelet/dockertools/manager_test.go
+++ b/pkg/kubelet/dockertools/manager_test.go
@@ -405,7 +405,7 @@ func TestKillContainerInPod(t *testing.T) {
 		manager.readinessManager.SetReadiness(c.ID, true)
 	}
 
-	if err := manager.KillContainerInPod(pod.Spec.Containers[0], pod); err != nil {
+	if err := manager.KillContainerInPod("", &pod.Spec.Containers[0], pod); err != nil {
 		t.Errorf("unexpected error: %v", err)
 	}
 	// Assert the container has been stopped.
@@ -478,14 +478,14 @@ func TestKillContainerInPodWithPreStop(t *testing.T) {
 		manager.readinessManager.SetReadiness(c.ID, true)
 	}
 
-	if err := manager.KillContainerInPod(pod.Spec.Containers[0], pod); err != nil {
+	if err := manager.KillContainerInPod("", &pod.Spec.Containers[0], pod); err != nil {
 		t.Errorf("unexpected error: %v", err)
 	}
 	// Assert the container has been stopped.
 	if err := fakeDocker.AssertStopped([]string{containerToKill.ID}); err != nil {
 		t.Errorf("container was not stopped correctly: %v", err)
 	}
-	verifyCalls(t, fakeDocker, []string{"list", "inspect_container", "create_exec", "start_exec", "stop"})
+	verifyCalls(t, fakeDocker, []string{"list", "create_exec", "start_exec", "stop"})
 	if !reflect.DeepEqual(expectedCmd, fakeDocker.execCmd) {
 		t.Errorf("expected: %v, got %v", expectedCmd, fakeDocker.execCmd)
 	}
@@ -522,7 +522,7 @@ func TestKillContainerInPodWithError(t *testing.T) {
 		manager.readinessManager.SetReadiness(c.ID, true)
 	}
 
-	if err := manager.KillContainerInPod(pod.Spec.Containers[0], pod); err == nil {
+	if err := manager.KillContainerInPod("", &pod.Spec.Containers[0], pod); err == nil {
 		t.Errorf("expected error, found nil")
 	}
@@ -1021,7 +1021,7 @@ func TestSyncPodDeletesWithNoPodInfraContainer(t *testing.T) {
 	verifyCalls(t, fakeDocker, []string{
 		// Kill the container since pod infra container is not running.
-		"inspect_container", "stop",
+		"stop",
 		// Create pod infra container.
 		"create", "start", "inspect_container",
 		// Create container.
@@ -1096,7 +1096,7 @@ func TestSyncPodDeletesDuplicate(t *testing.T) {
 		// Check the pod infra container.
 		"inspect_container",
 		// Kill the duplicated container.
-		"inspect_container", "stop",
+		"stop",
 	})
 	// Expect one of the duplicates to be killed.
 	if len(fakeDocker.Stopped) != 1 || (fakeDocker.Stopped[0] != "1234" && fakeDocker.Stopped[0] != "4567") {
@@ -1150,7 +1150,7 @@ func TestSyncPodBadHash(t *testing.T) {
 		// Check the pod infra container.
 		"inspect_container",
 		// Kill and restart the bad hash container.
-		"inspect_container", "stop", "create", "start", "inspect_container",
+		"stop", "create", "start",
 	})
 
 	if err := fakeDocker.AssertStopped([]string{"1234"}); err != nil {
@@ -1208,7 +1208,7 @@ func TestSyncPodsUnhealthy(t *testing.T) {
 		// Check the pod infra container.
 		"inspect_container",
 		// Kill the unhealthy container.
-		"inspect_container", "stop",
+		"stop",
 		// Restart the unhealthy container.
 		"create", "start", "inspect_container",
 	})
@@ -1443,7 +1443,7 @@ func TestSyncPodWithRestartPolicy(t *testing.T) {
 			// Check the pod infra container.
 			"inspect_container",
 			// Stop the last pod infra container.
-			"inspect_container", "stop",
+			"stop",
 		},
 		[]string{},
 		[]string{"9876"},
@@ -1910,7 +1910,7 @@ func TestSyncPodEventHandlerFails(t *testing.T) {
 		// Create the container.
 		"create", "start",
 		// Kill the container since event handler fails.
- "inspect_container", "stop", + "stop", }) // TODO(yifan): Check the stopped container's name. diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index 3fd7c83f45d67..5e5a094c249f0 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -1120,8 +1120,8 @@ func parseResolvConf(reader io.Reader) (nameservers []string, searches []string, } // Kill all running containers in a pod (includes the pod infra container). -func (kl *Kubelet) killPod(pod kubecontainer.Pod) error { - return kl.containerRuntime.KillPod(pod) +func (kl *Kubelet) killPod(pod *api.Pod, runningPod kubecontainer.Pod) error { + return kl.containerRuntime.KillPod(pod, runningPod) } type empty struct{} @@ -1179,7 +1179,7 @@ func (kl *Kubelet) syncPod(pod *api.Pod, mirrorPod *api.Pod, runningPod kubecont // Kill pods we can't run. err := canRunPod(pod) if err != nil { - kl.killPod(runningPod) + kl.killPod(pod, runningPod) return err } @@ -1550,7 +1550,7 @@ func (kl *Kubelet) killUnwantedPods(desiredPods map[types.UID]empty, }() glog.V(1).Infof("Killing unwanted pod %q", pod.Name) // Stop the containers. - err = kl.killPod(*pod) + err = kl.killPod(nil, *pod) if err != nil { glog.Errorf("Failed killing the pod %q: %v", pod.Name, err) return diff --git a/pkg/kubelet/rkt/rkt.go b/pkg/kubelet/rkt/rkt.go index 92d718636b7e2..bb9ef9147b583 100644 --- a/pkg/kubelet/rkt/rkt.go +++ b/pkg/kubelet/rkt/rkt.go @@ -676,11 +676,11 @@ func (r *runtime) GetPods(all bool) ([]*kubecontainer.Pod, error) { } // KillPod invokes 'systemctl kill' to kill the unit that runs the pod. -func (r *runtime) KillPod(pod kubecontainer.Pod) error { - glog.V(4).Infof("Rkt is killing pod: name %q.", pod.Name) +func (r *runtime) KillPod(pod *api.Pod, runningPod kubecontainer.Pod) error { + glog.V(4).Infof("Rkt is killing pod: name %q.", runningPod.Name) // TODO(yifan): More graceful stop. Replace with StopUnit and wait for a timeout. - r.systemd.KillUnit(makePodServiceFileName(pod.ID), int32(syscall.SIGKILL)) + r.systemd.KillUnit(makePodServiceFileName(runningPod.ID), int32(syscall.SIGKILL)) return r.systemd.Reload() } @@ -887,7 +887,7 @@ func (r *runtime) SyncPod(pod *api.Pod, runningPod kubecontainer.Pod, podStatus if restartPod { // TODO(yifan): Handle network plugin. - if err := r.KillPod(runningPod); err != nil { + if err := r.KillPod(pod, runningPod); err != nil { return err } if err := r.RunPod(pod); err != nil { diff --git a/pkg/kubelet/status_manager.go b/pkg/kubelet/status_manager.go index 9f7f45e1b0da4..5e9c753f9faf5 100644 --- a/pkg/kubelet/status_manager.go +++ b/pkg/kubelet/status_manager.go @@ -24,6 +24,7 @@ import ( "github.com/golang/glog" "k8s.io/kubernetes/pkg/api" + "k8s.io/kubernetes/pkg/api/errors" "k8s.io/kubernetes/pkg/client" kubecontainer "k8s.io/kubernetes/pkg/kubelet/container" kubeletTypes "k8s.io/kubernetes/pkg/kubelet/types" @@ -161,13 +162,24 @@ func (s *statusManager) syncBatch() error { } // TODO: make me easier to express from client code statusPod, err = s.kubeClient.Pods(statusPod.Namespace).Get(statusPod.Name) + if errors.IsNotFound(err) { + glog.V(3).Infof("Pod %q was deleted on the server", pod.Name) + return nil + } if err == nil { statusPod.Status = status - _, err = s.kubeClient.Pods(pod.Namespace).UpdateStatus(statusPod) // TODO: handle conflict as a retry, make that easier too. 
+ statusPod, err = s.kubeClient.Pods(pod.Namespace).UpdateStatus(statusPod) if err == nil { glog.V(3).Infof("Status for pod %q updated successfully", kubeletUtil.FormatPodName(pod)) - return nil + + if statusPod.DeletionTimestamp == nil || !allTerminated(statusPod.Status.ContainerStatuses) { + return nil + } + if err := s.kubeClient.Pods(statusPod.Namespace).Delete(statusPod.Name, api.NewDeleteOptions(0)); err == nil { + glog.V(3).Infof("Pod %q fully terminated and removed from etcd", statusPod.Name) + return nil + } } } @@ -181,3 +193,14 @@ func (s *statusManager) syncBatch() error { go s.DeletePodStatus(podFullName) return fmt.Errorf("error updating status for pod %q: %v", kubeletUtil.FormatPodName(pod), err) } + +// allTerminated returns true if every status is terminated, or the status list +// is empty. +func allTerminated(statuses []api.ContainerStatus) bool { + for _, status := range statuses { + if status.State.Terminated == nil { + return false + } + } + return true +} diff --git a/pkg/registry/controller/etcd/etcd_test.go b/pkg/registry/controller/etcd/etcd_test.go index c826400d35e22..fddea31d1665a 100644 --- a/pkg/registry/controller/etcd/etcd_test.go +++ b/pkg/registry/controller/etcd/etcd_test.go @@ -695,7 +695,7 @@ func TestDelete(t *testing.T) { // If the controller is still around after trying to delete either the delete // failed, or we're deleting it gracefully. if fakeClient.Data[key].R.Node != nil { - return true + return fakeClient.Data[key].R.Node.TTL != 0 } return false } diff --git a/pkg/registry/event/etcd/etcd_test.go b/pkg/registry/event/etcd/etcd_test.go index bf3504303d3e4..6d7403368f444 100644 --- a/pkg/registry/event/etcd/etcd_test.go +++ b/pkg/registry/event/etcd/etcd_test.go @@ -37,6 +37,7 @@ var testTTL uint64 = 60 func NewTestEventStorage(t *testing.T) (*tools.FakeEtcdClient, *REST) { f := tools.NewFakeEtcdClient(t) + f.HideExpires = true f.TestIndex = true s := etcdstorage.NewEtcdStorage(f, testapi.Codec(), etcdtest.PathPrefix()) diff --git a/pkg/registry/persistentvolume/etcd/etcd_test.go b/pkg/registry/persistentvolume/etcd/etcd_test.go index 07da7b165b8d3..c7b31488d06ae 100644 --- a/pkg/registry/persistentvolume/etcd/etcd_test.go +++ b/pkg/registry/persistentvolume/etcd/etcd_test.go @@ -300,7 +300,7 @@ func TestEtcdUpdateStatus(t *testing.T) { key, _ := storage.KeyFunc(ctx, "foo") key = etcdtest.AddPrefix(key) pvStart := validNewPersistentVolume("foo") - fakeClient.Set(key, runtime.EncodeOrDie(latest.Codec, pvStart), 1) + fakeClient.Set(key, runtime.EncodeOrDie(latest.Codec, pvStart), 0) pvIn := &api.PersistentVolume{ ObjectMeta: api.ObjectMeta{ diff --git a/pkg/registry/persistentvolumeclaim/etcd/etcd_test.go b/pkg/registry/persistentvolumeclaim/etcd/etcd_test.go index e6fecd389e516..c36d0e01e2bf4 100644 --- a/pkg/registry/persistentvolumeclaim/etcd/etcd_test.go +++ b/pkg/registry/persistentvolumeclaim/etcd/etcd_test.go @@ -298,7 +298,7 @@ func TestEtcdUpdateStatus(t *testing.T) { key, _ := storage.KeyFunc(ctx, "foo") key = etcdtest.AddPrefix(key) pvcStart := validNewPersistentVolumeClaim("foo", api.NamespaceDefault) - fakeClient.Set(key, runtime.EncodeOrDie(latest.Codec, pvcStart), 1) + fakeClient.Set(key, runtime.EncodeOrDie(latest.Codec, pvcStart), 0) pvc := &api.PersistentVolumeClaim{ ObjectMeta: api.ObjectMeta{ diff --git a/pkg/registry/pod/etcd/etcd_test.go b/pkg/registry/pod/etcd/etcd_test.go index e05f3065e9f61..1bd879b2011e9 100644 --- a/pkg/registry/pod/etcd/etcd_test.go +++ b/pkg/registry/pod/etcd/etcd_test.go @@ -56,6 +56,7 @@ func 
newStorage(t *testing.T) (*REST, *BindingREST, *StatusREST, *tools.FakeEtcd } func validNewPod() *api.Pod { + grace := int64(30) return &api.Pod{ ObjectMeta: api.ObjectMeta{ Name: "foo", @@ -64,6 +65,8 @@ func validNewPod() *api.Pod { Spec: api.PodSpec{ RestartPolicy: api.RestartPolicyAlways, DNSPolicy: api.DNSClusterFirst, + + TerminationGracePeriodSeconds: &grace, Containers: []api.Container{ { Name: "foo", @@ -134,9 +137,9 @@ func TestDelete(t *testing.T) { if fakeEtcdClient.Data[key].R.Node == nil { return false } - return fakeEtcdClient.Data[key].R.Node.TTL == 30 + return fakeEtcdClient.Data[key].R.Node.TTL != 0 } - test.TestDelete(createFn, gracefulSetFn) + test.TestDeleteGraceful(createFn, 30, gracefulSetFn) } func expectPod(t *testing.T, out runtime.Object) (*api.Pod, bool) { @@ -1027,6 +1030,7 @@ func TestEtcdUpdateScheduled(t *testing.T) { }, }), 1) + grace := int64(30) podIn := api.Pod{ ObjectMeta: api.ObjectMeta{ Name: "foo", @@ -1048,6 +1052,8 @@ func TestEtcdUpdateScheduled(t *testing.T) { }, RestartPolicy: api.RestartPolicyAlways, DNSPolicy: api.DNSClusterFirst, + + TerminationGracePeriodSeconds: &grace, }, } _, _, err := registry.Update(ctx, &podIn) @@ -1088,7 +1094,7 @@ func TestEtcdUpdateStatus(t *testing.T) { }, }, } - fakeClient.Set(key, runtime.EncodeOrDie(latest.Codec, &podStart), 1) + fakeClient.Set(key, runtime.EncodeOrDie(latest.Codec, &podStart), 0) podIn := api.Pod{ ObjectMeta: api.ObjectMeta{ @@ -1117,6 +1123,8 @@ func TestEtcdUpdateStatus(t *testing.T) { expected := podStart expected.ResourceVersion = "2" + grace := int64(30) + expected.Spec.TerminationGracePeriodSeconds = &grace expected.Spec.RestartPolicy = api.RestartPolicyAlways expected.Spec.DNSPolicy = api.DNSClusterFirst expected.Spec.Containers[0].ImagePullPolicy = api.PullIfNotPresent diff --git a/pkg/registry/pod/rest.go b/pkg/registry/pod/rest.go index e62d7ab27a76d..43135278d5049 100644 --- a/pkg/registry/pod/rest.go +++ b/pkg/registry/pod/rest.go @@ -81,13 +81,31 @@ func (podStrategy) ValidateUpdate(ctx api.Context, obj, old runtime.Object) fiel return append(errorList, validation.ValidatePodUpdate(obj.(*api.Pod), old.(*api.Pod))...) } +// AllowUnconditionalUpdate allows pods to be overwritten func (podStrategy) AllowUnconditionalUpdate() bool { return true } -// CheckGracefulDelete allows a pod to be gracefully deleted. +// CheckGracefulDelete allows a pod to be gracefully deleted. It updates the DeleteOptions to +// reflect the desired grace value. 
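Before the function itself (next line), a sketch of the precedence it encodes: an explicit DeleteOptions value wins, otherwise the pod's spec default applies, otherwise deletion is immediate. The helper and the bare *int64 parameters are hypothetical stand-ins for options.GracePeriodSeconds and pod.Spec.TerminationGracePeriodSeconds:

    package main

    import "fmt"

    func effectiveGrace(optionsGrace, specGrace *int64) int64 {
    	if optionsGrace != nil {
    		return *optionsGrace // the caller asked for a specific period
    	}
    	if specGrace != nil {
    		return *specGrace // fall back to the pod's declared default
    	}
    	return 0 // no preference anywhere: delete immediately
    }

    func main() {
    	spec, zero := int64(30), int64(0)
    	fmt.Println(effectiveGrace(nil, &spec))   // 30
    	fmt.Println(effectiveGrace(&zero, &spec)) // 0: an explicit zero wins
    }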
func (podStrategy) CheckGracefulDelete(obj runtime.Object, options *api.DeleteOptions) bool { - return false + if options == nil { + return false + } + period := int64(0) + // user has specified a value + if options.GracePeriodSeconds != nil { + period = *options.GracePeriodSeconds + } else { + // use the default value if set, or deletes the pod immediately (0) + pod := obj.(*api.Pod) + if pod.Spec.TerminationGracePeriodSeconds != nil { + period = *pod.Spec.TerminationGracePeriodSeconds + } + } + // ensure the options and the pod are in sync + options.GracePeriodSeconds = &period + return true } type podStatusStrategy struct { @@ -100,6 +118,7 @@ func (podStatusStrategy) PrepareForUpdate(obj, old runtime.Object) { newPod := obj.(*api.Pod) oldPod := old.(*api.Pod) newPod.Spec = oldPod.Spec + newPod.DeletionTimestamp = nil } func (podStatusStrategy) ValidateUpdate(ctx api.Context, obj, old runtime.Object) fielderrors.ValidationErrorList { diff --git a/pkg/registry/resourcequota/etcd/etcd_test.go b/pkg/registry/resourcequota/etcd/etcd_test.go index 05198ebdf9371..d99ae1f091025 100644 --- a/pkg/registry/resourcequota/etcd/etcd_test.go +++ b/pkg/registry/resourcequota/etcd/etcd_test.go @@ -389,7 +389,7 @@ func TestEtcdUpdateStatus(t *testing.T) { key, _ := registry.KeyFunc(ctx, "foo") key = etcdtest.AddPrefix(key) resourcequotaStart := validNewResourceQuota() - fakeClient.Set(key, runtime.EncodeOrDie(latest.Codec, resourcequotaStart), 1) + fakeClient.Set(key, runtime.EncodeOrDie(latest.Codec, resourcequotaStart), 0) resourcequotaIn := &api.ResourceQuota{ ObjectMeta: api.ObjectMeta{ diff --git a/pkg/storage/etcd/etcd_helper.go b/pkg/storage/etcd/etcd_helper.go index 09efa9819c1be..f4a07472355ad 100644 --- a/pkg/storage/etcd/etcd_helper.go +++ b/pkg/storage/etcd/etcd_helper.go @@ -394,14 +394,21 @@ func (h *etcdHelper) GuaranteedUpdate(key string, ptrToType runtime.Object, igno ttl := uint64(0) if node != nil { index = node.ModifiedIndex - if node.TTL > 0 { + if node.TTL != 0 { ttl = uint64(node.TTL) } + if node.Expiration != nil && ttl == 0 { + ttl = 1 + } } else if res != nil { index = res.EtcdIndex } if newTTL != nil { + if ttl != 0 && *newTTL == 0 { + // TODO: remove this after we have verified this is no longer an issue + glog.V(4).Infof("GuaranteedUpdate is clearing TTL for %q, may not be intentional", key) + } ttl = *newTTL } diff --git a/pkg/storage/etcd/etcd_helper_test.go b/pkg/storage/etcd/etcd_helper_test.go index 4cb6ef73d3a6b..c84b66f808d28 100644 --- a/pkg/storage/etcd/etcd_helper_test.go +++ b/pkg/storage/etcd/etcd_helper_test.go @@ -123,28 +123,32 @@ func TestList(t *testing.T) { }, }, } + grace := int64(30) expect := api.PodList{ ListMeta: api.ListMeta{ResourceVersion: "10"}, Items: []api.Pod{ { ObjectMeta: api.ObjectMeta{Name: "bar", ResourceVersion: "2"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, { ObjectMeta: api.ObjectMeta{Name: "baz", ResourceVersion: "3"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, { ObjectMeta: api.ObjectMeta{Name: "foo", ResourceVersion: "1"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: 
api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, }, @@ -206,6 +210,7 @@ func TestListAcrossDirectories(t *testing.T) { }, }, } + grace := int64(30) expect := api.PodList{ ListMeta: api.ListMeta{ResourceVersion: "10"}, Items: []api.Pod{ @@ -213,22 +218,25 @@ func TestListAcrossDirectories(t *testing.T) { { ObjectMeta: api.ObjectMeta{Name: "baz", ResourceVersion: "3"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, { ObjectMeta: api.ObjectMeta{Name: "foo", ResourceVersion: "1"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, { ObjectMeta: api.ObjectMeta{Name: "bar", ResourceVersion: "2"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, }, @@ -278,28 +286,32 @@ func TestListExcludesDirectories(t *testing.T) { }, }, } + grace := int64(30) expect := api.PodList{ ListMeta: api.ListMeta{ResourceVersion: "10"}, Items: []api.Pod{ { ObjectMeta: api.ObjectMeta{Name: "bar", ResourceVersion: "2"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, { ObjectMeta: api.ObjectMeta{Name: "baz", ResourceVersion: "3"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, { ObjectMeta: api.ObjectMeta{Name: "foo", ResourceVersion: "1"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, }, }, @@ -319,11 +331,13 @@ func TestGet(t *testing.T) { fakeClient := tools.NewFakeEtcdClient(t) helper := newEtcdHelper(fakeClient, testapi.Codec(), etcdtest.PathPrefix()) key := etcdtest.AddPrefix("/some/key") + grace := int64(30) expect := api.Pod{ ObjectMeta: api.ObjectMeta{Name: "foo"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, } fakeClient.Set(key, runtime.EncodeOrDie(testapi.Codec(), &expect), 0) diff --git a/pkg/storage/etcd/etcd_watcher.go b/pkg/storage/etcd/etcd_watcher.go index 93fdd16829ed6..4ab51cabbde75 100644 --- a/pkg/storage/etcd/etcd_watcher.go +++ b/pkg/storage/etcd/etcd_watcher.go @@ -38,6 +38,7 @@ const ( EtcdSet = "set" EtcdCAS = "compareAndSwap" EtcdDelete = "delete" + EtcdExpire = "expire" ) // TransformFunc attempts to convert an object to another object for use with a watcher. 
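The hunk that follows routes etcd "expire" actions through the same branch as deletes, so a key that ages out of etcd is surfaced to watchers exactly like an explicitly removed one. A compact sketch of that dispatch; the function name and return strings are illustrative, while the action strings are the constants declared above:

    package main

    import "fmt"

    func watchEventFor(action string) string {
    	switch action {
    	case "set", "compareAndSwap":
    		return "MODIFIED"
    	case "delete", "expire":
    		return "DELETED" // TTL expiry now behaves like an explicit delete
    	default:
    		return "UNKNOWN"
    	}
    }

    func main() {
    	fmt.Println(watchEventFor("expire")) // DELETED
    }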
@@ -353,7 +354,7 @@ func (w *etcdWatcher) sendResult(res *etcd.Response) { w.sendAdd(res) case EtcdSet, EtcdCAS: w.sendModify(res) - case EtcdDelete: + case EtcdDelete, EtcdExpire: w.sendDelete(res) default: glog.Errorf("unknown action: %v", res.Action) diff --git a/pkg/tools/fake_etcd_client.go b/pkg/tools/fake_etcd_client.go index 22b8a59e763b9..87a30bd91208a 100644 --- a/pkg/tools/fake_etcd_client.go +++ b/pkg/tools/fake_etcd_client.go @@ -20,6 +20,7 @@ import ( "errors" "sort" "sync" + "time" "github.com/coreos/go-etcd/etcd" ) @@ -52,6 +53,8 @@ type FakeEtcdClient struct { TestIndex bool ChangeIndex uint64 LastSetTTL uint64 + // Will avoid setting the expires header on objects to make comparison easier + HideExpires bool Machines []string // Will become valid after Watch is called; tester may write to it. Tester may @@ -175,6 +178,11 @@ func (f *FakeEtcdClient) setLocked(key, value string, ttl uint64) (*etcd.Respons prevResult := f.Data[key] createdIndex := prevResult.R.Node.CreatedIndex f.t.Logf("updating %v, index %v -> %v (ttl: %d)", key, createdIndex, i, ttl) + var expires *time.Time + if !f.HideExpires && ttl > 0 { + now := time.Now() + expires = &now + } result := EtcdResponseWithError{ R: &etcd.Response{ Node: &etcd.Node{ @@ -182,6 +190,7 @@ func (f *FakeEtcdClient) setLocked(key, value string, ttl uint64) (*etcd.Respons CreatedIndex: createdIndex, ModifiedIndex: i, TTL: int64(ttl), + Expiration: expires, }, }, } diff --git a/plugin/pkg/scheduler/factory/factory_test.go b/plugin/pkg/scheduler/factory/factory_test.go index a5795b4e38156..22f05d186d4ad 100644 --- a/plugin/pkg/scheduler/factory/factory_test.go +++ b/plugin/pkg/scheduler/factory/factory_test.go @@ -132,11 +132,13 @@ func PriorityTwo(pod *api.Pod, podLister algorithm.PodLister, minionLister algor } func TestDefaultErrorFunc(t *testing.T) { + grace := int64(30) testPod := &api.Pod{ ObjectMeta: api.ObjectMeta{Name: "foo", Namespace: "bar"}, Spec: api.PodSpec{ - RestartPolicy: api.RestartPolicyAlways, - DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, + DNSPolicy: api.DNSClusterFirst, + TerminationGracePeriodSeconds: &grace, }, } handler := util.FakeHandler{ diff --git a/test/e2e/kubectl.go b/test/e2e/kubectl.go index 02d289ba25709..a79946bd5ef12 100644 --- a/test/e2e/kubectl.go +++ b/test/e2e/kubectl.go @@ -791,7 +791,7 @@ func getUDData(jpgExpected string, ns string) func(*client.Client, string) error if strings.Contains(data.Image, jpgExpected) { return nil } else { - return errors.New(fmt.Sprintf("data served up in container is innaccurate, %s didn't contain %s", data, jpgExpected)) + return errors.New(fmt.Sprintf("data served up in container is inaccurate, %s didn't contain %s", data, jpgExpected)) } } } diff --git a/test/e2e/pd.go b/test/e2e/pd.go index 56517247bd96c..6d22c6e2cdcae 100644 --- a/test/e2e/pd.go +++ b/test/e2e/pd.go @@ -78,8 +78,8 @@ var _ = Describe("Pod Disks", func() { By("cleaning up PD-RW test environment") // Teardown pods, PD. Ignore errors. // Teardown should do nothing unless test failed. 
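A pattern worth calling out before the e2e edits that follow: Delete(name, nil) systematically becomes Delete(name, api.NewDeleteOptions(0)). The distinction matters because nil options let the server apply the pod's default grace period, while an explicit zero forces immediate removal. Roughly, the helper only wraps the value in a pointer; a pared-down sketch with a stand-in type:

    package main

    import "fmt"

    // DeleteOptions here is a minimal stand-in for the API type; the
    // pointer is what keeps nil ("use the default") distinguishable
    // from an explicit 0 ("remove now").
    type DeleteOptions struct {
    	GracePeriodSeconds *int64
    }

    func NewDeleteOptions(grace int64) *DeleteOptions {
    	return &DeleteOptions{GracePeriodSeconds: &grace}
    }

    func main() {
    	opts := NewDeleteOptions(0)
    	fmt.Println(*opts.GracePeriodSeconds) // 0: immediate, not "default"
    }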
- podClient.Delete(host0Pod.Name, nil) - podClient.Delete(host1Pod.Name, nil) + podClient.Delete(host0Pod.Name, api.NewDeleteOptions(0)) + podClient.Delete(host1Pod.Name, api.NewDeleteOptions(0)) detachPD(host0Name, diskName) detachPD(host1Name, diskName) deletePD(diskName) @@ -98,7 +98,7 @@ var _ = Describe("Pod Disks", func() { Logf("Wrote value: %v", testFileContents) By("deleting host0Pod") - expectNoError(podClient.Delete(host0Pod.Name, nil), "Failed to delete host0Pod") + expectNoError(podClient.Delete(host0Pod.Name, api.NewDeleteOptions(0)), "Failed to delete host0Pod") By("submitting host1Pod to kubernetes") _, err = podClient.Create(host1Pod) @@ -113,7 +113,7 @@ var _ = Describe("Pod Disks", func() { Expect(strings.TrimSpace(v)).To(Equal(strings.TrimSpace(testFileContents))) By("deleting host1Pod") - expectNoError(podClient.Delete(host1Pod.Name, nil), "Failed to delete host1Pod") + expectNoError(podClient.Delete(host1Pod.Name, api.NewDeleteOptions(0)), "Failed to delete host1Pod") By(fmt.Sprintf("deleting PD %q", diskName)) deletePDWithRetry(diskName) @@ -136,9 +136,9 @@ var _ = Describe("Pod Disks", func() { By("cleaning up PD-RO test environment") // Teardown pods, PD. Ignore errors. // Teardown should do nothing unless test failed. - podClient.Delete(rwPod.Name, nil) - podClient.Delete(host0ROPod.Name, nil) - podClient.Delete(host1ROPod.Name, nil) + podClient.Delete(rwPod.Name, api.NewDeleteOptions(0)) + podClient.Delete(host0ROPod.Name, api.NewDeleteOptions(0)) + podClient.Delete(host1ROPod.Name, api.NewDeleteOptions(0)) detachPD(host0Name, diskName) detachPD(host1Name, diskName) @@ -149,7 +149,7 @@ var _ = Describe("Pod Disks", func() { _, err = podClient.Create(rwPod) expectNoError(err, "Failed to create rwPod") expectNoError(framework.WaitForPodRunning(rwPod.Name)) - expectNoError(podClient.Delete(rwPod.Name, nil), "Failed to delete host0Pod") + expectNoError(podClient.Delete(rwPod.Name, api.NewDeleteOptions(0)), "Failed to delete host0Pod") expectNoError(waitForPDDetach(diskName, host0Name)) By("submitting host0ROPod to kubernetes") @@ -165,10 +165,10 @@ var _ = Describe("Pod Disks", func() { expectNoError(framework.WaitForPodRunning(host1ROPod.Name)) By("deleting host0ROPod") - expectNoError(podClient.Delete(host0ROPod.Name, nil), "Failed to delete host0ROPod") + expectNoError(podClient.Delete(host0ROPod.Name, api.NewDeleteOptions(0)), "Failed to delete host0ROPod") By("deleting host1ROPod") - expectNoError(podClient.Delete(host1ROPod.Name, nil), "Failed to delete host1ROPod") + expectNoError(podClient.Delete(host1ROPod.Name, api.NewDeleteOptions(0)), "Failed to delete host1ROPod") By(fmt.Sprintf("deleting PD %q", diskName)) deletePDWithRetry(diskName) diff --git a/test/e2e/pods.go b/test/e2e/pods.go index 4fc7b2d70d88b..c28758a36f1dc 100644 --- a/test/e2e/pods.go +++ b/test/e2e/pods.go @@ -43,7 +43,7 @@ func runLivenessTest(c *client.Client, ns string, podDescr *api.Pod, expectResta // At the end of the test, clean up by removing the pod. defer func() { By("deleting the pod") - c.Pods(ns).Delete(podDescr.Name, nil) + c.Pods(ns).Delete(podDescr.Name, api.NewDeleteOptions(0)) }() // Wait until the pod is not pending. 
(Here we need to check for something other than @@ -86,15 +86,14 @@ func runLivenessTest(c *client.Client, ns string, podDescr *api.Pod, expectResta func testHostIP(c *client.Client, ns string, pod *api.Pod) { podClient := c.Pods(ns) By("creating pod") - defer podClient.Delete(pod.Name, nil) - _, err := podClient.Create(pod) - if err != nil { + defer podClient.Delete(pod.Name, api.NewDeleteOptions(0)) + if _, err := podClient.Create(pod); err != nil { Failf("Failed to create pod: %v", err) } By("ensuring that pod is running and has a hostIP") // Wait for the pods to enter the running state. Waiting loops until the pods // are running so non-running pods cause a timeout for this test. - err = waitForPodRunningInNamespace(c, pod.Name, ns) + err := waitForPodRunningInNamespace(c, pod.Name, ns) Expect(err).NotTo(HaveOccurred()) // Try to make sure we get a hostIP for each pod. hostIPTimeout := 2 * time.Minute @@ -222,7 +221,7 @@ var _ = Describe("Pods", func() { // We call defer here in case there is a problem with // the test so we can ensure that we clean up after // ourselves - defer podClient.Delete(pod.Name, nil) + defer podClient.Delete(pod.Name, api.NewDeleteOptions(0)) _, err = podClient.Create(pod) if err != nil { Failf("Failed to create pod: %v", err) @@ -235,7 +234,7 @@ var _ = Describe("Pods", func() { } Expect(len(pods.Items)).To(Equal(1)) - By("veryfying pod creation was observed") + By("verifying pod creation was observed") select { case event, _ := <-w.ResultChan(): if event.Type != watch.Added { @@ -245,7 +244,7 @@ var _ = Describe("Pods", func() { Fail("Timeout while waiting for pod creation") } - By("deleting the pod") + By("deleting the pod gracefully") if err := podClient.Delete(pod.Name, nil); err != nil { Failf("Failed to delete pod: %v", err) } @@ -253,11 +252,13 @@ var _ = Describe("Pods", func() { By("verifying pod deletion was observed") deleted := false timeout := false + var lastPod *api.Pod timer := time.After(podStartTimeout) for !deleted && !timeout { select { case event, _ := <-w.ResultChan(): if event.Type == watch.Deleted { + lastPod = event.Object.(*api.Pod) deleted = true } case <-timer: @@ -268,6 +269,9 @@ var _ = Describe("Pods", func() { Fail("Failed to observe pod deletion") } + Expect(lastPod.DeletionTimestamp).ToNot(BeNil()) + Expect(lastPod.Spec.TerminationGracePeriodSeconds).ToNot(BeZero()) + pods, err = podClient.List(labels.SelectorFromSet(labels.Set(map[string]string{"time": value})), fields.Everything()) if err != nil { Fail(fmt.Sprintf("Failed to list pods to verify deletion: %v", err)) @@ -312,7 +316,7 @@ var _ = Describe("Pods", func() { By("submitting the pod to kubernetes") defer func() { By("deleting the pod") - podClient.Delete(pod.Name, nil) + podClient.Delete(pod.Name, api.NewDeleteOptions(0)) }() pod, err := podClient.Create(pod) if err != nil { @@ -376,7 +380,7 @@ var _ = Describe("Pods", func() { }, }, } - defer framework.Client.Pods(framework.Namespace.Name).Delete(serverPod.Name, nil) + defer framework.Client.Pods(framework.Namespace.Name).Delete(serverPod.Name, api.NewDeleteOptions(0)) _, err := framework.Client.Pods(framework.Namespace.Name).Create(serverPod) if err != nil { Failf("Failed to create serverPod: %v", err) @@ -600,7 +604,7 @@ var _ = Describe("Pods", func() { // We call defer here in case there is a problem with // the test so we can ensure that we clean up after // ourselves - podClient.Delete(pod.Name) + podClient.Delete(pod.Name, api.NewDeleteOptions(0)) }() By("waiting for the pod to start running") @@ -673,7 
+677,7 @@ var _ = Describe("Pods", func() { // We call defer here in case there is a problem with // the test so we can ensure that we clean up after // ourselves - podClient.Delete(pod.Name) + podClient.Delete(pod.Name, api.NewDeleteOptions(0)) }() By("waiting for the pod to start running") diff --git a/test/e2e/util.go b/test/e2e/util.go index af69273ce7217..16156a88aca39 100644 --- a/test/e2e/util.go +++ b/test/e2e/util.go @@ -831,20 +831,24 @@ func expectNoError(err error, explain ...interface{}) { ExpectWithOffset(1, err).NotTo(HaveOccurred(), explain...) } -// Stops everything from filePath from namespace ns and checks if everything maching selectors from the given namespace is correctly stopped. +// Stops everything from filePath from namespace ns and checks if everything matching selectors from the given namespace is correctly stopped. func cleanup(filePath string, ns string, selectors ...string) { - By("using stop to clean up resources") + By("using delete to clean up resources") var nsArg string if ns != "" { nsArg = fmt.Sprintf("--namespace=%s", ns) } - runKubectl("stop", "-f", filePath, nsArg) + runKubectl("stop", "--grace-period=0", "-f", filePath, nsArg) for _, selector := range selectors { - resources := runKubectl("get", "pods,rc,svc", "-l", selector, "--no-headers", nsArg) + resources := runKubectl("get", "rc,se", "-l", selector, "--no-headers", nsArg) if resources != "" { Failf("Resources left running after stop:\n%s", resources) } + pods := runKubectl("get", "pods", "-l", selector, nsArg, "-t", "{{ range .items }}{{ if not .metadata.deletionTimestamp }}{{ .metadata.name }}{{ \"\\n\" }}{{ end }}{{ end }}") + if pods != "" { + Failf("Pods left unterminated after stop:\n%s", pods) + } } } diff --git a/test/images/network-tester/webserver.go b/test/images/network-tester/webserver.go index 8f6e6a999ff80..9f6e90631226f 100644 --- a/test/images/network-tester/webserver.go +++ b/test/images/network-tester/webserver.go @@ -39,7 +39,9 @@ import ( "math/rand" "net/http" "os" + "os/signal" "sync" + "syscall" "time" "k8s.io/kubernetes/pkg/client" @@ -47,10 +49,11 @@ import ( ) var ( - port = flag.Int("port", 8080, "Port number to serve at.") - peerCount = flag.Int("peers", 8, "Must find at least this many peers for the test to pass.") - service = flag.String("service", "nettest", "Service to find other network test pods in.") - namespace = flag.String("namespace", "default", "Namespace of this pod. TODO: kubernetes should make this discoverable.") + port = flag.Int("port", 8080, "Port number to serve at.") + peerCount = flag.Int("peers", 8, "Must find at least this many peers for the test to pass.") + service = flag.String("service", "nettest", "Service to find other network test pods in.") + namespace = flag.String("namespace", "default", "Namespace of this pod. TODO: kubernetes should make this discoverable.") + delayShutdown = flag.Int("delay-shutdown", 0, "Number of seconds to delay shutdown when receiving SIGTERM.") ) // State tracks the internal state of our little http server. 
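The hunk below wires up the -delay-shutdown flag: on SIGTERM the test container deliberately lingers before exiting, which only produces a clean shutdown if the pod's grace period outlasts the delay. A toy timeline of that interaction (helper and numbers are illustrative only):

    package main

    import "fmt"

    // timeline: StopContainer delivers SIGTERM at t=0, the container
    // sleeps delayShutdown seconds, and SIGKILL arrives only once the
    // grace period is exhausted.
    func timeline(graceSeconds, delayShutdown int) string {
    	if delayShutdown <= graceSeconds {
    		return fmt.Sprintf("clean exit at t=%ds (SIGKILL would come at t=%ds)", delayShutdown, graceSeconds)
    	}
    	return fmt.Sprintf("SIGKILL at t=%ds, before the %ds delay elapses", graceSeconds, delayShutdown)
    }

    func main() {
    	fmt.Println(timeline(30, 10)) // a 30s grace easily covers a 10s delay
    }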
@@ -178,6 +181,17 @@ func main() { log.Fatalf("Error getting hostname: %v", err) } + if *delayShutdown > 0 { + termCh := make(chan os.Signal) + signal.Notify(termCh, syscall.SIGTERM) + go func() { + <-termCh + log.Printf("Sleeping %d seconds before exit ...", *delayShutdown) + time.Sleep(time.Duration(*delayShutdown) * time.Second) + os.Exit(0) + }() + } + state := State{ Hostname: hostname, StillContactingPeers: true, diff --git a/test/integration/auth_test.go b/test/integration/auth_test.go index 9662724fb6238..3c1d931087b68 100644 --- a/test/integration/auth_test.go +++ b/test/integration/auth_test.go @@ -232,7 +232,7 @@ var deleteNow string = ` { "kind": "DeleteOptions", "apiVersion": "` + testapi.Version() + `", - "gracePeriodSeconds": null%s + "gracePeriodSeconds": 0%s } ` diff --git a/test/integration/etcd_tools_test.go b/test/integration/etcd_tools_test.go index da278a5be6d19..b3b37bd0bc24a 100644 --- a/test/integration/etcd_tools_test.go +++ b/test/integration/etcd_tools_test.go @@ -80,6 +80,58 @@ func TestGet(t *testing.T) { }) } +func TestWriteTTL(t *testing.T) { + client := framework.NewEtcdClient() + helper := tools.EtcdHelper{Client: client, Codec: stringCodec{}} + framework.WithEtcdKey(func(key string) { + _, err := client.Set(key, "object", 0) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + s := fakeAPIObject("") + err = helper.GuaranteedUpdate(key, &s, false, func(obj runtime.Object, res tools.ResponseMeta) (runtime.Object, *uint64, error) { + if *(obj.(*fakeAPIObject)) != "object" { + t.Fatalf("unexpected existing object: %v", obj) + } + if res.TTL != 0 { + t.Fatalf("unexpected TTL: %#v", res) + } + ttl := uint64(10) + out := fakeAPIObject("test") + return &out, &ttl, nil + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if s != "test" { + t.Errorf("unexpected response: %#v", s) + } + if res, err := client.Get(key, false, false); err != nil || res == nil || res.Node.TTL != 10 { + t.Fatalf("unexpected get: %v %#v", err, res) + } + + err = helper.GuaranteedUpdate(key, &s, false, func(obj runtime.Object, res tools.ResponseMeta) (runtime.Object, *uint64, error) { + if *(obj.(*fakeAPIObject)) != "test" { + t.Fatalf("unexpected existing object: %v", obj) + } + if res.TTL <= 1 { + t.Fatalf("unexpected TTL: %#v", res) + } + out := fakeAPIObject("test2") + return &out, nil, nil + }) + if err != nil { + t.Fatalf("unexpected error: %v", err) + } + if s != "test2" { + t.Errorf("unexpected response: %#v", s) + } + if res, err := client.Get(key, false, false); err != nil || res == nil || res.Node.TTL <= 1 { + t.Fatalf("unexpected get: %v %#v", err, res) + } + }) +} + func TestWatch(t *testing.T) { client := framework.NewEtcdClient() etcdStorage := etcd.NewEtcdStorage(client, testapi.Codec(), etcdtest.PathPrefix()) diff --git a/test/integration/scheduler_test.go b/test/integration/scheduler_test.go index 796dc4a7a1942..0761cbcdf0e47 100644 --- a/test/integration/scheduler_test.go +++ b/test/integration/scheduler_test.go @@ -277,7 +277,7 @@ func DoTestUnschedulableNodes(t *testing.T, restClient *client.Client, nodeStore t.Logf("Test %d: Pod got scheduled on a schedulable node", i) } - err = restClient.Pods(api.NamespaceDefault).Delete(myPod.Name, nil) + err = restClient.Pods(api.NamespaceDefault).Delete(myPod.Name, api.NewDeleteOptions(0)) if err != nil { t.Errorf("Failed to delete pod: %v", err) } From 89f1f3b1b8ec99d7efb0ed800b35828962b76317 Mon Sep 17 00:00:00 2001 From: Clayton Coleman Date: Mon, 27 Jul 2015 16:41:31 -0400 Subject: [PATCH 2/8] Alter 
graceful deletion to not use TTL Avoid TTL by deleting pods immediately when they aren't scheduled, and letting the Kubelet delete them otherwise. Ensure the Kubelet uses pod.Spec.TerminationGracePeriodSeconds when no pod.DeletionGracePeriodSeconds is available. --- cmd/integration/integration.go | 5 +- docs/getting-started-guides/logging.md | 1 + hack/test-cmd.sh | 5 -- pkg/api/rest/delete.go | 7 ++ pkg/api/rest/resttest/resttest.go | 89 ++++++++++++++++--------- pkg/api/validation/validation.go | 2 +- pkg/api/validation/validation_test.go | 27 ++++++++ pkg/kubectl/describe.go | 2 +- pkg/kubelet/dockertools/manager.go | 23 ++++--- pkg/kubelet/dockertools/manager_test.go | 4 +- pkg/kubelet/kubelet.go | 7 +- pkg/kubelet/status_manager.go | 15 +++-- pkg/registry/generic/etcd/etcd.go | 39 ++++++++++- pkg/registry/pod/etcd/etcd_test.go | 13 +++- pkg/registry/pod/rest.go | 18 ++++- test/e2e/util.go | 2 +- test/integration/etcd_tools_test.go | 33 ++++----- 17 files changed, 212 insertions(+), 80 deletions(-) diff --git a/cmd/integration/integration.go b/cmd/integration/integration.go index 3e7d694edb2b2..d314a149101ea 100644 --- a/cmd/integration/integration.go +++ b/cmd/integration/integration.go @@ -872,12 +872,13 @@ func runSchedulerNoPhantomPodsTest(client *client.Client) { // Delete a pod to free up room. glog.Infof("Deleting pod %v", bar.Name) - err = client.Pods(api.NamespaceDefault).Delete(bar.Name, api.NewDeleteOptions(1)) + err = client.Pods(api.NamespaceDefault).Delete(bar.Name, api.NewDeleteOptions(0)) if err != nil { glog.Fatalf("FAILED: couldn't delete pod %q: %v", bar.Name, err) } - time.Sleep(2 * time.Second) + //TODO: reenable once fake_docker_client handles deletion cleanly + //time.Sleep(2 * time.Second) pod.ObjectMeta.Name = "phantom.baz" baz, err := client.Pods(api.NamespaceDefault).Create(pod) diff --git a/docs/getting-started-guides/logging.md b/docs/getting-started-guides/logging.md index 7062a0a09e1eb..2d61fa0cc872b 100644 --- a/docs/getting-started-guides/logging.md +++ b/docs/getting-started-guides/logging.md @@ -182,6 +182,7 @@ spec: mountPath: /varlog - name: containers mountPath: /var/lib/docker/containers + terminationGracePeriodSeconds: 30 volumes: - name: varlog hostPath: diff --git a/hack/test-cmd.sh b/hack/test-cmd.sh index c650bf06af66d..b872eff37e538 100755 --- a/hack/test-cmd.sh +++ b/hack/test-cmd.sh @@ -246,11 +246,6 @@ runTests() { # Pre-condition: valid-pod POD is running kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" 'valid-pod:' # Command - kubectl delete pod valid-pod "${kube_flags[@]}" - # Post-condition: pod is still there, in terminating - kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" 'valid-pod:' - [[ "$(kubectl get pods "${kube_flags[@]}" | grep Terminating)" ]] - # Command kubectl delete pod valid-pod "${kube_flags[@]}" --grace-period=0 # Post-condition: no POD is running kube::test::get_object_assert pods "{{range.items}}{{$id_field}}:{{end}}" '' diff --git a/pkg/api/rest/delete.go b/pkg/api/rest/delete.go index d1b04e6dded35..c9df5970ef55b 100644 --- a/pkg/api/rest/delete.go +++ b/pkg/api/rest/delete.go @@ -17,8 +17,11 @@ limitations under the License. 
package rest import ( + "time" + "k8s.io/kubernetes/pkg/api" "k8s.io/kubernetes/pkg/runtime" + "k8s.io/kubernetes/pkg/util" ) // RESTDeleteStrategy defines deletion behavior on an object that follows Kubernetes @@ -59,6 +62,8 @@ func BeforeDelete(strategy RESTDeleteStrategy, ctx api.Context, obj runtime.Obje if period > *objectMeta.DeletionGracePeriodSeconds { return false, true, nil } + now := util.NewTime(util.Now().Add(time.Second * time.Duration(*options.GracePeriodSeconds))) + objectMeta.DeletionTimestamp = &now objectMeta.DeletionGracePeriodSeconds = &period options.GracePeriodSeconds = &period return true, false, nil @@ -71,6 +76,8 @@ func BeforeDelete(strategy RESTDeleteStrategy, ctx api.Context, obj runtime.Obje if !strategy.CheckGracefulDelete(obj, options) { return false, false, nil } + now := util.NewTime(util.Now().Add(time.Second * time.Duration(*options.GracePeriodSeconds))) + objectMeta.DeletionTimestamp = &now objectMeta.DeletionGracePeriodSeconds = options.GracePeriodSeconds return true, false, nil } diff --git a/pkg/api/rest/resttest/resttest.go b/pkg/api/rest/resttest/resttest.go index 5576692c09389..694782c07300a 100644 --- a/pkg/api/rest/resttest/resttest.go +++ b/pkg/api/rest/resttest/resttest.go @@ -114,17 +114,20 @@ func (t *Tester) TestUpdate(valid runtime.Object, existing, older runtime.Object // Test deleting an object. func (t *Tester) TestDelete(createFn func() runtime.Object, wasGracefulFn func() bool, invalid ...runtime.Object) { - t.testDeleteNonExist(createFn) - t.testDeleteNoGraceful(createFn, wasGracefulFn) - t.testDeleteInvokesValidation(invalid...) + t.TestDeleteNonExist(createFn) + t.TestDeleteNoGraceful(createFn, wasGracefulFn) + t.TestDeleteInvokesValidation(invalid...) // TODO: Test delete namespace mismatch rejection // once #5684 is fixed. } // Test graceful deletion. func (t *Tester) TestDeleteGraceful(createFn func() runtime.Object, expectedGrace int64, wasGracefulFn func() bool) { - t.testDeleteGracefulHasDefault(createFn(), expectedGrace, wasGracefulFn) - t.testDeleteGracefulUsesZeroOnNil(createFn(), 0) + t.TestDeleteGracefulHasDefault(createFn(), expectedGrace, wasGracefulFn) + t.TestDeleteGracefulWithValue(createFn(), expectedGrace, wasGracefulFn) + t.TestDeleteGracefulUsesZeroOnNil(createFn(), 0) + t.TestDeleteGracefulExtend(createFn(), expectedGrace, wasGracefulFn) + t.TestDeleteGracefulImmediate(createFn(), expectedGrace, wasGracefulFn) } // Test getting object. @@ -298,23 +301,7 @@ func (t *Tester) testUpdateFailsOnVersion(older runtime.Object) { // ============================================================================= // Deletion tests. 
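One detail of the BeforeDelete change above is easy to misread: DeletionTimestamp is set to now plus the grace period, so it records when the grace window expires rather than when deletion was requested (which is why describe prints it as an expiry). A small sketch, hypothetical helper only:

    package main

    import (
    	"fmt"
    	"time"
    )

    // tombstone computes the value BeforeDelete stores: the moment the
    // grace period runs out, not the moment the delete arrived.
    func tombstone(graceSeconds int64) time.Time {
    	return time.Now().Add(time.Duration(graceSeconds) * time.Second)
    }

    func main() {
    	fmt.Println("expires at:", tombstone(30).Format(time.RFC1123Z))
    }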
-func (t *Tester) TestDeleteNoGraceful(createFn func() runtime.Object, wasGracefulFn func() bool) { - existing := createFn() - objectMeta := t.getObjectMetaOrFail(existing) - ctx := api.WithNamespace(t.TestContext(), objectMeta.Namespace) - _, err := t.storage.(rest.GracefulDeleter).Delete(ctx, objectMeta.Name, api.NewDeleteOptions(10)) - if err != nil { - t.Errorf("unexpected error: %v", err) - } - if _, err := t.storage.(rest.Getter).Get(ctx, objectMeta.Name); !errors.IsNotFound(err) { - t.Errorf("unexpected error, object should not exist: %v", err) - } - if wasGracefulFn() { - t.Errorf("resource should not support graceful delete") - } -} - -func (t *Tester) testDeleteInvokesValidation(invalid ...runtime.Object) { +func (t *Tester) TestDeleteInvokesValidation(invalid ...runtime.Object) { for i, obj := range invalid { objectMeta := t.getObjectMetaOrFail(obj) ctx := t.TestContext() @@ -325,7 +312,7 @@ func (t *Tester) testDeleteInvokesValidation(invalid ...runtime.Object) { } } -func (t *Tester) testDeleteNonExist(createFn func() runtime.Object) { +func (t *Tester) TestDeleteNonExist(createFn func() runtime.Object) { existing := createFn() objectMeta := t.getObjectMetaOrFail(existing) context := t.TestContext() @@ -341,14 +328,23 @@ func (t *Tester) testDeleteNonExist(createFn func() runtime.Object) { // ============================================================================= // Graceful Deletion tests. -func (t *Tester) TestDeleteGraceful(createFn func() runtime.Object, expectedGrace int64, wasGracefulFn func() bool) { - t.TestDeleteGracefulHasDefault(createFn(), expectedGrace, wasGracefulFn) - t.TestDeleteGracefulWithValue(createFn(), expectedGrace, wasGracefulFn) - t.TestDeleteGracefulUsesZeroOnNil(createFn(), 0) - t.TestDeleteGracefulExtend(createFn(), expectedGrace, wasGracefulFn) +func (t *Tester) TestDeleteNoGraceful(createFn func() runtime.Object, wasGracefulFn func() bool) { + existing := createFn() + objectMeta := t.getObjectMetaOrFail(existing) + ctx := api.WithNamespace(t.TestContext(), objectMeta.Namespace) + _, err := t.storage.(rest.GracefulDeleter).Delete(ctx, objectMeta.Name, api.NewDeleteOptions(10)) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if _, err := t.storage.(rest.Getter).Get(ctx, objectMeta.Name); !errors.IsNotFound(err) { + t.Errorf("unexpected error, object should not exist: %v", err) + } + if wasGracefulFn() { + t.Errorf("resource should not support graceful delete") + } } -func (t *Tester) testDeleteGracefulHasDefault(existing runtime.Object, expectedGrace int64, wasGracefulFn func() bool) { +func (t *Tester) TestDeleteGracefulHasDefault(existing runtime.Object, expectedGrace int64, wasGracefulFn func() bool) { objectMeta := t.getObjectMetaOrFail(existing) ctx := api.WithNamespace(t.TestContext(), objectMeta.Namespace) _, err := t.storage.(rest.GracefulDeleter).Delete(ctx, objectMeta.Name, &api.DeleteOptions{}) @@ -450,7 +446,40 @@ func (t *Tester) TestDeleteGracefulExtend(existing runtime.Object, expectedGrace } } -func (t *Tester) testDeleteGracefulUsesZeroOnNil(existing runtime.Object, expectedGrace int64) { +func (t *Tester) TestDeleteGracefulImmediate(existing runtime.Object, expectedGrace int64, wasGracefulFn func() bool) { + objectMeta, err := api.ObjectMetaFor(existing) + if err != nil { + t.Fatalf("object does not have ObjectMeta: %v\n%#v", err, existing) + } + + ctx := api.WithNamespace(t.TestContext(), objectMeta.Namespace) + _, err = t.storage.(rest.GracefulDeleter).Delete(ctx, objectMeta.Name, 
api.NewDeleteOptions(expectedGrace)) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + if !wasGracefulFn() { + t.Errorf("did not gracefully delete resource") + } + // second delete is immediate, resource is deleted + out, err := t.storage.(rest.GracefulDeleter).Delete(ctx, objectMeta.Name, api.NewDeleteOptions(0)) + if err != nil { + t.Errorf("unexpected error: %v", err) + } + _, err = t.storage.(rest.Getter).Get(ctx, objectMeta.Name) + if !errors.IsNotFound(err) { + t.Errorf("unexpected error, object should be deleted immediately: %v", err) + } + objectMeta, err = api.ObjectMetaFor(out) + if err != nil { + t.Errorf("unexpected error: %v", err) + return + } + if objectMeta.DeletionTimestamp == nil || objectMeta.DeletionGracePeriodSeconds == nil || *objectMeta.DeletionGracePeriodSeconds != 0 { + t.Errorf("unexpected deleted meta: %#v", objectMeta) + } +} + +func (t *Tester) TestDeleteGracefulUsesZeroOnNil(existing runtime.Object, expectedGrace int64) { objectMeta := t.getObjectMetaOrFail(existing) ctx := api.WithNamespace(t.TestContext(), objectMeta.Namespace) _, err := t.storage.(rest.GracefulDeleter).Delete(ctx, objectMeta.Name, nil) diff --git a/pkg/api/validation/validation.go b/pkg/api/validation/validation.go index ed9578ebe8b92..d8422a55c5792 100644 --- a/pkg/api/validation/validation.go +++ b/pkg/api/validation/validation.go @@ -272,7 +272,7 @@ func ValidateObjectMetaUpdate(new, old *api.ObjectMeta) errs.ValidationErrorList if old.DeletionGracePeriodSeconds != nil && new.DeletionGracePeriodSeconds == nil { new.DeletionGracePeriodSeconds = old.DeletionGracePeriodSeconds } - if new.DeletionGracePeriodSeconds != nil && *new.DeletionGracePeriodSeconds != *old.DeletionGracePeriodSeconds { + if new.DeletionGracePeriodSeconds != nil && old.DeletionGracePeriodSeconds != nil && *new.DeletionGracePeriodSeconds != *old.DeletionGracePeriodSeconds { allErrs = append(allErrs, errs.NewFieldInvalid("deletionGracePeriodSeconds", new.DeletionGracePeriodSeconds, "field is immutable; may only be changed via deletion")) } diff --git a/pkg/api/validation/validation_test.go b/pkg/api/validation/validation_test.go index b20de4cd9d223..1c8c1e737c38e 100644 --- a/pkg/api/validation/validation_test.go +++ b/pkg/api/validation/validation_test.go @@ -1317,6 +1317,9 @@ func TestValidatePod(t *testing.T) { } func TestValidatePodUpdate(t *testing.T) { + now := util.Now() + grace := int64(30) + grace2 := int64(31) tests := []struct { a api.Pod b api.Pod @@ -1403,6 +1406,30 @@ func TestValidatePodUpdate(t *testing.T) { false, "more containers", }, + { + api.Pod{ + ObjectMeta: api.ObjectMeta{Name: "foo", DeletionTimestamp: &now}, + Spec: api.PodSpec{Containers: []api.Container{{Image: "foo:V1"}}}, + }, + api.Pod{ + ObjectMeta: api.ObjectMeta{Name: "foo"}, + Spec: api.PodSpec{Containers: []api.Container{{Image: "foo:V1"}}}, + }, + true, + "deletion timestamp filled out", + }, + { + api.Pod{ + ObjectMeta: api.ObjectMeta{Name: "foo", DeletionTimestamp: &now, DeletionGracePeriodSeconds: &grace}, + Spec: api.PodSpec{Containers: []api.Container{{Image: "foo:V1"}}}, + }, + api.Pod{ + ObjectMeta: api.ObjectMeta{Name: "foo", DeletionTimestamp: &now, DeletionGracePeriodSeconds: &grace2}, + Spec: api.PodSpec{Containers: []api.Container{{Image: "foo:V1"}}}, + }, + false, + "deletion grace period seconds cleared", + }, { api.Pod{ ObjectMeta: api.ObjectMeta{Name: "foo"}, diff --git a/pkg/kubectl/describe.go b/pkg/kubectl/describe.go index 107abc65f3b0f..c3de3b8506d6b 100644 --- a/pkg/kubectl/describe.go +++ 
b/pkg/kubectl/describe.go
@@ -419,7 +419,7 @@ func describePod(pod *api.Pod, rcs []api.ReplicationController, events *api.Even
 	fmt.Fprintf(out, "Labels:\t%s\n", formatLabels(pod.Labels))
 	if pod.DeletionTimestamp != nil {
 		fmt.Fprintf(out, "Status:\tTerminating (expires %s)\n", pod.DeletionTimestamp.Time.Format(time.RFC1123Z))
-		fmt.Fprintf(out, "Termination Grace Period:\t%ss\n", pod.DeletionGracePeriodSeconds)
+		fmt.Fprintf(out, "Termination Grace Period:\t%ds\n", *pod.DeletionGracePeriodSeconds)
 	} else {
 		fmt.Fprintf(out, "Status:\t%s\n", string(pod.Status.Phase))
 	}
diff --git a/pkg/kubelet/dockertools/manager.go b/pkg/kubelet/dockertools/manager.go
index 3de60eb192613..e019495caa4cf 100644
--- a/pkg/kubelet/dockertools/manager.go
+++ b/pkg/kubelet/dockertools/manager.go
@@ -1234,14 +1234,19 @@ func (dm *DockerManager) killContainer(containerID types.UID, container *api.Con
 	}
 
 	gracePeriod := int64(minimumGracePeriodInSeconds)
-	if pod != nil && pod.DeletionGracePeriodSeconds != nil {
-		gracePeriod = *pod.DeletionGracePeriodSeconds
+	if pod != nil {
+		switch {
+		case pod.DeletionGracePeriodSeconds != nil:
+			gracePeriod = *pod.DeletionGracePeriodSeconds
+		case pod.Spec.TerminationGracePeriodSeconds != nil:
+			gracePeriod = *pod.Spec.TerminationGracePeriodSeconds
+		}
 	}
 	glog.V(2).Infof("Killing container %q with %d second grace period", name, gracePeriod)
 
+	start := util.Now()
 	if pod != nil && container != nil && container.Lifecycle != nil && container.Lifecycle.PreStop != nil {
 		glog.V(4).Infof("Running preStop hook for container %q", name)
-		start := util.Now()
 		// TODO: timebox PreStop execution to at most gracePeriod
 		if err := dm.runner.Run(ID, pod, container, container.Lifecycle.PreStop); err != nil {
 			glog.Errorf("preStop hook for container %q failed: %v", name, err)
@@ -1256,6 +1261,11 @@ func (dm *DockerManager) killContainer(containerID types.UID, container *api.Con
 		gracePeriod = minimumGracePeriodInSeconds
 	}
 	err := dm.client.StopContainer(ID, uint(gracePeriod))
+	if err == nil {
+		glog.V(2).Infof("Container %q exited after %s", name, util.Now().Sub(start.Time))
+	} else {
+		glog.V(2).Infof("Container %q termination failed after %s: %v", name, util.Now().Sub(start.Time), err)
+	}
 	ref, ok := dm.containerRefManager.GetRef(ID)
 	if !ok {
 		glog.Warningf("No ref for pod %q", name)
@@ -1498,11 +1508,6 @@ func (dm *DockerManager) computePodContainerChanges(pod *api.Pod, runningPod kub
 	containersToKeep := make(map[kubeletTypes.DockerID]int)
 	createPodInfraContainer := false
 
-	if pod.DeletionTimestamp != nil {
-		glog.V(4).Infof("Pod is terminating %q", podFullName)
-		return PodContainerChangesSpec{}, nil
-	}
-
 	var err error
 	var podInfraContainerID kubeletTypes.DockerID
 	var changed bool
@@ -1624,10 +1629,10 @@ func (dm *DockerManager) SyncPod(pod *api.Pod, runningPod kubecontainer.Pod, pod
 	podFullName := kubecontainer.GetPodFullName(pod)
 
 	containerChanges, err := dm.computePodContainerChanges(pod, runningPod, podStatus)
-	glog.V(3).Infof("Got container changes for pod %q: %+v", podFullName, containerChanges)
 	if err != nil {
 		return err
 	}
+	glog.V(3).Infof("Got container changes for pod %q: %+v", podFullName, containerChanges)
 
 	if containerChanges.StartInfraContainer || (len(containerChanges.ContainersToKeep) == 0 && len(containerChanges.ContainersToStart) == 0) {
 		if len(containerChanges.ContainersToKeep) == 0 && len(containerChanges.ContainersToStart) == 0 {
diff --git a/pkg/kubelet/dockertools/manager_test.go b/pkg/kubelet/dockertools/manager_test.go
index 0da1ca938b16e..606404fb6a3dc 100644
--- a/pkg/kubelet/dockertools/manager_test.go
+++ b/pkg/kubelet/dockertools/manager_test.go
@@ -568,7 +568,7 @@ func replaceProber(dm *DockerManager, result probe.Result, err error) {
 // Unknown or error.
 //
 // PLEASE READ THE PROBE DOCS BEFORE CHANGING THIS TEST IF YOU ARE UNSURE HOW PROBES ARE SUPPOSED TO WORK:
-// (See https://github.com/GoogleCloudPlatform/kubernetes/blob/master/docs/pod-states.md#pod-conditions)
+// (See https://k8s.io/kubernetes/blob/master/docs/pod-states.md#pod-conditions)
 func TestProbeContainer(t *testing.T) {
 	manager, _ := newTestDockerManager()
 	dc := &docker.APIContainers{
@@ -1150,7 +1150,7 @@ func TestSyncPodBadHash(t *testing.T) {
 		// Check the pod infra container.
 		"inspect_container",
 		// Kill and restart the bad hash container.
-		"stop", "create", "start",
+		"stop", "create", "start", "inspect_container",
 	})
 
 	if err := fakeDocker.AssertStopped([]string{"1234"}); err != nil {
diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go
index 5e5a094c249f0..41850d039c770 100644
--- a/pkg/kubelet/kubelet.go
+++ b/pkg/kubelet/kubelet.go
@@ -1177,9 +1177,10 @@ func (kl *Kubelet) syncPod(pod *api.Pod, mirrorPod *api.Pod, runningPod kubecont
 	}()
 
 	// Kill pods we can't run.
-	err := canRunPod(pod)
-	if err != nil {
-		kl.killPod(pod, runningPod)
+	if err := canRunPod(pod); err != nil || pod.DeletionTimestamp != nil {
+		if err := kl.killPod(pod, runningPod); err != nil {
+			util.HandleError(err)
+		}
 		return err
 	}
diff --git a/pkg/kubelet/status_manager.go b/pkg/kubelet/status_manager.go
index 5e9c753f9faf5..4d754286309a9 100644
--- a/pkg/kubelet/status_manager.go
+++ b/pkg/kubelet/status_manager.go
@@ -123,7 +123,7 @@ func (s *statusManager) SetPodStatus(pod *api.Pod, status api.PodStatus) {
 	// Currently this routine is not called for the same pod from multiple
 	// workers and/or the kubelet but dropping the lock before sending the
 	// status down the channel feels like an easy way to get a bullet in foot.
-	if !found || !isStatusEqual(&oldStatus, &status) {
+	if !found || !isStatusEqual(&oldStatus, &status) || pod.DeletionTimestamp != nil {
 		s.podStatuses[podFullName] = status
 		s.podStatusChannel <- podStatusSyncRequest{pod, status}
 	} else {
@@ -173,11 +173,16 @@ func (s *statusManager) syncBatch() error {
 		if err == nil {
 			glog.V(3).Infof("Status for pod %q updated successfully", kubeletUtil.FormatPodName(pod))
 
-			if statusPod.DeletionTimestamp == nil || !allTerminated(statusPod.Status.ContainerStatuses) {
+			if pod.DeletionTimestamp == nil {
+				return nil
+			}
+			if !notRunning(pod.Status.ContainerStatuses) {
+				glog.V(3).Infof("Pod %q is terminated, but some containers are still running", pod.Name)
 				return nil
 			}
 			if err := s.kubeClient.Pods(statusPod.Namespace).Delete(statusPod.Name, api.NewDeleteOptions(0)); err == nil {
 				glog.V(3).Infof("Pod %q fully terminated and removed from etcd", statusPod.Name)
+				s.DeletePodStatus(podFullName)
 				return nil
 			}
 		}
@@ -194,11 +199,11 @@ func (s *statusManager) syncBatch() error {
 	return fmt.Errorf("error updating status for pod %q: %v", kubeletUtil.FormatPodName(pod), err)
 }
 
-// allTerminated returns true if every status is terminated, or the status list
+// notRunning returns true if every status is terminated or waiting, or the status list
 // is empty.
-func allTerminated(statuses []api.ContainerStatus) bool { +func notRunning(statuses []api.ContainerStatus) bool { for _, status := range statuses { - if status.State.Terminated == nil { + if status.State.Terminated == nil && status.State.Waiting == nil { return false } } diff --git a/pkg/registry/generic/etcd/etcd.go b/pkg/registry/generic/etcd/etcd.go index 0b0d4190ac909..3242d20bc73b7 100644 --- a/pkg/registry/generic/etcd/etcd.go +++ b/pkg/registry/generic/etcd/etcd.go @@ -341,6 +341,11 @@ func (e *Etcd) Get(ctx api.Context, name string) (runtime.Object, error) { return obj, nil } +var ( + errAlreadyDeleting = fmt.Errorf("abort delete") + errDeleteNow = fmt.Errorf("delete now") +) + // Delete removes the item from etcd. func (e *Etcd) Delete(ctx api.Context, name string, options *api.DeleteOptions) (runtime.Object, error) { key, err := e.KeyFunc(ctx, name) @@ -367,13 +372,41 @@ func (e *Etcd) Delete(ctx api.Context, name string, options *api.DeleteOptions) if pendingGraceful { return e.finalizeDelete(obj, false) } - if graceful && *options.GracePeriodSeconds != 0 { + if graceful { trace.Step("Graceful deletion") out := e.NewFunc() - if err := e.Storage.Set(key, obj, out, uint64(*options.GracePeriodSeconds)); err != nil { + lastGraceful := int64(0) + err := e.Storage.GuaranteedUpdate( + key, out, false, + storage.SimpleUpdate(func(existing runtime.Object) (runtime.Object, error) { + graceful, pendingGraceful, err := rest.BeforeDelete(e.DeleteStrategy, ctx, existing, options) + if err != nil { + return nil, err + } + if pendingGraceful { + return nil, errAlreadyDeleting + } + if !graceful { + return nil, errDeleteNow + } + lastGraceful = *options.GracePeriodSeconds + return existing, nil + }), + ) + switch err { + case nil: + if lastGraceful > 0 { + return out, nil + } + // fall through and delete immediately + case errDeleteNow: + // we've updated the object to have a zero grace period, or it's already at 0, so + // we should fall through and truly delete the object. 
+ case errAlreadyDeleting: + return e.finalizeDelete(obj, true) + default: return nil, etcderr.InterpretUpdateError(err, e.EndpointName, name) } - return e.finalizeDelete(out, true) } // delete immediately, or no graceful deletion supported diff --git a/pkg/registry/pod/etcd/etcd_test.go b/pkg/registry/pod/etcd/etcd_test.go index 1bd879b2011e9..f565dfcabc2ba 100644 --- a/pkg/registry/pod/etcd/etcd_test.go +++ b/pkg/registry/pod/etcd/etcd_test.go @@ -121,8 +121,10 @@ func TestDelete(t *testing.T) { key = etcdtest.AddPrefix(key) test := resttest.New(t, storage, fakeEtcdClient.SetError) + expectedNode := "some-node" createFn := func() runtime.Object { pod := validChangedPod() + pod.Spec.NodeName = expectedNode fakeEtcdClient.Data[key] = tools.EtcdResponseWithError{ R: &etcd.Response{ Node: &etcd.Node{ @@ -137,9 +139,18 @@ if fakeEtcdClient.Data[key].R.Node == nil { return false } - return fakeEtcdClient.Data[key].R.Node.TTL != 0 + obj, err := latest.Codec.Decode([]byte(fakeEtcdClient.Data[key].R.Node.Value)) + if err != nil { + return false + } + pod := obj.(*api.Pod) + t.Logf("found object %#v", pod.ObjectMeta) + return pod.DeletionTimestamp != nil && pod.DeletionGracePeriodSeconds != nil && *pod.DeletionGracePeriodSeconds != 0 } test.TestDeleteGraceful(createFn, 30, gracefulSetFn) + + expectedNode = "" + test.TestDelete(createFn, gracefulSetFn) } func expectPod(t *testing.T, out runtime.Object) (*api.Pod, bool) { diff --git a/pkg/registry/pod/rest.go b/pkg/registry/pod/rest.go index 43135278d5049..699d9cddecc8d 100644 --- a/pkg/registry/pod/rest.go +++ b/pkg/registry/pod/rest.go @@ -92,22 +92,38 @@ func (podStrategy) CheckGracefulDelete(obj runtime.Object, options *api.DeleteOp if options == nil { return false } + pod := obj.(*api.Pod) period := int64(0) // user has specified a value if options.GracePeriodSeconds != nil { period = *options.GracePeriodSeconds } else { // use the default value if set, or delete the pod immediately (0) - pod := obj.(*api.Pod) if pod.Spec.TerminationGracePeriodSeconds != nil { period = *pod.Spec.TerminationGracePeriodSeconds } } + // if the pod is not scheduled, delete immediately + if len(pod.Spec.NodeName) == 0 { + period = 0 + } // ensure the options and the pod are in sync options.GracePeriodSeconds = &period return true } +type podStrategyWithoutGraceful struct { + podStrategy +} + +// CheckGracefulDelete prohibits graceful deletion. +func (podStrategyWithoutGraceful) CheckGracefulDelete(obj runtime.Object, options *api.DeleteOptions) bool { + return false +} + +// StrategyWithoutGraceful implements the legacy instant delete behavior.
+var StrategyWithoutGraceful = podStrategyWithoutGraceful{Strategy} + type podStatusStrategy struct { podStrategy } diff --git a/test/e2e/util.go b/test/e2e/util.go index 16156a88aca39..f239b9724bc41 100644 --- a/test/e2e/util.go +++ b/test/e2e/util.go @@ -841,7 +841,7 @@ func cleanup(filePath string, ns string, selectors ...string) { runKubectl("stop", "--grace-period=0", "-f", filePath, nsArg) for _, selector := range selectors { - resources := runKubectl("get", "rc,se", "-l", selector, "--no-headers", nsArg) + resources := runKubectl("get", "rc,svc", "-l", selector, "--no-headers", nsArg) if resources != "" { Failf("Resources left running after stop:\n%s", resources) } diff --git a/test/integration/etcd_tools_test.go b/test/integration/etcd_tools_test.go index b3b37bd0bc24a..60b9408edb0ea 100644 --- a/test/integration/etcd_tools_test.go +++ b/test/integration/etcd_tools_test.go @@ -82,49 +82,50 @@ func TestGet(t *testing.T) { func TestWriteTTL(t *testing.T) { client := framework.NewEtcdClient() - helper := tools.EtcdHelper{Client: client, Codec: stringCodec{}} + etcdStorage := etcd.NewEtcdStorage(client, testapi.Codec(), "") framework.WithEtcdKey(func(key string) { - _, err := client.Set(key, "object", 0) - if err != nil { + testObject := api.ServiceAccount{ObjectMeta: api.ObjectMeta{Name: "foo"}} + if err := etcdStorage.Set(key, &testObject, nil, 0); err != nil { t.Fatalf("unexpected error: %v", err) } - s := fakeAPIObject("") - err = helper.GuaranteedUpdate(key, &s, false, func(obj runtime.Object, res tools.ResponseMeta) (runtime.Object, *uint64, error) { - if *(obj.(*fakeAPIObject)) != "object" { + result := &api.ServiceAccount{} + err := etcdStorage.GuaranteedUpdate(key, result, false, func(obj runtime.Object, res storage.ResponseMeta) (runtime.Object, *uint64, error) { + if in, ok := obj.(*api.ServiceAccount); !ok || in.Name != "foo" { t.Fatalf("unexpected existing object: %v", obj) } if res.TTL != 0 { t.Fatalf("unexpected TTL: %#v", res) } ttl := uint64(10) - out := fakeAPIObject("test") - return &out, &ttl, nil + out := &api.ServiceAccount{ObjectMeta: api.ObjectMeta{Name: "out"}} + return out, &ttl, nil }) if err != nil { t.Fatalf("unexpected error: %v", err) } - if s != "test" { - t.Errorf("unexpected response: %#v", s) + if result.Name != "out" { + t.Errorf("unexpected response: %#v", result) } if res, err := client.Get(key, false, false); err != nil || res == nil || res.Node.TTL != 10 { t.Fatalf("unexpected get: %v %#v", err, res) } - err = helper.GuaranteedUpdate(key, &s, false, func(obj runtime.Object, res tools.ResponseMeta) (runtime.Object, *uint64, error) { - if *(obj.(*fakeAPIObject)) != "test" { + result = &api.ServiceAccount{} + err = etcdStorage.GuaranteedUpdate(key, result, false, func(obj runtime.Object, res storage.ResponseMeta) (runtime.Object, *uint64, error) { + if in, ok := obj.(*api.ServiceAccount); !ok || in.Name != "out" { t.Fatalf("unexpected existing object: %v", obj) } if res.TTL <= 1 { t.Fatalf("unexpected TTL: %#v", res) } - out := fakeAPIObject("test2") - return &out, nil, nil + out := &api.ServiceAccount{ObjectMeta: api.ObjectMeta{Name: "out2"}} + return out, nil, nil }) if err != nil { t.Fatalf("unexpected error: %v", err) } - if s != "test2" { - t.Errorf("unexpected response: %#v", s) + if result.Name != "out2" { + t.Errorf("unexpected response: %#v", result) } if res, err := client.Get(key, false, false); err != nil || res == nil || res.Node.TTL <= 1 { t.Fatalf("unexpected get: %v %#v", err, res) From 71d10c6c7a058faa6e5597b78ac2e1e526128e56 Mon 
Sep 17 00:00:00 2001 From: Clayton Coleman Date: Sun, 26 Jul 2015 23:56:32 -0400 Subject: [PATCH 3/8] Update swagger --- api/swagger-spec/v1.json | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/api/swagger-spec/v1.json b/api/swagger-spec/v1.json index 70c4a124c5a19..0db5ef03d365f 100644 --- a/api/swagger-spec/v1.json +++ b/api/swagger-spec/v1.json @@ -11017,6 +11017,11 @@ "type": "string", "description": "RFC 3339 date and time at which the object will be deleted; populated by the system when a graceful deletion is requested, read-only; if not set, graceful deletion of the object has not been requested; see http://releases.k8s.io/HEAD/docs/devel/api-conventions.md#metadata" }, + "deletionGracePeriodSeconds": { + "type": "integer", + "format": "int64", + "description": "number of seconds allowed for this object to gracefully terminate before it will be removed from the system; only set when deletionTimestamp is also set, read-only; may only be shortened" + }, "labels": { "type": "any", "description": "map of string keys and values that can be used to organize and categorize objects; may match selectors of replication controllers and services; see http://releases.k8s.io/HEAD/docs/user-guide/labels.md" @@ -12327,7 +12332,7 @@ "terminationGracePeriodSeconds": { "type": "integer", "format": "int64", - "description": "optional duration in seconds the pod needs to terminate gracefully; may be decreased in delete request; value must be non-negative integer; the value zero indicates delete immediately; if this value is not set, the default grace period will be used instead; the grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal; set this value longer than the expected cleanup time for your process" + "description": "optional duration in seconds the pod needs to terminate gracefully; may be decreased in delete request; value must be non-negative integer; the value zero indicates delete immediately; if this value is not set, the default grace period will be used instead; the grace period is the duration in seconds after the processes running in the pod are sent a termination signal and the time when the processes are forcibly halted with a kill signal; set this value longer than the expected cleanup time for your process; defaults to 30 seconds" }, "activeDeadlineSeconds": { "type": "integer", From 780accb3ba9bbefd60b19efa4959ec4e165acdbb Mon Sep 17 00:00:00 2001 From: Clayton Coleman Date: Tue, 28 Jul 2015 16:06:05 -0400 Subject: [PATCH 4/8] Kubelet should garbage collect dead pods The sync loop should check for terminated pods that are no longer running and clear them. The status loop should never write status if the pod UID changes. Mirror pods should be deleted immediately rather than gracefully. 
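For illustration only, a minimal standalone sketch of the garbage-collection rule described above (the pod type here is a simplified stand-in, not the kubelet's api.Pod or kubecontainer.Pod): a pod is reaped only when it is marked for deletion and no running pod with the same UID remains.

package main

import "fmt"

// pod is a stand-in for api.Pod; deletionRequested models DeletionTimestamp != nil.
type pod struct {
	uid               string
	deletionRequested bool
}

// terminatedPods returns the pods that are marked for deletion and have no
// running counterpart; only those are candidates for cleanup.
func terminatedPods(pods []pod, runningUIDs []string) []pod {
	running := make(map[string]bool, len(runningUIDs))
	for _, uid := range runningUIDs {
		running[uid] = true
	}
	var out []pod
	for _, p := range pods {
		if p.deletionRequested && !running[p.uid] {
			out = append(out, p)
		}
	}
	return out
}

func main() {
	pods := []pod{
		{uid: "a", deletionRequested: true},
		{uid: "b", deletionRequested: true},
		{uid: "c", deletionRequested: false},
	}
	// "a" is still running, so only "b" is eligible for cleanup.
	fmt.Println(terminatedPods(pods, []string{"a", "c"}))
}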
--- pkg/kubelet/dockertools/manager.go | 4 ++++ pkg/kubelet/kubelet.go | 30 ++++++++++++++++++++++++++++++ pkg/kubelet/mirror_client.go | 2 +- pkg/kubelet/status_manager.go | 27 +++++++++++++++++++++++++++ pkg/kubelet/status_manager_test.go | 29 +++++++++++++++++++++++++++++ 5 files changed, 91 insertions(+), 1 deletion(-) diff --git a/pkg/kubelet/dockertools/manager.go b/pkg/kubelet/dockertools/manager.go index e019495caa4cf..8c55d779b3ca0 100644 --- a/pkg/kubelet/dockertools/manager.go +++ b/pkg/kubelet/dockertools/manager.go @@ -1261,6 +1261,10 @@ func (dm *DockerManager) killContainer(containerID types.UID, container *api.Con gracePeriod = minimumGracePeriodInSeconds } err := dm.client.StopContainer(ID, uint(gracePeriod)) + if _, ok := err.(*docker.ContainerNotRunning); ok && err != nil { + glog.V(4).Infof("Container %q has already exited", name) + return nil + } if err == nil { glog.V(2).Infof("Container %q exited after %s", name, util.Now().Sub(start.Time)) } else { diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index 41850d039c770..a34d756b3a48d 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -1367,6 +1367,32 @@ func (kl *Kubelet) cleanupOrphanedVolumes(pods []*api.Pod, runningPods []*kubeco return nil } +// Delete any pods that are no longer running and are marked for deletion. +func (kl *Kubelet) cleanupTerminatedPods(pods []*api.Pod, runningPods []*kubecontainer.Pod) error { + var terminating []*api.Pod + for _, pod := range pods { + if pod.DeletionTimestamp != nil { + found := false + for _, runningPod := range runningPods { + if runningPod.ID == pod.UID { + found = true + break + } + } + if found { + podFullName := kubecontainer.GetPodFullName(pod) + glog.V(5).Infof("Keeping terminated pod %q and uid %q, still running", podFullName, pod.UID) + continue + } + terminating = append(terminating, pod) + } + } + if !kl.statusManager.TerminatePods(terminating) { + return errors.New("not all pods were successfully terminated") + } + return nil +} + // pastActiveDeadline returns true if the pod has been active for more than // ActiveDeadlineSeconds. func (kl *Kubelet) pastActiveDeadline(pod *api.Pod) bool { @@ -1529,6 +1555,10 @@ func (kl *Kubelet) cleanupPods(allPods []*api.Pod, admittedPods []*api.Pod) erro // Remove any orphaned mirror pods. kl.podManager.DeleteOrphanedMirrorPods() + if err := kl.cleanupTerminatedPods(allPods, runningPods); err != nil { + glog.Errorf("Failed to cleanup terminated pods: %v", err) + } + return err } diff --git a/pkg/kubelet/mirror_client.go b/pkg/kubelet/mirror_client.go index ae54c33017c4e..8ef630d1b7e5c 100644 --- a/pkg/kubelet/mirror_client.go +++ b/pkg/kubelet/mirror_client.go @@ -64,7 +64,7 @@ func (mc *basicMirrorClient) DeleteMirrorPod(podFullName string) error { return err } glog.V(4).Infof("Deleting a mirror pod %q", podFullName) - if err := mc.apiserverClient.Pods(namespace).Delete(name, nil); err != nil { + if err := mc.apiserverClient.Pods(namespace).Delete(name, api.NewDeleteOptions(0)); err != nil { glog.Errorf("Failed deleting a mirror pod %q: %v", podFullName, err) } return nil diff --git a/pkg/kubelet/status_manager.go b/pkg/kubelet/status_manager.go index 4d754286309a9..448c5bd5853f6 100644 --- a/pkg/kubelet/status_manager.go +++ b/pkg/kubelet/status_manager.go @@ -131,6 +131,29 @@ func (s *statusManager) SetPodStatus(pod *api.Pod, status api.PodStatus) { } } +// TerminatePods resets the container status for the provided pods to terminated and triggers +// a status update. 
This function may not enqueue all the provided pods, in which case it will +// return false +func (s *statusManager) TerminatePods(pods []*api.Pod) bool { + sent := true + s.podStatusesLock.Lock() + defer s.podStatusesLock.Unlock() + for _, pod := range pods { + for i := range pod.Status.ContainerStatuses { + pod.Status.ContainerStatuses[i].State = api.ContainerState{ + Terminated: &api.ContainerStateTerminated{}, + } + } + select { + case s.podStatusChannel <- podStatusSyncRequest{pod, pod.Status}: + default: + sent = false + glog.V(4).Infof("Termination notice for %q was dropped because the status channel is full", kubeletUtil.FormatPodName(pod)) + } + } + return sent +} + func (s *statusManager) DeletePodStatus(podFullName string) { s.podStatusesLock.Lock() defer s.podStatusesLock.Unlock() @@ -167,6 +190,10 @@ func (s *statusManager) syncBatch() error { return nil } if err == nil { + if len(pod.UID) > 0 && statusPod.UID != pod.UID { + glog.V(3).Infof("Pod %q was deleted and then recreated, skipping status update", kubeletUtil.FormatPodName(pod)) + return nil + } statusPod.Status = status // TODO: handle conflict as a retry, make that easier too. statusPod, err = s.kubeClient.Pods(pod.Namespace).UpdateStatus(statusPod) diff --git a/pkg/kubelet/status_manager_test.go b/pkg/kubelet/status_manager_test.go index 585d2ffd34f33..22e93275024d4 100644 --- a/pkg/kubelet/status_manager_test.go +++ b/pkg/kubelet/status_manager_test.go @@ -153,8 +153,21 @@ func TestUnchangedStatus(t *testing.T) { verifyUpdates(t, syncer, 1) } +func TestSyncBatchIgnoresNotFound(t *testing.T) { + syncer := newTestStatusManager() + syncer.SetPodStatus(testPod, getRandomPodStatus()) + err := syncer.syncBatch() + if err != nil { + t.Errorf("unexpected syncing error: %v", err) + } + verifyActions(t, syncer.kubeClient, []testclient.Action{ + testclient.GetActionImpl{ActionImpl: testclient.ActionImpl{Verb: "get", Resource: "pods"}}, + }) +} + func TestSyncBatch(t *testing.T) { syncer := newTestStatusManager() + syncer.kubeClient = testclient.NewSimpleFake(testPod) syncer.SetPodStatus(testPod, getRandomPodStatus()) err := syncer.syncBatch() if err != nil { @@ -167,6 +180,22 @@ func TestSyncBatch(t *testing.T) { ) } +func TestSyncBatchChecksMismatchedUID(t *testing.T) { + syncer := newTestStatusManager() + testPod.UID = "first" + differentPod := *testPod + differentPod.UID = "second" + syncer.kubeClient = testclient.NewSimpleFake(testPod) + syncer.SetPodStatus(&differentPod, getRandomPodStatus()) + err := syncer.syncBatch() + if err != nil { + t.Errorf("unexpected syncing error: %v", err) + } + verifyActions(t, syncer.kubeClient, []testclient.Action{ + testclient.GetActionImpl{ActionImpl: testclient.ActionImpl{Verb: "get", Resource: "pods"}}, + }) +} + // shuffle returns a new shuffled list of container statuses. func shuffle(statuses []api.ContainerStatus) []api.ContainerStatus { numStatuses := len(statuses) From edb108802d05f17c3c534a2c2a7d2909f365132a Mon Sep 17 00:00:00 2001 From: Clayton Coleman Date: Thu, 30 Jul 2015 00:31:13 -0400 Subject: [PATCH 5/8] Handle gracefully terminated pods in node controller Eviction should retry longer and wait for completion of the pod. 
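Eviction now runs in two phases: a first queue gracefully deletes all pods on a dead node, and a second queue force-deletes any pod whose grace period has run out. As a rough standalone sketch of the deadline arithmetic (illustrative names; the real logic lives in terminatePods below and is driven by pod.DeletionGracePeriodSeconds):

package main

import (
	"fmt"
	"time"
)

// maximumGracePeriod mirrors the controller's cap on how long an evicted pod
// may remain in the terminating state before it is force deleted.
const maximumGracePeriod = 5 * time.Minute

// remainingWait reports how much longer to wait before force deleting a pod
// evicted at since with the given grace period; a result <= 0 means "now".
func remainingWait(since time.Time, gracePeriodSeconds int64) time.Duration {
	grace := time.Duration(gracePeriodSeconds) * time.Second
	if grace > maximumGracePeriod {
		grace = maximumGracePeriod
	}
	return grace - time.Since(since)
}

func main() {
	evictedAt := time.Now().Add(-40 * time.Second)
	fmt.Println(remainingWait(evictedAt, 30))  // negative: force delete now
	fmt.Println(remainingWait(evictedAt, 600)) // capped at 5m, about 4m20s left
}

The controller retries with the largest remaining wait across the node's pods, so a node leaves the termination queue only once every evicted pod has been force deleted or has finished on its own.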
--- cmd/integration/integration.go | 2 +- .../app/controllermanager.go | 2 +- .../controllermanager/controllermanager.go | 2 +- pkg/controller/node/nodecontroller.go | 165 ++++++++++++---- pkg/controller/node/nodecontroller_test.go | 23 ++- pkg/controller/node/podevictor.go | 129 ------------ pkg/controller/node/rate_limited_queue.go | 183 ++++++++++++++++++ ...tor_test.go => rate_limited_queue_test.go} | 93 ++++++--- 8 files changed, 396 insertions(+), 203 deletions(-) delete mode 100644 pkg/controller/node/podevictor.go create mode 100644 pkg/controller/node/rate_limited_queue.go rename pkg/controller/node/{podevictor_test.go => rate_limited_queue_test.go} (67%) diff --git a/cmd/integration/integration.go b/cmd/integration/integration.go index d314a149101ea..44715dbac771f 100644 --- a/cmd/integration/integration.go +++ b/cmd/integration/integration.go @@ -196,7 +196,7 @@ func startComponents(firstManifestURL, secondManifestURL, apiVersion string) (st // TODO: Write an integration test for the replication controllers watch. go controllerManager.Run(3, util.NeverStop) - nodeController := nodecontroller.NewNodeController(nil, cl, 5*time.Minute, nodecontroller.NewPodEvictor(util.NewFakeRateLimiter()), + nodeController := nodecontroller.NewNodeController(nil, cl, 5*time.Minute, util.NewFakeRateLimiter(), 40*time.Second, 60*time.Second, 5*time.Second, nil, false) nodeController.Run(5 * time.Second) cadvisorInterface := new(cadvisor.Fake) diff --git a/cmd/kube-controller-manager/app/controllermanager.go b/cmd/kube-controller-manager/app/controllermanager.go index 935224d9d03e9..7deea27fb0919 100644 --- a/cmd/kube-controller-manager/app/controllermanager.go +++ b/cmd/kube-controller-manager/app/controllermanager.go @@ -196,7 +196,7 @@ func (s *CMServer) Run(_ []string) error { } nodeController := nodecontroller.NewNodeController(cloud, kubeClient, - s.PodEvictionTimeout, nodecontroller.NewPodEvictor(util.NewTokenBucketRateLimiter(s.DeletingPodsQps, s.DeletingPodsBurst)), + s.PodEvictionTimeout, util.NewTokenBucketRateLimiter(s.DeletingPodsQps, s.DeletingPodsBurst), s.NodeMonitorGracePeriod, s.NodeStartupGracePeriod, s.NodeMonitorPeriod, &s.ClusterCIDR, s.AllocateNodeCIDRs) nodeController.Run(s.NodeSyncPeriod) diff --git a/contrib/mesos/pkg/controllermanager/controllermanager.go b/contrib/mesos/pkg/controllermanager/controllermanager.go index f354430640da5..ad6aa59408a27 100644 --- a/contrib/mesos/pkg/controllermanager/controllermanager.go +++ b/contrib/mesos/pkg/controllermanager/controllermanager.go @@ -123,7 +123,7 @@ func (s *CMServer) Run(_ []string) error { } nodeController := nodecontroller.NewNodeController(cloud, kubeClient, - s.PodEvictionTimeout, nodecontroller.NewPodEvictor(util.NewTokenBucketRateLimiter(s.DeletingPodsQps, s.DeletingPodsBurst)), + s.PodEvictionTimeout, util.NewTokenBucketRateLimiter(s.DeletingPodsQps, s.DeletingPodsBurst), s.NodeMonitorGracePeriod, s.NodeStartupGracePeriod, s.NodeMonitorPeriod, (*net.IPNet)(&s.ClusterCIDR), s.AllocateNodeCIDRs) nodeController.Run(s.NodeSyncPeriod) diff --git a/pkg/controller/node/nodecontroller.go b/pkg/controller/node/nodecontroller.go index 22fa7f89c42bb..aa0f3e62ce79c 100644 --- a/pkg/controller/node/nodecontroller.go +++ b/pkg/controller/node/nodecontroller.go @@ -89,8 +89,11 @@ type NodeController struct { nodeStatusMap map[string]nodeStatusData now func() util.Time // worker that evicts pods from unresponsive nodes. 
- podEvictor *PodEvictor + podEvictor *RateLimitedTimedQueue + terminationEvictor *RateLimitedTimedQueue podEvictionTimeout time.Duration + // The maximum duration before a pod evicted from a node can be forcefully terminated. + maximumGracePeriod time.Duration recorder record.EventRecorder } @@ -99,7 +102,7 @@ func NewNodeController( cloud cloudprovider.Interface, kubeClient client.Interface, podEvictionTimeout time.Duration, - podEvictor *PodEvictor, + podEvictionLimiter util.RateLimiter, nodeMonitorGracePeriod time.Duration, nodeStartupGracePeriod time.Duration, nodeMonitorPeriod time.Duration, @@ -123,7 +126,9 @@ func NewNodeController( kubeClient: kubeClient, recorder: recorder, podEvictionTimeout: podEvictionTimeout, - podEvictor: podEvictor, + maximumGracePeriod: 5 * time.Minute, + podEvictor: NewRateLimitedTimedQueue(podEvictionLimiter, false), + terminationEvictor: NewRateLimitedTimedQueue(podEvictionLimiter, false), nodeStatusMap: make(map[string]nodeStatusData), nodeMonitorGracePeriod: nodeMonitorGracePeriod, nodeMonitorPeriod: nodeMonitorPeriod, @@ -145,38 +150,36 @@ func (nc *NodeController) Run(period time.Duration) { }, nc.nodeMonitorPeriod) go util.Forever(func() { - nc.podEvictor.TryEvict(func(nodeName string) { nc.deletePods(nodeName) }) + nc.podEvictor.Try(func(value TimedValue) (bool, time.Duration) { + remaining, err := nc.deletePods(value.Value) + if err != nil { + util.HandleError(fmt.Errorf("unable to evict node %q: %v", value.Value, err)) + return false, 0 + } + if remaining { + glog.V(2).Infof("Pods terminating on %q", value.Value) + nc.terminationEvictor.Add(value.Value) + } + return true, 0 + }) }, nodeEvictionPeriod) -} -// We observed a Node deletion in etcd. Currently we only need to remove Pods that -// were assigned to it. -func (nc *NodeController) deleteNode(nodeID string) error { - return nc.deletePods(nodeID) -} - -// deletePods will delete all pods from master running on given node. -func (nc *NodeController) deletePods(nodeID string) error { - glog.V(2).Infof("Delete all pods from %v", nodeID) - pods, err := nc.kubeClient.Pods(api.NamespaceAll).List(labels.Everything(), - fields.OneTermEqualSelector(client.PodHost, nodeID)) - if err != nil { - return err - } - nc.recordNodeEvent(nodeID, "DeletingAllPods", fmt.Sprintf("Deleting all Pods from Node %v.", nodeID)) - for _, pod := range pods.Items { - // Defensive check, also needed for tests. - if pod.Spec.NodeName != nodeID { - continue - } - glog.V(2).Infof("Delete pod %v", pod.Name) - nc.recorder.Eventf(&pod, "NodeControllerEviction", "Deleting Pod %s from Node %s", pod.Name, nodeID) - if err := nc.kubeClient.Pods(pod.Namespace).Delete(pod.Name, nil); err != nil { - glog.Errorf("Error deleting pod %v: %v", pod.Name, err) - } - } - - return nil + // TODO: replace with a controller that ensures pods that are terminating complete + // in a particular time period + go util.Forever(func() { + nc.terminationEvictor.Try(func(value TimedValue) (bool, time.Duration) { + remaining, err := nc.terminatePods(value.Value, value.Added) + if err != nil { + util.HandleError(fmt.Errorf("unable to terminate pods on node %q: %v", value.Value, err)) + return false, 0 + } + if remaining != 0 { + glog.V(2).Infof("Pods still terminating on %q, estimated completion %s", value.Value, remaining) + return false, remaining + } + return true, 0 + }) + }, nodeEvictionPeriod) } // Generates num pod CIDRs that could be assigned to nodes. 
@@ -271,18 +274,18 @@ func (nc *NodeController) monitorNodeStatus() error { // Check eviction timeout against decisionTimestamp if lastReadyCondition.Status == api.ConditionFalse && decisionTimestamp.After(nc.nodeStatusMap[node.Name].readyTransitionTimestamp.Add(nc.podEvictionTimeout)) { - if nc.podEvictor.AddNodeToEvict(node.Name) { + if nc.podEvictor.Add(node.Name) { glog.Infof("Adding pods to evict: %v is later than %v + %v", decisionTimestamp, nc.nodeStatusMap[node.Name].readyTransitionTimestamp, nc.podEvictionTimeout) } } if lastReadyCondition.Status == api.ConditionUnknown && decisionTimestamp.After(nc.nodeStatusMap[node.Name].probeTimestamp.Add(nc.podEvictionTimeout-gracePeriod)) { - if nc.podEvictor.AddNodeToEvict(node.Name) { + if nc.podEvictor.Add(node.Name) { glog.Infof("Adding pods to evict2: %v is later than %v + %v", decisionTimestamp, nc.nodeStatusMap[node.Name].readyTransitionTimestamp, nc.podEvictionTimeout-gracePeriod) } } if lastReadyCondition.Status == api.ConditionTrue { - if nc.podEvictor.RemoveNodeToEvict(node.Name) { + if nc.podEvictor.Remove(node.Name) { glog.Infof("Pods on %v won't be evicted", node.Name) } } @@ -302,8 +305,8 @@ func (nc *NodeController) monitorNodeStatus() error { } if _, err := instances.ExternalID(node.Name); err != nil && err == cloudprovider.InstanceNotFound { glog.Infof("Deleting node (no longer present in cloud provider): %s", node.Name) - nc.recordNodeEvent(node.Name, "DeleteingNode", fmt.Sprintf("Deleting Node %v because it's not present according to cloud provider", node.Name)) - if err := nc.deletePods(node.Name); err != nil { + nc.recordNodeEvent(node.Name, "DeletingNode", fmt.Sprintf("Deleting Node %v because it's not present according to cloud provider", node.Name)) + if _, err := nc.deletePods(node.Name); err != nil { glog.Errorf("Unable to delete pods from node %s: %v", node.Name, err) continue } @@ -311,6 +314,7 @@ func (nc *NodeController) monitorNodeStatus() error { glog.Errorf("Unable to delete node %s: %v", node.Name, err) continue } + nc.podEvictor.Add(node.Name) } } } @@ -502,3 +506,88 @@ func (nc *NodeController) tryUpdateNodeStatus(node *api.Node) (time.Duration, ap return gracePeriod, lastReadyCondition, readyCondition, err } + +// We observed a Node deletion in etcd. Currently we only need to remove Pods that +// were assigned to it. +func (nc *NodeController) deleteNode(nodeID string) error { + nc.podEvictor.Add(nodeID) + return nil +} + +// deletePods will delete all pods from master running on given node, and return true +// if any pods were deleted. +func (nc *NodeController) deletePods(nodeID string) (bool, error) { + remaining := false + glog.V(2).Infof("Delete all pods from %s", nodeID) + pods, err := nc.kubeClient.Pods(api.NamespaceAll).List(labels.Everything(), + fields.OneTermEqualSelector(client.PodHost, nodeID)) + if err != nil { + return remaining, err + } + + nc.recordNodeEvent(nodeID, "DeletingAllPods", fmt.Sprintf("Deleting all Pods from Node %v.", nodeID)) + + for _, pod := range pods.Items { + // Defensive check, also needed for tests. 
+ if pod.Spec.NodeName != nodeID { + continue + } + // if the pod has already been deleted, ignore it + if pod.DeletionGracePeriodSeconds != nil { + continue + } + + glog.V(2).Infof("Delete pod %v", pod.Name) + nc.recorder.Eventf(&pod, "NodeControllerEviction", "Deleting Pod %s from Node %s", pod.Name, nodeID) + if err := nc.kubeClient.Pods(pod.Namespace).Delete(pod.Name, nil); err != nil { + return false, err + } + remaining = true + } + return remaining, nil +} + +// terminatePods will ensure all pods on the given node that are in terminating state are eventually +// cleaned up +func (nc *NodeController) terminatePods(nodeID string, since time.Time) (time.Duration, error) { + remaining := time.Duration(0) + glog.V(2).Infof("Terminating all pods on %s", nodeID) + pods, err := nc.kubeClient.Pods(api.NamespaceAll).List(labels.Everything(), + fields.OneTermEqualSelector(client.PodHost, nodeID)) + if err != nil { + return remaining, err + } + + nc.recordNodeEvent(nodeID, "TerminatingAllPods", fmt.Sprintf("Terminating all Pods on Node %s.", nodeID)) + + now := time.Now() + elapsed := now.Sub(since) + for _, pod := range pods.Items { + // Defensive check, also needed for tests. + if pod.Spec.NodeName != nodeID { + continue + } + // only clean terminated pods + if pod.DeletionGracePeriodSeconds == nil { + continue + } + + grace := time.Duration(*pod.DeletionGracePeriodSeconds) * time.Second + if grace > nc.maximumGracePeriod { + grace = nc.maximumGracePeriod + } + next := grace - elapsed + if next < 0 { + glog.V(2).Infof("Removing pod %v after %s grace period", pod.Name, grace) + nc.recordNodeEvent(nodeID, "TerminatingEvictedPod", fmt.Sprintf("Pod %s has exceeded the grace period for deletion after being evicted from Node %q and is being force killed", pod.Name, nodeID)) + if err := nc.kubeClient.Pods(pod.Namespace).Delete(pod.Name, api.NewDeleteOptions(0)); err != nil { + glog.Errorf("Error completing deletion of pod %s: %v", pod.Name, err) + next = 1 + } + } + if remaining < next { + remaining = next + } + } + return remaining, nil +} diff --git a/pkg/controller/node/nodecontroller_test.go b/pkg/controller/node/nodecontroller_test.go index af73e2fc03dd1..6239673967193 100644 --- a/pkg/controller/node/nodecontroller_test.go +++ b/pkg/controller/node/nodecontroller_test.go @@ -324,9 +324,8 @@ func TestMonitorNodeStatusEvictPods(t *testing.T) { } for _, item := range table { - podEvictor := NewPodEvictor(util.NewFakeRateLimiter()) nodeController := NewNodeController(nil, item.fakeNodeHandler, - evictionTimeout, podEvictor, testNodeMonitorGracePeriod, + evictionTimeout, util.NewFakeRateLimiter(), testNodeMonitorGracePeriod, testNodeStartupGracePeriod, testNodeMonitorPeriod, nil, false) nodeController.now = func() util.Time { return fakeNow } if err := nodeController.monitorNodeStatus(); err != nil { @@ -340,7 +339,17 @@ func TestMonitorNodeStatusEvictPods(t *testing.T) { t.Errorf("unexpected error: %v", err) } - podEvictor.TryEvict(func(nodeName string) { nodeController.deletePods(nodeName) }) + nodeController.podEvictor.Try(func(value TimedValue) (bool, time.Duration) { + remaining, _ := nodeController.deletePods(value.Value) + if remaining { + nodeController.terminationEvictor.Add(value.Value) + } + return true, 0 + }) + nodeController.podEvictor.Try(func(value TimedValue) (bool, time.Duration) { + nodeController.terminatePods(value.Value, value.Added) + return true, 0 + }) podEvicted := false for _, action := range item.fakeNodeHandler.Actions() { if action.GetVerb() == "delete" && 
action.GetResource() == "pods" { @@ -531,7 +540,7 @@ func TestMonitorNodeStatusUpdateStatus(t *testing.T) { } for _, item := range table { - nodeController := NewNodeController(nil, item.fakeNodeHandler, 5*time.Minute, NewPodEvictor(util.NewFakeRateLimiter()), + nodeController := NewNodeController(nil, item.fakeNodeHandler, 5*time.Minute, util.NewFakeRateLimiter(), testNodeMonitorGracePeriod, testNodeStartupGracePeriod, testNodeMonitorPeriod, nil, false) nodeController.now = func() util.Time { return fakeNow } if err := nodeController.monitorNodeStatus(); err != nil { @@ -610,7 +619,7 @@ func TestNodeDeletion(t *testing.T) { Fake: testclient.NewSimpleFake(&api.PodList{Items: []api.Pod{*newPod("pod0", "node0"), *newPod("pod1", "node1")}}), } - nodeController := NewNodeController(nil, fakeNodeHandler, 5*time.Minute, NewPodEvictor(util.NewFakeRateLimiter()), + nodeController := NewNodeController(nil, fakeNodeHandler, 5*time.Minute, util.NewFakeRateLimiter(), testNodeMonitorGracePeriod, testNodeStartupGracePeriod, testNodeMonitorPeriod, nil, false) nodeController.now = func() util.Time { return fakeNow } if err := nodeController.monitorNodeStatus(); err != nil { @@ -620,6 +629,10 @@ func TestNodeDeletion(t *testing.T) { if err := nodeController.monitorNodeStatus(); err != nil { t.Errorf("unexpected error: %v", err) } + nodeController.podEvictor.Try(func(value TimedValue) (bool, time.Duration) { + nodeController.deletePods(value.Value) + return true, 0 + }) podEvicted := false for _, action := range fakeNodeHandler.Actions() { if action.GetVerb() == "delete" && action.GetResource() == "pods" { diff --git a/pkg/controller/node/podevictor.go b/pkg/controller/node/podevictor.go deleted file mode 100644 index 1c66d0a211edf..0000000000000 --- a/pkg/controller/node/podevictor.go +++ /dev/null @@ -1,129 +0,0 @@ -/* -Copyright 2015 The Kubernetes Authors All rights reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. -*/ - -package nodecontroller - -import ( - "sync" - - "k8s.io/kubernetes/pkg/util" - - "github.com/golang/glog" -) - -// A FIFO queue which additionally guarantees that any element can be added only once until -// it is removed. -type UniqueQueue struct { - lock sync.Mutex - queue []string - set util.StringSet -} - -// Entity responsible for evicting Pods from inserted Nodes. It uses RateLimiter to avoid -// evicting everything at once. Note that we rate limit eviction of Nodes not individual Pods. -type PodEvictor struct { - queue UniqueQueue - deletingPodsRateLimiter util.RateLimiter -} - -// Adds a new value to the queue if it wasn't added before, or was explicitly removed by the -// Remove call. Returns true if new value was added. -func (q *UniqueQueue) Add(value string) bool { - q.lock.Lock() - defer q.lock.Unlock() - - if !q.set.Has(value) { - q.queue = append(q.queue, value) - q.set.Insert(value) - return true - } else { - return false - } -} - -// Removes the value from the queue, so Get() call won't return it, and allow subsequent addition -// of the given value. 
If the value is not present does nothing and returns false. -func (q *UniqueQueue) Remove(value string) bool { - q.lock.Lock() - defer q.lock.Unlock() - - q.set.Delete(value) - for i, val := range q.queue { - if val == value { - if i > 0 && i < len(q.queue)-1 { - q.queue = append(q.queue[0:i], q.queue[i+1:len(q.queue)]...) - } else if i > 0 { - q.queue = q.queue[0 : len(q.queue)-1] - } else { - q.queue = q.queue[1:len(q.queue)] - } - return true - } - } - return false -} - -// Returns the oldest added value that wasn't returned yet. -func (q *UniqueQueue) Get() (string, bool) { - q.lock.Lock() - defer q.lock.Unlock() - if len(q.queue) == 0 { - return "", false - } - - result := q.queue[0] - q.queue = q.queue[1:len(q.queue)] - return result, true -} - -// Creates new PodEvictor which will use given RateLimiter to oversee eviction. -func NewPodEvictor(deletingPodsRateLimiter util.RateLimiter) *PodEvictor { - return &PodEvictor{ - queue: UniqueQueue{ - queue: make([]string, 0), - set: util.NewStringSet(), - }, - deletingPodsRateLimiter: deletingPodsRateLimiter, - } -} - -// Tries to evict all Pods from previously inserted Nodes. Ends prematurely if RateLimiter forbids any eviction. -// Each Node is processed only once, as long as it's not Removed, i.e. calling multiple AddNodeToEvict does not result -// with multiple evictions as long as RemoveNodeToEvict is not called. -func (pe *PodEvictor) TryEvict(delFunc func(string)) { - val, ok := pe.queue.Get() - for ok { - if pe.deletingPodsRateLimiter.CanAccept() { - glog.Infof("PodEvictor is evicting Pods on Node: %v", val) - delFunc(val) - } else { - glog.V(1).Info("PodEvictor is rate limitted.") - break - } - val, ok = pe.queue.Get() - } -} - -// Adds Node to the Evictor to be processed later. Won't add the same Node second time if it was already -// added and not removed. -func (pe *PodEvictor) AddNodeToEvict(nodeName string) bool { - return pe.queue.Add(nodeName) -} - -// Removes Node from the Evictor. The Node won't be processed until added again. -func (pe *PodEvictor) RemoveNodeToEvict(nodeName string) bool { - return pe.queue.Remove(nodeName) -} diff --git a/pkg/controller/node/rate_limited_queue.go b/pkg/controller/node/rate_limited_queue.go new file mode 100644 index 0000000000000..a30d62a1a8aad --- /dev/null +++ b/pkg/controller/node/rate_limited_queue.go @@ -0,0 +1,183 @@ +/* +Copyright 2015 The Kubernetes Authors All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package nodecontroller + +import ( + "container/heap" + "sync" + "time" + + "k8s.io/kubernetes/pkg/util" +) + +// TimedValue is a value that should be processed at a designated time. 
+type TimedValue struct { + Value string + Added time.Time + Next time.Time +} + +// now is used to test time +var now func() time.Time = time.Now + +// TimedQueue is a priority heap where the lowest Next is at the front of the queue +type TimedQueue []*TimedValue + +func (h TimedQueue) Len() int { return len(h) } +func (h TimedQueue) Less(i, j int) bool { return h[i].Next.Before(h[j].Next) } +func (h TimedQueue) Swap(i, j int) { h[i], h[j] = h[j], h[i] } + +func (h *TimedQueue) Push(x interface{}) { + *h = append(*h, x.(*TimedValue)) +} + +func (h *TimedQueue) Pop() interface{} { + old := *h + n := len(old) + x := old[n-1] + *h = old[0 : n-1] + return x +} + +// A FIFO queue which additionally guarantees that any element can be added only once until +// it is removed. +type UniqueQueue struct { + lock sync.Mutex + queue TimedQueue + set util.StringSet +} + +// Adds a new value to the queue if it wasn't added before, or was explicitly removed by the +// Remove call. Returns true if the new value was added. +func (q *UniqueQueue) Add(value TimedValue) bool { + q.lock.Lock() + defer q.lock.Unlock() + + if q.set.Has(value.Value) { + return false + } + heap.Push(&q.queue, &value) + q.set.Insert(value.Value) + return true +} + +// Removes the value from the queue, so a Get() call won't return it, and allows subsequent addition +// of the given value. If the value is not present this does nothing and returns false. +func (q *UniqueQueue) Remove(value string) bool { + q.lock.Lock() + defer q.lock.Unlock() + + q.set.Delete(value) + for i, val := range q.queue { + if val.Value == value { + if i > 0 && i < len(q.queue)-1 { + q.queue = append(q.queue[0:i], q.queue[i+1:len(q.queue)]...) + } else if i > 0 { + q.queue = q.queue[0 : len(q.queue)-1] + } else { + q.queue = q.queue[1:len(q.queue)] + } + return true + } + } + return false +} + +// Returns the value with the earliest Next that wasn't returned yet. +func (q *UniqueQueue) Get() (TimedValue, bool) { + q.lock.Lock() + defer q.lock.Unlock() + if len(q.queue) == 0 { + return TimedValue{}, false + } + result := heap.Pop(&q.queue).(*TimedValue) + q.set.Delete(result.Value) + return *result, true +} + +// RateLimitedTimedQueue is a unique item priority queue ordered by the expected next time +// of execution. It is also rate limited. +type RateLimitedTimedQueue struct { + queue UniqueQueue + limiter util.RateLimiter + leak bool +} + +// Creates new queue which will use given RateLimiter to oversee execution. If leak is true, +// items which are rate limited will be dropped. Otherwise, rate limited items will be requeued. +func NewRateLimitedTimedQueue(limiter util.RateLimiter, leak bool) *RateLimitedTimedQueue { + return &RateLimitedTimedQueue{ + queue: UniqueQueue{ + queue: TimedQueue{}, + set: util.NewStringSet(), + }, + limiter: limiter, + leak: leak, + } +} + +// ActionFunc takes a timed value and returns false if the item must be retried, with an optional +// time.Duration if some minimum wait interval should be used. +type ActionFunc func(TimedValue) (bool, time.Duration) + +// Try processes the queue. Ends prematurely if RateLimiter forbids an action and leak is true. +// Otherwise, requeues the item to be processed. Each value is processed once if fn returns true, +// otherwise it is added back to the queue. The returned remaining is used to identify the minimum +// time to execute the next item in the queue.
+func (q *RateLimitedTimedQueue) Try(fn ActionFunc) { + val, ok := q.queue.Get() + for ok { + // rate limit the queue checking + if q.leak { + if !q.limiter.CanAccept() { + break + } + } else { + q.limiter.Accept() + } + + now := now() + if now.Before(val.Next) { + q.queue.Add(val) + val, ok = q.queue.Get() + // we do not sleep here because other values may be added at the front of the queue + continue + } + + if ok, wait := fn(val); !ok { + val.Next = now.Add(wait + 1) + q.queue.Add(val) + } + val, ok = q.queue.Get() + } +} + +// Adds value to the queue to be processed. Won't add the same value a second time if it was already +// added and not removed. +func (q *RateLimitedTimedQueue) Add(value string) bool { + now := now() + return q.queue.Add(TimedValue{ + Value: value, + Added: now, + Next: now, + }) +} + +// Removes Node from the Evictor. The Node won't be processed until added again. +func (q *RateLimitedTimedQueue) Remove(value string) bool { + return q.queue.Remove(value) +} diff --git a/pkg/controller/node/podevictor_test.go b/pkg/controller/node/rate_limited_queue_test.go similarity index 67% rename from pkg/controller/node/podevictor_test.go rename to pkg/controller/node/rate_limited_queue_test.go index dd9532efff7df..d57baf9d24df2 100644 --- a/pkg/controller/node/podevictor_test.go +++ b/pkg/controller/node/rate_limited_queue_test.go @@ -17,14 +17,16 @@ limitations under the License. package nodecontroller import ( + "reflect" "testing" + "time" "k8s.io/kubernetes/pkg/util" ) -func CheckQueueEq(lhs, rhs []string) bool { +func CheckQueueEq(lhs []string, rhs TimedQueue) bool { for i := 0; i < len(lhs); i++ { - if rhs[i] != lhs[i] { + if rhs[i].Value != lhs[i] { return false } } @@ -36,10 +38,10 @@ func CheckSetEq(lhs, rhs util.StringSet) bool { } func TestAddNode(t *testing.T) { - evictor := NewPodEvictor(util.NewFakeRateLimiter()) - evictor.AddNodeToEvict("first") - evictor.AddNodeToEvict("second") - evictor.AddNodeToEvict("third") + evictor := NewRateLimitedTimedQueue(util.NewFakeRateLimiter(), true) + evictor.Add("first") + evictor.Add("second") + evictor.Add("third") queuePattern := []string{"first", "second", "third"} if len(evictor.queue.queue) != len(queuePattern) { @@ -59,11 +61,11 @@ func TestAddNode(t *testing.T) { } func TestDelNode(t *testing.T) { - evictor := NewPodEvictor(util.NewFakeRateLimiter()) - evictor.AddNodeToEvict("first") - evictor.AddNodeToEvict("second") - evictor.AddNodeToEvict("third") - evictor.RemoveNodeToEvict("first") + evictor := NewRateLimitedTimedQueue(util.NewFakeRateLimiter(), true) + evictor.Add("first") + evictor.Add("second") + evictor.Add("third") + evictor.Remove("first") queuePattern := []string{"second", "third"} if len(evictor.queue.queue) != len(queuePattern) { @@ -81,11 +83,11 @@ func TestDelNode(t *testing.T) { t.Errorf("Invalid map. Got %v, expected %v", evictor.queue.set, setPattern) } - evictor = NewPodEvictor(util.NewFakeRateLimiter()) - evictor.AddNodeToEvict("first") - evictor.AddNodeToEvict("second") - evictor.AddNodeToEvict("third") - evictor.RemoveNodeToEvict("second") + evictor = NewRateLimitedTimedQueue(util.NewFakeRateLimiter(), true) + evictor.Add("first") + evictor.Add("second") + evictor.Add("third") + evictor.Remove("second") queuePattern = []string{"first", "third"} if len(evictor.queue.queue) != len(queuePattern) { @@ -103,11 +105,11 @@ func TestDelNode(t *testing.T) { t.Errorf("Invalid map. 
Got %v, expected %v", evictor.queue.set, setPattern) } - evictor = NewPodEvictor(util.NewFakeRateLimiter()) - evictor.AddNodeToEvict("first") - evictor.AddNodeToEvict("second") - evictor.AddNodeToEvict("third") - evictor.RemoveNodeToEvict("third") + evictor = NewRateLimitedTimedQueue(util.NewFakeRateLimiter(), true) + evictor.Add("first") + evictor.Add("second") + evictor.Add("third") + evictor.Remove("third") queuePattern = []string{"first", "second"} if len(evictor.queue.queue) != len(queuePattern) { @@ -126,15 +128,18 @@ func TestDelNode(t *testing.T) { } } -func TestEvictNode(t *testing.T) { - evictor := NewPodEvictor(util.NewFakeRateLimiter()) - evictor.AddNodeToEvict("first") - evictor.AddNodeToEvict("second") - evictor.AddNodeToEvict("third") - evictor.RemoveNodeToEvict("second") +func TestTry(t *testing.T) { + evictor := NewRateLimitedTimedQueue(util.NewFakeRateLimiter(), true) + evictor.Add("first") + evictor.Add("second") + evictor.Add("third") + evictor.Remove("second") deletedMap := util.NewStringSet() - evictor.TryEvict(func(nodeName string) { deletedMap.Insert(nodeName) }) + evictor.Try(func(value TimedValue) (bool, time.Duration) { + deletedMap.Insert(value.Value) + return true, 0 + }) setPattern := util.NewStringSet("first", "third") if len(deletedMap) != len(setPattern) { @@ -144,3 +149,35 @@ func TestEvictNode(t *testing.T) { t.Errorf("Invalid map. Got %v, expected %v", deletedMap, setPattern) } } + +func TestTryOrdering(t *testing.T) { + evictor := NewRateLimitedTimedQueue(util.NewFakeRateLimiter(), false) + evictor.Add("first") + evictor.Add("second") + evictor.Add("third") + + order := []string{} + count := 0 + queued := false + evictor.Try(func(value TimedValue) (bool, time.Duration) { + count++ + if value.Added.IsZero() { + t.Fatalf("added should not be zero") + } + if value.Next.IsZero() { + t.Fatalf("next should not be zero") + } + if !queued && value.Value == "second" { + queued = true + return false, time.Millisecond + } + order = append(order, value.Value) + return true, 0 + }) + if reflect.DeepEqual(order, []string{"first", "third", "second"}) { + t.Fatalf("order was wrong: %v", order) + } + if count != 4 { + t.Fatalf("unexpected iterations: %d", count) + } +} From adc97bf93617ddf2b20016c02f0e6332d8503a4f Mon Sep 17 00:00:00 2001 From: Clayton Coleman Date: Thu, 13 Aug 2015 17:48:56 -0400 Subject: [PATCH 6/8] Conversions --- hack/verify-flags/known-flags.txt | 1 + pkg/expapi/deep_copy_generated.go | 6 ++++++ pkg/expapi/v1/conversion_generated.go | 12 ++++++++++++ pkg/expapi/v1/deep_copy_generated.go | 6 ++++++ 4 files changed, 25 insertions(+) diff --git a/hack/verify-flags/known-flags.txt b/hack/verify-flags/known-flags.txt index e843d901ec209..c6e4467647476 100644 --- a/hack/verify-flags/known-flags.txt +++ b/hack/verify-flags/known-flags.txt @@ -51,6 +51,7 @@ current-release-pr current-replicas default-container-cpu-limit default-container-mem-limit +delay-shutdown deleting-pods-burst deleting-pods-qps deployment-label-key diff --git a/pkg/expapi/deep_copy_generated.go b/pkg/expapi/deep_copy_generated.go index 0ebee13fc50ae..53276c2fcdd32 100644 --- a/pkg/expapi/deep_copy_generated.go +++ b/pkg/expapi/deep_copy_generated.go @@ -53,6 +53,12 @@ func deepCopy_api_ObjectMeta(in api.ObjectMeta, out *api.ObjectMeta, c *conversi } else { out.DeletionTimestamp = nil } + if in.DeletionGracePeriodSeconds != nil { + out.DeletionGracePeriodSeconds = new(int64) + *out.DeletionGracePeriodSeconds = *in.DeletionGracePeriodSeconds + } else { + out.DeletionGracePeriodSeconds 
= nil + } if in.Labels != nil { out.Labels = make(map[string]string) for key, val := range in.Labels { diff --git a/pkg/expapi/v1/conversion_generated.go b/pkg/expapi/v1/conversion_generated.go index 875868e4947ad..dd50f74c1510f 100644 --- a/pkg/expapi/v1/conversion_generated.go +++ b/pkg/expapi/v1/conversion_generated.go @@ -57,6 +57,12 @@ func convert_api_ObjectMeta_To_v1_ObjectMeta(in *api.ObjectMeta, out *v1.ObjectM } else { out.DeletionTimestamp = nil } + if in.DeletionGracePeriodSeconds != nil { + out.DeletionGracePeriodSeconds = new(int64) + *out.DeletionGracePeriodSeconds = *in.DeletionGracePeriodSeconds + } else { + out.DeletionGracePeriodSeconds = nil + } if in.Labels != nil { out.Labels = make(map[string]string) for key, val := range in.Labels { @@ -115,6 +121,12 @@ func convert_v1_ObjectMeta_To_api_ObjectMeta(in *v1.ObjectMeta, out *api.ObjectM } else { out.DeletionTimestamp = nil } + if in.DeletionGracePeriodSeconds != nil { + out.DeletionGracePeriodSeconds = new(int64) + *out.DeletionGracePeriodSeconds = *in.DeletionGracePeriodSeconds + } else { + out.DeletionGracePeriodSeconds = nil + } if in.Labels != nil { out.Labels = make(map[string]string) for key, val := range in.Labels { diff --git a/pkg/expapi/v1/deep_copy_generated.go b/pkg/expapi/v1/deep_copy_generated.go index 2543a796d95cf..1dc045e9445a7 100644 --- a/pkg/expapi/v1/deep_copy_generated.go +++ b/pkg/expapi/v1/deep_copy_generated.go @@ -70,6 +70,12 @@ func deepCopy_v1_ObjectMeta(in v1.ObjectMeta, out *v1.ObjectMeta, c *conversion. } else { out.DeletionTimestamp = nil } + if in.DeletionGracePeriodSeconds != nil { + out.DeletionGracePeriodSeconds = new(int64) + *out.DeletionGracePeriodSeconds = *in.DeletionGracePeriodSeconds + } else { + out.DeletionGracePeriodSeconds = nil + } if in.Labels != nil { out.Labels = make(map[string]string) for key, val := range in.Labels { From 5ddcef24a62e81085250d80b2aa61f5d19750546 Mon Sep 17 00:00:00 2001 From: Clayton Coleman Date: Fri, 14 Aug 2015 16:00:21 -0400 Subject: [PATCH 7/8] Fix failing test --- pkg/storage/cacher_test.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/pkg/storage/cacher_test.go b/pkg/storage/cacher_test.go index 0603ec3465d9d..edbcaf32caf43 100644 --- a/pkg/storage/cacher_test.go +++ b/pkg/storage/cacher_test.go @@ -54,11 +54,13 @@ func newTestCacher(client tools.EtcdClient) *storage.Cacher { } func makeTestPod(name string) *api.Pod { + gracePeriod := int64(30) return &api.Pod{ ObjectMeta: api.ObjectMeta{Namespace: "ns", Name: name}, Spec: api.PodSpec{ - DNSPolicy: api.DNSClusterFirst, - RestartPolicy: api.RestartPolicyAlways, + TerminationGracePeriodSeconds: &gracePeriod, + DNSPolicy: api.DNSClusterFirst, + RestartPolicy: api.RestartPolicyAlways, }, } } From 266e6226e52b55f8a6d5278c1264299d7c713820 Mon Sep 17 00:00:00 2001 From: Clayton Coleman Date: Mon, 17 Aug 2015 16:00:06 -0400 Subject: [PATCH 8/8] Move slow-* items to test/images/network-tester --- {contrib/for-tests => test/images}/network-tester/slow-pod.json | 0 {contrib/for-tests => test/images}/network-tester/slow-rc.json | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename {contrib/for-tests => test/images}/network-tester/slow-pod.json (100%) rename {contrib/for-tests => test/images}/network-tester/slow-rc.json (100%) diff --git a/contrib/for-tests/network-tester/slow-pod.json b/test/images/network-tester/slow-pod.json similarity index 100% rename from contrib/for-tests/network-tester/slow-pod.json rename to test/images/network-tester/slow-pod.json diff --git 
a/contrib/for-tests/network-tester/slow-rc.json b/test/images/network-tester/slow-rc.json similarity index 100% rename from contrib/for-tests/network-tester/slow-rc.json rename to test/images/network-tester/slow-rc.json
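Taken together, the patches settle on a single grace-period resolution rule for pod deletion, implemented by podStrategy.CheckGracefulDelete in pkg/registry/pod/rest.go (patch 2). A simplified standalone sketch of that precedence, not the registry code itself:

package main

import "fmt"

// resolveGracePeriod mirrors the precedence used by CheckGracefulDelete: an
// explicit DeleteOptions value wins, then the pod's own
// TerminationGracePeriodSeconds, and an unscheduled pod is deleted at once.
func resolveGracePeriod(optionsGrace, specGrace *int64, nodeName string) int64 {
	period := int64(0)
	switch {
	case optionsGrace != nil:
		period = *optionsGrace
	case specGrace != nil:
		period = *specGrace
	}
	if len(nodeName) == 0 { // never scheduled: nothing to shut down gracefully
		period = 0
	}
	return period
}

func main() {
	spec := int64(30)
	user := int64(5)
	fmt.Println(resolveGracePeriod(nil, &spec, "node-1"))   // 30: pod's default
	fmt.Println(resolveGracePeriod(&user, &spec, "node-1")) // 5: caller wins
	fmt.Println(resolveGracePeriod(nil, &spec, ""))         // 0: unscheduled
}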