Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Do not throw creation errors for containers that fail immediately after being started #23894

Merged
merged 2 commits into from
Apr 15, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 18 additions & 16 deletions pkg/util/oom/oom_linux.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@ package oom

import (
"fmt"
"io/ioutil"
"os"
"path"
"strconv"
Expand Down Expand Up @@ -49,16 +48,6 @@ func getPids(cgroupName string) ([]int, error) {
return fsManager.GetPids()
}

func syscallNotExists(err error) bool {
if err == nil {
return false
}
if e, ok := err.(*os.SyscallError); ok && os.IsNotExist(e) {
return true
}
return false
}

// Writes 'value' to /proc/<pid>/oom_score_adj. PID = 0 means self
// Returns os.ErrNotExist if the `pid` does not exist.
func applyOOMScoreAdj(pid int, oomScoreAdj int) error {
Expand All @@ -78,12 +67,19 @@ func applyOOMScoreAdj(pid int, oomScoreAdj int) error {
value := strconv.Itoa(oomScoreAdj)
var err error
for i := 0; i < maxTries; i++ {
if err = ioutil.WriteFile(oomScoreAdjPath, []byte(value), 0700); err != nil {
if syscallNotExists(err) {
f, err := os.Open(oomScoreAdjPath)
if err != nil {
if os.IsNotExist(err) {
return os.ErrNotExist
}
err = fmt.Errorf("failed to apply oom-score-adj to pid %d (%v)", pid, err)
continue
}
if _, err := f.Write([]byte(value)); err != nil {
err = fmt.Errorf("failed to apply oom-score-adj to pid %d (%v)", pid, err)
continue
}
return nil
}
return err
}
Expand All @@ -96,20 +92,26 @@ func (oomAdjuster *OOMAdjuster) applyOOMScoreAdjContainer(cgroupName string, oom
continueAdjusting := false
pidList, err := oomAdjuster.pidLister(cgroupName)
if err != nil {
if syscallNotExists(err) {
if os.IsNotExist(err) {
// Nothing to do since the container doesn't exist anymore.
return os.ErrNotExist
}
continueAdjusting = true
glog.Errorf("Error getting process list for cgroup %s: %+v", cgroupName, err)
glog.V(10).Infof("Error getting process list for cgroup %s: %+v", cgroupName, err)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

10 seems quite high - is that intentional?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah. Its because of the inherent race between starting a container and applying oom score asynchronously.

} else if len(pidList) == 0 {
glog.V(10).Infof("Pid list is empty")
continueAdjusting = true
} else {
for _, pid := range pidList {
if !adjustedProcessSet[pid] {
continueAdjusting = true
glog.V(10).Infof("pid %d needs to be set", pid)
if err = oomAdjuster.ApplyOOMScoreAdj(pid, oomScoreAdj); err == nil {
adjustedProcessSet[pid] = true
} else if err == os.ErrNotExist {
continue
} else {
glog.V(10).Infof("cannot adjust oom score for pid %d - %v", pid, err)
continueAdjusting = true
}
// Processes can come and go while we try to apply oom score adjust value. So ignore errors here.
}
Expand Down
30 changes: 12 additions & 18 deletions pkg/util/oom/oom_linux_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,10 @@ limitations under the License.
package oom

import (
"os"
"testing"

"github.com/stretchr/testify/assert"
)

// Converts a sequence of PID lists into a PID lister.
Expand Down Expand Up @@ -62,10 +65,9 @@ func applyOOMScoreAdjContainerTester(pidListSequence [][]int, maxTries int, appl
} else if err != nil {
return
}

// Check that OOM scores were applied to the right processes.
if len(appliedPids) != len(pidOOMs) {
t.Errorf("Applied OOM scores to incorrect number of processes")
t.Errorf("Applied OOM scores to incorrect number of processes - %+v vs %v", appliedPids, pidOOMs)
return
}
for _, pid := range appliedPids {
Expand All @@ -82,29 +84,21 @@ func TestOOMScoreAdjContainer(t *testing.T) {
pidListSequence1 := [][]int{
{1, 2},
}
applyOOMScoreAdjContainerTester(pidListSequence1, 1, nil, true, t)
applyOOMScoreAdjContainerTester(pidListSequence1, 2, []int{1, 2}, false, t)
applyOOMScoreAdjContainerTester(pidListSequence1, 3, []int{1, 2}, false, t)

pidListSequence3 := [][]int{
{1, 2},
{1, 2, 4, 5},
{2, 1, 4, 5, 3},
}
applyOOMScoreAdjContainerTester(pidListSequence3, 1, nil, true, t)
applyOOMScoreAdjContainerTester(pidListSequence3, 2, nil, true, t)
applyOOMScoreAdjContainerTester(pidListSequence3, 3, nil, true, t)
applyOOMScoreAdjContainerTester(pidListSequence3, 4, []int{1, 2, 3, 4, 5}, false, t)
applyOOMScoreAdjContainerTester(pidListSequence1, 1, []int{1, 2}, false, t)

pidListSequenceLag := [][]int{
{},
{},
{},
{1, 2, 4},
{1, 2, 4, 5},
}
for i := 1; i < 5; i++ {
for i := 1; i < 4; i++ {
applyOOMScoreAdjContainerTester(pidListSequenceLag, i, nil, true, t)
}
applyOOMScoreAdjContainerTester(pidListSequenceLag, 6, []int{1, 2, 4, 5}, false, t)
applyOOMScoreAdjContainerTester(pidListSequenceLag, 4, []int{1, 2, 4}, false, t)
}

func TestPidListerFailure(t *testing.T) {
_, err := getPids("/does/not/exist")
assert.True(t, os.IsNotExist(err), "expected getPids to return not exists error. Got %v", err)
}
2 changes: 1 addition & 1 deletion pkg/util/procfs/procfs.go
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ func (pfs *ProcFS) GetFullContainerName(pid int) (string, error) {
filePath := path.Join("/proc", strconv.Itoa(pid), "cgroup")
content, err := ioutil.ReadFile(filePath)
if err != nil {
if e, ok := err.(*os.SyscallError); ok && os.IsNotExist(e) {
if os.IsNotExist(err) {
return "", os.ErrNotExist
}
return "", err
Expand Down
92 changes: 92 additions & 0 deletions test/e2e_node/container_manager_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
/*
Copyright 2016 The Kubernetes Authors All rights reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package e2e_node

import (
"fmt"
"time"

"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/client/restclient"
client "k8s.io/kubernetes/pkg/client/unversioned"

. "github.com/onsi/ginkgo"
. "github.com/onsi/gomega"
)

var _ = Describe("Kubelet Container Manager", func() {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not necessary in this PR, but all the other non-node e2e tests use KubeDescribe

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Acknowledged. There is a PR in flight to refactor the existing e2e framework to be re-used here. We can switch to kubeDescribe once that PR is in.

var cl *client.Client
BeforeEach(func() {
// Setup the apiserver client
cl = client.NewOrDie(&restclient.Config{Host: *apiServerAddress})
})
Describe("oom score adjusting", func() {
namespace := "oom-adj"
Context("when scheduling a busybox command that always fails in a pod", func() {
podName := "bin-false"
It("it should return succes", func() {
pod := &api.Pod{
ObjectMeta: api.ObjectMeta{
Name: podName,
Namespace: namespace,
},
Spec: api.PodSpec{
// Force the Pod to schedule to the node without a scheduler running
NodeName: *nodeName,
// Don't restart the Pod since it is expected to exit
RestartPolicy: api.RestartPolicyNever,
Containers: []api.Container{
{
Image: "gcr.io/google_containers/busybox:1.24",
Name: podName,
Command: []string{"/bin/false"},
},
},
},
}
_, err := cl.Pods(namespace).Create(pod)
Expect(err).To(BeNil(), fmt.Sprintf("Error creating Pod %v", err))
})

It("it should have an error terminated reason", func() {
Eventually(func() error {
podData, err := cl.Pods(namespace).Get(podName)
if err != nil {
return err
}
if len(podData.Status.ContainerStatuses) != 1 {
return fmt.Errorf("expected only one container in the pod %q", podName)
}
contTerminatedState := podData.Status.ContainerStatuses[0].State.Terminated
if contTerminatedState == nil {
return fmt.Errorf("expected state to be terminated. Got pod status: %+v", podData.Status)
}
if contTerminatedState.Reason != "Error" {
return fmt.Errorf("expected terminated state reason to be error. Got %+v", contTerminatedState)
}
return nil
}, time.Minute, time.Second*4).Should(BeNil())
})

It("it should be possible to delete", func() {
err := cl.Pods(namespace).Delete(podName, &api.DeleteOptions{})
Expect(err).To(BeNil(), fmt.Sprintf("Error deleting Pod %v", err))
})
})
})

})