From b95b30bbd74feff122c206946d44a635b37965e0 Mon Sep 17 00:00:00 2001 From: Jan Chaloupka Date: Thu, 12 May 2016 14:01:33 +0200 Subject: [PATCH] Scheduler: introduce CheckNodeMemoryPressurePredicate, don't schedule pods on nodes that report memory pressure. Introduce unit-test for CheckNodeMemoryPressurePredicate Following work done in #14943 --- docs/devel/scheduler_algorithm.md | 1 + .../scheduler/algorithm/predicates/error.go | 1 + .../algorithm/predicates/predicates.go | 29 +++++ .../algorithm/predicates/predicates_test.go | 108 ++++++++++++++++++ .../algorithmprovider/defaults/defaults.go | 3 + 5 files changed, 142 insertions(+) diff --git a/docs/devel/scheduler_algorithm.md b/docs/devel/scheduler_algorithm.md index 63206c8b05639..7e79e24b2d7ad 100755 --- a/docs/devel/scheduler_algorithm.md +++ b/docs/devel/scheduler_algorithm.md @@ -48,6 +48,7 @@ The purpose of filtering the nodes is to filter out the nodes that do not meet c - `MatchNodeSelector`: Check if the labels of the node match the labels specified in the Pod's `nodeSelector` field and, as of Kubernetes v1.2, also match the `scheduler.alpha.kubernetes.io/affinity` pod annotation if present. See [here](../user-guide/node-selection/) for more details on both. - `MaxEBSVolumeCount`: Ensure that the number of attached ElasticBlockStore volumes does not exceed a maximum value (by default, 39, since Amazon recommends a maximum of 40 with one of those 40 reserved for the root volume -- see [Amazon's documentation](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/volume_limits.html#linux-specific-volume-limits)). The maximum value can be controlled by setting the `KUBE_MAX_PD_VOLS` environment variable. - `MaxGCEPDVolumeCount`: Ensure that the number of attached GCE PersistentDisk volumes does not exceed a maximum value (by default, 16, which is the maximum GCE allows -- see [GCE's documentation](https://cloud.google.com/compute/docs/disks/persistent-disks#limits_for_predefined_machine_types)). 
The maximum value can be controlled by setting the `KUBE_MAX_PD_VOLS` environment variable. +- `CheckNodeMemoryPressure`: Check if a pod can be scheduled on a node reporting a memory pressure condition. Currently, no `BestEffort` pod should be placed on a node under memory pressure, as it gets automatically evicted by the kubelet. The details of the above predicates can be found in [plugin/pkg/scheduler/algorithm/predicates/predicates.go](http://releases.k8s.io/HEAD/plugin/pkg/scheduler/algorithm/predicates/predicates.go). All predicates mentioned above can be used in combination to perform a sophisticated filtering policy. Kubernetes uses some, but not all, of these predicates by default. You can see which ones are used by default in [plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go](http://releases.k8s.io/HEAD/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go). diff --git a/plugin/pkg/scheduler/algorithm/predicates/error.go b/plugin/pkg/scheduler/algorithm/predicates/error.go index b32305abe6533..fd4777ccde867 100644 --- a/plugin/pkg/scheduler/algorithm/predicates/error.go +++ b/plugin/pkg/scheduler/algorithm/predicates/error.go @@ -38,6 +38,7 @@ var ( ErrNodeLabelPresenceViolated = newPredicateFailureError("CheckNodeLabelPresence") ErrServiceAffinityViolated = newPredicateFailureError("CheckServiceAffinity") ErrMaxVolumeCountExceeded = newPredicateFailureError("MaxVolumeCount") + ErrNodeUnderMemoryPressure = newPredicateFailureError("NodeUnderMemoryPressure") // ErrFakePredicate is used for test only. The fake predicates returning false also returns error // as ErrFakePredicate. 
ErrFakePredicate = newPredicateFailureError("FakePredicateError") diff --git a/plugin/pkg/scheduler/algorithm/predicates/predicates.go b/plugin/pkg/scheduler/algorithm/predicates/predicates.go index b880c25a68aed..a5c63f4a7c539 100644 --- a/plugin/pkg/scheduler/algorithm/predicates/predicates.go +++ b/plugin/pkg/scheduler/algorithm/predicates/predicates.go @@ -23,6 +23,7 @@ import ( "k8s.io/kubernetes/pkg/api" "k8s.io/kubernetes/pkg/api/unversioned" "k8s.io/kubernetes/pkg/client/cache" + qosutil "k8s.io/kubernetes/pkg/kubelet/qos/util" "k8s.io/kubernetes/pkg/labels" "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm" priorityutil "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/priorities/util" @@ -999,3 +1000,31 @@ func tolerationsToleratesTaints(tolerations []api.Toleration, taints []api.Taint return true } + +// Determine if a pod is scheduled with best-effort QoS +func isPodBestEffort(pod *api.Pod) bool { + return qosutil.GetPodQos(pod) == qosutil.BestEffort +} + +// CheckNodeMemoryPressurePredicate checks if a pod can be scheduled on a node +// reporting memory pressure condition. +func CheckNodeMemoryPressurePredicate(pod *api.Pod, nodeInfo *schedulercache.NodeInfo) (bool, error) { + node := nodeInfo.Node() + if node == nil { + return false, fmt.Errorf("node not found") + } + + // pod is not BestEffort pod + if !isPodBestEffort(pod) { + return true, nil + } + + // is node under pressure? 
+ for _, cond := range node.Status.Conditions { + if cond.Type == api.NodeMemoryPressure && cond.Status == api.ConditionTrue { + return false, ErrNodeUnderMemoryPressure + } + } + + return true, nil +} diff --git a/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go b/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go index 8fe0cabc59312..1221b663a85a4 100644 --- a/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go +++ b/plugin/pkg/scheduler/algorithm/predicates/predicates_test.go @@ -2641,3 +2641,111 @@ func TestPodToleratesTaints(t *testing.T) { } } } + +func makeEmptyNodeInfo(node *api.Node) *schedulercache.NodeInfo { + nodeInfo := schedulercache.NewNodeInfo() + nodeInfo.SetNode(node) + return nodeInfo +} + +func TestPodSchedulesOnNodeWithMemoryPressureCondition(t *testing.T) { + // specify best-effort pod + bestEffortPod := &api.Pod{ + Spec: api.PodSpec{ + Containers: []api.Container{ + { + Name: "container", + Image: "image", + ImagePullPolicy: "Always", + // no requirements -> best effort pod + Resources: api.ResourceRequirements{}, + }, + }, + }, + } + + // specify non-best-effort pod + nonBestEffortPod := &api.Pod{ + Spec: api.PodSpec{ + Containers: []api.Container{ + { + Name: "container", + Image: "image", + ImagePullPolicy: "Always", + // at least one requirement -> burstable pod + Resources: api.ResourceRequirements{ + Requests: makeAllocatableResources(100, 100, 100, 100), + }, + }, + }, + }, + } + + // specify a node with no memory pressure condition on + noMemoryPressureNode := &api.Node{ + Status: api.NodeStatus{ + Conditions: []api.NodeCondition{ + { + Type: "Ready", + Status: "True", + }, + }, + }, + } + + // specify a node with memory pressure condition on + memoryPressureNode := &api.Node{ + Status: api.NodeStatus{ + Conditions: []api.NodeCondition{ + { + Type: "MemoryPressure", + Status: "True", + }, + }, + }, + } + + tests := []struct { + pod *api.Pod + nodeInfo *schedulercache.NodeInfo + fits bool + name string + 
}{ + { + pod: bestEffortPod, + nodeInfo: makeEmptyNodeInfo(noMemoryPressureNode), + fits: true, + name: "best-effort pod schedulable on node without memory pressure condition on", + }, + { + pod: bestEffortPod, + nodeInfo: makeEmptyNodeInfo(memoryPressureNode), + fits: false, + name: "best-effort pod not schedulable on node with memory pressure condition on", + }, + { + pod: nonBestEffortPod, + nodeInfo: makeEmptyNodeInfo(memoryPressureNode), + fits: true, + name: "non best-effort pod schedulable on node with memory pressure condition on", + }, + { + pod: nonBestEffortPod, + nodeInfo: makeEmptyNodeInfo(noMemoryPressureNode), + fits: true, + name: "non best-effort pod schedulable on node without memory pressure condition on", + }, + } + + for _, test := range tests { + fits, err := CheckNodeMemoryPressurePredicate(test.pod, test.nodeInfo) + if fits != test.fits { + t.Errorf("%s: expected %v got %v", test.name, test.fits, fits) + } + + if err != nil && err != ErrNodeUnderMemoryPressure { + t.Errorf("%s: unexpected error: %v", test.name, err) + continue + } + } +} diff --git a/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go b/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go index 16c576609068a..21e8e22990d61 100644 --- a/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go +++ b/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go @@ -153,6 +153,9 @@ func defaultPredicates() sets.String { return predicates.NewTolerationMatchPredicate(args.NodeInfo) }, ), + + // Fit is determined by node memory pressure condition. + factory.RegisterFitPredicate("CheckNodeMemoryPressure", predicates.CheckNodeMemoryPressurePredicate), ) }