Skip to content

Commit

Permalink
Scheduler: introduce CheckNodeMemoryPressurePredicate, don't schedule…
Browse files Browse the repository at this point in the history
… pods for nodes that report memory pressure.

Introduce unit-test for CheckNodeMemoryPressurePredicate

Following work done in #14943
  • Loading branch information
ingvagabund committed May 21, 2016
1 parent f538d60 commit b95b30b
Show file tree
Hide file tree
Showing 5 changed files with 142 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/devel/scheduler_algorithm.md
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@ The purpose of filtering the nodes is to filter out the nodes that do not meet c
- `MatchNodeSelector`: Check if the labels of the node match the labels specified in the Pod's `nodeSelector` field and, as of Kubernetes v1.2, also match the `scheduler.alpha.kubernetes.io/affinity` pod annotation if present. See [here](../user-guide/node-selection/) for more details on both.
- `MaxEBSVolumeCount`: Ensure that the number of attached ElasticBlockStore volumes does not exceed a maximum value (by default, 39, since Amazon recommends a maximum of 40 with one of those 40 reserved for the root volume -- see [Amazon's documentation](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/volume_limits.html#linux-specific-volume-limits)). The maximum value can be controlled by setting the `KUBE_MAX_PD_VOLS` environment variable.
- `MaxGCEPDVolumeCount`: Ensure that the number of attached GCE PersistentDisk volumes does not exceed a maximum value (by default, 16, which is the maximum GCE allows -- see [GCE's documentation](https://cloud.google.com/compute/docs/disks/persistent-disks#limits_for_predefined_machine_types)). The maximum value can be controlled by setting the `KUBE_MAX_PD_VOLS` environment variable.
- `CheckNodeMemoryPressure`: Check if a pod can be scheduled on a node reporting memory pressure condition. Currently, no ``BestEffort`` pod should be placed on a node under memory pressure as it gets automatically evicted by kubelet.

The details of the above predicates can be found in [plugin/pkg/scheduler/algorithm/predicates/predicates.go](http://releases.k8s.io/HEAD/plugin/pkg/scheduler/algorithm/predicates/predicates.go). All predicates mentioned above can be used in combination to perform a sophisticated filtering policy. Kubernetes uses some, but not all, of these predicates by default. You can see which ones are used by default in [plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go](http://releases.k8s.io/HEAD/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go).

Expand Down
1 change: 1 addition & 0 deletions plugin/pkg/scheduler/algorithm/predicates/error.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ var (
ErrNodeLabelPresenceViolated = newPredicateFailureError("CheckNodeLabelPresence")
ErrServiceAffinityViolated = newPredicateFailureError("CheckServiceAffinity")
ErrMaxVolumeCountExceeded = newPredicateFailureError("MaxVolumeCount")
ErrNodeUnderMemoryPressure = newPredicateFailureError("NodeUnderMemoryPressure")
// ErrFakePredicate is used for test only. The fake predicates returning false also returns error
// as ErrFakePredicate.
ErrFakePredicate = newPredicateFailureError("FakePredicateError")
Expand Down
29 changes: 29 additions & 0 deletions plugin/pkg/scheduler/algorithm/predicates/predicates.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/unversioned"
"k8s.io/kubernetes/pkg/client/cache"
qosutil "k8s.io/kubernetes/pkg/kubelet/qos/util"
"k8s.io/kubernetes/pkg/labels"
"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
priorityutil "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/priorities/util"
Expand Down Expand Up @@ -999,3 +1000,31 @@ func tolerationsToleratesTaints(tolerations []api.Toleration, taints []api.Taint

return true
}

// isPodBestEffort reports whether the pod is classified as BestEffort QoS.
func isPodBestEffort(pod *api.Pod) bool {
	qos := qosutil.GetPodQos(pod)
	return qos == qosutil.BestEffort
}

// CheckNodeMemoryPressurePredicate checks if a pod can be scheduled on a node
// reporting memory pressure condition. Only BestEffort pods are rejected on
// such nodes, since the kubelet evicts them first under memory pressure.
func CheckNodeMemoryPressurePredicate(pod *api.Pod, nodeInfo *schedulercache.NodeInfo) (bool, error) {
	node := nodeInfo.Node()
	if node == nil {
		return false, fmt.Errorf("node not found")
	}

	// Pods with any resource requirements are unaffected by this predicate.
	if !isPodBestEffort(pod) {
		return true, nil
	}

	// Reject the BestEffort pod if the node reports an active
	// MemoryPressure condition.
	for _, condition := range node.Status.Conditions {
		if condition.Type == api.NodeMemoryPressure && condition.Status == api.ConditionTrue {
			return false, ErrNodeUnderMemoryPressure
		}
	}

	return true, nil
}
108 changes: 108 additions & 0 deletions plugin/pkg/scheduler/algorithm/predicates/predicates_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2641,3 +2641,111 @@ func TestPodToleratesTaints(t *testing.T) {
}
}
}

// makeEmptyNodeInfo wraps the given node in a fresh NodeInfo carrying no pods.
func makeEmptyNodeInfo(node *api.Node) *schedulercache.NodeInfo {
	info := schedulercache.NewNodeInfo()
	info.SetNode(node)
	return info
}

// TestPodSchedulesOnNodeWithMemoryPressureCondition verifies that
// CheckNodeMemoryPressurePredicate rejects only BestEffort pods, and only on
// nodes reporting an active MemoryPressure condition.
func TestPodSchedulesOnNodeWithMemoryPressureCondition(t *testing.T) {
	// specify best-effort pod
	bestEffortPod := &api.Pod{
		Spec: api.PodSpec{
			Containers: []api.Container{
				{
					Name:            "container",
					Image:           "image",
					ImagePullPolicy: "Always",
					// no requirements -> best effort pod
					Resources: api.ResourceRequirements{},
				},
			},
		},
	}

	// specify non-best-effort pod
	nonBestEffortPod := &api.Pod{
		Spec: api.PodSpec{
			Containers: []api.Container{
				{
					Name:            "container",
					Image:           "image",
					ImagePullPolicy: "Always",
					// at least one requirement -> burstable pod
					Resources: api.ResourceRequirements{
						Requests: makeAllocatableResources(100, 100, 100, 100),
					},
				},
			},
		},
	}

	// specify a node with no memory pressure condition on
	noMemoryPressureNode := &api.Node{
		Status: api.NodeStatus{
			Conditions: []api.NodeCondition{
				{
					Type:   "Ready",
					Status: "True",
				},
			},
		},
	}

	// specify a node with memory pressure condition on
	memoryPressureNode := &api.Node{
		Status: api.NodeStatus{
			Conditions: []api.NodeCondition{
				{
					Type:   "MemoryPressure",
					Status: "True",
				},
			},
		},
	}

	tests := []struct {
		pod      *api.Pod
		nodeInfo *schedulercache.NodeInfo
		fits     bool
		// wantErr is the exact error the predicate must return; nil when
		// the pod is expected to fit.
		wantErr error
		name    string
	}{
		{
			pod:      bestEffortPod,
			nodeInfo: makeEmptyNodeInfo(noMemoryPressureNode),
			fits:     true,
			wantErr:  nil,
			name:     "best-effort pod schedulable on node without memory pressure condition on",
		},
		{
			pod:      bestEffortPod,
			nodeInfo: makeEmptyNodeInfo(memoryPressureNode),
			fits:     false,
			wantErr:  ErrNodeUnderMemoryPressure,
			name:     "best-effort pod not schedulable on node with memory pressure condition on",
		},
		{
			pod:      nonBestEffortPod,
			nodeInfo: makeEmptyNodeInfo(memoryPressureNode),
			fits:     true,
			wantErr:  nil,
			name:     "non best-effort pod schedulable on node with memory pressure condition on",
		},
		{
			pod:      nonBestEffortPod,
			nodeInfo: makeEmptyNodeInfo(noMemoryPressureNode),
			fits:     true,
			wantErr:  nil,
			name:     "non best-effort pod schedulable on node without memory pressure condition on",
		},
	}

	for _, test := range tests {
		fits, err := CheckNodeMemoryPressurePredicate(test.pod, test.nodeInfo)
		if fits != test.fits {
			t.Errorf("%s: expected %v got %v", test.name, test.fits, fits)
		}

		// Assert the exact error: previously any nil error was accepted even
		// for non-fitting cases, which could mask a predicate returning
		// (false, nil).
		if err != test.wantErr {
			t.Errorf("%s: expected error %v got %v", test.name, test.wantErr, err)
		}
	}
}
3 changes: 3 additions & 0 deletions plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -153,6 +153,9 @@ func defaultPredicates() sets.String {
return predicates.NewTolerationMatchPredicate(args.NodeInfo)
},
),

// Fit is determined by node memory pressure condition.
factory.RegisterFitPredicate("CheckNodeMemoryPressure", predicates.CheckNodeMemoryPressurePredicate),
)
}

Expand Down

0 comments on commit b95b30b

Please sign in to comment.