Introduce node memory pressure condition to scheduler #25531

Merged
1 change: 1 addition & 0 deletions docs/devel/scheduler_algorithm.md
@@ -48,6 +48,7 @@ The purpose of filtering the nodes is to filter out the nodes that do not meet c
- `MatchNodeSelector`: Check if the labels of the node match the labels specified in the Pod's `nodeSelector` field and, as of Kubernetes v1.2, also match the `scheduler.alpha.kubernetes.io/affinity` pod annotation if present. See [here](../user-guide/node-selection/) for more details on both.
- `MaxEBSVolumeCount`: Ensure that the number of attached ElasticBlockStore volumes does not exceed a maximum value (by default, 39, since Amazon recommends a maximum of 40 with one of those 40 reserved for the root volume -- see [Amazon's documentation](http://docs.aws.amazon.com/AWSEC2/latest/UserGuide/volume_limits.html#linux-specific-volume-limits)). The maximum value can be controlled by setting the `KUBE_MAX_PD_VOLS` environment variable.
- `MaxGCEPDVolumeCount`: Ensure that the number of attached GCE PersistentDisk volumes does not exceed a maximum value (by default, 16, which is the maximum GCE allows -- see [GCE's documentation](https://cloud.google.com/compute/docs/disks/persistent-disks#limits_for_predefined_machine_types)). The maximum value can be controlled by setting the `KUBE_MAX_PD_VOLS` environment variable.
- `CheckNodeMemoryPressure`: Check if a pod can be scheduled on a node that is reporting the memory pressure condition. Currently, no `BestEffort` pod should be placed on a node under memory pressure, since such a pod would be automatically evicted by the kubelet.

The details of the above predicates can be found in [plugin/pkg/scheduler/algorithm/predicates/predicates.go](http://releases.k8s.io/HEAD/plugin/pkg/scheduler/algorithm/predicates/predicates.go). All predicates mentioned above can be used in combination to perform a sophisticated filtering policy. Kubernetes uses some, but not all, of these predicates by default. You can see which ones are used by default in [plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go](http://releases.k8s.io/HEAD/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go).

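For context on the `BestEffort` classification above: a pod whose containers specify no resource requests or limits is classified as BestEffort. A minimal sketch using the `qosutil` helper this PR relies on (the pod literal is illustrative, not taken from this PR):

package main

import (
	"fmt"

	"k8s.io/kubernetes/pkg/api"
	qosutil "k8s.io/kubernetes/pkg/kubelet/qos/util"
)

func main() {
	// No requests or limits on the only container -> BestEffort QoS.
	pod := &api.Pod{
		Spec: api.PodSpec{
			Containers: []api.Container{
				{Name: "container", Image: "image"},
			},
		},
	}
	fmt.Println(qosutil.GetPodQos(pod) == qosutil.BestEffort) // prints true
}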
1 change: 1 addition & 0 deletions plugin/pkg/scheduler/algorithm/predicates/error.go
@@ -38,6 +38,7 @@ var (
	ErrNodeLabelPresenceViolated = newPredicateFailureError("CheckNodeLabelPresence")
	ErrServiceAffinityViolated   = newPredicateFailureError("CheckServiceAffinity")
	ErrMaxVolumeCountExceeded    = newPredicateFailureError("MaxVolumeCount")
	ErrNodeUnderMemoryPressure   = newPredicateFailureError("NodeUnderMemoryPressure")
	// ErrFakePredicate is used for tests only. Fake predicates that return false
	// also return ErrFakePredicate as the error.
	ErrFakePredicate = newPredicateFailureError("FakePredicateError")
29 changes: 29 additions & 0 deletions plugin/pkg/scheduler/algorithm/predicates/predicates.go
@@ -23,6 +23,7 @@ import (
"k8s.io/kubernetes/pkg/api"
"k8s.io/kubernetes/pkg/api/unversioned"
"k8s.io/kubernetes/pkg/client/cache"
qosutil "k8s.io/kubernetes/pkg/kubelet/qos/util"
"k8s.io/kubernetes/pkg/labels"
"k8s.io/kubernetes/plugin/pkg/scheduler/algorithm"
priorityutil "k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/priorities/util"
@@ -999,3 +1000,31 @@ func tolerationsToleratesTaints(tolerations []api.Toleration, taints []api.Taint

	return true
}

// isPodBestEffort determines whether a pod is scheduled with best-effort QoS.
func isPodBestEffort(pod *api.Pod) bool {
	return qosutil.GetPodQos(pod) == qosutil.BestEffort
}

// CheckNodeMemoryPressurePredicate checks if a pod can be scheduled on a node
// reporting the memory pressure condition.
func CheckNodeMemoryPressurePredicate(pod *api.Pod, nodeInfo *schedulercache.NodeInfo) (bool, error) {
	node := nodeInfo.Node()
	if node == nil {
		return false, fmt.Errorf("node not found")
	}

	// Only BestEffort pods are kept off nodes under memory pressure.
	if !isPodBestEffort(pod) {
		return true, nil
	}

	// Is the node under memory pressure?
	for _, cond := range node.Status.Conditions {
		if cond.Type == api.NodeMemoryPressure && cond.Status == api.ConditionTrue {
			return false, ErrNodeUnderMemoryPressure
		}
	}

	return true, nil
}
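To illustrate the predicate's behavior in isolation, a hedged sketch (the pod and node literals below are illustrative; in the real code path the scheduler, not user code, invokes the predicate):

bestEffortPod := &api.Pod{
	Spec: api.PodSpec{
		Containers: []api.Container{{Name: "container", Image: "image"}},
	},
}
memoryPressureNode := &api.Node{
	Status: api.NodeStatus{
		Conditions: []api.NodeCondition{
			{Type: api.NodeMemoryPressure, Status: api.ConditionTrue},
		},
	},
}
nodeInfo := schedulercache.NewNodeInfo()
nodeInfo.SetNode(memoryPressureNode)

// The BestEffort pod is rejected: fits == false and
// err == ErrNodeUnderMemoryPressure. A pod with resource
// requests would instead get fits == true, err == nil.
fits, err := CheckNodeMemoryPressurePredicate(bestEffortPod, nodeInfo)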
108 changes: 108 additions & 0 deletions plugin/pkg/scheduler/algorithm/predicates/predicates_test.go
@@ -2641,3 +2641,111 @@ func TestPodToleratesTaints(t *testing.T) {
		}
	}
}

func makeEmptyNodeInfo(node *api.Node) *schedulercache.NodeInfo {
	nodeInfo := schedulercache.NewNodeInfo()
	nodeInfo.SetNode(node)
	return nodeInfo
}

func TestPodSchedulesOnNodeWithMemoryPressureCondition(t *testing.T) {
	// specify a best-effort pod
	bestEffortPod := &api.Pod{
		Spec: api.PodSpec{
			Containers: []api.Container{
				{
					Name:            "container",
					Image:           "image",
					ImagePullPolicy: "Always",
					// no requirements -> best-effort pod
					Resources: api.ResourceRequirements{},
				},
			},
		},
	}

	// specify a non-best-effort pod
	nonBestEffortPod := &api.Pod{
		Spec: api.PodSpec{
			Containers: []api.Container{
				{
					Name:            "container",
					Image:           "image",
					ImagePullPolicy: "Always",
					// at least one requirement -> burstable pod
					Resources: api.ResourceRequirements{
						Requests: makeAllocatableResources(100, 100, 100, 100),
					},
				},
			},
		},
	}

	// specify a node with no memory pressure condition
	noMemoryPressureNode := &api.Node{
		Status: api.NodeStatus{
			Conditions: []api.NodeCondition{
				{
					Type:   "Ready",
					Status: "True",
				},
			},
		},
	}

	// specify a node with the memory pressure condition set
	memoryPressureNode := &api.Node{
		Status: api.NodeStatus{
			Conditions: []api.NodeCondition{
				{
					Type:   "MemoryPressure",
					Status: "True",
				},
			},
		},
	}

	tests := []struct {
		pod      *api.Pod
		nodeInfo *schedulercache.NodeInfo
		fits     bool
		name     string
	}{
		{
			pod:      bestEffortPod,
			nodeInfo: makeEmptyNodeInfo(noMemoryPressureNode),
			fits:     true,
			name:     "best-effort pod schedulable on node without memory pressure condition",
		},
		{
			pod:      bestEffortPod,
			nodeInfo: makeEmptyNodeInfo(memoryPressureNode),
			fits:     false,
			name:     "best-effort pod not schedulable on node with memory pressure condition",
		},
		{
			pod:      nonBestEffortPod,
			nodeInfo: makeEmptyNodeInfo(memoryPressureNode),
			fits:     true,
			name:     "non-best-effort pod schedulable on node with memory pressure condition",
		},
		{
			pod:      nonBestEffortPod,
			nodeInfo: makeEmptyNodeInfo(noMemoryPressureNode),
			fits:     true,
			name:     "non-best-effort pod schedulable on node without memory pressure condition",
		},
	}

	for _, test := range tests {
		fits, err := CheckNodeMemoryPressurePredicate(test.pod, test.nodeInfo)
		if fits != test.fits {
			t.Errorf("%s: expected %v got %v", test.name, test.fits, fits)
		}

		if err != nil && err != ErrNodeUnderMemoryPressure {
			t.Errorf("%s: unexpected error: %v", test.name, err)
			continue
		}
	}
}
3 changes: 3 additions & 0 deletions plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go
@@ -153,6 +153,9 @@ func defaultPredicates() sets.String {
				return predicates.NewTolerationMatchPredicate(args.NodeInfo)
			},
		),

		// Fit is determined by the node memory pressure condition.
		factory.RegisterFitPredicate("CheckNodeMemoryPressure", predicates.CheckNodeMemoryPressurePredicate),
	)
}

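With the registration above, CheckNodeMemoryPressure runs alongside the other default fit predicates. A simplified, hypothetical sketch of how registered predicates combine during filtering (podFitsOnNode and the predicateFuncs map are illustrative, not the scheduler's actual loop; the algorithm.FitPredicate signature matches the predicate added in this PR):

// podFitsOnNode is a hypothetical stand-in for the scheduler's real fit
// loop: a node stays feasible only while every registered predicate passes.
func podFitsOnNode(pod *api.Pod, nodeInfo *schedulercache.NodeInfo, predicateFuncs map[string]algorithm.FitPredicate) (bool, error) {
	for _, predicate := range predicateFuncs {
		fits, err := predicate(pod, nodeInfo)
		if !fits {
			// When CheckNodeMemoryPressure rejects a BestEffort pod,
			// err is predicates.ErrNodeUnderMemoryPressure.
			return false, err
		}
	}
	return true, nil
}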