Merge pull request kubernetes#10661 from davidopp/somebranch
When scheduling, treat zero-limit pods as having nonzero limits when calculating priorities
zmerlynn committed Jul 6, 2015
2 parents a802ac2 + 2e3f2ea commit 8278928
Showing 5 changed files with 164 additions and 22 deletions.
66 changes: 50 additions & 16 deletions plugin/pkg/scheduler/algorithm/priorities/priorities.go
@@ -20,6 +20,7 @@ import (
"math"

"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api/resource"
"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
"github.com/GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler/algorithm"
"github.com/GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler/algorithm/predicates"
@@ -28,38 +29,69 @@ import (

// the unused capacity is calculated on a scale of 0-10
// 0 being the lowest priority and 10 being the highest
-func calculateScore(requested, capacity int64, node string) int {
+func calculateScore(requested int64, capacity int64, node string) int {
if capacity == 0 {
return 0
}
if requested > capacity {
glog.Infof("Combined requested resources from existing pods exceeds capacity on minion: %s", node)
glog.Infof("Combined requested resources %d from existing pods exceeds capacity %d on node %s",
requested, capacity, node)
return 0
}
return int(((capacity - requested) * 10) / capacity)
}
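// Illustrative example (not part of the diff): with requested = 250 mCPU on a
// node with capacity = 1000 mCPU, the score is (1000-250)*10/1000 = 7; an idle
// node scores 10 and a fully requested node scores 0.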

// Calculate the occupancy on a node. 'node' has information about the resources on the node.
// For each of these resources, a pod that doesn't request the resource explicitly
// will be treated as having requested the amount indicated below, for the purpose
// of computing priority only. This ensures that when scheduling zero-limit pods, such
// pods will not all be scheduled to the machine with the smallest in-use limit,
// and that when scheduling regular pods, such pods will not see zero-limit pods as
// consuming no resources whatsoever.
const defaultMilliCpuLimit int64 = 100 // 0.1 core
const defaultMemoryLimit int64 = 60 * 1024 * 1024 // 60 MB

// TODO: Consider setting default as a fixed fraction of machine capacity (take "capacity api.ResourceList"
// as an additional argument here) rather than using constants
func getNonzeroLimits(limits *api.ResourceList) (int64, int64) {
var out_millicpu, out_memory int64
// Override if un-set, but not if explicitly set to zero
if (*limits.Cpu() == resource.Quantity{}) {
out_millicpu = defaultMilliCpuLimit
} else {
out_millicpu = limits.Cpu().MilliValue()
}
// Override if un-set, but not if explicitly set to zero
if (*limits.Memory() == resource.Quantity{}) {
out_memory = defaultMemoryLimit
} else {
out_memory = limits.Memory().Value()
}
return out_millicpu, out_memory
}
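// Illustrative example (not part of the diff): a container whose limits are
// left unset is treated, for priority purposes, as requesting
// defaultMilliCpuLimit (100m) and defaultMemoryLimit (60 MB); a container that
// explicitly sets a limit of zero keeps its zero value.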

// Calculate the resource occupancy on a node. 'node' has information about the resources on the node.
// 'pods' is a list of pods currently scheduled on the node.
-func calculateOccupancy(pod *api.Pod, node api.Node, pods []*api.Pod) algorithm.HostPriority {
+func calculateResourceOccupancy(pod *api.Pod, node api.Node, pods []*api.Pod) algorithm.HostPriority {
totalMilliCPU := int64(0)
totalMemory := int64(0)
+capacityMilliCPU := node.Status.Capacity.Cpu().MilliValue()
+capacityMemory := node.Status.Capacity.Memory().Value()

for _, existingPod := range pods {
for _, container := range existingPod.Spec.Containers {
-totalMilliCPU += container.Resources.Limits.Cpu().MilliValue()
-totalMemory += container.Resources.Limits.Memory().Value()
+cpu, memory := getNonzeroLimits(&container.Resources.Limits)
+totalMilliCPU += cpu
+totalMemory += memory
}
}
// Add the resources requested by the current pod being scheduled.
// This also helps differentiate between differently sized, but empty, minions.
for _, container := range pod.Spec.Containers {
-totalMilliCPU += container.Resources.Limits.Cpu().MilliValue()
-totalMemory += container.Resources.Limits.Memory().Value()
+cpu, memory := getNonzeroLimits(&container.Resources.Limits)
+totalMilliCPU += cpu
+totalMemory += memory
}

-capacityMilliCPU := node.Status.Capacity.Cpu().MilliValue()
-capacityMemory := node.Status.Capacity.Memory().Value()

cpuScore := calculateScore(totalMilliCPU, capacityMilliCPU, node.Name)
memoryScore := calculateScore(totalMemory, capacityMemory, node.Name)
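// (Not part of the diff: in this version of the scheduler, the per-node
// LeastRequested score is the average (cpuScore + memoryScore) / 2, computed
// in the lines elided below.)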
glog.V(10).Infof(
@@ -89,7 +121,7 @@ func LeastRequestedPriority(pod *api.Pod, podLister algorithm.PodLister, minionL

list := algorithm.HostPriorityList{}
for _, node := range nodes.Items {
-list = append(list, calculateOccupancy(pod, node, podsToMachines[node.Name]))
+list = append(list, calculateResourceOccupancy(pod, node, podsToMachines[node.Name]))
}
return list, nil
}
@@ -163,15 +195,17 @@ func calculateBalancedResourceAllocation(pod *api.Pod, node api.Node, pods []*ap
score := int(0)
for _, existingPod := range pods {
for _, container := range existingPod.Spec.Containers {
-totalMilliCPU += container.Resources.Limits.Cpu().MilliValue()
-totalMemory += container.Resources.Limits.Memory().Value()
+cpu, memory := getNonzeroLimits(&container.Resources.Limits)
+totalMilliCPU += cpu
+totalMemory += memory
}
}
// Add the resources requested by the current pod being scheduled.
// This also helps differentiate between differently sized, but empty, minions.
for _, container := range pod.Spec.Containers {
-totalMilliCPU += container.Resources.Limits.Cpu().MilliValue()
-totalMemory += container.Resources.Limits.Memory().Value()
+cpu, memory := getNonzeroLimits(&container.Resources.Limits)
+totalMilliCPU += cpu
+totalMemory += memory
}

capacityMilliCPU := node.Status.Capacity.Cpu().MilliValue()
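For context, a minimal sketch of the balanced-allocation scoring that the elided remainder of calculateBalancedResourceAllocation computes, assuming this era's formula (10 minus ten times the absolute difference between the CPU and memory utilization fractions); the helper name is illustrative:

	// balancedScore is a sketch; the real code also logs at V(10) and builds a HostPriority.
	func balancedScore(totalMilliCPU, totalMemory, capacityMilliCPU, capacityMemory int64) int {
		if capacityMilliCPU == 0 || capacityMemory == 0 {
			return 0
		}
		cpuFraction := float64(totalMilliCPU) / float64(capacityMilliCPU)
		memoryFraction := float64(totalMemory) / float64(capacityMemory)
		diff := cpuFraction - memoryFraction
		if diff < 0 {
			diff = -diff
		}
		// Equal CPU and memory utilization scores 10; maximal skew scores 0.
		return int(10 - diff*10)
	}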
111 changes: 107 additions & 4 deletions plugin/pkg/scheduler/algorithm/priorities/priorities_test.go
@@ -19,10 +19,12 @@ package priorities
import (
"reflect"
"sort"
"strconv"
"testing"

"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
"github.com/GoogleCloudPlatform/kubernetes/pkg/api/resource"
"github.com/GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler"
"github.com/GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler/algorithm"
)

@@ -38,6 +40,103 @@ func makeMinion(node string, milliCPU, memory int64) api.Node {
}
}

func TestZeroLimit(t *testing.T) {
// A pod with no resources. We expect spreading to count it as having the default resources.
noResources := api.PodSpec{
Containers: []api.Container{
{},
},
}
noResources1 := noResources
noResources1.NodeName = "machine1"
// A pod with the same resources that a zero-limit pod is treated as having by default (for spreading).
small := api.PodSpec{
Containers: []api.Container{
{
Resources: api.ResourceRequirements{
Limits: api.ResourceList{
"cpu": resource.MustParse(
strconv.FormatInt(defaultMilliCpuLimit, 10) + "m"),
"memory": resource.MustParse(
strconv.FormatInt(defaultMemoryLimit, 10)),
},
},
},
},
}
small2 := small
small2.NodeName = "machine2"
// A larger pod.
large := api.PodSpec{
Containers: []api.Container{
{
Resources: api.ResourceRequirements{
Limits: api.ResourceList{
"cpu": resource.MustParse(
strconv.FormatInt(defaultMilliCpuLimit*3, 10) + "m"),
"memory": resource.MustParse(
strconv.FormatInt(defaultMemoryLimit*3, 10)),
},
},
},
},
}
large1 := large
large1.NodeName = "machine1"
large2 := large
large2.NodeName = "machine2"
tests := []struct {
pod *api.Pod
pods []*api.Pod
nodes []api.Node
test string
}{
// The point of these tests is to show that you get the same priority for a zero-limit pod
// as for a pod with the default limits, both when the zero-limit pod is already on the machine
// and when the zero-limit pod is the one being scheduled.
{
pod: &api.Pod{Spec: noResources},
// match current f1-micro on GCE
nodes: []api.Node{makeMinion("machine1", 1000, defaultMemoryLimit*10), makeMinion("machine2", 1000, defaultMemoryLimit*10)},
test: "test priority of zero-limit pod with machine with zero-limit pod",
pods: []*api.Pod{
{Spec: large1}, {Spec: noResources1},
{Spec: large2}, {Spec: small2},
},
},
{
pod: &api.Pod{Spec: small},
// match current f1-micro on GCE
nodes: []api.Node{makeMinion("machine1", 1000, defaultMemoryLimit*10), makeMinion("machine2", 1000, defaultMemoryLimit*10)},
test: "test priority of nonzero-limit pod with machine with zero-limit pod",
pods: []*api.Pod{
{Spec: large1}, {Spec: noResources1},
{Spec: large2}, {Spec: small2},
},
},
}

const expectedPriority int = 25
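// Where 25 comes from, assuming this era's scoring: with defaults applied, each
// machine carries 500m of its 1000m CPU and 300MB of its 600MB memory, so
// LeastRequestedPriority scores (5+5)/2 = 5, BalancedResourceAllocation scores 10
// (equal CPU and memory fractions), and ServiceSpreadPriority scores 10 (no
// services), giving a weighted sum of 5 + 10 + 10 = 25.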
for _, test := range tests {
list, err := scheduler.PrioritizeNodes(
test.pod,
algorithm.FakePodLister(test.pods),
// This should match the configuration in defaultPriorities() in
// plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go if you want
// to test what's actually in production.
[]algorithm.PriorityConfig{{Function: LeastRequestedPriority, Weight: 1}, {Function: BalancedResourceAllocation, Weight: 1}, {Function: NewServiceSpreadPriority(algorithm.FakeServiceLister([]api.Service{})), Weight: 1}},
algorithm.FakeMinionLister(api.NodeList{Items: test.nodes}))
if err != nil {
t.Errorf("unexpected error: %v", err)
}
for _, hp := range list {
if hp.Score != expectedPriority {
t.Errorf("%s: expected 25 for all priorities, got list %#v", list)
}
}
}
}

func TestLeastRequested(t *testing.T) {
labels1 := map[string]string{
"foo": "bar",
@@ -62,14 +161,16 @@ func TestLeastRequested(t *testing.T) {
{
Resources: api.ResourceRequirements{
Limits: api.ResourceList{
"cpu": resource.MustParse("1000m"),
"cpu": resource.MustParse("1000m"),
"memory": resource.MustParse("0"),
},
},
},
{
Resources: api.ResourceRequirements{
Limits: api.ResourceList{
"cpu": resource.MustParse("2000m"),
"cpu": resource.MustParse("2000m"),
"memory": resource.MustParse("0"),
},
},
},
@@ -392,14 +493,16 @@ func TestBalancedResourceAllocation(t *testing.T) {
{
Resources: api.ResourceRequirements{
Limits: api.ResourceList{
"cpu": resource.MustParse("1000m"),
"cpu": resource.MustParse("1000m"),
"memory": resource.MustParse("0"),
},
},
},
{
Resources: api.ResourceRequirements{
Limits: api.ResourceList{
"cpu": resource.MustParse("2000m"),
"cpu": resource.MustParse("2000m"),
"memory": resource.MustParse("0"),
},
},
},
plugin/pkg/scheduler/algorithm/priorities/service_spread.go
@@ -20,6 +20,7 @@ import (
"github.com/GoogleCloudPlatform/kubernetes/pkg/api"
"github.com/GoogleCloudPlatform/kubernetes/pkg/labels"
"github.com/GoogleCloudPlatform/kubernetes/plugin/pkg/scheduler/algorithm"
"github.com/golang/glog"
)

type ServiceSpread struct {
@@ -82,6 +83,9 @@ func (s *ServiceSpread) CalculateSpreadPriority(pod *api.Pod, podLister algorith
fScore = 10 * (float32(maxCount-counts[minion.Name]) / float32(maxCount))
}
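// Example (not part of the diff): with maxCount = 4 matching pods cluster-wide
// and 1 of them on this minion, fScore = 10 * (4-1)/4 = 7; fewer matching pods
// on a minion means a higher score.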
result = append(result, algorithm.HostPriority{Host: minion.Name, Score: int(fScore)})
glog.V(10).Infof(
"%v -> %v: ServiceSpreadPriority, Score: (%d)", pod.Name, minion.Name, int(fScore),
)
}
return result, nil
}
5 changes: 3 additions & 2 deletions plugin/pkg/scheduler/generic_scheduler.go
@@ -74,7 +74,7 @@ func (g *genericScheduler) Schedule(pod *api.Pod, minionLister algorithm.MinionL
return "", err
}

-priorityList, err := prioritizeNodes(pod, g.pods, g.prioritizers, algorithm.FakeMinionLister(filteredNodes))
+priorityList, err := PrioritizeNodes(pod, g.pods, g.prioritizers, algorithm.FakeMinionLister(filteredNodes))
if err != nil {
return "", err
}
@@ -142,7 +142,7 @@ func findNodesThatFit(pod *api.Pod, podLister algorithm.PodLister, predicateFunc
// Each priority function can also have its own weight
// The minion scores returned by the priority function are multiplied by the weights to get weighted scores
// All scores are finally combined (added) to get the total weighted scores of all minions
-func prioritizeNodes(pod *api.Pod, podLister algorithm.PodLister, priorityConfigs []algorithm.PriorityConfig, minionLister algorithm.MinionLister) (algorithm.HostPriorityList, error) {
+func PrioritizeNodes(pod *api.Pod, podLister algorithm.PodLister, priorityConfigs []algorithm.PriorityConfig, minionLister algorithm.MinionLister) (algorithm.HostPriorityList, error) {
result := algorithm.HostPriorityList{}

// If no priority configs are provided, then the EqualPriority function is applied
@@ -168,6 +168,7 @@ func prioritizeNodes(pod *api.Pod, podLister algorithm.PodLister, priorityConfig
}
}
for host, score := range combinedScores {
glog.V(10).Infof("Host %s Score %d", host, score)
result = append(result, algorithm.HostPriority{Host: host, Score: score})
}
return result, nil
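For context, a minimal sketch of the weighted combination PrioritizeNodes performs, assuming this era's shape of the loop (the EqualPriority fallback shown in the elided hunk is omitted):

	// Each priority function is run once; its per-host scores are scaled by the
	// function's weight and accumulated into a combined per-host total.
	combinedScores := map[string]int{}
	for _, priorityConfig := range priorityConfigs {
		priorityList, err := priorityConfig.Function(pod, podLister, minionLister)
		if err != nil {
			return algorithm.HostPriorityList{}, err
		}
		for _, hostEntry := range priorityList {
			combinedScores[hostEntry.Host] += hostEntry.Score * priorityConfig.Weight
		}
	}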
