Merge pull request kubernetes#20140 from resouer/scheduler
Auto commit by PR queue bot
k8s-merge-robot committed Feb 6, 2016
2 parents 1eff1d5 + 233a601 commit 8e56494
Showing 3 changed files with 231 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/devel/scheduler_algorithm.md
@@ -67,6 +67,7 @@ Currently, Kubernetes scheduler provides some practical priority functions, incl
- `BalancedResourceAllocation`: This priority function tries to put the Pod on a node such that the CPU and Memory utilization rate is balanced after the Pod is deployed.
- `CalculateSpreadPriority`: Spread Pods by minimizing the number of Pods belonging to the same service on the same node. If zone information is present on the nodes, the priority will be adjusted so that pods are spread across zones and nodes.
- `CalculateAntiAffinityPriority`: Spread Pods by minimizing the number of Pods belonging to the same service on nodes with the same value for a particular label.
- `ImageLocalityPriority`: Nodes are prioritized based on the locality of the images requested by the pod. Nodes that already have a larger total size of the pod's requested images installed are preferred over nodes that have few or none of those images installed.

The details of the above priority functions can be found in [plugin/pkg/scheduler/algorithm/priorities](http://releases.k8s.io/HEAD/plugin/pkg/scheduler/algorithm/priorities/). Kubernetes uses some, but not all, of these priority functions by default. You can see which ones are used by default in [plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go](http://releases.k8s.io/HEAD/plugin/pkg/scheduler/algorithmprovider/defaults/defaults.go). As with predicates, you can combine the above priority functions and assign weight factors (positive numbers) to them as you like (see [scheduler.md](scheduler.md) for how to customize); the sketch below shows the idea.
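Conceptually, each priority function produces a 0-10 score per node, and the scheduler combines them as a weighted sum and favors the node with the highest total. A minimal sketch of that combination step, assuming hypothetical names (`scoredPriority`, `combine`) rather than the scheduler's actual types:

```go
package main

import "fmt"

// scoredPriority pairs a priority function's weight with its per-node scores.
// Illustrative only; the real wiring lives in plugin/pkg/scheduler.
type scoredPriority struct {
	weight int            // positive weight factor assigned in the policy
	scores map[string]int // node name -> score in [0, 10]
}

// combine sums weight*score for every node across all priorities.
func combine(priorities []scoredPriority) map[string]int {
	total := map[string]int{}
	for _, p := range priorities {
		for node, score := range p.scores {
			total[node] += p.weight * score
		}
	}
	return total
}

func main() {
	final := combine([]scoredPriority{
		{weight: 1, scores: map[string]int{"node-a": 7, "node-b": 4}},
		{weight: 2, scores: map[string]int{"node-a": 1, "node-b": 5}},
	})
	fmt.Println(final) // map[node-a:9 node-b:14] -> node-b wins
}
```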

72 changes: 72 additions & 0 deletions plugin/pkg/scheduler/algorithm/priorities/priorities.go
@@ -170,6 +170,78 @@ func (n *NodeLabelPrioritizer) CalculateNodeLabelPriority(pod *api.Pod, machines
return result, nil
}

// This is a reasonable size range for container images: roughly the 90th percentile of images on Docker Hub falls within it.
const (
mb int64 = 1024 * 1024
minImgSize int64 = 23 * mb
maxImgSize int64 = 1000 * mb
)
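// Illustrative note (not in the original source): given these bounds, each of the
// ten score buckets used by calculateScoreFromSize below spans (1000-23)/10 ≈ 97.7MB.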

// ImageLocalityPriority is a priority function that favors nodes that already have the pod's requested container images.
// It detects whether the requested images are present on a node, then calculates a score ranging from 0 to 10
// based on the total size of those images.
// - If none of the images are present, the node gets the lowest priority.
// - If some of the images are present, the larger their total size, the higher the node's priority.
func ImageLocalityPriority(pod *api.Pod, machinesToPods map[string][]*api.Pod, podLister algorithm.PodLister, nodeLister algorithm.NodeLister) (schedulerapi.HostPriorityList, error) {
sumSizeMap := make(map[string]int64)

nodes, err := nodeLister.List()
if err != nil {
return nil, err
}

for _, container := range pod.Spec.Containers {
for _, node := range nodes.Items {
// Check if this container's image is present and get its size.
imageSize := checkContainerImageOnNode(node, container)
// Add this size to the total result of this node.
sumSizeMap[node.Name] += imageSize
}
}

result := []schedulerapi.HostPriority{}
// Scores are on a scale of 0-10, where 0 is the lowest priority and 10 the highest.
for nodeName, sumSize := range sumSizeMap {
result = append(result, schedulerapi.HostPriority{Host: nodeName,
Score: calculateScoreFromSize(sumSize)})
}
return result, nil
}

// checkContainerImageOnNode checks whether a container's image is present on a node and returns its size, or 0 if absent.
// The comparison is an exact string match against the node's repo tags, so "gcr.io/40" does not match "gcr.io/40:v1".
func checkContainerImageOnNode(node api.Node, container api.Container) int64 {
for _, image := range node.Status.Images {
for _, repoTag := range image.RepoTags {
if container.Image == repoTag {
// The image is present; return its size immediately.
return image.Size
}
}
}
return 0
}

// calculateScoreFromSize calculates the priority of a node. sumSize is the total size of the pod's requested images present on the node.
// 1. Split the image size range into 10 buckets.
// 2. Derive the priority of a given sumSize from the bucket it falls into.
func calculateScoreFromSize(sumSize int64) int {
var score int
switch {
case sumSize < minImgSize:
// A score of 0 means none of the images required by this pod are present on this
// node, or their total size is too small to be taken into further consideration.
score = 0
// If the existing images' total size is at or above the max, give the node the highest priority.
case sumSize >= maxImgSize:
score = 10
default:
score = int((10 * (sumSize - minImgSize) / (maxImgSize - minImgSize)) + 1)
}
// Return which bucket the given size belongs to
return score
}
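// Worked examples for the bucket math above (illustrative, not part of the
// original source): with minImgSize = 23MB and maxImgSize = 1000MB,
//   sumSize = 10MB   -> 0  (below minImgSize)
//   sumSize = 23MB   -> 1  (int(10*0/977) + 1)
//   sumSize = 120MB  -> 1  (int(10*97/977) + 1 = 0 + 1)
//   sumSize = 500MB  -> 5  (int(10*477/977) + 1 = 4 + 1)
//   sumSize = 1000MB -> 10 (clamped at maxImgSize)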

// BalancedResourceAllocation favors nodes with balanced resource usage rate.
// BalancedResourceAllocation should **NOT** be used alone, and **MUST** be used together with LeastRequestedPriority.
// It calculates the difference between the cpu and memory fraction of capacity, and prioritizes the host based on how
158 changes: 158 additions & 0 deletions plugin/pkg/scheduler/algorithm/priorities/priorities_test.go
@@ -736,3 +736,161 @@ func TestBalancedResourceAllocation(t *testing.T) {
}
}
}

func TestImageLocalityPriority(t *testing.T) {
test_40_250 := api.PodSpec{
Containers: []api.Container{
{
Image: "gcr.io/40",
},
{
Image: "gcr.io/250",
},
},
}

test_40_140 := api.PodSpec{
Containers: []api.Container{
{
Image: "gcr.io/40",
},
{
Image: "gcr.io/140",
},
},
}

test_min_max := api.PodSpec{
Containers: []api.Container{
{
Image: "gcr.io/10",
},
{
Image: "gcr.io/2000",
},
},
}

node_40_140_2000 := api.NodeStatus{
Images: []api.ContainerImage{
{
RepoTags: []string{
"gcr.io/40",
"gcr.io/40:v1",
"gcr.io/40:v1",
},
Size: int64(40 * mb),
},
{
RepoTags: []string{
"gcr.io/140",
"gcr.io/140:v1",
},
Size: int64(140 * mb),
},
{
RepoTags: []string{
"gcr.io/2000",
},
Size: int64(2000 * mb),
},
},
}

node_250_10 := api.NodeStatus{
Images: []api.ContainerImage{
{
RepoTags: []string{
"gcr.io/250",
},
Size: int64(250 * mb),
},
{
RepoTags: []string{
"gcr.io/10",
"gcr.io/10:v1",
},
Size: int64(10 * mb),
},
},
}

tests := []struct {
pod *api.Pod
pods []*api.Pod
nodes []api.Node
expectedList schedulerapi.HostPriorityList
test string
}{
{
// Pod: gcr.io/40 gcr.io/250

// Node1
// Image: gcr.io/40 40MB
// Score: int((40M-23M)/97.7M) + 1 = 1

// Node2
// Image: gcr.io/250 250MB
// Score: int((250M-23M)/97.7M) + 1 = 3
pod: &api.Pod{Spec: test_40_250},
nodes: []api.Node{makeImageNode("machine1", node_40_140_2000), makeImageNode("machine2", node_250_10)},
expectedList: []schedulerapi.HostPriority{{"machine1", 1}, {"machine2", 3}},
test: "two images spread on two nodes, prefer the larger image one",
},
{
// Pod: gcr.io/40 gcr.io/140

// Node1
// Image: gcr.io/40 40MB, gcr.io/140 140MB
// Score: int((40M+140M-23M)/97.7M) + 1 = 2

// Node2
// Image: not present
// Score: 0
pod: &api.Pod{Spec: test_40_140},
nodes: []api.Node{makeImageNode("machine1", node_40_140_2000), makeImageNode("machine2", node_250_10)},
expectedList: []schedulerapi.HostPriority{{"machine1", 2}, {"machine2", 0}},
test: "two images on one node, prefer this node",
},
{
// Pod: gcr.io/2000 gcr.io/10

// Node1
// Image: gcr.io/2000 2000MB
// Score: 2000MB >= maxImgSize, so the score is capped at 10

// Node2
// Image: gcr.io/10 10MB
// Score: 10MB < minImgSize, so the score is 0
pod: &api.Pod{Spec: test_min_max},
nodes: []api.Node{makeImageNode("machine1", node_40_140_2000), makeImageNode("machine2", node_250_10)},
expectedList: []schedulerapi.HostPriority{{"machine1", 10}, {"machine2", 0}},
test: "if exceed limit, use limit",
},
}

for _, test := range tests {
m2p, err := predicates.MapPodsToMachines(algorithm.FakePodLister(test.pods))
if err != nil {
t.Errorf("unexpected error: %v", err)
}
list, err := ImageLocalityPriority(test.pod, m2p, algorithm.FakePodLister(test.pods), algorithm.FakeNodeLister(api.NodeList{Items: test.nodes}))
if err != nil {
t.Errorf("unexpected error: %v", err)
}

sort.Sort(test.expectedList)
sort.Sort(list)

if !reflect.DeepEqual(test.expectedList, list) {
t.Errorf("%s: expected %#v, got %#v", test.test, test.expectedList, list)
}
}
}

func makeImageNode(node string, status api.NodeStatus) api.Node {
return api.Node{
ObjectMeta: api.ObjectMeta{Name: node},
Status: status,
}
}
