From 960856f4e8b510a24ceec51b165ded2ae2deda52 Mon Sep 17 00:00:00 2001 From: David Ashpole Date: Fri, 16 Feb 2018 14:53:44 -0800 Subject: [PATCH] collect metrics on the /kubepods cgroup on-demand --- pkg/kubelet/apis/stats/v1alpha1/types.go | 2 + pkg/kubelet/cm/container_manager.go | 3 + pkg/kubelet/cm/container_manager_linux.go | 5 + pkg/kubelet/cm/container_manager_stub.go | 4 + pkg/kubelet/eviction/eviction_manager.go | 18 +-- pkg/kubelet/eviction/eviction_manager_test.go | 129 ++++++++---------- pkg/kubelet/eviction/helpers.go | 85 ++++++------ pkg/kubelet/eviction/helpers_test.go | 20 ++- pkg/kubelet/eviction/types.go | 10 +- pkg/kubelet/kubelet.go | 2 +- pkg/kubelet/kubelet_getters.go | 5 + pkg/kubelet/server/server_test.go | 1 + pkg/kubelet/server/stats/handler.go | 3 + pkg/kubelet/server/stats/summary.go | 20 +-- pkg/kubelet/server/stats/summary_test.go | 16 ++- .../stats/testing/mock_stats_provider.go | 14 ++ test/e2e_node/summary_test.go | 12 ++ 17 files changed, 196 insertions(+), 153 deletions(-) diff --git a/pkg/kubelet/apis/stats/v1alpha1/types.go b/pkg/kubelet/apis/stats/v1alpha1/types.go index da0e2b6e63399..2f7f56e4bba57 100644 --- a/pkg/kubelet/apis/stats/v1alpha1/types.go +++ b/pkg/kubelet/apis/stats/v1alpha1/types.go @@ -87,6 +87,8 @@ const ( SystemContainerRuntime = "runtime" // SystemContainerMisc is the container name for the system container tracking non-kubernetes processes. SystemContainerMisc = "misc" + // SystemContainerPods is the container name for the system container tracking user pods. + SystemContainerPods = "pods" ) // PodStats holds pod-level unprocessed sample stats. diff --git a/pkg/kubelet/cm/container_manager.go b/pkg/kubelet/cm/container_manager.go index 3754f8028e9fb..7e2ae8e4b5582 100644 --- a/pkg/kubelet/cm/container_manager.go +++ b/pkg/kubelet/cm/container_manager.go @@ -91,6 +91,9 @@ type ContainerManager interface { UpdatePluginResources(*schedulercache.NodeInfo, *lifecycle.PodAdmitAttributes) error InternalContainerLifecycle() InternalContainerLifecycle + + // GetPodCgroupRoot returns the cgroup which contains all pods. + GetPodCgroupRoot() string } type NodeConfig struct { diff --git a/pkg/kubelet/cm/container_manager_linux.go b/pkg/kubelet/cm/container_manager_linux.go index f083425b6eb8a..340d58c65e976 100644 --- a/pkg/kubelet/cm/container_manager_linux.go +++ b/pkg/kubelet/cm/container_manager_linux.go @@ -500,6 +500,11 @@ func (cm *containerManagerImpl) GetNodeConfig() NodeConfig { return cm.NodeConfig } +// GetPodCgroupRoot returns the literal cgroupfs value for the cgroup containing all pods. 
+func (cm *containerManagerImpl) GetPodCgroupRoot() string { + return cm.cgroupManager.Name(CgroupName(cm.cgroupRoot)) +} + func (cm *containerManagerImpl) GetMountedSubsystems() *CgroupSubsystems { return cm.subsystems } diff --git a/pkg/kubelet/cm/container_manager_stub.go b/pkg/kubelet/cm/container_manager_stub.go index a00dc4e9fbf19..b03ca869fc2c2 100644 --- a/pkg/kubelet/cm/container_manager_stub.go +++ b/pkg/kubelet/cm/container_manager_stub.go @@ -90,6 +90,10 @@ func (cm *containerManagerStub) InternalContainerLifecycle() InternalContainerLi return &internalContainerLifecycleImpl{cpumanager.NewFakeManager()} } +func (cm *containerManagerStub) GetPodCgroupRoot() string { + return "" +} + func NewStubContainerManager() ContainerManager { return &containerManagerStub{} } diff --git a/pkg/kubelet/eviction/eviction_manager.go b/pkg/kubelet/eviction/eviction_manager.go index 61fd9e0d03edd..45160d67bfb5f 100644 --- a/pkg/kubelet/eviction/eviction_manager.go +++ b/pkg/kubelet/eviction/eviction_manager.go @@ -149,11 +149,11 @@ func (m *managerImpl) Admit(attrs *lifecycle.PodAdmitAttributes) lifecycle.PodAd } // Start starts the control loop to observe and respond to low compute resources. -func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, capacityProvider CapacityProvider, monitoringInterval time.Duration) { +func (m *managerImpl) Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, monitoringInterval time.Duration) { // start the eviction manager monitoring go func() { for { - if evictedPods := m.synchronize(diskInfoProvider, podFunc, capacityProvider); evictedPods != nil { + if evictedPods := m.synchronize(diskInfoProvider, podFunc); evictedPods != nil { glog.Infof("eviction manager: pods %s evicted, waiting for pod to be cleaned up", format.Pods(evictedPods)) m.waitForPodsCleanup(podCleanedUpFunc, evictedPods) } else { @@ -219,7 +219,7 @@ func startMemoryThresholdNotifier(thresholds []evictionapi.Threshold, observatio // synchronize is the main control loop that enforces eviction thresholds. // Returns the pods that were killed, or nil if no pods were killed. -func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, capacityProvider CapacityProvider) []*v1.Pod { +func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc) []*v1.Pod { // if we have nothing to do, just return thresholds := m.config.Thresholds if len(thresholds) == 0 { @@ -248,7 +248,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act } // make observations and get a function to derive pod usage stats relative to those observations.
- observations, statsFunc := makeSignalObservations(summary, capacityProvider, activePods) + observations, statsFunc := makeSignalObservations(summary) debugLogObservations("observations", observations) // attempt to create a threshold notifier to improve eviction response time @@ -259,7 +259,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act err = startMemoryThresholdNotifier(m.config.Thresholds, observations, false, func(desc string) { glog.Infof("soft memory eviction threshold crossed at %s", desc) // TODO wait grace period for soft memory limit - m.synchronize(diskInfoProvider, podFunc, capacityProvider) + m.synchronize(diskInfoProvider, podFunc) }) if err != nil { glog.Warningf("eviction manager: failed to create soft memory threshold notifier: %v", err) @@ -267,7 +267,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act // start hard memory notification err = startMemoryThresholdNotifier(m.config.Thresholds, observations, true, func(desc string) { glog.Infof("hard memory eviction threshold crossed at %s", desc) - m.synchronize(diskInfoProvider, podFunc, capacityProvider) + m.synchronize(diskInfoProvider, podFunc) }) if err != nil { glog.Warningf("eviction manager: failed to create hard memory threshold notifier: %v", err) @@ -349,7 +349,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act m.recorder.Eventf(m.nodeRef, v1.EventTypeWarning, "EvictionThresholdMet", "Attempting to reclaim %s", resourceToReclaim) // check if there are node-level resources we can reclaim to reduce pressure before evicting end-user pods. - if m.reclaimNodeLevelResources(resourceToReclaim, capacityProvider, activePods) { + if m.reclaimNodeLevelResources(resourceToReclaim) { glog.Infof("eviction manager: able to reduce %v pressure without evicting pods.", resourceToReclaim) return nil } @@ -437,7 +437,7 @@ func (m *managerImpl) waitForPodsCleanup(podCleanedUpFunc PodCleanedUpFunc, pods } // reclaimNodeLevelResources attempts to reclaim node level resources. returns true if thresholds were satisfied and no pod eviction is required. -func (m *managerImpl) reclaimNodeLevelResources(resourceToReclaim v1.ResourceName, capacityProvider CapacityProvider, pods []*v1.Pod) bool { +func (m *managerImpl) reclaimNodeLevelResources(resourceToReclaim v1.ResourceName) bool { nodeReclaimFuncs := m.resourceToNodeReclaimFuncs[resourceToReclaim] for _, nodeReclaimFunc := range nodeReclaimFuncs { // attempt to reclaim the pressured resource. @@ -454,7 +454,7 @@ func (m *managerImpl) reclaimNodeLevelResources(resourceToReclaim v1.ResourceNam } // make observations and get a function to derive pod usage stats relative to those observations. 
- observations, _ := makeSignalObservations(summary, capacityProvider, pods) + observations, _ := makeSignalObservations(summary) debugLogObservations("observations after resource reclaim", observations) // determine the set of thresholds met independent of grace period diff --git a/pkg/kubelet/eviction/eviction_manager_test.go b/pkg/kubelet/eviction/eviction_manager_test.go index 85bd0b6c0aa57..9ec64c4089aa4 100644 --- a/pkg/kubelet/eviction/eviction_manager_test.go +++ b/pkg/kubelet/eviction/eviction_manager_test.go @@ -65,27 +65,6 @@ func (m *mockDiskInfoProvider) HasDedicatedImageFs() (bool, error) { return m.dedicatedImageFs, nil } -func newMockCapacityProvider(capacity, reservation v1.ResourceList) *mockCapacityProvider { - return &mockCapacityProvider{ - capacity: capacity, - reservation: reservation, - } -} - -type mockCapacityProvider struct { - capacity v1.ResourceList - reservation v1.ResourceList -} - -func (m *mockCapacityProvider) GetCapacity() v1.ResourceList { - return m.capacity - -} - -func (m *mockCapacityProvider) GetNodeAllocatableReservation() v1.ResourceList { - return m.reservation -} - // mockDiskGC is used to simulate invoking image and container garbage collection. type mockDiskGC struct { err error @@ -139,6 +118,15 @@ func makeMemoryStats(nodeAvailableBytes string, podStats map[*v1.Pod]statsapi.Po AvailableBytes: &availableBytes, WorkingSetBytes: &WorkingSetBytes, }, + SystemContainers: []statsapi.ContainerStats{ + { + Name: statsapi.SystemContainerPods, + Memory: &statsapi.MemoryStats{ + AvailableBytes: &availableBytes, + WorkingSetBytes: &WorkingSetBytes, + }, + }, + }, }, Pods: []statsapi.PodStats{}, } @@ -217,7 +205,6 @@ func TestMemoryPressure(t *testing.T) { fakeClock := clock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} - capacityProvider := newMockCapacityProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("3Gi")}, v1.ResourceList{v1.ResourceMemory: *quantityMustParse("1Gi")}) diskGC := &mockDiskGC{err: nil} nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} @@ -261,7 +248,7 @@ func TestMemoryPressure(t *testing.T) { burstablePodToAdmit, _ := podMaker("burst-admit", defaultPriority, newResourceList("100m", "100Mi", ""), newResourceList("200m", "200Mi", ""), "0Gi") // synchronize - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should not have memory pressure if manager.IsUnderMemoryPressure() { @@ -279,7 +266,7 @@ func TestMemoryPressure(t *testing.T) { // induce soft threshold fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("1500Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should have memory pressure if !manager.IsUnderMemoryPressure() { @@ -294,7 +281,7 @@ func TestMemoryPressure(t *testing.T) { // step forward in time pass the grace period fakeClock.Step(3 * time.Minute) summaryProvider.result = summaryStatsMaker("1500Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should have memory pressure if !manager.IsUnderMemoryPressure() { @@ -319,7 +306,7 @@ func TestMemoryPressure(t *testing.T) { // remove memory pressure fakeClock.Step(20 * time.Minute) summaryProvider.result = summaryStatsMaker("3Gi", 
podStats) - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should not have memory pressure if manager.IsUnderMemoryPressure() { @@ -329,7 +316,7 @@ func TestMemoryPressure(t *testing.T) { // induce memory pressure! fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("500Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should have memory pressure if !manager.IsUnderMemoryPressure() { @@ -357,7 +344,7 @@ func TestMemoryPressure(t *testing.T) { fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("2Gi", podStats) podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should have memory pressure (because transition period not yet met) if !manager.IsUnderMemoryPressure() { @@ -381,7 +368,7 @@ func TestMemoryPressure(t *testing.T) { fakeClock.Step(5 * time.Minute) summaryProvider.result = summaryStatsMaker("2Gi", podStats) podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should not have memory pressure (because transition period met) if manager.IsUnderMemoryPressure() { @@ -439,7 +426,6 @@ func TestDiskPressureNodeFs(t *testing.T) { fakeClock := clock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} - capacityProvider := newMockCapacityProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("3Gi")}, v1.ResourceList{v1.ResourceMemory: *quantityMustParse("1Gi")}) diskGC := &mockDiskGC{err: nil} nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} @@ -482,7 +468,7 @@ func TestDiskPressureNodeFs(t *testing.T) { podToAdmit, _ := podMaker("pod-to-admit", defaultPriority, newResourceList("", "", ""), newResourceList("", "", ""), "0Gi", "0Gi", "0Gi") // synchronize - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should not have disk pressure if manager.IsUnderDiskPressure() { @@ -497,7 +483,7 @@ func TestDiskPressureNodeFs(t *testing.T) { // induce soft threshold fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("1.5Gi", "200Gi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should have disk pressure if !manager.IsUnderDiskPressure() { @@ -512,7 +498,7 @@ func TestDiskPressureNodeFs(t *testing.T) { // step forward in time pass the grace period fakeClock.Step(3 * time.Minute) summaryProvider.result = summaryStatsMaker("1.5Gi", "200Gi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should have disk pressure if !manager.IsUnderDiskPressure() { @@ -537,7 +523,7 @@ func TestDiskPressureNodeFs(t *testing.T) { // remove disk pressure fakeClock.Step(20 * time.Minute) summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should not have disk pressure if 
manager.IsUnderDiskPressure() { @@ -547,7 +533,7 @@ func TestDiskPressureNodeFs(t *testing.T) { // induce disk pressure! fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("500Mi", "200Gi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should have disk pressure if !manager.IsUnderDiskPressure() { @@ -572,7 +558,7 @@ func TestDiskPressureNodeFs(t *testing.T) { fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats) podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should have disk pressure (because transition period not yet met) if !manager.IsUnderDiskPressure() { @@ -593,7 +579,7 @@ func TestDiskPressureNodeFs(t *testing.T) { fakeClock.Step(5 * time.Minute) summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats) podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should not have disk pressure (because transition period met) if manager.IsUnderDiskPressure() { @@ -638,7 +624,6 @@ func TestMinReclaim(t *testing.T) { fakeClock := clock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} - capacityProvider := newMockCapacityProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("3Gi")}, v1.ResourceList{v1.ResourceMemory: *quantityMustParse("1Gi")}) diskGC := &mockDiskGC{err: nil} nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} @@ -673,7 +658,7 @@ func TestMinReclaim(t *testing.T) { } // synchronize - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should not have memory pressure if manager.IsUnderMemoryPressure() { @@ -683,7 +668,7 @@ func TestMinReclaim(t *testing.T) { // induce memory pressure! 
fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("500Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should have memory pressure if !manager.IsUnderMemoryPressure() { @@ -703,7 +688,7 @@ func TestMinReclaim(t *testing.T) { fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("1.2Gi", podStats) podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should have memory pressure (because transition period not yet met) if !manager.IsUnderMemoryPressure() { @@ -723,7 +708,7 @@ func TestMinReclaim(t *testing.T) { fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("2Gi", podStats) podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should have memory pressure (because transition period not yet met) if !manager.IsUnderMemoryPressure() { @@ -739,7 +724,7 @@ func TestMinReclaim(t *testing.T) { fakeClock.Step(5 * time.Minute) summaryProvider.result = summaryStatsMaker("2Gi", podStats) podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should not have memory pressure (because transition period met) if manager.IsUnderMemoryPressure() { @@ -781,7 +766,6 @@ func TestNodeReclaimFuncs(t *testing.T) { fakeClock := clock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} - capacityProvider := newMockCapacityProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("3Gi")}, v1.ResourceList{v1.ResourceMemory: *quantityMustParse("1Gi")}) nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} config := Config{ @@ -816,7 +800,7 @@ func TestNodeReclaimFuncs(t *testing.T) { } // synchronize - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should not have disk pressure if manager.IsUnderDiskPressure() { @@ -828,7 +812,7 @@ func TestNodeReclaimFuncs(t *testing.T) { summaryProvider.result = summaryStatsMaker(".9Gi", "200Gi", podStats) // make GC successfully return disk usage to previous levels diskGC.summaryAfterGC = summaryStatsMaker("16Gi", "200Gi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should have disk pressure if !manager.IsUnderDiskPressure() { @@ -851,7 +835,8 @@ func TestNodeReclaimFuncs(t *testing.T) { // remove disk pressure fakeClock.Step(20 * time.Minute) - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + summaryProvider.result = summaryStatsMaker("16Gi", "200Gi", podStats) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should not have disk pressure if manager.IsUnderDiskPressure() { @@ -863,7 +848,7 @@ func TestNodeReclaimFuncs(t *testing.T) { summaryProvider.result = summaryStatsMaker("400Mi", "200Gi", podStats) // Dont reclaim any disk diskGC.summaryAfterGC = summaryStatsMaker("400Mi", "200Gi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we 
should have disk pressure if !manager.IsUnderDiskPressure() { @@ -890,7 +875,7 @@ func TestNodeReclaimFuncs(t *testing.T) { diskGC.imageGCInvoked = false // reset state diskGC.containerGCInvoked = false // reset state podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should have disk pressure (because transition period not yet met) if !manager.IsUnderDiskPressure() { @@ -913,7 +898,7 @@ func TestNodeReclaimFuncs(t *testing.T) { diskGC.imageGCInvoked = false // reset state diskGC.containerGCInvoked = false // reset state podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should not have disk pressure (because transition period met) if manager.IsUnderDiskPressure() { @@ -981,7 +966,6 @@ func TestInodePressureNodeFsInodes(t *testing.T) { fakeClock := clock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} - capacityProvider := newMockCapacityProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("3Gi")}, v1.ResourceList{v1.ResourceMemory: *quantityMustParse("1Gi")}) diskGC := &mockDiskGC{err: nil} nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} @@ -1024,7 +1008,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) { podToAdmit, _ := podMaker("pod-to-admit", defaultPriority, newResourceList("", "", ""), newResourceList("", "", ""), "0", "0", "0") // synchronize - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should not have disk pressure if manager.IsUnderDiskPressure() { @@ -1039,7 +1023,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) { // induce soft threshold fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("1.5Mi", "4Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should have disk pressure if !manager.IsUnderDiskPressure() { @@ -1054,7 +1038,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) { // step forward in time pass the grace period fakeClock.Step(3 * time.Minute) summaryProvider.result = summaryStatsMaker("1.5Mi", "4Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should have disk pressure if !manager.IsUnderDiskPressure() { @@ -1079,7 +1063,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) { // remove inode pressure fakeClock.Step(20 * time.Minute) summaryProvider.result = summaryStatsMaker("3Mi", "4Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should not have disk pressure if manager.IsUnderDiskPressure() { @@ -1089,7 +1073,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) { // induce inode pressure! 
fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("0.5Mi", "4Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should have disk pressure if !manager.IsUnderDiskPressure() { @@ -1114,7 +1098,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) { fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("3Mi", "4Mi", podStats) podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should have disk pressure (because transition period not yet met) if !manager.IsUnderDiskPressure() { @@ -1135,7 +1119,7 @@ func TestInodePressureNodeFsInodes(t *testing.T) { fakeClock.Step(5 * time.Minute) summaryProvider.result = summaryStatsMaker("3Mi", "4Mi", podStats) podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should not have disk pressure (because transition period met) if manager.IsUnderDiskPressure() { @@ -1184,7 +1168,6 @@ func TestCriticalPodsAreNotEvicted(t *testing.T) { fakeClock := clock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} - capacityProvider := newMockCapacityProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("3Gi")}, v1.ResourceList{v1.ResourceMemory: *quantityMustParse("1Gi")}) diskGC := &mockDiskGC{err: nil} nodeRef := &v1.ObjectReference{ Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: "", @@ -1230,7 +1213,7 @@ func TestCriticalPodsAreNotEvicted(t *testing.T) { // induce soft threshold fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("1500Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should have memory pressure if !manager.IsUnderMemoryPressure() { @@ -1245,7 +1228,7 @@ func TestCriticalPodsAreNotEvicted(t *testing.T) { // step forward in time pass the grace period fakeClock.Step(3 * time.Minute) summaryProvider.result = summaryStatsMaker("1500Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should have memory pressure if !manager.IsUnderMemoryPressure() { @@ -1263,7 +1246,7 @@ func TestCriticalPodsAreNotEvicted(t *testing.T) { // remove memory pressure fakeClock.Step(20 * time.Minute) summaryProvider.result = summaryStatsMaker("3Gi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should not have memory pressure if manager.IsUnderMemoryPressure() { @@ -1276,7 +1259,7 @@ func TestCriticalPodsAreNotEvicted(t *testing.T) { // induce memory pressure! 
fakeClock.Step(1 * time.Minute) summaryProvider.result = summaryStatsMaker("500Mi", podStats) - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should have memory pressure if !manager.IsUnderMemoryPressure() { @@ -1294,7 +1277,6 @@ func TestAllocatableMemoryPressure(t *testing.T) { utilfeature.DefaultFeatureGate.SetFromMap(map[string]bool{string(features.PodPriority): true}) podMaker := makePodWithMemoryStats summaryStatsMaker := makeMemoryStats - constantCapacity := "4Gi" podsToMake := []podToMake{ {name: "guaranteed-low-priority-high-usage", priority: lowPriority, requests: newResourceList("100m", "1Gi", ""), limits: newResourceList("100m", "1Gi", ""), memoryWorkingSet: "900Mi"}, {name: "burstable-below-requests", priority: defaultPriority, requests: newResourceList("100m", "100Mi", ""), limits: newResourceList("200m", "1Gi", ""), memoryWorkingSet: "50Mi"}, @@ -1317,7 +1299,6 @@ func TestAllocatableMemoryPressure(t *testing.T) { fakeClock := clock.NewFakeClock(time.Now()) podKiller := &mockPodKiller{} diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} - capacityProvider := newMockCapacityProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("3Gi")}, v1.ResourceList{v1.ResourceMemory: *quantityMustParse("1Gi")}) diskGC := &mockDiskGC{err: nil} nodeRef := &v1.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} @@ -1329,12 +1310,12 @@ func TestAllocatableMemoryPressure(t *testing.T) { Signal: evictionapi.SignalAllocatableMemoryAvailable, Operator: evictionapi.OpLessThan, Value: evictionapi.ThresholdValue{ - Quantity: quantityMustParse("1Ki"), + Quantity: quantityMustParse("1Gi"), }, }, }, } - summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker(constantCapacity, podStats)} + summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("4Gi", podStats)} manager := &managerImpl{ clock: fakeClock, killPodFunc: podKiller.killPodNow, @@ -1353,7 +1334,7 @@ func TestAllocatableMemoryPressure(t *testing.T) { burstablePodToAdmit, _ := podMaker("burst-admit", defaultPriority, newResourceList("100m", "100Mi", ""), newResourceList("200m", "200Mi", ""), "0Gi") // synchronize - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should not have memory pressure if manager.IsUnderMemoryPressure() { @@ -1372,8 +1353,8 @@ func TestAllocatableMemoryPressure(t *testing.T) { fakeClock.Step(1 * time.Minute) pod, podStat := podMaker("guaranteed-high-2", defaultPriority, newResourceList("100m", "1Gi", ""), newResourceList("100m", "1Gi", ""), "1Gi") podStats[pod] = podStat - summaryProvider.result = summaryStatsMaker(constantCapacity, podStats) - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + summaryProvider.result = summaryStatsMaker("500Mi", podStats) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should have memory pressure if !manager.IsUnderMemoryPressure() { @@ -1407,9 +1388,9 @@ func TestAllocatableMemoryPressure(t *testing.T) { delete(podStats, pod) } } - summaryProvider.result = summaryStatsMaker(constantCapacity, podStats) + summaryProvider.result = summaryStatsMaker("2Gi", podStats) podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should have memory pressure (because transition period not yet met) if 
!manager.IsUnderMemoryPressure() { @@ -1431,9 +1412,9 @@ func TestAllocatableMemoryPressure(t *testing.T) { // move the clock past transition period to ensure that we stop reporting pressure fakeClock.Step(5 * time.Minute) - summaryProvider.result = summaryStatsMaker(constantCapacity, podStats) + summaryProvider.result = summaryStatsMaker("2Gi", podStats) podKiller.pod = nil // reset state - manager.synchronize(diskInfoProvider, activePodsFunc, capacityProvider) + manager.synchronize(diskInfoProvider, activePodsFunc) // we should not have memory pressure (because transition period met) if manager.IsUnderMemoryPressure() { diff --git a/pkg/kubelet/eviction/helpers.go b/pkg/kubelet/eviction/helpers.go index 99c1a3ecdfedc..92c10ad6f557e 100644 --- a/pkg/kubelet/eviction/helpers.go +++ b/pkg/kubelet/eviction/helpers.go @@ -100,9 +100,6 @@ func validSignal(signal evictionapi.Signal) bool { // ParseThresholdConfig parses the flags for thresholds. func ParseThresholdConfig(allocatableConfig []string, evictionHard, evictionSoft, evictionSoftGracePeriod, evictionMinimumReclaim map[string]string) ([]evictionapi.Threshold, error) { results := []evictionapi.Threshold{} - allocatableThresholds := getAllocatableThreshold(allocatableConfig) - results = append(results, allocatableThresholds...) - hardThresholds, err := parseThresholdStatements(evictionHard) if err != nil { return nil, err @@ -137,9 +134,31 @@ func ParseThresholdConfig(allocatableConfig []string, evictionHard, evictionSoft } } } + for _, key := range allocatableConfig { + if key == kubetypes.NodeAllocatableEnforcementKey { + results = addAllocatableThresholds(results) + break + } + } return results, nil } +func addAllocatableThresholds(thresholds []evictionapi.Threshold) []evictionapi.Threshold { + additionalThresholds := []evictionapi.Threshold{} + for _, threshold := range thresholds { + if threshold.Signal == evictionapi.SignalMemoryAvailable && isHardEvictionThreshold(threshold) { + // Copy the SignalMemoryAvailable to SignalAllocatableMemoryAvailable + additionalThresholds = append(additionalThresholds, evictionapi.Threshold{ + Signal: evictionapi.SignalAllocatableMemoryAvailable, + Operator: threshold.Operator, + Value: threshold.Value, + MinReclaim: threshold.MinReclaim, + }) + } + } + return append(thresholds, additionalThresholds...) +} + // parseThresholdStatements parses the input statements into a list of Threshold objects. 
func parseThresholdStatements(statements map[string]string) ([]evictionapi.Threshold, error) { if len(statements) == 0 { @@ -204,27 +223,6 @@ func parseThresholdStatement(signal evictionapi.Signal, val string) (*evictionap }, nil } -// getAllocatableThreshold returns the thresholds applicable for the allocatable configuration -func getAllocatableThreshold(allocatableConfig []string) []evictionapi.Threshold { - for _, key := range allocatableConfig { - if key == kubetypes.NodeAllocatableEnforcementKey { - return []evictionapi.Threshold{ - { - Signal: evictionapi.SignalAllocatableMemoryAvailable, - Operator: evictionapi.OpLessThan, - Value: evictionapi.ThresholdValue{ - Quantity: resource.NewQuantity(int64(0), resource.BinarySI), - }, - MinReclaim: &evictionapi.ThresholdValue{ - Quantity: resource.NewQuantity(int64(0), resource.BinarySI), - }, - }, - } - } - } - return []evictionapi.Threshold{} -} - // parsePercentage parses a string representing a percentage value func parsePercentage(input string) (float32, error) { value, err := strconv.ParseFloat(strings.TrimRight(input, "%"), 32) @@ -724,7 +722,7 @@ func (a byEvictionPriority) Less(i, j int) bool { } // makeSignalObservations derives observations using the specified summary provider. -func makeSignalObservations(summary *statsapi.Summary, capacityProvider CapacityProvider, pods []*v1.Pod) (signalObservations, statsFunc) { +func makeSignalObservations(summary *statsapi.Summary) (signalObservations, statsFunc) { // build the function to work against for pod stats statsFunc := cachedStatsFunc(summary.Pods) // build an evaluation context for current eviction signals @@ -737,6 +735,17 @@ func makeSignalObservations(summary *statsapi.Summary, capacityProvider Capacity time: memory.Time, } } + if allocatableContainer, err := getSysContainer(summary.Node.SystemContainers, statsapi.SystemContainerPods); err != nil { + glog.Errorf("eviction manager: failed to construct signal: %q error: %v", evictionapi.SignalAllocatableMemoryAvailable, err) + } else { + if memory := allocatableContainer.Memory; memory != nil && memory.AvailableBytes != nil && memory.WorkingSetBytes != nil { + result[evictionapi.SignalAllocatableMemoryAvailable] = signalObservation{ + available: resource.NewQuantity(int64(*memory.AvailableBytes), resource.BinarySI), + capacity: resource.NewQuantity(int64(*memory.AvailableBytes+*memory.WorkingSetBytes), resource.BinarySI), + time: memory.Time, + } + } + } if nodeFs := summary.Node.Fs; nodeFs != nil { if nodeFs.AvailableBytes != nil && nodeFs.CapacityBytes != nil { result[evictionapi.SignalNodeFsAvailable] = signalObservation{ @@ -771,7 +780,6 @@ func makeSignalObservations(summary *statsapi.Summary, capacityProvider Capacity } } } - if rlimit := summary.Node.Rlimit; rlimit != nil { if rlimit.NumOfRunningProcesses != nil && rlimit.MaxPID != nil { available := int64(*rlimit.MaxPID) - int64(*rlimit.NumOfRunningProcesses) @@ -782,27 +790,16 @@ func makeSignalObservations(summary *statsapi.Summary, capacityProvider Capacity } } } + return result, statsFunc +} - if memoryAllocatableCapacity, ok := capacityProvider.GetCapacity()[v1.ResourceMemory]; ok { - memoryAllocatableAvailable := memoryAllocatableCapacity.Copy() - if reserved, exists := capacityProvider.GetNodeAllocatableReservation()[v1.ResourceMemory]; exists { - memoryAllocatableAvailable.Sub(reserved) - } - for _, pod := range summary.Pods { - mu, err := podMemoryUsage(pod) - if err == nil { - memoryAllocatableAvailable.Sub(mu[v1.ResourceMemory]) - } - } - 
result[evictionapi.SignalAllocatableMemoryAvailable] = signalObservation{ - available: memoryAllocatableAvailable, - capacity: &memoryAllocatableCapacity, +func getSysContainer(sysContainers []statsapi.ContainerStats, name string) (*statsapi.ContainerStats, error) { + for _, cont := range sysContainers { + if cont.Name == name { + return &cont, nil } - } else { - glog.Errorf("Could not find capacity information for resource %v", v1.ResourceMemory) } - - return result, statsFunc + return nil, fmt.Errorf("system container %q not found in metrics", name) } // thresholdsMet returns the set of thresholds that were met independent of grace period diff --git a/pkg/kubelet/eviction/helpers_test.go b/pkg/kubelet/eviction/helpers_test.go index 168352ff2553c..a61d19fee787f 100644 --- a/pkg/kubelet/eviction/helpers_test.go +++ b/pkg/kubelet/eviction/helpers_test.go @@ -72,7 +72,7 @@ func TestParseThresholdConfig(t *testing.T) { Signal: evictionapi.SignalAllocatableMemoryAvailable, Operator: evictionapi.OpLessThan, Value: evictionapi.ThresholdValue{ - Quantity: quantityMustParse("0"), + Quantity: quantityMustParse("150Mi"), }, MinReclaim: &evictionapi.ThresholdValue{ Quantity: quantityMustParse("0"), @@ -1004,6 +1004,15 @@ func TestMakeSignalObservations(t *testing.T) { InodesFree: &nodeFsInodesFree, Inodes: &nodeFsInodes, }, + SystemContainers: []statsapi.ContainerStats{ + { + Name: statsapi.SystemContainerPods, + Memory: &statsapi.MemoryStats{ + AvailableBytes: &nodeAvailableBytes, + WorkingSetBytes: &nodeWorkingSetBytes, + }, + }, + }, }, Pods: []statsapi.PodStats{}, } @@ -1017,20 +1026,19 @@ func TestMakeSignalObservations(t *testing.T) { fakeStats.Pods = append(fakeStats.Pods, newPodStats(pod, containerWorkingSetBytes)) } res := quantityMustParse("5Gi") - capacityProvider := newMockCapacityProvider(v1.ResourceList{v1.ResourceMemory: *quantityMustParse("5Gi")}, v1.ResourceList{v1.ResourceMemory: *quantityMustParse("0Gi")}) // Allocatable thresholds are always 100%. Verify that Threshold == Capacity. 
if res.CmpInt64(int64(allocatableMemoryCapacity)) != 0 { t.Errorf("Expected Threshold %v to be equal to value %v", res.Value(), allocatableMemoryCapacity) } - actualObservations, statsFunc := makeSignalObservations(fakeStats, capacityProvider, pods) + actualObservations, statsFunc := makeSignalObservations(fakeStats) allocatableMemQuantity, found := actualObservations[evictionapi.SignalAllocatableMemoryAvailable] if !found { t.Errorf("Expected allocatable memory observation, but didn't find one") } - if allocatableMemQuantity.available.Value() != 2*containerWorkingSetBytes { - t.Errorf("Expected %v, actual: %v", containerWorkingSetBytes, allocatableMemQuantity.available.Value()) + if expectedBytes := int64(nodeAvailableBytes); allocatableMemQuantity.available.Value() != expectedBytes { + t.Errorf("Expected %v, actual: %v", expectedBytes, allocatableMemQuantity.available.Value()) } - if expectedBytes := int64(allocatableMemoryCapacity); allocatableMemQuantity.capacity.Value() != expectedBytes { + if expectedBytes := int64(nodeWorkingSetBytes + nodeAvailableBytes); allocatableMemQuantity.capacity.Value() != expectedBytes { t.Errorf("Expected %v, actual: %v", expectedBytes, allocatableMemQuantity.capacity.Value()) } memQuantity, found := actualObservations[evictionapi.SignalMemoryAvailable] diff --git a/pkg/kubelet/eviction/types.go b/pkg/kubelet/eviction/types.go index a3e2827bbf2f3..e4fca84d7a04d 100644 --- a/pkg/kubelet/eviction/types.go +++ b/pkg/kubelet/eviction/types.go @@ -53,7 +53,7 @@ type Config struct { // Manager evaluates when an eviction threshold for node stability has been met on the node. type Manager interface { // Start starts the control loop to monitor eviction thresholds at the specified interval. - Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, capacityProvider CapacityProvider, monitoringInterval time.Duration) + Start(diskInfoProvider DiskInfoProvider, podFunc ActivePodsFunc, podCleanedUpFunc PodCleanedUpFunc, monitoringInterval time.Duration) // IsUnderMemoryPressure returns true if the node is under memory pressure. IsUnderMemoryPressure() bool @@ -71,14 +71,6 @@ type DiskInfoProvider interface { HasDedicatedImageFs() (bool, error) } -// CapacityProvider is responsible for providing the resource capacity and reservation information -type CapacityProvider interface { - // GetCapacity returns the amount of compute resources tracked by container manager available on the node. - GetCapacity() v1.ResourceList - // GetNodeAllocatableReservation returns the amount of compute resources that have to be reserved from scheduling. - GetNodeAllocatableReservation() v1.ResourceList -} - // ImageGC is responsible for performing garbage collection of unused images. type ImageGC interface { // DeleteUnusedImages deletes unused images.
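With CapacityProvider removed above, the allocatableMemory.available signal is now derived entirely from the new "pods" system container rather than from node capacity minus reservations and per-pod usage. As a minimal, self-contained sketch of that observation math (the byte counts are hypothetical and this is an illustration, not code from the patch; only the standard k8s.io/apimachinery quantity package is assumed):

package main

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/resource"
)

func main() {
	// Hypothetical stats reported for the pods cgroup (e.g. /kubepods).
	var availableBytes uint64 = 600 << 20  // memory.available for the cgroup
	var workingSetBytes uint64 = 400 << 20 // working set of everything under it

	// Mirrors the makeSignalObservations hunk earlier in this patch: the
	// signal's available value comes straight from the cgroup, and its
	// capacity is available + working set, i.e. Node Allocatable memory.
	available := resource.NewQuantity(int64(availableBytes), resource.BinarySI)
	capacity := resource.NewQuantity(int64(availableBytes+workingSetBytes), resource.BinarySI)
	fmt.Printf("allocatableMemory.available = %s of %s\n", available, capacity) // 600Mi of 1000Mi
}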
diff --git a/pkg/kubelet/kubelet.go b/pkg/kubelet/kubelet.go index 3fa2ec9d4c893..51a6964b75dbf 100644 --- a/pkg/kubelet/kubelet.go +++ b/pkg/kubelet/kubelet.go @@ -1318,7 +1318,7 @@ func (kl *Kubelet) initializeRuntimeDependentModules() { glog.Fatalf("Failed to start cAdvisor %v", err) } // eviction manager must start after cadvisor because it needs to know if the container runtime has a dedicated imagefs - kl.evictionManager.Start(kl.StatsProvider, kl.GetActivePods, kl.podResourcesAreReclaimed, kl.containerManager, evictionMonitoringPeriod) + kl.evictionManager.Start(kl.StatsProvider, kl.GetActivePods, kl.podResourcesAreReclaimed, evictionMonitoringPeriod) // trigger on-demand stats collection once so that we have capacity information for ephemeral storage. // ignore any errors, since if stats collection is not successful, the container manager will fail to start below. diff --git a/pkg/kubelet/kubelet_getters.go b/pkg/kubelet/kubelet_getters.go index 1b98f9189fb6b..5647dadb61151 100644 --- a/pkg/kubelet/kubelet_getters.go +++ b/pkg/kubelet/kubelet_getters.go @@ -211,6 +211,11 @@ func (kl *Kubelet) GetNodeConfig() cm.NodeConfig { return kl.containerManager.GetNodeConfig() } +// GetPodCgroupRoot returns the literal cgroupfs value for the cgroup containing all pods. +func (kl *Kubelet) GetPodCgroupRoot() string { + return kl.containerManager.GetPodCgroupRoot() +} + // GetHostIP returns host IP or nil in case of error. func (kl *Kubelet) GetHostIP() (net.IP, error) { node, err := kl.GetNode() diff --git a/pkg/kubelet/server/server_test.go b/pkg/kubelet/server/server_test.go index 58358ceb223dd..a42994018149e 100644 --- a/pkg/kubelet/server/server_test.go +++ b/pkg/kubelet/server/server_test.go @@ -168,6 +168,7 @@ func (fk *fakeKubelet) StreamingConnectionIdleTimeout() time.Duration { // Unused functions func (_ *fakeKubelet) GetNode() (*v1.Node, error) { return nil, nil } func (_ *fakeKubelet) GetNodeConfig() cm.NodeConfig { return cm.NodeConfig{} } +func (_ *fakeKubelet) GetPodCgroupRoot() string { return "" } func (fk *fakeKubelet) ListVolumesForPod(podUID types.UID) (map[string]volume.Volume, bool) { return map[string]volume.Volume{}, true diff --git a/pkg/kubelet/server/stats/handler.go b/pkg/kubelet/server/stats/handler.go index 13b371669cb24..ba2033c953dbd 100644 --- a/pkg/kubelet/server/stats/handler.go +++ b/pkg/kubelet/server/stats/handler.go @@ -81,6 +81,9 @@ type StatsProvider interface { // RlimitStats returns the rlimit stats of the system.
RlimitStats() (*statsapi.RlimitStats, error) + + // GetPodCgroupRoot returns the literal cgroupfs value for the cgroup containing all pods + GetPodCgroupRoot() string } type handler struct { diff --git a/pkg/kubelet/server/stats/summary.go b/pkg/kubelet/server/stats/summary.go index 4a155040f802a..0fff691f1222e 100644 --- a/pkg/kubelet/server/stats/summary.go +++ b/pkg/kubelet/server/stats/summary.go @@ -83,19 +83,23 @@ func (sp *summaryProviderImpl) Get(updateStats bool) (*statsapi.Summary, error) Rlimit: rlimit, } - systemContainers := map[string]string{ - statsapi.SystemContainerKubelet: nodeConfig.KubeletCgroupsName, - statsapi.SystemContainerRuntime: nodeConfig.RuntimeCgroupsName, - statsapi.SystemContainerMisc: nodeConfig.SystemCgroupsName, + systemContainers := map[string]struct { + name string + forceStatsUpdate bool + }{ + statsapi.SystemContainerKubelet: {nodeConfig.KubeletCgroupsName, false}, + statsapi.SystemContainerRuntime: {nodeConfig.RuntimeCgroupsName, false}, + statsapi.SystemContainerMisc: {nodeConfig.SystemCgroupsName, false}, + statsapi.SystemContainerPods: {sp.provider.GetPodCgroupRoot(), updateStats}, } - for sys, name := range systemContainers { + for sys, cont := range systemContainers { // skip if cgroup name is undefined (not all system containers are required) - if name == "" { + if cont.name == "" { continue } - s, _, err := sp.provider.GetCgroupStats(name, false) + s, _, err := sp.provider.GetCgroupStats(cont.name, cont.forceStatsUpdate) if err != nil { - glog.Errorf("Failed to get system container stats for %q: %v", name, err) + glog.Errorf("Failed to get system container stats for %q: %v", cont.name, err) continue } // System containers don't have a filesystem associated with them. diff --git a/pkg/kubelet/server/stats/summary_test.go b/pkg/kubelet/server/stats/summary_test.go index c6cbe4f42a666..3b260cf799bfd 100644 --- a/pkg/kubelet/server/stats/summary_test.go +++ b/pkg/kubelet/server/stats/summary_test.go @@ -49,6 +49,7 @@ func TestSummaryProvider(t *testing.T) { SystemCgroupsName: "/misc", KubeletCgroupsName: "/kubelet", } + cgroupRoot = "/kubepods" cgroupStatsMap = map[string]struct { cs *statsapi.ContainerStats ns *statsapi.NetworkStats @@ -57,6 +58,7 @@ func TestSummaryProvider(t *testing.T) { "/runtime": {cs: getContainerStats(), ns: getNetworkStats()}, "/misc": {cs: getContainerStats(), ns: getNetworkStats()}, "/kubelet": {cs: getContainerStats(), ns: getNetworkStats()}, + "/pods": {cs: getContainerStats(), ns: getNetworkStats()}, } rlimitStats = getRlimitStats() ) @@ -67,6 +69,7 @@ func TestSummaryProvider(t *testing.T) { mockStatsProvider. On("GetNode").Return(node, nil). On("GetNodeConfig").Return(nodeConfig). + On("GetPodCgroupRoot").Return(cgroupRoot). On("ListPodStats").Return(podStats, nil). On("ImageFsStats").Return(imageFsStats, nil). On("RootFsStats").Return(rootFsStats, nil). @@ -74,7 +77,8 @@ func TestSummaryProvider(t *testing.T) { On("GetCgroupStats", "/", true).Return(cgroupStatsMap["/"].cs, cgroupStatsMap["/"].ns, nil). On("GetCgroupStats", "/runtime", false).Return(cgroupStatsMap["/runtime"].cs, cgroupStatsMap["/runtime"].ns, nil). On("GetCgroupStats", "/misc", false).Return(cgroupStatsMap["/misc"].cs, cgroupStatsMap["/misc"].ns, nil). - On("GetCgroupStats", "/kubelet", false).Return(cgroupStatsMap["/kubelet"].cs, cgroupStatsMap["/kubelet"].ns, nil) + On("GetCgroupStats", "/kubelet", false).Return(cgroupStatsMap["/kubelet"].cs, cgroupStatsMap["/kubelet"].ns, nil). 
+ On("GetCgroupStats", "/kubepods", true).Return(cgroupStatsMap["/pods"].cs, cgroupStatsMap["/pods"].ns, nil) provider := NewSummaryProvider(mockStatsProvider) summary, err := provider.Get(true) @@ -88,7 +92,7 @@ func TestSummaryProvider(t *testing.T) { assert.Equal(summary.Node.Fs, rootFsStats) assert.Equal(summary.Node.Runtime, &statsapi.RuntimeStats{ImageFs: imageFsStats}) - assert.Equal(len(summary.Node.SystemContainers), 3) + assert.Equal(len(summary.Node.SystemContainers), 4) assert.Contains(summary.Node.SystemContainers, statsapi.ContainerStats{ Name: "kubelet", StartTime: cgroupStatsMap["/kubelet"].cs.StartTime, @@ -113,6 +117,14 @@ func TestSummaryProvider(t *testing.T) { Accelerators: cgroupStatsMap["/runtime"].cs.Accelerators, UserDefinedMetrics: cgroupStatsMap["/runtime"].cs.UserDefinedMetrics, }) + assert.Contains(summary.Node.SystemContainers, statsapi.ContainerStats{ + Name: "pods", + StartTime: cgroupStatsMap["/pods"].cs.StartTime, + CPU: cgroupStatsMap["/pods"].cs.CPU, + Memory: cgroupStatsMap["/pods"].cs.Memory, + Accelerators: cgroupStatsMap["/pods"].cs.Accelerators, + UserDefinedMetrics: cgroupStatsMap["/pods"].cs.UserDefinedMetrics, + }) assert.Equal(summary.Pods, podStats) } diff --git a/pkg/kubelet/server/stats/testing/mock_stats_provider.go b/pkg/kubelet/server/stats/testing/mock_stats_provider.go index 8695e3f16dcd8..2923df9633991 100644 --- a/pkg/kubelet/server/stats/testing/mock_stats_provider.go +++ b/pkg/kubelet/server/stats/testing/mock_stats_provider.go @@ -125,6 +125,20 @@ func (_m *StatsProvider) GetNodeConfig() cm.NodeConfig { return r0 } +// GetPodCgroupRoot provides a mock function with given fields: +func (_m *StatsProvider) GetPodCgroupRoot() string { + ret := _m.Called() + + var r0 string + if rf, ok := ret.Get(0).(func() string); ok { + r0 = rf() + } else { + r0 = ret.Get(0).(string) + } + + return r0 +} + // GetPodByName provides a mock function with given fields: namespace, name func (_m *StatsProvider) GetPodByName(namespace string, name string) (*corev1.Pod, bool) { ret := _m.Called(namespace, name) diff --git a/test/e2e_node/summary_test.go b/test/e2e_node/summary_test.go index ff5d1d7f93404..edfb0d3d357b2 100644 --- a/test/e2e_node/summary_test.go +++ b/test/e2e_node/summary_test.go @@ -107,9 +107,21 @@ var _ = framework.KubeDescribe("Summary API", func() { "UserDefinedMetrics": BeEmpty(), }) } + podsContExpectations := sysContExpectations().(*gstruct.FieldsMatcher) + podsContExpectations.Fields["Memory"] = ptrMatchAllFields(gstruct.Fields{ + "Time": recent(maxStatsAge), + // Pods are limited by Node Allocatable + "AvailableBytes": bounded(1*framework.Kb, memoryLimit), + "UsageBytes": bounded(10*framework.Kb, 20*framework.Mb), + "WorkingSetBytes": bounded(10*framework.Kb, 20*framework.Mb), + "RSSBytes": bounded(1*framework.Kb, 20*framework.Mb), + "PageFaults": bounded(0, 1000000), + "MajorPageFaults": bounded(0, 10), + }) systemContainers := gstruct.Elements{ "kubelet": sysContExpectations(), "runtime": sysContExpectations(), + "pods": podsContExpectations, } // The Kubelet only manages the 'misc' system container if the host is not running systemd. if !systemdutil.IsRunningSystemd() {