diff --git a/docs/proposals/kubelet-eviction.md b/docs/proposals/kubelet-eviction.md index 4ac8576577817..02c921ad05004 100644 --- a/docs/proposals/kubelet-eviction.md +++ b/docs/proposals/kubelet-eviction.md @@ -478,9 +478,19 @@ for eviction. Instead `DaemonSet` should ideally include Guaranteed pods only. ## Known issues +### kubelet may evict more pods than needed + The pod eviction may evict more pods than needed due to stats collection timing gap. This can be mitigated by adding the ability to get root container stats on an on-demand basis (https://github.com/google/cadvisor/issues/1247) in the future. +### How kubelet ranks pods for eviction in response to inode exhaustion + +At this time, it is not possible to know how many inodes were consumed by a particular container. If the `kubelet` observes +inode exhaustion, it will evict pods by ranking them by quality of service. The following issue has been opened in cadvisor +to track per container inode consumption (https://github.com/google/cadvisor/issues/1422) which would allow us to rank pods +by inode consumption. For example, this would let us identify a container that created large numbers of 0 byte files, and evict +that pod over others. + [![Analytics](https://kubernetes-site.appspot.com/UA-36037335-10/GitHub/docs/proposals/kubelet-eviction.md?pixel)]() diff --git a/hack/.linted_packages b/hack/.linted_packages index 108f1dc1e0cc7..5cbd622bc89a7 100644 --- a/hack/.linted_packages +++ b/hack/.linted_packages @@ -96,6 +96,7 @@ pkg/credentialprovider/aws pkg/hyperkube pkg/kubelet/api pkg/kubelet/container +pkg/kubelet/eviction pkg/kubelet/envvars pkg/kubelet/util/format pkg/kubelet/util/ioutils diff --git a/pkg/kubelet/eviction/eviction_manager_test.go b/pkg/kubelet/eviction/eviction_manager_test.go index 1eb3716c63d30..0deebaf3595cb 100644 --- a/pkg/kubelet/eviction/eviction_manager_test.go +++ b/pkg/kubelet/eviction/eviction_manager_test.go @@ -914,3 +914,228 @@ func TestNodeReclaimFuncs(t *testing.T) { t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod) } } + +func TestDiskPressureNodeFsInodes(t *testing.T) { + // TODO: we need to know inodes used when cadvisor supports per container stats + podMaker := func(name string, requests api.ResourceList, limits api.ResourceList) (*api.Pod, statsapi.PodStats) { + pod := newPod(name, []api.Container{ + newContainer(name, requests, limits), + }, nil) + podStats := newPodInodeStats(pod) + return pod, podStats + } + summaryStatsMaker := func(rootFsInodesFree, rootFsInodes string, podStats map[*api.Pod]statsapi.PodStats) *statsapi.Summary { + rootFsInodesFreeVal := resource.MustParse(rootFsInodesFree) + internalRootFsInodesFree := uint64(rootFsInodesFreeVal.Value()) + rootFsInodesVal := resource.MustParse(rootFsInodes) + internalRootFsInodes := uint64(rootFsInodesVal.Value()) + result := &statsapi.Summary{ + Node: statsapi.NodeStats{ + Fs: &statsapi.FsStats{ + InodesFree: &internalRootFsInodesFree, + Inodes: &internalRootFsInodes, + }, + }, + Pods: []statsapi.PodStats{}, + } + for _, podStat := range podStats { + result.Pods = append(result.Pods, podStat) + } + return result + } + // TODO: pass inodes used in future when supported by cadvisor. 
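+	// Hypothetical sketch (not possible yet): once cadvisor reports per container inode
+	// usage, podMaker above could accept an inodes-used value and newPodInodeStats could
+	// populate it, e.g.:
+	//
+	//	podMaker := func(name string, requests, limits api.ResourceList, inodesUsed string) (*api.Pod, statsapi.PodStats) {
+	//		pod := newPod(name, []api.Container{newContainer(name, requests, limits)}, nil)
+	//		return pod, newPodInodeStats(pod, resource.MustParse(inodesUsed))
+	//	}
+	//
+	// which would let the disk pressure ranking order pods by actual inode consumption
+	// rather than by quality of service alone.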
+ podsToMake := []struct { + name string + requests api.ResourceList + limits api.ResourceList + }{ + {name: "best-effort-high", requests: newResourceList("", ""), limits: newResourceList("", "")}, + {name: "best-effort-low", requests: newResourceList("", ""), limits: newResourceList("", "")}, + {name: "burstable-high", requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "1Gi")}, + {name: "burstable-low", requests: newResourceList("100m", "100Mi"), limits: newResourceList("200m", "1Gi")}, + {name: "guaranteed-high", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi")}, + {name: "guaranteed-low", requests: newResourceList("100m", "1Gi"), limits: newResourceList("100m", "1Gi")}, + } + pods := []*api.Pod{} + podStats := map[*api.Pod]statsapi.PodStats{} + for _, podToMake := range podsToMake { + pod, podStat := podMaker(podToMake.name, podToMake.requests, podToMake.limits) + pods = append(pods, pod) + podStats[pod] = podStat + } + activePodsFunc := func() []*api.Pod { + return pods + } + + fakeClock := clock.NewFakeClock(time.Now()) + podKiller := &mockPodKiller{} + diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: false} + imageGC := &mockImageGC{freed: int64(0), err: nil} + nodeRef := &api.ObjectReference{Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: ""} + + config := Config{ + MaxPodGracePeriodSeconds: 5, + PressureTransitionPeriod: time.Minute * 5, + Thresholds: []Threshold{ + { + Signal: SignalNodeFsInodesFree, + Operator: OpLessThan, + Value: ThresholdValue{ + Quantity: quantityMustParse("1Mi"), + }, + }, + { + Signal: SignalNodeFsInodesFree, + Operator: OpLessThan, + Value: ThresholdValue{ + Quantity: quantityMustParse("2Mi"), + }, + GracePeriod: time.Minute * 2, + }, + }, + } + summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker("3Mi", "4Mi", podStats)} + manager := &managerImpl{ + clock: fakeClock, + killPodFunc: podKiller.killPodNow, + imageGC: imageGC, + config: config, + recorder: &record.FakeRecorder{}, + summaryProvider: summaryProvider, + nodeRef: nodeRef, + nodeConditionsLastObservedAt: nodeConditionsObservedAt{}, + thresholdsFirstObservedAt: thresholdsObservedAt{}, + } + + // create a best effort pod to test admission + podToAdmit, _ := podMaker("pod-to-admit", newResourceList("", ""), newResourceList("", "")) + + // synchronize + manager.synchronize(diskInfoProvider, activePodsFunc) + + // we should not have disk pressure + if manager.IsUnderDiskPressure() { + t.Errorf("Manager should not report disk pressure") + } + + // try to admit our pod (should succeed) + if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); !result.Admit { + t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit) + } + + // induce soft threshold + fakeClock.Step(1 * time.Minute) + summaryProvider.result = summaryStatsMaker("1.5Mi", "4Mi", podStats) + manager.synchronize(diskInfoProvider, activePodsFunc) + + // we should have disk pressure + if !manager.IsUnderDiskPressure() { + t.Errorf("Manager should report disk pressure since soft threshold was met") + } + + // verify no pod was yet killed because there has not yet been enough time passed. 
+ if podKiller.pod != nil { + t.Errorf("Manager should not have killed a pod yet, but killed: %v", podKiller.pod) + } + + // step forward in time pass the grace period + fakeClock.Step(3 * time.Minute) + summaryProvider.result = summaryStatsMaker("1.5Mi", "4Mi", podStats) + manager.synchronize(diskInfoProvider, activePodsFunc) + + // we should have disk pressure + if !manager.IsUnderDiskPressure() { + t.Errorf("Manager should report disk pressure since soft threshold was met") + } + + // verify the right pod was killed with the right grace period. + if podKiller.pod != pods[0] { + t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod, pods[0]) + } + if podKiller.gracePeriodOverride == nil { + t.Errorf("Manager chose to kill pod but should have had a grace period override.") + } + observedGracePeriod := *podKiller.gracePeriodOverride + if observedGracePeriod != manager.config.MaxPodGracePeriodSeconds { + t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", manager.config.MaxPodGracePeriodSeconds, observedGracePeriod) + } + // reset state + podKiller.pod = nil + podKiller.gracePeriodOverride = nil + + // remove disk pressure + fakeClock.Step(20 * time.Minute) + summaryProvider.result = summaryStatsMaker("3Mi", "4Mi", podStats) + manager.synchronize(diskInfoProvider, activePodsFunc) + + // we should not have disk pressure + if manager.IsUnderDiskPressure() { + t.Errorf("Manager should not report disk pressure") + } + + // induce disk pressure! + fakeClock.Step(1 * time.Minute) + summaryProvider.result = summaryStatsMaker("0.5Mi", "4Mi", podStats) + manager.synchronize(diskInfoProvider, activePodsFunc) + + // we should have disk pressure + if !manager.IsUnderDiskPressure() { + t.Errorf("Manager should report disk pressure") + } + + // check the right pod was killed + if podKiller.pod != pods[0] { + t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod, pods[0]) + } + observedGracePeriod = *podKiller.gracePeriodOverride + if observedGracePeriod != int64(0) { + t.Errorf("Manager chose to kill pod with incorrect grace period. 
Expected: %d, actual: %d", 0, observedGracePeriod) + } + + // try to admit our pod (should fail) + if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); result.Admit { + t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, false, result.Admit) + } + + // reduce disk pressure + fakeClock.Step(1 * time.Minute) + summaryProvider.result = summaryStatsMaker("3Mi", "4Mi", podStats) + podKiller.pod = nil // reset state + manager.synchronize(diskInfoProvider, activePodsFunc) + + // we should have disk pressure (because transition period not yet met) + if !manager.IsUnderDiskPressure() { + t.Errorf("Manager should report disk pressure") + } + + // no pod should have been killed + if podKiller.pod != nil { + t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod) + } + + // try to admit our pod (should fail) + if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); result.Admit { + t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, false, result.Admit) + } + + // move the clock past transition period to ensure that we stop reporting pressure + fakeClock.Step(5 * time.Minute) + summaryProvider.result = summaryStatsMaker("3Mi", "4Mi", podStats) + podKiller.pod = nil // reset state + manager.synchronize(diskInfoProvider, activePodsFunc) + + // we should not have disk pressure (because transition period met) + if manager.IsUnderDiskPressure() { + t.Errorf("Manager should not report disk pressure") + } + + // no pod should have been killed + if podKiller.pod != nil { + t.Errorf("Manager chose to kill pod: %v when no pod should have been killed", podKiller.pod) + } + + // try to admit our pod (should succeed) + if result := manager.Admit(&lifecycle.PodAdmitAttributes{Pod: podToAdmit}); !result.Admit { + t.Errorf("Admit pod: %v, expected: %v, actual: %v", podToAdmit, true, result.Admit) + } +} diff --git a/pkg/kubelet/eviction/helpers.go b/pkg/kubelet/eviction/helpers.go index c2025f44ae7c9..8221c4897286d 100644 --- a/pkg/kubelet/eviction/helpers.go +++ b/pkg/kubelet/eviction/helpers.go @@ -41,10 +41,16 @@ const ( message = "The node was low on compute resources." // disk, in bytes. internal to this module, used to account for local disk usage. resourceDisk api.ResourceName = "disk" + // inodes, number. internal to this module, used to account for local disk inode consumption. + resourceInodes api.ResourceName = "inodes" // imagefs, in bytes. internal to this module, used to account for local image filesystem usage. resourceImageFs api.ResourceName = "imagefs" + // imagefs inodes, number. internal to this module, used to account for local image filesystem inodes. + resourceImageFsInodes api.ResourceName = "imagefsInodes" // nodefs, in bytes. internal to this module, used to account for local node root filesystem usage. resourceNodeFs api.ResourceName = "nodefs" + // nodefs inodes, number. internal to this module, used to account for local node root filesystem inodes. 
+ resourceNodeFsInodes api.ResourceName = "nodefsInodes" ) var ( @@ -62,12 +68,16 @@ func init() { signalToNodeCondition[SignalMemoryAvailable] = api.NodeMemoryPressure signalToNodeCondition[SignalImageFsAvailable] = api.NodeDiskPressure signalToNodeCondition[SignalNodeFsAvailable] = api.NodeDiskPressure + signalToNodeCondition[SignalImageFsInodesFree] = api.NodeDiskPressure + signalToNodeCondition[SignalNodeFsInodesFree] = api.NodeDiskPressure // map signals to resources (and vice-versa) signalToResource = map[Signal]api.ResourceName{} signalToResource[SignalMemoryAvailable] = api.ResourceMemory signalToResource[SignalImageFsAvailable] = resourceImageFs + signalToResource[SignalImageFsInodesFree] = resourceImageFsInodes signalToResource[SignalNodeFsAvailable] = resourceNodeFs + signalToResource[SignalNodeFsInodesFree] = resourceNodeFsInodes resourceToSignal = map[api.ResourceName]Signal{} for key, value := range signalToResource { resourceToSignal[value] = key @@ -185,22 +195,21 @@ func parseThresholdStatement(statement string) (Threshold, error) { Percentage: percentage, }, }, nil - } else { - quantity, err := resource.ParseQuantity(quantityValue) - if err != nil { - return Threshold{}, err - } - if quantity.Sign() < 0 || quantity.IsZero() { - return Threshold{}, fmt.Errorf("eviction threshold %v must be positive: %s", signal, &quantity) - } - return Threshold{ - Signal: signal, - Operator: operator, - Value: ThresholdValue{ - Quantity: &quantity, - }, - }, nil } + quantity, err := resource.ParseQuantity(quantityValue) + if err != nil { + return Threshold{}, err + } + if quantity.Sign() < 0 || quantity.IsZero() { + return Threshold{}, fmt.Errorf("eviction threshold %v must be positive: %s", signal, &quantity) + } + return Threshold{ + Signal: signal, + Operator: operator, + Value: ThresholdValue{ + Quantity: &quantity, + }, + }, nil } // parsePercentage parses a string representing a percentage value @@ -287,6 +296,18 @@ func diskUsage(fsStats *statsapi.FsStats) *resource.Quantity { return resource.NewQuantity(usage, resource.BinarySI) } +// inodeUsage converts inodes consumed into a resource quantity. +func inodeUsage(fsStats *statsapi.FsStats) *resource.Quantity { + // TODO: cadvisor needs to support inodes used per container + // right now, cadvisor reports total inodes and inodes free per filesystem. + // this is insufficient to know how many inodes are consumed by the container. + // for example, with the overlay driver, the rootfs and each container filesystem + // will report the same total inode and inode free values but no way of knowing + // how many inodes consumed in that filesystem are charged to this container. + // for now, we report 0 as inode usage pending support in cadvisor. + return resource.NewQuantity(int64(0), resource.BinarySI) +} + // memoryUsage converts working set into a resource quantity. func memoryUsage(memStats *statsapi.MemoryStats) *resource.Quantity { if memStats == nil || memStats.WorkingSetBytes == nil { @@ -311,15 +332,18 @@ func localVolumeNames(pod *api.Pod) []string { return result } -// podDiskUsage aggregates pod disk usage for the specified stats to measure. +// podDiskUsage aggregates pod disk usage and inode consumption for the specified stats to measure. 
func podDiskUsage(podStats statsapi.PodStats, pod *api.Pod, statsToMeasure []fsStatsType) (api.ResourceList, error) { disk := resource.Quantity{Format: resource.BinarySI} + inodes := resource.Quantity{Format: resource.BinarySI} for _, container := range podStats.Containers { if hasFsStatsType(statsToMeasure, fsStatsRoot) { disk.Add(*diskUsage(container.Rootfs)) + inodes.Add(*inodeUsage(container.Rootfs)) } if hasFsStatsType(statsToMeasure, fsStatsLogs) { disk.Add(*diskUsage(container.Logs)) + inodes.Add(*inodeUsage(container.Logs)) } } if hasFsStatsType(statsToMeasure, fsStatsLocalVolumeSource) { @@ -328,13 +352,15 @@ func podDiskUsage(podStats statsapi.PodStats, pod *api.Pod, statsToMeasure []fsS for _, volumeStats := range podStats.VolumeStats { if volumeStats.Name == volumeName { disk.Add(*diskUsage(&volumeStats.FsStats)) + inodes.Add(*inodeUsage(&volumeStats.FsStats)) break } } } } return api.ResourceList{ - resourceDisk: disk, + resourceDisk: disk, + resourceInodes: inodes, }, nil } @@ -502,8 +528,8 @@ func memory(stats statsFunc) cmpFunc { } } -// disk compares pods by largest consumer of disk relative to request. -func disk(stats statsFunc, fsStatsToMeasure []fsStatsType) cmpFunc { +// disk compares pods by largest consumer of disk relative to request for the specified disk resource. +func disk(stats statsFunc, fsStatsToMeasure []fsStatsType, diskResource api.ResourceName) cmpFunc { return func(p1, p2 *api.Pod) int { p1Stats, found := stats(p1) // if we have no usage stats for p1, we want p2 first @@ -528,8 +554,8 @@ func disk(stats statsFunc, fsStatsToMeasure []fsStatsType) cmpFunc { // disk is best effort, so we don't measure relative to a request. // TODO: add disk as a guaranteed resource - p1Disk := p1Usage[resourceDisk] - p2Disk := p2Usage[resourceDisk] + p1Disk := p1Usage[diskResource] + p2Disk := p2Usage[diskResource] // if p2 is using more than p1, we want p2 first return p2Disk.Cmp(p1Disk) } @@ -541,9 +567,9 @@ func rankMemoryPressure(pods []*api.Pod, stats statsFunc) { } // rankDiskPressureFunc returns a rankFunc that measures the specified fs stats. 
-func rankDiskPressureFunc(fsStatsToMeasure []fsStatsType) rankFunc { +func rankDiskPressureFunc(fsStatsToMeasure []fsStatsType, diskResource api.ResourceName) rankFunc { return func(pods []*api.Pod, stats statsFunc) { - orderedBy(qosComparator, disk(stats, fsStatsToMeasure)).Sort(pods) + orderedBy(qosComparator, disk(stats, fsStatsToMeasure, diskResource)).Sort(pods) } } @@ -564,6 +590,7 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider) (signalObserv if err != nil { return nil, nil, err } + // build the function to work against for pod stats statsFunc := cachedStatsFunc(summary.Pods) // build an evaluation context for current eviction signals @@ -575,17 +602,33 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider) (signalObserv capacity: resource.NewQuantity(int64(*memory.AvailableBytes+*memory.WorkingSetBytes), resource.BinarySI), } } - if nodeFs := summary.Node.Fs; nodeFs != nil && nodeFs.AvailableBytes != nil && nodeFs.CapacityBytes != nil { - result[SignalNodeFsAvailable] = signalObservation{ - available: resource.NewQuantity(int64(*nodeFs.AvailableBytes), resource.BinarySI), - capacity: resource.NewQuantity(int64(*nodeFs.CapacityBytes), resource.BinarySI), + if nodeFs := summary.Node.Fs; nodeFs != nil { + if nodeFs.AvailableBytes != nil && nodeFs.CapacityBytes != nil { + result[SignalNodeFsAvailable] = signalObservation{ + available: resource.NewQuantity(int64(*nodeFs.AvailableBytes), resource.BinarySI), + capacity: resource.NewQuantity(int64(*nodeFs.CapacityBytes), resource.BinarySI), + } + } + if nodeFs.InodesFree != nil && nodeFs.Inodes != nil { + result[SignalNodeFsInodesFree] = signalObservation{ + available: resource.NewQuantity(int64(*nodeFs.InodesFree), resource.BinarySI), + capacity: resource.NewQuantity(int64(*nodeFs.Inodes), resource.BinarySI), + } } } if summary.Node.Runtime != nil { - if imageFs := summary.Node.Runtime.ImageFs; imageFs != nil && imageFs.AvailableBytes != nil && imageFs.CapacityBytes != nil { - result[SignalImageFsAvailable] = signalObservation{ - available: resource.NewQuantity(int64(*imageFs.AvailableBytes), resource.BinarySI), - capacity: resource.NewQuantity(int64(*imageFs.CapacityBytes), resource.BinarySI), + if imageFs := summary.Node.Runtime.ImageFs; imageFs != nil { + if imageFs.AvailableBytes != nil && imageFs.CapacityBytes != nil { + result[SignalImageFsAvailable] = signalObservation{ + available: resource.NewQuantity(int64(*imageFs.AvailableBytes), resource.BinarySI), + capacity: resource.NewQuantity(int64(*imageFs.CapacityBytes), resource.BinarySI), + } + if imageFs.InodesFree != nil && imageFs.Inodes != nil { + result[SignalImageFsInodesFree] = signalObservation{ + available: resource.NewQuantity(int64(*imageFs.InodesFree), resource.BinarySI), + capacity: resource.NewQuantity(int64(*imageFs.Inodes), resource.BinarySI), + } + } } } } @@ -785,16 +828,20 @@ func buildResourceToRankFunc(withImageFs bool) map[api.ResourceName]rankFunc { // usage of an imagefs is optional if withImageFs { // with an imagefs, nodefs pod rank func for eviction only includes logs and local volumes - resourceToRankFunc[resourceNodeFs] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}) + resourceToRankFunc[resourceNodeFs] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk) + resourceToRankFunc[resourceNodeFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes) // with an imagefs, imagefs pod rank func for eviction only 
includes rootfs - resourceToRankFunc[resourceImageFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}) + resourceToRankFunc[resourceImageFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, resourceDisk) + resourceToRankFunc[resourceImageFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, resourceInodes) } else { // without an imagefs, nodefs pod rank func for eviction looks at all fs stats - resourceToRankFunc[resourceNodeFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}) + resourceToRankFunc[resourceNodeFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk) + resourceToRankFunc[resourceNodeFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes) } return resourceToRankFunc } +// PodIsEvicted returns true if the reported pod status is due to an eviction. func PodIsEvicted(podStatus api.PodStatus) bool { return podStatus.Phase == api.PodFailed && podStatus.Reason == reason } @@ -806,11 +853,14 @@ func buildResourceToNodeReclaimFuncs(imageGC ImageGC, withImageFs bool) map[api. if withImageFs { // with an imagefs, nodefs pressure should just delete logs resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{deleteLogs()} + resourceToReclaimFunc[resourceNodeFsInodes] = nodeReclaimFuncs{deleteLogs()} // with an imagefs, imagefs pressure should delete unused images - resourceToReclaimFunc[resourceImageFs] = nodeReclaimFuncs{deleteImages(imageGC)} + resourceToReclaimFunc[resourceImageFs] = nodeReclaimFuncs{deleteImages(imageGC, true)} + resourceToReclaimFunc[resourceImageFsInodes] = nodeReclaimFuncs{deleteImages(imageGC, false)} } else { // without an imagefs, nodefs pressure should delete logs, and unused images - resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC)} + resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC, true)} + resourceToReclaimFunc[resourceNodeFsInodes] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC, false)} } return resourceToReclaimFunc } @@ -824,13 +874,17 @@ func deleteLogs() nodeReclaimFunc { } // deleteImages will delete unused images to free up disk pressure. 
-func deleteImages(imageGC ImageGC) nodeReclaimFunc { +func deleteImages(imageGC ImageGC, reportBytesFreed bool) nodeReclaimFunc { return func() (*resource.Quantity, error) { glog.Infof("eviction manager: attempting to delete unused images") - reclaimed, err := imageGC.DeleteUnusedImages() + bytesFreed, err := imageGC.DeleteUnusedImages() if err != nil { return nil, err } + reclaimed := int64(0) + if reportBytesFreed { + reclaimed = bytesFreed + } return resource.NewQuantity(reclaimed, resource.BinarySI), nil } } diff --git a/pkg/kubelet/eviction/helpers_test.go b/pkg/kubelet/eviction/helpers_test.go index fa2fbd418ea2b..df0aa0eb4bb20 100644 --- a/pkg/kubelet/eviction/helpers_test.go +++ b/pkg/kubelet/eviction/helpers_test.go @@ -191,6 +191,49 @@ func TestParseThresholdConfig(t *testing.T) { }, }, }, + "inode flag values": { + evictionHard: "imagefs.inodesFree<150Mi,nodefs.inodesFree<100Mi", + evictionSoft: "imagefs.inodesFree<300Mi,nodefs.inodesFree<200Mi", + evictionSoftGracePeriod: "imagefs.inodesFree=30s,nodefs.inodesFree=30s", + evictionMinReclaim: "imagefs.inodesFree=2Gi,nodefs.inodesFree=1Gi", + expectErr: false, + expectThresholds: []Threshold{ + { + Signal: SignalImageFsInodesFree, + Operator: OpLessThan, + Value: ThresholdValue{ + Quantity: quantityMustParse("150Mi"), + }, + MinReclaim: quantityMustParse("2Gi"), + }, + { + Signal: SignalNodeFsInodesFree, + Operator: OpLessThan, + Value: ThresholdValue{ + Quantity: quantityMustParse("100Mi"), + }, + MinReclaim: quantityMustParse("1Gi"), + }, + { + Signal: SignalImageFsInodesFree, + Operator: OpLessThan, + Value: ThresholdValue{ + Quantity: quantityMustParse("300Mi"), + }, + GracePeriod: gracePeriod, + MinReclaim: quantityMustParse("2Gi"), + }, + { + Signal: SignalNodeFsInodesFree, + Operator: OpLessThan, + Value: ThresholdValue{ + Quantity: quantityMustParse("200Mi"), + }, + GracePeriod: gracePeriod, + MinReclaim: quantityMustParse("1Gi"), + }, + }, + }, "invalid-signal": { evictionHard: "mem.available<150Mi", evictionSoft: "", @@ -400,7 +443,7 @@ func TestOrderedByDisk(t *testing.T) { return result, found } pods := []*api.Pod{pod1, pod2, pod3, pod4, pod5, pod6} - orderedBy(disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource})).Sort(pods) + orderedBy(disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)).Sort(pods) expected := []*api.Pod{pod6, pod5, pod4, pod3, pod2, pod1} for i := range expected { if pods[i] != expected[i] { @@ -466,7 +509,7 @@ func TestOrderedByQoSDisk(t *testing.T) { return result, found } pods := []*api.Pod{pod1, pod2, pod3, pod4, pod5, pod6} - orderedBy(qosComparator, disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource})).Sort(pods) + orderedBy(qosComparator, disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)).Sort(pods) expected := []*api.Pod{pod2, pod1, pod4, pod3, pod6, pod5} for i := range expected { if pods[i] != expected[i] { @@ -608,6 +651,10 @@ func TestMakeSignalObservations(t *testing.T) { imageFsCapacityBytes := uint64(1024 * 1024 * 2) nodeFsAvailableBytes := uint64(1024) nodeFsCapacityBytes := uint64(1024 * 2) + imageFsInodesFree := uint64(1024) + imageFsInodes := uint64(1024 * 1024) + nodeFsInodesFree := uint64(1024) + nodeFsInodes := uint64(1024 * 1024) fakeStats := &statsapi.Summary{ Node: statsapi.NodeStats{ Memory: &statsapi.MemoryStats{ @@ -618,11 +665,15 @@ func TestMakeSignalObservations(t *testing.T) { ImageFs: &statsapi.FsStats{ AvailableBytes: 
&imageFsAvailableBytes, CapacityBytes: &imageFsCapacityBytes, + InodesFree: &imageFsInodesFree, + Inodes: &imageFsInodes, }, }, Fs: &statsapi.FsStats{ AvailableBytes: &nodeFsAvailableBytes, CapacityBytes: &nodeFsCapacityBytes, + InodesFree: &nodeFsInodesFree, + Inodes: &nodeFsInodes, }, }, Pods: []statsapi.PodStats{}, @@ -664,6 +715,16 @@ func TestMakeSignalObservations(t *testing.T) { if expectedBytes := int64(nodeFsCapacityBytes); nodeFsQuantity.capacity.Value() != expectedBytes { t.Errorf("Expected %v, actual: %v", expectedBytes, nodeFsQuantity.capacity.Value()) } + nodeFsInodesQuantity, found := actualObservations[SignalNodeFsInodesFree] + if !found { + t.Errorf("Expected inodes free nodefs observation: %v", err) + } + if expected := int64(nodeFsInodesFree); nodeFsInodesQuantity.available.Value() != expected { + t.Errorf("Expected %v, actual: %v", expected, nodeFsInodesQuantity.available.Value()) + } + if expected := int64(nodeFsInodes); nodeFsInodesQuantity.capacity.Value() != expected { + t.Errorf("Expected %v, actual: %v", expected, nodeFsInodesQuantity.capacity.Value()) + } imageFsQuantity, found := actualObservations[SignalImageFsAvailable] if !found { t.Errorf("Expected available imagefs observation: %v", err) @@ -674,6 +735,16 @@ func TestMakeSignalObservations(t *testing.T) { if expectedBytes := int64(imageFsCapacityBytes); imageFsQuantity.capacity.Value() != expectedBytes { t.Errorf("Expected %v, actual: %v", expectedBytes, imageFsQuantity.capacity.Value()) } + imageFsInodesQuantity, found := actualObservations[SignalImageFsInodesFree] + if !found { + t.Errorf("Expected inodes free imagefs observation: %v", err) + } + if expected := int64(imageFsInodesFree); imageFsInodesQuantity.available.Value() != expected { + t.Errorf("Expected %v, actual: %v", expected, imageFsInodesQuantity.available.Value()) + } + if expected := int64(imageFsInodes); imageFsInodesQuantity.capacity.Value() != expected { + t.Errorf("Expected %v, actual: %v", expected, imageFsInodesQuantity.capacity.Value()) + } for _, pod := range pods { podStats, found := statsFunc(pod) if !found { @@ -1204,6 +1275,22 @@ func testCompareThresholdValue(t *testing.T) { } } +// newPodInodeStats returns stats with specified usage amounts. +// TODO: in future, this should take a value for inodesUsed per container. +func newPodInodeStats(pod *api.Pod) statsapi.PodStats { + result := statsapi.PodStats{ + PodRef: statsapi.PodReference{ + Name: pod.Name, Namespace: pod.Namespace, UID: string(pod.UID), + }, + } + for range pod.Spec.Containers { + result.Containers = append(result.Containers, statsapi.ContainerStats{ + Rootfs: &statsapi.FsStats{}, + }) + } + return result +} + // newPodDiskStats returns stats with specified usage amounts. func newPodDiskStats(pod *api.Pod, rootFsUsed, logsUsed, perLocalVolumeUsed resource.Quantity) statsapi.PodStats { result := statsapi.PodStats{ diff --git a/pkg/kubelet/eviction/types.go b/pkg/kubelet/eviction/types.go index 6984148e19472..3d85f44d2a1f2 100644 --- a/pkg/kubelet/eviction/types.go +++ b/pkg/kubelet/eviction/types.go @@ -32,8 +32,12 @@ const ( SignalMemoryAvailable Signal = "memory.available" // SignalNodeFsAvailable is amount of storage available on filesystem that kubelet uses for volumes, daemon logs, etc. SignalNodeFsAvailable Signal = "nodefs.available" + // SignalNodeFsInodesFree is amount of inodes available on filesystem that kubelet uses for volumes, daemon logs, etc. 
+	SignalNodeFsInodesFree Signal = "nodefs.inodesFree"
 	// SignalImageFsAvailable is amount of storage available on filesystem that container runtime uses for storing images and container writable layers.
 	SignalImageFsAvailable Signal = "imagefs.available"
+	// SignalImageFsInodesFree is amount of inodes available on filesystem that container runtime uses for storing images and container writable layers.
+	SignalImageFsInodesFree Signal = "imagefs.inodesFree"
 )

 // fsStatsType defines the types of filesystem stats to collect.
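
For reference, the new `nodefs.inodesFree` and `imagefs.inodesFree` signals are configured through the kubelet eviction flags described in the eviction proposal, in the same form exercised by the "inode flag values" case in TestParseThresholdConfig above. The threshold values below are illustrative only:

    --eviction-hard=nodefs.inodesFree<100Mi,imagefs.inodesFree<150Mi
    --eviction-soft=nodefs.inodesFree<200Mi,imagefs.inodesFree<300Mi
    --eviction-soft-grace-period=nodefs.inodesFree=30s,imagefs.inodesFree=30s
    --eviction-minimum-reclaim=nodefs.inodesFree=1Gi,imagefs.inodesFree=2Gi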