WIP: kubelet eviction on inode exhaustion
derekwaynecarr committed Aug 9, 2016
1 parent b39cde3 commit 0a7f8ad
Showing 3 changed files with 106 additions and 21 deletions.
84 changes: 65 additions & 19 deletions pkg/kubelet/eviction/helpers.go
@@ -41,10 +41,16 @@ const (
message = "The node was low on compute resources."
// disk, in bytes. internal to this module, used to account for local disk usage.
resourceDisk api.ResourceName = "disk"
// inodes, number. internal to this module, used to account for local disk inode consumption.
resourceInodes api.ResourceName = "inodes"
// imagefs, in bytes. internal to this module, used to account for local image filesystem usage.
resourceImageFs api.ResourceName = "imagefs"
// imagefs inodes, number. internal to this module, used to account for local image filesystem inodes.
resourceImageFsInodes api.ResourceName = "imagefsInodes"
// nodefs, in bytes. internal to this module, used to account for local node root filesystem usage.
resourceNodeFs api.ResourceName = "nodefs"
// nodefs inodes, number. internal to this module, used to account for local node root filesystem inodes.
resourceNodeFsInodes api.ResourceName = "nodefsInodes"
)

var (
@@ -62,12 +68,16 @@ func init() {
signalToNodeCondition[SignalMemoryAvailable] = api.NodeMemoryPressure
signalToNodeCondition[SignalImageFsAvailable] = api.NodeDiskPressure
signalToNodeCondition[SignalNodeFsAvailable] = api.NodeDiskPressure
signalToNodeCondition[SignalImageFsInodesFree] = api.NodeDiskPressure
signalToNodeCondition[SignalNodeFsInodesFree] = api.NodeDiskPressure

// map signals to resources (and vice-versa)
signalToResource = map[Signal]api.ResourceName{}
signalToResource[SignalMemoryAvailable] = api.ResourceMemory
signalToResource[SignalImageFsAvailable] = resourceImageFs
signalToResource[SignalImageFsInodesFree] = resourceImageFsInodes
signalToResource[SignalNodeFsAvailable] = resourceNodeFs
signalToResource[SignalNodeFsInodesFree] = resourceNodeFsInodes
resourceToSignal = map[api.ResourceName]Signal{}
for key, value := range signalToResource {
resourceToSignal[value] = key
@@ -258,6 +268,15 @@ func diskUsage(fsStats *statsapi.FsStats) *resource.Quantity {
return resource.NewQuantity(usage, resource.BinarySI)
}

// inodeUsage converts inodes consumed into a resource quantity.
func inodeUsage(fsStats *statsapi.FsStats) *resource.Quantity {
if fsStats == nil || fsStats.InodesFree == nil || fsStats.Inodes == nil {
return &resource.Quantity{Format: resource.BinarySI}
}
usage := int64(*fsStats.Inodes) - int64(*fsStats.InodesFree)
return resource.NewQuantity(usage, resource.BinarySI)
}
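
Note on the helper above: inode usage is simply total inodes minus free inodes, and a missing stat yields an empty quantity rather than an error. A minimal sketch (illustrative only, not part of this commit; assumes the package's existing statsapi and glog imports):

func exampleInodeUsage() {
	total, free := uint64(1000000), uint64(750000)
	fs := &statsapi.FsStats{Inodes: &total, InodesFree: &free}
	// 1,000,000 total - 750,000 free = 250,000 inodes consumed
	glog.Infof("inodes used: %d", inodeUsage(fs).Value())
}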

// memoryUsage converts working set into a resource quantity.
func memoryUsage(memStats *statsapi.MemoryStats) *resource.Quantity {
if memStats == nil || memStats.WorkingSetBytes == nil {
@@ -282,15 +301,18 @@ func localVolumeNames(pod *api.Pod) []string {
return result
}

// podDiskUsage aggregates pod disk usage for the specified stats to measure.
// podDiskUsage aggregates pod disk usage and inode consumption for the specified stats to measure.
func podDiskUsage(podStats statsapi.PodStats, pod *api.Pod, statsToMeasure []fsStatsType) (api.ResourceList, error) {
disk := resource.Quantity{Format: resource.BinarySI}
inodes := resource.Quantity{Format: resource.BinarySI}
for _, container := range podStats.Containers {
if hasFsStatsType(statsToMeasure, fsStatsRoot) {
disk.Add(*diskUsage(container.Rootfs))
inodes.Add(*inodeUsage(container.Rootfs))
}
if hasFsStatsType(statsToMeasure, fsStatsLogs) {
disk.Add(*diskUsage(container.Logs))
inodes.Add(*inodeUsage(container.Logs))
}
}
if hasFsStatsType(statsToMeasure, fsStatsLocalVolumeSource) {
@@ -299,13 +321,15 @@ func podDiskUsage(podStats statsapi.PodStats, pod *api.Pod, statsToMeasure []fsS
for _, volumeStats := range podStats.VolumeStats {
if volumeStats.Name == volumeName {
disk.Add(*diskUsage(&volumeStats.FsStats))
inodes.Add(*inodeUsage(&volumeStats.FsStats))
break
}
}
}
}
return api.ResourceList{
resourceDisk: disk,
resourceDisk: disk,
resourceInodes: inodes,
}, nil
}
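
The aggregate now carries both byte and inode totals under the module-internal resource names. A sketch (not part of this commit) of reading both values back, assuming a node without a dedicated imagefs:

func examplePodUsage(podStats statsapi.PodStats, pod *api.Pod) {
	usage, err := podDiskUsage(podStats, pod, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource})
	if err != nil {
		return
	}
	disk := usage[resourceDisk]
	inodes := usage[resourceInodes]
	glog.Infof("pod local storage: disk %s, inodes %s", disk.String(), inodes.String())
}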

@@ -465,8 +489,8 @@ func memory(stats statsFunc) cmpFunc {
}
}

// disk compares pods by largest consumer of disk relative to request.
func disk(stats statsFunc, fsStatsToMeasure []fsStatsType) cmpFunc {
// disk compares pods by largest consumer of disk relative to request for the specified disk resource.
func disk(stats statsFunc, fsStatsToMeasure []fsStatsType, diskResource api.ResourceName) cmpFunc {
return func(p1, p2 *api.Pod) int {
p1Stats, found := stats(p1)
// if we have no usage stats for p1, we want p2 first
@@ -491,8 +515,8 @@ func disk(stats statsFunc, fsStatsToMeasure []fsStatsType) cmpFunc {

// disk is best effort, so we don't measure relative to a request.
// TODO: add disk as a guaranteed resource
p1Disk := p1Usage[resourceDisk]
p2Disk := p2Usage[resourceDisk]
p1Disk := p1Usage[diskResource]
p2Disk := p2Usage[diskResource]
// if p2 is using more than p1, we want p2 first
return p2Disk.Cmp(p1Disk)
}
@@ -504,9 +528,9 @@ func rankMemoryPressure(pods []*api.Pod, stats statsFunc) {
}

// rankDiskPressureFunc returns a rankFunc that measures the specified fs stats.
func rankDiskPressureFunc(fsStatsToMeasure []fsStatsType) rankFunc {
func rankDiskPressureFunc(fsStatsToMeasure []fsStatsType, diskResource api.ResourceName) rankFunc {
return func(pods []*api.Pod, stats statsFunc) {
orderedBy(qosComparator, disk(stats, fsStatsToMeasure)).Sort(pods)
orderedBy(qosComparator, disk(stats, fsStatsToMeasure, diskResource)).Sort(pods)
}
}
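
With the extra resource parameter, the same comparator serves byte-based and inode-based ranking. A sketch (not part of this commit; activePods and statsFn are assumed to be in scope) of ranking pods for nodefs inode pressure when a dedicated imagefs exists:

rank := rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
rank(activePods, statsFn)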

@@ -535,12 +559,23 @@ func makeSignalObservations(summaryProvider stats.SummaryProvider) (signalObserv
if memory := summary.Node.Memory; memory != nil && memory.AvailableBytes != nil {
result[SignalMemoryAvailable] = resource.NewQuantity(int64(*memory.AvailableBytes), resource.BinarySI)
}
if nodeFs := summary.Node.Fs; nodeFs != nil && nodeFs.AvailableBytes != nil {
result[SignalNodeFsAvailable] = resource.NewQuantity(int64(*nodeFs.AvailableBytes), resource.BinarySI)
if nodeFs := summary.Node.Fs; nodeFs != nil {
if nodeFs.AvailableBytes != nil {
result[SignalNodeFsAvailable] = resource.NewQuantity(int64(*nodeFs.AvailableBytes), resource.BinarySI)
}
if nodeFs.InodesFree != nil {
result[SignalNodeFsInodesFree] = resource.NewQuantity(int64(*nodeFs.InodesFree), resource.BinarySI)
}
}

if summary.Node.Runtime != nil {
if imageFs := summary.Node.Runtime.ImageFs; imageFs != nil && imageFs.AvailableBytes != nil {
result[SignalImageFsAvailable] = resource.NewQuantity(int64(*imageFs.AvailableBytes), resource.BinarySI)
if imageFs := summary.Node.Runtime.ImageFs; imageFs != nil {
if imageFs.AvailableBytes != nil {
result[SignalImageFsAvailable] = resource.NewQuantity(int64(*imageFs.AvailableBytes), resource.BinarySI)
}
if imageFs.InodesFree != nil {
result[SignalImageFsInodesFree] = resource.NewQuantity(int64(*imageFs.InodesFree), resource.BinarySI)
}
}
}
return result, statsFunc, nil
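
A sketch (not part of this commit; summaryProvider assumed to be in scope) of inspecting one of the new inode observations:

obs, _, err := makeSignalObservations(summaryProvider)
if err == nil {
	if free, found := obs[SignalNodeFsInodesFree]; found {
		glog.Infof("nodefs inodes free: %s", free.String())
	}
}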
@@ -717,16 +752,20 @@ func buildResourceToRankFunc(withImageFs bool) map[api.ResourceName]rankFunc {
// usage of an imagefs is optional
if withImageFs {
// with an imagefs, nodefs pod rank func for eviction only includes logs and local volumes
resourceToRankFunc[resourceNodeFs] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource})
resourceToRankFunc[resourceNodeFs] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)
resourceToRankFunc[resourceNodeFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
// with an imagefs, imagefs pod rank func for eviction only includes rootfs
resourceToRankFunc[resourceImageFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot})
resourceToRankFunc[resourceImageFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, resourceDisk)
resourceToRankFunc[resourceImageFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot}, resourceInodes)
} else {
// without an imagefs, nodefs pod rank func for eviction looks at all fs stats
resourceToRankFunc[resourceNodeFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource})
resourceToRankFunc[resourceNodeFs] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)
resourceToRankFunc[resourceNodeFsInodes] = rankDiskPressureFunc([]fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceInodes)
}
return resourceToRankFunc
}
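
A sketch (not part of this commit; activePods and statsFn assumed to be in scope) of how the eviction manager would select the ranking for inode pressure on the node root filesystem when an imagefs exists:

rankFuncs := buildResourceToRankFunc(true)
rankFuncs[resourceNodeFsInodes](activePods, statsFn)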

// PodIsEvicted returns true if the reported pod status is due to an eviction.
func PodIsEvicted(podStatus api.PodStatus) bool {
return podStatus.Phase == api.PodFailed && podStatus.Reason == reason
}
@@ -738,11 +777,14 @@ func buildResourceToNodeReclaimFuncs(imageGC ImageGC, withImageFs bool) map[api.
if withImageFs {
// with an imagefs, nodefs pressure should just delete logs
resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{deleteLogs()}
resourceToReclaimFunc[resourceNodeFsInodes] = nodeReclaimFuncs{deleteLogs()}
// with an imagefs, imagefs pressure should delete unused images
resourceToReclaimFunc[resourceImageFs] = nodeReclaimFuncs{deleteImages(imageGC)}
resourceToReclaimFunc[resourceImageFs] = nodeReclaimFuncs{deleteImages(imageGC, true)}
resourceToReclaimFunc[resourceImageFsInodes] = nodeReclaimFuncs{deleteImages(imageGC, false)}
} else {
// without an imagefs, nodefs pressure should delete logs, and unused images
resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC)}
resourceToReclaimFunc[resourceNodeFs] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC, true)}
resourceToReclaimFunc[resourceNodeFsInodes] = nodeReclaimFuncs{deleteLogs(), deleteImages(imageGC, false)}
}
return resourceToReclaimFunc
}
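
A sketch (not part of this commit; imageGC assumed to be in scope) of running node-level reclaim for nodefs inode pressure without an imagefs, where logs are deleted and then unused images are pruned before any pod is evicted:

for _, reclaim := range buildResourceToNodeReclaimFuncs(imageGC, false)[resourceNodeFsInodes] {
	if freed, err := reclaim(); err == nil {
		glog.Infof("node-level reclaim freed %s", freed.String())
	}
}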
Expand All @@ -756,13 +798,17 @@ func deleteLogs() nodeReclaimFunc {
}

// deleteImages will delete unused images to free up disk pressure.
func deleteImages(imageGC ImageGC) nodeReclaimFunc {
func deleteImages(imageGC ImageGC, reportBytesFreed bool) nodeReclaimFunc {
return func() (*resource.Quantity, error) {
glog.Infof("eviction manager: attempting to delete unused images")
reclaimed, err := imageGC.DeleteUnusedImages()
bytesFreed, err := imageGC.DeleteUnusedImages()
if err != nil {
return nil, err
}
reclaimed := int64(0)
if reportBytesFreed {
reclaimed = bytesFreed
}
return resource.NewQuantity(reclaimed, resource.BinarySI), nil
}
}
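
The new boolean exists because image GC reports bytes freed; for inode signals that number says nothing about inodes reclaimed, so image pruning is still attempted but its reclaim is reported as zero. A sketch (not part of this commit; imageGC assumed to be in scope):

reclaimForInodes := deleteImages(imageGC, false)
if freed, err := reclaimForInodes(); err == nil {
	glog.Infof("reported reclaim: %s", freed.String()) // always "0", even if images were pruned
}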
39 changes: 37 additions & 2 deletions pkg/kubelet/eviction/helpers_test.go
@@ -110,6 +110,41 @@ func TestParseThresholdConfig(t *testing.T) {
},
},
},
"inode flag values": {
evictionHard: "imagefs.inodesFree<150Mi,nodefs.inodesFree<100Mi",
evictionSoft: "imagefs.inodesFree<300Mi,nodefs.inodesFree<200Mi",
evictionSoftGracePeriod: "imagefs.inodesFree=30s,nodefs.inodesFree=30s",
evictionMinReclaim: "imagefs.inodesFree=2Gi,nodefs.inodesFree=1Gi",
expectErr: false,
expectThresholds: []Threshold{
{
Signal: SignalImageFsInodesFree,
Operator: OpLessThan,
Value: quantityMustParse("150Mi"),
MinReclaim: quantityMustParse("2Gi"),
},
{
Signal: SignalNodeFsInodesFree,
Operator: OpLessThan,
Value: quantityMustParse("100Mi"),
MinReclaim: quantityMustParse("1Gi"),
},
{
Signal: SignalImageFsInodesFree,
Operator: OpLessThan,
Value: quantityMustParse("300Mi"),
GracePeriod: gracePeriod,
MinReclaim: quantityMustParse("2Gi"),
},
{
Signal: SignalNodeFsInodesFree,
Operator: OpLessThan,
Value: quantityMustParse("200Mi"),
GracePeriod: gracePeriod,
MinReclaim: quantityMustParse("1Gi"),
},
},
},
"invalid-signal": {
evictionHard: "mem.available<150Mi",
evictionSoft: "",
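
The "inode flag values" case above mirrors how an operator would express inode thresholds on the kubelet command line; a hedged sketch, assuming the existing eviction flags accept the new inodesFree signals once this WIP lands:

--eviction-hard=imagefs.inodesFree<150Mi,nodefs.inodesFree<100Mi
--eviction-soft=imagefs.inodesFree<300Mi,nodefs.inodesFree<200Mi
--eviction-soft-grace-period=imagefs.inodesFree=30s,nodefs.inodesFree=30s
--eviction-minimum-reclaim=imagefs.inodesFree=2Gi,nodefs.inodesFree=1Gi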
@@ -311,7 +346,7 @@ func TestOrderedByDisk(t *testing.T) {
return result, found
}
pods := []*api.Pod{pod1, pod2, pod3, pod4, pod5, pod6}
orderedBy(disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource})).Sort(pods)
orderedBy(disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)).Sort(pods)
expected := []*api.Pod{pod6, pod5, pod4, pod3, pod2, pod1}
for i := range expected {
if pods[i] != expected[i] {
@@ -377,7 +412,7 @@ func TestOrderedByQoSDisk(t *testing.T) {
return result, found
}
pods := []*api.Pod{pod1, pod2, pod3, pod4, pod5, pod6}
orderedBy(qosComparator, disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource})).Sort(pods)
orderedBy(qosComparator, disk(statsFn, []fsStatsType{fsStatsRoot, fsStatsLogs, fsStatsLocalVolumeSource}, resourceDisk)).Sort(pods)
expected := []*api.Pod{pod2, pod1, pod4, pod3, pod6, pod5}
for i := range expected {
if pods[i] != expected[i] {
4 changes: 4 additions & 0 deletions pkg/kubelet/eviction/types.go
@@ -32,8 +32,12 @@ const (
SignalMemoryAvailable Signal = "memory.available"
// SignalNodeFsAvailable is amount of storage available on filesystem that kubelet uses for volumes, daemon logs, etc.
SignalNodeFsAvailable Signal = "nodefs.available"
// SignalNodeFsInodesFree is number of inodes available on filesystem that kubelet uses for volumes, daemon logs, etc.
SignalNodeFsInodesFree Signal = "nodefs.inodesFree"
// SignalImageFsAvailable is amount of storage available on filesystem that container runtime uses for storing images and container writable layers.
SignalImageFsAvailable Signal = "imagefs.available"
// SignalImageFsInodesFree is number of inodes available on filesystem that container runtime uses for storing images and container writable layers.
SignalImageFsInodesFree Signal = "imagefs.inodesFree"
)

// fsStatsType defines the types of filesystem stats to collect.
