Skip to content

Commit

Permalink
node: device-mgr: explicitly check if pre-allocated devices are healthy
Browse files Browse the repository at this point in the history
Signed-off-by: Swati Sehgal <swsehgal@redhat.com>
  • Loading branch information
swatisehgal committed Mar 6, 2023
1 parent a799ffb commit 5b2a3db
Show file tree
Hide file tree
Showing 2 changed files with 24 additions and 0 deletions.
5 changes: 5 additions & 0 deletions pkg/kubelet/cm/devicemanager/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -558,6 +558,11 @@ func (m *ManagerImpl) devicesToAllocate(podUID, contName, resource string, requi
return nil, fmt.Errorf("can't allocate unhealthy devices %s", resource)
}

// Check if all the previously allocated devices are healthy
if !healthyDevices.IsSuperset(devices) {
return nil, fmt.Errorf("previously allocated devices are no longer healthy; can't allocate unhealthy devices %s", resource)
}

if needed == 0 {
// No change, no work.
return nil, nil
Expand Down
19 changes: 19 additions & 0 deletions pkg/kubelet/cm/devicemanager/manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -988,6 +988,7 @@ func TestPodContainerDeviceAllocation(t *testing.T) {
func TestPodContainerDeviceToAllocate(t *testing.T) {
resourceName1 := "domain1.com/resource1"
resourceName2 := "domain2.com/resource2"
resourceName3 := "domain2.com/resource3"
as := require.New(t)
tmpDir, err := os.MkdirTemp("", "checkpoint")
as.Nil(err)
Expand All @@ -1010,11 +1011,19 @@ func TestPodContainerDeviceToAllocate(t *testing.T) {
checkpoint.DevicesPerNUMA{nodeWithoutTopology: []string{"dev5"}},
constructAllocResp(map[string]string{"/dev/r1dev5": "/dev/r1dev5"},
map[string]string{"/home/r1lib1": "/usr/r1lib1"}, map[string]string{}))
testManager.podDevices.insert("pod3", "con3", resourceName3,
checkpoint.DevicesPerNUMA{nodeWithoutTopology: []string{"dev5"}},
constructAllocResp(map[string]string{"/dev/r1dev5": "/dev/r1dev5"},
map[string]string{"/home/r1lib1": "/usr/r1lib1"}, map[string]string{}))

// no healthy devices for resourceName1 and devices corresponding to
// resource2 are intentionally omitted to simulate that the resource
// hasn't been registered.
testManager.healthyDevices[resourceName1] = sets.NewString()
testManager.healthyDevices[resourceName3] = sets.NewString()
// dev5 is no longer in the list of healthy devices
testManager.healthyDevices[resourceName3].Insert("dev7")
testManager.healthyDevices[resourceName3].Insert("dev8")

testCases := []struct {
description string
Expand Down Expand Up @@ -1046,6 +1055,16 @@ func TestPodContainerDeviceToAllocate(t *testing.T) {
expectedAllocatedDevices: nil,
expErr: fmt.Errorf("can't allocate unregistered device %s", resourceName2),
},
{
description: "Admission error in case resource not devices previously allocated no longer healthy",
podUID: "pod3",
contName: "con3",
resource: resourceName3,
required: 1,
reusableDevices: sets.NewString(),
expectedAllocatedDevices: nil,
expErr: fmt.Errorf("previously allocated devices are no longer healthy; can't allocate unhealthy devices %s", resourceName3),
},
}

for _, testCase := range testCases {
Expand Down

0 comments on commit 5b2a3db

Please sign in to comment.