Skip to content

Commit

Permalink
feat(setting): lazy update for volume-related settings
Browse files Browse the repository at this point in the history
Delete longhorn components with best efforts when danger zone
settings updated.

For settings `taint-toleration`, `guaranteed-instance-manager-cpu`,
`system-managed-components-node-selector` and `priority-class`,
users can customize the settings but the settings will only take
effect when all volumes are detached.

Ref: 7173

Signed-off-by: James Lu <james.lu@suse.com>
Signed-off-by: davidko <dko@suse.com>
  • Loading branch information
mantissahz authored and innobead committed Jan 11, 2024
1 parent 01ee838 commit 273b6c9
Show file tree
Hide file tree
Showing 10 changed files with 337 additions and 129 deletions.
2 changes: 2 additions & 0 deletions controller/controller_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,7 @@ func ParseResourceRequirement(val string) (*corev1.ResourceRequirements, error)
}, nil
}

// GetInstanceManagerCPURequirement returns the instance manager CPU requirement
func GetInstanceManagerCPURequirement(ds *datastore.DataStore, imName string) (*corev1.ResourceRequirements, error) {
im, err := ds.GetInstanceManager(imName)
if err != nil {
Expand Down Expand Up @@ -210,6 +211,7 @@ func EnhancedDefaultControllerRateLimiter() workqueue.RateLimiter {
)
}

// IsSameGuaranteedCPURequirement returns true if the resource requirement a is equal to the resource requirement b
func IsSameGuaranteedCPURequirement(a, b *corev1.ResourceRequirements) bool {
var aQ, bQ resource.Quantity
if a != nil && a.Requests != nil {
Expand Down
137 changes: 133 additions & 4 deletions controller/instance_manager_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,15 @@ func (imc *InstanceManagerController) handlePod(im *longhorn.InstanceManager) er
return err
}

if im.Status.CurrentState != longhorn.InstanceManagerStateError && im.Status.CurrentState != longhorn.InstanceManagerStateStopped {
isSettingSynced, isPodDeletedOrNotRunning, areInstancesRunningInPod, err := imc.areDangerZoneSettingsSyncedToIMPod(im)
if err != nil {
return err
}

isPodDeletionNotRequired := isSettingSynced || areInstancesRunningInPod || isPodDeletedOrNotRunning
if im.Status.CurrentState != longhorn.InstanceManagerStateError &&
im.Status.CurrentState != longhorn.InstanceManagerStateStopped &&
isPodDeletionNotRequired {
return nil
}

Expand All @@ -466,15 +474,14 @@ func (imc *InstanceManagerController) handlePod(im *longhorn.InstanceManager) er
}

// Since `spec.nodeName` is specified during the pod creation,
// the node cordon can not prevent the pod being launched.
if unschedulable, err := imc.ds.IsKubeNodeUnschedulable(im.Spec.NodeID); unschedulable || err != nil {
// the node cordon cannot prevent the pod being launched.
if unscheduled, err := imc.ds.IsKubeNodeUnschedulable(im.Spec.NodeID); unscheduled || err != nil {
return err
}

if err := imc.createInstanceManagerPod(im); err != nil {
return err
}
// The instance manager state will be updated in the next reconcile loop.

return nil
}
Expand Down Expand Up @@ -515,6 +522,128 @@ func (imc *InstanceManagerController) annotateCASafeToEvict(im *longhorn.Instanc
return nil
}

func (imc *InstanceManagerController) areDangerZoneSettingsSyncedToIMPod(im *longhorn.InstanceManager) (isSynced, isPodDeletedOrNotRunning, areInstancesRunningInPod bool, err error) {
if im.Status.CurrentState != longhorn.InstanceManagerStateRunning {
return false, true, false, nil
}

// nolint:all
for _, instance := range types.ConsolidateInstances(im.Status.InstanceEngines, im.Status.InstanceReplicas, im.Status.Instances) {
if instance.Status.State == longhorn.InstanceStateRunning || instance.Status.State == longhorn.InstanceStateStarting {
return false, false, true, nil
}
}

pod, err := imc.ds.GetPod(im.Name)
if err != nil {
return false, false, false, errors.Wrapf(err, "cannot get pod for instance manager %v", im.Name)
}
if pod == nil {
return false, true, false, nil
}

for settingName := range types.GetDangerZoneSettings() {
isSettingSynced := true
setting, err := imc.ds.GetSettingWithAutoFillingRO(settingName)
if err != nil {
return false, false, false, err
}
switch settingName {
case types.SettingNameTaintToleration:
isSettingSynced, err = imc.isSettingTaintTolerationSynced(setting, pod)
case types.SettingNameSystemManagedComponentsNodeSelector:
isSettingSynced, err = imc.isSettingNodeSelectorSynced(setting, pod)
case types.SettingNameGuaranteedInstanceManagerCPU, types.SettingNameV2DataEngineGuaranteedInstanceManagerCPU:
isSettingSynced, err = imc.isSettingGuaranteedInstanceManagerCPUSynced(setting, pod)
case types.SettingNamePriorityClass:
isSettingSynced, err = imc.isSettingPriorityClassSynced(setting, pod)
case types.SettingNameStorageNetwork:
isSettingSynced, err = imc.isSettingStorageNetworkSynced(setting, pod)
case types.SettingNameV1DataEngine, types.SettingNameV2DataEngine:
isSettingSynced, err = imc.isSettingDataEngineSynced(settingName, im)
}
if err != nil {
return false, false, false, err
}
if !isSettingSynced {
return false, false, false, nil
}
}

return true, false, false, nil
}

func (imc *InstanceManagerController) isSettingTaintTolerationSynced(setting *longhorn.Setting, pod *corev1.Pod) (bool, error) {
newTolerationsList, err := types.UnmarshalTolerations(setting.Value)
if err != nil {
return false, err
}
newTolerationsMap := util.TolerationListToMap(newTolerationsList)
lastAppliedTolerations, err := getLastAppliedTolerationsList(pod)
if err != nil {
return false, err
}

return reflect.DeepEqual(util.TolerationListToMap(lastAppliedTolerations), newTolerationsMap), nil
}

func (imc *InstanceManagerController) isSettingNodeSelectorSynced(setting *longhorn.Setting, pod *corev1.Pod) (bool, error) {
newNodeSelector, err := types.UnmarshalNodeSelector(setting.Value)
if err != nil {
return false, err
}
if pod.Spec.NodeSelector == nil && len(newNodeSelector) == 0 {
return true, nil
}

return reflect.DeepEqual(pod.Spec.NodeSelector, newNodeSelector), nil
}

func (imc *InstanceManagerController) isSettingGuaranteedInstanceManagerCPUSynced(setting *longhorn.Setting, pod *corev1.Pod) (bool, error) {
lhNode, err := imc.ds.GetNode(pod.Spec.NodeName)
if err != nil {
return false, err
}
if types.GetCondition(lhNode.Status.Conditions, longhorn.NodeConditionTypeReady).Status != longhorn.ConditionStatusTrue {
return true, nil
}

resourceReq, err := GetInstanceManagerCPURequirement(imc.ds, pod.Name)
if err != nil {
return false, err
}
podResourceReq := pod.Spec.Containers[0].Resources
return IsSameGuaranteedCPURequirement(resourceReq, &podResourceReq), nil
}

func (imc *InstanceManagerController) isSettingPriorityClassSynced(setting *longhorn.Setting, pod *corev1.Pod) (bool, error) {
return pod.Spec.PriorityClassName == setting.Value, nil
}

func (imc *InstanceManagerController) isSettingStorageNetworkSynced(setting *longhorn.Setting, pod *corev1.Pod) (bool, error) {
nadAnnot := string(types.CNIAnnotationNetworks)

return pod.Annotations[nadAnnot] == setting.Value, nil
}

func (imc *InstanceManagerController) isSettingDataEngineSynced(settingName types.SettingName, im *longhorn.InstanceManager) (bool, error) {
enabled, err := imc.ds.GetSettingAsBool(settingName)
if err != nil {
return false, errors.Wrapf(err, "failed to get %v setting for updating data engine", settingName)
}
var dataEngine longhorn.DataEngineType
switch settingName {
case types.SettingNameV1DataEngine:
dataEngine = longhorn.DataEngineTypeV1
case types.SettingNameV2DataEngine:
dataEngine = longhorn.DataEngineTypeV2
}
if !enabled && im.Spec.DataEngine == dataEngine {
return false, nil
}
return true, nil
}

func (imc *InstanceManagerController) syncInstanceManagerAPIVersion(im *longhorn.InstanceManager) error {
// Avoid changing API versions when InstanceManagers are state Unknown.
// Then once required (in the future), the monitor could still talk with the pod and update processes in some corner cases. e.g., kubelet restart.
Expand Down
7 changes: 7 additions & 0 deletions controller/instance_manager_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ import (

corev1 "k8s.io/api/core/v1"
apiextensionsfake "k8s.io/apiextensions-apiserver/pkg/client/clientset/clientset/fake"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

"github.com/longhorn/longhorn-manager/datastore"
Expand Down Expand Up @@ -272,6 +273,12 @@ func (s *TestSuite) TestSyncInstanceManager(c *C) {

if tc.currentPodStatus != nil {
pod := newPod(tc.currentPodStatus, im.Name, im.Namespace, im.Spec.NodeID)
var containers []corev1.Container
containers = append(containers, corev1.Container{
Name: "instance-manager",
Resources: corev1.ResourceRequirements{Requests: corev1.ResourceList{"cpu": resource.MustParse("480m")}}},
)
pod.Spec.Containers = containers
err = pIndexer.Add(pod)
c.Assert(err, IsNil)
_, err = kubeClient.CoreV1().Pods(im.Namespace).Create(context.TODO(), pod, metav1.CreateOptions{})
Expand Down
4 changes: 2 additions & 2 deletions controller/node_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ import (
"github.com/longhorn/longhorn-manager/types"
"github.com/longhorn/longhorn-manager/util"

monitor "github.com/longhorn/longhorn-manager/controller/monitor"
"github.com/longhorn/longhorn-manager/controller/monitor"
longhorn "github.com/longhorn/longhorn-manager/k8s/pkg/apis/longhorn/v1beta2"
)

Expand Down Expand Up @@ -1003,7 +1003,7 @@ func (nc *NodeController) getImTypeDataEngines(node *longhorn.Node) map[longhorn
dataEngines[longhorn.InstanceManagerTypeReplica] = append(dataEngines[longhorn.InstanceManagerTypeReplica], longhorn.DataEngineTypeV1)
}
case types.SettingNameV2DataEngine:
if err := nc.ds.ValidateV2DataEngineEnabled(enabled); err == nil {
if _, err := nc.ds.ValidateV2DataEngineEnabled(enabled); err == nil {
dataEngines[longhorn.InstanceManagerTypeAllInOne] = append(dataEngines[longhorn.InstanceManagerTypeAllInOne], longhorn.DataEngineTypeV2)
} else {
log.WithError(err).Warnf("Failed to validate %v setting", types.SettingNameV2DataEngine)
Expand Down
Loading

0 comments on commit 273b6c9

Please sign in to comment.