Implement Cluster TopologyReconciled v1beta2 condition
Signed-off-by: Stefan Büringer buringerst@vmware.com
sbueringer committed Nov 8, 2024
1 parent 0f319c6 commit fee8b63
Showing 4 changed files with 146 additions and 22 deletions.
53 changes: 53 additions & 0 deletions api/v1beta1/cluster_types.go
@@ -56,6 +56,59 @@ const (
// ClusterTopologyReconciledV1Beta2Condition is true if the topology controller is working properly.
// Note: This condition is added only if the Cluster is referencing a ClusterClass / defining a managed Topology.
ClusterTopologyReconciledV1Beta2Condition = "TopologyReconciled"

// ClusterTopologyReconcileSucceededV1Beta2Reason documents that the reconciliation of a Cluster topology succeeded.
ClusterTopologyReconcileSucceededV1Beta2Reason = "TopologyReconcileSucceeded"

// ClusterTopologyReconciledFailedV1Beta2Reason documents that the reconciliation of a Cluster topology
// failed due to an error.
ClusterTopologyReconciledFailedV1Beta2Reason = "TopologyReconcileFailed"

// ClusterTopologyReconciledControlPlaneUpgradePendingV1Beta2Reason documents reconciliation of a Cluster topology
// not yet completed because the Control Plane is not yet updated to match the desired topology spec.
ClusterTopologyReconciledControlPlaneUpgradePendingV1Beta2Reason = "ControlPlaneUpgradePending"

// ClusterTopologyReconciledMachineDeploymentsCreatePendingV1Beta2Reason documents reconciliation of a Cluster topology
// not yet completed because at least one of the MachineDeployments is yet to be created.
// This generally happens because new MachineDeployment creations are held off while the ControlPlane is not stable.
ClusterTopologyReconciledMachineDeploymentsCreatePendingV1Beta2Reason = "MachineDeploymentsCreatePending"

// ClusterTopologyReconciledMachineDeploymentsUpgradePendingV1Beta2Reason documents reconciliation of a Cluster topology
// not yet completed because at least one of the MachineDeployments is not yet updated to match the desired topology spec.
ClusterTopologyReconciledMachineDeploymentsUpgradePendingV1Beta2Reason = "MachineDeploymentsUpgradePending"

// ClusterTopologyReconciledMachineDeploymentsUpgradeDeferredV1Beta2Reason documents reconciliation of a Cluster topology
// not yet completed because the upgrade for at least one of the MachineDeployments has been deferred.
ClusterTopologyReconciledMachineDeploymentsUpgradeDeferredV1Beta2Reason = "MachineDeploymentsUpgradeDeferred"

// ClusterTopologyReconciledMachinePoolsUpgradePendingV1Beta2Reason documents reconciliation of a Cluster topology
// not yet completed because at least one of the MachinePools is not yet updated to match the desired topology spec.
ClusterTopologyReconciledMachinePoolsUpgradePendingV1Beta2Reason = "MachinePoolsUpgradePending"

// ClusterTopologyReconciledMachinePoolsCreatePendingV1Beta2Reason documents reconciliation of a Cluster topology
// not yet completed because at least one of the MachinePools is yet to be created.
// This generally happens because new MachinePool creations are held off while the ControlPlane is not stable.
ClusterTopologyReconciledMachinePoolsCreatePendingV1Beta2Reason = "MachinePoolsCreatePending"

// ClusterTopologyReconciledMachinePoolsUpgradeDeferredV1Beta2Reason documents reconciliation of a Cluster topology
// not yet completed because the upgrade for at least one of the MachinePools has been deferred.
ClusterTopologyReconciledMachinePoolsUpgradeDeferredV1Beta2Reason = "MachinePoolsUpgradeDeferred"

// ClusterTopologyReconciledHookBlockingV1Beta2Reason documents reconciliation of a Cluster topology
// not yet completed because at least one of the lifecycle hooks is blocking.
ClusterTopologyReconciledHookBlockingV1Beta2Reason = "LifecycleHookBlocking"

// ClusterTopologyReconciledClusterClassNotReconciledV1Beta2Reason documents reconciliation of a Cluster topology not
// yet completed because the ClusterClass has not reconciled yet. If this condition persists, there may be an issue
// with the ClusterClass surfaced in the ClusterClass status or controller logs.
ClusterTopologyReconciledClusterClassNotReconciledV1Beta2Reason = "ClusterClassNotReconciled"

// ClusterTopologyReconciledDeletionTimestampSetV1Beta2Reason surfaces when the Cluster is deleting because the
// DeletionTimestamp is set.
ClusterTopologyReconciledDeletionTimestampSetV1Beta2Reason = DeletionTimestampSetV1Beta2Reason

// ClusterTopologyReconcilePausedV1Beta2Reason surfaces when the Cluster is paused.
ClusterTopologyReconcilePausedV1Beta2Reason = PausedV1Beta2Reason
)

// Cluster's InfrastructureReady condition and corresponding reasons that will be used in v1Beta2 API version.
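Editorial aside, not part of the commit: a minimal sketch of how a client could read the new v1beta2 condition with the standard apimachinery helpers. It assumes Cluster exposes its v1beta2 conditions via GetV1Beta2Conditions(), the accessor the v1beta2conditions utilities used later in this commit rely on.

package example

import (
	"fmt"

	"k8s.io/apimachinery/pkg/api/meta"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
)

// topologyReconciled reports whether the TopologyReconciled v1beta2 condition
// is true, plus a reason/message summary suitable for logging.
func topologyReconciled(cluster *clusterv1.Cluster) (bool, string) {
	// The condition is only added for Clusters referencing a ClusterClass,
	// so a nil result is normal for classic Clusters.
	c := meta.FindStatusCondition(cluster.GetV1Beta2Conditions(), clusterv1.ClusterTopologyReconciledV1Beta2Condition)
	if c == nil {
		return false, "TopologyReconciled condition not set"
	}
	return c.Status == metav1.ConditionTrue, fmt.Sprintf("%s: %s", c.Reason, c.Message)
}

The reason constants above let such a caller distinguish transient hold-offs (for example MachineDeploymentsUpgradePending) from real failures (TopologyReconcileFailed).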
3 changes: 3 additions & 0 deletions api/v1beta1/condition_consts.go
@@ -337,6 +337,9 @@ const (
// yet completed because the ClusterClass has not reconciled yet. If this condition persists, there may be an issue
// with the ClusterClass surfaced in the ClusterClass status or controller logs.
TopologyReconciledClusterClassNotReconciledReason = "ClusterClassNotReconciled"

// TopologyReconciledPausedReason (Severity=Info) surfaces when the Cluster is paused.
TopologyReconciledPausedReason = "Paused"
)

// Conditions and condition reasons for ClusterClass.
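A hedged sketch (again editorial, not part of the commit) of how the new Paused reason on the v1beta1 condition could be detected with the existing util/conditions getters:

package example

import (
	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/util/conditions"
)

// isTopologyReconcilePaused reports whether the v1beta1 TopologyReconciled
// condition is false specifically because the Cluster is paused.
func isTopologyReconcilePaused(cluster *clusterv1.Cluster) bool {
	return conditions.IsFalse(cluster, clusterv1.TopologyReconciledCondition) &&
		conditions.GetReason(cluster, clusterv1.TopologyReconciledCondition) == clusterv1.TopologyReconciledPausedReason
}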
18 changes: 9 additions & 9 deletions internal/controllers/topology/cluster/cluster_controller.go
@@ -120,7 +120,7 @@ func (r *Reconciler) SetupWithManager(ctx context.Context, mgr ctrl.Manager, opt
builder.WithPredicates(predicates.ResourceIsTopologyOwned(mgr.GetScheme(), predicateLog)),
).
WithOptions(options).
- WithEventFilter(predicates.ResourceNotPausedAndHasFilterLabel(mgr.GetScheme(), predicateLog, r.WatchFilterValue)).
+ WithEventFilter(predicates.ResourceHasFilterLabel(mgr.GetScheme(), predicateLog, r.WatchFilterValue)).
Build(r)

if err != nil {
@@ -175,13 +175,6 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re
return ctrl.Result{}, nil
}

- // Return early if the Cluster is paused.
- // TODO: What should we do if the cluster class is paused?
- if annotations.IsPaused(cluster, cluster) {
- log.Info("Reconciliation is paused for this object")
- return ctrl.Result{}, nil
- }

patchHelper, err := patch.NewHelper(cluster, r.Client)
if err != nil {
return ctrl.Result{}, err
@@ -200,14 +193,21 @@ func (r *Reconciler) Reconcile(ctx context.Context, req ctrl.Request) (_ ctrl.Re
patch.WithOwnedConditions{Conditions: []clusterv1.ConditionType{
clusterv1.TopologyReconciledCondition,
}},
patch.WithForceOverwriteConditions{},
patch.WithOwnedV1Beta2Conditions{Conditions: []string{
clusterv1.ClusterTopologyReconciledV1Beta2Condition,
}},
}
if err := patchHelper.Patch(ctx, cluster, options...); err != nil {
reterr = kerrors.NewAggregate([]error{reterr, err})
return
}
}()

// Return early if the Cluster is paused.
if cluster.Spec.Paused || annotations.HasPaused(cluster) {
return ctrl.Result{}, nil
}

// In case the object is deleted, the managed topology stops reconciling;
// (the other controllers will take care of deletion).
if !cluster.ObjectMeta.DeletionTimestamp.IsZero() {
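The point of moving the paused check below patch.NewHelper is that the deferred function now runs (and patches conditions) even when reconciliation exits early, which is what lets a paused Cluster report reason=Paused. Below is a simplified sketch of that ordering, with hypothetical names (reconcileSkeleton); the real logic lives in Reconcile above.

package example

import (
	"context"

	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"

	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
	"sigs.k8s.io/cluster-api/util/annotations"
	"sigs.k8s.io/cluster-api/util/patch"
)

func reconcileSkeleton(ctx context.Context, c client.Client, cluster *clusterv1.Cluster) (_ ctrl.Result, reterr error) {
	patchHelper, err := patch.NewHelper(cluster, c)
	if err != nil {
		return ctrl.Result{}, err
	}
	defer func() {
		// In the controller above, conditions are computed at this point
		// (reconcileConditions) before patching, and patch errors are
		// aggregated into reterr.
		if err := patchHelper.Patch(ctx, cluster); err != nil && reterr == nil {
			reterr = err
		}
	}()

	// Pause is honored only after the deferred patch is in place, so the
	// paused state itself still gets written back to status.
	if cluster.Spec.Paused || annotations.HasPaused(cluster) {
		return ctrl.Result{}, nil
	}

	// ... actual topology reconciliation would go here ...
	return ctrl.Result{}, nil
}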
94 changes: 81 additions & 13 deletions internal/controllers/topology/cluster/conditions.go
@@ -21,11 +21,14 @@ import (
"strings"

"github.com/pkg/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
"sigs.k8s.io/cluster-api/exp/topology/scope"
"sigs.k8s.io/cluster-api/internal/contract"
"sigs.k8s.io/cluster-api/util/annotations"
"sigs.k8s.io/cluster-api/util/conditions"
v1beta2conditions "sigs.k8s.io/cluster-api/util/conditions/v1beta2"
)

func (r *Reconciler) reconcileConditions(s *scope.Scope, cluster *clusterv1.Cluster, reconcileErr error) error {
@@ -36,32 +39,62 @@ func (r *Reconciler) reconcileConditions(s *scope.Scope, cluster *clusterv1.Clus
// The TopologyReconciled condition is considered true if the specs of all the objects associated with the
// cluster are in sync with the topology defined in the cluster.
// The condition is false under the following conditions:
// - The cluster is paused.
// - An error occurred during the reconcile process of the cluster topology.
// - The ClusterClass has not been successfully reconciled with its current spec.
// - The cluster upgrade has not yet propagated to all the components of the cluster.
//   For a managed topology cluster the version upgrade is propagated one component at a time.
//   In such a case, since some of the components' specs would be adrift from the topology, the
//   topology cannot be considered fully reconciled.
func (r *Reconciler) reconcileTopologyReconciledCondition(s *scope.Scope, cluster *clusterv1.Cluster, reconcileErr error) error {
// Mark TopologyReconciled as false if the Cluster is paused.
if cluster.Spec.Paused || annotations.HasPaused(cluster) {
var messages []string
if cluster.Spec.Paused {
messages = append(messages, "Cluster spec.paused is set to true")
}
if annotations.HasPaused(cluster) {
messages = append(messages, "Cluster has the cluster.x-k8s.io/paused annotation")
}
conditions.Set(cluster,
conditions.FalseCondition(
clusterv1.TopologyReconciledCondition,
clusterv1.TopologyReconciledPausedReason,
clusterv1.ConditionSeverityInfo,
strings.Join(messages, ", "),
),
)
v1beta2conditions.Set(cluster, metav1.Condition{
Type: clusterv1.ClusterTopologyReconciledV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.ClusterTopologyReconcilePausedV1Beta2Reason,
Message: strings.Join(messages, ", "),
})
return nil
}

// Mark TopologyReconciled as false due to cluster deletion.
if !cluster.ObjectMeta.DeletionTimestamp.IsZero() {
- conditions.Set(
- cluster,
+ conditions.Set(cluster,
conditions.FalseCondition(
clusterv1.TopologyReconciledCondition,
clusterv1.DeletedReason,
clusterv1.ConditionSeverityInfo,
"",
),
)
v1beta2conditions.Set(cluster, metav1.Condition{
Type: clusterv1.ClusterTopologyReconciledV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.ClusterTopologyReconciledDeletionTimestampSetV1Beta2Reason,
})
return nil
}

// If an error occurred during reconciliation, set the TopologyReconciled condition to false.
// Add the error message from the reconcile function to the message of the condition.
if reconcileErr != nil {
- conditions.Set(
- cluster,
+ conditions.Set(cluster,
conditions.FalseCondition(
clusterv1.TopologyReconciledCondition,
clusterv1.TopologyReconcileFailedReason,
@@ -70,15 +103,21 @@ func (r *Reconciler) reconcileTopologyReconciledCondition(s *scope.Scope, cluste
reconcileErr.Error(),
),
)
v1beta2conditions.Set(cluster, metav1.Condition{
Type: clusterv1.ClusterTopologyReconciledV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.ClusterTopologyReconciledFailedV1Beta2Reason,
// TODO: Add a protection for messages continuously changing leading to Cluster object changes/reconcile.
Message: reconcileErr.Error(),
})
return nil
}

// If the ClusterClass `metadata.Generation` doesn't match the `status.ObservedGeneration`, requeue as the ClusterClass
// is not up to date.
if s.Blueprint != nil && s.Blueprint.ClusterClass != nil &&
s.Blueprint.ClusterClass.GetGeneration() != s.Blueprint.ClusterClass.Status.ObservedGeneration {
- conditions.Set(
- cluster,
+ conditions.Set(cluster,
conditions.FalseCondition(
clusterv1.TopologyReconciledCondition,
clusterv1.TopologyReconciledClusterClassNotReconciledReason,
@@ -87,14 +126,20 @@ func (r *Reconciler) reconcileTopologyReconciledCondition(s *scope.Scope, cluste
".status.observedGeneration == .metadata.generation is true. If this is not the case either ClusterClass reconciliation failed or the ClusterClass is paused",
),
)
v1beta2conditions.Set(cluster, metav1.Condition{
Type: clusterv1.ClusterTopologyReconciledV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.ClusterTopologyReconciledClusterClassNotReconciledV1Beta2Reason,
Message: "ClusterClass not reconciled. If this condition persists please check ClusterClass status. A ClusterClass is reconciled if" +
".status.observedGeneration == .metadata.generation is true. If this is not the case either ClusterClass reconciliation failed or the ClusterClass is paused",
})
return nil
}

// If any of the lifecycle hooks are blocking any part of the reconciliation, then the topology
// is not considered fully reconciled.
if s.HookResponseTracker.AggregateRetryAfter() != 0 {
- conditions.Set(
- cluster,
+ conditions.Set(cluster,
conditions.FalseCondition(
clusterv1.TopologyReconciledCondition,
clusterv1.TopologyReconciledHookBlockingReason,
@@ -103,6 +148,13 @@ func (r *Reconciler) reconcileTopologyReconciledCondition(s *scope.Scope, cluste
s.HookResponseTracker.AggregateMessage(),
),
)
v1beta2conditions.Set(cluster, metav1.Condition{
Type: clusterv1.ClusterTopologyReconciledV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: clusterv1.ClusterTopologyReconciledHookBlockingV1Beta2Reason,
// TODO: Add a protection for messages continuously changing leading to Cluster object changes/reconcile.
Message: s.HookResponseTracker.AggregateMessage(),
})
return nil
}

@@ -121,6 +173,7 @@ func (r *Reconciler) reconcileTopologyReconciledCondition(s *scope.Scope, cluste
s.UpgradeTracker.MachinePools.DeferredUpgrade() {
msgBuilder := &strings.Builder{}
var reason string
var v1beta2Reason string

// TODO(ykakarap): Evaluate potential improvements to building the condition. Multiple causes can trigger the
// condition to be false at the same time (Example: ControlPlane.IsPendingUpgrade and MachineDeployments.IsAnyPendingCreate can
@@ -130,40 +183,47 @@ func (r *Reconciler) reconcileTopologyReconciledCondition(s *scope.Scope, cluste
case s.UpgradeTracker.ControlPlane.IsPendingUpgrade:
fmt.Fprintf(msgBuilder, "Control plane rollout and upgrade to version %s on hold.", s.Blueprint.Topology.Version)
reason = clusterv1.TopologyReconciledControlPlaneUpgradePendingReason
v1beta2Reason = clusterv1.ClusterTopologyReconciledControlPlaneUpgradePendingV1Beta2Reason
case s.UpgradeTracker.MachineDeployments.IsAnyPendingUpgrade():
fmt.Fprintf(msgBuilder, "MachineDeployment(s) %s rollout and upgrade to version %s on hold.",
computeNameList(s.UpgradeTracker.MachineDeployments.PendingUpgradeNames()),
s.Blueprint.Topology.Version,
)
reason = clusterv1.TopologyReconciledMachineDeploymentsUpgradePendingReason
v1beta2Reason = clusterv1.ClusterTopologyReconciledMachineDeploymentsUpgradePendingV1Beta2Reason
case s.UpgradeTracker.MachineDeployments.IsAnyPendingCreate():
fmt.Fprintf(msgBuilder, "MachineDeployment(s) for Topologies %s creation on hold.",
computeNameList(s.UpgradeTracker.MachineDeployments.PendingCreateTopologyNames()),
)
reason = clusterv1.TopologyReconciledMachineDeploymentsCreatePendingReason
v1beta2Reason = clusterv1.ClusterTopologyReconciledMachineDeploymentsCreatePendingV1Beta2Reason
case s.UpgradeTracker.MachineDeployments.DeferredUpgrade():
fmt.Fprintf(msgBuilder, "MachineDeployment(s) %s rollout and upgrade to version %s deferred.",
computeNameList(s.UpgradeTracker.MachineDeployments.DeferredUpgradeNames()),
s.Blueprint.Topology.Version,
)
reason = clusterv1.TopologyReconciledMachineDeploymentsUpgradeDeferredReason
v1beta2Reason = clusterv1.ClusterTopologyReconciledMachineDeploymentsUpgradeDeferredV1Beta2Reason
case s.UpgradeTracker.MachinePools.IsAnyPendingUpgrade():
fmt.Fprintf(msgBuilder, "MachinePool(s) %s rollout and upgrade to version %s on hold.",
computeNameList(s.UpgradeTracker.MachinePools.PendingUpgradeNames()),
s.Blueprint.Topology.Version,
)
reason = clusterv1.TopologyReconciledMachinePoolsUpgradePendingReason
v1beta2Reason = clusterv1.ClusterTopologyReconciledMachinePoolsUpgradePendingV1Beta2Reason
case s.UpgradeTracker.MachinePools.IsAnyPendingCreate():
fmt.Fprintf(msgBuilder, "MachinePool(s) for Topologies %s creation on hold.",
computeNameList(s.UpgradeTracker.MachinePools.PendingCreateTopologyNames()),
)
reason = clusterv1.TopologyReconciledMachinePoolsCreatePendingReason
v1beta2Reason = clusterv1.ClusterTopologyReconciledMachinePoolsCreatePendingV1Beta2Reason
case s.UpgradeTracker.MachinePools.DeferredUpgrade():
fmt.Fprintf(msgBuilder, "MachinePool(s) %s rollout and upgrade to version %s deferred.",
computeNameList(s.UpgradeTracker.MachinePools.DeferredUpgradeNames()),
s.Blueprint.Topology.Version,
)
reason = clusterv1.TopologyReconciledMachinePoolsUpgradeDeferredReason
v1beta2Reason = clusterv1.ClusterTopologyReconciledMachinePoolsUpgradeDeferredV1Beta2Reason
}

switch {
@@ -191,26 +251,34 @@ func (r *Reconciler) reconcileTopologyReconciledCondition(s *scope.Scope, cluste
)
}

- conditions.Set(
- cluster,
+ conditions.Set(cluster,
conditions.FalseCondition(
clusterv1.TopologyReconciledCondition,
reason,
clusterv1.ConditionSeverityInfo,
msgBuilder.String(),
),
)
v1beta2conditions.Set(cluster, metav1.Condition{
Type: clusterv1.ClusterTopologyReconciledV1Beta2Condition,
Status: metav1.ConditionFalse,
Reason: v1beta2Reason,
Message: msgBuilder.String(),
})
return nil
}

// If there are no errors while reconciling and if the topology is not holding out changes
// we can consider that spec of all the objects is reconciled to match the topology. Set the
// TopologyReconciled condition to true.
- conditions.Set(
- cluster,
+ conditions.Set(cluster,
conditions.TrueCondition(clusterv1.TopologyReconciledCondition),
)

v1beta2conditions.Set(cluster, metav1.Condition{
Type: clusterv1.ClusterTopologyReconciledV1Beta2Condition,
Status: metav1.ConditionTrue,
Reason: clusterv1.ClusterTopologyReconcileSucceededV1Beta2Reason,
})
return nil
}

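The ClusterClassNotReconciled branch reduces to a single staleness test, metadata.generation vs. status.observedGeneration. As a standalone sketch (editorial, not part of the commit):

package example

import (
	clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
)

// clusterClassUpToDate mirrors the check behind the ClusterClassNotReconciled
// reason: a ClusterClass counts as reconciled once its controller has observed
// the latest spec.
func clusterClassUpToDate(cc *clusterv1.ClusterClass) bool {
	return cc != nil && cc.GetGeneration() == cc.Status.ObservedGeneration
}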
