Skip to content

Commit

Permalink
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat: Sync timeouts for applications (#6055)
Browse files Browse the repository at this point in the history
Helps with #6055

Introduces a controller-level configuration for terminating sync after timeout.

Signed-off-by: Andrii Korotkov <andrii.korotkov@verkada.com>
andrii-korotkov-verkada committed Nov 17, 2024
1 parent 6d8d32f commit f9588ad
Showing 11 changed files with 111 additions and 0 deletions.
Original file line number Diff line number Diff line change
@@ -61,6 +61,7 @@ func NewCommand() *cobra.Command {
selfHealBackoffTimeoutSeconds int
selfHealBackoffFactor int
selfHealBackoffCapSeconds int
syncTimeout int
statusProcessors int
operationProcessors int
glogLevel int
@@ -181,6 +182,7 @@ func NewCommand() *cobra.Command {
time.Duration(appResyncJitter)*time.Second,
time.Duration(selfHealTimeoutSeconds)*time.Second,
selfHealBackoff,
time.Duration(syncTimeout)*time.Second,
time.Duration(repoErrorGracePeriod)*time.Second,
metricsPort,
metricsCacheExpiration,
@@ -248,6 +250,7 @@ func NewCommand() *cobra.Command {
command.Flags().IntVar(&selfHealBackoffTimeoutSeconds, "self-heal-backoff-timeout-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_TIMEOUT_SECONDS", 2, 0, math.MaxInt32), "Specifies initial timeout of exponential backoff between self heal attempts")
command.Flags().IntVar(&selfHealBackoffFactor, "self-heal-backoff-factor", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_FACTOR", 3, 0, math.MaxInt32), "Specifies factor of exponential timeout between application self heal attempts")
command.Flags().IntVar(&selfHealBackoffCapSeconds, "self-heal-backoff-cap-seconds", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SELF_HEAL_BACKOFF_CAP_SECONDS", 300, 0, math.MaxInt32), "Specifies max timeout of exponential backoff between application self heal attempts")
command.Flags().IntVar(&syncTimeout, "sync-timeout", env.ParseNumFromEnv("ARGOCD_APPLICATION_CONTROLLER_SYNCTIMEOUT", 0, 0, math.MaxInt32), "Specifies the timeout after which a sync would be terminated. 0 means no timeout (default).")
command.Flags().Int64Var(&kubectlParallelismLimit, "kubectl-parallelism-limit", env.ParseInt64FromEnv("ARGOCD_APPLICATION_CONTROLLER_KUBECTL_PARALLELISM_LIMIT", 20, 0, math.MaxInt64), "Number of allowed concurrent kubectl fork/execs. Any value less than 1 means no limit.")
command.Flags().BoolVar(&repoServerPlaintext, "repo-server-plaintext", env.ParseBoolFromEnv("ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT", false), "Disable TLS on connections to repo server")
command.Flags().BoolVar(&repoServerStrictTLS, "repo-server-strict-tls", env.ParseBoolFromEnv("ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_STRICT_TLS", false), "Whether to use strict validation of the TLS cert presented by the repo server")
12 changes: 12 additions & 0 deletions controller/appcontroller.go
Original file line number Diff line number Diff line change
@@ -131,6 +131,7 @@ type ApplicationController struct {
statusRefreshJitter time.Duration
selfHealTimeout time.Duration
selfHealBackOff *wait.Backoff
syncTimeout time.Duration
db db.ArgoDB
settingsMgr *settings_util.SettingsManager
refreshRequestedApps map[string]CompareWith
@@ -161,6 +162,7 @@ func NewApplicationController(
appResyncJitter time.Duration,
selfHealTimeout time.Duration,
selfHealBackoff *wait.Backoff,
syncTimeout time.Duration,
repoErrorGracePeriod time.Duration,
metricsPort int,
metricsCacheExpiration time.Duration,
@@ -202,6 +204,7 @@ func NewApplicationController(
settingsMgr: settingsMgr,
selfHealTimeout: selfHealTimeout,
selfHealBackOff: selfHealBackoff,
syncTimeout: syncTimeout,
clusterSharding: clusterSharding,
projByNameCache: sync.Map{},
applicationNamespaces: applicationNamespaces,
@@ -1373,12 +1376,21 @@ func (ctrl *ApplicationController) processRequestedAppOperation(app *appv1.Appli
// Get rid of sync results and null out previous operation completion time
state.SyncResult = nil
}
} else if ctrl.syncTimeout != time.Duration(0) && time.Now().After(state.StartedAt.Add(ctrl.syncTimeout)) && !terminating {
state.Phase = synccommon.OperationTerminating
state.Message = "operation is terminating due to timeout"
ctrl.setOperationState(app, state)
logCtx.Infof("Terminating in-progress operation due to timeout. Started at: %v, timeout: %v", state.StartedAt, ctrl.syncTimeout)
} else {
logCtx.Infof("Resuming in-progress operation. phase: %s, message: %s", state.Phase, state.Message)
}
} else {
state = &appv1.OperationState{Phase: synccommon.OperationRunning, Operation: *app.Operation, StartedAt: metav1.Now()}
ctrl.setOperationState(app, state)
if ctrl.syncTimeout != time.Duration(0) {
// Schedule a check during which the timeout would be checked.
ctrl.appOperationQueue.AddAfter(ctrl.toAppKey(app.QualifiedName()), ctrl.syncTimeout)
}
logCtx.Infof("Initialized new operation: %v", *app.Operation)
}
ts.AddCheckpoint("initial_operation_stage_ms")
52 changes: 52 additions & 0 deletions controller/appcontroller_test.go
Original file line number Diff line number Diff line change
@@ -160,6 +160,7 @@ func newFakeController(data *fakeData, repoErr error) *ApplicationController {
time.Second,
time.Minute,
nil,
0,
time.Second*10,
common.DefaultPortArgoCDMetrics,
data.metricsCacheExpiration,
@@ -2256,3 +2257,54 @@ func TestSelfHealExponentialBackoff(t *testing.T) {
})
}
}

func TestSyncTimeout(t *testing.T) {
testCases := []struct {
delta time.Duration
expectedPhase synccommon.OperationPhase
expectedMessage string
}{{
delta: 2 * time.Minute,
expectedPhase: synccommon.OperationFailed,
expectedMessage: "Operation terminated",
}, {
delta: 30 * time.Second,
expectedPhase: synccommon.OperationSucceeded,
expectedMessage: "successfully synced (no more tasks)",
}}
for i := range testCases {
tc := testCases[i]
t.Run(fmt.Sprintf("test case %d", i), func(t *testing.T) {
app := newFakeApp()
app.Spec.Project = "default"
app.Operation = &v1alpha1.Operation{
Sync: &v1alpha1.SyncOperation{
Revision: "HEAD",
},
}
ctrl := newFakeController(&fakeData{
apps: []runtime.Object{app, &defaultProj},
manifestResponses: []*apiclient.ManifestResponse{{
Manifests: []string{},
}},
}, nil)

ctrl.syncTimeout = time.Minute
app.Status.OperationState = &v1alpha1.OperationState{
Operation: v1alpha1.Operation{
Sync: &v1alpha1.SyncOperation{
Revision: "HEAD",
},
},
Phase: synccommon.OperationRunning,
StartedAt: metav1.NewTime(time.Now().Add(-tc.delta)),
}
ctrl.processRequestedAppOperation(app)

app, err := ctrl.applicationClientset.ArgoprojV1alpha1().Applications(app.ObjectMeta.Namespace).Get(context.Background(), app.ObjectMeta.Name, metav1.GetOptions{})
require.NoError(t, err)
require.Equal(t, tc.expectedPhase, app.Status.OperationState.Phase)
require.Equal(t, tc.expectedMessage, app.Status.OperationState.Message)
})
}
}
2 changes: 2 additions & 0 deletions docs/operator-manual/argocd-cmd-params-cm.yaml
Original file line number Diff line number Diff line change
@@ -51,6 +51,8 @@ data:
controller.self.heal.timeout.seconds: "2"
controller.self.heal.backoff.factor: "3"
controller.self.heal.backoff.cap.seconds: "300"
# Specifies a sync timeout for applications. "0" means no timeout (default "0")
controller.sync.timeout.seconds: "0"

# Cache expiration for app state (default 1h0m0s)
controller.app.state.cache.expiration: "1h0m0s"
Original file line number Diff line number Diff line change
@@ -115,6 +115,12 @@ spec:
name: argocd-cmd-params-cm
key: controller.self.heal.backoff.cap.seconds
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT
valueFrom:
configMapKeyRef:
name: argocd-cmd-params-cm
key: controller.sync.timeout.seconds
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT
valueFrom:
configMapKeyRef:
Original file line number Diff line number Diff line change
@@ -118,6 +118,12 @@ spec:
name: argocd-cmd-params-cm
key: controller.self.heal.backoff.cap.seconds
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_SYNC_TIMEOUT
valueFrom:
configMapKeyRef:
name: argocd-cmd-params-cm
key: controller.sync.timeout.seconds
optional: true
- name: ARGOCD_APPLICATION_CONTROLLER_REPO_SERVER_PLAINTEXT
valueFrom:
configMapKeyRef:
6 changes: 6 additions & 0 deletions manifests/core-install.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions manifests/ha/install.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions manifests/ha/namespace-install.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions manifests/install.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions manifests/namespace-install.yaml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

0 comments on commit f9588ad

Please sign in to comment.