-
Notifications
You must be signed in to change notification settings - Fork 70
🌱 Set Progressing to Succeeded on ClusterExtensionRevision only after availability probes pass
#2524
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
🌱 Set Progressing to Succeeded on ClusterExtensionRevision only after availability probes pass
#2524
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,7 +8,6 @@ import ( | |
| "errors" | ||
| "fmt" | ||
| "strings" | ||
| "sync" | ||
| "time" | ||
|
|
||
| appsv1 "k8s.io/api/apps/v1" | ||
|
|
@@ -20,6 +19,7 @@ import ( | |
| "k8s.io/apimachinery/pkg/runtime/schema" | ||
| "k8s.io/apimachinery/pkg/types" | ||
| "k8s.io/apimachinery/pkg/util/sets" | ||
| "k8s.io/utils/clock" | ||
| "pkg.package-operator.run/boxcutter" | ||
| "pkg.package-operator.run/boxcutter/machinery" | ||
| machinerytypes "pkg.package-operator.run/boxcutter/machinery/types" | ||
|
|
@@ -48,9 +48,7 @@ type ClusterExtensionRevisionReconciler struct { | |
| Client client.Client | ||
| RevisionEngineFactory RevisionEngineFactory | ||
| TrackingCache trackingCache | ||
| // track if we have queued up the reconciliation that detects eventual progress deadline issues | ||
| // keys is revision UUID, value is boolean | ||
| progressDeadlineCheckInFlight sync.Map | ||
| Clock clock.Clock | ||
| } | ||
|
|
||
| type trackingCache interface { | ||
|
|
@@ -86,15 +84,15 @@ func (c *ClusterExtensionRevisionReconciler) Reconcile(ctx context.Context, req | |
| // check if we reached the progress deadline only if the revision is still progressing and has not succeeded yet | ||
| if isStillProgressing && !succeeded { | ||
| timeout := time.Duration(pd) * time.Minute | ||
| if time.Since(existingRev.CreationTimestamp.Time) > timeout { | ||
| if c.Clock.Since(existingRev.CreationTimestamp.Time) > timeout { | ||
| // progress deadline reached, reset any errors and stop reconciling this revision | ||
| markAsNotProgressing(reconciledRev, ocv1.ReasonProgressDeadlineExceeded, fmt.Sprintf("Revision has not rolled out for %d minutes.", pd)) | ||
| reconcileErr = nil | ||
| res = ctrl.Result{} | ||
| } else if _, found := c.progressDeadlineCheckInFlight.Load(existingRev.GetUID()); !found && reconcileErr == nil { | ||
| // if we haven't already queued up a reconcile to check for progress deadline, queue one up, but only once | ||
| c.progressDeadlineCheckInFlight.Store(existingRev.GetUID(), true) | ||
| res = ctrl.Result{RequeueAfter: timeout} | ||
| } else if reconcileErr == nil { | ||
| requeueAfter := existingRev.CreationTimestamp.Time.Add(timeout).Add(2 * time.Second).Sub(c.Clock.Now()).Round(time.Second) | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. what's the reason for the +2 seconds?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Requeing is not exact time-wise, so we can land here for the second time if we do not add a bit of drift. With drift we ensure that the next reconciliation is going to detect the deadline has exceeded. We could experiment with adding just 1 sec, if that would your preference, but I think it is safer to have larger drift. An alternative would be to drop if requeueAfter == 0 {
requeueafter = 1 * time.Second
}wdyt?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm good with keeping it as is - maybe we could just add a comment to call this out as it might trip up the next person? Maybe also move the drift to a const to add additional context with the constant's name? |
||
| l.Info(fmt.Sprintf("ProgressDeadline not exceeded, requeue after ~%v to check again.", requeueAfter)) | ||
| res = ctrl.Result{RequeueAfter: requeueAfter} | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -208,9 +206,7 @@ func (c *ClusterExtensionRevisionReconciler) reconcile(ctx context.Context, cer | |
| } | ||
|
|
||
| revVersion := cer.GetAnnotations()[labels.BundleVersionKey] | ||
| if !rres.InTransition() { | ||
| markAsProgressing(cer, ocv1.ReasonSucceeded, fmt.Sprintf("Revision %s has rolled out.", revVersion)) | ||
| } else { | ||
| if rres.InTransition() { | ||
| markAsProgressing(cer, ocv1.ReasonRollingOut, fmt.Sprintf("Revision %s is rolling out.", revVersion)) | ||
| } | ||
|
|
||
|
|
@@ -231,6 +227,7 @@ func (c *ClusterExtensionRevisionReconciler) reconcile(ctx context.Context, cer | |
| } | ||
| } | ||
|
|
||
| markAsProgressing(cer, ocv1.ReasonSucceeded, fmt.Sprintf("Revision %s has rolled out.", revVersion)) | ||
| markAsAvailable(cer, ocv1.ClusterExtensionRevisionReasonProbesSucceeded, "Objects are available and pass all probes.") | ||
|
|
||
| // We'll probably only want to remove this once we are done updating the ClusterExtension conditions | ||
|
|
@@ -333,6 +330,7 @@ func (c *ClusterExtensionRevisionReconciler) SetupWithManager(mgr ctrl.Manager) | |
| return true | ||
| }, | ||
| } | ||
| c.Clock = clock.RealClock{} | ||
| return ctrl.NewControllerManagedBy(mgr). | ||
| For( | ||
| &ocv1.ClusterExtensionRevision{}, | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -383,6 +383,39 @@ Feature: Install ClusterExtension | |
| invalid ClusterExtension configuration: invalid configuration: unknown field "watchNamespace" | ||
| """ | ||
|
|
||
| @BoxcutterRuntime | ||
| @ProgressDeadline | ||
| Scenario: Report ClusterExtension as not progressing if the rollout does not become available within given timeout | ||
| Given min value for ClusterExtension .spec.progressDeadlineMinutes is set to 1 | ||
| And min value for ClusterExtensionRevision .spec.progressDeadlineMinutes is set to 1 | ||
| When ClusterExtension is applied | ||
| """ | ||
| apiVersion: olm.operatorframework.io/v1 | ||
| kind: ClusterExtension | ||
| metadata: | ||
| name: ${NAME} | ||
| spec: | ||
| namespace: ${TEST_NAMESPACE} | ||
| progressDeadlineMinutes: 1 | ||
| serviceAccount: | ||
| name: olm-sa | ||
| source: | ||
| sourceType: Catalog | ||
| catalog: | ||
| packageName: test | ||
| # bundle refers bad image references, so that the deployment never becomes available | ||
| version: 1.0.2 | ||
| selector: | ||
| matchLabels: | ||
| "olm.operatorframework.io/metadata.name": test-catalog | ||
| """ | ||
| Then ClusterExtensionRevision "${NAME}-1" reports Progressing as False with Reason ProgressDeadlineExceeded | ||
| And ClusterExtension reports Progressing as False with Reason ProgressDeadlineExceeded and Message: | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. not for this PR, but I wonder if we should call it |
||
| """ | ||
| Revision has not rolled out for 1 minutes. | ||
| """ | ||
| And ClusterExtension reports Progressing transition between 1 and 2 minutes since its creation | ||
|
|
||
| @BoxcutterRuntime | ||
| @ProgressDeadline | ||
| Scenario: Report ClusterExtension as not progressing if the rollout does not complete within given timeout | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The error message uses "minutes" which is always plural, but when the progress deadline is 1 minute, it should say "minute" (singular). Consider using a conditional or pluralization helper to make the message grammatically correct for all values. For example:
fmt.Sprintf("Revision has not rolled out for %d minute(s).", pd)or a more sophisticated approach that checks if pd == 1.Uh oh!
There was an error while loading. Please reload this page.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
'minute(s)' might also be acceptable here