You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@solr.apache.org by ho...@apache.org on 2023/07/13 14:36:12 UTC
[solr-operator] branch main updated: Move rolling update logic under clusterOp (#586)

This is an automated email from the ASF dual-hosted git repository.

houston pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr-operator.git


The following commit(s) were added to refs/heads/main by this push:
     new cfaf46c  Move rolling update logic under clusterOp (#586)
cfaf46c is described below

commit cfaf46c975c64e1e5ea1d3d4ea1498e66b3d512c
Author: Houston Putman <ho...@apache.org>
AuthorDate: Thu Jul 13 10:36:05 2023 -0400

    Move rolling update logic under clusterOp (#586)
    
    * Fix integration test, and ensure locking during update test
---
 controllers/solr_cluster_ops_util.go        | 73 ++++++++++++++++++++++++++
 controllers/solrcloud_controller.go         | 81 ++++++++---------------------
 controllers/util/solr_update_util.go        | 13 +++--
 controllers/util/solr_update_util_test.go   | 14 ++---
 helm/solr-operator/Chart.yaml               |  7 +++
 tests/e2e/solrcloud_rolling_upgrade_test.go |  8 ++-
 tests/e2e/test_utils_test.go                | 28 +++++++---
 tests/scripts/manage_e2e_tests.sh           |  2 +-
 8 files changed, 147 insertions(+), 79 deletions(-)

diff --git a/controllers/solr_cluster_ops_util.go b/controllers/solr_cluster_ops_util.go
index 5d8ec3a..c642dc3 100644
--- a/controllers/solr_cluster_ops_util.go
+++ b/controllers/solr_cluster_ops_util.go
@@ -156,6 +156,79 @@ func handleManagedCloudScaleUp(ctx context.Context, r *SolrCloudReconciler, inst
 	return
 }
 
+// determineRollingUpdateClusterOpLockIfNecessary acquires the rolling update clusterOp lock when one is needed.
+//
+// A lock is needed when the SolrCloud uses the Managed update strategy and at least one pod is out of date.
+// In that case the StatefulSet is patched with the UpdateLock clusterOp annotation and empty clusterOp metadata.
+// The rolling update itself is not performed here, only the lock is taken.
+//
+// clusterLockAcquired is true only when the StatefulSet patch succeeded.
+// retryLaterDuration is never set by this function, so it is always returned as 0.
+// err is the error returned by the StatefulSet patch, if any. It is also logged here.
+func determineRollingUpdateClusterOpLockIfNecessary(ctx context.Context, r *SolrCloudReconciler, instance *solrv1beta1.SolrCloud, statefulSet *appsv1.StatefulSet, outOfDatePods util.OutOfDatePodSegmentation, logger logr.Logger) (clusterLockAcquired bool, retryLaterDuration time.Duration, err error) {
+	if instance.Spec.UpdateStrategy.Method != solrv1beta1.ManagedUpdate || outOfDatePods.IsEmpty() {
+		// Either the updates are not managed by the operator, or every pod is already up to date.
+		return
+	}
+
+	// Managed Rolling Upgrade!
+	// Take the copy before touching the annotations, so that the patch contains exactly the lock changes.
+	originalStatefulSet := statefulSet.DeepCopy()
+	// Writing to a nil map panics, so make sure the annotations map exists before setting the lock.
+	if statefulSet.Annotations == nil {
+		statefulSet.Annotations = make(map[string]string, 2)
+	}
+	statefulSet.Annotations[util.ClusterOpsLockAnnotation] = util.UpdateLock
+	// No rolling update metadata is currently required
+	statefulSet.Annotations[util.ClusterOpsMetadataAnnotation] = ""
+	if err = r.Patch(ctx, statefulSet, client.StrategicMergeFrom(originalStatefulSet)); err != nil {
+		logger.Error(err, "Error while patching StatefulSet to start clusterOp", "clusterOp", util.UpdateLock, "clusterOpMetadata", "")
+		return
+	}
+	clusterLockAcquired = true
+	return
+}
+
+// handleManagedCloudRollingUpdate does the logic of a managed and "locked" cloud rolling update operation.
+// This will take many reconcile loops to complete, as it is deleting pods/moving replicas.
+//
+// When there are no out-of-date pods left, the clusterOp is finished by removing the clusterOp lock and metadata
+// annotations from the StatefulSet. Otherwise the pods to update are selected and passed to DeletePodForUpdate.
+//
+// retryLaterDuration is the smallest positive duration requested by any DeletePodForUpdate call. If none was
+// requested but the pod selection asked for a retry (or the cluster state could not be fetched), it is 10 seconds.
+// err is the StatefulSet patch error, or the last error returned by DeletePodForUpdate.
+func handleManagedCloudRollingUpdate(ctx context.Context, r *SolrCloudReconciler, instance *solrv1beta1.SolrCloud, statefulSet *appsv1.StatefulSet, outOfDatePods util.OutOfDatePodSegmentation, hasReadyPod bool, availableUpdatedPodCount int, logger logr.Logger) (retryLaterDuration time.Duration, err error) {
+	// Manage the updating of out-of-spec pods, if the Managed UpdateStrategy has been specified.
+	updateLogger := logger.WithName("ManagedUpdateSelector")
+
+	// First check if all pods are up to date. If so the rolling update is complete
+	if outOfDatePods.IsEmpty() {
+		// Once the rolling update is complete, finish the cluster operation by deleting the statefulSet annotations
+		originalStatefulSet := statefulSet.DeepCopy()
+		delete(statefulSet.Annotations, util.ClusterOpsLockAnnotation)
+		delete(statefulSet.Annotations, util.ClusterOpsMetadataAnnotation)
+		if err = r.Patch(ctx, statefulSet, client.StrategicMergeFrom(originalStatefulSet)); err != nil {
+			logger.Error(err, "Error while patching StatefulSet to finish the managed SolrCloud rollingUpdate clusterOp")
+		}
+
+		// TODO: Create event for the CRD.
+		return
+	}
+
+	// The out of date pods that have not been started, should all be updated immediately.
+	// There is no use "safely" updating pods which have not been started yet.
+	podsToUpdate := append([]corev1.Pod{}, outOfDatePods.NotStarted...)
+	// Index the slice instead of ranging by value, so that each (large) Pod struct is not copied.
+	for i := range outOfDatePods.NotStarted {
+		updateLogger.Info("Pod killed for update.", "pod", outOfDatePods.NotStarted[i].Name, "reason", "The solr container in the pod has not yet started, thus it is safe to update.")
+	}
+
+	// Kubernetes treats an unset StatefulSet replica count as 1, so do the same instead of dereferencing nil.
+	totalPods := 1
+	if statefulSet.Spec.Replicas != nil {
+		totalPods = int(*statefulSet.Spec.Replicas)
+	}
+
+	// Pick which pods should be deleted for an update.
+	// Don't exit on an error, which would only occur because of an HTTP Exception. Requeue later instead.
+	additionalPodsToUpdate, podsHaveReplicas, retryLater, clusterStateError :=
+		util.DeterminePodsSafeToUpdate(ctx, instance, totalPods, outOfDatePods, hasReadyPod, availableUpdatedPodCount, updateLogger)
+	// If we do not have the clusterState, it's not safe to update pods that are running
+	if clusterStateError != nil {
+		retryLater = true
+	} else {
+		podsToUpdate = append(podsToUpdate, outOfDatePods.ScheduledForDeletion...)
+		podsToUpdate = append(podsToUpdate, additionalPodsToUpdate...)
+	}
+
+	// Only actually delete a running pod if it has been evicted, or doesn't need eviction (persistent storage)
+	for i := range podsToUpdate {
+		// Point at the slice element itself, rather than at a loop variable that is shared between
+		// iterations in Go versions before 1.22.
+		pod := &podsToUpdate[i]
+		retryLaterDurationTemp, errTemp := DeletePodForUpdate(ctx, r, instance, pod, podsHaveReplicas[pod.Name], updateLogger)
+
+		// Use the retryLaterDuration of the pod that requires a retry the soonest (smallest duration > 0)
+		if retryLaterDurationTemp > 0 && (retryLaterDurationTemp < retryLaterDuration || retryLaterDuration == 0) {
+			retryLaterDuration = retryLaterDurationTemp
+		}
+		if errTemp != nil {
+			err = errTemp
+		}
+	}
+
+	// Pod selection asked for a retry, but no pod deletion provided a duration. Use a default.
+	if retryLater && retryLaterDuration == 0 {
+		retryLaterDuration = time.Second * 10
+	}
+	return
+}
+
 // clearClusterOp simply removes any clusterOp for the given statefulSet.
 // This should only be used as a "break-glass" scenario. Do not use this to finish off successful clusterOps.
 func clearClusterOp(ctx context.Context, r *SolrCloudReconciler, statefulSet *appsv1.StatefulSet, reason string, logger logr.Logger) (err error) {
diff --git a/controllers/solrcloud_controller.go b/controllers/solrcloud_controller.go
index a4f91ec..56fdbd7 100644
--- a/controllers/solrcloud_controller.go
+++ b/controllers/solrcloud_controller.go
@@ -285,6 +285,13 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 	var security *util.SecurityConfig = nil
 	if instance.Spec.SolrSecurity != nil {
 		security, err = util.ReconcileSecurityConfig(ctx, &r.Client, instance)
+		if err == nil && security != nil {
+			// If authn enabled on Solr, we need to pass the auth header when making requests
+			ctx, err = security.AddAuthToContext(ctx)
+			if err != nil {
+				logger.Error(err, "failed to create Authorization header when reconciling")
+			}
+		}
 		if err != nil {
 			return requeueOrNot, err
 		}
@@ -443,12 +450,14 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 	}
 
 	// We only want to do one cluster operation at a time, so we use a lock to ensure that.
-	// Update or a Scale at a time. We do not want to do both.
-
+	// Update or Scale, one-at-a-time. We do not want to do both.
+	hasReadyPod := newStatus.ReadyReplicas > 0
 	var retryLaterDuration time.Duration
 	if clusterOpLock, hasAnn := statefulSet.Annotations[util.ClusterOpsLockAnnotation]; hasAnn {
 		clusterOpMetadata := statefulSet.Annotations[util.ClusterOpsMetadataAnnotation]
 		switch clusterOpLock {
+		case util.UpdateLock:
+			retryLaterDuration, err = handleManagedCloudRollingUpdate(ctx, r, instance, statefulSet, outOfDatePods, hasReadyPod, availableUpdatedPodCount, logger)
 		case util.ScaleDownLock:
 			retryLaterDuration, err = handleManagedCloudScaleDown(ctx, r, instance, statefulSet, clusterOpMetadata, podList, logger)
 		case util.ScaleUpLock:
@@ -459,12 +468,22 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 			err = clearClusterOp(ctx, r, statefulSet, "clusterOp not supported", logger)
 		}
 	} else {
+		lockAcquired := false
+		// Start a managed rolling update if needed.
+		// The update will be actually run in future reconcile loops, but a clusterOpLock will be acquired here.
+		// And that lock will tell future reconcile loops that the operation needs to be done.
+		lockAcquired, retryLaterDuration, err = determineRollingUpdateClusterOpLockIfNecessary(ctx, r, instance, statefulSet, outOfDatePods, logger)
 		// Start cluster operations if needed.
 		// The operations will be actually run in future reconcile loops, but a clusterOpLock will be acquired here.
 		// And that lock will tell future reconcile loops that the operation needs to be done.
 		// If a non-managed scale needs to take place, this method will update the StatefulSet without starting
 		// a "locked" cluster operation
-		_, retryLaterDuration, err = determineScaleClusterOpLockIfNecessary(ctx, r, instance, statefulSet, podList, logger)
+		if !lockAcquired {
+			lockAcquired, retryLaterDuration, err = determineScaleClusterOpLockIfNecessary(ctx, r, instance, statefulSet, podList, logger)
+		}
+		// After a lock is acquired, the reconcile will be started again because the StatefulSet is being watched
 	}
 	if err != nil && retryLaterDuration == 0 {
 		retryLaterDuration = time.Second * 5
@@ -476,62 +495,6 @@ func (r *SolrCloudReconciler) Reconcile(ctx context.Context, req ctrl.Request) (
 		return requeueOrNot, err
 	}
 
-	// TODO: Move this logic into the ClusterOpLock, with the "rollingUpdate" lock
-	if instance.Spec.UpdateStrategy.Method == solrv1beta1.ManagedUpdate && len(outOfDatePods.NotStarted)+len(outOfDatePods.ScheduledForDeletion)+len(outOfDatePods.Running) > 0 {
-		// Manage the updating of out-of-spec pods, if the Managed UpdateStrategy has been specified.
-		updateLogger := logger.WithName("ManagedUpdateSelector")
-
-		// The out of date pods that have not been started, should all be updated immediately.
-		// There is no use "safely" updating pods which have not been started yet.
-		podsToUpdate := append([]corev1.Pod{}, outOfDatePods.NotStarted...)
-		for _, pod := range outOfDatePods.NotStarted {
-			updateLogger.Info("Pod killed for update.", "pod", pod.Name, "reason", "The solr container in the pod has not yet started, thus it is safe to update.")
-		}
-
-		// If authn enabled on Solr, we need to pass the auth header
-		if security != nil {
-			ctx, err = security.AddAuthToContext(ctx)
-			if err != nil {
-				updateLogger.Error(err, "failed to create Authorization header when reconciling", "SolrCloud", instance.Name)
-				return requeueOrNot, err
-			}
-		}
-
-		// Pick which pods should be deleted for an update.
-		// Don't exit on an error, which would only occur because of an HTTP Exception. Requeue later instead.
-		additionalPodsToUpdate, podsHaveReplicas, retryLater, clusterStateError := util.DeterminePodsSafeToUpdate(ctx, instance, outOfDatePods, int(newStatus.ReadyReplicas), availableUpdatedPodCount, updateLogger)
-		// If we do not have the clusterState, it's not safe to update pods that are running
-		if clusterStateError != nil {
-			retryLater = true
-		} else {
-			podsToUpdate = append(podsToUpdate, outOfDatePods.ScheduledForDeletion...)
-			podsToUpdate = append(podsToUpdate, additionalPodsToUpdate...)
-		}
-
-		// Only actually delete a running pod if it has been evicted, or doesn't need eviction (persistent storage)
-		for _, pod := range podsToUpdate {
-			retryLaterDurationTemp, errTemp := DeletePodForUpdate(ctx, r, instance, &pod, podsHaveReplicas[pod.Name], updateLogger)
-
-			// Use the retryLaterDuration of the pod that requires a retry the soonest (smallest duration > 0)
-			if retryLaterDurationTemp > 0 && (retryLaterDurationTemp < retryLaterDuration || retryLaterDuration == 0) {
-				retryLaterDuration = retryLaterDurationTemp
-			}
-			if errTemp != nil {
-				err = errTemp
-			}
-		}
-
-		if err != nil || retryLaterDuration > 0 || retryLater {
-			if retryLaterDuration == 0 {
-				retryLaterDuration = time.Second * 10
-			}
-			updateRequeueAfter(&requeueOrNot, retryLaterDuration)
-		}
-		if err != nil {
-			return requeueOrNot, err
-		}
-	}
-
 	// Upsert or delete solrcloud-wide PodDisruptionBudget(s) based on 'Enabled' flag.
 	pdb := util.GeneratePodDisruptionBudget(instance, statefulSet.Spec.Selector.MatchLabels)
 	if instance.Spec.Availability.PodDisruptionBudget.Enabled != nil && *instance.Spec.Availability.PodDisruptionBudget.Enabled {
diff --git a/controllers/util/solr_update_util.go b/controllers/util/solr_update_util.go
index a5235f5..e0ca2f0 100644
--- a/controllers/util/solr_update_util.go
+++ b/controllers/util/solr_update_util.go
@@ -92,6 +92,10 @@ type OutOfDatePodSegmentation struct {
 	Running              []corev1.Pod
 }
 
+// IsEmpty returns true when none of the segments (NotStarted, ScheduledForDeletion, Running)
+// contain any out-of-date pods.
+func (seg OutOfDatePodSegmentation) IsEmpty() bool {
+	return len(seg.NotStarted) == 0 && len(seg.ScheduledForDeletion) == 0 && len(seg.Running) == 0
+}
+
 // DeterminePodsSafeToUpdate takes a list of solr Pods and returns a list of pods that are safe to upgrade now.
 // This function MUST be idempotent and return the same list of pods given the same kubernetes/solr state.
 //
@@ -101,9 +105,9 @@ type OutOfDatePodSegmentation struct {
 // TODO:
 //   - Think about caching this for ~250 ms? Not a huge need to send these requests milliseconds apart.
 //   - Might be too much complexity for very little gain.
-func DeterminePodsSafeToUpdate(ctx context.Context, cloud *solr.SolrCloud, outOfDatePods OutOfDatePodSegmentation, readyPods int, availableUpdatedPodCount int, logger logr.Logger) (podsToUpdate []corev1.Pod, podsHaveReplicas map[string]bool, retryLater bool, err error) {
+func DeterminePodsSafeToUpdate(ctx context.Context, cloud *solr.SolrCloud, totalPods int, outOfDatePods OutOfDatePodSegmentation, hasReadyPod bool, availableUpdatedPodCount int, logger logr.Logger) (podsToUpdate []corev1.Pod, podsHaveReplicas map[string]bool, retryLater bool, err error) {
 	// Before fetching the cluster state, be sure that there is room to update at least 1 pod
-	maxPodsUnavailable, unavailableUpdatedPodCount, maxPodsToUpdate := calculateMaxPodsToUpdate(cloud, len(outOfDatePods.Running), len(outOfDatePods.NotStarted)+len(outOfDatePods.ScheduledForDeletion), availableUpdatedPodCount)
+	maxPodsUnavailable, unavailableUpdatedPodCount, maxPodsToUpdate := calculateMaxPodsToUpdate(cloud, totalPods, len(outOfDatePods.Running), len(outOfDatePods.NotStarted)+len(outOfDatePods.ScheduledForDeletion), availableUpdatedPodCount)
 	if maxPodsToUpdate <= 0 {
 		logger.Info("Pod update selection canceled. The number of updated pods unavailable equals or exceeds the calculated maxPodsUnavailable.",
 			"unavailableUpdatedPods", unavailableUpdatedPodCount, "outOfDatePodsNotStarted", len(outOfDatePods.NotStarted), "alreadyScheduledForDeletion", len(outOfDatePods.ScheduledForDeletion), "maxPodsUnavailable", maxPodsUnavailable)
@@ -111,7 +115,7 @@ func DeterminePodsSafeToUpdate(ctx context.Context, cloud *solr.SolrCloud, outOf
 		clusterResp := &solr_api.SolrClusterStatusResponse{}
 		overseerResp := &solr_api.SolrOverseerStatusResponse{}
 
-		if readyPods > 0 {
+		if hasReadyPod {
 			queryParams := url.Values{}
 			queryParams.Add("action", "CLUSTERSTATUS")
 			err = solr_api.CallCollectionsApi(ctx, cloud, queryParams, clusterResp)
@@ -148,8 +152,7 @@ func DeterminePodsSafeToUpdate(ctx context.Context, cloud *solr.SolrCloud, outOf
 }
 
 // calculateMaxPodsToUpdate determines the maximum number of additional pods that can be updated.
-func calculateMaxPodsToUpdate(cloud *solr.SolrCloud, outOfDatePodCount int, outOfDatePodsNotStartedCount int, availableUpdatedPodCount int) (maxPodsUnavailable int, unavailableUpdatedPodCount int, maxPodsToUpdate int) {
-	totalPods := int(*cloud.Spec.Replicas)
+func calculateMaxPodsToUpdate(cloud *solr.SolrCloud, totalPods int, outOfDatePodCount int, outOfDatePodsNotStartedCount int, availableUpdatedPodCount int) (maxPodsUnavailable int, unavailableUpdatedPodCount int, maxPodsToUpdate int) {
 	// To calculate the number of updated pods that are unavailable, take the total pods and subtract those that are available and updated, and those that are not updated.
 	unavailableUpdatedPodCount = totalPods - availableUpdatedPodCount - outOfDatePodCount - outOfDatePodsNotStartedCount
 	// If the maxBatchNodeUpgradeSpec is passed as a decimal between 0 and 1, then calculate as a percentage of the number of nodes.
diff --git a/controllers/util/solr_update_util_test.go b/controllers/util/solr_update_util_test.go
index 1ff982d..3729dbf 100644
--- a/controllers/util/solr_update_util_test.go
+++ b/controllers/util/solr_update_util_test.go
@@ -524,38 +524,38 @@ func TestCalculateMaxPodsToUpgrade(t *testing.T) {
 		},
 	}
 
-	foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate := calculateMaxPodsToUpdate(solrCloud, 4, 0, 4)
+	foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate := calculateMaxPodsToUpdate(solrCloud, 10, 4, 0, 4)
 	assert.Equal(t, 2, foundMaxPodsUnavailable, "Incorrect value of maxPodsUnavailable given fromInt(2)")
 	assert.Equal(t, 2, foundUnavailableUpdatedPodCount, "Incorrect value of unavailableUpdatedPodCount")
 	assert.Equal(t, 0, foundMaxPodsToUpdate, "Incorrect value of maxPodsToUpdate")
 
-	foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 4, 0, 3)
+	foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 10, 4, 0, 3)
 	assert.Equal(t, 2, foundMaxPodsUnavailable, "Incorrect value of maxPodsUnavailable given fromInt(2)")
 	assert.Equal(t, 3, foundUnavailableUpdatedPodCount, "Incorrect value of unavailableUpdatedPodCount")
 	assert.Equal(t, -1, foundMaxPodsToUpdate, "Incorrect value of maxPodsToUpdate")
 
-	foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 3, 1, 3)
+	foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 10, 3, 1, 3)
 	assert.Equal(t, 2, foundMaxPodsUnavailable, "Incorrect value of maxPodsUnavailable given fromInt(2)")
 	assert.Equal(t, 3, foundUnavailableUpdatedPodCount, "Incorrect value of unavailableUpdatedPodCount")
 	assert.Equal(t, -2, foundMaxPodsToUpdate, "Incorrect value of maxPodsToUpdate")
 
 	maxPodsUnavailable = intstr.FromString("45%")
-	foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 3, 0, 5)
+	foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 10, 3, 0, 5)
 	assert.Equal(t, 4, foundMaxPodsUnavailable, "Incorrect value of maxPodsUnavailable given fromString(\"45%\")")
 	assert.Equal(t, 2, foundMaxPodsToUpdate, "Incorrect value of maxPodsToUpdate")
 
 	maxPodsUnavailable = intstr.FromString("45%")
-	foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 1, 2, 5)
+	foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 10, 1, 2, 5)
 	assert.Equal(t, 4, foundMaxPodsUnavailable, "Incorrect value of maxPodsUnavailable given fromString(\"45%\")")
 	assert.Equal(t, 0, foundMaxPodsToUpdate, "Incorrect value of maxPodsToUpdate")
 
 	maxPodsUnavailable = intstr.FromString("70%")
-	foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 3, 0, 2)
+	foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 10, 3, 0, 2)
 	assert.Equal(t, 7, foundMaxPodsUnavailable, "Incorrect value of maxPodsUnavailable given fromString(\"70%\")")
 	assert.Equal(t, 2, foundMaxPodsToUpdate, "Incorrect value of maxPodsToUpdate")
 
 	solrCloud.Spec.UpdateStrategy.ManagedUpdateOptions.MaxPodsUnavailable = nil
-	foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 3, 0, 2)
+	foundMaxPodsUnavailable, foundUnavailableUpdatedPodCount, foundMaxPodsToUpdate = calculateMaxPodsToUpdate(solrCloud, 10, 3, 0, 2)
 	assert.Equal(t, 2, foundMaxPodsUnavailable, "Incorrect value of maxPodsUnavailable given fromString(\"25%\")")
 	assert.Equal(t, -3, foundMaxPodsToUpdate, "Incorrect value of maxPodsToUpdate")
 }
diff --git a/helm/solr-operator/Chart.yaml b/helm/solr-operator/Chart.yaml
index 3f94074..41c10ad 100644
--- a/helm/solr-operator/Chart.yaml
+++ b/helm/solr-operator/Chart.yaml
@@ -84,6 +84,13 @@ annotations:
           url: https://github.com/apache/solr-operator/issues/570
         - name: Github PR
           url: https://github.com/apache/solr-operator/pull/578
+    - kind: changed
+      description: Managed Rolling Updates are now computed via a Cluster Lock, like scaling operations.
+      links:
+        - name: Github Issue
+          url: https://github.com/apache/solr-operator/issues/560
+        - name: Github PR
+          url: https://github.com/apache/solr-operator/pull/586
   artifacthub.io/images: |
     - name: solr-operator
       image: apache/solr-operator:v0.8.0-prerelease
diff --git a/tests/e2e/solrcloud_rolling_upgrade_test.go b/tests/e2e/solrcloud_rolling_upgrade_test.go
index 567e359..c3fe789 100644
--- a/tests/e2e/solrcloud_rolling_upgrade_test.go
+++ b/tests/e2e/solrcloud_rolling_upgrade_test.go
@@ -20,6 +20,7 @@ package e2e
 import (
 	"context"
 	solrv1beta1 "github.com/apache/solr-operator/api/v1beta1"
+	"github.com/apache/solr-operator/controllers/util"
 	. "github.com/onsi/ginkgo/v2"
 	. "github.com/onsi/gomega"
 	"k8s.io/apimachinery/pkg/util/intstr"
@@ -81,12 +82,14 @@ var _ = FDescribe("E2E - SolrCloud - Rolling Upgrades", func() {
 			Expect(k8sClient.Patch(ctx, patchedSolrCloud, client.MergeFrom(solrCloud))).To(Succeed(), "Could not add annotation to SolrCloud pod to initiate rolling restart")
 
 			By("waiting for the rolling restart to begin")
-			expectSolrCloudWithChecks(ctx, solrCloud, func(g Gomega, cloud *solrv1beta1.SolrCloud) {
+			solrCloud = expectSolrCloudWithChecks(ctx, solrCloud, func(g Gomega, cloud *solrv1beta1.SolrCloud) {
 				g.Expect(cloud.Status.UpToDateNodes).To(BeZero(), "Cloud did not get to a state with zero up-to-date replicas when rolling restart began.")
 				for _, nodeStatus := range cloud.Status.SolrNodes {
 					g.Expect(nodeStatus.SpecUpToDate).To(BeFalse(), "Node not starting as out-of-date when rolling restart begins: %s", nodeStatus.Name)
 				}
 			})
+			statefulSet := expectStatefulSet(ctx, solrCloud, solrCloud.StatefulSetName())
+			Expect(statefulSet.Annotations).To(HaveKeyWithValue(util.ClusterOpsLockAnnotation, util.UpdateLock), "StatefulSet does not have a RollingUpdate lock after starting managed update.")
 
 			By("waiting for the rolling restart to complete")
 			// Expect the SolrCloud to be up-to-date, or in a valid restarting state
@@ -140,6 +143,9 @@ var _ = FDescribe("E2E - SolrCloud - Rolling Upgrades", func() {
 				Expect(nodeStatus.Ready).To(BeTrue(), "Node not finishing as ready when rolling restart ends: %s", nodeStatus.Name)
 			}
 
+			statefulSet = expectStatefulSet(ctx, solrCloud, solrCloud.StatefulSetName())
+			Expect(statefulSet.Annotations).To(Not(HaveKey(util.ClusterOpsLockAnnotation)), "StatefulSet should not have a RollingUpdate lock after finishing a managed update.")
+
 			By("checking that the collections can be queried after the restart")
 			queryCollection(ctx, solrCloud, solrCollection1, 0)
 			queryCollection(ctx, solrCloud, solrCollection2, 0)
diff --git a/tests/e2e/test_utils_test.go b/tests/e2e/test_utils_test.go
index 055254b..2568ed8 100644
--- a/tests/e2e/test_utils_test.go
+++ b/tests/e2e/test_utils_test.go
@@ -308,14 +308,30 @@ func queryCollectionWithGomega(ctx context.Context, solrCloud *solrv1beta1.SolrC
 				fmt.Sprintf("http://localhost:%d/solr/%s/select?rows=0", solrCloud.Spec.SolrAddressability.PodPort, collection),
 			},
 		)
-		g.ExpectWithOffset(resolveOffset(additionalOffset), err).ToNot(HaveOccurred(), "Error occurred while querying empty Solr Collection")
-		g.ExpectWithOffset(resolveOffset(additionalOffset), response).To(ContainSubstring("\"numFound\":%d", docCount), "Error occurred while querying Solr Collection '%s'", collection)
-	}, time.Second*5).WithContext(ctx).Should(Succeed(), "Could not successfully query collection")
+		innerG.Expect(err).ToNot(HaveOccurred(), "Error occurred while querying empty Solr Collection")
+		innerG.Expect(response).To(ContainSubstring("\"numFound\":%d", docCount), "Error occurred while querying Solr Collection '%s'", collection)
+	}, time.Second*5).WithContext(ctx).Should(Succeed(), "Could not successfully query collection: %v", fetchClusterStatus(ctx, solrCloud))
 	// Only wait 5 seconds for the collection to be query-able
 }
 
+// fetchClusterStatus curls the Collections API CLUSTERSTATUS action from inside the Solr container of
+// the pod named by solrCloud.GetRandomSolrPodName(), and returns the raw output of the command.
+// The calling test is failed if the command cannot be executed.
+func fetchClusterStatus(ctx context.Context, solrCloud *solrv1beta1.SolrCloud) string {
+	clusterStatusURL := fmt.Sprintf("http://localhost:%d/solr/admin/collections?action=CLUSTERSTATUS", solrCloud.Spec.SolrAddressability.PodPort)
+	curlCommand := []string{"curl", clusterStatusURL}
+
+	clusterStatus, execErr := runExecForContainer(ctx, util.SolrNodeContainer, solrCloud.GetRandomSolrPodName(), solrCloud.Namespace, curlCommand)
+	Expect(execErr).ToNot(HaveOccurred(), "Could not fetch clusterStatus for cloud")
+
+	return clusterStatus
+}
+
 func queryCollectionWithNoReplicaAvailable(ctx context.Context, solrCloud *solrv1beta1.SolrCloud, collection string, additionalOffset ...int) {
-	EventuallyWithOffset(resolveOffset(additionalOffset), func(g Gomega) {
+	EventuallyWithOffset(resolveOffset(additionalOffset), func(innerG Gomega) {
 		response, err := runExecForContainer(
 			ctx,
 			util.SolrNodeContainer,
@@ -326,8 +342,8 @@ func queryCollectionWithNoReplicaAvailable(ctx context.Context, solrCloud *solrv
 				fmt.Sprintf("http://localhost:%d/solr/%s/select", solrCloud.Spec.SolrAddressability.PodPort, collection),
 			},
 		)
-		g.Expect(err).ToNot(HaveOccurred(), "Error occurred while querying empty Solr Collection")
-		g.Expect(response).To(ContainSubstring("Error trying to proxy request for url"), "Wrong occurred while querying Solr Collection '%s', expected a proxy forwarding error", collection)
+		innerG.Expect(err).ToNot(HaveOccurred(), "Error occurred while querying empty Solr Collection")
+		innerG.Expect(response).To(ContainSubstring("Error trying to proxy request for url"), "Wrong occurred while querying Solr Collection '%s', expected a proxy forwarding error", collection)
 	}, time.Second*5).WithContext(ctx).Should(Succeed(), "Collection query did not fail in the correct way")
 }
 
diff --git a/tests/scripts/manage_e2e_tests.sh b/tests/scripts/manage_e2e_tests.sh
index f85a3e1..98f23c4 100755
--- a/tests/scripts/manage_e2e_tests.sh
+++ b/tests/scripts/manage_e2e_tests.sh
@@ -104,7 +104,7 @@ function add_image_to_kind_repo_if_local() {
     kind load docker-image --name "${CLUSTER_NAME}" "${IMAGE}"
   else
     if [ "${PULL_IF_NOT_LOCAL}" = true ]; then
-      printf "\nPulling image \"%s\" since it was not found locally.\n\n" "${IMAGE}" "${IMAGE}"
+      printf "\nPulling image \"%s\" since it was not found locally.\n\n" "${IMAGE}"
       docker pull "${IMAGE}"
       kind load docker-image --name "${CLUSTER_NAME}" "${IMAGE}"
     else