Posted to commits@solr.apache.org by ho...@apache.org on 2023/08/31 18:02:06 UTC

[solr-operator] branch main updated: Fix rolling restarts for ephemeral SolrClouds (#614)

This is an automated email from the ASF dual-hosted git repository.

houston pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/solr-operator.git


The following commit(s) were added to refs/heads/main by this push:
     new 45665b5  Fix rolling restarts for ephemeral SolrClouds (#614)
45665b5 is described below

commit 45665b5eb186f78c6e622b4d5e092d7575962406
Author: Houston Putman <ho...@apache.org>
AuthorDate: Thu Aug 31 14:01:58 2023 -0400

    Fix rolling restarts for ephemeral SolrClouds (#614)
    
    A bug appeared when replica migration failed and needed to retry.
---
 controllers/solr_cluster_ops_util.go      |  29 +++--
 controllers/solr_pod_lifecycle_util.go    |   1 +
 controllers/util/solr_update_util.go      | 176 ++++++++++++++++++------------
 controllers/util/solr_update_util_test.go |  82 ++++++++------
 helm/solr-operator/Chart.yaml             |   5 +
 5 files changed, 180 insertions(+), 113 deletions(-)

diff --git a/controllers/solr_cluster_ops_util.go b/controllers/solr_cluster_ops_util.go
index 96859a0..61d6aed 100644
--- a/controllers/solr_cluster_ops_util.go
+++ b/controllers/solr_cluster_ops_util.go
@@ -321,21 +321,32 @@ func handleManagedCloudRollingUpdate(ctx context.Context, r *SolrCloudReconciler
 			updateLogger.Info("Pod killed for update.", "pod", pod.Name, "reason", "The solr container in the pod has not yet started, thus it is safe to update.")
 		}
 
-		// Pick which pods should be deleted for an update.
 		// Don't exit on an error, which would only occur because of an HTTP Exception. Requeue later instead.
-		additionalPodsToUpdate, podsHaveReplicas, retryLater, clusterStateError :=
-			util.DeterminePodsSafeToUpdate(ctx, instance, int(*statefulSet.Spec.Replicas), outOfDatePods, hasReadyPod, availableUpdatedPodCount, updateLogger)
-		// If we do not have the clusterState, it's not safe to update pods that are running
-		if clusterStateError != nil {
-			retryLater = true
-		} else {
+		// We won't kill pods that we need the cluster state for, but we can kill the pods that are already not running.
+		// This is important for scenarios where there is a bad pod config and nothing is running, but we need to do
+		// a restart to get a working pod config.
+		state, retryLater, apiError := util.GetNodeReplicaState(ctx, instance, hasReadyPod, logger)
+		if apiError != nil {
+			return false, true, 0, apiError
+		} else if !retryLater {
+			// If the cluster status has been successfully fetched, then add the pods scheduled for deletion
+			// This requires the clusterState to be fetched successfully to ensure that we know if there
+			// are replicas living on the pod
 			podsToUpdate = append(podsToUpdate, outOfDatePods.ScheduledForDeletion...)
-			podsToUpdate = append(podsToUpdate, additionalPodsToUpdate...)
+
+			// Pick which pods should be deleted for an update.
+			var additionalPodsToUpdate []corev1.Pod
+			additionalPodsToUpdate, retryLater =
+				util.DeterminePodsSafeToUpdate(instance, int(*statefulSet.Spec.Replicas), outOfDatePods, state, availableUpdatedPodCount, updateLogger)
+			// If we do not have the clusterState, it's not safe to update pods that are running
+			if !retryLater {
+				podsToUpdate = append(podsToUpdate, additionalPodsToUpdate...)
+			}
 		}
 
 		// Only actually delete a running pod if it has been evicted, or doesn't need eviction (persistent storage)
 		for _, pod := range podsToUpdate {
-			retryLaterDurationTemp, inProgTmp, errTemp := DeletePodForUpdate(ctx, r, instance, &pod, podsHaveReplicas[pod.Name], updateLogger)
+			retryLaterDurationTemp, inProgTmp, errTemp := DeletePodForUpdate(ctx, r, instance, &pod, state.PodHasReplicas(instance, pod.Name), updateLogger)
 			requestInProgress = requestInProgress || inProgTmp
 
 			// Use the retryLaterDuration of the pod that requires a retry the soonest (smallest duration > 0)
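
For readers following the hunk above, here is a minimal, self-contained sketch of the reordered flow. The types and the fetchState helper are simplified placeholders, not the operator's real signatures: the cluster state is fetched once up front, pods whose Solr container never started can be restarted regardless, and pods that may still host replicas are only considered once the state fetch succeeds.

package main

import "fmt"

// Simplified placeholder types; the operator's real types live in the
// controllers and controllers/util packages.
type Pod struct{ Name string }

type NodeReplicaState struct {
	ReplicasPerPod map[string]int // pod name -> replicas hosted on that pod
}

func (s NodeReplicaState) PodHasReplicas(name string) bool {
	return s.ReplicasPerPod[name] > 0
}

// fetchState stands in for util.GetNodeReplicaState: it can fail outright
// (apiErr) or ask the caller to retry later, e.g. when no pod is ready to
// answer the CLUSTERSTATUS request.
func fetchState(hasReadyPod bool) (state NodeReplicaState, retryLater bool, apiErr error) {
	if !hasReadyPod {
		return NodeReplicaState{}, true, nil
	}
	return NodeReplicaState{ReplicasPerPod: map[string]int{"solr-0": 3, "solr-1": 0}}, false, nil
}

func rollingUpdate(notStarted, scheduledForDeletion, running []Pod, hasReadyPod bool) error {
	var podsToUpdate []Pod

	// Pods whose Solr container never started are always safe to delete,
	// even when the cluster state cannot be fetched.
	podsToUpdate = append(podsToUpdate, notStarted...)

	state, retryLater, apiErr := fetchState(hasReadyPod)
	if apiErr != nil {
		return apiErr // surface the API error; the reconciler requeues
	}
	if !retryLater {
		// Only a successful state fetch tells us whether these pods still
		// host replicas, so only then are they added.
		podsToUpdate = append(podsToUpdate, scheduledForDeletion...)
		podsToUpdate = append(podsToUpdate, running...) // safety checks elided
	}

	for _, pod := range podsToUpdate {
		fmt.Printf("deleting %s (hasReplicas=%v)\n", pod.Name, state.PodHasReplicas(pod.Name))
	}
	return nil
}

func main() {
	_ = rollingUpdate([]Pod{{Name: "solr-2"}}, []Pod{{Name: "solr-1"}}, []Pod{{Name: "solr-0"}}, true)
}

Splitting the state fetch out of DeterminePodsSafeToUpdate is what lets a restart proceed when nothing is running and the CLUSTERSTATUS call cannot be answered, which is the scenario the comment in the hunk describes.
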
diff --git a/controllers/solr_pod_lifecycle_util.go b/controllers/solr_pod_lifecycle_util.go
index 8af455b..adf289e 100644
--- a/controllers/solr_pod_lifecycle_util.go
+++ b/controllers/solr_pod_lifecycle_util.go
@@ -101,6 +101,7 @@ func DeletePodForUpdate(ctx context.Context, r *SolrCloudReconciler, instance *s
 
 	// Delete the pod
 	if deletePod {
+		logger.Error(err, "Deleting solr pod for update", "pod", pod.Name)
 		err = r.Delete(ctx, pod, client.Preconditions{
 			UID: &pod.UID,
 		})
diff --git a/controllers/util/solr_update_util.go b/controllers/util/solr_update_util.go
index 7d39cd9..4f9eebe 100644
--- a/controllers/util/solr_update_util.go
+++ b/controllers/util/solr_update_util.go
@@ -96,6 +96,59 @@ func (seg OutOfDatePodSegmentation) IsEmpty() bool {
 	return len(seg.NotStarted)+len(seg.ScheduledForDeletion)+len(seg.Running) == 0
 }
 
+type NodeReplicaState struct {
+	// A map of Solr node name (not pod name) to the contents of that Solr Node
+	NodeContents map[string]*SolrNodeContents
+	// A map of unique shard name (collection + shard) to the number of replicas for that shard
+	TotalShardReplicas map[string]int
+	// A map of unique shard name (collection + shard) to the number of non-active replicas for that shard
+	ShardReplicasNotActive map[string]int
+	// Whether all pods are live in the cluster state
+	AllManagedPodsLive bool
+}
+
+// PodContents is a helper method to get the node contents for a particular pod
+func (state NodeReplicaState) PodContents(cloud *solr.SolrCloud, podName string) (contents *SolrNodeContents, isInClusterState bool) {
+	contents, isInClusterState = state.NodeContents[SolrNodeName(cloud, podName)]
+	return
+}
+
+// PodHasReplicas is a helper method to retrieve whether a pod has replicas living on it
+func (state NodeReplicaState) PodHasReplicas(cloud *solr.SolrCloud, podName string) bool {
+	contents, isInClusterState := state.PodContents(cloud, podName)
+	return isInClusterState && contents.replicas > 0
+}
+
+func GetNodeReplicaState(ctx context.Context, cloud *solr.SolrCloud, hasReadyPod bool, logger logr.Logger) (state NodeReplicaState, retryLater bool, err error) {
+	clusterResp := &solr_api.SolrClusterStatusResponse{}
+	overseerResp := &solr_api.SolrOverseerStatusResponse{}
+
+	if hasReadyPod {
+		queryParams := url.Values{}
+		queryParams.Add("action", "CLUSTERSTATUS")
+		err = solr_api.CallCollectionsApi(ctx, cloud, queryParams, clusterResp)
+		if _, apiErr := solr_api.CheckForCollectionsApiError("CLUSTERSTATUS", clusterResp.ResponseHeader, clusterResp.Error); apiErr != nil {
+			err = apiErr
+		}
+		if err == nil {
+			queryParams.Set("action", "OVERSEERSTATUS")
+			err = solr_api.CallCollectionsApi(ctx, cloud, queryParams, overseerResp)
+			if _, apiErr := solr_api.CheckForCollectionsApiError("OVERSEERSTATUS", overseerResp.ResponseHeader, overseerResp.Error); apiErr != nil {
+				err = apiErr
+			}
+		}
+		if err == nil {
+			state = findSolrNodeContents(clusterResp.ClusterStatus, overseerResp.Leader, GetAllManagedSolrNodeNames(cloud))
+		} else {
+			logger.Error(err, "Could not fetch cluster state information for cloud")
+		}
+	} else {
+		retryLater = true
+	}
+
+	return
+}
+
 // DeterminePodsSafeToUpdate takes a list of solr Pods and returns a list of pods that are safe to upgrade now.
 // This function MUST be idempotent and return the same list of pods given the same kubernetes/solr state.
 //
@@ -105,50 +158,37 @@ func (seg OutOfDatePodSegmentation) IsEmpty() bool {
 // TODO:
 //   - Think about caching this for ~250 ms? Not a huge need to send these requests milliseconds apart.
 //   - Might be too much complexity for very little gain.
-func DeterminePodsSafeToUpdate(ctx context.Context, cloud *solr.SolrCloud, totalPods int, outOfDatePods OutOfDatePodSegmentation, hasReadyPod bool, availableUpdatedPodCount int, logger logr.Logger) (podsToUpdate []corev1.Pod, podsHaveReplicas map[string]bool, retryLater bool, err error) {
+func DeterminePodsSafeToUpdate(cloud *solr.SolrCloud, totalPods int, outOfDatePods OutOfDatePodSegmentation, state NodeReplicaState, availableUpdatedPodCount int, logger logr.Logger) (podsToUpdate []corev1.Pod, retryLater bool) {
 	// Before fetching the cluster state, be sure that there is room to update at least 1 pod
 	maxPodsUnavailable, unavailableUpdatedPodCount, maxPodsToUpdate := calculateMaxPodsToUpdate(cloud, totalPods, len(outOfDatePods.Running), len(outOfDatePods.NotStarted)+len(outOfDatePods.ScheduledForDeletion), availableUpdatedPodCount)
+
 	if maxPodsToUpdate <= 0 {
-		logger.Info("Pod update selection canceled. The number of updated pods unavailable equals or exceeds the calculated maxPodsUnavailable.",
-			"unavailableUpdatedPods", unavailableUpdatedPodCount, "outOfDatePodsNotStarted", len(outOfDatePods.NotStarted), "alreadyScheduledForDeletion", len(outOfDatePods.ScheduledForDeletion), "maxPodsUnavailable", maxPodsUnavailable)
-	} else {
-		clusterResp := &solr_api.SolrClusterStatusResponse{}
-		overseerResp := &solr_api.SolrOverseerStatusResponse{}
-
-		if hasReadyPod {
-			queryParams := url.Values{}
-			queryParams.Add("action", "CLUSTERSTATUS")
-			err = solr_api.CallCollectionsApi(ctx, cloud, queryParams, clusterResp)
-			if err == nil {
-				queryParams.Set("action", "OVERSEERSTATUS")
-				err = solr_api.CallCollectionsApi(ctx, cloud, queryParams, overseerResp)
-				if _, apiErr := solr_api.CheckForCollectionsApiError("OVERSEERSTATUS", overseerResp.ResponseHeader, overseerResp.Error); apiErr != nil {
-					err = apiErr
-				}
-			}
-			if err != nil {
-				logger.Error(err, "Error retrieving cluster status, delaying pod update selection")
-			}
-		}
-		// If the update logic already wants to retry later, then do not pick any pods
-		if !retryLater {
-			logger.Info("Pod update selection started.",
-				"outOfDatePods", len(outOfDatePods.Running),
-				"maxPodsUnavailable", maxPodsUnavailable,
-				"unavailableUpdatedPods", unavailableUpdatedPodCount,
-				"outOfDatePodsNotStarted", len(outOfDatePods.NotStarted),
-				"alreadyScheduledForDeletion", len(outOfDatePods.ScheduledForDeletion),
-				"maxPodsToUpdate", maxPodsToUpdate)
-			podsToUpdate, podsHaveReplicas = pickPodsToUpdate(cloud, outOfDatePods, clusterResp.ClusterStatus, overseerResp.Leader, maxPodsToUpdate, logger)
-
-			// If there are no pods to upgrade, even though the maxPodsToUpdate is >0, then retry later because the issue stems from cluster state
-			// and clusterState changes will not call the reconciler.
-			if len(podsToUpdate) == 0 && len(outOfDatePods.Running) > 0 {
-				retryLater = true
-			}
+		logger.Info("Pod update selection not started. The number of unavailable pods unavailable (or scheduled for deletion) equals or exceeds the calculated maxPodsUnavailable.",
+			"outOfDatePods", len(outOfDatePods.Running),
+			"maxPodsUnavailable", maxPodsUnavailable,
+			"unavailableUpdatedPods", unavailableUpdatedPodCount,
+			"outOfDatePodsNotStarted", len(outOfDatePods.NotStarted),
+			"alreadyScheduledForDeletion", len(outOfDatePods.ScheduledForDeletion))
+		retryLater = true
+	}
+	// If the update logic already wants to retry later, then do not pick any pods
+	if !retryLater {
+		logger.Info("Pod update selection started.",
+			"outOfDatePods", len(outOfDatePods.Running),
+			"maxPodsUnavailable", maxPodsUnavailable,
+			"unavailableUpdatedPods", unavailableUpdatedPodCount,
+			"outOfDatePodsNotStarted", len(outOfDatePods.NotStarted),
+			"alreadyScheduledForDeletion", len(outOfDatePods.ScheduledForDeletion),
+			"maxPodsToUpdate", maxPodsToUpdate)
+		podsToUpdate = pickPodsToUpdate(cloud, outOfDatePods, state, maxPodsToUpdate, logger)
+
+		// If there are no pods to upgrade, even though the maxPodsToUpdate is >0, then retry later because the issue stems from cluster state
+		// and clusterState changes will not call the reconciler.
+		if len(podsToUpdate) == 0 && len(outOfDatePods.Running) > 0 {
+			retryLater = true
 		}
 	}
-	return podsToUpdate, podsHaveReplicas, retryLater, err
+	return podsToUpdate, retryLater
 }
 
 // calculateMaxPodsToUpdate determines the maximum number of additional pods that can be updated.
@@ -164,38 +204,32 @@ func calculateMaxPodsToUpdate(cloud *solr.SolrCloud, totalPods int, outOfDatePod
 	return maxPodsUnavailable, unavailableUpdatedPodCount, maxPodsToUpdate
 }
 
-func pickPodsToUpdate(cloud *solr.SolrCloud, outOfDatePods OutOfDatePodSegmentation, clusterStatus solr_api.SolrClusterStatus,
-	overseer string, maxPodsToUpdate int, logger logr.Logger) (podsToUpdate []corev1.Pod, podsHaveReplicas map[string]bool) {
-	podsHaveReplicas = make(map[string]bool, maxPodsToUpdate)
-	nodeContents, totalShardReplicas, shardReplicasNotActive, allManagedPodsLive := findSolrNodeContents(clusterStatus, overseer, GetAllManagedSolrNodeNames(cloud))
-	sortNodePodsBySafety(outOfDatePods.Running, nodeContents, cloud)
+func pickPodsToUpdate(cloud *solr.SolrCloud, outOfDatePods OutOfDatePodSegmentation, state NodeReplicaState, maxPodsToUpdate int, logger logr.Logger) (podsToUpdate []corev1.Pod) {
+	sortNodePodsBySafety(outOfDatePods.Running, state.NodeContents, cloud)
 
 	updateOptions := cloud.Spec.UpdateStrategy.ManagedUpdateOptions
 	var maxShardReplicasUnavailableCache map[string]int
 	// In case the user wants all shardReplicas to be unavailable at the same time, populate the cache with the total number of replicas per shard.
 	if updateOptions.MaxShardReplicasUnavailable != nil && updateOptions.MaxShardReplicasUnavailable.Type == intstr.Int && updateOptions.MaxShardReplicasUnavailable.IntVal <= int32(0) {
-		maxShardReplicasUnavailableCache = totalShardReplicas
+		maxShardReplicasUnavailableCache = state.TotalShardReplicas
 	} else {
-		maxShardReplicasUnavailableCache = make(map[string]int, len(totalShardReplicas))
+		maxShardReplicasUnavailableCache = make(map[string]int, len(state.TotalShardReplicas))
 	}
 
 	for _, pod := range outOfDatePods.ScheduledForDeletion {
-		nodeName := SolrNodeName(cloud, pod.Name)
-		nodeContent, isInClusterState := nodeContents[nodeName]
+		nodeContent, isInClusterState := state.PodContents(cloud, pod.Name)
 
 		// This pod will be deleted, add its information to future down shards
-		podsHaveReplicas[pod.Name] = isInClusterState && nodeContent.replicas > 0
 		if isInClusterState && nodeContent.live {
 			for shard, additionalReplicaCount := range nodeContent.activeReplicasPerShard {
-				shardReplicasNotActive[shard] += additionalReplicaCount
+				state.ShardReplicasNotActive[shard] += additionalReplicaCount
 			}
 		}
 	}
 
 	for _, pod := range outOfDatePods.Running {
 		isSafeToUpdate := true
-		nodeName := SolrNodeName(cloud, pod.Name)
-		nodeContent, isInClusterState := nodeContents[nodeName]
+		nodeContent, isInClusterState := state.PodContents(cloud, pod.Name)
 		var reason string
 		// The overseerLeader can only be upgraded by itself
 		if !isInClusterState || !nodeContent.InClusterState() {
@@ -210,7 +244,7 @@ func pickPodsToUpdate(cloud *solr.SolrCloud, outOfDatePods OutOfDatePodSegmentat
 				// But we want to make sure it still follows the same replicasDown rules as the other nodes, so still use that logic
 				// This works if there are other solr nodes not managed by this SolrCloud resource, because we just check that this is the last
 				// pod managed for this SolrCloud that has not been updated.
-				if len(outOfDatePods.Running) == 1 && allManagedPodsLive {
+				if len(outOfDatePods.Running) == 1 && state.AllManagedPodsLive {
 					isSafeToUpdate = true
 					reason = "Pod is overseer and all other nodes have been updated."
 				} else {
@@ -231,10 +265,10 @@ func pickPodsToUpdate(cloud *solr.SolrCloud, outOfDatePods OutOfDatePodSegmentat
 							continue
 						}
 
-						notActiveReplicaCount, _ := shardReplicasNotActive[shard]
+						notActiveReplicaCount, _ := state.ShardReplicasNotActive[shard]
 
 						// If the maxBatchNodeUpgradeSpec is passed as a decimal between 0 and 1, then calculate as a percentage of the number of nodes
-						maxShardReplicasDown, _ := ResolveMaxShardReplicasUnavailable(updateOptions.MaxShardReplicasUnavailable, shard, totalShardReplicas, maxShardReplicasUnavailableCache)
+						maxShardReplicasDown, _ := ResolveMaxShardReplicasUnavailable(updateOptions.MaxShardReplicasUnavailable, shard, state.TotalShardReplicas, maxShardReplicasUnavailableCache)
 
 						// We have to allow killing of Pods that have multiple replicas of a shard
 						// Therefore only check the additional Replica count if some replicas of that shard are already being upgraded
@@ -257,12 +291,11 @@ func pickPodsToUpdate(cloud *solr.SolrCloud, outOfDatePods OutOfDatePodSegmentat
 			// If the node is not "live", then the replicas on that node will have already been counted as "not active".
 			if isInClusterState && nodeContent.live {
 				for shard, additionalReplicaCount := range nodeContent.activeReplicasPerShard {
-					shardReplicasNotActive[shard] += additionalReplicaCount
+					state.ShardReplicasNotActive[shard] += additionalReplicaCount
 				}
 			}
 			logger.Info("Pod selected to be deleted for update.", "pod", pod.Name, "reason", reason)
 			podsToUpdate = append(podsToUpdate, pod)
-			podsHaveReplicas[pod.Name] = isInClusterState && nodeContent.replicas > 0
 
 			// Stop after the maxBatchNodeUpdate count, if one is provided.
 			if maxPodsToUpdate >= 1 && len(podsToUpdate) >= maxPodsToUpdate {
@@ -273,7 +306,7 @@ func pickPodsToUpdate(cloud *solr.SolrCloud, outOfDatePods OutOfDatePodSegmentat
 			logger.Info("Pod not able to be killed for update.", "pod", pod.Name, "reason", reason)
 		}
 	}
-	return podsToUpdate, podsHaveReplicas
+	return podsToUpdate
 }
 
 func sortNodePodsBySafety(outOfDatePods []corev1.Pod, nodeMap map[string]*SolrNodeContents, solrCloud *solr.SolrCloud) {
@@ -364,13 +397,13 @@ This aggregated info is returned as:
   - A map from unique shard name (collection+shard) to the count of replicas that are not active for that shard.
   - If a node is not live, then all shards that live on that node will be considered "not active"
 */
-func findSolrNodeContents(cluster solr_api.SolrClusterStatus, overseerLeader string, managedSolrNodeNames map[string]bool) (nodeContents map[string]*SolrNodeContents, totalShardReplicas map[string]int, shardReplicasNotActive map[string]int, allManagedPodsLive bool) {
-	nodeContents = make(map[string]*SolrNodeContents, 0)
-	totalShardReplicas = make(map[string]int, 0)
-	shardReplicasNotActive = make(map[string]int, 0)
+func findSolrNodeContents(cluster solr_api.SolrClusterStatus, overseerLeader string, managedSolrNodeNames map[string]bool) (state NodeReplicaState) {
+	state.NodeContents = make(map[string]*SolrNodeContents, 0)
+	state.TotalShardReplicas = make(map[string]int, 0)
+	state.ShardReplicasNotActive = make(map[string]int, 0)
 	// Update the info for each "live" node.
 	for _, nodeName := range cluster.LiveNodes {
-		contents, hasValue := nodeContents[nodeName]
+		contents, hasValue := state.NodeContents[nodeName]
 		delete(managedSolrNodeNames, nodeName)
 		if !hasValue {
 			contents = &SolrNodeContents{
@@ -386,15 +419,15 @@ func findSolrNodeContents(cluster solr_api.SolrClusterStatus, overseerLeader str
 		} else {
 			contents.live = true
 		}
-		nodeContents[nodeName] = contents
+		state.NodeContents[nodeName] = contents
 	}
 	// Go through the state of each collection getting the count of replicas for each collection/shard living on each node
 	for collectionName, collection := range cluster.Collections {
 		for shardName, shard := range collection.Shards {
 			uniqueShard := collectionName + "|" + shardName
-			totalShardReplicas[uniqueShard] = len(shard.Replicas)
+			state.TotalShardReplicas[uniqueShard] = len(shard.Replicas)
 			for _, replica := range shard.Replicas {
-				contents, hasValue := nodeContents[replica.NodeName]
+				contents, hasValue := state.NodeContents[replica.NodeName]
 				if !hasValue {
 					contents = &SolrNodeContents{
 						nodeName:               replica.NodeName,
@@ -415,7 +448,7 @@ func findSolrNodeContents(cluster solr_api.SolrClusterStatus, overseerLeader str
 
 				// A replica can be considered "not active" if it's state is not "active" or the node it lives in is not "live".
 				if !(replica.State == solr_api.ReplicaActive && contents.live) {
-					shardReplicasNotActive[uniqueShard] += 1
+					state.ShardReplicasNotActive[uniqueShard] += 1
 				}
 				if replica.State == solr_api.ReplicaActive {
 					contents.activeReplicasPerShard[uniqueShard] += 1
@@ -427,13 +460,13 @@ func findSolrNodeContents(cluster solr_api.SolrClusterStatus, overseerLeader str
 					contents.notDownReplicas += 1
 				}
 
-				nodeContents[replica.NodeName] = contents
+				state.NodeContents[replica.NodeName] = contents
 			}
 		}
 	}
 	// Update the info for the overseerLeader leader.
 	if overseerLeader != "" {
-		contents, hasValue := nodeContents[overseerLeader]
+		contents, hasValue := state.NodeContents[overseerLeader]
 		if !hasValue {
 			contents = &SolrNodeContents{
 				nodeName:               overseerLeader,
@@ -447,9 +480,10 @@ func findSolrNodeContents(cluster solr_api.SolrClusterStatus, overseerLeader str
 		} else {
 			contents.overseerLeader = true
 		}
-		nodeContents[overseerLeader] = contents
+		state.NodeContents[overseerLeader] = contents
 	}
-	return nodeContents, totalShardReplicas, shardReplicasNotActive, len(managedSolrNodeNames) == 0
+	state.AllManagedPodsLive = len(managedSolrNodeNames) == 0
+	return state
 }
 
 type SolrNodeContents struct {
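
The aggregation in findSolrNodeContents above is keyed by the unique shard name "collection|shard", and a replica is counted as not active when its state is not "active" or the node hosting it is not live. A small standalone illustration of that counting, using simplified stand-in types rather than the operator's solr_api structs (the two maps correspond to NodeReplicaState.TotalShardReplicas and NodeReplicaState.ShardReplicasNotActive):

package main

import "fmt"

// Simplified stand-ins for the cluster-status types.
type Replica struct {
	NodeName string
	State    string // "active", "recovering", "down", ...
}
type Shard struct{ Replicas []Replica }
type Collection struct{ Shards map[string]Shard }
type ClusterStatus struct {
	LiveNodes   []string
	Collections map[string]Collection
}

// aggregate mirrors the counting done in findSolrNodeContents: total replicas
// per unique shard, and replicas that are not active (either not in the
// "active" state or hosted on a node that is not live).
func aggregate(cluster ClusterStatus) (total, notActive map[string]int) {
	total = map[string]int{}
	notActive = map[string]int{}
	live := map[string]bool{}
	for _, n := range cluster.LiveNodes {
		live[n] = true
	}
	for collName, coll := range cluster.Collections {
		for shardName, shard := range coll.Shards {
			uniqueShard := collName + "|" + shardName
			total[uniqueShard] = len(shard.Replicas)
			for _, r := range shard.Replicas {
				if r.State != "active" || !live[r.NodeName] {
					notActive[uniqueShard]++
				}
			}
		}
	}
	return total, notActive
}

func main() {
	cluster := ClusterStatus{
		LiveNodes: []string{"node-0"},
		Collections: map[string]Collection{
			"col1": {Shards: map[string]Shard{
				"shard1": {Replicas: []Replica{
					{NodeName: "node-0", State: "active"},
					{NodeName: "node-1", State: "active"}, // node-1 is not live
					{NodeName: "node-0", State: "down"},
				}},
			}},
		},
	}
	total, notActive := aggregate(cluster)
	fmt.Println(total)     // map[col1|shard1:3]
	fmt.Println(notActive) // map[col1|shard1:2]
}
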
diff --git a/controllers/util/solr_update_util_test.go b/controllers/util/solr_update_util_test.go
index 3729dbf..0336aff 100644
--- a/controllers/util/solr_update_util_test.go
+++ b/controllers/util/solr_update_util_test.go
@@ -104,23 +104,23 @@ func TestPickPodsToUpgrade(t *testing.T) {
 	*/
 
 	// Normal inputs
+	testDownClusterState := findSolrNodeContents(testDownClusterStatus, overseerLeader, GetAllManagedSolrNodeNames(solrCloud))
 	maxshardReplicasUnavailable = intstr.FromInt(1)
-	podsToUpgradeDetailed, podsHaveReplicas := pickPodsToUpdate(solrCloud, allPods, testDownClusterStatus, overseerLeader, 6, log)
-	assert.Equal(t, podsHaveReplicas, map[string]bool{"foo-solrcloud-2": true, "foo-solrcloud-6": false})
+	podsToUpgradeDetailed := pickPodsToUpdate(solrCloud, allPods, testDownClusterState, 6, log)
 	podsToUpgrade := getPodNames(podsToUpgradeDetailed)
 	assert.ElementsMatch(t, []string{"foo-solrcloud-2", "foo-solrcloud-6"}, podsToUpgrade, "Incorrect set of next pods to upgrade. Do to the down/non-live replicas, only the node without replicas and one more can be upgraded.")
 
 	// Test the maxBatchNodeUpgradeSpec
+	testDownClusterState = findSolrNodeContents(testDownClusterStatus, overseerLeader, GetAllManagedSolrNodeNames(solrCloud))
 	maxshardReplicasUnavailable = intstr.FromInt(1)
-	podsToUpgradeDetailed, podsHaveReplicas = pickPodsToUpdate(solrCloud, allPods, testDownClusterStatus, overseerLeader, 1, log)
-	assert.Equal(t, podsHaveReplicas, map[string]bool{"foo-solrcloud-6": false})
+	podsToUpgradeDetailed = pickPodsToUpdate(solrCloud, allPods, testDownClusterState, 1, log)
 	podsToUpgrade = getPodNames(podsToUpgradeDetailed)
 	assert.ElementsMatch(t, []string{"foo-solrcloud-6"}, podsToUpgrade, "Incorrect set of next pods to upgrade. Only 1 node should be upgraded when maxBatchNodeUpgradeSpec=1")
 
 	// Test the maxShardReplicasDownSpec
+	testDownClusterState = findSolrNodeContents(testDownClusterStatus, overseerLeader, GetAllManagedSolrNodeNames(solrCloud))
 	maxshardReplicasUnavailable = intstr.FromInt(2)
-	podsToUpgradeDetailed, podsHaveReplicas = pickPodsToUpdate(solrCloud, allPods, testDownClusterStatus, overseerLeader, 6, log)
-	assert.Equal(t, podsHaveReplicas, map[string]bool{"foo-solrcloud-2": true, "foo-solrcloud-3": true, "foo-solrcloud-4": true, "foo-solrcloud-6": false})
+	podsToUpgradeDetailed = pickPodsToUpdate(solrCloud, allPods, testDownClusterState, 6, log)
 	podsToUpgrade = getPodNames(podsToUpgradeDetailed)
 	assert.ElementsMatch(t, []string{"foo-solrcloud-2", "foo-solrcloud-3", "foo-solrcloud-4", "foo-solrcloud-6"}, podsToUpgrade, "Incorrect set of next pods to upgrade.")
 
@@ -129,23 +129,27 @@ func TestPickPodsToUpgrade(t *testing.T) {
 	*/
 
 	// Normal inputs
+	testRecoveringClusterState := findSolrNodeContents(testRecoveringClusterStatus, overseerLeader, GetAllManagedSolrNodeNames(solrCloud))
 	maxshardReplicasUnavailable = intstr.FromInt(1)
-	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, allPods, testRecoveringClusterStatus, overseerLeader, 6, log))
+	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, allPods, testRecoveringClusterState, 6, log))
 	assert.ElementsMatch(t, []string{"foo-solrcloud-4", "foo-solrcloud-6"}, podsToUpgrade, "Incorrect set of next pods to upgrade. Do to the recovering/down/non-live replicas, only the non-live node and node without replicas can be upgraded.")
 
 	// Test the maxBatchNodeUpgradeSpec
+	testRecoveringClusterState = findSolrNodeContents(testRecoveringClusterStatus, overseerLeader, GetAllManagedSolrNodeNames(solrCloud))
 	maxshardReplicasUnavailable = intstr.FromInt(1)
-	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, allPods, testRecoveringClusterStatus, overseerLeader, 1, log))
+	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, allPods, testRecoveringClusterState, 1, log))
 	assert.ElementsMatch(t, []string{"foo-solrcloud-4"}, podsToUpgrade, "Incorrect set of next pods to upgrade. Only 1 node should be upgraded when maxBatchNodeUpgradeSpec=1, and it should be the non-live node.")
 
 	// Test the maxShardReplicasDownSpec
+	testRecoveringClusterState = findSolrNodeContents(testRecoveringClusterStatus, overseerLeader, GetAllManagedSolrNodeNames(solrCloud))
 	maxshardReplicasUnavailable = intstr.FromInt(2)
-	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, allPods, testRecoveringClusterStatus, overseerLeader, 6, log))
+	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, allPods, testRecoveringClusterState, 6, log))
 	assert.ElementsMatch(t, []string{"foo-solrcloud-2", "foo-solrcloud-3", "foo-solrcloud-4", "foo-solrcloud-6"}, podsToUpgrade, "Incorrect set of next pods to upgrade. More nodes should be upgraded when maxShardReplicasDown=2")
 
 	// The overseer should be upgraded when given enough leeway
+	testDownClusterState = findSolrNodeContents(testDownClusterStatus, overseerLeader, GetAllManagedSolrNodeNames(solrCloud))
 	maxshardReplicasUnavailable = intstr.FromString("50%")
-	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, lastPod, testDownClusterStatus, overseerLeader, 2, log))
+	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, lastPod, testDownClusterState, 2, log))
 	assert.ElementsMatch(t, []string{"foo-solrcloud-0"}, podsToUpgrade, "Incorrect set of next pods to upgrade. The last pod, the overseer, should be chosen because it has been given enough leeway.")
 
 	/*
@@ -153,18 +157,21 @@ func TestPickPodsToUpgrade(t *testing.T) {
 	*/
 
 	// Normal inputs
+	testHealthyClusterState := findSolrNodeContents(testHealthyClusterStatus, overseerLeader, GetAllManagedSolrNodeNames(solrCloud))
 	maxshardReplicasUnavailable = intstr.FromInt(1)
-	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, halfPods, testHealthyClusterStatus, overseerLeader, 6, log))
+	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, halfPods, testHealthyClusterState, 6, log))
 	assert.ElementsMatch(t, []string{"foo-solrcloud-1"}, podsToUpgrade, "Incorrect set of next pods to upgrade. Do to replica placement, only the node with the least leaders can be upgraded and replicas.")
 
 	// Test the maxShardReplicasDownSpec
+	testHealthyClusterState = findSolrNodeContents(testHealthyClusterStatus, overseerLeader, GetAllManagedSolrNodeNames(solrCloud))
 	maxshardReplicasUnavailable = intstr.FromInt(2)
-	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, halfPods, testHealthyClusterStatus, overseerLeader, 6, log))
+	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, halfPods, testHealthyClusterState, 6, log))
 	assert.ElementsMatch(t, []string{"foo-solrcloud-1", "foo-solrcloud-5"}, podsToUpgrade, "Incorrect set of next pods to upgrade. More nodes should be upgraded when maxShardReplicasDown=2")
 
 	// The overseer should be upgraded when given enough leeway
+	testHealthyClusterState = findSolrNodeContents(testHealthyClusterStatus, overseerLeader, GetAllManagedSolrNodeNames(solrCloud))
 	maxshardReplicasUnavailable = intstr.FromString("50%")
-	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, lastPod, testDownClusterStatus, overseerLeader, 2, log))
+	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, lastPod, testHealthyClusterState, 2, log))
 	assert.ElementsMatch(t, []string{"foo-solrcloud-0"}, podsToUpgrade, "Incorrect set of next pods to upgrade. The last pod, the overseer, should be chosen because it has been given enough leeway.")
 
 	/*
@@ -172,22 +179,26 @@ func TestPickPodsToUpgrade(t *testing.T) {
 	*/
 
 	// The overseer should be not be upgraded if the clusterstate is not healthy enough
+	testRecoveringClusterState = findSolrNodeContents(testRecoveringClusterStatus, overseerLeader, GetAllManagedSolrNodeNames(solrCloud))
 	maxshardReplicasUnavailable = intstr.FromInt(1)
-	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, lastPod, testRecoveringClusterStatus, overseerLeader, 3, log))
+	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, lastPod, testRecoveringClusterState, 3, log))
 	assert.ElementsMatch(t, []string{}, podsToUpgrade, "Incorrect set of next pods to upgrade. The overseer should be not be upgraded if the clusterstate is not healthy enough.")
 
 	// The overseer should be not be upgraded if the clusterstate is not healthy enough
-	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, lastPod, testRecoveringClusterStatus, overseerLeader, 6, log))
+	testRecoveringClusterState = findSolrNodeContents(testRecoveringClusterStatus, overseerLeader, GetAllManagedSolrNodeNames(solrCloud))
+	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, lastPod, testRecoveringClusterState, 6, log))
 	assert.ElementsMatch(t, []string{}, podsToUpgrade, "Incorrect set of next pods to upgrade. The overseer should be not be upgraded if there are other non-live nodes.")
 
 	// The overseer should be upgraded when given enough leeway
+	testDownClusterState = findSolrNodeContents(testDownClusterStatus, overseerLeader, GetAllManagedSolrNodeNames(solrCloud))
 	maxshardReplicasUnavailable = intstr.FromInt(2)
-	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, lastPod, testDownClusterStatus, overseerLeader, 6, log))
+	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, lastPod, testDownClusterState, 6, log))
 	assert.ElementsMatch(t, []string{"foo-solrcloud-0"}, podsToUpgrade, "Incorrect set of next pods to upgrade. The overseer should be upgraded when given enough leeway.")
 
 	// The overseer should be upgraded when everything is healthy and it is the last node
+	testHealthyClusterState = findSolrNodeContents(testHealthyClusterStatus, overseerLeader, GetAllManagedSolrNodeNames(solrCloud))
 	maxshardReplicasUnavailable = intstr.FromInt(1)
-	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, lastPod, testHealthyClusterStatus, overseerLeader, 6, log))
+	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, lastPod, testHealthyClusterState, 6, log))
 	assert.ElementsMatch(t, []string{"foo-solrcloud-0"}, podsToUpgrade, "Incorrect set of next pods to upgrade. The overseer should be upgraded when everything is healthy and it is the last node")
 
 	/*
@@ -195,15 +206,17 @@ func TestPickPodsToUpgrade(t *testing.T) {
 	*/
 
 	// The overseer should not be upgraded when everything is healthy and it is the last node but one pod is not in the live nodes
-	maxshardReplicasUnavailable = intstr.FromInt(1)
 	solrCloud.Spec.Replicas = Replicas(7)
-	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, lastPod, testHealthyClusterStatus, overseerLeader, 6, log))
+	testHealthyClusterState = findSolrNodeContents(testHealthyClusterStatus, overseerLeader, GetAllManagedSolrNodeNames(solrCloud))
+	maxshardReplicasUnavailable = intstr.FromInt(1)
+	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, lastPod, testHealthyClusterState, 6, log))
 	assert.Empty(t, podsToUpgrade, "Incorrect set of next pods to upgrade. The overseer should be not be upgraded when one of the managed pods is not live")
 
 	// The overseer should be upgraded when everything is healthy and it is the last node even though this SolrCloud resource doesn't manage all Nodes
-	maxshardReplicasUnavailable = intstr.FromInt(1)
 	solrCloud.Spec.Replicas = Replicas(4)
-	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, lastPod, testHealthyClusterStatus, overseerLeader, 6, log))
+	testHealthyClusterState = findSolrNodeContents(testHealthyClusterStatus, overseerLeader, GetAllManagedSolrNodeNames(solrCloud))
+	maxshardReplicasUnavailable = intstr.FromInt(1)
+	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, lastPod, testHealthyClusterState, 6, log))
 	assert.ElementsMatch(t, []string{"foo-solrcloud-0"}, podsToUpgrade, "Incorrect set of next pods to upgrade. The overseer should be upgraded when everything is healthy and it is the last node, even though this SolrCloud resource doesn't manage all Nodes")
 
 	/*
@@ -211,18 +224,21 @@ func TestPickPodsToUpgrade(t *testing.T) {
 	*/
 
 	// Normal inputs
+	testHealthyClusterState = findSolrNodeContents(testHealthyClusterStatus, overseerLeader, GetAllManagedSolrNodeNames(solrCloud))
 	maxshardReplicasUnavailable = intstr.FromInt(1)
-	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, someScheduledForDeletionPods, testHealthyClusterStatus, overseerLeader, 6, log))
+	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, someScheduledForDeletionPods, testHealthyClusterState, 6, log))
 	assert.ElementsMatch(t, []string{}, podsToUpgrade, "Incorrect set of next pods to upgrade. Due to replica placement, only the node with the least leaders can be upgraded and replicas.")
 
 	// Test the maxShardReplicasDownSpec
+	testHealthyClusterState = findSolrNodeContents(testHealthyClusterStatus, overseerLeader, GetAllManagedSolrNodeNames(solrCloud))
 	maxshardReplicasUnavailable = intstr.FromInt(2)
-	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, someScheduledForDeletionPods, testHealthyClusterStatus, overseerLeader, 6, log))
+	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, someScheduledForDeletionPods, testHealthyClusterState, 6, log))
 	assert.ElementsMatch(t, []string{"foo-solrcloud-1", "foo-solrcloud-3"}, podsToUpgrade, "Incorrect set of next pods to upgrade. More nodes should be upgraded when maxShardReplicasDown=2")
 
 	// Test the maxNodes
+	testHealthyClusterState = findSolrNodeContents(testHealthyClusterStatus, overseerLeader, GetAllManagedSolrNodeNames(solrCloud))
 	maxshardReplicasUnavailable = intstr.FromInt(2)
-	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, someScheduledForDeletionPods, testHealthyClusterStatus, overseerLeader, 2, log))
+	podsToUpgrade = getPodNames(pickPodsToUpdate(solrCloud, someScheduledForDeletionPods, testHealthyClusterState, 2, log))
 	assert.ElementsMatch(t, []string{"foo-solrcloud-1", "foo-solrcloud-3"}, podsToUpgrade, "Incorrect set of next pods to upgrade. More nodes should be upgraded when maxShardReplicasDown=2")
 
 }
@@ -350,21 +366,21 @@ func TestFindSolrNodeContents(t *testing.T) {
 	overseerLeader := "foo-solrcloud-0.foo-solrcloud-headless.default:2000_solr"
 
 	// Test allManagedPodsLive when true
-	_, _, _, allManagedPodsLive := findSolrNodeContents(testRecoveringClusterStatus, overseerLeader, map[string]bool{
+	state := findSolrNodeContents(testRecoveringClusterStatus, overseerLeader, map[string]bool{
 		"foo-solrcloud-2.foo-solrcloud-headless.default:2000_solr": true,
 		"foo-solrcloud-6.foo-solrcloud-headless.default:2000_solr": true,
 	})
-	assert.True(t, allManagedPodsLive, "allManagedPodsLive should be true, because both managed pods are live in cluster status")
+	assert.True(t, state.AllManagedPodsLive, "allManagedPodsLive should be true, because both managed pods are live in cluster status")
 
 	// Test allManagedPodsLive when false
-	_, _, _, allManagedPodsLive = findSolrNodeContents(testRecoveringClusterStatus, overseerLeader, map[string]bool{
+	state = findSolrNodeContents(testRecoveringClusterStatus, overseerLeader, map[string]bool{
 		"foo-solrcloud-2.foo-solrcloud-headless.default:2000_solr": true,
 		"foo-solrcloud-6.foo-solrcloud-headless.default:2000_solr": true,
 		"foo-solrcloud-4.foo-solrcloud-headless.default:2000_solr": true,
 	})
-	assert.False(t, allManagedPodsLive, "allManagedPodsLive should be false, because there is a managed pod that is not live")
+	assert.False(t, state.AllManagedPodsLive, "allManagedPodsLive should be false, because there is a managed pod that is not live")
 
-	nodeContents, totalShardReplicas, shardReplicasNotActive, _ := findSolrNodeContents(testRecoveringClusterStatus, overseerLeader, map[string]bool{})
+	state = findSolrNodeContents(testRecoveringClusterStatus, overseerLeader, map[string]bool{})
 
 	expectedNodeContents := map[string]*SolrNodeContents{
 		"foo-solrcloud-0.foo-solrcloud-headless.default:2000_solr": {
@@ -481,8 +497,8 @@ func TestFindSolrNodeContents(t *testing.T) {
 			live:                   true,
 		},
 	}
-	assert.Equal(t, len(nodeContents), len(nodeContents), "Number of Solr nodes with content information is incorrect.")
-	for node, foundNodeContents := range nodeContents {
+	assert.Equal(t, len(state.NodeContents), len(state.NodeContents), "Number of Solr nodes with content information is incorrect.")
+	for node, foundNodeContents := range state.NodeContents {
 		expectedContents, found := expectedNodeContents[node]
 		assert.Truef(t, found, "No nodeContents found for node %s", node)
 		assert.EqualValuesf(t, expectedContents, foundNodeContents, "NodeContents information from clusterstate is incorrect for node %s", node)
@@ -494,7 +510,7 @@ func TestFindSolrNodeContents(t *testing.T) {
 		"col2|shard1": 3,
 		"col2|shard2": 4,
 	}
-	assert.EqualValues(t, expectedTotalShardReplicas, totalShardReplicas, "Shards replica count is incorrect.")
+	assert.EqualValues(t, expectedTotalShardReplicas, state.TotalShardReplicas, "Shards replica count is incorrect.")
 
 	expectedShardReplicasNotActive := map[string]int{
 		"col1|shard1": 1,
@@ -502,7 +518,7 @@ func TestFindSolrNodeContents(t *testing.T) {
 		"col2|shard1": 2,
 		"col2|shard2": 2,
 	}
-	assert.EqualValues(t, expectedShardReplicasNotActive, shardReplicasNotActive, "Shards with replicas not active information is incorrect.")
+	assert.EqualValues(t, expectedShardReplicasNotActive, state.ShardReplicasNotActive, "Shards with replicas not active information is incorrect.")
 }
 
 func TestCalculateMaxPodsToUpgrade(t *testing.T) {
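
One detail worth noting in the test changes above: the cluster state is rebuilt with findSolrNodeContents before every pickPodsToUpdate call. The selection increments state.ShardReplicasNotActive in place as it accounts for the pods it picks, and Go maps are shared by reference even when the struct is passed by value, so reusing one state value would skew later selections. A minimal illustration of that hazard, with hypothetical names:

package main

import "fmt"

// Simplified stand-in for NodeReplicaState: a value copy of the struct still
// shares the same underlying map.
type state struct {
	shardReplicasNotActive map[string]int
}

// pick stands in for pickPodsToUpdate: it bumps the "not active" count for
// the shards on the pods it selects, so the same state value cannot be reused
// for an independent selection afterwards.
func pick(s state, shard string) {
	s.shardReplicasNotActive[shard]++
}

func main() {
	s := state{shardReplicasNotActive: map[string]int{"col1|shard1": 0}}
	pick(s, "col1|shard1")
	pick(s, "col1|shard1")
	// Prints 2, not 0: the mutation is visible across calls, which is why the
	// tests recompute the state before each pickPodsToUpdate invocation.
	fmt.Println(s.shardReplicasNotActive["col1|shard1"])
}
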
diff --git a/helm/solr-operator/Chart.yaml b/helm/solr-operator/Chart.yaml
index 2977fe9..c584361 100644
--- a/helm/solr-operator/Chart.yaml
+++ b/helm/solr-operator/Chart.yaml
@@ -102,6 +102,11 @@ annotations:
           url: https://github.com/apache/solr-operator/pull/596
         - name: Documentation
           url: https://apache.github.io/solr-operator/docs/solr-cloud/cluster-operations.html#avoiding-deadlocks
+    - kind: fixed
+      description: Fix a bug with Rolling Restarts with ephemeral data
+      links:
+        - name: Github PR
+          url: https://github.com/apache/solr-operator/pull/614
   artifacthub.io/images: |
     - name: solr-operator
       image: apache/solr-operator:v0.8.0-prerelease