You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@helix.apache.org by hz...@apache.org on 2021/09/01 17:01:08 UTC

[helix] branch master updated: Fix management mode history duplicate recording (#1846)

This is an automated email from the ASF dual-hosted git repository.

hzlu pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/helix.git


The following commit(s) were added to refs/heads/master by this push:
     new 0e20855  Fix management mode history duplicate recording (#1846)
0e20855 is described below

commit 0e20855148fe86aea3a4a3f7f764c18ee6a6e577
Author: Huizhi Lu <51...@users.noreply.github.com>
AuthorDate: Wed Sep 1 10:00:34 2021 -0700

    Fix management mode history duplicate recording (#1846)
    
    The management mode history has duplicate entries. It does not impact the normal function, but it's good to get it fixed to avoid confusion. This commit fixes the issue by adding a check for the status in metadata store and the calculated status.
---
 .../controller/stages/ManagementModeStage.java     | 39 +++++++---------
 .../controller/stages/TestManagementModeStage.java | 54 +++++++++++++++++-----
 .../controller/TestClusterFreezeMode.java          | 19 ++++++--
 3 files changed, 73 insertions(+), 39 deletions(-)

diff --git a/helix-core/src/main/java/org/apache/helix/controller/stages/ManagementModeStage.java b/helix-core/src/main/java/org/apache/helix/controller/stages/ManagementModeStage.java
index 3329d8c..4aa09d0 100644
--- a/helix-core/src/main/java/org/apache/helix/controller/stages/ManagementModeStage.java
+++ b/helix-core/src/main/java/org/apache/helix/controller/stages/ManagementModeStage.java
@@ -86,9 +86,7 @@ public class ManagementModeStage extends AbstractBaseStage {
         checkClusterFreezeStatus(cache.getEnabledLiveInstances(), cache.getLiveInstances(),
             cache.getAllInstancesMessages(), cache.getPauseSignal());
 
-    recordClusterStatus(managementMode, accessor);
-    recordManagementModeHistory(managementMode, cache.getPauseSignal(), manager.getInstanceName(),
-        accessor);
+    recordClusterStatus(managementMode, cache.getPauseSignal(), manager.getInstanceName(), accessor);
 
     event.addAttribute(AttributeName.CLUSTER_STATUS.name(), managementMode);
   }
@@ -135,36 +133,31 @@ public class ManagementModeStage extends AbstractBaseStage {
         .anyMatch(message -> PENDING_MESSAGE_TYPES.contains(message.getMsgType()));
   }
 
-  private void recordClusterStatus(ClusterManagementMode mode, HelixDataAccessor accessor) {
-    // update cluster status
+  private void recordClusterStatus(ClusterManagementMode mode, PauseSignal pauseSignal,
+      String controllerName, HelixDataAccessor accessor) {
+    // Read cluster status from metadata store
     PropertyKey statusPropertyKey = accessor.keyBuilder().clusterStatus();
     ClusterStatus clusterStatus = accessor.getProperty(statusPropertyKey);
     if (clusterStatus == null) {
       clusterStatus = new ClusterStatus();
     }
 
-    ClusterManagementMode.Type recordedType = clusterStatus.getManagementMode();
-    ClusterManagementMode.Status recordedStatus = clusterStatus.getManagementModeStatus();
-
-    // If there is any pending message sent by users, status could be computed as in progress.
-    // Skip recording status change to avoid confusion after cluster is already fully frozen.
-    if (ClusterManagementMode.Type.CLUSTER_FREEZE.equals(recordedType)
-        && ClusterManagementMode.Status.COMPLETED.equals(recordedStatus)
-        && ClusterManagementMode.Type.CLUSTER_FREEZE.equals(mode.getMode())
-        && ClusterManagementMode.Status.IN_PROGRESS.equals(mode.getStatus())) {
-      LOG.info("Skip recording status mode={}, status={}, because cluster is fully frozen",
-          mode.getMode(), mode.getStatus());
+    if (mode.getMode().equals(clusterStatus.getManagementMode())
+        && mode.getStatus().equals(clusterStatus.getManagementModeStatus())) {
+      // No need to update status and avoid duplicates when it's the same as metadata store
+      LOG.debug("Skip recording duplicate status mode={}, status={}", mode.getMode(),
+          mode.getStatus());
       return;
     }
 
-    if (!mode.getMode().equals(recordedType) || !mode.getStatus().equals(recordedStatus)) {
-      // Only update status when it's different with metadata store
-      clusterStatus.setManagementMode(mode.getMode());
-      clusterStatus.setManagementModeStatus(mode.getStatus());
-      if (!accessor.updateProperty(statusPropertyKey, clusterStatus)) {
-        LOG.error("Failed to update cluster status {}", clusterStatus);
-      }
+    // Update cluster status in metadata store
+    clusterStatus.setManagementMode(mode.getMode());
+    clusterStatus.setManagementModeStatus(mode.getStatus());
+    if (!accessor.updateProperty(statusPropertyKey, clusterStatus)) {
+      LOG.error("Failed to update cluster status {}", clusterStatus);
     }
+
+    recordManagementModeHistory(mode, pauseSignal, controllerName, accessor);
   }
 
   private void recordManagementModeHistory(ClusterManagementMode mode, PauseSignal pauseSignal,
diff --git a/helix-core/src/test/java/org/apache/helix/controller/stages/TestManagementModeStage.java b/helix-core/src/test/java/org/apache/helix/controller/stages/TestManagementModeStage.java
index 11a0c60..1c61624 100644
--- a/helix-core/src/test/java/org/apache/helix/controller/stages/TestManagementModeStage.java
+++ b/helix-core/src/test/java/org/apache/helix/controller/stages/TestManagementModeStage.java
@@ -35,6 +35,7 @@ import org.apache.helix.controller.pipeline.Pipeline;
 import org.apache.helix.manager.zk.ZKHelixDataAccessor;
 import org.apache.helix.manager.zk.ZkBaseDataAccessor;
 import org.apache.helix.model.ClusterStatus;
+import org.apache.helix.model.ControllerHistory;
 import org.apache.helix.model.LiveInstance;
 import org.testng.Assert;
 import org.testng.annotations.AfterClass;
@@ -93,15 +94,40 @@ public class TestManagementModeStage extends ZkTestBase {
     ClusterStatus clusterStatus = _accessor.getProperty(_accessor.keyBuilder().clusterStatus());
     Assert.assertEquals(clusterStatus.getManagementMode(), ClusterManagementMode.Type.CLUSTER_FREEZE);
 
-
-    // Mark a live instance to be pause state
-    LiveInstance liveInstance = liveInstances.get(0);
-    liveInstance.setStatus(LiveInstance.LiveInstanceStatus.FROZEN);
-    PropertyKey liveInstanceKey =
-        _accessor.keyBuilder().liveInstance(liveInstance.getInstanceName());
-    _accessor.updateProperty(liveInstanceKey, liveInstance);
+    ControllerHistory history =
+        _accessor.getProperty(_accessor.keyBuilder().controllerLeaderHistory());
+    Assert.assertNull(history);
+
+    // Mark both live instances to be frozen, then entering freeze mode is complete
+    for (int i = 0; i < 2; i++) {
+      LiveInstance liveInstance = liveInstances.get(i);
+      liveInstance.setStatus(LiveInstance.LiveInstanceStatus.FROZEN);
+      PropertyKey liveInstanceKey =
+          _accessor.keyBuilder().liveInstance(liveInstance.getInstanceName());
+      _accessor.updateProperty(liveInstanceKey, liveInstance);
+    }
     // Require cache refresh
     cache.notifyDataChange(HelixConstants.ChangeType.LIVE_INSTANCE);
+    runPipeline(event, dataRefresh, false);
+    managementModeStage.process(event);
+
+    // Freeze mode is complete
+    clusterStatus = _accessor.getProperty(_accessor.keyBuilder().clusterStatus());
+    Assert.assertEquals(clusterStatus.getManagementMode(), ClusterManagementMode.Type.CLUSTER_FREEZE);
+    Assert.assertEquals(clusterStatus.getManagementModeStatus(),
+        ClusterManagementMode.Status.COMPLETED);
+
+    // Management history is recorded
+    history = _accessor.getProperty(_accessor.keyBuilder().controllerLeaderHistory());
+    Assert.assertEquals(history.getManagementModeHistory().size(), 1);
+    String lastHistory = history.getManagementModeHistory().get(0);
+    Assert.assertTrue(lastHistory.contains("MODE=" + ClusterManagementMode.Type.CLUSTER_FREEZE));
+    Assert.assertTrue(lastHistory.contains("STATUS=" + ClusterManagementMode.Status.COMPLETED));
+
+    // No duplicate management mode history entries
+    managementModeStage.process(event);
+    history = _accessor.getProperty(_accessor.keyBuilder().controllerLeaderHistory());
+    Assert.assertEquals(history.getManagementModeHistory().size(), 1);
 
     // Unfreeze cluster
     request = ClusterManagementModeRequest.newBuilder()
@@ -119,11 +145,15 @@ public class TestManagementModeStage extends ZkTestBase {
     Assert.assertEquals(clusterStatus.getManagementModeStatus(),
         ClusterManagementMode.Status.IN_PROGRESS);
 
-    // remove froze status to mark the live instance to be normal status
-    liveInstance = _accessor.getProperty(liveInstanceKey);
-    liveInstance.getRecord().getSimpleFields()
-        .remove(LiveInstance.LiveInstanceProperty.STATUS.name());
-    _accessor.setProperty(liveInstanceKey, liveInstance);
+    // remove froze status to mark the live instances to be normal status
+    for (int i = 0; i < 2; i++) {
+      LiveInstance liveInstance = liveInstances.get(i);
+      PropertyKey liveInstanceKey =
+          _accessor.keyBuilder().liveInstance(liveInstance.getInstanceName());
+      liveInstance.getRecord().getSimpleFields()
+          .remove(LiveInstance.LiveInstanceProperty.STATUS.name());
+      _accessor.setProperty(liveInstanceKey, liveInstance);
+    }
     // Require cache refresh
     cache.notifyDataChange(HelixConstants.ChangeType.LIVE_INSTANCE);
     runPipeline(event, dataRefresh, false);
diff --git a/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterFreezeMode.java b/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterFreezeMode.java
index e068f38..6910263 100644
--- a/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterFreezeMode.java
+++ b/helix-core/src/test/java/org/apache/helix/integration/controller/TestClusterFreezeMode.java
@@ -172,6 +172,12 @@ public class TestClusterFreezeMode extends ZkTestBase {
     expectedClusterStatus.setManagementModeStatus(ClusterManagementMode.Status.IN_PROGRESS);
     verifyClusterStatus(expectedClusterStatus);
 
+    // Verify management mode history is empty
+    ControllerHistory controllerHistory =
+        _accessor.getProperty(_accessor.keyBuilder().controllerLeaderHistory());
+    List<String> managementHistory = controllerHistory.getManagementModeHistory();
+    Assert.assertTrue(managementHistory.isEmpty());
+
     // Unblock to finish state transition and delete the ST message
     latch.countDown();
 
@@ -185,12 +191,17 @@ public class TestClusterFreezeMode extends ZkTestBase {
 
     // Verify management mode history
     Assert.assertTrue(TestHelper.verify(() -> {
-      ControllerHistory history = _accessor.getProperty(keyBuilder.controllerLeaderHistory());
-      List<String> managementHistory = history.getManagementModeHistory();
-      if (managementHistory == null || managementHistory.isEmpty()) {
+      ControllerHistory tmpControllerHistory =
+          _accessor.getProperty(keyBuilder.controllerLeaderHistory());
+      List<String> tmpManagementHistory = tmpControllerHistory.getManagementModeHistory();
+      if (tmpManagementHistory == null || tmpManagementHistory.isEmpty()) {
         return false;
       }
-      String lastHistory = managementHistory.get(managementHistory.size() - 1);
+      // Should not have duplicate entries
+      if (tmpManagementHistory.size() > 1) {
+        return false;
+      }
+      String lastHistory = tmpManagementHistory.get(0);
       return lastHistory.contains("MODE=" + ClusterManagementMode.Type.CLUSTER_FREEZE)
           && lastHistory.contains("STATUS=" + ClusterManagementMode.Status.COMPLETED)
           && lastHistory.contains("REASON=" + methodName);