You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@helix.apache.org by jx...@apache.org on 2018/03/26 22:49:06 UTC

helix git commit: [HELIX-683] clean monitoring cache upon helix controller enable monitoring

Repository: helix
Updated Branches:
  refs/heads/master 149831c8f -> 24f2610f6


[HELIX-683] clean monitoring cache upon helix controller enable monitoring


Project: http://git-wip-us.apache.org/repos/asf/helix/repo
Commit: http://git-wip-us.apache.org/repos/asf/helix/commit/24f2610f
Tree: http://git-wip-us.apache.org/repos/asf/helix/tree/24f2610f
Diff: http://git-wip-us.apache.org/repos/asf/helix/diff/24f2610f

Branch: refs/heads/master
Commit: 24f2610f69714e68e31dcb2581d790c699ec05a1
Parents: 149831c
Author: Harry Zhang <zh...@usc.edu>
Authored: Mon Mar 26 12:14:07 2018 -0700
Committer: Harry Zhang <zh...@usc.edu>
Committed: Mon Mar 26 15:48:28 2018 -0700

----------------------------------------------------------------------
 .../controller/GenericHelixController.java      |   4 +
 .../controller/stages/ClusterDataCache.java     |   4 +
 .../TestControllerLeadershipChange.java         | 143 +++++++++++++++++++
 3 files changed, 151 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/helix/blob/24f2610f/helix-core/src/main/java/org/apache/helix/controller/GenericHelixController.java
----------------------------------------------------------------------
diff --git a/helix-core/src/main/java/org/apache/helix/controller/GenericHelixController.java b/helix-core/src/main/java/org/apache/helix/controller/GenericHelixController.java
index fd14e65..971f482 100644
--- a/helix-core/src/main/java/org/apache/helix/controller/GenericHelixController.java
+++ b/helix-core/src/main/java/org/apache/helix/controller/GenericHelixController.java
@@ -792,6 +792,10 @@ public class GenericHelixController implements IdealStateChangeListener,
         // monitoring state changed
         if (enable) {
           logger.info("Enable clusterStatusMonitor for cluster " + _clusterName);
+          // Clear old cached monitoring-related data to avoid reporting stats across different
+          // leadership periods
+          _cache.clearMonitoringRecords();
+          _taskCache.clearMonitoringRecords();
           _clusterStatusMonitor.active();
         } else {
           logger.info("Disable clusterStatusMonitor for cluster " + _clusterName);

http://git-wip-us.apache.org/repos/asf/helix/blob/24f2610f/helix-core/src/main/java/org/apache/helix/controller/stages/ClusterDataCache.java
----------------------------------------------------------------------
diff --git a/helix-core/src/main/java/org/apache/helix/controller/stages/ClusterDataCache.java b/helix-core/src/main/java/org/apache/helix/controller/stages/ClusterDataCache.java
index 3056531..3582480 100644
--- a/helix-core/src/main/java/org/apache/helix/controller/stages/ClusterDataCache.java
+++ b/helix-core/src/main/java/org/apache/helix/controller/stages/ClusterDataCache.java
@@ -826,6 +826,10 @@ public class ClusterDataCache {
     return _isMaintenanceModeEnabled;
   }
 
+  public void clearMonitoringRecords() {
+    _missingTopStateMap.clear();
+  }
+
   /**
    * toString method to print the entire cluster state
    */

http://git-wip-us.apache.org/repos/asf/helix/blob/24f2610f/helix-core/src/test/java/org/apache/helix/integration/controller/TestControllerLeadershipChange.java
----------------------------------------------------------------------
diff --git a/helix-core/src/test/java/org/apache/helix/integration/controller/TestControllerLeadershipChange.java b/helix-core/src/test/java/org/apache/helix/integration/controller/TestControllerLeadershipChange.java
new file mode 100644
index 0000000..c44c924
--- /dev/null
+++ b/helix-core/src/test/java/org/apache/helix/integration/controller/TestControllerLeadershipChange.java
@@ -0,0 +1,143 @@
+package org.apache.helix.integration.controller;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import java.lang.management.ManagementFactory;
+import javax.management.MBeanServer;
+import javax.management.ObjectName;
+import org.apache.helix.AccessOption;
+import org.apache.helix.HelixDataAccessor;
+import org.apache.helix.HelixManager;
+import org.apache.helix.HelixManagerFactory;
+import org.apache.helix.InstanceType;
+import org.apache.helix.PropertyPathBuilder;
+import org.apache.helix.integration.common.ZkIntegrationTestBase;
+import org.apache.helix.integration.manager.MockParticipantManager;
+import org.apache.helix.model.IdealState;
+import org.apache.helix.model.LiveInstance;
+import org.apache.helix.monitoring.mbeans.MonitorDomainNames;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+public class TestControllerLeadershipChange extends ZkIntegrationTestBase{
+
+  @Test
+  public void testMissingTopStateDurationMonitoring() throws Exception {
+    String clusterName = "testCluster-TestControllerLeadershipChange";
+    String instanceName = clusterName + "-participant";
+    String resourceName = "testResource";
+    int numPartition = 1;
+    int numReplica = 1;
+    String stateModel = "LeaderStandby";
+    ObjectName resourceMBeanObjectName = getResourceMonitorObjectName(clusterName, resourceName);
+    MBeanServer beanServer = ManagementFactory.getPlatformMBeanServer();
+
+    // Create cluster
+    _gSetupTool.addCluster(clusterName, true);
+
+    // Create participant
+    _gSetupTool.addInstanceToCluster(clusterName, instanceName);
+    MockParticipantManager participant =
+        new MockParticipantManager(ZK_ADDR, clusterName, instanceName);
+    participant.syncStart();
+
+    // Create controller; since this is the only controller, it will be the leader
+    HelixManager manager1 = HelixManagerFactory
+        .getZKHelixManager(clusterName, clusterName + "-manager1", InstanceType.CONTROLLER,
+            ZK_ADDR);
+    manager1.connect();
+    Assert.assertTrue(manager1.isLeader());
+
+    // Create resource
+    _gSetupTool.addResourceToCluster(clusterName, resourceName, numPartition, stateModel,
+        IdealState.RebalanceMode.SEMI_AUTO.name());
+
+    // Rebalance Resource
+    _gSetupTool
+        .rebalanceResource(clusterName, resourceName, numReplica);
+    // Wait for rebalance
+    Thread.sleep(2000);
+
+    // Trigger missing top state in manager1
+    participant.syncStop();
+
+    Thread.sleep(2000);
+
+    // Starting manager2
+    HelixManager manager2 = HelixManagerFactory
+        .getZKHelixManager(clusterName, clusterName + "-manager2", InstanceType.CONTROLLER,
+            ZK_ADDR);
+    manager2.connect();
+    Assert.assertFalse(manager2.isLeader());
+
+    // Set leader to manager2
+    setLeader(manager2);
+
+    Assert.assertFalse(manager1.isLeader());
+    Assert.assertTrue(manager2.isLeader());
+
+    // Make the resource's top state come back
+    participant = new MockParticipantManager(ZK_ADDR, clusterName, instanceName);
+    participant.syncStart();
+
+    // Wait for rebalance
+    Thread.sleep(2000);
+    setLeader(manager1);
+
+    Assert.assertTrue(manager1.isLeader());
+    Assert.assertFalse(manager2.isLeader());
+
+    _gSetupTool.rebalanceResource(clusterName, resourceName, numReplica);
+
+    // Wait for manager1 to update
+    Thread.sleep(2000);
+
+    // The resource lost its top state and manager1 lost leadership for 4000ms. Because manager1
+    // cleans its monitoring cache after regaining leadership, the max value of the hand-off
+    // duration should not be that large
+    Assert.assertTrue((long) beanServer
+        .getAttribute(resourceMBeanObjectName, "PartitionTopStateHandoffDurationGauge.Max") < 500);
+  }
+
+  private void setLeader(HelixManager manager) {
+    System.out.println("Setting controller " + manager.getInstanceName() + " as leader");
+    HelixDataAccessor accessor = manager.getHelixDataAccessor();
+    final LiveInstance leader = new LiveInstance(manager.getInstanceName());
+    leader.setLiveInstance(ManagementFactory.getRuntimeMXBean().getName());
+    leader.setSessionId(manager.getSessionId());
+    leader.setHelixVersion(manager.getVersion());
+
+    // Delete the current controller leader node so it will trigger leader election
+    accessor.getBaseDataAccessor().remove(PropertyPathBuilder.controllerLeader(manager.getClusterName()), AccessOption.EPHEMERAL);
+
+    // No matter who gets leadership, force the given manager to become leader.
+    // Note: in theory there is a race condition where GenericHelixController.onControllerChange()
+    // will not catch this new value when it double-checks leadership, but it's stable enough
+    accessor.getBaseDataAccessor().set(PropertyPathBuilder.controllerLeader(manager.getClusterName()), leader.getRecord(), AccessOption.EPHEMERAL);
+  }
+
+  private ObjectName getResourceMonitorObjectName(String clusterName, String resourceName)
+      throws Exception {
+    return new ObjectName(String
+        .format("%s:cluster=%s,resourceName=%s", MonitorDomainNames.ClusterStatus.name(),
+            clusterName, resourceName));
+  }
+
+}