You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@helix.apache.org by jx...@apache.org on 2018/03/26 22:49:06 UTC
helix git commit: [HELIX-683] clean monitoring cache upon helix
controller enable monitoring
Repository: helix
Updated Branches:
refs/heads/master 149831c8f -> 24f2610f6
[HELIX-683] clean monitoring cache upon helix controller enable monitoring
Project: http://git-wip-us.apache.org/repos/asf/helix/repo
Commit: http://git-wip-us.apache.org/repos/asf/helix/commit/24f2610f
Tree: http://git-wip-us.apache.org/repos/asf/helix/tree/24f2610f
Diff: http://git-wip-us.apache.org/repos/asf/helix/diff/24f2610f
Branch: refs/heads/master
Commit: 24f2610f69714e68e31dcb2581d790c699ec05a1
Parents: 149831c
Author: Harry Zhang <zh...@usc.edu>
Authored: Mon Mar 26 12:14:07 2018 -0700
Committer: Harry Zhang <zh...@usc.edu>
Committed: Mon Mar 26 15:48:28 2018 -0700
----------------------------------------------------------------------
.../controller/GenericHelixController.java | 4 +
.../controller/stages/ClusterDataCache.java | 4 +
.../TestControllerLeadershipChange.java | 143 +++++++++++++++++++
3 files changed, 151 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/helix/blob/24f2610f/helix-core/src/main/java/org/apache/helix/controller/GenericHelixController.java
----------------------------------------------------------------------
diff --git a/helix-core/src/main/java/org/apache/helix/controller/GenericHelixController.java b/helix-core/src/main/java/org/apache/helix/controller/GenericHelixController.java
index fd14e65..971f482 100644
--- a/helix-core/src/main/java/org/apache/helix/controller/GenericHelixController.java
+++ b/helix-core/src/main/java/org/apache/helix/controller/GenericHelixController.java
@@ -792,6 +792,10 @@ public class GenericHelixController implements IdealStateChangeListener,
// monitoring state changed
if (enable) {
logger.info("Enable clusterStatusMonitor for cluster " + _clusterName);
+ // Clear old cached monitoring related data to avoid reporting stats across different
+ // leadership periods
+ _cache.clearMonitoringRecords();
+ _taskCache.clearMonitoringRecords();
_clusterStatusMonitor.active();
} else {
logger.info("Disable clusterStatusMonitor for cluster " + _clusterName);
http://git-wip-us.apache.org/repos/asf/helix/blob/24f2610f/helix-core/src/main/java/org/apache/helix/controller/stages/ClusterDataCache.java
----------------------------------------------------------------------
diff --git a/helix-core/src/main/java/org/apache/helix/controller/stages/ClusterDataCache.java b/helix-core/src/main/java/org/apache/helix/controller/stages/ClusterDataCache.java
index 3056531..3582480 100644
--- a/helix-core/src/main/java/org/apache/helix/controller/stages/ClusterDataCache.java
+++ b/helix-core/src/main/java/org/apache/helix/controller/stages/ClusterDataCache.java
@@ -826,6 +826,10 @@ public class ClusterDataCache {
return _isMaintenanceModeEnabled;
}
+ public void clearMonitoringRecords() {
+ _missingTopStateMap.clear();
+ }
+
/**
* toString method to print the entire cluster state
*/
http://git-wip-us.apache.org/repos/asf/helix/blob/24f2610f/helix-core/src/test/java/org/apache/helix/integration/controller/TestControllerLeadershipChange.java
----------------------------------------------------------------------
diff --git a/helix-core/src/test/java/org/apache/helix/integration/controller/TestControllerLeadershipChange.java b/helix-core/src/test/java/org/apache/helix/integration/controller/TestControllerLeadershipChange.java
new file mode 100644
index 0000000..c44c924
--- /dev/null
+++ b/helix-core/src/test/java/org/apache/helix/integration/controller/TestControllerLeadershipChange.java
@@ -0,0 +1,143 @@
+package org.apache.helix.integration.controller;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+import java.lang.management.ManagementFactory;
+import javax.management.MBeanServer;
+import javax.management.ObjectName;
+import org.apache.helix.AccessOption;
+import org.apache.helix.HelixDataAccessor;
+import org.apache.helix.HelixManager;
+import org.apache.helix.HelixManagerFactory;
+import org.apache.helix.InstanceType;
+import org.apache.helix.PropertyPathBuilder;
+import org.apache.helix.integration.common.ZkIntegrationTestBase;
+import org.apache.helix.integration.manager.MockParticipantManager;
+import org.apache.helix.model.IdealState;
+import org.apache.helix.model.LiveInstance;
+import org.apache.helix.monitoring.mbeans.MonitorDomainNames;
+import org.testng.Assert;
+import org.testng.annotations.Test;
+
+public class TestControllerLeadershipChange extends ZkIntegrationTestBase{
+
+ @Test
+ public void testMissingTopStateDurationMonitoring() throws Exception {
+ String clusterName = "testCluster-TestControllerLeadershipChange";
+ String instanceName = clusterName + "-participant";
+ String resourceName = "testResource";
+ int numPartition = 1;
+ int numReplica = 1;
+ String stateModel = "LeaderStandby";
+ ObjectName resourceMBeanObjectName = getResourceMonitorObjectName(clusterName, resourceName);
+ MBeanServer beanServer = ManagementFactory.getPlatformMBeanServer();
+
+ // Create cluster
+ _gSetupTool.addCluster(clusterName, true);
+
+ // Create participant
+ _gSetupTool.addInstanceToCluster(clusterName, instanceName);
+ MockParticipantManager participant =
+ new MockParticipantManager(ZK_ADDR, clusterName, instanceName);
+ participant.syncStart();
+
+ // Create controller, since this is the only controller, it will be the leader
+ HelixManager manager1 = HelixManagerFactory
+ .getZKHelixManager(clusterName, clusterName + "-manager1", InstanceType.CONTROLLER,
+ ZK_ADDR);
+ manager1.connect();
+ Assert.assertTrue(manager1.isLeader());
+
+ // Create resource
+ _gSetupTool.addResourceToCluster(clusterName, resourceName, numPartition, stateModel,
+ IdealState.RebalanceMode.SEMI_AUTO.name());
+
+ // Rebalance Resource
+ _gSetupTool
+ .rebalanceResource(clusterName, resourceName, numReplica);
+ // Wait for rebalance
+ Thread.sleep(2000);
+
+ // Trigger missing top state in manager1
+ participant.syncStop();
+
+ Thread.sleep(2000);
+
+ // Starting manager2
+ HelixManager manager2 = HelixManagerFactory
+ .getZKHelixManager(clusterName, clusterName + "-manager2", InstanceType.CONTROLLER,
+ ZK_ADDR);
+ manager2.connect();
+ Assert.assertFalse(manager2.isLeader());
+
+ // Set leader to manager2
+ setLeader(manager2);
+
+ Assert.assertFalse(manager1.isLeader());
+ Assert.assertTrue(manager2.isLeader());
+
+ // Make resource top state to come back
+ participant = new MockParticipantManager(ZK_ADDR, clusterName, instanceName);
+ participant.syncStart();
+
+ // Wait for rebalance
+ Thread.sleep(2000);
+ setLeader(manager1);
+
+ Assert.assertTrue(manager1.isLeader());
+ Assert.assertFalse(manager2.isLeader());
+
+ _gSetupTool.rebalanceResource(clusterName, resourceName, numReplica);
+
+ // Wait for manager1 to update
+ Thread.sleep(2000);
+
+ // The resource lost its top state and manager1 lost leadership for 4000ms. Because manager1
+ // cleans its monitoring cache after re-gaining leadership, the max value of the hand off
+ // duration should not be such a large value
+ Assert.assertTrue((long) beanServer
+ .getAttribute(resourceMBeanObjectName, "PartitionTopStateHandoffDurationGauge.Max") < 500);
+ }
+
+ private void setLeader(HelixManager manager) {
+ System.out.println("Setting controller " + manager.getInstanceName() + " as leader");
+ HelixDataAccessor accessor = manager.getHelixDataAccessor();
+ final LiveInstance leader = new LiveInstance(manager.getInstanceName());
+ leader.setLiveInstance(ManagementFactory.getRuntimeMXBean().getName());
+ leader.setSessionId(manager.getSessionId());
+ leader.setHelixVersion(manager.getVersion());
+
+ // Delete the current controller leader node so it will trigger leader election
+ accessor.getBaseDataAccessor().remove(PropertyPathBuilder.controllerLeader(manager.getClusterName()), AccessOption.EPHEMERAL);
+
+ // No matter who gets leadership, force the given manager to become leader
+ // Note there is theoretically a race condition that GenericHelixController.onControllerChange()
+ // will not catch this new value when it's double checking leadership, but it's stable enough
+ accessor.getBaseDataAccessor().set(PropertyPathBuilder.controllerLeader(manager.getClusterName()), leader.getRecord(), AccessOption.EPHEMERAL);
+ }
+
+ private ObjectName getResourceMonitorObjectName(String clusterName, String resourceName)
+ throws Exception {
+ return new ObjectName(String
+ .format("%s:cluster=%s,resourceName=%s", MonitorDomainNames.ClusterStatus.name(),
+ clusterName, resourceName));
+ }
+
+}