You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by te...@apache.org on 2018/08/02 11:59:57 UTC
hbase git commit: HBASE-19036 Add action in Chaos Monkey to restart
Active Namenode
Repository: hbase
Updated Branches:
refs/heads/master 78164efcf -> b3e41c952
HBASE-19036 Add action in Chaos Monkey to restart Active Namenode
Signed-off-by: tedyu <yu...@gmail.com>
Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/b3e41c95
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/b3e41c95
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/b3e41c95
Branch: refs/heads/master
Commit: b3e41c9525f0f8537b87bb7bf923cf74c31ee585
Parents: 78164ef
Author: Monani Mihir <mo...@gmail.com>
Authored: Tue Jul 31 18:44:45 2018 +0530
Committer: tedyu <yu...@gmail.com>
Committed: Thu Aug 2 04:59:51 2018 -0700
----------------------------------------------------------------------
.../hadoop/hbase/DistributedHBaseCluster.java | 33 ++++++-
.../hadoop/hbase/HBaseClusterManager.java | 2 +
.../hadoop/hbase/chaos/actions/Action.java | 28 ++++++
.../chaos/actions/RestartActionBaseAction.java | 12 +++
.../actions/RestartActiveNameNodeAction.java | 90 ++++++++++++++++++++
.../org/apache/hadoop/hbase/HBaseCluster.java | 37 ++++++++
.../apache/hadoop/hbase/MiniHBaseCluster.java | 26 ++++++
7 files changed, 227 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hbase/blob/b3e41c95/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java
----------------------------------------------------------------------
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java
index 943f2a6..5ec9e25 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java
@@ -25,6 +25,7 @@ import java.util.List;
import java.util.Objects;
import java.util.Set;
import java.util.TreeSet;
+
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ClusterManager.ServiceType;
import org.apache.hadoop.hbase.client.Admin;
@@ -35,7 +36,6 @@ import org.apache.hadoop.hbase.client.RegionLocator;
import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.Threads;
import org.apache.yetus.audience.InterfaceAudience;
-
import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos;
import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.ServerInfo;
@@ -204,6 +204,37 @@ public class DistributedHBaseCluster extends HBaseCluster {
waitForServiceToStop(ServiceType.HADOOP_DATANODE, serverName, timeout);
}
+  /**
+   * Logs the request and asks the cluster manager to start the HADOOP_NAMENODE service using
+   * the hostname and port carried by {@code serverName}.
+   * @param serverName identifies the host and port of the namenode to start
+   * @throws IOException propagated from the cluster manager
+   */
+  @Override
+  public void startNameNode(ServerName serverName) throws IOException {
+    LOG.info("Starting name node on: " + serverName.getServerName());
+    final String host = serverName.getHostname();
+    final int port = serverName.getPort();
+    clusterManager.start(ServiceType.HADOOP_NAMENODE, host, port);
+  }
+
+  /**
+   * Logs the request and asks the cluster manager to kill the HADOOP_NAMENODE service using
+   * the hostname and port carried by {@code serverName}.
+   * @param serverName identifies the host and port of the namenode to kill
+   * @throws IOException propagated from the cluster manager
+   */
+  @Override
+  public void killNameNode(ServerName serverName) throws IOException {
+    LOG.info("Aborting name node on: " + serverName.getServerName());
+    final String host = serverName.getHostname();
+    final int port = serverName.getPort();
+    clusterManager.kill(ServiceType.HADOOP_NAMENODE, host, port);
+  }
+
+  /**
+   * Logs the request and asks the cluster manager to stop the HADOOP_NAMENODE service using
+   * the hostname and port carried by {@code serverName}.
+   * @param serverName identifies the host and port of the namenode to stop
+   * @throws IOException propagated from the cluster manager
+   */
+  @Override
+  public void stopNameNode(ServerName serverName) throws IOException {
+    LOG.info("Stopping name node on: " + serverName.getServerName());
+    final String host = serverName.getHostname();
+    final int port = serverName.getPort();
+    clusterManager.stop(ServiceType.HADOOP_NAMENODE, host, port);
+  }
+
+  /**
+   * Delegates to {@code waitForServiceToStart} for the HADOOP_NAMENODE service.
+   * @param serverName the server whose namenode is being waited on
+   * @param timeout forwarded unchanged to {@code waitForServiceToStart}
+   * @throws IOException propagated from {@code waitForServiceToStart}
+   */
+  @Override
+  public void waitForNameNodeToStart(ServerName serverName, long timeout) throws IOException {
+    final ServiceType nameNodeService = ServiceType.HADOOP_NAMENODE;
+    waitForServiceToStart(nameNodeService, serverName, timeout);
+  }
+
+  /**
+   * Delegates to {@code waitForServiceToStop} for the HADOOP_NAMENODE service.
+   * @param serverName the server whose namenode is being waited on
+   * @param timeout forwarded unchanged to {@code waitForServiceToStop}
+   * @throws IOException propagated from {@code waitForServiceToStop}
+   */
+  @Override
+  public void waitForNameNodeToStop(ServerName serverName, long timeout) throws IOException {
+    final ServiceType nameNodeService = ServiceType.HADOOP_NAMENODE;
+    waitForServiceToStop(nameNodeService, serverName, timeout);
+  }
+
private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout)
throws IOException {
LOG.info("Waiting for service: " + service + " to stop: " + serverName.getServerName());
http://git-wip-us.apache.org/repos/asf/hbase/blob/b3e41c95/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java
----------------------------------------------------------------------
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java
index 884ddad..f7c2fc6 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java
@@ -101,6 +101,7 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
Configuration conf = getConf();
switch (service) {
case HADOOP_DATANODE:
+ case HADOOP_NAMENODE:
return conf.get("hbase.it.clustermanager.hadoop.hdfs.user", "hdfs");
case ZOOKEEPER_SERVER:
return conf.get("hbase.it.clustermanager.zookeeper.user", "zookeeper");
@@ -282,6 +283,7 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
protected CommandProvider getCommandProvider(ServiceType service) throws IOException {
switch (service) {
case HADOOP_DATANODE:
+ case HADOOP_NAMENODE:
return new HadoopShellCommandProvider(getConf());
case ZOOKEEPER_SERVER:
return new ZookeeperShellCommandProvider(getConf());
http://git-wip-us.apache.org/repos/asf/hbase/blob/b3e41c95/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
----------------------------------------------------------------------
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
index 2b2c1b8..350e18a 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
@@ -26,6 +26,7 @@ import java.util.List;
import java.util.Map;
import java.util.function.BiConsumer;
import java.util.function.Consumer;
+
import org.apache.commons.lang3.RandomUtils;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.ClusterMetrics;
@@ -65,6 +66,10 @@ public class Action {
"hbase.chaosmonkey.action.killdatanodetimeout";
public static final String START_DATANODE_TIMEOUT_KEY =
"hbase.chaosmonkey.action.startdatanodetimeout";
+ public static final String KILL_NAMENODE_TIMEOUT_KEY =
+ "hbase.chaosmonkey.action.killnamenodetimeout";
+ public static final String START_NAMENODE_TIMEOUT_KEY =
+ "hbase.chaosmonkey.action.startnamenodetimeout";
protected static final Logger LOG = LoggerFactory.getLogger(Action.class);
@@ -76,6 +81,8 @@ public class Action {
protected static final long START_ZK_NODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
protected static final long KILL_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
protected static final long START_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
+ protected static final long KILL_NAMENODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
+ protected static final long START_NAMENODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
protected ActionContext context;
protected HBaseCluster cluster;
@@ -90,6 +97,8 @@ public class Action {
protected long startZkNodeTimeout;
protected long killDataNodeTimeout;
protected long startDataNodeTimeout;
+ protected long killNameNodeTimeout;
+ protected long startNameNodeTimeout;
public void init(ActionContext context) throws IOException {
this.context = context;
@@ -112,6 +121,11 @@ public class Action {
KILL_DATANODE_TIMEOUT_DEFAULT);
startDataNodeTimeout = cluster.getConf().getLong(START_DATANODE_TIMEOUT_KEY,
START_DATANODE_TIMEOUT_DEFAULT);
+ killNameNodeTimeout =
+ cluster.getConf().getLong(KILL_NAMENODE_TIMEOUT_KEY, KILL_NAMENODE_TIMEOUT_DEFAULT);
+ startNameNodeTimeout =
+ cluster.getConf().getLong(START_NAMENODE_TIMEOUT_KEY, START_NAMENODE_TIMEOUT_DEFAULT);
+
}
public void perform() throws Exception { }
@@ -197,6 +211,20 @@ public class Action {
LOG.info("Started datanode " + server);
}
+  /**
+   * Kills the namenode on the given server, waits (bounded by {@code killNameNodeTimeout}) for
+   * it to stop, then logs the number of live servers the cluster reports.
+   * @param server the server hosting the namenode to kill
+   * @throws IOException propagated from the cluster calls
+   */
+  protected void killNameNode(ServerName server) throws IOException {
+    LOG.info("Killing namenode :-" + server.getHostname());
+    cluster.killNameNode(server);
+    cluster.waitForNameNodeToStop(server, killNameNodeTimeout);
+    final int liveServers = cluster.getClusterMetrics().getLiveServerMetrics().size();
+    LOG.info("Killed namenode:" + server + ". Reported num of rs:" + liveServers);
+  }
+
+  /**
+   * Starts the namenode on the given server and waits (bounded by
+   * {@code startNameNodeTimeout}) for it to come up.
+   * @param server the server hosting the namenode to start
+   * @throws IOException propagated from the cluster calls
+   */
+  protected void startNameNode(ServerName server) throws IOException {
+    final String host = server.getHostname();
+    LOG.info("Starting Namenode :-" + host);
+    cluster.startNameNode(server);
+    cluster.waitForNameNodeToStart(server, startNameNodeTimeout);
+    LOG.info("Started namenode:" + server);
+  }
protected void unbalanceRegions(ClusterMetrics clusterStatus,
List<ServerName> fromServers, List<ServerName> toServers,
double fractionOfRegions) throws Exception {
http://git-wip-us.apache.org/repos/asf/hbase/blob/b3e41c95/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java
----------------------------------------------------------------------
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java
index 63286cb..6e589ae 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java
@@ -82,4 +82,16 @@ public class RestartActionBaseAction extends Action {
sleep(sleepTime);
startDataNode(server);
}
+
+  /**
+   * Kills the namenode on {@code server}, sleeps, then starts it again. The sleep is the larger
+   * of {@code sleepTime} and 1000. Returns without doing anything when the action context
+   * reports that it is stopping.
+   * @param server the server hosting the namenode to restart
+   * @param sleepTime requested pause between kill and start; values below 1000 are raised to it
+   * @throws IOException propagated from the kill/start helpers
+   */
+  void restartNameNode(ServerName server, long sleepTime) throws IOException {
+    // Don't try the kill if we're stopping
+    if (context.isStopping()) {
+      return;
+    }
+    final long pause = Math.max(sleepTime, 1000);
+    killNameNode(server);
+    sleep(pause);
+    startNameNode(server);
+  }
+
}
http://git-wip-us.apache.org/repos/asf/hbase/blob/b3e41c95/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActiveNameNodeAction.java
----------------------------------------------------------------------
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActiveNameNodeAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActiveNameNodeAction.java
new file mode 100644
index 0000000..645743a
--- /dev/null
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActiveNameNodeAction.java
@@ -0,0 +1,90 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.chaos.actions;
+
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.util.FSUtils;
+import org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper;
+import org.apache.hadoop.hbase.zookeeper.ZKUtil;
+import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
+import org.apache.hadoop.hbase.zookeeper.ZNodePaths;
+import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.HAUtil;
+import org.apache.hadoop.hdfs.server.namenode.ha.proto.HAZKInfoProtos.ActiveNodeInfo;
+
+/**
+ * Action that tries to restart the active namenode.
+ */
+public class RestartActiveNameNodeAction extends RestartActionBaseAction {
+
+  // Value taken from org.apache.hadoop.ha.ActiveStandbyElector.java, variable :- LOCK_FILENAME
+  private static final String ACTIVE_NN_LOCK_NAME = "ActiveStandbyElectorLock";
+
+  // Value taken from org.apache.hadoop.ha.ZKFailoverController.java
+  // variable :- ZK_PARENT_ZNODE_DEFAULT and ZK_PARENT_ZNODE_KEY
+  private static final String ZK_PARENT_ZNODE_DEFAULT = "/hadoop-ha";
+  private static final String ZK_PARENT_ZNODE_KEY = "ha.zookeeper.parent-znode";
+
+  /**
+   * @param sleepTime handed to the base action; {@link #perform()} passes it on as the pause
+   *   between killing and starting the namenode
+   */
+  public RestartActiveNameNodeAction(long sleepTime) {
+    super(sleepTime);
+  }
+
+  /**
+   * Looks up the active namenode's hostname in the HA lock znode
+   * ({@code <parent-znode>/<nameservice>/ActiveStandbyElectorLock}) and restarts that namenode.
+   * @throws Exception if namenode HA is not enabled, if no active namenode is registered under
+   *   the HA znode (including when that znode does not exist), or if the ZooKeeper lookup or
+   *   the restart itself fails
+   */
+  @Override
+  public void perform() throws Exception {
+    LOG.info("Performing action: Restart active namenode");
+    Configuration conf = FSUtils.getRootDir(getConf()).getFileSystem(getConf()).getConf();
+    String nameServiceID = DFSUtil.getNamenodeNameServiceId(conf);
+    if (!HAUtil.isHAEnabled(conf, nameServiceID)) {
+      throw new Exception("HA for namenode is not enabled");
+    }
+    String activeNamenode = null;
+    String hadoopHAZkNode = conf.get(ZK_PARENT_ZNODE_KEY, ZK_PARENT_ZNODE_DEFAULT);
+    String hadoopHAZkNodePath = ZNodePaths.joinZNode(hadoopHAZkNode, nameServiceID);
+    // try-with-resources closes the watcher on every exit path.
+    try (ZKWatcher zkw = new ZKWatcher(conf, "get-active-namenode", null)) {
+      // listChildrenNoWatch returns null when the parent znode does not exist; treat that the
+      // same as "no lock present" rather than failing with a NullPointerException.
+      List<String> children = ZKUtil.listChildrenNoWatch(zkw, hadoopHAZkNodePath);
+      // Match the exact child name, since that is the exact path read below.
+      if (children != null && children.contains(ACTIVE_NN_LOCK_NAME)) {
+        RecoverableZooKeeper rzk = zkw.getRecoverableZooKeeper();
+        String lockPath = ZNodePaths.joinZNode(hadoopHAZkNodePath, ACTIVE_NN_LOCK_NAME);
+        byte[] data = rzk.getData(lockPath, false, null);
+        activeNamenode = ActiveNodeInfo.parseFrom(data).getHostname();
+      }
+    }
+    if (activeNamenode == null) {
+      // Report the full path that was searched, not just the configured parent znode.
+      throw new Exception("No active Name node found in zookeeper under " + hadoopHAZkNodePath);
+    }
+    LOG.info("Found active namenode host:" + activeNamenode);
+    ServerName activeNNHost = ServerName.valueOf(activeNamenode, -1, -1);
+    LOG.info("Restarting Active NameNode :" + activeNamenode);
+    restartNameNode(activeNNHost, sleepTime);
+  }
+}
http://git-wip-us.apache.org/repos/asf/hbase/blob/b3e41c95/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java
index 59a0059..a1b474d 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java
@@ -19,6 +19,7 @@ package org.apache.hadoop.hbase;
import java.io.Closeable;
import java.io.IOException;
+
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.util.Threads;
@@ -248,6 +249,42 @@ public abstract class HBaseCluster implements Closeable, Configurable {
throws IOException;
/**
+ * Starts a namenode on the host identified by {@code serverName}. The mini cluster
+ * implementation does not support this and only logs a warning.
+ * @param serverName the server on which the namenode should be started
+ * @throws IOException if something goes wrong
+ */
+ public abstract void startNameNode(ServerName serverName) throws IOException;
+
+ /**
+ * Kills the namenode process on {@code serverName} if this is a distributed cluster. The mini
+ * cluster implementation does not support this and only logs a warning.
+ * @param serverName the server whose namenode should be killed
+ * @throws IOException if something goes wrong
+ */
+ public abstract void killNameNode(ServerName serverName) throws IOException;
+
+ /**
+ * Stops the namenode on {@code serverName} if this is a distributed cluster. The mini cluster
+ * implementation does not support this and only logs a warning.
+ * @param serverName the server whose namenode should be stopped
+ * @throws IOException if something goes wrong
+ */
+ public abstract void stopNameNode(ServerName serverName) throws IOException;
+
+ /**
+ * Wait for the specified namenode to start. The method returns nothing: a normal return means
+ * the wait completed, and failure is reported through the exception.
+ * @param serverName the server whose namenode is being waited on
+ * @param timeout upper bound on the wait (time unit is not stated here; confirm against the
+ *   implementations)
+ * @throws IOException if something goes wrong or timeout occurs
+ */
+ public abstract void waitForNameNodeToStart(ServerName serverName, long timeout)
+ throws IOException;
+
+ /**
+ * Wait for the specified namenode to stop. The method returns nothing: a normal return means
+ * the wait completed, and failure is reported through the exception.
+ * @param serverName the server whose namenode is being waited on
+ * @param timeout upper bound on the wait (time unit is not stated here; confirm against the
+ *   implementations)
+ * @throws IOException if something goes wrong or timeout occurs
+ */
+ public abstract void waitForNameNodeToStop(ServerName serverName, long timeout)
+ throws IOException;
+
+ /**
* Starts a new master on the given hostname or if this is a mini/local cluster,
* starts a master locally.
* @param hostname the hostname to start the master on
http://git-wip-us.apache.org/repos/asf/hbase/blob/b3e41c95/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java
index 5eb7218..473eb74 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java
@@ -24,6 +24,7 @@ import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
+
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.hbase.master.HMaster;
@@ -349,6 +350,31 @@ public class MiniHBaseCluster extends HBaseCluster {
}
@Override
+  public void startNameNode(ServerName serverName) throws IOException {
+    // Starting a namenode is not supported on the mini cluster: warn and return.
+    final String notSupported = "Starting namenodes on mini cluster is not supported";
+    LOG.warn(notSupported);
+  }
+
+  @Override
+  public void killNameNode(ServerName serverName) throws IOException {
+    // Killing a namenode is not supported on the mini cluster: warn and return.
+    final String notSupported = "Aborting namenodes on mini cluster is not supported";
+    LOG.warn(notSupported);
+  }
+
+  @Override
+  public void stopNameNode(ServerName serverName) throws IOException {
+    // Stopping a namenode is not supported on the mini cluster: warn and return.
+    final String notSupported = "Stopping namenodes on mini cluster is not supported";
+    LOG.warn(notSupported);
+  }
+
+  @Override
+  public void waitForNameNodeToStart(ServerName serverName, long timeout) throws IOException {
+    // No waiting happens on the mini cluster: warn and return immediately.
+    final String notSupported =
+      "Waiting for namenodes to start on mini cluster is not supported";
+    LOG.warn(notSupported);
+  }
+
+  @Override
+  public void waitForNameNodeToStop(ServerName serverName, long timeout) throws IOException {
+    // No waiting happens on the mini cluster: warn and return immediately.
+    final String notSupported =
+      "Waiting for namenodes to stop on mini cluster is not supported";
+    LOG.warn(notSupported);
+  }
+
+ @Override
public void startMaster(String hostname, int port) throws IOException {
this.startMaster();
}