You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by te...@apache.org on 2018/08/02 11:59:57 UTC

hbase git commit: HBASE-19036 Add action in Chaos Monkey to restart Active Namenode

Repository: hbase
Updated Branches:
  refs/heads/master 78164efcf -> b3e41c952


HBASE-19036 Add action in Chaos Monkey to restart Active Namenode

Signed-off-by: tedyu <yu...@gmail.com>


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/b3e41c95
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/b3e41c95
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/b3e41c95

Branch: refs/heads/master
Commit: b3e41c9525f0f8537b87bb7bf923cf74c31ee585
Parents: 78164ef
Author: Monani Mihir <mo...@gmail.com>
Authored: Tue Jul 31 18:44:45 2018 +0530
Committer: tedyu <yu...@gmail.com>
Committed: Thu Aug 2 04:59:51 2018 -0700

----------------------------------------------------------------------
 .../hadoop/hbase/DistributedHBaseCluster.java   | 33 ++++++-
 .../hadoop/hbase/HBaseClusterManager.java       |  2 +
 .../hadoop/hbase/chaos/actions/Action.java      | 28 ++++++
 .../chaos/actions/RestartActionBaseAction.java  | 12 +++
 .../actions/RestartActiveNameNodeAction.java    | 90 ++++++++++++++++++++
 .../org/apache/hadoop/hbase/HBaseCluster.java   | 37 ++++++++
 .../apache/hadoop/hbase/MiniHBaseCluster.java   | 26 ++++++
 7 files changed, 227 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/b3e41c95/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java
----------------------------------------------------------------------
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java
index 943f2a6..5ec9e25 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/DistributedHBaseCluster.java
@@ -25,6 +25,7 @@ import java.util.List;
 import java.util.Objects;
 import java.util.Set;
 import java.util.TreeSet;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.ClusterManager.ServiceType;
 import org.apache.hadoop.hbase.client.Admin;
@@ -35,7 +36,6 @@ import org.apache.hadoop.hbase.client.RegionLocator;
 import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.Threads;
 import org.apache.yetus.audience.InterfaceAudience;
-
 import org.apache.hadoop.hbase.shaded.protobuf.ProtobufUtil;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos;
 import org.apache.hadoop.hbase.shaded.protobuf.generated.AdminProtos.ServerInfo;
@@ -204,6 +204,37 @@ public class DistributedHBaseCluster extends HBaseCluster {
     waitForServiceToStop(ServiceType.HADOOP_DATANODE, serverName, timeout);
   }
 
+  @Override
+  public void startNameNode(ServerName serverName) throws IOException {
+    final String host = serverName.getHostname(); // cluster manager is addressed by host/port
+    LOG.info("Starting name node on: " + serverName.getServerName());
+    clusterManager.start(ServiceType.HADOOP_NAMENODE, host, serverName.getPort());
+  }
+
+  @Override
+  public void killNameNode(ServerName serverName) throws IOException {
+    final String host = serverName.getHostname(); // cluster manager is addressed by host/port
+    LOG.info("Aborting name node on: " + serverName.getServerName());
+    clusterManager.kill(ServiceType.HADOOP_NAMENODE, host, serverName.getPort());
+  }
+
+  @Override
+  public void stopNameNode(ServerName serverName) throws IOException {
+    final String host = serverName.getHostname(); // cluster manager is addressed by host/port
+    LOG.info("Stopping name node on: " + serverName.getServerName());
+    clusterManager.stop(ServiceType.HADOOP_NAMENODE, host, serverName.getPort());
+  }
+
+  @Override
+  public void waitForNameNodeToStart(ServerName serverName, long timeout) throws IOException {
+    waitForServiceToStart(ServiceType.HADOOP_NAMENODE, serverName, timeout); // generic wait helper
+  }
+
+  @Override
+  public void waitForNameNodeToStop(ServerName serverName, long timeout) throws IOException {
+    waitForServiceToStop(ServiceType.HADOOP_NAMENODE, serverName, timeout); // shared stop-wait
+  }
+
   private void waitForServiceToStop(ServiceType service, ServerName serverName, long timeout)
     throws IOException {
     LOG.info("Waiting for service: " + service + " to stop: " + serverName.getServerName());

http://git-wip-us.apache.org/repos/asf/hbase/blob/b3e41c95/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java
----------------------------------------------------------------------
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java
index 884ddad..f7c2fc6 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/HBaseClusterManager.java
@@ -101,6 +101,7 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
     Configuration conf = getConf();
     switch (service) {
       case HADOOP_DATANODE:
+      case HADOOP_NAMENODE:
         return conf.get("hbase.it.clustermanager.hadoop.hdfs.user", "hdfs");
       case ZOOKEEPER_SERVER:
         return conf.get("hbase.it.clustermanager.zookeeper.user", "zookeeper");
@@ -282,6 +283,7 @@ public class HBaseClusterManager extends Configured implements ClusterManager {
   protected CommandProvider getCommandProvider(ServiceType service) throws IOException {
     switch (service) {
       case HADOOP_DATANODE:
+      case HADOOP_NAMENODE:
         return new HadoopShellCommandProvider(getConf());
       case ZOOKEEPER_SERVER:
         return new ZookeeperShellCommandProvider(getConf());

http://git-wip-us.apache.org/repos/asf/hbase/blob/b3e41c95/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
----------------------------------------------------------------------
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
index 2b2c1b8..350e18a 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/Action.java
@@ -26,6 +26,7 @@ import java.util.List;
 import java.util.Map;
 import java.util.function.BiConsumer;
 import java.util.function.Consumer;
+
 import org.apache.commons.lang3.RandomUtils;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.ClusterMetrics;
@@ -65,6 +66,10 @@ public class Action {
     "hbase.chaosmonkey.action.killdatanodetimeout";
   public static final String START_DATANODE_TIMEOUT_KEY =
     "hbase.chaosmonkey.action.startdatanodetimeout";
+  public static final String KILL_NAMENODE_TIMEOUT_KEY =
+      "hbase.chaosmonkey.action.killnamenodetimeout";
+  public static final String START_NAMENODE_TIMEOUT_KEY =
+      "hbase.chaosmonkey.action.startnamenodetimeout";
 
   protected static final Logger LOG = LoggerFactory.getLogger(Action.class);
 
@@ -76,6 +81,8 @@ public class Action {
   protected static final long START_ZK_NODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
   protected static final long KILL_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
   protected static final long START_DATANODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
+  protected static final long KILL_NAMENODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
+  protected static final long START_NAMENODE_TIMEOUT_DEFAULT = PolicyBasedChaosMonkey.TIMEOUT;
 
   protected ActionContext context;
   protected HBaseCluster cluster;
@@ -90,6 +97,8 @@ public class Action {
   protected long startZkNodeTimeout;
   protected long killDataNodeTimeout;
   protected long startDataNodeTimeout;
+  protected long killNameNodeTimeout;
+  protected long startNameNodeTimeout;
 
   public void init(ActionContext context) throws IOException {
     this.context = context;
@@ -112,6 +121,11 @@ public class Action {
       KILL_DATANODE_TIMEOUT_DEFAULT);
     startDataNodeTimeout = cluster.getConf().getLong(START_DATANODE_TIMEOUT_KEY,
       START_DATANODE_TIMEOUT_DEFAULT);
+    killNameNodeTimeout =
+        cluster.getConf().getLong(KILL_NAMENODE_TIMEOUT_KEY, KILL_NAMENODE_TIMEOUT_DEFAULT);
+    startNameNodeTimeout =
+        cluster.getConf().getLong(START_NAMENODE_TIMEOUT_KEY, START_NAMENODE_TIMEOUT_DEFAULT);
+
   }
 
   public void perform() throws Exception { }
@@ -197,6 +211,20 @@ public class Action {
     LOG.info("Started datanode " + server);
   }
 
+  protected void killNameNode(ServerName server) throws IOException {
+    LOG.info("Killing namenode :-" + server.getHostname());
+    cluster.killNameNode(server);
+    cluster.waitForNameNodeToStop(server, killNameNodeTimeout);
+    final int liveServers = cluster.getClusterMetrics().getLiveServerMetrics().size();
+    LOG.info("Killed namenode:" + server + ". Reported num of rs:" + liveServers);
+  }
+
+  protected void startNameNode(ServerName server) throws IOException {
+    LOG.info("Starting Namenode :-" + server.getHostname());
+    cluster.startNameNode(server);
+    cluster.waitForNameNodeToStart(server, startNameNodeTimeout); // see START_NAMENODE_TIMEOUT_KEY
+    LOG.info("Started namenode:" + server);
+  }
   protected void unbalanceRegions(ClusterMetrics clusterStatus,
       List<ServerName> fromServers, List<ServerName> toServers,
       double fractionOfRegions) throws Exception {

http://git-wip-us.apache.org/repos/asf/hbase/blob/b3e41c95/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java
----------------------------------------------------------------------
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java
index 63286cb..6e589ae 100644
--- a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActionBaseAction.java
@@ -82,4 +82,16 @@ public class RestartActionBaseAction extends Action {
     sleep(sleepTime);
     startDataNode(server);
   }
+
+  void restartNameNode(ServerName server, long sleepTime) throws IOException {
+    // Don't begin a kill/start cycle if the action context is already stopping.
+    if (context.isStopping()) {
+      return;
+    }
+    final long downTime = Math.max(sleepTime, 1000); // never pause for less than 1000
+    killNameNode(server);
+    sleep(downTime);
+    startNameNode(server);
+  }
+
 }

http://git-wip-us.apache.org/repos/asf/hbase/blob/b3e41c95/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActiveNameNodeAction.java
----------------------------------------------------------------------
diff --git a/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActiveNameNodeAction.java b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActiveNameNodeAction.java
new file mode 100644
index 0000000..645743a
--- /dev/null
+++ b/hbase-it/src/test/java/org/apache/hadoop/hbase/chaos/actions/RestartActiveNameNodeAction.java
@@ -0,0 +1,90 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.hadoop.hbase.chaos.actions;
+
+import java.util.List;
+
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.hbase.ServerName;
+import org.apache.hadoop.hbase.util.FSUtils;
+import org.apache.hadoop.hbase.zookeeper.RecoverableZooKeeper;
+import org.apache.hadoop.hbase.zookeeper.ZKUtil;
+import org.apache.hadoop.hbase.zookeeper.ZKWatcher;
+import org.apache.hadoop.hbase.zookeeper.ZNodePaths;
+import org.apache.hadoop.hdfs.DFSUtil;
+import org.apache.hadoop.hdfs.HAUtil;
+import org.apache.hadoop.hdfs.server.namenode.ha.proto.HAZKInfoProtos.ActiveNodeInfo;
+
+/**
+ * Action that tries to restart the active namenode. The active namenode is discovered by reading
+ * the HDFS HA elector lock znode from ZooKeeper, so this action fails when HDFS HA is not enabled
+ * or when no lock znode is present.
+ */
+public class RestartActiveNameNodeAction extends RestartActionBaseAction {
+
+  // Value taken from org.apache.hadoop.ha.ActiveStandbyElector.java, variable :- LOCK_FILENAME
+  private static final String ACTIVE_NN_LOCK_NAME = "ActiveStandbyElectorLock";
+
+  // Value taken from org.apache.hadoop.ha.ZKFailoverController.java
+  // variable :- ZK_PARENT_ZNODE_DEFAULT and ZK_PARENT_ZNODE_KEY
+  private static final String ZK_PARENT_ZNODE_DEFAULT = "/hadoop-ha";
+  private static final String ZK_PARENT_ZNODE_KEY = "ha.zookeeper.parent-znode";
+
+  public RestartActiveNameNodeAction(long sleepTime) {
+    super(sleepTime);
+  }
+
+  @Override
+  public void perform() throws Exception {
+    LOG.info("Performing action: Restart active namenode");
+    Configuration conf = FSUtils.getRootDir(getConf()).getFileSystem(getConf()).getConf();
+    String nameServiceID = DFSUtil.getNamenodeNameServiceId(conf);
+    if (!HAUtil.isHAEnabled(conf, nameServiceID)) {
+      throw new Exception("HA for namenode is not enabled");
+    }
+    ZKWatcher zkw = null;
+    String activeNamenode = null;
+    String hadoopHAZkNode = conf.get(ZK_PARENT_ZNODE_KEY, ZK_PARENT_ZNODE_DEFAULT);
+    try {
+      zkw = new ZKWatcher(conf, "get-active-namenode", null);
+      RecoverableZooKeeper rzk = zkw.getRecoverableZooKeeper();
+      String hadoopHAZkNodePath = ZNodePaths.joinZNode(hadoopHAZkNode, nameServiceID);
+      // listChildrenNoWatch returns null when the znode is missing; treat that as "no lock".
+      List<String> children = ZKUtil.listChildrenNoWatch(zkw, hadoopHAZkNodePath);
+      if (children != null && children.contains(ACTIVE_NN_LOCK_NAME)) {
+        // The lock znode holds a serialized ActiveNodeInfo naming the current active namenode.
+        String lockPath = ZNodePaths.joinZNode(hadoopHAZkNodePath, ACTIVE_NN_LOCK_NAME);
+        byte[] data = rzk.getData(lockPath, false, null);
+        activeNamenode = ActiveNodeInfo.parseFrom(data).getHostname();
+      }
+    } finally {
+      if (zkw != null) {
+        zkw.close();
+      }
+    }
+    if (activeNamenode == null) {
+      throw new Exception("No active Name node found in zookeeper under " + hadoopHAZkNode);
+    }
+    LOG.info("Found active namenode host:" + activeNamenode);
+    // Only the hostname is known here; port and start code are passed as -1 placeholders.
+    ServerName activeNNHost = ServerName.valueOf(activeNamenode, -1, -1);
+    LOG.info("Restarting Active NameNode :" + activeNamenode);
+    restartNameNode(activeNNHost, sleepTime);
+  }
+}

http://git-wip-us.apache.org/repos/asf/hbase/blob/b3e41c95/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java
index 59a0059..a1b474d 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/HBaseCluster.java
@@ -19,6 +19,7 @@ package org.apache.hadoop.hbase;
 
 import java.io.Closeable;
 import java.io.IOException;
+
 import org.apache.hadoop.conf.Configurable;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.hbase.util.Threads;
@@ -248,6 +249,42 @@ public abstract class HBaseCluster implements Closeable, Configurable {
     throws IOException;
 
   /**
+   * Starts the namenode on the host given by {@code serverName}; on a mini/local cluster this only
+   * logs a warning.
+   * @throws IOException if something goes wrong
+   */
+  public abstract void startNameNode(ServerName serverName) throws IOException;
+
+  /**
+   * Kills the namenode process if this is a distributed cluster, otherwise silently logs warning
+   * message.
+   * @throws IOException if something goes wrong
+   */
+  public abstract void killNameNode(ServerName serverName) throws IOException;
+
+  /**
+   * Stops the namenode if this is a distributed cluster, otherwise only logs a warning message.
+   * @throws IOException if something goes wrong
+   */
+  public abstract void stopNameNode(ServerName serverName) throws IOException;
+
+  /**
+   * Wait for the specified namenode to join the cluster.
+   * @param timeout maximum time to wait for the namenode to start
+   * @throws IOException if something goes wrong or timeout occurs
+   */
+  public abstract void waitForNameNodeToStart(ServerName serverName, long timeout)
+      throws IOException;
+
+  /**
+   * Wait for the specified namenode to stop.
+   * @param timeout maximum time to wait for the namenode to stop
+   * @throws IOException if something goes wrong or timeout occurs
+   */
+  public abstract void waitForNameNodeToStop(ServerName serverName, long timeout)
+      throws IOException;
+
+  /**
    * Starts a new master on the given hostname or if this is a mini/local cluster,
    * starts a master locally.
    * @param hostname the hostname to start the master on

http://git-wip-us.apache.org/repos/asf/hbase/blob/b3e41c95/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java
index 5eb7218..473eb74 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/MiniHBaseCluster.java
@@ -24,6 +24,7 @@ import java.util.ArrayList;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Set;
+
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.hbase.master.HMaster;
@@ -349,6 +350,31 @@ public class MiniHBaseCluster extends HBaseCluster {
   }
 
   @Override
+  public void startNameNode(ServerName serverName) throws IOException {
+    LOG.warn("Starting namenodes on mini cluster is not supported"); // warn-only stub
+  }
+
+  @Override
+  public void killNameNode(ServerName serverName) throws IOException {
+    LOG.warn("Aborting namenodes on mini cluster is not supported"); // warn-only stub
+  }
+
+  @Override
+  public void stopNameNode(ServerName serverName) throws IOException {
+    LOG.warn("Stopping namenodes on mini cluster is not supported"); // warn-only stub
+  }
+
+  @Override
+  public void waitForNameNodeToStart(ServerName serverName, long timeout) throws IOException {
+    LOG.warn("Waiting for namenodes to start on mini cluster is not supported"); // warn-only stub
+  }
+
+  @Override
+  public void waitForNameNodeToStop(ServerName serverName, long timeout) throws IOException {
+    LOG.warn("Waiting for namenodes to stop on mini cluster is not supported"); // warn-only stub
+  }
+
+  @Override
   public void startMaster(String hostname, int port) throws IOException {
     this.startMaster();
   }