You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by li...@apache.org on 2013/05/16 21:20:15 UTC

svn commit: r1483521 - in /hbase/branches/0.89-fb: bin/rolling_restart_v2 src/main/java/org/apache/hadoop/hbase/util/RollingRestart.java

Author: liyin
Date: Thu May 16 19:20:14 2013
New Revision: 1483521

URL: http://svn.apache.org/r1483521
Log:
[HBASE-8500] Minor Changes in RollingRestart + a basic wrapper around the utility

Author: rshroff

Summary:
Made small changes in RollingRestart class to be able to remotely
restart a region server.

The change also adds a very basic wrapper (a Groovy script) around the RollingRestart utility
to perform the rolling restart (RR) for all/selected region servers in the cluster.

Test Plan: tested it multiple times on TSH025

Reviewers: aaiyer, liyintang, paultuckfield

Reviewed By: aaiyer

CC: hbase-eng@

Differential Revision: https://phabricator.fb.com/D807373

Task ID: 2229110

Added:
    hbase/branches/0.89-fb/bin/rolling_restart_v2
Modified:
    hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/RollingRestart.java

Added: hbase/branches/0.89-fb/bin/rolling_restart_v2
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/bin/rolling_restart_v2?rev=1483521&view=auto
==============================================================================
--- hbase/branches/0.89-fb/bin/rolling_restart_v2 (added)
+++ hbase/branches/0.89-fb/bin/rolling_restart_v2 Thu May 16 19:20:14 2013
@@ -0,0 +1,155 @@
+#!/bin/env /opt/hbase/bin/hbase-groovy
+import org.apache.hadoop.hbase.HBaseConfiguration; 
+import org.apache.hadoop.conf.Configuration; 
+import org.apache.hadoop.hbase.client.MetaScanner;
+import org.apache.hadoop.hbase.client.HBaseAdmin;
+import org.apache.hadoop.hbase.client.*;
+import org.apache.hadoop.hbase.master.*;
+import org.apache.hadoop.hbase.*;
+import org.apache.hadoop.hbase.util.Bytes;
+import java.io.ByteArrayInputStream;
+import org.apache.log4j.*
+import org.apache.commons.cli.*
+
+// Logging levels for this script: ZooKeeper at ERROR, HBase at INFO.
+Logger zkLogger = Logger.getLogger("org.apache.zookeeper");
+Logger hbaseLogger = Logger.getLogger("org.apache.hadoop.hbase");
+zkLogger.setLevel(Level.ERROR);
+hbaseLogger.setLevel(Level.INFO);
+
+/**
+ * Runs the RollingRestart utility for a single region server and checks
+ * afterwards that the server reports a newer start code.
+ *
+ * The output (stdout and stderr) of the utility is written to
+ * <logsFolder>/rolling_restart_<host>.txt.
+ *
+ * @param admin              used to reach the region server after the restart
+ * @param regionserver       server info captured before the restart
+ * @param conf               configuration; "titan.cell.name" selects the install path
+ * @param rollingRestartArgs extra arguments appended verbatim to the command line
+ * @param logsFolder         folder that receives the per-host log file
+ * @throws Exception if the start code did not increase after the restart
+ */
+def restartRegionServer(HBaseAdmin admin, HServerInfo regionserver,
+    Configuration conf, String rollingRestartArgs, String logsFolder) {
+  // NOTE(review): the start code is compared at 1/1000 granularity, as in
+  // the original script; the reason for the division is not visible here.
+  long startCode = regionserver.getStartCode()/1000;
+  String host = regionserver.getHostname();
+
+  // Start the rolling restart
+  String cmd ="/usr/local/hadoop/" + conf.get("titan.cell.name") +
+    "-HBASE/bin/hbase org.apache.hadoop.hbase.util.RollingRestart" +
+    rollingRestartArgs + " -s " + host;
+
+  println "Performing rolling restart for host " + host;
+  println cmd;
+
+  File file = new File(logsFolder + "/rolling_restart_" + host + ".txt");
+  println "Check the output at " + file.getPath();
+
+  // FileOutputStream creates (or truncates) the file itself.
+  FileOutputStream ostream = new FileOutputStream(file);
+  try {
+    def proc = cmd.execute();
+    // Keep the consumer threads so they can be joined before the log file
+    // is closed; otherwise trailing output may be lost.
+    Thread outConsumer = proc.consumeProcessOutputStream(ostream);
+    Thread errConsumer = proc.consumeProcessErrorStream(ostream);
+
+    // Give the utility at most 30 minutes, then kill it.
+    proc.waitForOrKill(30 * 60 * 1000);
+
+    // Bounded joins: do not hang forever if a pipe is still held open.
+    outConsumer.join(10 * 1000);
+    errConsumer.join(10 * 1000);
+  } finally {
+    ostream.close();
+  }
+
+  long newStartCode = admin.getConnection().
+    getHRegionConnection(regionserver.getServerAddress()).getHServerInfo().getStartCode()/1000;
+
+  // A start code that did not increase is treated as a failed restart.
+  if (newStartCode <= startCode) {
+    throw new Exception("Rolling Restart failed for Regionserver " + host);
+  }
+}
+
+// START
+
+// Command line options of this wrapper. Apart from -n, -a and -f, the options
+// given on the command line are appended to the RollingRestart command line.
+Options options = new Options();
+
+options.addOption("n", "nodes", true,
+    "Comma-separated names of the region servers to restart");
+options.addOption("a", "all", false,
+    "Restart all the region servers in the cluster");
+options.addOption("r", "sleep_after_restart", true,
+    "time interval after which the region server should be started assigning regions. Default : 10000ms");
+options.addOption("b", "sleep_before_restart", true,
+    "time interval after which the region server should be restarted after draining. Default : 10000ms");
+options.addOption("d", "region_drain_interval", true,
+    "time interval between region movements while draining. Default : 1000ms");
+options.addOption("u", "region_undrain_interval", true,
+    "time interval between region movements while undraining. Default : 10000ms");
+options.addOption("g", "get_request_frequency", true,
+    "frequency at which region checker will check for region availability. Default : 1000ms");
+options.addOption("c", "clear", false,
+    "Clear all the regionserver from blacklist. Default : false");
+options.addOption("h", "dont_use_hadoopctl", false,
+    "Don't use hadoopctl to restart the regionserver. Default : true");
+options.addOption("f", "log_file_folder", true,
+    "Default location where logs for rolling restart should be stored. Default : /tmp");
+
+// Prints the usage text; shared by both "nothing to do" exits below.
+def printUsage = {
+  HelpFormatter formatter = new HelpFormatter();
+  formatter.printHelp("rolling_restart", options, true);
+}
+
+if (args.length == 0) {
+  printUsage();
+  return;
+}
+
+CommandLineParser parser = new PosixParser();
+CommandLine cmd = parser.parse(options, args);
+
+String rollingRestartArgs = "";
+String logsFolder = "/tmp";
+hosts = null;
+
+// Options that carry a value are forwarded as " -<opt> <value>".
+for (String valueOpt in ['r', 'b', 'd', 'u', 'g']) {
+  if (cmd.hasOption(valueOpt)) {
+    rollingRestartArgs += " -" + valueOpt + " " + cmd.getOptionValue(valueOpt);
+  }
+}
+
+// -c and -h are declared without an argument, so getOptionValue() returns
+// null for them; forward only the flag instead of appending "null".
+for (String flagOpt in ['c', 'h']) {
+  if (cmd.hasOption(flagOpt)) {
+    rollingRestartArgs += " -" + flagOpt;
+  }
+}
+
+if (cmd.hasOption('f')) {
+  logsFolder = cmd.getOptionValue('f');
+}
+
+// Either an explicit host list (-n) or all servers (-a) must be selected.
+if (cmd.hasOption("n")) {
+  hosts = cmd.getOptionValue("n").split(',');
+  println hosts;
+} else if (cmd.hasOption('a')) {
+  println "Performing restart on all regionservers";
+} else {
+  printUsage();
+  return;
+}
+
+conf = HBaseConfiguration.create();
+admin = new HBaseAdmin(conf);
+
+// Hosts whose restart failed are recorded here, one per line.
+PrintWriter errFileWriter = new PrintWriter(logsFolder + "/rolling_restart_failed_nodes");
+
+try {
+  if (hosts != null) {
+    // Restart only the servers named with -n.
+    for (String server in hosts) {
+      try {
+        // Resolved inside the try so that one bad name does not abort the run.
+        HServerAddress serverAddr = new HServerAddress(server, 60020);
+        HServerInfo host =
+          admin.getConnection().getHRegionConnection(serverAddr).getHServerInfo();
+        restartRegionServer(admin, host, conf, rollingRestartArgs, logsFolder);
+      } catch (Exception e) {
+        e.printStackTrace();
+        println "\nERROR: "+ server + " Failed.";
+        // "host" is declared inside the try block and is not visible here;
+        // record the name that was requested instead.
+        errFileWriter.println(server);
+        errFileWriter.flush();
+      }
+    }
+  } else {
+    // Restart every server reported by the cluster status.
+    for (host in admin.getClusterStatus().getServerInfo()) {
+      try {
+        restartRegionServer(admin, host, conf, rollingRestartArgs, logsFolder);
+      } catch (Exception e) {
+        e.printStackTrace();
+        println "\nERROR: "+ host + " Failed.";
+        errFileWriter.println(host.getHostname());
+        errFileWriter.flush();
+      }
+    }
+  }
+} finally {
+  // Close the failed-nodes file even if the loop ends unexpectedly.
+  errFileWriter.close();
+}

Modified: hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/RollingRestart.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/RollingRestart.java?rev=1483521&r1=1483520&r2=1483521&view=diff
==============================================================================
--- hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/RollingRestart.java (original)
+++ hbase/branches/0.89-fb/src/main/java/org/apache/hadoop/hbase/util/RollingRestart.java Thu May 16 19:20:14 2013
@@ -85,7 +85,6 @@ public class RollingRestart {
       currentState = STAGE.FAIL;
       return;
     }
-
     this.serverAddr = new HServerAddress(serverName, 60020);
 
     currentState = STAGE.SETUP;
@@ -164,22 +163,50 @@ public class RollingRestart {
    * @throws InterruptedException
    */
   void restart() throws IOException, InterruptedException {
-    System.out.println("Shutting down the region server");
+    System.out.println("Shutting down the region server after sleep of " +
+        this.sleepIntervalBeforeRestart);
     Thread.sleep(this.sleepIntervalBeforeRestart);
     String cellName = conf.get("titan.cell.name");
-    try {
+    String sshCmd = "ssh hadoop@" + serverAddr.getHostname();
 
+    try {
       if (this.useHadoopCtl) {
-        Process p = Runtime.getRuntime().exec("/usr/local/bin/hadoopctl restart regionserver");
+        sshCmd += " hadoopctl restart regionserver";
+        LOG.info("Executing " + sshCmd);
+        Process p = Runtime.getRuntime().exec(sshCmd);
+
         p.waitFor();
+
+        LOG.info("Exit value for the region server restart " + p.exitValue());
+
+        if (p.exitValue() != 0) {
+          LOG.error("Failed to restart. regionserver. Aborting..");
+          throw new IOException("Failed to restart regionserver. Aborting..");
+        }
       } else {
-        Process p = Runtime.getRuntime().exec("/usr/local/hadoop/" +
-            cellName + "-HBASE/bin/hbase-daemon.sh stop regionserver");
+        String sshCmdToStopRS = sshCmd + " /usr/local/hadoop/" +
+            cellName + "-HBASE/bin/hbase-daemon.sh stop regionserver";
+        LOG.info("Executing " + sshCmd);
+        Process p = Runtime.getRuntime().exec(sshCmdToStopRS);
         p.waitFor();
-        p = Runtime.getRuntime().exec("/usr/local/hadoop/" +
-            cellName + "-HBASE/bin/hbase-daemon.sh start regionserver");
+
+        LOG.info("Exit value for the region server stop " + p.exitValue());
+
+        if (p.exitValue() != 0) {
+          LOG.error("Failed to stop regionserver. Aborting..");
+          throw new IOException("Failed to stop regionserver. Aborting..");
+        }
+        String sshCmdToStartRS = sshCmd + " /usr/local/hadoop/" +
+            cellName + "-HBASE/bin/hbase-daemon.sh start regionserver ";
+        p = Runtime.getRuntime().exec(sshCmdToStartRS);
         p.waitFor();
-        LOG.info("Exit value for the restarter " + p.exitValue());
+
+        LOG.info("Exit value for the region server start " + p.exitValue());
+
+        if (p.exitValue() != 0) {
+          LOG.error("Failed to start regionserver. Aborting..");
+          throw new IOException("Failed to start regionserver. Aborting..");
+        }
       }
 
     } catch (IOException e1) {
@@ -210,6 +237,9 @@ public class RollingRestart {
 
     List<HServerAddress> serversForRegion = plan.getAssignment(region);
 
+    if (serversForRegion == null) {
+      return null;
+    }
     // Get the preferred region server from the Assignment Plan
     for (HServerAddress server : serversForRegion) {
       if (!server.equals(serverAddr)) {
@@ -402,7 +432,7 @@ public class RollingRestart {
         "Name of the region server to restart");
     options.addOption("r", "sleep_after_restart", true,
         "time interval after which the region server should be started assigning regions. Default : 10000ms");
-    options.addOption("r", "sleep_before_restart", true,
+    options.addOption("b", "sleep_before_restart", true,
         "time interval after which the region server should be restarted after draining. Default : 10000ms");
     options.addOption("d", "region_drain_interval", true,
         "time interval between region movements while draining. Default : 1000ms");
@@ -434,6 +464,7 @@ public class RollingRestart {
 
     if (cmd.hasOption("c")) {
       RollingRestart.clearAll();
+      return;
     }
 
     if (!cmd.hasOption("s")) {