You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by st...@apache.org on 2011/04/05 06:09:15 UTC
svn commit: r1088881 - in /hbase/branches/0.90/bin: graceful_stop.sh hbase
hbase-daemons.sh region_mover.rb
Author: stack
Date: Tue Apr 5 04:09:14 2011
New Revision: 1088881
URL: http://svn.apache.org/viewvc?rev=1088881&view=rev
Log:
HBASE-3071 Graceful decommissioning of a regionserver
Added:
hbase/branches/0.90/bin/graceful_stop.sh
hbase/branches/0.90/bin/region_mover.rb
Modified:
hbase/branches/0.90/bin/hbase
hbase/branches/0.90/bin/hbase-daemons.sh
Added: hbase/branches/0.90/bin/graceful_stop.sh
URL: http://svn.apache.org/viewvc/hbase/branches/0.90/bin/graceful_stop.sh?rev=1088881&view=auto
==============================================================================
--- hbase/branches/0.90/bin/graceful_stop.sh (added)
+++ hbase/branches/0.90/bin/graceful_stop.sh Tue Apr 5 04:09:14 2011
@@ -0,0 +1,83 @@
+#!/usr/bin/env bash
+#
+#/**
+# * Copyright 2011 The Apache Software Foundation
+# *
+# * Licensed to the Apache Software Foundation (ASF) under one
+# * or more contributor license agreements. See the NOTICE file
+# * distributed with this work for additional information
+# * regarding copyright ownership. The ASF licenses this file
+# * to you under the Apache License, Version 2.0 (the
+# * "License"); you may not use this file except in compliance
+# * with the License. You may obtain a copy of the License at
+# *
+# * http://www.apache.org/licenses/LICENSE-2.0
+# *
+# * Unless required by applicable law or agreed to in writing, software
+# * distributed under the License is distributed on an "AS IS" BASIS,
+# * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# * See the License for the specific language governing permissions and
+# * limitations under the License.
+# */
+
+# Move regions off a server then stop it. Optionally restart and reload.
+# Turn off the balancer before running this script.
+function usage {
+ echo "Usage: graceful_stop.sh [--config <conf-dir>] [--restart] [--reload] <hostname>"
+ echo " restart If we should restart after graceful stop"
+ echo " reload Move offloaded regions back on to the stopped server"
+ echo " debug Move offloaded regions back on to the stopped server"
+ echo " hostname Hostname of server we are to stop"
+ exit 1
+}
+
+if [ $# -lt 1 ]; then
+ usage
+fi
+
+bin=`dirname "$0"`
+bin=`cd "$bin">/dev/null; pwd`
+# This will set HBASE_HOME, etc.
+. "$bin"/hbase-config.sh
+# Get arguments
+restart=
+reload=
+debug=
+while [ $# -gt 0 ]
+do
+ case "$1" in
+ --restart) restart=true; shift;;
+ --reload) reload=true; shift;;
+ --debug) debug="--debug"; shift;;
+ --) shift; break;;
+ -*) usage ;;
+ *) break;; # terminate while loop
+ esac
+done
+
+# "$@" contains the rest. Must be at least the hostname left.
+if [ $# -lt 1 ]; then
+ usage
+fi
+
+hostname=$1
+filename="/tmp/$hostname"
+# Run the region mover script.
+echo "Unloading $hostname region(s)"
+HBASE_NOEXEC=true "$bin"/hbase org.jruby.Main "$bin"/region_mover.rb --file=$filename $debug unload $hostname
+echo "Unloaded $hostname region(s)"
+# Stop the server. Have to put hostname into its own little file for hbase-daemons.sh
+hosts="/tmp/$(basename $0).$$.tmp"
+echo $hostname >> $hosts
+"$bin"/hbase-daemons.sh --hosts ${hosts} stop regionserver
+if [ "$restart" != "" ]; then
+ "$bin"/hbase-daemons.sh --hosts ${hosts} start regionserver
+ if [ "$reload" != "" ]; then
+ echo "Reloading $hostname region(s)"
+ HBASE_NOEXEC=true "$bin"/hbase org.jruby.Main "$bin"/region_mover.rb --file=$filename $debug load $hostname
+ echo "Reloaded $hostname region(s)"
+ fi
+fi
+
+# Cleanup tmp files.
+trap "rm -f "/tmp/$(basename $0).*.tmp" &> /dev/null" EXIT
Modified: hbase/branches/0.90/bin/hbase
URL: http://svn.apache.org/viewvc/hbase/branches/0.90/bin/hbase?rev=1088881&r1=1088880&r2=1088881&view=diff
==============================================================================
--- hbase/branches/0.90/bin/hbase (original)
+++ hbase/branches/0.90/bin/hbase Tue Apr 5 04:09:14 2011
@@ -268,5 +268,9 @@ if [ "x$JAVA_LIBRARY_PATH" != "x" ]; the
HBASE_OPTS="$HBASE_OPTS -Djava.library.path=$JAVA_LIBRARY_PATH"
fi
-# run it
-exec "$JAVA" $JAVA_HEAP_MAX $HBASE_OPTS -classpath "$CLASSPATH" $CLASS "$@"
+# Exec unless HBASE_NOEXEC is set.
+if [ "${HBASE_NOEXEC}" != "" ]; then
+ "$JAVA" $JAVA_HEAP_MAX $HBASE_OPTS -classpath "$CLASSPATH" $CLASS "$@"
+else
+ exec "$JAVA" $JAVA_HEAP_MAX $HBASE_OPTS -classpath "$CLASSPATH" $CLASS "$@"
+fi
Modified: hbase/branches/0.90/bin/hbase-daemons.sh
URL: http://svn.apache.org/viewvc/hbase/branches/0.90/bin/hbase-daemons.sh?rev=1088881&r1=1088880&r2=1088881&view=diff
==============================================================================
--- hbase/branches/0.90/bin/hbase-daemons.sh (original)
+++ hbase/branches/0.90/bin/hbase-daemons.sh Tue Apr 5 04:09:14 2011
@@ -38,7 +38,7 @@ bin=`cd "$bin">/dev/null; pwd`
. $bin/hbase-config.sh
remote_cmd="cd ${HBASE_HOME}; $bin/hbase-daemon.sh --config ${HBASE_CONF_DIR} $@"
-args="--config ${HBASE_CONF_DIR} $remote_cmd"
+args="--hosts ${HBASE_REGIONSERVERS} --config ${HBASE_CONF_DIR} $remote_cmd"
command=$2
case $command in
Added: hbase/branches/0.90/bin/region_mover.rb
URL: http://svn.apache.org/viewvc/hbase/branches/0.90/bin/region_mover.rb?rev=1088881&view=auto
==============================================================================
--- hbase/branches/0.90/bin/region_mover.rb (added)
+++ hbase/branches/0.90/bin/region_mover.rb Tue Apr 5 04:09:14 2011
@@ -0,0 +1,434 @@
+# Copyright 2011 The Apache Software Foundation
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+# Moves regions. Will confirm region access in current location and will
+# not move a new region until successful confirm of region loading in new
+# location. Presumes balancer is disabled when we run (not harmful if its
+# on but this script and balancer will end up fighting each other).
+# Does not work for case of multiple regionservers all running on the
+# one node.
+require 'optparse'
+include Java
+import org.apache.hadoop.hbase.HConstants
+import org.apache.hadoop.hbase.HBaseConfiguration
+import org.apache.hadoop.hbase.client.HBaseAdmin
+import org.apache.hadoop.hbase.client.Get
+import org.apache.hadoop.hbase.client.Scan
+import org.apache.hadoop.hbase.client.HTable
+import org.apache.hadoop.hbase.client.HConnectionManager
+import org.apache.hadoop.hbase.filter.FirstKeyOnlyFilter;
+import org.apache.hadoop.hbase.HServerAddress
+import org.apache.hadoop.hbase.util.Bytes
+import org.apache.hadoop.hbase.util.Writables
+import org.apache.hadoop.conf.Configuration
+import org.apache.commons.logging.Log
+import org.apache.commons.logging.LogFactory
+
+# Name of this script
+NAME = "region_mover"
+
+# Get root table reference
+def getRootTable(config)
+ # Keep meta reference in ruby global
+ if not $ROOT
+ $ROOT = HTable.new(config, HConstants::ROOT_TABLE_NAME)
+ end
+ return $ROOT
+end
+
+# Get meta table reference
+def getMetaTable(config)
+ # Keep meta reference in ruby global
+ if not $META
+ $META = HTable.new(config, HConstants::META_TABLE_NAME)
+ end
+ return $META
+end
+
+# Get table instance.
+# Maintains cache of table instances.
+def getTable(config, name)
+ # Keep dictionary of tables in ruby global
+ if not $TABLES
+ $TABLES = {}
+ end
+ key = Bytes.toString(name)
+ if not $TABLES[key]
+ $TABLES[key] = HTable.new(config, name)
+ end
+ return $TABLES[key]
+end
+
+
+# Returns true if passed region is still on 'original' when we look at .META.
+def isSameServer(admin, r, original)
+ server = getServerNameForRegion(admin, r)
+ return false unless server
+ return true unless original
+ return server == original
+end
+
+class RubyAbortable
+ include org.apache.hadoop.hbase.Abortable
+ def abort(why, e)
+ puts "ABORTED! why=" + why + ", e=" + e.to_s
+ end
+end
+
+# Get servername that is up in .META.; this is hostname + port + startcode comma-delimited.
+# Can return nil
+def getServerNameForRegion(admin, r)
+ if r.isRootRegion()
+ # Hack
+ tracker = org.apache.hadoop.hbase.zookeeper.RootRegionTracker.new(admin.getConnection().getZooKeeperWatcher(), RubyAbortable.new())
+ tracker.start()
+ while not tracker.isLocationAvailable()
+ sleep 0.1
+ end
+ # Make a fake servername by appending ','
+ rootServer = tracker.getRootRegionLocation().toString() + ","
+ tracker.stop()
+ return rootServer
+ end
+ table = nil
+ if r.isMetaRegion()
+ table = getRootTable(admin.getConfiguration())
+ else
+ table = getMetaTable(admin.getConfiguration())
+ end
+ g = Get.new(r.getRegionName())
+ g.addColumn(HConstants::CATALOG_FAMILY, HConstants::SERVER_QUALIFIER)
+ g.addColumn(HConstants::CATALOG_FAMILY, HConstants::STARTCODE_QUALIFIER)
+ result = table.get(g)
+ server = result.getValue(HConstants::CATALOG_FAMILY, HConstants::SERVER_QUALIFIER)
+ startcode = result.getValue(HConstants::CATALOG_FAMILY, HConstants::STARTCODE_QUALIFIER)
+ return nil unless server
+ return java.lang.String.new(Bytes.toString(server)).replaceFirst(":", ",") + "," + Bytes.toLong(startcode).to_s
+end
+
+# Trys to scan a row from passed region
+# Throws exception if can't
+def isSuccessfulScan(admin, r)
+ scan = Scan.new(r.getStartKey())
+ scan.setBatch(1)
+ scan.setCaching(1)
+ scan.setFilter(FirstKeyOnlyFilter.new())
+ table = getTable(admin.getConfiguration(), r.getTableDesc().getName())
+ scanner = table.getScanner(scan)
+ begin
+ results = scanner.next()
+ # We might scan into next region, this might be an empty table.
+ # But if no exception, presume scanning is working.
+ ensure
+ scanner.close()
+ table.close()
+ end
+end
+
+# Check region has moved successful and is indeed hosted on another server
+# Wait until that is the case.
+def move(admin, r, newServer, original)
+ # Now move it. Do it in a loop so can retry if fail. Have seen issue where
+ # we tried move region but failed and retry put it back on old location;
+ # retry in this case.
+ retries = admin.getConfiguration.getInt("hbase.move.retries.max", 5)
+ count = 0
+ same = true
+ while count < retries and same
+ if count > 0
+ $LOG.info("Retry " + count.to_s + " of maximum " + retries.to_s)
+ end
+ count = count + 1
+ begin
+ admin.move(Bytes.toBytes(r.getEncodedName()), Bytes.toBytes(newServer))
+ rescue java.lang.reflect.UndeclaredThrowableException => e
+ $LOG.info("Exception moving " + r.getEncodedName() +
+ "; split/moved? Continuing: " + e)
+ return
+ end
+ # Wait till its up on new server before moving on
+ maxWaitInSeconds = admin.getConfiguration.getInt("hbase.move.wait.max", 60)
+ maxWait = Time.now + maxWaitInSeconds
+ while Time.now < maxWait
+ same = isSameServer(admin, r, original)
+ break unless same
+ sleep 0.1
+ end
+ end
+ raise RuntimeError, "Region stuck on #{original}, newserver=#{newServer}" if same
+ # Assert can Scan from new location.
+ isSuccessfulScan(admin, r)
+end
+
+# Return the hostname portion of a servername (all up to first ',')
+def getHostnamePortFromServerName(serverName)
+ parts = serverName.split(',')
+ return parts[0] + ":" + parts[1]
+end
+
+# Return the hostname:port out of a servername (all up to first ',')
+def getHostnameFromServerName(serverName)
+ return serverName.split(',')[0]
+end
+
+# Return array of servernames where servername is hostname+port+startcode
+# comma-delimited
+def getServers(admin)
+ serverInfos = admin.getClusterStatus().getServerInfo()
+ servers = []
+ for server in serverInfos
+ servers << server.getServerName()
+ end
+ return servers
+end
+
+# Remove the servername whose hostname portion matches from the passed
+# array of servers. Returns as side-effect the servername removed.
+def stripServer(servers, hostname)
+ count = servers.length
+ servername = nil
+ for server in servers
+ if getHostnameFromServerName(server) == hostname
+ servername = servers.delete(server)
+ end
+ end
+ # Check server to exclude is actually present
+ raise RuntimeError, "Server %s not online" % hostname unless servers.length < count
+ return servername
+end
+
+# Return servername that matches passed hostname
+def getServerName(servers, hostname)
+ servername = nil
+ for server in servers
+ if getHostnameFromServerName(server) == hostname
+ servername = server
+ break
+ end
+ end
+ raise ArgumentError, "Server %s not online" % hostname unless servername
+ return servername
+end
+
+# Create a logger and disable the DEBUG-level annoying client logging
+def configureLogging(options)
+ apacheLogger = LogFactory.getLog(NAME)
+ # Configure log4j to not spew so much
+ unless (options[:debug])
+ logger = org.apache.log4j.Logger.getLogger("org.apache.hadoop.hbase.client")
+ logger.setLevel(org.apache.log4j.Level::INFO)
+ end
+ return apacheLogger
+end
+
+# Get configuration instance
+def getConfiguration()
+ config = HBaseConfiguration.create()
+ # No prefetching on .META.
+ config.setInt("hbase.client.prefetch.limit", 1)
+ # Make a config that retries at short intervals many times
+ config.setInt("hbase.client.pause", 500)
+ config.setInt("hbase.client.retries.number", 100)
+ return config
+end
+
+# Now get list of regions on targetServer
+def getRegions(config, servername)
+ connection = HConnectionManager::getConnection(config)
+ hsa = HServerAddress.new(getHostnamePortFromServerName(servername))
+ rs = connection.getHRegionConnection(hsa)
+ return rs.getOnlineRegions()
+end
+
+def deleteFile(filename)
+ f = java.io.File.new(filename)
+ f.delete() if f.exists()
+end
+
+# Write HRegionInfo to file
+# Need to serialize in case non-printable characters.
+# Format is count of regionnames followed by serialized regionnames.
+def writeFile(filename, regions)
+ fos = java.io.FileOutputStream.new(filename)
+ dos = java.io.DataOutputStream.new(fos)
+ # Write out a count of region names
+ dos.writeInt(regions.size())
+ # Write actual region names.
+ for r in regions
+ bytes = Writables.getBytes(r)
+ Bytes.writeByteArray(dos, bytes)
+ end
+ dos.close()
+end
+
+# See writeFile above.
+# Returns array of HRegionInfos
+def readFile(filename)
+ f = java.io.File.new(filename)
+ return java.util.ArrayList.new() unless f.exists()
+ fis = java.io.FileInputStream.new(f)
+ dis = java.io.DataInputStream.new(fis)
+ # Read count of regions
+ count = dis.readInt()
+ regions = java.util.ArrayList.new(count)
+ index = 0
+ while index < count
+ regions.add(Writables.getHRegionInfo(Bytes.readByteArray(dis)))
+ index = index + 1
+ end
+ dis.close()
+ return regions
+end
+
+# Move regions off the passed hostname
+def unloadRegions(options, hostname)
+ # Get configuration
+ config = getConfiguration()
+ # Clean up any old files.
+ filename = getFilename(options, hostname)
+ deleteFile(filename)
+ # Get an admin instance
+ admin = HBaseAdmin.new(config)
+ servers = getServers(admin)
+ # Remove the server we are unloading from from list of servers.
+ # Side-effect is the servername that matches this hostname
+ servername = stripServer(servers, hostname)
+ movedRegions = java.util.ArrayList.new()
+ while true
+ rs = getRegions(config, servername)
+ break if rs.length == 0
+ count = 0
+ $LOG.info("Moving " + rs.length.to_s + " region(s) from " + servername +
+ " during this cycle");
+ for r in rs
+ # Get a random server to move the region to.
+ server = servers[rand(servers.length)]
+ $LOG.info("Moving region " + r.getEncodedName() + " (" + count.to_s +
+ " of " + rs.length.to_s + ") to server=" + server);
+ count = count + 1
+ # Assert we can scan region in its current location
+ isSuccessfulScan(admin, r)
+ # Now move it.
+ move(admin, r, server, servername)
+ movedRegions.add(r)
+ end
+ end
+ if movedRegions.size() > 0
+ # Write out file of regions moved
+ writeFile(filename, movedRegions)
+ $LOG.info("Wrote list of moved regions to " + filename)
+ end
+end
+
+# Move regions to the passed hostname
+def loadRegions(options, hostname)
+ # Get configuration
+ config = getConfiguration()
+ # Get an admin instance
+ admin = HBaseAdmin.new(config)
+ filename = getFilename(options, hostname)
+ regions = readFile(filename)
+ return if regions.isEmpty()
+ servername = nil
+ # Wait till server is up
+ maxWaitInSeconds = admin.getConfiguration.getInt("hbase.serverstart.wait.max", 180)
+ maxWait = Time.now + maxWaitInSeconds
+ while Time.now < maxWait
+ servers = getServers(admin)
+ begin
+ servername = getServerName(servers, hostname)
+ rescue ArgumentError => e
+ $LOG.info("hostname=" + hostname.to_s + " is not up yet, waiting");
+ end
+ break if servername
+ sleep 0.5
+ end
+ $LOG.info("Moving " + regions.size().to_s + " regions to " + servername)
+ count = 0
+ for r in regions
+ exists = false
+ begin
+ exists = isSuccessfulScan(admin, r)
+ rescue org.apache.hadoop.hbase.NotServingRegionException => e
+ $LOG.info("Failed scan of " + e.message)
+ end
+ count = count + 1
+ next unless exists
+ currentServer = getServerNameForRegion(admin, r)
+ if currentServer and currentServer == servername
+ $LOG.info("Region " + r.getRegionNameAsString() + " (" + count.to_s +
+ " of " + regions.length.to_s + ") already on target server=" + servername)
+ next
+ end
+ $LOG.info("Moving region " + r.getEncodedName() + " (" + count.to_s +
+ " of " + regions.length.to_s + ") to server=" + servername);
+ move(admin, r, servername, currentServer)
+ end
+end
+
+def getFilename(options, targetServer)
+ filename = options[:file]
+ if not filename
+ filename = "/tmp/" + targetServer
+ end
+ return filename
+end
+
+
+# Do command-line parsing
+options = {}
+optparse = OptionParser.new do |opts|
+ opts.banner = "Usage: #{NAME}.rb [options] load|unload <hostname>"
+ opts.separator 'Load or unload regions by moving one at a time'
+ options[:file] = nil
+ opts.on('-f', '--filename=FILE', 'File to save regions list into unloading, or read from loading; default /tmp/<hostname>') do |file|
+ options[:file] = file
+ end
+ opts.on('-h', '--help', 'Display usage information') do
+ puts opts
+ exit
+ end
+ options[:debug] = false
+ opts.on('-d', '--debug', 'Display extra debug logging') do
+ options[:debug] = true
+ end
+end
+optparse.parse!
+
+# Check ARGVs
+if ARGV.length < 2
+ puts optparse
+ exit 1
+end
+hostname = ARGV[1]
+if not hostname
+ opts optparse
+ exit 2
+end
+# Create a logger and save it to ruby global
+$LOG = configureLogging(options)
+case ARGV[0]
+ when 'load'
+ loadRegions(options, hostname)
+ when 'unload'
+ unloadRegions(options, hostname)
+ else
+ puts optparse
+ exit 3
+end