You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by ns...@apache.org on 2011/10/11 04:25:07 UTC

svn commit: r1181604 - in /hbase/branches/0.89/src: main/java/org/apache/hadoop/hbase/regionserver/ test/java/org/apache/hadoop/hbase/regionserver/

Author: nspiegelberg
Date: Tue Oct 11 02:25:07 2011
New Revision: 1181604

URL: http://svn.apache.org/viewvc?rev=1181604&view=rev
Log:
Fix RegionServer to correctly retry the failure of closing a region.

Summary:
The region server had a bug where when a region close failed, the
zookeeper was stuck in the state "REGION_CLOSING" for the specified region.
All subsequent retries of closing the region failed since the state in the
zookeeper suggested the region was already being closed by someone.

This needs to be fixed so that, if the re-try is by the same RS, the zookeeper
state can be ignored and a retry of the close region should be performed.

Test Plan:
1) Unit test.
2) Run all tests under hbase.regionserver.

Reviewed By: nspiegelberg
Reviewers: kranganathan, kannan, nspiegelberg
Commenters: liyintang
CC: , hbase@lists, liyintang, nspiegelberg
Revert Plan:
Tags:

- begin *PUBLIC* platform impact section -
Bugzilla: #
- end platform impact -

Differential Revision: 281691
Task ID: 619745

Added:
    hbase/branches/0.89/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegionClose.java
    hbase/branches/0.89/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegionCloseRetry.java
Modified:
    hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
    hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/RSZookeeperUpdater.java

Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java?rev=1181604&r1=1181603&r2=1181604&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/HRegionServer.java Tue Oct 11 02:25:07 2011
@@ -168,6 +168,10 @@ public class HRegionServer implements HR
   protected final Map<Integer, HRegion> onlineRegions =
     new ConcurrentHashMap<Integer, HRegion>();
 
+  // This a list of regions that we need to re-try closing.
+  protected final Map<Integer, HRegion> retryCloseRegions = Collections
+      .synchronizedMap(new HashMap<Integer, HRegion>());
+
   protected final ReentrantReadWriteLock lock = new ReentrantReadWriteLock();
   private final LinkedBlockingQueue<HMsg> outboundMsgs =
     new LinkedBlockingQueue<HMsg>();
@@ -1658,8 +1662,17 @@ public class HRegionServer implements HR
       zkUpdater.startRegionCloseEvent(null, false);
     }
     HRegion region = this.removeFromOnlineRegions(hri);
+    if (region == null) {
+      region = this.removeFromRetryCloseRegions(hri);
+    }
     if (region != null) {
-      region.close();
+      try {
+        region.close();
+      } catch (IOException e) {
+        // If region closing fails, add it to retry map.
+        this.addToRetryCloseRegions(region);
+        throw e;
+      }
       if(reportWhenCompleted) {
         if(zkUpdater != null) {
           HMsg hmsg = new HMsg(HMsg.Type.MSG_REPORT_CLOSE, hri, null);
@@ -2332,6 +2345,31 @@ public class HRegionServer implements HR
   }
 
   /**
+   * This method removes HRegion corresponding to hri from the Map of regions to
+   * retry closing.
+   *
+   * @param hri
+   *          the HRegionInfo corresponding to the HRegion to-be-removed.
+   * @return the removed HRegion, or null if the HRegion was not in
+   *         onlineRegions.
+   */
+  HRegion removeFromRetryCloseRegions(HRegionInfo hri) {
+    return this.retryCloseRegions.remove(Bytes.mapKey(hri.getRegionName()));
+  }
+
+  /**
+   * This method adds HRegion corresponding to hri from the Map of regions to
+   * retry closing
+   *
+   * @param hri
+   *          the HRegionInfo corresponding to the HRegion to-be-added.
+   */
+  void addToRetryCloseRegions(HRegion hr) {
+    this.retryCloseRegions.put(
+        Bytes.mapKey(hr.getRegionInfo().getRegionName()), hr);
+  }
+
+  /**
    * @return A new Map of online regions sorted by region size with the first
    * entry being the biggest.
    */

Modified: hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/RSZookeeperUpdater.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/RSZookeeperUpdater.java?rev=1181604&r1=1181603&r2=1181604&view=diff
==============================================================================
--- hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/RSZookeeperUpdater.java (original)
+++ hbase/branches/0.89/src/main/java/org/apache/hadoop/hbase/regionserver/RSZookeeperUpdater.java Tue Oct 11 02:25:07 2011
@@ -12,6 +12,7 @@ import org.apache.hadoop.hbase.HMsg;
 import org.apache.hadoop.hbase.executor.RegionTransitionEventData;
 import org.apache.hadoop.hbase.executor.HBaseEventHandler;
 import org.apache.hadoop.hbase.executor.HBaseEventHandler.HBaseEventType;
+import org.apache.hadoop.hbase.util.Bytes;
 import org.apache.hadoop.hbase.util.Writables;
 import org.apache.hadoop.hbase.zookeeper.ZooKeeperWrapper;
 import org.apache.zookeeper.CreateMode;
@@ -60,13 +61,32 @@ public class RSZookeeperUpdater {
   public void startRegionCloseEvent(HMsg hmsg, boolean updatePeriodically) throws IOException {
     // if this ZNode already exists, something is wrong
     if(zkWrapper.exists(regionZNode, true)) {
-      String msg = "ZNode " + regionZNode + " already exists in ZooKeeper, will NOT close region.";
-      LOG.error(msg);
-      throw new IOException(msg);
-    }
+      zkWrapper.sync(regionZNode);
 
-    // create the region node in the unassigned directory first
-    zkWrapper.createZNodeIfNotExists(regionZNode, null, CreateMode.PERSISTENT, true);
+      // Get RS which created the node.
+      RegionTransitionEventData rsData = null;
+      Stat stat = new Stat();
+      byte[] data = zkWrapper.readZNode(regionZNode, stat);
+      rsData = new RegionTransitionEventData();
+      Writables.getWritable(data, rsData);
+
+      // If the RS that created this node is not the current RS, then throw an
+      // Exception since some other RS is trying to close the region, otherwise
+      // continue since the current RS is re-trying to close the region.
+      if (!rsData.getRsName().equals(regionServerName)) {
+        String msg = "ZNode " + regionZNode
+            + " already exists in ZooKeeper, was created by " +
+            rsData.getRsName() + " hence regionserver " +
+            regionServerName + " cannot close it";
+        LOG.error(msg);
+        throw new IOException(msg);
+      }
+      this.zkVersion = stat.getVersion();
+    } else {
+      // create the region node in the unassigned directory first
+      zkWrapper.createZNodeIfNotExists(regionZNode, null, CreateMode.PERSISTENT,
+          true);
+    }
 
     // update the data for "regionName" ZNode in unassigned to CLOSING
     updateZKWithEventData(HBaseEventType.RS2ZK_REGION_CLOSING, hmsg);

Added: hbase/branches/0.89/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegionClose.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegionClose.java?rev=1181604&view=auto
==============================================================================
--- hbase/branches/0.89/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegionClose.java (added)
+++ hbase/branches/0.89/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegionClose.java Tue Oct 11 02:25:07 2011
@@ -0,0 +1,89 @@
+package org.apache.hadoop.hbase.regionserver;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.Map;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.HRegionInfo;
+import org.apache.hadoop.hbase.HServerAddress;
+import org.apache.hadoop.hbase.HServerInfo;
+import org.apache.hadoop.hbase.client.HConnection;
+import org.apache.hadoop.hbase.client.HTable;
+import org.apache.hadoop.hbase.client.ServerConnectionManager;
+import org.apache.hadoop.hbase.executor.HBaseEventHandler.HBaseEventType;
+import org.apache.hadoop.hbase.executor.RegionTransitionEventData;
+import org.apache.hadoop.hbase.ipc.HRegionInterface;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.Writables;
+import org.apache.hadoop.hbase.zookeeper.ZooKeeperWrapper;
+import org.apache.zookeeper.data.Stat;
+
+import static org.junit.Assert.*;
+
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+
+public class TestHRegionClose {
+  final Log LOG = LogFactory.getLog(getClass());
+  private final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
+  private static byte[][] FAMILIES = { Bytes.toBytes("f1"),
+      Bytes.toBytes("f2"), Bytes.toBytes("f3"), Bytes.toBytes("f4") };
+
+  @BeforeClass
+  public static void setUpBeforeClass() throws Exception {
+    TEST_UTIL.startMiniCluster(3);
+  }
+
+  @AfterClass
+  public static void tearDownAfterClass() throws Exception {
+    TEST_UTIL.shutdownMiniCluster();
+  }
+
+  @Test
+  public void testCloseHRegion() throws Exception {
+    // Build some data.
+    byte[] tableName = Bytes.toBytes("testCloseHRegion");
+    TEST_UTIL.createTable(tableName, FAMILIES);
+    HTable table = new HTable(tableName);
+    for (int i = 0; i < FAMILIES.length; i++) {
+      byte[] columnFamily = FAMILIES[i];
+      TEST_UTIL.createMultiRegions(table, columnFamily);
+      TEST_UTIL.loadTable(table, columnFamily);
+    }
+
+    // Pick a regionserver.
+    Configuration conf = TEST_UTIL.getConfiguration();
+    HRegionServer server = TEST_UTIL.getHBaseCluster().getRegionServer(0);
+
+    HRegion[] region = server.getOnlineRegionsAsArray();
+    HRegionInfo regionInfo = region[0].getRegionInfo();
+
+    // Some initializtion relevant to zk.
+    ZooKeeperWrapper zkWrapper = ZooKeeperWrapper.getInstance(conf, server
+        .getHServerInfo().getServerName());
+    String regionZNode = zkWrapper.getZNode(
+        zkWrapper.getRegionInTransitionZNode(), regionInfo.getEncodedName());
+
+    server.closeRegion(regionInfo, true);
+
+    byte[] data = zkWrapper.readZNode(regionZNode, new Stat());
+    RegionTransitionEventData rsData = new RegionTransitionEventData();
+    Writables.getWritable(data, rsData);
+
+    // Verify region is closed.
+    assertNull(server.getOnlineRegion(regionInfo.getRegionName()));
+    assertEquals(HBaseEventType.RS2ZK_REGION_CLOSED, rsData.getHbEvent());
+  }
+}

Added: hbase/branches/0.89/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegionCloseRetry.java
URL: http://svn.apache.org/viewvc/hbase/branches/0.89/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegionCloseRetry.java?rev=1181604&view=auto
==============================================================================
--- hbase/branches/0.89/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegionCloseRetry.java (added)
+++ hbase/branches/0.89/src/test/java/org/apache/hadoop/hbase/regionserver/TestHRegionCloseRetry.java Tue Oct 11 02:25:07 2011
@@ -0,0 +1,116 @@
+/**
+ *
+ */
+package org.apache.hadoop.hbase.regionserver;
+
+import java.io.File;
+import java.io.IOException;
+import java.util.Iterator;
+import java.util.Map;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.HRegionInfo;
+import org.apache.hadoop.hbase.HServerAddress;
+import org.apache.hadoop.hbase.HServerInfo;
+import org.apache.hadoop.hbase.client.HConnection;
+import org.apache.hadoop.hbase.client.HTable;
+import org.apache.hadoop.hbase.client.ServerConnectionManager;
+import org.apache.hadoop.hbase.executor.HBaseEventHandler.HBaseEventType;
+import org.apache.hadoop.hbase.executor.RegionTransitionEventData;
+import org.apache.hadoop.hbase.ipc.HRegionInterface;
+import org.apache.hadoop.hbase.util.Bytes;
+import org.apache.hadoop.hbase.util.Writables;
+import org.apache.hadoop.hbase.zookeeper.ZooKeeperWrapper;
+import org.apache.zookeeper.data.Stat;
+
+import static org.junit.Assert.*;
+
+import org.junit.AfterClass;
+import org.junit.BeforeClass;
+import org.junit.Test;
+
+
+/**
+ * @author pritam
+ */
+public class TestHRegionCloseRetry {
+  final Log LOG = LogFactory.getLog(getClass());
+  private final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
+  private static byte[][] FAMILIES = { Bytes.toBytes("f1"),
+      Bytes.toBytes("f2"), Bytes.toBytes("f3"), Bytes.toBytes("f4") };
+
+  @BeforeClass
+  public static void setUpBeforeClass() throws Exception {
+    TEST_UTIL.startMiniCluster(3);
+  }
+
+  @AfterClass
+  public static void tearDownAfterClass() throws Exception {
+    TEST_UTIL.shutdownMiniCluster();
+  }
+
+  @Test
+  public void testCloseHRegionRetry() throws Exception {
+
+    // Build some data.
+    byte[] tableName = Bytes.toBytes("testCloseHRegionRetry");
+    TEST_UTIL.createTable(tableName, FAMILIES);
+    HTable table = new HTable(tableName);
+    for (int i = 0; i < FAMILIES.length; i++) {
+      byte[] columnFamily = FAMILIES[i];
+      TEST_UTIL.createMultiRegions(table, columnFamily);
+      TEST_UTIL.loadTable(table, columnFamily);
+    }
+
+    // Pick a regionserver.
+    Configuration conf = TEST_UTIL.getConfiguration();
+    HRegionServer server = TEST_UTIL.getHBaseCluster().getRegionServer(0);
+
+    // Some initializtion relevant to zk.
+    HRegion[] region = server.getOnlineRegionsAsArray();
+    assertTrue(region.length != 0);
+    HRegionInfo regionInfo = region[0].getRegionInfo();
+    ZooKeeperWrapper zkWrapper = ZooKeeperWrapper.getInstance(conf, server
+        .getHServerInfo().getServerName());
+    String regionZNode = zkWrapper.getZNode(
+        zkWrapper.getRegionInTransitionZNode(), regionInfo.getEncodedName());
+
+    // Ensure region is online before closing.
+    assertNotNull(server.getOnlineRegion(regionInfo.getRegionName()));
+
+    TEST_UTIL.getDFSCluster().shutdownDataNodes();
+    try {
+      server.closeRegion(regionInfo, true);
+    } catch (IOException e) {
+      LOG.warn(e);
+      TEST_UTIL.getDFSCluster().startDataNodes(conf, 3, true, null, null);
+      TEST_UTIL.getDFSCluster().waitClusterUp();
+      LOG.info("New DFS Cluster up");
+      // Close region should fail since filesystem wad down. The region should
+      // be in the state "CLOSING"
+      Stat stat = new Stat();
+      assertTrue(zkWrapper.exists(regionZNode, false));
+      byte[] data = zkWrapper.readZNode(regionZNode, stat);
+      RegionTransitionEventData rsData = new RegionTransitionEventData();
+      Writables.getWritable(data, rsData);
+      assertEquals(rsData.getHbEvent(), HBaseEventType.RS2ZK_REGION_CLOSING);
+
+      // Now try to close the region again.
+      LOG.info("Retrying close region");
+      server.closeRegion(regionInfo, true);
+      data = zkWrapper.readZNode(regionZNode, stat);
+      rsData = new RegionTransitionEventData();
+      Writables.getWritable(data, rsData);
+
+      // Verify region is closed.
+      assertNull(server.getOnlineRegion(regionInfo.getRegionName()));
+      assertEquals(HBaseEventType.RS2ZK_REGION_CLOSED, rsData.getHbEvent());
+      return;
+    }
+    fail("Close of region did not fail, even though filesystem was down");
+  }
+}