You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hbase.apache.org by te...@apache.org on 2017/06/21 15:22:38 UTC
hbase git commit: HBASE-18167 OfflineMetaRepair tool may cause
HMaster to abort always
Repository: hbase
Updated Branches:
refs/heads/branch-1 532e0dda1 -> 01027f805
HBASE-18167 OfflineMetaRepair tool may cause HMaster to abort always
Signed-off-by: tedyu <yu...@gmail.com>
Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/01027f80
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/01027f80
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/01027f80
Branch: refs/heads/branch-1
Commit: 01027f805bb2315aa4ba694b61df1ae4a7a82ea9
Parents: 532e0dd
Author: Pankaj Kumar <pa...@huawei.com>
Authored: Mon Jun 19 11:01:32 2017 +0800
Committer: tedyu <yu...@gmail.com>
Committed: Wed Jun 21 08:22:30 2017 -0700
----------------------------------------------------------------------
.../apache/hadoop/hbase/MetaTableAccessor.java | 29 ++++++++
.../hadoop/hbase/master/AssignmentManager.java | 73 +++++++++++++++++---
.../util/hbck/TestOfflineMetaRebuildBase.java | 66 +++++++++++++++++-
3 files changed, 157 insertions(+), 11 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hbase/blob/01027f80/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
----------------------------------------------------------------------
diff --git a/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java b/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
index 2bbae15..04ab430 100644
--- a/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
+++ b/hbase-client/src/main/java/org/apache/hadoop/hbase/MetaTableAccessor.java
@@ -25,6 +25,7 @@ import java.io.InterruptedIOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
+import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.NavigableMap;
@@ -575,6 +576,34 @@ public class MetaTableAccessor {
return hris;
}
+ /**
+ * Collect the distinct server names recorded in region locations by a full scan of hbase:meta.
+ * @param connection connection we're using
+ * @return Set of server names found; empty when no scanned row carries a server location.
+ * @throws IOException declared here; presumably raised by the meta scan (not visible in this hunk)
+ */
+ public static Set<ServerName> getServerNames(Connection connection) throws IOException {
+ final Set<ServerName> serverNames = new HashSet<ServerName>();
+ // The visitor below adds every non-null server name it sees in a row to serverNames
+ CollectingVisitor<Result> v = new CollectingVisitor<Result>() {
+ @Override
+ void add(Result r) {
+ if (r == null || r.isEmpty()) return;
+ RegionLocations locations = getRegionLocations(r);
+ if (locations == null) return;
+ for (HRegionLocation loc : locations.getRegionLocations()) {
+ if (loc != null) {
+ if (loc.getServerName() != null) {
+ serverNames.add(loc.getServerName());
+ }
+ }
+ }
+ }
+ };
+ fullScan(connection, v);
+ return serverNames;
+ }
+
public static void fullScanMetaAndPrint(Connection connection)
throws IOException {
Visitor v = new Visitor() {
http://git-wip-us.apache.org/repos/asf/hbase/blob/01027f80/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
index 0a28967..7927745 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/master/AssignmentManager.java
@@ -634,17 +634,10 @@ public class AssignmentManager extends ZooKeeperListener {
}
}
- Set<TableName> disabledOrDisablingOrEnabling = null;
Map<HRegionInfo, ServerName> allRegions = null;
-
if (!failover) {
- disabledOrDisablingOrEnabling = tableStateManager.getTablesInStates(
- ZooKeeperProtos.Table.State.DISABLED, ZooKeeperProtos.Table.State.DISABLING,
- ZooKeeperProtos.Table.State.ENABLING);
-
- // Clean re/start, mark all user regions closed before reassignment
- allRegions = regionStates.closeAllUserRegions(
- disabledOrDisablingOrEnabling);
+ // Retrieve user regions, except regions of tables in disabled/disabling/enabling states.
+ allRegions = getUserRegionsToAssign();
}
// Now region states are restored
@@ -656,6 +649,15 @@ public class AssignmentManager extends ZooKeeperListener {
// Process list of dead servers and regions in RIT.
// See HBASE-4580 for more information.
processDeadServersAndRecoverLostRegions(deadServers);
+
+ // Handle the scenario where meta has been rebuilt by the OfflineMetaRepair tool.
+ // In this scenario, meta will have only info:regioninfo entries (won't contain info:server),
+ // which leads SSH to skip handling region assignment.
+ if (MetaTableAccessor.getServerNames(server.getConnection()).isEmpty()) {
+ // Need to assign the user region as a fresh startup, otherwise user region assignment will
+ // never happen
+ assignRegionsOnSSHCompletion();
+ }
}
if (!failover && useZKForAssignment) {
@@ -685,6 +687,59 @@ public class AssignmentManager extends ZooKeeperListener {
return failover;
}
+ /*
+ * Used at cluster clean re/start: looks up the tables in DISABLED/DISABLING/ENABLING state and
+ * passes them as the exclusion set to regionStates.closeAllUserRegions(), which marks the other
+ * user regions closed. The map of regions to previous locations from that call is returned as is.
+ */
+ private Map<HRegionInfo, ServerName> getUserRegionsToAssign()
+ throws InterruptedIOException, CoordinatedStateException {
+ Set<TableName> disabledOrDisablingOrEnabling =
+ tableStateManager.getTablesInStates(ZooKeeperProtos.Table.State.DISABLED,
+ ZooKeeperProtos.Table.State.DISABLING, ZooKeeperProtos.Table.State.ENABLING);
+
+ // Clean re/start, mark all user regions closed (excluded tables aside) before reassignment
+ return regionStates.closeAllUserRegions(disabledOrDisablingOrEnabling);
+ }
+
+ /*
+ * Start a daemon thread that polls until dead-server processing (SSH) is done, then assigns the
+ * user regions not in disabled/disabling/enabling tables. NOTE(review): on interrupt the loop
+ * breaks and assignment is still attempted, while on timeout the thread returns without assigning.
+ */
+ private void assignRegionsOnSSHCompletion() {
+ LOG.info("Meta is rebuild by OfflineMetaRepair tool, assigning all user regions.");
+ Thread regionAssignerThread = new Thread("RegionAssignerOnMetaRebuild") {
+ public void run() {
+ long sshTimeout =
+ server.getConfiguration().getLong("hbase.master.initializationmonitor.timeout", 900000);
+ long startTime = EnvironmentEdgeManager.currentTime();
+ // Wait until all dead server processing is done; give up once sshTimeout ms have elapsed.
+ while (serverManager.areDeadServersInProgress()) {
+ if (EnvironmentEdgeManager.currentTime() - startTime > sshTimeout) {
+ LOG.warn(
+ "Couldn't assign the regions as SSH was not finished within the specified time in hbase.master.initializationmonitor.timeout parameter.");
+ return;
+ }
+ try {
+ Thread.sleep(100);
+ } catch (InterruptedException e) {
+ LOG.warn("RegionAssignerOnMetaRebuild got interrupted.", e);
+ break;
+ }
+ }
+ LOG.info("SSH has been completed for all dead servers, assigning the user regions.");
+ try {
+ // Assign the regions returned by getUserRegionsToAssign(); failures are only logged
+ assignAllUserRegions(getUserRegionsToAssign());
+ } catch (CoordinatedStateException | IOException | InterruptedException e) {
+ LOG.error("Exception occured while assigning user regions.", e);
+ }
+ };
+ };
+ regionAssignerThread.setDaemon(true);
+ regionAssignerThread.start();
+ }
+
/**
* If region is up in zk in transition, then do fixup and block and wait until
* the region is assigned and out of transition. Used on startup for
http://git-wip-us.apache.org/repos/asf/hbase/blob/01027f80/hbase-server/src/test/java/org/apache/hadoop/hbase/util/hbck/TestOfflineMetaRebuildBase.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/hbck/TestOfflineMetaRebuildBase.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/hbck/TestOfflineMetaRebuildBase.java
index b31e20e..4c5d306 100644
--- a/hbase-server/src/test/java/org/apache/hadoop/hbase/util/hbck/TestOfflineMetaRebuildBase.java
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/util/hbck/TestOfflineMetaRebuildBase.java
@@ -25,6 +25,7 @@ import static org.junit.Assert.assertTrue;
import java.io.IOException;
import java.util.Arrays;
+import java.util.List;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -33,13 +34,17 @@ import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.HConstants;
import org.apache.hadoop.hbase.HTableDescriptor;
+import org.apache.hadoop.hbase.NamespaceDescriptor;
+import org.apache.hadoop.hbase.TableName;
import org.apache.hadoop.hbase.testclassification.MediumTests;
import org.apache.hadoop.hbase.client.Admin;
import org.apache.hadoop.hbase.client.Connection;
import org.apache.hadoop.hbase.client.ConnectionFactory;
+import org.apache.hadoop.hbase.util.Bytes;
import org.apache.hadoop.hbase.util.FSUtils;
import org.apache.hadoop.hbase.util.HBaseFsck;
import org.apache.hadoop.hbase.util.HBaseFsck.ErrorReporter.ERROR_CODE;
+import org.apache.hadoop.hbase.util.JVMClusterUtil.RegionServerThread;
import org.junit.Test;
import org.junit.experimental.categories.Category;
@@ -77,6 +82,63 @@ public class TestOfflineMetaRebuildBase extends OfflineMetaRebuildTestCore {
// bring up the minicluster
TEST_UTIL.startMiniZKCluster();
TEST_UTIL.restartHBaseCluster(3);
+ validateMetaAndUserTableRows(1, 5);
+ }
+
+ @Test(timeout = 300000)
+ public void testHMasterStartupOnMetaRebuild() throws Exception {
+ // Shut down the HBase minicluster so it can be restarted with the settings below
+ TEST_UTIL.shutdownMiniHBaseCluster();
+
+ // Configure hbase:meta to be hosted on the master, then restart HBase
+ TEST_UTIL.getConfiguration().set("hbase.balancer.tablesOnMaster", "hbase:meta");
+ // Set namespace initialization timeout (150000, as a string config value)
+ TEST_UTIL.getConfiguration().set("hbase.master.namespace.init.timeout", "150000");
+ TEST_UTIL.restartHBaseCluster(3);
+ TEST_UTIL.getMiniHBaseCluster().waitForActiveAndReadyMaster();
+
+ // Create two namespaces
+ TEST_UTIL.getHBaseAdmin().createNamespace(NamespaceDescriptor.create("ns1").build());
+ TEST_UTIL.getHBaseAdmin().createNamespace(NamespaceDescriptor.create("ns2").build());
+ // Create one table in each namespace
+ TEST_UTIL.createTable(TableName.valueOf("ns1:testHMasterStartupOnMetaRebuild"),
+ Bytes.toBytes("cf1"));
+ TEST_UTIL.createTable(TableName.valueOf("ns2:testHMasterStartupOnMetaRebuild"),
+ Bytes.toBytes("cf1"));
+ // Flush the meta table
+ TEST_UTIL.flush(TableName.META_TABLE_NAME);
+
+ // HMaster graceful shutdown
+ TEST_UTIL.getHBaseCluster().getMaster().shutdown();
+
+ // Kill every region server
+ List<RegionServerThread> regionServerThreads =
+ TEST_UTIL.getHBaseCluster().getRegionServerThreads();
+ for (RegionServerThread regionServerThread : regionServerThreads) {
+ TEST_UTIL.getHBaseCluster()
+ .killRegionServer(regionServerThread.getRegionServer().getServerName());
+ }
+
+ // Rebuild the meta table from scratch; the rebuild must report success
+ HBaseFsck fsck = new HBaseFsck(conf);
+ assertTrue(fsck.rebuildMeta(false));
+
+ // Bring the HBase cluster back up and expect 3 tables and 7 meta rows
+ TEST_UTIL.restartHBaseCluster(3);
+ validateMetaAndUserTableRows(3, 7);
+
+ // Remove the tables and namespaces created by this test
+ TEST_UTIL.deleteTable("ns1:testHMasterStartupOnMetaRebuild");
+ TEST_UTIL.deleteTable("ns2:testHMasterStartupOnMetaRebuild");
+ TEST_UTIL.getHBaseAdmin().deleteNamespace("ns1");
+ TEST_UTIL.getHBaseAdmin().deleteNamespace("ns2");
+ }
+
+ /*
+ * Validate meta table region count and user table rows.
+ */
+ private void validateMetaAndUserTableRows(int totalTableCount, int totalRegionCount)
+ throws Exception {
try (Connection connection = ConnectionFactory.createConnection(TEST_UTIL.getConfiguration())) {
Admin admin = connection.getAdmin();
admin.enableTable(table);
@@ -85,10 +147,10 @@ public class TestOfflineMetaRebuildBase extends OfflineMetaRebuildTestCore {
LOG.info("No more RIT in ZK, now doing final test verification");
// everything is good again.
- assertEquals(5, scanMeta());
+ assertEquals(totalRegionCount, scanMeta());
HTableDescriptor[] htbls = admin.listTables();
LOG.info("Tables present after restart: " + Arrays.toString(htbls));
- assertEquals(1, htbls.length);
+ assertEquals(totalTableCount, htbls.length);
}
assertErrors(doFsck(conf, false), new ERROR_CODE[] {});