Posted to commits@hbase.apache.org by ap...@apache.org on 2017/06/07 00:59:36 UTC

[1/4] hbase git commit: HBASE-18132 Low replication should be checked in period in case of datanode rolling upgrade (Allan Yang)

Repository: hbase
Updated Branches:
  refs/heads/branch-1 9c1efc9f9 -> e0dbafd7c
  refs/heads/branch-1.2 04bbdc835 -> d46b7832d
  refs/heads/branch-1.3 5812e41bc -> 69deecb1e
  refs/heads/master 1950acc67 -> 858bccfcb


HBASE-18132 Low replication should be checked in period in case of datanode rolling upgrade (Allan Yang)


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/e0dbafd7
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/e0dbafd7
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/e0dbafd7

Branch: refs/heads/branch-1
Commit: e0dbafd7cc70efe3dad3ef69effc402d5fb16095
Parents: 9c1efc9
Author: Andrew Purtell <ap...@apache.org>
Authored: Tue Jun 6 17:15:33 2017 -0700
Committer: Andrew Purtell <ap...@apache.org>
Committed: Tue Jun 6 17:21:11 2017 -0700

----------------------------------------------------------------------
 .../hadoop/hbase/regionserver/LogRoller.java    | 27 ++++++
 .../hadoop/hbase/regionserver/wal/FSHLog.java   | 14 +++-
 .../wal/TestWALOpenAfterDNRollingStart.java     | 86 ++++++++++++++++++++
 3 files changed, 126 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
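
The patch adds a new region server setting, "hbase.regionserver.hlog.check.lowreplication.interval", which bounds how often the LogRoller re-checks the WAL pipeline for low replication (default 30 * 1000 ms, read in LogRoller's constructor below). A minimal sketch of overriding it, assuming a standard HBase Configuration; the property name and default come from this patch, everything else is illustrative and not part of the commit:

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;

public class CheckLowReplicationIntervalExample {
  public static void main(String[] args) {
    // Loads hbase-default.xml/hbase-site.xml from the classpath.
    Configuration conf = HBaseConfiguration.create();
    // Check the WAL pipeline every 10 seconds instead of the 30-second default.
    // In a real deployment this would go into hbase-site.xml on each region server.
    conf.setLong("hbase.regionserver.hlog.check.lowreplication.interval", 10 * 1000);
    // This mirrors how LogRoller reads the value in the patch below.
    long interval = conf.getLong(
        "hbase.regionserver.hlog.check.lowreplication.interval", 30 * 1000);
    System.out.println("check low replication interval (ms): " + interval);
  }
}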


http://git-wip-us.apache.org/repos/asf/hbase/blob/e0dbafd7/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/LogRoller.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/LogRoller.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/LogRoller.java
index ccc951a..0e5f284 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/LogRoller.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/LogRoller.java
@@ -30,6 +30,7 @@ import org.apache.hadoop.hbase.classification.InterfaceAudience;
 import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.RemoteExceptionHandler;
 import org.apache.hadoop.hbase.Server;
+import org.apache.hadoop.hbase.regionserver.wal.FSHLog;
 import org.apache.hadoop.hbase.regionserver.wal.FailedLogCloseException;
 import org.apache.hadoop.hbase.wal.WAL;
 import org.apache.hadoop.hbase.regionserver.wal.WALActionsListener;
@@ -65,6 +66,8 @@ public class LogRoller extends HasThread {
   // Period to roll log.
   private final long rollperiod;
   private final int threadWakeFrequency;
+  // The interval to check low replication on hlog's pipeline
+  private long checkLowReplicationInterval;
 
   public void addWAL(final WAL wal) {
     if (null == walNeedsRoll.putIfAbsent(wal, Boolean.FALSE)) {
@@ -101,6 +104,8 @@ public class LogRoller extends HasThread {
       getLong("hbase.regionserver.logroll.period", 3600000);
     this.threadWakeFrequency = this.server.getConfiguration().
       getInt(HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000);
+    this.checkLowReplicationInterval = this.server.getConfiguration().getLong(
+        "hbase.regionserver.hlog.check.lowreplication.interval", 30 * 1000);
   }
 
   @Override
@@ -112,10 +117,32 @@ public class LogRoller extends HasThread {
     super.interrupt();
   }
 
+  /**
+   * Check low replication periodically; see HBASE-18132.
+   */
+  void checkLowReplication(long now) {
+    try {
+      for (Entry<WAL, Boolean> entry : walNeedsRoll.entrySet()) {
+        WAL wal = entry.getKey();
+        boolean needRollAlready = entry.getValue();
+        if (wal instanceof FSHLog && !needRollAlready) {
+          FSHLog hlog = (FSHLog)wal;
+          if ((now - hlog.getLastTimeCheckLowReplication())
+              > this.checkLowReplicationInterval) {
+            hlog.checkLogRoll();
+          }
+        }
+      }
+    } catch (Throwable e) {
+      LOG.warn("Failed checking low replication", e);
+    }
+  }
+
   @Override
   public void run() {
     while (!server.isStopped()) {
       long now = System.currentTimeMillis();
+      checkLowReplication(now);
       boolean periodic = false;
       if (!rollLog.get()) {
         periodic = (now - this.lastrolltime) > this.rollperiod;

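The checkLowReplication(long) added above is just an elapsed-time gate: the roller thread calls it on every wake-up, but it only delegates to FSHLog.checkLogRoll() for WALs that are not already marked for rolling and whose last low-replication check is older than the configured interval. A standalone sketch of the same pattern, using hypothetical stand-in types rather than the HBase classes:

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

// Illustrative only; Wal and its two methods are stand-ins, not HBase API.
class LowReplicationGateSketch {
  interface Wal {
    long lastCheckedMillis();  // last time low replication was checked
    void checkAndMaybeRoll();  // re-check the pipeline and request a roll if needed
  }

  private final Map<Wal, Boolean> walNeedsRoll = new ConcurrentHashMap<>();
  private final long checkIntervalMillis;

  LowReplicationGateSketch(long checkIntervalMillis) {
    this.checkIntervalMillis = checkIntervalMillis;
  }

  // Cheap to call on every wake-up; real work happens at most once per interval per WAL.
  void checkLowReplication(long now) {
    for (Map.Entry<Wal, Boolean> entry : walNeedsRoll.entrySet()) {
      Wal wal = entry.getKey();
      boolean needsRollAlready = entry.getValue();
      if (!needsRollAlready && (now - wal.lastCheckedMillis()) > checkIntervalMillis) {
        wal.checkAndMaybeRoll();
      }
    }
  }
}
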
http://git-wip-us.apache.org/repos/asf/hbase/blob/e0dbafd7/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
index 8d97b64..d5cf6bb 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
@@ -349,6 +349,9 @@ public class FSHLog implements WAL {
 
   private final AtomicInteger closeErrorCount = new AtomicInteger();
 
+  // Last time low replication was checked on the hlog's pipeline
+  private volatile long lastTimeCheckLowReplication = EnvironmentEdgeManager.currentTime();
+
 
   /**
    * WAL Comparator; it compares the timestamp (log filenum), present in the log file name.
@@ -1310,7 +1313,7 @@ public class FSHLog implements WAL {
   /**
    * Schedule a log roll if needed.
    */
-  void checkLogRoll() {
+  public void checkLogRoll() {
     // Will return immediately if we are in the middle of a WAL log roll currently.
     if (!rollWriterLock.tryLock()) return;
     boolean lowReplication;
@@ -1333,6 +1336,7 @@ public class FSHLog implements WAL {
    */
   private boolean checkLowReplication() {
     boolean logRollNeeded = false;
+    this.lastTimeCheckLowReplication = EnvironmentEdgeManager.currentTime();
     // if the number of replicas in HDFS has fallen below the configured
     // value, then roll logs.
     try {
@@ -2059,4 +2063,12 @@ public class FSHLog implements WAL {
     }
     return new DatanodeInfo[0];
   }
+
+  /**
+   *
+   * @return the last time low replication was checked
+   */
+  public long getLastTimeCheckLowReplication() {
+    return this.lastTimeCheckLowReplication;
+  }
 }

http://git-wip-us.apache.org/repos/asf/hbase/blob/e0dbafd7/hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALOpenAfterDNRollingStart.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALOpenAfterDNRollingStart.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALOpenAfterDNRollingStart.java
new file mode 100644
index 0000000..ee1692e
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALOpenAfterDNRollingStart.java
@@ -0,0 +1,86 @@
+/**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.wal;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.regionserver.HRegionServer;
+import org.apache.hadoop.hbase.regionserver.wal.FSHLog;
+import org.apache.hadoop.hbase.testclassification.MediumTests;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+
+@Category(MediumTests.class)
+public class TestWALOpenAfterDNRollingStart {
+  final Log LOG = LogFactory.getLog(getClass());
+  private final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
+  private static long DataNodeRestartInterval;
+
+  @BeforeClass
+  public static void setUpBeforeClass() throws Exception {
+    // Sleep time before restarting the next dn; wait for the current dn to finish starting up
+    DataNodeRestartInterval = 15000;
+    // Interval for checking low replication. It must be smaller than DataNodeRestartInterval
+    // so that a low replication case is detected and the wal is rolled
+    long checkLowReplicationInterval = 10000;
+    // don't let the hdfs client choose a new replica when a dn is down
+    TEST_UTIL.getConfiguration().setBoolean("dfs.client.block.write.replace-datanode-on-failure.enable",
+        false);
+    TEST_UTIL.getConfiguration().setLong("hbase.regionserver.hlog.check.lowreplication.interval",
+        checkLowReplicationInterval);
+    TEST_UTIL.startMiniDFSCluster(3);
+    TEST_UTIL.startMiniCluster(1);
+
+  }
+
+  /**
+   * See HBASE-18132.
+   * This is a test case for failing to open a wal (for replication, for example) after all
+   * datanodes have restarted (a rolling upgrade, for example).
+   * Before this patch, low replication was detected only when syncing the wal.
+   * But if the wal had no entries written, it would never learn that all replicas of the wal
+   * were broken (because of the dn restarts), and the wal could never be opened.
+   * @throws Exception
+   */
+  @Test(timeout = 300000)
+  public void test() throws Exception {
+    HRegionServer server = TEST_UTIL.getHBaseCluster().getRegionServer(0);
+    FSHLog hlog = (FSHLog)server.getWAL(null);
+    Path currentFile = hlog.getCurrentFileName();
+    // restart every dn to simulate a dn rolling upgrade
+    for (int i = 0; i < TEST_UTIL.getDFSCluster().getDataNodes().size(); i++) {
+      // This is NOT a bug: when a dn is restarted in miniDFSCluster, the stopped dn is removed
+      // from the dn list and added to the tail of that list, so we always restart the first one
+      // to simulate a rolling upgrade of every dn.
+      TEST_UTIL.getDFSCluster().restartDataNode(0);
+      // sleep long enough for the log roller to detect the pipeline break and roll the log
+      Thread.sleep(DataNodeRestartInterval);
+    }
+
+    // if the log was not rolled, this wal could never be opened.
+    WAL.Reader reader = WALFactory
+        .createReader(TEST_UTIL.getTestFileSystem(), currentFile, TEST_UTIL.getConfiguration());
+  }
+
+
+}


[2/4] hbase git commit: HBASE-18132 Low replication should be checked in period in case of datanode rolling upgrade (Allan Yang)

Posted by ap...@apache.org.
HBASE-18132 Low replication should be checked in period in case of datanode rolling upgrade (Allan Yang)


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/858bccfc
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/858bccfc
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/858bccfc

Branch: refs/heads/master
Commit: 858bccfcb8eb07cd6d6b65cb208726e2f24bd0b5
Parents: 1950acc
Author: Andrew Purtell <ap...@apache.org>
Authored: Tue Jun 6 17:15:33 2017 -0700
Committer: Andrew Purtell <ap...@apache.org>
Committed: Tue Jun 6 17:21:21 2017 -0700

----------------------------------------------------------------------
 .../hadoop/hbase/regionserver/LogRoller.java    | 27 ++++++
 .../hadoop/hbase/regionserver/wal/FSHLog.java   | 21 +++--
 .../wal/TestWALOpenAfterDNRollingStart.java     | 94 ++++++++++++++++++++
 3 files changed, 136 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/858bccfc/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/LogRoller.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/LogRoller.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/LogRoller.java
index 9d1bc4b..80b6825 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/LogRoller.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/LogRoller.java
@@ -30,6 +30,7 @@ import org.apache.commons.logging.LogFactory;
 import org.apache.hadoop.hbase.classification.InterfaceAudience;
 import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.Server;
+import org.apache.hadoop.hbase.regionserver.wal.FSHLog;
 import org.apache.hadoop.hbase.regionserver.wal.FailedLogCloseException;
 import org.apache.hadoop.hbase.wal.WAL;
 import org.apache.hadoop.hbase.regionserver.wal.WALActionsListener;
@@ -61,6 +62,8 @@ public class LogRoller extends HasThread implements Closeable {
   // Period to roll log.
   private final long rollperiod;
   private final int threadWakeFrequency;
+  // The interval to check low replication on hlog's pipeline
+  private long checkLowReplicationInterval;
 
   private volatile boolean running = true;
 
@@ -99,6 +102,8 @@ public class LogRoller extends HasThread implements Closeable {
       getLong("hbase.regionserver.logroll.period", 3600000);
     this.threadWakeFrequency = this.server.getConfiguration().
       getInt(HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000);
+    this.checkLowReplicationInterval = this.server.getConfiguration().getLong(
+        "hbase.regionserver.hlog.check.lowreplication.interval", 30 * 1000);
   }
 
   @Override
@@ -110,10 +115,32 @@ public class LogRoller extends HasThread implements Closeable {
     super.interrupt();
   }
 
+  /**
+   * Check low replication periodically; see HBASE-18132.
+   */
+  void checkLowReplication(long now) {
+    try {
+      for (Entry<WAL, Boolean> entry : walNeedsRoll.entrySet()) {
+        WAL wal = entry.getKey();
+        boolean needRollAlready = entry.getValue();
+        if (wal instanceof FSHLog && !needRollAlready) {
+          FSHLog hlog = (FSHLog)wal;
+          if ((now - hlog.getLastTimeCheckLowReplication())
+              > this.checkLowReplicationInterval) {
+            hlog.checkLogRoll();
+          }
+        }
+      }
+    } catch (Throwable e) {
+      LOG.warn("Failed checking low replication", e);
+    }
+  }
+
   @Override
   public void run() {
     while (running) {
       long now = System.currentTimeMillis();
+      checkLowReplication(now);
       boolean periodic = false;
       if (!rollLog.get()) {
         periodic = (now - this.lastrolltime) > this.rollperiod;

http://git-wip-us.apache.org/repos/asf/hbase/blob/858bccfc/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
index caf07a2..77ac1d1 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
@@ -47,11 +47,7 @@ import org.apache.hadoop.hbase.HBaseConfiguration;
 import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.HRegionInfo;
 import org.apache.hadoop.hbase.classification.InterfaceAudience;
-import org.apache.hadoop.hbase.util.Bytes;
-import org.apache.hadoop.hbase.util.ClassSize;
-import org.apache.hadoop.hbase.util.FSUtils;
-import org.apache.hadoop.hbase.util.HasThread;
-import org.apache.hadoop.hbase.util.Threads;
+import org.apache.hadoop.hbase.util.*;
 import org.apache.hadoop.hbase.wal.FSHLogProvider;
 import org.apache.hadoop.hbase.wal.WALFactory;
 import org.apache.hadoop.hbase.wal.WALKey;
@@ -153,6 +149,10 @@ public class FSHLog extends AbstractFSWAL<Writer> {
 
   private final AtomicInteger closeErrorCount = new AtomicInteger();
 
+  // Last time low replication was checked on the hlog's pipeline
+  private volatile long lastTimeCheckLowReplication = EnvironmentEdgeManager.currentTime();
+
+
   /**
    * Exception handler to pass the disruptor ringbuffer. Same as native implementation only it logs
    * using our logger instead of java native logger.
@@ -629,7 +629,7 @@ public class FSHLog extends AbstractFSWAL<Writer> {
   /**
    * Schedule a log roll if needed.
    */
-  void checkLogRoll() {
+  public void checkLogRoll() {
     // Will return immediately if we are in the middle of a WAL log roll currently.
     if (!rollWriterLock.tryLock()) {
       return;
@@ -650,6 +650,7 @@ public class FSHLog extends AbstractFSWAL<Writer> {
    */
   private boolean checkLowReplication() {
     boolean logRollNeeded = false;
+    this.lastTimeCheckLowReplication = EnvironmentEdgeManager.currentTime();
     // if the number of replicas in HDFS has fallen below the configured
     // value, then roll logs.
     try {
@@ -1186,4 +1187,12 @@ public class FSHLog extends AbstractFSWAL<Writer> {
     }
     return new DatanodeInfo[0];
   }
+
+  /**
+   *
+   * @return the last time low replication was checked
+   */
+  public long getLastTimeCheckLowReplication() {
+    return this.lastTimeCheckLowReplication;
+  }
 }

http://git-wip-us.apache.org/repos/asf/hbase/blob/858bccfc/hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALOpenAfterDNRollingStart.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALOpenAfterDNRollingStart.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALOpenAfterDNRollingStart.java
new file mode 100644
index 0000000..cf2c5d7
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALOpenAfterDNRollingStart.java
@@ -0,0 +1,94 @@
+/**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.wal;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.HConstants;
+import org.apache.hadoop.hbase.regionserver.HRegionServer;
+import org.apache.hadoop.hbase.regionserver.wal.FSHLog;
+import org.apache.hadoop.hbase.testclassification.MediumTests;
+import org.apache.hadoop.hbase.util.FSUtils;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+
+@Category(MediumTests.class)
+public class TestWALOpenAfterDNRollingStart {
+  final Log LOG = LogFactory.getLog(getClass());
+  private final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
+  private static long DataNodeRestartInterval;
+
+  @BeforeClass
+  public static void setUpBeforeClass() throws Exception {
+    // Sleep time before restarting the next dn; wait for the current dn to finish starting up
+    DataNodeRestartInterval = 15000;
+    // Interval for checking low replication. It must be smaller than DataNodeRestartInterval
+    // so that a low replication case is detected and the wal is rolled
+    long checkLowReplicationInterval = 10000;
+    // don't let the hdfs client choose a new replica when a dn is down
+    TEST_UTIL.getConfiguration().setBoolean("dfs.client.block.write.replace-datanode-on-failure.enable",
+        false);
+    TEST_UTIL.getConfiguration().setLong("hbase.regionserver.hlog.check.lowreplication.interval",
+        checkLowReplicationInterval);
+    TEST_UTIL.startMiniDFSCluster(3);
+    TEST_UTIL.startMiniCluster(1);
+
+  }
+
+  /**
+   * See HBASE-18132.
+   * This is a test case for failing to open a wal (for replication, for example) after all
+   * datanodes have restarted (a rolling upgrade, for example).
+   * Before this patch, low replication was detected only when syncing the wal.
+   * But if the wal had no entries written, it would never learn that all replicas of the wal
+   * were broken (because of the dn restarts), and the wal could never be opened.
+   * @throws Exception
+   */
+  @Test(timeout = 300000)
+  public void test() throws Exception {
+    HRegionServer server = TEST_UTIL.getHBaseCluster().getRegionServer(0);
+    FSHLog hlog = (FSHLog)server.getWAL(null);
+    Path currentFile = hlog.getCurrentFileName();
+    // restart every dn to simulate a dn rolling upgrade
+    for (int i = 0; i < TEST_UTIL.getDFSCluster().getDataNodes().size(); i++) {
+      // This is NOT a bug: when a dn is restarted in miniDFSCluster, the stopped dn is removed
+      // from the dn list and added to the tail of that list, so we always restart the first one
+      // to simulate a rolling upgrade of every dn.
+      TEST_UTIL.getDFSCluster().restartDataNode(0);
+      // sleep long enough for the log roller to detect the pipeline break and roll the log
+      Thread.sleep(DataNodeRestartInterval);
+    }
+
+    if (!server.getFileSystem().exists(currentFile)) {
+      Path walRootDir = FSUtils.getWALRootDir(TEST_UTIL.getConfiguration());
+      final Path oldLogDir = new Path(walRootDir, HConstants.HREGION_OLDLOGDIR_NAME);
+      currentFile = new Path(oldLogDir, currentFile.getName());
+    }
+    // if the log was not rolled, this wal could never be opened.
+    WAL.Reader reader = WALFactory
+        .createReader(TEST_UTIL.getTestFileSystem(), currentFile, TEST_UTIL.getConfiguration());
+
+  }
+
+
+}


[4/4] hbase git commit: HBASE-18132 Low replication should be checked in period in case of datanode rolling upgrade (Allan Yang)

Posted by ap...@apache.org.
HBASE-18132 Low replication should be checked in period in case of datanode rolling upgrade (Allan Yang)


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/d46b7832
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/d46b7832
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/d46b7832

Branch: refs/heads/branch-1.2
Commit: d46b7832d8b44cc02f543867683dae29544944aa
Parents: 04bbdc8
Author: Andrew Purtell <ap...@apache.org>
Authored: Tue Jun 6 17:15:33 2017 -0700
Committer: Andrew Purtell <ap...@apache.org>
Committed: Tue Jun 6 17:24:31 2017 -0700

----------------------------------------------------------------------
 .../hadoop/hbase/regionserver/LogRoller.java    | 27 ++++++
 .../hadoop/hbase/regionserver/wal/FSHLog.java   | 14 +++-
 .../wal/TestWALOpenAfterDNRollingStart.java     | 86 ++++++++++++++++++++
 3 files changed, 126 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/d46b7832/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/LogRoller.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/LogRoller.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/LogRoller.java
index ccc951a..0e5f284 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/LogRoller.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/LogRoller.java
@@ -30,6 +30,7 @@ import org.apache.hadoop.hbase.classification.InterfaceAudience;
 import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.RemoteExceptionHandler;
 import org.apache.hadoop.hbase.Server;
+import org.apache.hadoop.hbase.regionserver.wal.FSHLog;
 import org.apache.hadoop.hbase.regionserver.wal.FailedLogCloseException;
 import org.apache.hadoop.hbase.wal.WAL;
 import org.apache.hadoop.hbase.regionserver.wal.WALActionsListener;
@@ -65,6 +66,8 @@ public class LogRoller extends HasThread {
   // Period to roll log.
   private final long rollperiod;
   private final int threadWakeFrequency;
+  // The interval to check low replication on hlog's pipeline
+  private long checkLowReplicationInterval;
 
   public void addWAL(final WAL wal) {
     if (null == walNeedsRoll.putIfAbsent(wal, Boolean.FALSE)) {
@@ -101,6 +104,8 @@ public class LogRoller extends HasThread {
       getLong("hbase.regionserver.logroll.period", 3600000);
     this.threadWakeFrequency = this.server.getConfiguration().
       getInt(HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000);
+    this.checkLowReplicationInterval = this.server.getConfiguration().getLong(
+        "hbase.regionserver.hlog.check.lowreplication.interval", 30 * 1000);
   }
 
   @Override
@@ -112,10 +117,32 @@ public class LogRoller extends HasThread {
     super.interrupt();
   }
 
+  /**
+   * Check low replication periodically; see HBASE-18132.
+   */
+  void checkLowReplication(long now) {
+    try {
+      for (Entry<WAL, Boolean> entry : walNeedsRoll.entrySet()) {
+        WAL wal = entry.getKey();
+        boolean needRollAlready = entry.getValue();
+        if (wal instanceof FSHLog && !needRollAlready) {
+          FSHLog hlog = (FSHLog)wal;
+          if ((now - hlog.getLastTimeCheckLowReplication())
+              > this.checkLowReplicationInterval) {
+            hlog.checkLogRoll();
+          }
+        }
+      }
+    } catch (Throwable e) {
+      LOG.warn("Failed checking low replication", e);
+    }
+  }
+
   @Override
   public void run() {
     while (!server.isStopped()) {
       long now = System.currentTimeMillis();
+      checkLowReplication(now);
       boolean periodic = false;
       if (!rollLog.get()) {
         periodic = (now - this.lastrolltime) > this.rollperiod;

http://git-wip-us.apache.org/repos/asf/hbase/blob/d46b7832/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
index 7710f2b..4b81dda 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
@@ -351,6 +351,9 @@ public class FSHLog implements WAL {
 
   private final AtomicInteger closeErrorCount = new AtomicInteger();
 
+  // Last time low replication was checked on the hlog's pipeline
+  private volatile long lastTimeCheckLowReplication = EnvironmentEdgeManager.currentTime();
+
 
   /**
    * WAL Comparator; it compares the timestamp (log filenum), present in the log file name.
@@ -1299,7 +1302,7 @@ public class FSHLog implements WAL {
   /**
    * Schedule a log roll if needed.
    */
-  void checkLogRoll() {
+  public void checkLogRoll() {
     // Will return immediately if we are in the middle of a WAL log roll currently.
     if (!rollWriterLock.tryLock()) return;
     boolean lowReplication;
@@ -1322,6 +1325,7 @@ public class FSHLog implements WAL {
    */
   private boolean checkLowReplication() {
     boolean logRollNeeded = false;
+    this.lastTimeCheckLowReplication = EnvironmentEdgeManager.currentTime();
     // if the number of replicas in HDFS has fallen below the configured
     // value, then roll logs.
     try {
@@ -2056,4 +2060,12 @@ public class FSHLog implements WAL {
     }
     return new DatanodeInfo[0];
   }
+
+  /**
+   *
+   * @return the last time low replication was checked
+   */
+  public long getLastTimeCheckLowReplication() {
+    return this.lastTimeCheckLowReplication;
+  }
 }

http://git-wip-us.apache.org/repos/asf/hbase/blob/d46b7832/hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALOpenAfterDNRollingStart.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALOpenAfterDNRollingStart.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALOpenAfterDNRollingStart.java
new file mode 100644
index 0000000..ee1692e
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALOpenAfterDNRollingStart.java
@@ -0,0 +1,86 @@
+/**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.wal;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.regionserver.HRegionServer;
+import org.apache.hadoop.hbase.regionserver.wal.FSHLog;
+import org.apache.hadoop.hbase.testclassification.MediumTests;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+
+@Category(MediumTests.class)
+public class TestWALOpenAfterDNRollingStart {
+  final Log LOG = LogFactory.getLog(getClass());
+  private final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
+  private static long DataNodeRestartInterval;
+
+  @BeforeClass
+  public static void setUpBeforeClass() throws Exception {
+    // Sleep time before restarting the next dn; wait for the current dn to finish starting up
+    DataNodeRestartInterval = 15000;
+    // Interval for checking low replication. It must be smaller than DataNodeRestartInterval
+    // so that a low replication case is detected and the wal is rolled
+    long checkLowReplicationInterval = 10000;
+    // don't let the hdfs client choose a new replica when a dn is down
+    TEST_UTIL.getConfiguration().setBoolean("dfs.client.block.write.replace-datanode-on-failure.enable",
+        false);
+    TEST_UTIL.getConfiguration().setLong("hbase.regionserver.hlog.check.lowreplication.interval",
+        checkLowReplicationInterval);
+    TEST_UTIL.startMiniDFSCluster(3);
+    TEST_UTIL.startMiniCluster(1);
+
+  }
+
+  /**
+   * See HBASE-18132.
+   * This is a test case for failing to open a wal (for replication, for example) after all
+   * datanodes have restarted (a rolling upgrade, for example).
+   * Before this patch, low replication was detected only when syncing the wal.
+   * But if the wal had no entries written, it would never learn that all replicas of the wal
+   * were broken (because of the dn restarts), and the wal could never be opened.
+   * @throws Exception
+   */
+  @Test(timeout = 300000)
+  public void test() throws Exception {
+    HRegionServer server = TEST_UTIL.getHBaseCluster().getRegionServer(0);
+    FSHLog hlog = (FSHLog)server.getWAL(null);
+    Path currentFile = hlog.getCurrentFileName();
+    // restart every dn to simulate a dn rolling upgrade
+    for (int i = 0; i < TEST_UTIL.getDFSCluster().getDataNodes().size(); i++) {
+      // This is NOT a bug: when a dn is restarted in miniDFSCluster, the stopped dn is removed
+      // from the dn list and added to the tail of that list, so we always restart the first one
+      // to simulate a rolling upgrade of every dn.
+      TEST_UTIL.getDFSCluster().restartDataNode(0);
+      // sleep long enough for the log roller to detect the pipeline break and roll the log
+      Thread.sleep(DataNodeRestartInterval);
+    }
+
+    // if the log was not rolled, this wal could never be opened.
+    WAL.Reader reader = WALFactory
+        .createReader(TEST_UTIL.getTestFileSystem(), currentFile, TEST_UTIL.getConfiguration());
+  }
+
+
+}


[3/4] hbase git commit: HBASE-18132 Low replication should be checked in period in case of datanode rolling upgrade (Allan Yang)

Posted by ap...@apache.org.
HBASE-18132 Low replication should be checked in period in case of datanode rolling upgrade (Allan Yang)


Project: http://git-wip-us.apache.org/repos/asf/hbase/repo
Commit: http://git-wip-us.apache.org/repos/asf/hbase/commit/69deecb1
Tree: http://git-wip-us.apache.org/repos/asf/hbase/tree/69deecb1
Diff: http://git-wip-us.apache.org/repos/asf/hbase/diff/69deecb1

Branch: refs/heads/branch-1.3
Commit: 69deecb1e4cc81f32878c528e63af0b742bdb735
Parents: 5812e41
Author: Andrew Purtell <ap...@apache.org>
Authored: Tue Jun 6 17:15:33 2017 -0700
Committer: Andrew Purtell <ap...@apache.org>
Committed: Tue Jun 6 17:24:29 2017 -0700

----------------------------------------------------------------------
 .../hadoop/hbase/regionserver/LogRoller.java    | 27 ++++++
 .../hadoop/hbase/regionserver/wal/FSHLog.java   | 14 +++-
 .../wal/TestWALOpenAfterDNRollingStart.java     | 86 ++++++++++++++++++++
 3 files changed, 126 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hbase/blob/69deecb1/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/LogRoller.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/LogRoller.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/LogRoller.java
index ccc951a..0e5f284 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/LogRoller.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/LogRoller.java
@@ -30,6 +30,7 @@ import org.apache.hadoop.hbase.classification.InterfaceAudience;
 import org.apache.hadoop.hbase.HConstants;
 import org.apache.hadoop.hbase.RemoteExceptionHandler;
 import org.apache.hadoop.hbase.Server;
+import org.apache.hadoop.hbase.regionserver.wal.FSHLog;
 import org.apache.hadoop.hbase.regionserver.wal.FailedLogCloseException;
 import org.apache.hadoop.hbase.wal.WAL;
 import org.apache.hadoop.hbase.regionserver.wal.WALActionsListener;
@@ -65,6 +66,8 @@ public class LogRoller extends HasThread {
   // Period to roll log.
   private final long rollperiod;
   private final int threadWakeFrequency;
+  // The interval to check low replication on hlog's pipeline
+  private long checkLowReplicationInterval;
 
   public void addWAL(final WAL wal) {
     if (null == walNeedsRoll.putIfAbsent(wal, Boolean.FALSE)) {
@@ -101,6 +104,8 @@ public class LogRoller extends HasThread {
       getLong("hbase.regionserver.logroll.period", 3600000);
     this.threadWakeFrequency = this.server.getConfiguration().
       getInt(HConstants.THREAD_WAKE_FREQUENCY, 10 * 1000);
+    this.checkLowReplicationInterval = this.server.getConfiguration().getLong(
+        "hbase.regionserver.hlog.check.lowreplication.interval", 30 * 1000);
   }
 
   @Override
@@ -112,10 +117,32 @@ public class LogRoller extends HasThread {
     super.interrupt();
   }
 
+  /**
+   * Check low replication periodically; see HBASE-18132.
+   */
+  void checkLowReplication(long now) {
+    try {
+      for (Entry<WAL, Boolean> entry : walNeedsRoll.entrySet()) {
+        WAL wal = entry.getKey();
+        boolean needRollAlready = entry.getValue();
+        if (wal instanceof FSHLog && !needRollAlready) {
+          FSHLog hlog = (FSHLog)wal;
+          if ((now - hlog.getLastTimeCheckLowReplication())
+              > this.checkLowReplicationInterval) {
+            hlog.checkLogRoll();
+          }
+        }
+      }
+    } catch (Throwable e) {
+      LOG.warn("Failed checking low replication", e);
+    }
+  }
+
   @Override
   public void run() {
     while (!server.isStopped()) {
       long now = System.currentTimeMillis();
+      checkLowReplication(now);
       boolean periodic = false;
       if (!rollLog.get()) {
         periodic = (now - this.lastrolltime) > this.rollperiod;

http://git-wip-us.apache.org/repos/asf/hbase/blob/69deecb1/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
index 7e1fb69..f508bd7 100644
--- a/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
+++ b/hbase-server/src/main/java/org/apache/hadoop/hbase/regionserver/wal/FSHLog.java
@@ -351,6 +351,9 @@ public class FSHLog implements WAL {
 
   private final AtomicInteger closeErrorCount = new AtomicInteger();
 
+  // Last time low replication was checked on the hlog's pipeline
+  private volatile long lastTimeCheckLowReplication = EnvironmentEdgeManager.currentTime();
+
 
   /**
    * WAL Comparator; it compares the timestamp (log filenum), present in the log file name.
@@ -1303,7 +1306,7 @@ public class FSHLog implements WAL {
   /**
    * Schedule a log roll if needed.
    */
-  void checkLogRoll() {
+  public void checkLogRoll() {
     // Will return immediately if we are in the middle of a WAL log roll currently.
     if (!rollWriterLock.tryLock()) return;
     boolean lowReplication;
@@ -1326,6 +1329,7 @@ public class FSHLog implements WAL {
    */
   private boolean checkLowReplication() {
     boolean logRollNeeded = false;
+    this.lastTimeCheckLowReplication = EnvironmentEdgeManager.currentTime();
     // if the number of replicas in HDFS has fallen below the configured
     // value, then roll logs.
     try {
@@ -2057,4 +2061,12 @@ public class FSHLog implements WAL {
     }
     return new DatanodeInfo[0];
   }
+
+  /**
+   *
+   * @return the last time low replication was checked
+   */
+  public long getLastTimeCheckLowReplication() {
+    return this.lastTimeCheckLowReplication;
+  }
 }

http://git-wip-us.apache.org/repos/asf/hbase/blob/69deecb1/hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALOpenAfterDNRollingStart.java
----------------------------------------------------------------------
diff --git a/hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALOpenAfterDNRollingStart.java b/hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALOpenAfterDNRollingStart.java
new file mode 100644
index 0000000..ee1692e
--- /dev/null
+++ b/hbase-server/src/test/java/org/apache/hadoop/hbase/wal/TestWALOpenAfterDNRollingStart.java
@@ -0,0 +1,86 @@
+/**
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hbase.wal;
+
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.hbase.HBaseTestingUtility;
+import org.apache.hadoop.hbase.regionserver.HRegionServer;
+import org.apache.hadoop.hbase.regionserver.wal.FSHLog;
+import org.apache.hadoop.hbase.testclassification.MediumTests;
+import org.junit.BeforeClass;
+import org.junit.Test;
+import org.junit.experimental.categories.Category;
+
+
+@Category(MediumTests.class)
+public class TestWALOpenAfterDNRollingStart {
+  final Log LOG = LogFactory.getLog(getClass());
+  private final static HBaseTestingUtility TEST_UTIL = new HBaseTestingUtility();
+  private static long DataNodeRestartInterval;
+
+  @BeforeClass
+  public static void setUpBeforeClass() throws Exception {
+    // Sleep time before restarting the next dn; wait for the current dn to finish starting up
+    DataNodeRestartInterval = 15000;
+    // Interval for checking low replication. It must be smaller than DataNodeRestartInterval
+    // so that a low replication case is detected and the wal is rolled
+    long checkLowReplicationInterval = 10000;
+    // don't let the hdfs client choose a new replica when a dn is down
+    TEST_UTIL.getConfiguration().setBoolean("dfs.client.block.write.replace-datanode-on-failure.enable",
+        false);
+    TEST_UTIL.getConfiguration().setLong("hbase.regionserver.hlog.check.lowreplication.interval",
+        checkLowReplicationInterval);
+    TEST_UTIL.startMiniDFSCluster(3);
+    TEST_UTIL.startMiniCluster(1);
+
+  }
+
+  /**
+   * See HBASE-18132.
+   * This is a test case for failing to open a wal (for replication, for example) after all
+   * datanodes have restarted (a rolling upgrade, for example).
+   * Before this patch, low replication was detected only when syncing the wal.
+   * But if the wal had no entries written, it would never learn that all replicas of the wal
+   * were broken (because of the dn restarts), and the wal could never be opened.
+   * @throws Exception
+   */
+  @Test(timeout = 300000)
+  public void test() throws Exception {
+    HRegionServer server = TEST_UTIL.getHBaseCluster().getRegionServer(0);
+    FSHLog hlog = (FSHLog)server.getWAL(null);
+    Path currentFile = hlog.getCurrentFileName();
+    // restart every dn to simulate a dn rolling upgrade
+    for (int i = 0; i < TEST_UTIL.getDFSCluster().getDataNodes().size(); i++) {
+      // This is NOT a bug: when a dn is restarted in miniDFSCluster, the stopped dn is removed
+      // from the dn list and added to the tail of that list, so we always restart the first one
+      // to simulate a rolling upgrade of every dn.
+      TEST_UTIL.getDFSCluster().restartDataNode(0);
+      // sleep long enough for the log roller to detect the pipeline break and roll the log
+      Thread.sleep(DataNodeRestartInterval);
+    }
+
+    // if the log was not rolled, this wal could never be opened.
+    WAL.Reader reader = WALFactory
+        .createReader(TEST_UTIL.getTestFileSystem(), currentFile, TEST_UTIL.getConfiguration());
+  }
+
+
+}