You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by ga...@apache.org on 2020/06/24 16:00:01 UTC

[hadoop] 02/02: HDFS-15323. StandbyNode fails transition to active due to insufficient transaction tailing. Contributed by Konstantin V Shvachko.

This is an automated email from the ASF dual-hosted git repository.

gabota pushed a commit to branch branch-3.1.4
in repository https://gitbox.apache.org/repos/asf/hadoop.git

commit 4ef7dab2153d1ad30023c43da8954fceb6a01ccb
Author: Konstantin V Shvachko <sh...@apache.org>
AuthorDate: Mon May 4 10:29:50 2020 -0700

    HDFS-15323. StandbyNode fails transition to active due to insufficient transaction tailing. Contributed by Konstantin V Shvachko.
    
    (cherry picked from commit ebb878bab991c242b5089a18881aa10abf318ea0)
---
 .../hdfs/qjournal/client/QuorumJournalManager.java |  4 ++--
 .../hdfs/server/namenode/ha/EditLogTailer.java     | 24 +++++++++++++------
 .../namenode/ha/TestStandbyInProgressTail.java     | 28 ++++++++++++++++++++++
 3 files changed, 47 insertions(+), 9 deletions(-)

diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/QuorumJournalManager.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/QuorumJournalManager.java
index 94b5832..f8ebd89 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/QuorumJournalManager.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/qjournal/client/QuorumJournalManager.java
@@ -73,9 +73,9 @@ public class QuorumJournalManager implements JournalManager {
   static final Log LOG = LogFactory.getLog(QuorumJournalManager.class);
 
   // This config is not publicly exposed
-  static final String QJM_RPC_MAX_TXNS_KEY =
+  public static final String QJM_RPC_MAX_TXNS_KEY =
       "dfs.ha.tail-edits.qjm.rpc.max-txns";
-  static final int QJM_RPC_MAX_TXNS_DEFAULT = 5000;
+  public static final int QJM_RPC_MAX_TXNS_DEFAULT = 5000;
 
   // Maximum number of transactions to fetch at a time when using the
   // RPC edit fetch mechanism
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/EditLogTailer.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/EditLogTailer.java
index ca231b4..276b76f 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/EditLogTailer.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/EditLogTailer.java
@@ -299,13 +299,23 @@ public class EditLogTailer {
     SecurityUtil.doAsLoginUser(new PrivilegedExceptionAction<Void>() {
       @Override
       public Void run() throws Exception {
-        try {
-          // It is already under the full name system lock and the checkpointer
-          // thread is already stopped. No need to acqure any other lock.
-          doTailEdits();
-        } catch (InterruptedException e) {
-          throw new IOException(e);
-        }
+        long editsTailed = 0;
+        // Fully tail the journal to the end
+        do {
+          long startTime = Time.monotonicNow();
+          try {
+            NameNode.getNameNodeMetrics().addEditLogTailInterval(
+                startTime - lastLoadTimeMs);
+            // It is already under the name system lock and the checkpointer
+            // thread is already stopped. No need to acquire any other lock.
+            editsTailed = doTailEdits();
+          } catch (InterruptedException e) {
+            throw new IOException(e);
+          } finally {
+            NameNode.getNameNodeMetrics().addEditLogTailTime(
+                Time.monotonicNow() - startTime);
+          }
+        } while(editsTailed > 0);
         return null;
       }
     });
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyInProgressTail.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyInProgressTail.java
index 0420579..7692966 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyInProgressTail.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/ha/TestStandbyInProgressTail.java
@@ -42,6 +42,7 @@ import org.apache.hadoop.hdfs.server.namenode.NNStorage;
 import org.apache.hadoop.hdfs.server.namenode.NameNode;
 import org.apache.hadoop.test.GenericTestUtils;
 import static org.apache.hadoop.hdfs.server.namenode.NameNodeAdapter.getFileInfo;
+import static org.apache.hadoop.hdfs.qjournal.client.QuorumJournalManager.QJM_RPC_MAX_TXNS_KEY;
 
 import org.junit.After;
 import org.junit.Before;
@@ -72,6 +73,8 @@ public class TestStandbyInProgressTail {
     conf.setBoolean(DFSConfigKeys.DFS_HA_TAILEDITS_INPROGRESS_KEY, true);
     conf.setInt(DFSConfigKeys.DFS_QJOURNAL_SELECT_INPUT_STREAMS_TIMEOUT_KEY,
         500);
+    // Set very samll limit of transactions per a journal rpc call
+    conf.setInt(QJM_RPC_MAX_TXNS_KEY, 3);
     HAUtil.setAllowStandbyReads(conf, true);
     qjmhaCluster = new MiniQJMHACluster.Builder(conf).build();
     cluster = qjmhaCluster.getDfsCluster();
@@ -300,6 +303,31 @@ public class TestStandbyInProgressTail {
     waitForFileInfo(nn1, "/test", "/test2", "/test3");
   }
 
+  /**
+   * Test that Standby Node tails multiple segments while catching up
+   * during the transition to Active.
+   */
+  @Test
+  public void testUndertailingWhileFailover() throws Exception {
+    cluster.transitionToActive(0);
+    cluster.waitActive(0);
+
+    String p = "/testFailoverWhileTailingWithoutCache/";
+    mkdirs(nn0, p + 0, p + 1, p + 2, p + 3, p + 4);
+    nn0.getRpcServer().rollEditLog(); // create segment 1
+
+    mkdirs(nn0, p + 5, p + 6, p + 7, p + 8, p + 9);
+    nn0.getRpcServer().rollEditLog(); // create segment 2
+
+    mkdirs(nn0, p + 10, p + 11, p + 12, p + 13, p + 14);
+    nn0.getRpcServer().rollEditLog(); // create segment 3
+
+    cluster.transitionToStandby(0);
+    cluster.transitionToActive(1);
+    cluster.waitActive(1);
+    waitForFileInfo(nn1, p + 0, p + 1, p + 14);
+  }
+
   @Test
   public void testNonUniformConfig() throws Exception {
     // Test case where some NNs (in this case the active NN) in the cluster


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org