You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by cl...@apache.org on 2019/12/12 20:13:06 UTC

[hadoop] 01/01: HDFS-15036. Active NameNode should not silently fail the image transfer. Contributed by Chen Liang.

This is an automated email from the ASF dual-hosted git repository.

cliang pushed a commit to branch branch-2
in repository https://gitbox.apache.org/repos/asf/hadoop.git

commit d9264b6ec86ae64240cf77b99fd351de1e8175f5
Author: Chen Liang <cl...@apache.org>
AuthorDate: Thu Dec 12 12:12:44 2019 -0800

    HDFS-15036. Active NameNode should not silently fail the image transfer. Contributed by Chen Liang.
---
 .../hadoop/hdfs/server/namenode/ImageServlet.java    |  6 ++++++
 .../hdfs/server/namenode/ha/StandbyCheckpointer.java | 12 +++++++++++-
 .../org/apache/hadoop/hdfs/TestRollingUpgrade.java   | 20 ++++++++++++++++++++
 3 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ImageServlet.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ImageServlet.java
index ad8b159..3dcc168 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ImageServlet.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ImageServlet.java
@@ -573,7 +573,13 @@ public class ImageServlet extends HttpServlet {
               long timeDelta = TimeUnit.MILLISECONDS.toSeconds(
                   now - lastCheckpointTime);
 
+              // Since the goal of the check below is to prevent overly
+              // frequent uploads from the Standby, it should only apply
+              // to the periodic image upload from the Standby. Other
+              // uploads, such as rollback images and ckpt files, skip
+              // this check; see HDFS-15036 for more info.
               if (checkRecentImageEnable &&
+                  NameNodeFile.IMAGE.equals(parsedParams.getNameNodeFile()) &&
                   timeDelta < checkpointPeriod &&
                   txid - lastCheckpointTxid < checkpointTxnCount) {
                 // only when at least one of two conditions are met we accept
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyCheckpointer.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyCheckpointer.java
index c05a0da..5cac972 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyCheckpointer.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ha/StandbyCheckpointer.java
@@ -289,10 +289,20 @@ public class StandbyCheckpointer {
         // TODO should there be some smarts here about retries nodes that
         //  are not the active NN?
         CheckpointReceiverEntry receiverEntry = checkpointReceivers.get(url);
-        if (upload.get() == TransferFsImage.TransferResult.SUCCESS) {
+        TransferFsImage.TransferResult uploadResult = upload.get();
+        if (uploadResult == TransferFsImage.TransferResult.SUCCESS) {
           receiverEntry.setLastUploadTime(monotonicNow());
           receiverEntry.setIsPrimary(true);
         } else {
+          // Getting here means the image upload was explicitly rejected
+          // by the other node. This could happen if:
+          // 1. the other node is also a standby, or
+          // 2. the other node is active, but already accepted another
+          // newer image, or
+          // 3. the other node is active but has a recent enough image.
+          // All of these are valid cases, so just log for information.
+          LOG.info("Image upload rejected by the other NameNode: " +
+              uploadResult);
           receiverEntry.setIsPrimary(false);
         }
       } catch (ExecutionException e) {
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestRollingUpgrade.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestRollingUpgrade.java
index 749ec96..579c5ef 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestRollingUpgrade.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/TestRollingUpgrade.java
@@ -57,6 +57,7 @@ import org.junit.Assert;
 import static org.junit.Assert.assertTrue;
 import org.junit.Test;
 
+import static org.apache.hadoop.hdfs.server.namenode.ImageServlet.RECENT_IMAGE_CHECK_ENABLED;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertNotEquals;
 import static org.junit.Assert.assertNull;
@@ -430,7 +431,22 @@ public class TestRollingUpgrade {
     testFinalize(3);
   }
 
+  @Test(timeout = 300000)
+  public void testFinalizeWithDeltaCheck() throws Exception {
+    testFinalize(2, true);
+  }
+
+  @Test(timeout = 300000)
+  public void testFinalizeWithMultipleNNDeltaCheck() throws Exception {
+    testFinalize(3, true);
+  }
+
   private void testFinalize(int nnCount) throws Exception {
+    testFinalize(nnCount, false);
+  }
+
+  private void testFinalize(int nnCount, boolean skipImageDeltaCheck)
+      throws Exception {
     final Configuration conf = new HdfsConfiguration();
     MiniQJMHACluster cluster = null;
     final Path foo = new Path("/foo");
@@ -449,6 +465,10 @@ public class TestRollingUpgrade {
       dfsCluster.restartNameNodes();
 
       dfsCluster.transitionToActive(0);
+
+      dfsCluster.getNameNode(0).getHttpServer()
+          .setAttribute(RECENT_IMAGE_CHECK_ENABLED, skipImageDeltaCheck);
+
       DistributedFileSystem dfs = dfsCluster.getFileSystem(0);
       dfs.mkdirs(foo);
 


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org