Posted to common-commits@hadoop.apache.org by aa...@apache.org on 2020/05/19 00:53:07 UTC

[hadoop] branch branch-3.3.0 updated: HDFS-15293. Relax the condition for accepting a fsimage when receiving a checkpoint. Contributed by Chen Liang

This is an automated email from the ASF dual-hosted git repository.

aajisaka pushed a commit to branch branch-3.3.0
in repository https://gitbox.apache.org/repos/asf/hadoop.git


The following commit(s) were added to refs/heads/branch-3.3.0 by this push:
     new 940a422  HDFS-15293. Relax the condition for accepting a fsimage when receiving a checkpoint. Contributed by Chen Liang
940a422 is described below

commit 940a422525258514165628ec93011507ba1ed5d1
Author: Chen Liang <va...@apache.org>
AuthorDate: Mon May 18 10:58:52 2020 -0700

    HDFS-15293. Relax the condition for accepting a fsimage when receiving a checkpoint. Contributed by Chen Liang
    
    (cherry picked from commit 7bb902bc0d0c62d63a8960db444de3abb0a6ad22)
    (cherry picked from commit e452163a06daa6bbebc571127754962d8776a925)
---
 .../hadoop/hdfs/server/namenode/ImageServlet.java  | 39 ++++++++++++----
 .../hdfs/server/namenode/TestCheckpoint.java       | 53 +++++++++++++++++++++-
 2 files changed, 81 insertions(+), 11 deletions(-)
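
In short, the patch scales the configured checkpoint period by a relaxation factor before the ANN decides whether to reject an uploaded fsimage, and it logs the reason whenever an upload is rejected. A minimal sketch of the resulting check, with names taken from the patch below and the surrounding servlet plumbing omitted:

    // Sketch only: variable setup and the recent-image-check enable flag are omitted.
    long effectivePeriod =
        Math.round(checkpointPeriod * recentImageCheckTimePrecision);
    boolean txidDeltaTooSmall = (txid - lastCheckpointTxid) < checkpointTxnCount;
    boolean timeDeltaTooSmall = timeDelta < effectivePeriod;
    if (txidDeltaTooSmall && timeDeltaTooSmall) {
      // Reject the upload with HTTP 409 (SC_CONFLICT) and log the reason.
    }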

diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ImageServlet.java b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ImageServlet.java
index 91f24dd..a9c2a09 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ImageServlet.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/main/java/org/apache/hadoop/hdfs/server/namenode/ImageServlet.java
@@ -99,6 +99,19 @@ public class ImageServlet extends HttpServlet {
       "recent.image.check.enabled";
   public static final boolean RECENT_IMAGE_CHECK_ENABLED_DEFAULT = true;
 
+  /*
+   * Relaxation factor applied to the time delta check. It accounts for the
+   * fact that minor time differences (e.g. due to image upload delay, or
+   * small machine clock skew) could otherwise cause the ANN to reject an
+   * fsimage too aggressively.
+   */
+  private static double recentImageCheckTimePrecision = 0.75;
+
+  @VisibleForTesting
+  static void setRecentImageCheckTimePrecision(double ratio) {
+    recentImageCheckTimePrecision = ratio;
+  }
+
   @Override
   public void doGet(final HttpServletRequest request,
       final HttpServletResponse response) throws ServletException, IOException {
@@ -592,6 +605,9 @@ public class ImageServlet extends HttpServlet {
               long checkpointPeriod =
                   conf.getTimeDuration(DFS_NAMENODE_CHECKPOINT_PERIOD_KEY,
                       DFS_NAMENODE_CHECKPOINT_PERIOD_DEFAULT, TimeUnit.SECONDS);
+              checkpointPeriod = Math.round(
+                  checkpointPeriod * recentImageCheckTimePrecision);
+
               long checkpointTxnCount =
                   conf.getLong(DFS_NAMENODE_CHECKPOINT_TXNS_KEY,
                       DFS_NAMENODE_CHECKPOINT_TXNS_DEFAULT);
@@ -612,21 +628,24 @@ public class ImageServlet extends HttpServlet {
                 // a new fsImage
                 // 1. most recent image's txid is too far behind
                 // 2. last checkpoint time was too old
-                response.sendError(HttpServletResponse.SC_CONFLICT,
-                    "Most recent checkpoint is neither too far behind in "
-                        + "txid, nor too old. New txnid cnt is "
-                        + (txid - lastCheckpointTxid)
-                        + ", expecting at least " + checkpointTxnCount
-                        + " unless too long since last upload.");
+                String message = "Rejecting a fsimage due to small time delta "
+                    + "and txnid delta. Time since previous checkpoint is "
+                    + timeDelta + " expecting at least " + checkpointPeriod
+                    + " txnid delta since previous checkpoint is " +
+                    (txid - lastCheckpointTxid) + " expecting at least "
+                    + checkpointTxnCount;
+                LOG.info(message);
+                response.sendError(HttpServletResponse.SC_CONFLICT, message);
                 return null;
               }
 
               try {
                 if (nnImage.getStorage().findImageFile(nnf, txid) != null) {
-                  response.sendError(HttpServletResponse.SC_CONFLICT,
-                      "Either current namenode has checkpointed or "
-                          + "another checkpointer already uploaded an "
-                          + "checkpoint for txid " + txid);
+                  String message = "Either current namenode has checkpointed or "
+                      + "another checkpointer already uploaded an "
+                      + "checkpoint for txid " + txid;
+                  LOG.info(message);
+                  response.sendError(HttpServletResponse.SC_CONFLICT, message);
                   return null;
                 }
 
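With the default relaxation factor of 0.75 and, assuming the stock dfs.namenode.checkpoint.period of 3600 seconds, the effective time threshold works out as follows (illustrative arithmetic only, not part of the patch):

    // The 3600s period is an assumed default; 0.75 is the
    // recentImageCheckTimePrecision default introduced above.
    long period = 3600;                        // seconds
    long relaxed = Math.round(period * 0.75);  // 2700 seconds (45 minutes)

So an image uploaded 45 minutes or more after the previous checkpoint is now accepted even when the txid delta is still below dfs.namenode.checkpoint.txns, whereas previously the full period had to elapse.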
diff --git a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java
index 572ad8b..fabe8c5 100644
--- a/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java
+++ b/hadoop-hdfs-project/hadoop-hdfs/src/test/java/org/apache/hadoop/hdfs/server/namenode/TestCheckpoint.java
@@ -2464,7 +2464,7 @@ public class TestCheckpoint {
   }
 
   @Test(timeout = 300000)
-  public void testActiveRejectSmallerDeltaImage() throws Exception {
+  public void testActiveRejectSmallerTxidDeltaImage() throws Exception {
     MiniDFSCluster cluster = null;
     Configuration conf = new HdfsConfiguration();
     // Set the delta txid threshold to 10
@@ -2517,6 +2517,57 @@ public class TestCheckpoint {
     }
   }
 
+  /**
+   * Test that, even with both the txid and time delta thresholds configured,
+   * the time relaxation still allows the SBN to upload images to the ANN.
+   *
+   * @throws Exception
+   */
+  @Test
+  public void testActiveImageWithTimeDeltaRelaxation() throws Exception {
+    Configuration conf = new HdfsConfiguration();
+    // Set the delta txid threshold to some arbitrarily large value, so
+    // it does not trigger a checkpoint during this test.
+    conf.setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_TXNS_KEY, 1000000);
+    // Set the delta time threshold to some arbitrarily large value, so
+    // it does not trigger a checkpoint during this test.
+    conf.setInt(DFSConfigKeys.DFS_NAMENODE_CHECKPOINT_PERIOD_KEY, 900000);
+    // Set the relaxation to 0, meaning a time delta of 0 since the previous
+    // image is acceptable; this effectively disables small-delta rejection.
+    ImageServlet.setRecentImageCheckTimePrecision(0);
+
+    SecondaryNameNode secondary = null;
+
+    try (MiniDFSCluster cluster = new MiniDFSCluster.Builder(conf)
+        .numDataNodes(0).format(true).build()) {
+      // enable small delta rejection
+      NameNode active = cluster.getNameNode();
+      active.httpServer.getHttpServer()
+          .setAttribute(RECENT_IMAGE_CHECK_ENABLED, true);
+
+      secondary = startSecondaryNameNode(conf);
+
+      FileSystem fs = cluster.getFileSystem();
+      assertEquals(0, active.getNamesystem().getFSImage()
+          .getMostRecentCheckpointTxId());
+
+      // Create 5 directories.
+      for (int i = 0; i < 5; i++) {
+        fs.mkdirs(new Path("dir-" + i));
+      }
+
+      // Perform the first checkpoint.
+      secondary.doCheckpoint();
+      // At this point the change is small with respect to both the txid and
+      // the time delta, but because the relaxation is set to 0 the image is
+      // still accepted.
+      assertEquals(9, active.getNamesystem().getFSImage()
+          .getMostRecentCheckpointTxId());
+    } finally {
+      cleanup(secondary);
+    }
+  }
+
   private static void cleanup(SecondaryNameNode snn) {
     if (snn != null) {
       try {


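One possible follow-up, not part of this patch: recentImageCheckTimePrecision is a static field shared by every test in the same JVM, so a test that lowers it (as the new test does) could restore the default afterwards. A hypothetical JUnit 4 teardown for TestCheckpoint might look like this:

    // Hypothetical cleanup, not in the patch; 0.75 mirrors the default set in
    // ImageServlet. Requires import org.junit.After.
    @After
    public void restoreRecentImageCheckTimePrecision() {
      ImageServlet.setRecentImageCheckTimePrecision(0.75);
    }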
---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org