Posted to common-commits@hadoop.apache.org by in...@apache.org on 2019/03/03 18:37:16 UTC

[hadoop] 37/45: HDFS-14230. RBF: Throw RetriableException instead of IOException when no namenodes available. Contributed by Fei Hui.

This is an automated email from the ASF dual-hosted git repository.

inigoiri pushed a commit to branch HDFS-13891
in repository https://gitbox.apache.org/repos/asf/hadoop.git

commit 63e160f0408ebe3870c596f37d46bce6303b709b
Author: Inigo Goiri <in...@apache.org>
AuthorDate: Tue Feb 12 10:44:02 2019 -0800

    HDFS-14230. RBF: Throw RetriableException instead of IOException when no namenodes available. Contributed by Fei Hui.
---
 .../federation/metrics/FederationRPCMBean.java     |  2 +
 .../federation/metrics/FederationRPCMetrics.java   | 11 +++
 .../metrics/FederationRPCPerformanceMonitor.java   |  5 ++
 .../router/NoNamenodesAvailableException.java      | 33 +++++++++
 .../server/federation/router/RouterRpcClient.java  | 16 +++-
 .../server/federation/router/RouterRpcMonitor.java |  5 ++
 .../server/federation/FederationTestUtils.java     | 38 ++++++++++
 .../router/TestRouterClientRejectOverload.java     | 86 ++++++++++++++++++++--
 .../router/TestRouterRPCClientRetries.java         |  2 +-
 9 files changed, 188 insertions(+), 10 deletions(-)
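
In short: before this patch, exhausting the failover retries with every namenode in standby surfaced as a plain IOException, which the client-side FailoverOnNetworkExceptionRetry policy treats as fatal; wrapping the failure in a RetriableException makes the client keep retrying until a namenode becomes active. A minimal sketch of the client-side effect (a sketch only, assuming a federated cluster reachable at hdfs://fed as in the new test below; class name and retry count are illustrative):

    import java.net.URI;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.hdfs.DFSClient;

    public class RetriableProbe {
      public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        // Allow a few retries while the namenodes finish failing over
        conf.setInt("dfs.client.retry.max.attempts", 4);
        DFSClient client = new DFSClient(new URI("hdfs://fed"), conf);
        // With this patch the router answers "no namenodes available" with a
        // RetriableException (wrapped in a RemoteException), which the client
        // retry policy retries instead of failing on the first attempt.
        System.out.println(client.getFileInfo("/"));
      }
    }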

diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationRPCMBean.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationRPCMBean.java
index 973c398..76b3ca6 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationRPCMBean.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationRPCMBean.java
@@ -46,6 +46,8 @@ public interface FederationRPCMBean {
 
   long getProxyOpRetries();
 
+  long getProxyOpNoNamenodes();
+
   long getRouterFailureStateStoreOps();
 
   long getRouterFailureReadOnlyOps();
diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationRPCMetrics.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationRPCMetrics.java
index cce4b86..8e57c6b 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationRPCMetrics.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationRPCMetrics.java
@@ -60,6 +60,8 @@ public class FederationRPCMetrics implements FederationRPCMBean {
   private MutableCounterLong proxyOpNotImplemented;
   @Metric("Number of operation retries")
   private MutableCounterLong proxyOpRetries;
+  @Metric("Number of operations to hit no namenodes available")
+  private MutableCounterLong proxyOpNoNamenodes;
 
   @Metric("Failed requests due to State Store unavailable")
   private MutableCounterLong routerFailureStateStore;
@@ -138,6 +140,15 @@ public class FederationRPCMetrics implements FederationRPCMBean {
     return proxyOpRetries.value();
   }
 
+  public void incrProxyOpNoNamenodes() {
+    proxyOpNoNamenodes.incr();
+  }
+
+  @Override
+  public long getProxyOpNoNamenodes() {
+    return proxyOpNoNamenodes.value();
+  }
+
   public void incrRouterFailureStateStore() {
     routerFailureStateStore.incr();
   }
diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationRPCPerformanceMonitor.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationRPCPerformanceMonitor.java
index 15725d1..cbd63de 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationRPCPerformanceMonitor.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/metrics/FederationRPCPerformanceMonitor.java
@@ -171,6 +171,11 @@ public class FederationRPCPerformanceMonitor implements RouterRpcMonitor {
   }
 
   @Override
+  public void proxyOpNoNamenodes() {
+    metrics.incrProxyOpNoNamenodes();
+  }
+
+  @Override
   public void routerFailureStateStore() {
     metrics.incrRouterFailureStateStore();
   }
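
The new counter rides the existing chain (FederationRPCMBean -> FederationRPCMetrics -> RouterRpcMonitor) and can be read in-process the same way the test below reads it. A small sketch, assuming you hold a running Router instance named "router":

    FederationRPCMetrics metrics = router.getRpcServer().getRPCMetrics();
    long noNamenodeOps = metrics.getProxyOpNoNamenodes();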
diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/NoNamenodesAvailableException.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/NoNamenodesAvailableException.java
new file mode 100644
index 0000000..7eabf00
--- /dev/null
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/NoNamenodesAvailableException.java
@@ -0,0 +1,33 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.hdfs.server.federation.router;
+
+import java.io.IOException;
+
+
+/**
+ * Exception when no namenodes are available.
+ */
+public class NoNamenodesAvailableException extends IOException {
+
+  private static final long serialVersionUID = 1L;
+
+  public NoNamenodesAvailableException(String nsId, IOException ioe) {
+    super("No namenodes available under nameservice " + nsId, ioe);
+  }
+}
diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcClient.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcClient.java
index f5985ee..d21bde3 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcClient.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcClient.java
@@ -61,6 +61,7 @@ import org.apache.hadoop.io.retry.RetryPolicies;
 import org.apache.hadoop.io.retry.RetryPolicy;
 import org.apache.hadoop.io.retry.RetryPolicy.RetryAction.RetryDecision;
 import org.apache.hadoop.ipc.RemoteException;
+import org.apache.hadoop.ipc.RetriableException;
 import org.apache.hadoop.ipc.StandbyException;
 import org.apache.hadoop.security.UserGroupInformation;
 import org.slf4j.Logger;
@@ -302,8 +303,8 @@ public class RouterRpcClient {
    * @param retryCount Number of retries.
    * @param nsId Nameservice ID.
    * @return Retry decision.
-   * @throws IOException Original exception if the retry policy generates one
-   *                     or IOException for no available namenodes.
+   * @throws NoNamenodesAvailableException Exception that the retry policy
+   *         generates for no available namenodes.
    */
   private RetryDecision shouldRetry(final IOException ioe, final int retryCount,
       final String nsId) throws IOException {
@@ -313,8 +314,7 @@ public class RouterRpcClient {
       if (retryCount == 0) {
         return RetryDecision.RETRY;
       } else {
-        throw new IOException("No namenode available under nameservice " + nsId,
-            ioe);
+        throw new NoNamenodesAvailableException(nsId, ioe);
       }
     }
 
@@ -405,6 +405,14 @@ public class RouterRpcClient {
           StandbyException se = new StandbyException(ioe.getMessage());
           se.initCause(ioe);
           throw se;
+        } else if (ioe instanceof NoNamenodesAvailableException) {
+          if (this.rpcMonitor != null) {
+            this.rpcMonitor.proxyOpNoNamenodes();
+          }
+          LOG.error("Can not get available namenode for {} {} error: {}",
+              nsId, rpcAddress, ioe.getMessage());
+          // Throw RetriableException so that client can retry
+          throw new RetriableException(ioe);
         } else {
           // Other communication error, this is a failure
           // Communication retries are handled by the retry policy
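
Note that over RPC the RetriableException reaches the client wrapped in a RemoteException, so code that wants to distinguish this failure has to unwrap it first. A sketch, assuming a DFSClient named "client" and the usual org.apache.hadoop.ipc imports:

    try {
      client.getFileInfo("/");
    } catch (RemoteException re) {
      // unwrapRemoteException returns the wrapped exception when it matches
      // one of the given classes, otherwise the RemoteException itself.
      IOException cause = re.unwrapRemoteException(RetriableException.class);
      if (cause instanceof RetriableException) {
        // No namenodes were available yet; safe to back off and retry.
      }
    }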
diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcMonitor.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcMonitor.java
index 7af71af..5a2adb9 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcMonitor.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/main/java/org/apache/hadoop/hdfs/server/federation/router/RouterRpcMonitor.java
@@ -93,6 +93,11 @@ public interface RouterRpcMonitor {
   void proxyOpRetries();
 
   /**
+   * Failed to proxy an operation because no namenodes are available.
+   */
+  void proxyOpNoNamenodes();
+
+  /**
    * If the Router cannot contact the State Store in an operation.
    */
   void routerFailureStateStore();
diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/FederationTestUtils.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/FederationTestUtils.java
index d92edac..5434224 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/FederationTestUtils.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/FederationTestUtils.java
@@ -48,6 +48,7 @@ import org.apache.hadoop.fs.UnsupportedFileSystemException;
 import org.apache.hadoop.fs.permission.FsPermission;
 import org.apache.hadoop.ha.HAServiceProtocol.HAServiceState;
 import org.apache.hadoop.hdfs.DFSConfigKeys;
+import org.apache.hadoop.hdfs.server.federation.MiniRouterDFSCluster.NamenodeContext;
 import org.apache.hadoop.hdfs.server.federation.resolver.ActiveNamenodeResolver;
 import org.apache.hadoop.hdfs.server.federation.resolver.FederationNamenodeContext;
 import org.apache.hadoop.hdfs.server.federation.resolver.FederationNamenodeServiceState;
@@ -374,4 +375,41 @@ public final class FederationTestUtils {
     Whitebox.setInternalState(rpcClient, "connectionManager",
         spyConnectionManager);
   }
+
+  /**
+   * Switch the namenodes of all HDFS nameservices to standby.
+   * @param cluster a federated HDFS cluster
+   */
+  public static void transitionClusterNSToStandby(
+      StateStoreDFSCluster cluster) {
+    // Name services of the cluster
+    List<String> nameServiceList = cluster.getNameservices();
+
+    // Change namenodes of each name service to standby
+    for (String nameService : nameServiceList) {
+      List<NamenodeContext> nnList = cluster.getNamenodes(nameService);
+      for (NamenodeContext namenodeContext : nnList) {
+        cluster.switchToStandby(nameService, namenodeContext.getNamenodeId());
+      }
+    }
+  }
+
+  /**
+   * Switch the namenode at the given index of every HDFS nameservice to
+   * active.
+   * @param cluster a federated HDFS cluster
+   * @param index the index of the namenode to transition to active
+   */
+  public static void transitionClusterNSToActive(
+      StateStoreDFSCluster cluster, int index) {
+    // Name services of the cluster
+    List<String> nameServiceList = cluster.getNameservices();
+
+    // Transition the namenode at the given index of each nameservice to active
+    for (String nameService : nameServiceList) {
+      List<NamenodeContext> listNamenodeContext =
+          cluster.getNamenodes(nameService);
+      cluster.switchToActive(nameService,
+          listNamenodeContext.get(index).getNamenodeId());
+    }
+  }
 }
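
Together, the two helpers let a test simulate a complete failover window; the new test below uses them roughly like this:

    // Every namenode standby: routers see no namenodes available
    transitionClusterNSToStandby(cluster);
    // ... client calls fail with NoNamenodesAvailableException ...
    // Bring the first namenode of each nameservice back to active
    transitionClusterNSToActive(cluster, 0);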
diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterClientRejectOverload.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterClientRejectOverload.java
index 0664159..14bd7b0 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterClientRejectOverload.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterClientRejectOverload.java
@@ -19,6 +19,8 @@ package org.apache.hadoop.hdfs.server.federation.router;
 
 import static org.apache.hadoop.hdfs.server.federation.FederationTestUtils.simulateSlowNamenode;
 import static org.apache.hadoop.hdfs.server.federation.FederationTestUtils.simulateThrowExceptionRouterRpcServer;
+import static org.apache.hadoop.hdfs.server.federation.FederationTestUtils.transitionClusterNSToStandby;
+import static org.apache.hadoop.hdfs.server.federation.FederationTestUtils.transitionClusterNSToActive;
 import static org.apache.hadoop.test.GenericTestUtils.assertExceptionContains;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertTrue;
@@ -27,6 +29,7 @@ import static org.junit.Assert.fail;
 import java.io.IOException;
 import java.net.URI;
 import java.util.ArrayList;
+import java.util.Collection;
 import java.util.List;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.Executors;
@@ -46,7 +49,9 @@ import org.apache.hadoop.hdfs.server.namenode.NameNode;
 import org.apache.hadoop.ipc.RemoteException;
 import org.apache.hadoop.ipc.StandbyException;
 import org.junit.After;
+import org.junit.Rule;
 import org.junit.Test;
+import org.junit.rules.ExpectedException;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -71,14 +76,19 @@ public class TestRouterClientRejectOverload {
     }
   }
 
-  private void setupCluster(boolean overloadControl) throws Exception {
+  @Rule
+  public ExpectedException exceptionRule = ExpectedException.none();
+
+  private void setupCluster(boolean overloadControl, boolean ha)
+      throws Exception {
     // Build and start a federated cluster
-    cluster = new StateStoreDFSCluster(false, 2);
+    cluster = new StateStoreDFSCluster(ha, 2);
     Configuration routerConf = new RouterConfigBuilder()
         .stateStore()
         .metrics()
         .admin()
         .rpc()
+        .heartbeat()
         .build();
 
     // Reduce the number of RPC client threads to overload the Router easily
@@ -98,7 +108,7 @@ public class TestRouterClientRejectOverload {
 
   @Test
   public void testWithoutOverloadControl() throws Exception {
-    setupCluster(false);
+    setupCluster(false, false);
 
     // Nobody should get overloaded
     testOverloaded(0);
@@ -121,7 +131,7 @@ public class TestRouterClientRejectOverload {
 
   @Test
   public void testOverloadControl() throws Exception {
-    setupCluster(true);
+    setupCluster(true, false);
 
     List<RouterContext> routers = cluster.getRouters();
     FederationRPCMetrics rpcMetrics0 =
@@ -244,7 +254,7 @@ public class TestRouterClientRejectOverload {
 
   @Test
   public void testConnectionNullException() throws Exception {
-    setupCluster(false);
+    setupCluster(false, false);
 
     // Choose 1st router
     RouterContext routerContext = cluster.getRouters().get(0);
@@ -280,4 +290,70 @@ public class TestRouterClientRejectOverload {
     assertEquals(originalRouter1Failures,
         rpcMetrics1.getProxyOpFailureCommunicate());
   }
+
+  /**
+   * When a failover occurs, no namenodes are available for a short time.
+   * The client will succeed after some retries.
+   */
+  @Test
+  public void testNoNamenodesAvailable() throws Exception {
+    setupCluster(false, true);
+
+    transitionClusterNSToStandby(cluster);
+
+    Configuration conf = cluster.getRouterClientConf();
+    // Set dfs.client.failover.random.order to false so the 1st router is picked first
+    conf.setBoolean("dfs.client.failover.random.order", false);
+
+    // Retries is 3 (see FailoverOnNetworkExceptionRetry#shouldRetry; it fails
+    // when retries > max.attempts), so the total number of accesses is 4.
+    conf.setInt("dfs.client.retry.max.attempts", 2);
+    DFSClient routerClient = new DFSClient(new URI("hdfs://fed"), conf);
+
+    // Get router0 metrics
+    FederationRPCMetrics rpcMetrics0 = cluster.getRouters().get(0)
+        .getRouter().getRpcServer().getRPCMetrics();
+    // Get router1 metrics
+    FederationRPCMetrics rpcMetrics1 = cluster.getRouters().get(1)
+        .getRouter().getRpcServer().getRPCMetrics();
+
+    // Original failures
+    long originalRouter0Failures = rpcMetrics0.getProxyOpNoNamenodes();
+    long originalRouter1Failures = rpcMetrics1.getProxyOpNoNamenodes();
+
+    // getFileInfo will throw an exception while all namenodes are standby
+    String exceptionMessage = "org.apache.hadoop.hdfs.server.federation."
+        + "router.NoNamenodesAvailableException: No namenodes available "
+        + "under nameservice ns0";
+    exceptionRule.expect(RemoteException.class);
+    exceptionRule.expectMessage(exceptionMessage);
+    routerClient.getFileInfo("/");
+
+    // Router 0 failures will increase
+    assertEquals(originalRouter0Failures + 4,
+        rpcMetrics0.getProxyOpNoNamenodes());
+    // Router 1 failures do not change
+    assertEquals(originalRouter1Failures,
+        rpcMetrics1.getProxyOpNoNamenodes());
+
+    // Make name services available
+    transitionClusterNSToActive(cluster, 0);
+    for (RouterContext routerContext : cluster.getRouters()) {
+      // Manually trigger the heartbeat
+      Collection<NamenodeHeartbeatService> heartbeatServices = routerContext
+          .getRouter().getNamenodeHearbeatServices();
+      for (NamenodeHeartbeatService service : heartbeatServices) {
+        service.periodicInvoke();
+      }
+      // Update service cache
+      routerContext.getRouter().getStateStore().refreshCaches(true);
+    }
+
+    originalRouter0Failures = rpcMetrics0.getProxyOpNoNamenodes();
+
+    // RPC call must be successful
+    routerClient.getFileInfo("/");
+    // Router 0 failures do not change
+    assertEquals(originalRouter0Failures, rpcMetrics0.getProxyOpNoNamenodes());
+  }
 }
diff --git a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterRPCClientRetries.java b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterRPCClientRetries.java
index f84e9a0..8772e2f 100644
--- a/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterRPCClientRetries.java
+++ b/hadoop-hdfs-project/hadoop-hdfs-rbf/src/test/java/org/apache/hadoop/hdfs/server/federation/router/TestRouterRPCClientRetries.java
@@ -133,7 +133,7 @@ public class TestRouterRPCClientRetries {
     } catch (RemoteException e) {
       String ns0 = cluster.getNameservices().get(0);
       assertExceptionContains(
-          "No namenode available under nameservice " + ns0, e);
+          "No namenodes available under nameservice " + ns0, e);
     }
 
     // Verify the retry times, it should only retry one time.

