You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@knox.apache.org by sm...@apache.org on 2022/05/11 09:01:16 UTC

[knox] branch master updated: KNOX-2742 - Retrying CM service discovery in case of ConnectExceptions (#571)

This is an automated email from the ASF dual-hosted git repository.

smolnar pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/knox.git


The following commit(s) were added to refs/heads/master by this push:
     new 3b63b9851 KNOX-2742 - Retrying CM service discovery in case of ConnectExceptions (#571)
3b63b9851 is described below

commit 3b63b9851937b484546d974612e6ddd3ceaabbc9
Author: Sandor Molnar <sm...@cloudera.com>
AuthorDate: Wed May 11 11:01:11 2022 +0200

    KNOX-2742 - Retrying CM service discovery in case of ConnectExceptions (#571)
---
 .../cm/ClouderaManagerServiceDiscovery.java        | 69 ++++++++++++++++--
 .../ClouderaManagerServiceDiscoveryMessages.java   |  5 ++
 .../cm/ClouderaManagerServiceDiscoveryTest.java    | 83 +++++++++++++++++++---
 .../gateway/config/impl/GatewayConfigImpl.java     |  6 ++
 .../apache/knox/gateway/config/GatewayConfig.java  | 12 ++++
 .../org/apache/knox/gateway/GatewayTestConfig.java |  5 ++
 6 files changed, 163 insertions(+), 17 deletions(-)

diff --git a/gateway-discovery-cm/src/main/java/org/apache/knox/gateway/topology/discovery/cm/ClouderaManagerServiceDiscovery.java b/gateway-discovery-cm/src/main/java/org/apache/knox/gateway/topology/discovery/cm/ClouderaManagerServiceDiscovery.java
index d58136703..8b34cbb2e 100644
--- a/gateway-discovery-cm/src/main/java/org/apache/knox/gateway/topology/discovery/cm/ClouderaManagerServiceDiscovery.java
+++ b/gateway-discovery-cm/src/main/java/org/apache/knox/gateway/topology/discovery/cm/ClouderaManagerServiceDiscovery.java
@@ -38,6 +38,7 @@ import org.apache.knox.gateway.topology.discovery.ServiceDiscovery;
 import org.apache.knox.gateway.topology.discovery.ServiceDiscoveryConfig;
 import org.apache.knox.gateway.topology.discovery.cm.monitor.ClouderaManagerClusterConfigurationMonitor;
 
+import java.net.ConnectException;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
@@ -48,6 +49,7 @@ import java.util.Locale;
 import java.util.Map;
 import java.util.ServiceLoader;
 import java.util.Set;
+import java.util.concurrent.atomic.AtomicInteger;
 
 
 /**
@@ -90,6 +92,10 @@ public class ClouderaManagerServiceDiscovery implements ServiceDiscovery, Cluste
 
   private final ClouderaManagerServiceDiscoveryRepository repository = ClouderaManagerServiceDiscoveryRepository.getInstance();
 
+  private final AtomicInteger retryAttempts = new AtomicInteger(0);
+  private final int retrySleepSeconds = 3;  // It's been agreed that we not expose this config
+  private int maxRetryAttempts = -1;
+
   ClouderaManagerServiceDiscovery(GatewayConfig gatewayConfig) {
     this(false, gatewayConfig);
   }
@@ -106,6 +112,23 @@ public class ClouderaManagerServiceDiscovery implements ServiceDiscovery, Cluste
     if (gatewayConfig != null) {
       repository.setCacheEntryTTL(gatewayConfig.getClouderaManagerServiceDiscoveryRepositoryEntryTTL());
     }
+
+    configureRetryParams(gatewayConfig);
+  }
+
+  private void configureRetryParams(GatewayConfig gatewayConfig) {
+    final int configuredMaxRetryAttempts = gatewayConfig.getClouderaManagerServiceDiscoveryMaximumRetryAttempts();
+    if (configuredMaxRetryAttempts > 0) {
+      final int configuredRetryDurationSeconds = configuredMaxRetryAttempts * retrySleepSeconds;
+      final int pollingInterval = gatewayConfig.getClusterMonitorPollingInterval(ClouderaManagerClusterConfigurationMonitor.getType());
+      final int retryDurationLimit = pollingInterval / 2;
+      if (retryDurationLimit > configuredRetryDurationSeconds) {
+        this.maxRetryAttempts = configuredMaxRetryAttempts;
+      } else {
+        this.maxRetryAttempts = retryDurationLimit / retrySleepSeconds;
+        log.updateMaxRetryAttempts(configuredMaxRetryAttempts, maxRetryAttempts);
+      }
+    }
   }
 
   @Override
@@ -183,13 +206,41 @@ public class ClouderaManagerServiceDiscovery implements ServiceDiscovery, Cluste
         // Notify the cluster config monitor about these cluster configuration details
         configChangeMonitor.addServiceConfiguration(cluster, discoveryConfig);
       }
+      resetRetryAttempts();
     } catch (ApiException e) {
       log.clusterDiscoveryError(clusterName, e);
+      if (shouldRetryServiceDiscovery(e)) {
+        log.retryDiscovery(retrySleepSeconds, retryAttempts.get());
+        try {
+          Thread.sleep(retrySleepSeconds * 1000L);
+        } catch (InterruptedException ex) {
+          Thread.currentThread().interrupt();
+        }
+        cluster = discover(gatewayConfig, discoveryConfig, clusterName, includedServices, client);
+      } else {
+        resetRetryAttempts();
+      }
     }
 
     return cluster;
   }
 
+  private void resetRetryAttempts() {
+    retryAttempts.set(0);
+  }
+
+  private boolean shouldRetryServiceDiscovery(ApiException e) {
+    if (maxRetryAttempts > 0 && maxRetryAttempts > retryAttempts.getAndIncrement()) {
+      final Throwable cause = e.getCause();
+      if (cause != null) {
+        if (ConnectException.class.isAssignableFrom(cause.getClass())) {
+          return true;
+        }
+      }
+    }
+    return false;
+  }
+
   private ClouderaManagerCluster discoverCluster(DiscoveryApiClient client, String clusterName, Collection<String> includedServices)
       throws ApiException {
     ServicesResourceApi servicesResourceApi = new ServicesResourceApi(client);
@@ -281,7 +332,7 @@ public class ClouderaManagerServiceDiscovery implements ServiceDiscovery, Cluste
     return true;
   }
 
-  private List<ApiService> getClusterServices(ServiceDiscoveryConfig serviceDiscoveryConfig, ServicesResourceApi servicesResourceApi) {
+  private List<ApiService> getClusterServices(ServiceDiscoveryConfig serviceDiscoveryConfig, ServicesResourceApi servicesResourceApi) throws ApiException {
     log.lookupClusterServicesFromRepository();
     List<ApiService> services = repository.getServices(serviceDiscoveryConfig);
     if (services == null || services.isEmpty()) {
@@ -294,12 +345,13 @@ public class ClouderaManagerServiceDiscovery implements ServiceDiscovery, Cluste
         services.forEach(service -> repository.addService(serviceDiscoveryConfig, service));
       } catch (ApiException e) {
         log.failedToAccessServiceConfigs(serviceDiscoveryConfig.getCluster(), e);
+        throw e;
       }
     }
     return services;
   }
 
-  private ApiServiceConfig getServiceConfig(ServiceDiscoveryConfig serviceDiscoveryConfig, ServicesResourceApi servicesResourceApi, ApiService service) {
+  private ApiServiceConfig getServiceConfig(ServiceDiscoveryConfig serviceDiscoveryConfig, ServicesResourceApi servicesResourceApi, ApiService service) throws ApiException {
     log.lookupServiceConfigsFromRepository();
     // first, try in the service discovery repository
     ApiServiceConfig serviceConfig = repository.getServiceConfig(serviceDiscoveryConfig, service);
@@ -312,14 +364,15 @@ public class ClouderaManagerServiceDiscovery implements ServiceDiscovery, Cluste
 
         // make sure that service config is populated in the service discovery repository to avoid subsequent CM calls
         repository.addServiceConfig(serviceDiscoveryConfig, service, serviceConfig);
-      } catch (Exception e) {
+      } catch (ApiException e) {
         log.failedToAccessServiceConfigs(serviceDiscoveryConfig.getCluster(), e);
+        throw e;
       }
     }
     return serviceConfig;
   }
 
-  private ApiRoleList getRoles(ServiceDiscoveryConfig serviceDiscoveryConfig, RolesResourceApi rolesResourceApi, String clusterName, ApiService service) {
+  private ApiRoleList getRoles(ServiceDiscoveryConfig serviceDiscoveryConfig, RolesResourceApi rolesResourceApi, String clusterName, ApiService service) throws ApiException {
     log.lookupRolesFromRepository();
     //first, try in the service discovery repository
     ApiRoleList roles  = repository.getRoles(serviceDiscoveryConfig, service);
@@ -341,15 +394,16 @@ public class ClouderaManagerServiceDiscovery implements ServiceDiscovery, Cluste
 
         // make sure that role is populated in the service discovery repository to avoid subsequent CM calls
         repository.addRoles(serviceDiscoveryConfig, service, roles);
-      } catch (Exception e) {
+      } catch (ApiException e) {
         log.failedToAccessServiceRoleConfigs(serviceName, "N/A", clusterName, e);
+        throw e;
       }
     }
 
     return roles;
   }
 
-  private ApiConfigList getRoleConfig(ServiceDiscoveryConfig serviceDiscoveryConfig, RolesResourceApi rolesResourceApi, ApiService service, ApiRole role) {
+  private ApiConfigList getRoleConfig(ServiceDiscoveryConfig serviceDiscoveryConfig, RolesResourceApi rolesResourceApi, ApiService service, ApiRole role) throws ApiException {
     log.lookupRoleConfigsFromRepository();
     // first, try in the service discovery repository
     ApiConfigList configList = repository.getRoleConfigs(serviceDiscoveryConfig, service, role);
@@ -361,8 +415,9 @@ public class ClouderaManagerServiceDiscovery implements ServiceDiscovery, Cluste
 
         // make sure that role config is populated in the service discovery repository to avoid subsequent CM calls
         repository.addRoleConfigs(serviceDiscoveryConfig, service, role, configList);
-      } catch (Exception e) {
+      } catch (ApiException e) {
         log.failedToAccessServiceRoleConfigs(service.getName(), role.getName(), serviceDiscoveryConfig.getCluster(), e);
+        throw e;
       }
     }
     return configList;
diff --git a/gateway-discovery-cm/src/main/java/org/apache/knox/gateway/topology/discovery/cm/ClouderaManagerServiceDiscoveryMessages.java b/gateway-discovery-cm/src/main/java/org/apache/knox/gateway/topology/discovery/cm/ClouderaManagerServiceDiscoveryMessages.java
index fadb7fe89..9a1d75883 100644
--- a/gateway-discovery-cm/src/main/java/org/apache/knox/gateway/topology/discovery/cm/ClouderaManagerServiceDiscoveryMessages.java
+++ b/gateway-discovery-cm/src/main/java/org/apache/knox/gateway/topology/discovery/cm/ClouderaManagerServiceDiscoveryMessages.java
@@ -81,6 +81,9 @@ public interface ClouderaManagerServiceDiscoveryMessages {
            text = "Encountered an error during cluster ({0}) discovery: {1}")
   void clusterDiscoveryError(String clusterName, @StackTrace(level = MessageLevel.DEBUG) Exception e);
 
+  @Message(level = MessageLevel.INFO, text = "Sleeping {0} second(s) before retrying Cloudera Manager service discovery for the {1}. time")
+  void retryDiscovery(long retrySleep, int retryAttempt);
+
   @Message(level = MessageLevel.ERROR,
            text = "Failed to access the service configurations for cluster ({0}) discovery: {1}")
   void failedToAccessServiceConfigs(String clusterName, @StackTrace(level = MessageLevel.DEBUG) Exception e);
@@ -250,4 +253,6 @@ public interface ClouderaManagerServiceDiscoveryMessages {
   @Message(level = MessageLevel.DEBUG, text = "Clearing service discovery repository...")
   void clearServiceDiscoveryRepository();
 
+  @Message(level = MessageLevel.WARN, text = "The configured maximum retry attempts of {0} may overlap with the configured polling interval settings; using {1} retry attempts")
+  void updateMaxRetryAttempts(int configured, int actual);
 }
diff --git a/gateway-discovery-cm/src/test/java/org/apache/knox/gateway/topology/discovery/cm/ClouderaManagerServiceDiscoveryTest.java b/gateway-discovery-cm/src/test/java/org/apache/knox/gateway/topology/discovery/cm/ClouderaManagerServiceDiscoveryTest.java
index bd9f4b1eb..7eefa6349 100644
--- a/gateway-discovery-cm/src/test/java/org/apache/knox/gateway/topology/discovery/cm/ClouderaManagerServiceDiscoveryTest.java
+++ b/gateway-discovery-cm/src/test/java/org/apache/knox/gateway/topology/discovery/cm/ClouderaManagerServiceDiscoveryTest.java
@@ -16,6 +16,9 @@
  */
 package org.apache.knox.gateway.topology.discovery.cm;
 
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
 import com.cloudera.api.swagger.client.ApiException;
 import com.cloudera.api.swagger.client.ApiResponse;
 import com.cloudera.api.swagger.model.ApiClusterRef;
@@ -55,25 +58,31 @@ import org.apache.knox.gateway.topology.discovery.cm.model.solr.SolrServiceModel
 import org.apache.knox.gateway.topology.discovery.cm.model.spark.Spark3HistoryUIServiceModelGenerator;
 import org.apache.knox.gateway.topology.discovery.cm.model.spark.SparkHistoryUIServiceModelGenerator;
 import org.apache.knox.gateway.topology.discovery.cm.model.zeppelin.ZeppelinServiceModelGenerator;
+import org.apache.knox.gateway.topology.discovery.cm.monitor.ClouderaManagerClusterConfigurationMonitor;
 import org.easymock.EasyMock;
 import org.junit.Test;
 
-
 import java.lang.reflect.Type;
+import java.net.ConnectException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
 import java.util.Map;
-
-import static org.junit.Assert.assertEquals;
-import static org.junit.Assert.assertNotNull;
+import java.util.concurrent.atomic.AtomicInteger;
 
 
 public class ClouderaManagerServiceDiscoveryTest {
 
   private static final String DISCOVERY_URL = "http://localhost:1234";
+  private static final String ATLAS_SERVICE_NAME = "ATLAS-1";
+
+  @Test
+  public void testServiceDiscoveryRetry() throws Exception {
+    //re-using an already existing test with 'true' retry flag
+    doTestAtlasDiscovery(true, true);
+  }
 
   @Test
   public void testJobTrackerServiceDiscovery() {
@@ -130,10 +139,14 @@ public class ClouderaManagerServiceDiscoveryTest {
   }
 
   private void doTestAtlasDiscovery(final boolean isSSL) {
+    doTestAtlasDiscovery(isSSL, false);
+  }
+
+  private void doTestAtlasDiscovery(final boolean isSSL, boolean testRetry) {
     final String hostName       = "atlas-host-1";
     final String port           = "21000";
     final String sslPort        = "21003";
-    ServiceDiscovery.Cluster cluster = doTestAtlasDiscovery(hostName, port, sslPort, isSSL);
+    ServiceDiscovery.Cluster cluster = doTestAtlasDiscovery(hostName, port, sslPort, isSSL, testRetry);
     List<String> atlastURLs = cluster.getServiceURLs(AtlasServiceModelGenerator.SERVICE);
     assertEquals(1, atlastURLs.size());
     assertEquals((isSSL ? "https" : "http") + "://" + hostName + ":" + (isSSL ? sslPort : port),
@@ -934,10 +947,18 @@ public class ClouderaManagerServiceDiscoveryTest {
     return roleProperties;
   }
 
+  private ServiceDiscovery.Cluster doTestAtlasDiscovery(final String  atlasHost,
+      final String  port,
+      final String  sslPort,
+      final boolean isSSL) {
+    return doTestAtlasDiscovery(atlasHost, port, sslPort, isSSL, false);
+  }
+
   private ServiceDiscovery.Cluster doTestAtlasDiscovery(final String  atlasHost,
                                                         final String  port,
                                                         final String  sslPort,
-                                                        final boolean isSSL) {
+                                                        final boolean isSSL,
+                                                        final boolean testRetry) {
     // Configure the role
     Map<String, String> roleProperties = new HashMap<>();
     roleProperties.put("atlas_server_http_port", port);
@@ -945,12 +966,13 @@ public class ClouderaManagerServiceDiscoveryTest {
     roleProperties.put("ssl_enabled", String.valueOf(isSSL));
 
     return doTestDiscovery(atlasHost,
-                           "ATLAS-1",
+                           ATLAS_SERVICE_NAME,
                            AtlasServiceModelGenerator.SERVICE_TYPE,
                            "ATLAS-ATLAS_SERVER-1",
                            AtlasServiceModelGenerator.ROLE_TYPE,
                            Collections.emptyMap(),
-                           roleProperties);
+                           roleProperties,
+                           testRetry);
   }
 
 
@@ -1137,22 +1159,37 @@ public class ClouderaManagerServiceDiscoveryTest {
   }
 
 
+  private ServiceDiscovery.Cluster doTestDiscovery(final String hostName,
+      final String serviceName,
+      final String serviceType,
+      final String roleName,
+      final String roleType,
+      final Map<String, String> serviceProperties,
+      final Map<String, String> roleProperties) {
+    return doTestDiscovery(hostName, serviceName, serviceType, roleName, roleType, serviceProperties, roleProperties, false);
+  }
+
   private ServiceDiscovery.Cluster doTestDiscovery(final String hostName,
                                                    final String serviceName,
                                                    final String serviceType,
                                                    final String roleName,
                                                    final String roleType,
                                                    final Map<String, String> serviceProperties,
-                                                   final Map<String, String> roleProperties) {
+                                                   final Map<String, String> roleProperties,
+                                                   boolean testRetry) {
     final String clusterName = "cluster-1";
 
     GatewayConfig gwConf = EasyMock.createNiceMock(GatewayConfig.class);
+    if (testRetry) {
+      EasyMock.expect(gwConf.getClouderaManagerServiceDiscoveryMaximumRetryAttempts()).andReturn(GatewayConfig.DEFAULT_CM_SERVICE_DISCOVERY_MAX_RETRY_ATTEMPTS).anyTimes();
+      EasyMock.expect(gwConf.getClusterMonitorPollingInterval(ClouderaManagerClusterConfigurationMonitor.getType())).andReturn(10).anyTimes();
+    }
     EasyMock.replay(gwConf);
 
     ServiceDiscoveryConfig sdConfig = createMockDiscoveryConfig(clusterName);
 
     // Create the test client for providing test response content
-    TestDiscoveryApiClient mockClient = new TestDiscoveryApiClient(sdConfig, null, null);
+    TestDiscoveryApiClient mockClient = testRetry ? new TestFaultyDiscoveryApiClient(sdConfig, null, null) : new TestDiscoveryApiClient(sdConfig, null, null);
 
     // Prepare the service list response for the cluster
     ApiServiceList serviceList = EasyMock.createNiceMock(ApiServiceList.class);
@@ -1185,6 +1222,9 @@ public class ClouderaManagerServiceDiscoveryTest {
     ServiceDiscovery.Cluster cluster = cmsd.discover(gwConf, sdConfig, clusterName, Collections.emptySet(), mockClient);
     assertNotNull(cluster);
     assertEquals(clusterName, cluster.getName());
+    if (serviceName.equals(ATLAS_SERVICE_NAME)) {
+      assertEquals(testRetry ? 9 : 4, mockClient.getExecuteCount());
+    }
     return cluster;
   }
 
@@ -1276,6 +1316,8 @@ public class ClouderaManagerServiceDiscoveryTest {
 
     private Map<Type, ApiResponse<?>> responseMap = new HashMap<>();
 
+    protected AtomicInteger executeCount = new AtomicInteger(0);
+
     TestDiscoveryApiClient(ServiceDiscoveryConfig sdConfig, AliasService aliasService,
                            KeystoreService keystoreService) {
       super(sdConfig, aliasService, keystoreService);
@@ -1292,8 +1334,29 @@ public class ClouderaManagerServiceDiscoveryTest {
 
     @Override
     public <T> ApiResponse<T> execute(Call call, Type returnType) throws ApiException {
+      executeCount.incrementAndGet();
       return (ApiResponse<T>) responseMap.get(returnType);
     }
+
+    int getExecuteCount() {
+      return executeCount.get();
+    }
+  }
+
+  private static class TestFaultyDiscoveryApiClient extends TestDiscoveryApiClient {
+
+    TestFaultyDiscoveryApiClient(ServiceDiscoveryConfig sdConfig, AliasService aliasService,
+                           KeystoreService keystoreService) {
+      super(sdConfig, aliasService, keystoreService);
+    }
+
+    @Override
+    public <T> ApiResponse<T> execute(Call call, Type returnType) throws ApiException {
+      if (executeCount.getAndIncrement() < GatewayConfig.DEFAULT_CM_SERVICE_DISCOVERY_MAX_RETRY_ATTEMPTS - 2) {
+        throw new ApiException(new ConnectException("Failed to connect to CM HOST"));
+      }
+      return super.execute(call, returnType);
+    }
   }
 
   private static class TestResponseBase<T> extends ApiResponse<T> {
diff --git a/gateway-server/src/main/java/org/apache/knox/gateway/config/impl/GatewayConfigImpl.java b/gateway-server/src/main/java/org/apache/knox/gateway/config/impl/GatewayConfigImpl.java
index 0be54759d..cf0d42ba8 100644
--- a/gateway-server/src/main/java/org/apache/knox/gateway/config/impl/GatewayConfigImpl.java
+++ b/gateway-server/src/main/java/org/apache/knox/gateway/config/impl/GatewayConfigImpl.java
@@ -261,6 +261,7 @@ public class GatewayConfigImpl extends Configuration implements GatewayConfig {
   private static final String CLOUDERA_MANAGER_DESCRIPTORS_MONITOR_INTERVAL = GATEWAY_CONFIG_FILE_PREFIX + ".cloudera.manager.descriptors.monitor.interval";
   private static final String CLOUDERA_MANAGER_ADVANCED_SERVICE_DISCOVERY_CONF_MONITOR_INTERVAL = GATEWAY_CONFIG_FILE_PREFIX + ".cloudera.manager.advanced.service.discovery.config.monitor.interval";
   private static final String CLOUDERA_MANAGER_SERVICE_DISCOVERY_REPOSITORY_CACHE_ENTRY_TTL = GATEWAY_CONFIG_FILE_PREFIX + ".cloudera.manager.service.discovery.repository.cache.entry.ttl";
+  private static final String CLOUDERA_MANAGER_SERVICE_DISCOVERY_MAX_RETRY_ATTEMPS = GATEWAY_CONFIG_FILE_PREFIX + ".cloudera.manager.service.discovery.maximum.retry.attemps";
 
   private static final String KNOX_TOKEN_EVICTION_INTERVAL = GATEWAY_CONFIG_FILE_PREFIX + ".knox.token.eviction.interval";
   private static final String KNOX_TOKEN_EVICTION_GRACE_PERIOD = GATEWAY_CONFIG_FILE_PREFIX + ".knox.token.eviction.grace.period";
@@ -1177,6 +1178,11 @@ public class GatewayConfigImpl extends Configuration implements GatewayConfig {
     return getLong(CLOUDERA_MANAGER_SERVICE_DISCOVERY_REPOSITORY_CACHE_ENTRY_TTL, DEFAULT_CM_SERVICE_DISCOVERY_CACHE_ENTRY_TTL);
   }
 
+  @Override
+  public int getClouderaManagerServiceDiscoveryMaximumRetryAttempts() {
+    return getInt(CLOUDERA_MANAGER_SERVICE_DISCOVERY_MAX_RETRY_ATTEMPS, DEFAULT_CM_SERVICE_DISCOVERY_MAX_RETRY_ATTEMPTS);
+  }
+
   @Override
   public boolean isServerManagedTokenStateEnabled() {
     return getBoolean(TOKEN_STATE_SERVER_MANAGED, false);
diff --git a/gateway-spi/src/main/java/org/apache/knox/gateway/config/GatewayConfig.java b/gateway-spi/src/main/java/org/apache/knox/gateway/config/GatewayConfig.java
index a20c83d09..faa5ea32a 100644
--- a/gateway-spi/src/main/java/org/apache/knox/gateway/config/GatewayConfig.java
+++ b/gateway-spi/src/main/java/org/apache/knox/gateway/config/GatewayConfig.java
@@ -111,6 +111,8 @@ public interface GatewayConfig {
 
   long DEFAULT_CM_SERVICE_DISCOVERY_CACHE_ENTRY_TTL = 600; // 10 minutes
 
+  int DEFAULT_CM_SERVICE_DISCOVERY_MAX_RETRY_ATTEMPTS = 3;
+
   /**
    * The location of the gateway configuration.
    * Subdirectories will be: topologies
@@ -689,6 +691,16 @@ public interface GatewayConfig {
    */
   long getClouderaManagerServiceDiscoveryRepositoryEntryTTL();
 
+  /**
+   * The maximum number of retry attempts when connecting to a configured
+   * Cloudera Manager endpoint fails with a communication-related exception
+   * while trying to discover the configured cluster.
+   * <p>
+   * Setting this configuration to <code>-1</code> indicates that the user does
+   * not want to retry the failed service discovery.
+   */
+  int getClouderaManagerServiceDiscoveryMaximumRetryAttempts();
+
   /**
    * @return true, if state for tokens issued by the Knox Token service should be managed by Knox.
    */
diff --git a/gateway-test-release-utils/src/main/java/org/apache/knox/gateway/GatewayTestConfig.java b/gateway-test-release-utils/src/main/java/org/apache/knox/gateway/GatewayTestConfig.java
index 2bebd2297..d99ae5aff 100644
--- a/gateway-test-release-utils/src/main/java/org/apache/knox/gateway/GatewayTestConfig.java
+++ b/gateway-test-release-utils/src/main/java/org/apache/knox/gateway/GatewayTestConfig.java
@@ -796,6 +796,11 @@ public class GatewayTestConfig extends Configuration implements GatewayConfig {
     return 0;
   }
 
+  @Override
+  public int getClouderaManagerServiceDiscoveryMaximumRetryAttempts() {
+    return -1;
+  }
+
   @Override
   public boolean isServerManagedTokenStateEnabled() {
     return false;