You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@hive.apache.org by ss...@apache.org on 2016/07/13 20:44:40 UTC

hive git commit: HIVE-14213. Add timeouts for various components in llap status check. (Siddharth Seth, reviewed by Sergey Shelukhin)

Repository: hive
Updated Branches:
  refs/heads/master 7f9438357 -> 9b21e17db


HIVE-14213. Add timeouts for various components in llap status check. (Siddharth Seth, reviewed by Sergey Shelukhin)


Project: http://git-wip-us.apache.org/repos/asf/hive/repo
Commit: http://git-wip-us.apache.org/repos/asf/hive/commit/9b21e17d
Tree: http://git-wip-us.apache.org/repos/asf/hive/tree/9b21e17d
Diff: http://git-wip-us.apache.org/repos/asf/hive/diff/9b21e17d

Branch: refs/heads/master
Commit: 9b21e17dbd644d22ec2d652a4c6f7258b20228c3
Parents: 7f94383
Author: Siddharth Seth <ss...@apache.org>
Authored: Wed Jul 13 13:44:01 2016 -0700
Committer: Siddharth Seth <ss...@apache.org>
Committed: Wed Jul 13 13:44:01 2016 -0700

----------------------------------------------------------------------
 .../hive/llap/cli/LlapStatusServiceDriver.java  | 70 +++++++++++++++++++-
 .../main/resources/llap-cli-log4j2.properties   |  3 +-
 2 files changed, 70 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hive/blob/9b21e17d/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java
----------------------------------------------------------------------
diff --git a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java
index 5209226..17ce69b 100644
--- a/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java
+++ b/llap-server/src/java/org/apache/hadoop/hive/llap/cli/LlapStatusServiceDriver.java
@@ -33,6 +33,8 @@ import java.util.Map;
 import com.google.common.annotations.VisibleForTesting;
 import org.apache.commons.lang3.StringUtils;
 import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
+import org.apache.hadoop.hive.common.classification.InterfaceAudience;
 import org.apache.hadoop.hive.conf.HiveConf;
 import org.apache.hadoop.hive.llap.cli.LlapStatusOptionsProcessor.LlapStatusOptions;
 import org.apache.hadoop.hive.llap.configuration.LlapDaemonConfiguration;
@@ -40,6 +42,7 @@ import org.apache.hadoop.hive.llap.registry.ServiceInstance;
 import org.apache.hadoop.hive.llap.registry.impl.LlapRegistryService;
 import org.apache.hadoop.hive.ql.session.SessionState;
 import org.apache.hadoop.yarn.api.records.ApplicationReport;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.util.Clock;
 import org.apache.hadoop.yarn.util.SystemClock;
 import org.apache.slider.api.ClusterDescription;
@@ -58,6 +61,44 @@ public class LlapStatusServiceDriver {
 
   private static final Logger LOG = LoggerFactory.getLogger(LlapStatusServiceDriver.class);
 
+  // Defining a bunch of configs here instead of in HiveConf. These are experimental, and mainly
+  // for use when retry handling is fixed in Yarn/Hadoop
+
+  private static final String CONF_PREFIX = "hive.llapcli.";
+
+  // The following two keys should ideally be used to control RM connect timeouts. However,
+  // they don't seem to work. The IPC timeout needs to be set instead.
+  @InterfaceAudience.Private
+  private static final String CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS =
+      CONF_PREFIX + "yarn.rm.connect.max-wait-ms";
+  private static final long CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS_DEFAULT = 10000l;
+  @InterfaceAudience.Private
+  private static final String CONFIG_YARN_RM_RETRY_INTERVAL_MS =
+      CONF_PREFIX + "yarn.rm.connect.retry-interval.ms";
+  private static final long CONFIG_YARN_RM_RETRY_INTERVAL_MS_DEFAULT = 5000l;
+
+  // As of Hadoop 2.7 - this is what controls the RM timeout.
+  @InterfaceAudience.Private
+  private static final String CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES =
+      CONF_PREFIX + "ipc.client.max-retries";
+  private static final int CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES_DEFAULT = 2;
+  @InterfaceAudience.Private
+  private static final String CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS =
+      CONF_PREFIX + "ipc.client.connect.retry-interval-ms";
+  private static final long CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS_DEFAULT = 1500l;
+
+  // As of Hadoop 2.8 - this timeout spec behaves in a strange manner. "2000,1" means 2000ms with 1 retry.
+  // However, it does this - but does it thrice. Essentially - #retries+2 is the number of times the entire config
+  // is retried. "2000,1" means 3 retries - each with 1 retry with a random 2000ms sleep.
+  @InterfaceAudience.Private
+  private static final String CONFIG_TIMELINE_SERVICE_ENTITYGROUP_FS_STORE_RETRY_POLICY_SPEC =
+      CONF_PREFIX + "timeline.service.fs-store.retry.policy.spec";
+  private static final String
+      CONFIG_TIMELINE_SERVICE_ENTITYGROUP_FS_STORE_RETRY_POLICY_SPEC_DEFAULT = "2000, 1";
+
+  private static final String CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS =
+      CONF_PREFIX + "zk-registry.timeout-ms";
+  private static final long CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS_DEFAULT = 10000l;
 
 
   private static final String AM_KEY = "slider-appmaster";
@@ -105,6 +146,33 @@ public class LlapStatusServiceDriver {
         conf.set((String) props.getKey(), (String) props.getValue());
       }
 
+      // Setup timeouts for various services.
+
+      // Once we move to a Hadoop-2.8 dependency, the following parameter can be used.
+      // conf.set(YarnConfiguration.TIMELINE_SERVICE_ENTITYGROUP_FS_STORE_RETRY_POLICY_SPEC);
+      conf.set("yarn.timeline-service.entity-group-fs-store.retry-policy-spec",
+          conf.get(CONFIG_TIMELINE_SERVICE_ENTITYGROUP_FS_STORE_RETRY_POLICY_SPEC,
+              CONFIG_TIMELINE_SERVICE_ENTITYGROUP_FS_STORE_RETRY_POLICY_SPEC_DEFAULT));
+
+      conf.setLong(YarnConfiguration.RESOURCEMANAGER_CONNECT_MAX_WAIT_MS,
+          conf.getLong(CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS,
+              CONFIG_YARN_RM_TIMEOUT_MAX_WAIT_MS_DEFAULT));
+      conf.setLong(YarnConfiguration.RESOURCEMANAGER_CONNECT_RETRY_INTERVAL_MS,
+          conf.getLong(CONFIG_YARN_RM_RETRY_INTERVAL_MS, CONFIG_YARN_RM_RETRY_INTERVAL_MS_DEFAULT));
+
+      conf.setInt(CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_MAX_RETRIES_KEY,
+          conf.getInt(CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES,
+              CONFIG_IPC_CLIENT_CONNECT_MAX_RETRIES_DEFAULT));
+      conf.setLong(CommonConfigurationKeysPublic.IPC_CLIENT_CONNECT_RETRY_INTERVAL_KEY,
+          conf.getLong(CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS,
+              CONFIG_IPC_CLIENT_CONNECT_RETRY_INTERVAL_MS_DEFAULT));
+
+      HiveConf.setVar(conf, HiveConf.ConfVars.HIVE_ZOOKEEPER_SESSION_TIMEOUT, (conf
+          .getLong(CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS, CONFIG_LLAP_ZK_REGISTRY_TIMEOUT_MS_DEFAULT) +
+          "ms"));
+
+
+
       String appName;
       appName = options.getName();
       if (StringUtils.isEmpty(appName)) {
@@ -867,8 +935,6 @@ public class LlapStatusServiceDriver {
       } else {
         ret = ExitCode.INTERNAL_ERROR.getInt();
       }
-    } finally {
-      LOG.info("LLAP status finished");
     }
     if (ret != 0 || options == null) { // Failure / help
       System.exit(ret);

http://git-wip-us.apache.org/repos/asf/hive/blob/9b21e17d/llap-server/src/main/resources/llap-cli-log4j2.properties
----------------------------------------------------------------------
diff --git a/llap-server/src/main/resources/llap-cli-log4j2.properties b/llap-server/src/main/resources/llap-cli-log4j2.properties
index f3c0f0e..483c81f 100644
--- a/llap-server/src/main/resources/llap-cli-log4j2.properties
+++ b/llap-server/src/main/resources/llap-cli-log4j2.properties
@@ -69,5 +69,6 @@ logger.HadoopConf.level = ERROR
 
 # root logger
 rootLogger.level = ${sys:hive.log.level}
-rootLogger.appenderRefs = root
+rootLogger.appenderRefs = root, DRFA
 rootLogger.appenderRef.root.ref = ${sys:hive.root.logger}
+rootLogger.appenderRef.DRFA.ref = DRFA