You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by jl...@apache.org on 2015/06/24 18:24:28 UTC

hadoop git commit: YARN-3809. Failed to launch new attempts because ApplicationMasterLauncher's threads all hang. Contributed by Jun Gong

Repository: hadoop
Updated Branches:
  refs/heads/trunk 72d08a0e4 -> 2a20dd9b6


YARN-3809. Failed to launch new attempts because ApplicationMasterLauncher's threads all hang. Contributed by Jun Gong


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/2a20dd9b
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/2a20dd9b
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/2a20dd9b

Branch: refs/heads/trunk
Commit: 2a20dd9b61ba3833460cbda0e8c3e8b6366fc3ab
Parents: 72d08a0
Author: Jason Lowe <jl...@apache.org>
Authored: Wed Jun 24 16:23:48 2015 +0000
Committer: Jason Lowe <jl...@apache.org>
Committed: Wed Jun 24 16:23:48 2015 +0000

----------------------------------------------------------------------
 hadoop-yarn-project/CHANGES.txt                 |  3 ++
 .../hadoop/yarn/conf/YarnConfiguration.java     | 10 +++++++
 .../src/main/resources/yarn-default.xml         | 12 ++++++++
 .../amlauncher/ApplicationMasterLauncher.java   | 30 ++++++++++++++++++--
 4 files changed, 52 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/2a20dd9b/hadoop-yarn-project/CHANGES.txt
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt
index f235338..7ecdee3 100644
--- a/hadoop-yarn-project/CHANGES.txt
+++ b/hadoop-yarn-project/CHANGES.txt
@@ -699,6 +699,9 @@ Release 2.7.1 - UNRELEASED
     YARN-3842. NMProxy should retry on NMNotYetReadyException. 
     (Robert Kanter via kasha)
 
+    YARN-3809. Failed to launch new attempts because
+    ApplicationMasterLauncher's threads all hang (Jun Gong via jlowe)
+
 Release 2.7.0 - 2015-04-20
 
   INCOMPATIBLE CHANGES

http://git-wip-us.apache.org/repos/asf/hadoop/blob/2a20dd9b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index 5d75a21..6b660f7 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -143,6 +143,16 @@ public class YarnConfiguration extends Configuration {
     RM_PREFIX + "client.thread-count";
   public static final int DEFAULT_RM_CLIENT_THREAD_COUNT = 50;
 
+  /** Number of threads used to launch/cleanup AM.*/
+  public static final String RM_AMLAUNCHER_THREAD_COUNT =
+      RM_PREFIX + "amlauncher.thread-count";
+  public static final int DEFAULT_RM_AMLAUNCHER_THREAD_COUNT = 50;
+
+  /** Retry times to connect with NM.*/
+  public static final String RM_NODEMANAGER_CONNECT_RETIRES =
+      RM_PREFIX + "nodemanager-connect-retries";
+  public static final int DEFAULT_RM_NODEMANAGER_CONNECT_RETIRES = 10;
+
   /** The Kerberos principal for the resource manager.*/
   public static final String RM_PRINCIPAL =
     RM_PREFIX + "principal";

http://git-wip-us.apache.org/repos/asf/hadoop/blob/2a20dd9b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
index d94157c..621198c 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/resources/yarn-default.xml
@@ -77,6 +77,18 @@
   </property>
 
   <property>
+    <description>Number of threads used to launch/cleanup AM.</description>
+    <name>yarn.resourcemanager.amlauncher.thread-count</name>
+    <value>50</value>
+  </property>
+
+  <property>
+    <description>Retry times to connect with NM.</description>
+    <name>yarn.resourcemanager.nodemanager-connect-retries</name>
+    <value>10</value>
+  </property>
+
+  <property>
     <description>The expiry interval for application master reporting.</description>
     <name>yarn.am.liveness-monitor.expiry-interval-ms</name>
     <value>600000</value>

http://git-wip-us.apache.org/repos/asf/hadoop/blob/2a20dd9b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/amlauncher/ApplicationMasterLauncher.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/amlauncher/ApplicationMasterLauncher.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/amlauncher/ApplicationMasterLauncher.java
index 5fc39fd..f606e45 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/amlauncher/ApplicationMasterLauncher.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/amlauncher/ApplicationMasterLauncher.java
@@ -19,12 +19,17 @@
 package org.apache.hadoop.yarn.server.resourcemanager.amlauncher;
 import java.util.concurrent.BlockingQueue;
 import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.ThreadFactory;
 import java.util.concurrent.ThreadPoolExecutor;
 import java.util.concurrent.TimeUnit;
 
+import com.google.common.util.concurrent.ThreadFactoryBuilder;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.CommonConfigurationKeysPublic;
 import org.apache.hadoop.service.AbstractService;
+import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.event.EventHandler;
 import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.attempt.RMAppAttempt;
@@ -34,7 +39,7 @@ public class ApplicationMasterLauncher extends AbstractService implements
     EventHandler<AMLauncherEvent> {
   private static final Log LOG = LogFactory.getLog(
       ApplicationMasterLauncher.class);
-  private final ThreadPoolExecutor launcherPool;
+  private ThreadPoolExecutor launcherPool;
   private LauncherThread launcherHandlingThread;
   
   private final BlockingQueue<Runnable> masterEvents
@@ -45,12 +50,31 @@ public class ApplicationMasterLauncher extends AbstractService implements
   public ApplicationMasterLauncher(RMContext context) {
     super(ApplicationMasterLauncher.class.getName());
     this.context = context;
-    this.launcherPool = new ThreadPoolExecutor(10, 10, 1, 
-        TimeUnit.HOURS, new LinkedBlockingQueue<Runnable>());
     this.launcherHandlingThread = new LauncherThread();
   }
   
   @Override
+  protected void serviceInit(Configuration conf) throws Exception {
+    int threadCount = conf.getInt(
+        YarnConfiguration.RM_AMLAUNCHER_THREAD_COUNT,
+        YarnConfiguration.DEFAULT_RM_AMLAUNCHER_THREAD_COUNT);
+    ThreadFactory tf = new ThreadFactoryBuilder()
+        .setNameFormat("ApplicationMasterLauncher #%d")
+        .build();
+    launcherPool = new ThreadPoolExecutor(threadCount, threadCount, 1,
+        TimeUnit.HOURS, new LinkedBlockingQueue<Runnable>());
+    launcherPool.setThreadFactory(tf);
+
+    Configuration newConf = new YarnConfiguration(conf);
+    newConf.setInt(CommonConfigurationKeysPublic.
+            IPC_CLIENT_CONNECT_MAX_RETRIES_ON_SOCKET_TIMEOUTS_KEY,
+        conf.getInt(YarnConfiguration.RM_NODEMANAGER_CONNECT_RETIRES,
+            YarnConfiguration.DEFAULT_RM_NODEMANAGER_CONNECT_RETIRES));
+    setConfig(newConf);
+    super.serviceInit(newConf);
+  }
+
+  @Override
   protected void serviceStart() throws Exception {
     launcherHandlingThread.start();
     super.serviceStart();