You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by wa...@apache.org on 2018/07/31 19:26:26 UTC

[1/7] hadoop git commit: YARN-8528. Final states in ContainerAllocation might be modified externally causing unexpected allocation results. Contributed by Xintong Song.

Repository: hadoop
Updated Branches:
  refs/heads/branch-3.1.1 14acc8af6 -> 8596a0ebe


YARN-8528. Final states in ContainerAllocation might be modified externally causing unexpected allocation results. Contributed by Xintong Song.

(cherry picked from commit 004e1f248ef20b78f9d12d6f1fe04f66d8c56158)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/0c3d3bc3
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/0c3d3bc3
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/0c3d3bc3

Branch: refs/heads/branch-3.1.1
Commit: 0c3d3bc3aef4c43d73c3bf31c741c9efe3b18ada
Parents: 14acc8a
Author: Weiwei Yang <ww...@apache.org>
Authored: Fri Jul 20 22:32:11 2018 +0800
Committer: Wangda Tan <wa...@apache.org>
Committed: Tue Jul 31 12:06:16 2018 -0700

----------------------------------------------------------------------
 .../capacity/allocator/ContainerAllocation.java |  2 +-
 .../allocator/RegularContainerAllocator.java    | 10 ++--
 .../capacity/TestCapacityScheduler.java         | 48 ++++++++++++++++++++
 3 files changed, 54 insertions(+), 6 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/0c3d3bc3/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/ContainerAllocation.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/ContainerAllocation.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/ContainerAllocation.java
index f408508..b9b9bcf 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/ContainerAllocation.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/ContainerAllocation.java
@@ -56,7 +56,7 @@ public class ContainerAllocation {
 
   RMContainer containerToBeUnreserved;
   private Resource resourceToBeAllocated = Resources.none();
-  AllocationState state;
+  private AllocationState state;
   NodeType containerNodeType = NodeType.NODE_LOCAL;
   NodeType requestLocalityType = null;
 

http://git-wip-us.apache.org/repos/asf/hadoop/blob/0c3d3bc3/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/RegularContainerAllocator.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/RegularContainerAllocator.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/RegularContainerAllocator.java
index 99deb1a..adc27f5 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/RegularContainerAllocator.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/allocator/RegularContainerAllocator.java
@@ -263,7 +263,7 @@ public class RegularContainerAllocator extends AbstractContainerAllocator {
             reservedContainer, schedulingMode, resourceLimits);
     
     if (null == reservedContainer) {
-      if (result.state == AllocationState.PRIORITY_SKIPPED) {
+      if (result.getAllocationState() == AllocationState.PRIORITY_SKIPPED) {
         // Don't count 'skipped nodes' as a scheduling opportunity!
         application.subtractSchedulingOpportunity(schedulerKey);
       }
@@ -487,8 +487,8 @@ public class RegularContainerAllocator extends AbstractContainerAllocator {
 
       // When a returned allocation is LOCALITY_SKIPPED, since we're in
       // off-switch request now, we will skip this app w.r.t priorities 
-      if (allocation.state == AllocationState.LOCALITY_SKIPPED) {
-        allocation.state = AllocationState.APP_SKIPPED;
+      if (allocation.getAllocationState() == AllocationState.LOCALITY_SKIPPED) {
+        allocation = ContainerAllocation.APP_SKIPPED;
       }
       allocation.requestLocalityType = requestLocalityType;
 
@@ -836,8 +836,8 @@ public class RegularContainerAllocator extends AbstractContainerAllocator {
       result = tryAllocateOnNode(clusterResource, node, schedulingMode,
           resourceLimits, schedulerKey, reservedContainer);
 
-      if (AllocationState.ALLOCATED == result.state
-          || AllocationState.RESERVED == result.state) {
+      if (AllocationState.ALLOCATED == result.getAllocationState()
+          || AllocationState.RESERVED == result.getAllocationState()) {
         result = doAllocation(result, node, schedulerKey, reservedContainer);
         break;
       }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/0c3d3bc3/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java
index 0b54010..79cdcfe 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacityScheduler.java
@@ -134,6 +134,8 @@ import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNode;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.SchedulerNodeReport;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.TestSchedulerUtils;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.YarnScheduler;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.allocator.AllocationState;
+import org.apache.hadoop.yarn.server.resourcemanager.scheduler.capacity.allocator.ContainerAllocation;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.ResourceCommitRequest;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerApp;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.common.fica.FiCaSchedulerNode;
@@ -4930,4 +4932,50 @@ public class TestCapacityScheduler extends CapacitySchedulerTestBase {
     spyCs.handle(new NodeUpdateSchedulerEvent(
         spyCs.getNode(nm.getNodeId()).getRMNode()));
   }
+
+  // Testcase for YARN-8528
+  // This is to test whether ContainerAllocation constants are holding correct
+  // values during scheduling.
+  @Test
+  public void testContainerAllocationLocalitySkipped() throws Exception {
+    Assert.assertEquals(AllocationState.APP_SKIPPED,
+        ContainerAllocation.APP_SKIPPED.getAllocationState());
+    Assert.assertEquals(AllocationState.LOCALITY_SKIPPED,
+        ContainerAllocation.LOCALITY_SKIPPED.getAllocationState());
+    Assert.assertEquals(AllocationState.PRIORITY_SKIPPED,
+        ContainerAllocation.PRIORITY_SKIPPED.getAllocationState());
+    Assert.assertEquals(AllocationState.QUEUE_SKIPPED,
+        ContainerAllocation.QUEUE_SKIPPED.getAllocationState());
+
+    // init RM & NMs & Nodes
+    final MockRM rm = new MockRM(new CapacitySchedulerConfiguration());
+    CapacityScheduler cs = (CapacityScheduler) rm.getResourceScheduler();
+    rm.start();
+    final MockNM nm1 = rm.registerNode("h1:1234", 4 * GB);
+    final MockNM nm2 = rm.registerNode("h2:1234", 6 * GB); // maximum-allocation-mb = 6GB
+
+    // submit app and request resource
+    // container2 is larger than nm1 total resource, will trigger locality skip
+    final RMApp app = rm.submitApp(1 * GB, "app", "user");
+    final MockAM am = MockRM.launchAndRegisterAM(app, rm, nm1);
+    am.addRequests(new String[] {"*"}, 5 * GB, 1, 1, 2);
+    am.schedule();
+
+    // container1 (am) should be acquired, container2 should not
+    RMNode node1 = rm.getRMContext().getRMNodes().get(nm1.getNodeId());
+    cs.handle(new NodeUpdateSchedulerEvent(node1));
+    ContainerId cid = ContainerId.newContainerId(am.getApplicationAttemptId(), 1l);
+    Assert.assertEquals(cs.getRMContainer(cid).getState(), RMContainerState.ACQUIRED);
+    cid = ContainerId.newContainerId(am.getApplicationAttemptId(), 2l);
+    Assert.assertNull(cs.getRMContainer(cid));
+
+    Assert.assertEquals(AllocationState.APP_SKIPPED,
+        ContainerAllocation.APP_SKIPPED.getAllocationState());
+    Assert.assertEquals(AllocationState.LOCALITY_SKIPPED,
+        ContainerAllocation.LOCALITY_SKIPPED.getAllocationState());
+    Assert.assertEquals(AllocationState.PRIORITY_SKIPPED,
+        ContainerAllocation.PRIORITY_SKIPPED.getAllocationState());
+    Assert.assertEquals(AllocationState.QUEUE_SKIPPED,
+        ContainerAllocation.QUEUE_SKIPPED.getAllocationState());
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org


[5/7] hadoop git commit: HADOOP-15593. Fixed NPE in UGI spawnAutoRenewalThreadForUserCreds. Contributed by Gabor Bota

Posted by wa...@apache.org.
HADOOP-15593.  Fixed NPE in UGI spawnAutoRenewalThreadForUserCreds.
               Contributed by Gabor Bota

(cherry picked from commit 77721f39e26b630352a1f4087524a3fbd21ff06e)
(cherry picked from commit a869bd970e832c4d770b3cee6257225260f4d235)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/096c1390
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/096c1390
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/096c1390

Branch: refs/heads/branch-3.1.1
Commit: 096c1390f2510f7035db210d7a3b4db59c59d167
Parents: bae4060
Author: Eric Yang <ey...@apache.org>
Authored: Thu Jul 26 18:35:36 2018 -0400
Committer: Wangda Tan <wa...@apache.org>
Committed: Tue Jul 31 12:06:47 2018 -0700

----------------------------------------------------------------------
 .../hadoop/security/UserGroupInformation.java   | 179 ++++++++++++-------
 .../security/TestUserGroupInformation.java      |  38 ++++
 2 files changed, 148 insertions(+), 69 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/096c1390/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java
----------------------------------------------------------------------
diff --git a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java
index 33a876f..c44ef72 100644
--- a/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java
+++ b/hadoop-common-project/hadoop-common/src/main/java/org/apache/hadoop/security/UserGroupInformation.java
@@ -40,6 +40,7 @@ import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
 import java.util.Collections;
+import java.util.Date;
 import java.util.EnumMap;
 import java.util.HashMap;
 import java.util.Iterator;
@@ -850,81 +851,121 @@ public class UserGroupInformation {
     }
 
     //spawn thread only if we have kerb credentials
-    Thread t = new Thread(new Runnable() {
+    KerberosTicket tgt = getTGT();
+    if (tgt == null) {
+      return;
+    }
+    String cmd = conf.get("hadoop.kerberos.kinit.command", "kinit");
+    long nextRefresh = getRefreshTime(tgt);
+    Thread t =
+        new Thread(new AutoRenewalForUserCredsRunnable(tgt, cmd, nextRefresh));
+    t.setDaemon(true);
+    t.setName("TGT Renewer for " + getUserName());
+    t.start();
+  }
+
+  @VisibleForTesting
+  class AutoRenewalForUserCredsRunnable implements Runnable {
+    private KerberosTicket tgt;
+    private RetryPolicy rp;
+    private String kinitCmd;
+    private long nextRefresh;
+    private boolean runRenewalLoop = true;
+
+    AutoRenewalForUserCredsRunnable(KerberosTicket tgt, String kinitCmd,
+        long nextRefresh){
+      this.tgt = tgt;
+      this.kinitCmd = kinitCmd;
+      this.nextRefresh = nextRefresh;
+      this.rp = null;
+    }
+
+    public void setRunRenewalLoop(boolean runRenewalLoop) {
+      this.runRenewalLoop = runRenewalLoop;
+    }
 
-      @Override
-      public void run() {
-        String cmd = conf.get("hadoop.kerberos.kinit.command", "kinit");
-        KerberosTicket tgt = getTGT();
-        if (tgt == null) {
+    @Override
+    public void run() {
+      do {
+        try {
+          long now = Time.now();
+          if (LOG.isDebugEnabled()) {
+            LOG.debug("Current time is " + now);
+            LOG.debug("Next refresh is " + nextRefresh);
+          }
+          if (now < nextRefresh) {
+            Thread.sleep(nextRefresh - now);
+          }
+          String output = Shell.execCommand(kinitCmd, "-R");
+          if (LOG.isDebugEnabled()) {
+            LOG.debug("Renewed ticket. kinit output: {}", output);
+          }
+          reloginFromTicketCache();
+          tgt = getTGT();
+          if (tgt == null) {
+            LOG.warn("No TGT after renewal. Aborting renew thread for " +
+                getUserName());
+            return;
+          }
+          nextRefresh = Math.max(getRefreshTime(tgt),
+              now + kerberosMinSecondsBeforeRelogin);
+          metrics.renewalFailures.set(0);
+          rp = null;
+        } catch (InterruptedException ie) {
+          LOG.warn("Terminating renewal thread");
           return;
-        }
-        long nextRefresh = getRefreshTime(tgt);
-        RetryPolicy rp = null;
-        while (true) {
+        } catch (IOException ie) {
+          metrics.renewalFailuresTotal.incr();
+          final long now = Time.now();
+
+          if (tgt.isDestroyed()) {
+            LOG.error("TGT is destroyed. Aborting renew thread for {}.",
+                getUserName());
+            return;
+          }
+
+          long tgtEndTime;
+          // As described in HADOOP-15593 we need to handle the case when
+          // tgt.getEndTime() throws NPE because of JDK issue JDK-8147772
+          // NPE is only possible if this issue is not fixed in the JDK
+          // currently used
           try {
-            long now = Time.now();
-            if (LOG.isDebugEnabled()) {
-              LOG.debug("Current time is " + now);
-              LOG.debug("Next refresh is " + nextRefresh);
-            }
-            if (now < nextRefresh) {
-              Thread.sleep(nextRefresh - now);
-            }
-            Shell.execCommand(cmd, "-R");
-            if (LOG.isDebugEnabled()) {
-              LOG.debug("renewed ticket");
-            }
-            reloginFromTicketCache();
-            tgt = getTGT();
-            if (tgt == null) {
-              LOG.warn("No TGT after renewal. Aborting renew thread for " +
-                  getUserName());
-              return;
-            }
-            nextRefresh = Math.max(getRefreshTime(tgt),
-              now + kerberosMinSecondsBeforeRelogin);
-            metrics.renewalFailures.set(0);
-            rp = null;
-          } catch (InterruptedException ie) {
-            LOG.warn("Terminating renewal thread");
+            tgtEndTime = tgt.getEndTime().getTime();
+          } catch (NullPointerException npe) {
+            LOG.error("NPE thrown while getting KerberosTicket endTime. "
+                + "Aborting renew thread for {}.", getUserName());
+            return;
+          }
+
+          LOG.warn("Exception encountered while running the renewal "
+                  + "command for {}. (TGT end time:{}, renewalFailures: {},"
+                  + "renewalFailuresTotal: {})", getUserName(), tgtEndTime,
+              metrics.renewalFailures.value(),
+              metrics.renewalFailuresTotal.value(), ie);
+          if (rp == null) {
+            // Use a dummy maxRetries to create the policy. The policy will
+            // only be used to get next retry time with exponential back-off.
+            // The final retry time will be later limited within the
+            // tgt endTime in getNextTgtRenewalTime.
+            rp = RetryPolicies.exponentialBackoffRetry(Long.SIZE - 2,
+                kerberosMinSecondsBeforeRelogin, TimeUnit.MILLISECONDS);
+          }
+          try {
+            nextRefresh = getNextTgtRenewalTime(tgtEndTime, now, rp);
+          } catch (Exception e) {
+            LOG.error("Exception when calculating next tgt renewal time", e);
+            return;
+          }
+          metrics.renewalFailures.incr();
+          // retry until close enough to tgt endTime.
+          if (now > nextRefresh) {
+            LOG.error("TGT is expired. Aborting renew thread for {}.",
+                getUserName());
             return;
-          } catch (IOException ie) {
-            metrics.renewalFailuresTotal.incr();
-            final long tgtEndTime = tgt.getEndTime().getTime();
-            LOG.warn("Exception encountered while running the renewal "
-                    + "command for {}. (TGT end time:{}, renewalFailures: {},"
-                    + "renewalFailuresTotal: {})", getUserName(), tgtEndTime,
-                metrics.renewalFailures, metrics.renewalFailuresTotal, ie);
-            final long now = Time.now();
-            if (rp == null) {
-              // Use a dummy maxRetries to create the policy. The policy will
-              // only be used to get next retry time with exponential back-off.
-              // The final retry time will be later limited within the
-              // tgt endTime in getNextTgtRenewalTime.
-              rp = RetryPolicies.exponentialBackoffRetry(Long.SIZE - 2,
-                  kerberosMinSecondsBeforeRelogin, TimeUnit.MILLISECONDS);
-            }
-            try {
-              nextRefresh = getNextTgtRenewalTime(tgtEndTime, now, rp);
-            } catch (Exception e) {
-              LOG.error("Exception when calculating next tgt renewal time", e);
-              return;
-            }
-            metrics.renewalFailures.incr();
-            // retry until close enough to tgt endTime.
-            if (now > nextRefresh) {
-              LOG.error("TGT is expired. Aborting renew thread for {}.",
-                  getUserName());
-              return;
-            }
           }
         }
-      }
-    });
-    t.setDaemon(true);
-    t.setName("TGT Renewer for " + getUserName());
-    t.start();
+      } while (runRenewalLoop);
+    }
   }
 
   /**

http://git-wip-us.apache.org/repos/asf/hadoop/blob/096c1390/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java
----------------------------------------------------------------------
diff --git a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java
index 9477990..011e930 100644
--- a/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java
+++ b/hadoop-common-project/hadoop-common/src/test/java/org/apache/hadoop/security/TestUserGroupInformation.java
@@ -47,6 +47,7 @@ import org.slf4j.event.Level;
 
 import javax.security.auth.Subject;
 import javax.security.auth.kerberos.KerberosPrincipal;
+import javax.security.auth.kerberos.KerberosTicket;
 import javax.security.auth.kerberos.KeyTab;
 import javax.security.auth.login.AppConfigurationEntry;
 import javax.security.auth.login.LoginContext;
@@ -61,6 +62,7 @@ import java.security.PrivilegedExceptionAction;
 import java.util.Collection;
 import java.util.ConcurrentModificationException;
 import java.util.Date;
+import java.util.HashSet;
 import java.util.LinkedHashSet;
 import java.util.Set;
 import java.util.concurrent.Callable;
@@ -88,7 +90,10 @@ import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertSame;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
+import static org.mockito.Mockito.atLeastOnce;
+import static org.mockito.Mockito.doThrow;
 import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.spy;
 import static org.mockito.Mockito.when;
 
 public class TestUserGroupInformation {
@@ -1211,4 +1216,37 @@ public class TestUserGroupInformation {
     barrier.await();
     assertSame(testUgi1.getSubject(), blockingLookup.get().getSubject());
   }
+
+  @Test
+  public void testKerberosTicketIsDestroyedChecked() throws Exception {
+    // Create UserGroupInformation
+    GenericTestUtils.setLogLevel(UserGroupInformation.LOG, Level.DEBUG);
+    Set<User> users = new HashSet<>();
+    users.add(new User("Foo"));
+    Subject subject =
+        new Subject(true, users, new HashSet<>(), new HashSet<>());
+    UserGroupInformation ugi = spy(new UserGroupInformation(subject));
+
+    // throw IOException in the middle of the autoRenewalForUserCreds
+    doThrow(new IOException()).when(ugi).reloginFromTicketCache();
+
+    // Create and destroy the KerberosTicket, so endTime will be null
+    Date d = new Date();
+    KerberosPrincipal kp = new KerberosPrincipal("Foo");
+    KerberosTicket tgt = spy(new KerberosTicket(new byte[]{}, kp, kp, new
+        byte[]{}, 0, null, d, d, d, d, null));
+    tgt.destroy();
+
+    // run AutoRenewalForUserCredsRunnable with this
+    UserGroupInformation.AutoRenewalForUserCredsRunnable userCredsRunnable =
+        ugi.new AutoRenewalForUserCredsRunnable(tgt,
+            Boolean.toString(Boolean.TRUE), 100);
+
+    // Set the runnable to not to run in a loop
+    userCredsRunnable.setRunRenewalLoop(false);
+    // there should be no exception when calling this
+    userCredsRunnable.run();
+    // isDestroyed should be called at least once
+    Mockito.verify(tgt, atLeastOnce()).isDestroyed();
+  }
 }


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org


[3/7] hadoop git commit: YARN-8546. Resource leak caused by a reserved container being released more than once under async scheduling. Contributed by Tao Yang.

Posted by wa...@apache.org.
YARN-8546. Resource leak caused by a reserved container being released more than once under async scheduling. Contributed by Tao Yang.

(Cherry-picked from commit 5be9f4a5d05c9cb99348719fe35626b1de3055db)

(cherry picked from commit b89624a943268e180e0e1532b3a394ff580a962c)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/49795e9b
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/49795e9b
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/49795e9b

Branch: refs/heads/branch-3.1.1
Commit: 49795e9b9f381debc2641d688aabde48acc87385
Parents: 7cb37ed
Author: Weiwei Yang <ww...@apache.org>
Authored: Wed Jul 25 17:35:27 2018 +0800
Committer: Wangda Tan <wa...@apache.org>
Committed: Tue Jul 31 12:06:30 2018 -0700

----------------------------------------------------------------------
 .../scheduler/common/fica/FiCaSchedulerApp.java | 15 ++++
 .../TestCapacitySchedulerAsyncScheduling.java   | 89 ++++++++++++++++++++
 2 files changed, 104 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/49795e9b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java
index 3b1b82c..9810e98 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/main/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/common/fica/FiCaSchedulerApp.java
@@ -361,6 +361,21 @@ public class FiCaSchedulerApp extends SchedulerApplicationAttempt {
         .isEmpty()) {
       for (SchedulerContainer<FiCaSchedulerApp, FiCaSchedulerNode>
           releaseContainer : allocation.getToRelease()) {
+        // Make sure to-release reserved containers are not outdated
+        if (releaseContainer.getRmContainer().getState()
+            == RMContainerState.RESERVED
+            && releaseContainer.getRmContainer() != releaseContainer
+            .getSchedulerNode().getReservedContainer()) {
+          if (LOG.isDebugEnabled()) {
+            LOG.debug("Failed to accept this proposal because "
+                + "it tries to release an outdated reserved container "
+                + releaseContainer.getRmContainer().getContainerId()
+                + " on node " + releaseContainer.getSchedulerNode().getNodeID()
+                + " whose reserved container is "
+                + releaseContainer.getSchedulerNode().getReservedContainer());
+          }
+          return false;
+        }
         // Only consider non-reserved container (reserved container will
         // not affect available resource of node) on the same node
         if (releaseContainer.getRmContainer().getState()

http://git-wip-us.apache.org/repos/asf/hadoop/blob/49795e9b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacitySchedulerAsyncScheduling.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacitySchedulerAsyncScheduling.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacitySchedulerAsyncScheduling.java
index 338b9f9..c2c1519 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacitySchedulerAsyncScheduling.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-resourcemanager/src/test/java/org/apache/hadoop/yarn/server/resourcemanager/scheduler/capacity/TestCapacitySchedulerAsyncScheduling.java
@@ -31,6 +31,7 @@ import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.server.resourcemanager.MockAM;
 import org.apache.hadoop.yarn.server.resourcemanager.MockNM;
 import org.apache.hadoop.yarn.server.resourcemanager.MockRM;
+import org.apache.hadoop.yarn.server.resourcemanager.RMContext;
 import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.NullRMNodeLabelsManager;
 import org.apache.hadoop.yarn.server.resourcemanager.nodelabels.RMNodeLabelsManager;
 import org.apache.hadoop.yarn.server.resourcemanager.rmapp.RMApp;
@@ -41,6 +42,7 @@ import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEven
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerEventType;
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerImpl;
 import org.apache.hadoop.yarn.server.resourcemanager.rmcontainer.RMContainerState;
+import org.apache.hadoop.yarn.server.resourcemanager.rmnode.RMNode;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.AbstractYarnScheduler;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.NodeType;
 import org.apache.hadoop.yarn.server.resourcemanager.scheduler.ResourceScheduler;
@@ -685,6 +687,93 @@ public class TestCapacitySchedulerAsyncScheduling {
     rm.stop();
   }
 
+
+  @Test(timeout = 60000)
+  public void testReleaseOutdatedReservedContainer() throws Exception {
+    /*
+     * Submit a application, reserved container_02 on nm1,
+     * submit two allocate proposals which contain the same reserved
+     * container_02 as to-released container.
+     * First proposal should be accepted, second proposal should be rejected
+     * because it try to release an outdated reserved container
+     */
+    MockRM rm1 = new MockRM();
+    rm1.getRMContext().setNodeLabelManager(mgr);
+    rm1.start();
+    MockNM nm1 = rm1.registerNode("h1:1234", 8 * GB);
+    MockNM nm2 = rm1.registerNode("h2:1234", 8 * GB);
+    MockNM nm3 = rm1.registerNode("h3:1234", 8 * GB);
+    rm1.drainEvents();
+
+    CapacityScheduler cs = (CapacityScheduler) rm1.getResourceScheduler();
+    RMNode rmNode1 = rm1.getRMContext().getRMNodes().get(nm1.getNodeId());
+    LeafQueue defaultQueue = (LeafQueue) cs.getQueue("default");
+    SchedulerNode sn1 = cs.getSchedulerNode(nm1.getNodeId());
+    SchedulerNode sn2 = cs.getSchedulerNode(nm2.getNodeId());
+    SchedulerNode sn3 = cs.getSchedulerNode(nm3.getNodeId());
+
+    // launch another app to queue, AM container should be launched in nm1
+    RMApp app1 = rm1.submitApp(4 * GB, "app", "user", null, "default");
+    MockAM am1 = MockRM.launchAndRegisterAM(app1, rm1, nm1);
+    Resource allocateResource = Resources.createResource(5 * GB);
+    am1.allocate("*", (int) allocateResource.getMemorySize(), 3, 0,
+        new ArrayList<ContainerId>(), "");
+    FiCaSchedulerApp schedulerApp1 =
+        cs.getApplicationAttempt(am1.getApplicationAttemptId());
+
+    cs.handle(new NodeUpdateSchedulerEvent(rmNode1));
+    Assert.assertEquals(1, schedulerApp1.getReservedContainers().size());
+    Assert.assertEquals(9 * GB,
+        defaultQueue.getQueueResourceUsage().getUsed().getMemorySize());
+
+    RMContainer reservedContainer =
+        schedulerApp1.getReservedContainers().get(0);
+    ResourceCommitRequest allocateFromSameReservedContainerProposal1 =
+        createAllocateFromReservedProposal(3, allocateResource, schedulerApp1,
+            sn2, sn1, cs.getRMContext(), reservedContainer);
+    boolean tryCommitResult = cs.tryCommit(cs.getClusterResource(),
+        allocateFromSameReservedContainerProposal1, true);
+    Assert.assertTrue(tryCommitResult);
+    ResourceCommitRequest allocateFromSameReservedContainerProposal2 =
+        createAllocateFromReservedProposal(4, allocateResource, schedulerApp1,
+            sn3, sn1, cs.getRMContext(), reservedContainer);
+    tryCommitResult = cs.tryCommit(cs.getClusterResource(),
+        allocateFromSameReservedContainerProposal2, true);
+    Assert.assertFalse("This proposal should be rejected because "
+        + "it try to release an outdated reserved container", tryCommitResult);
+
+    rm1.close();
+  }
+
+  private ResourceCommitRequest createAllocateFromReservedProposal(
+      int containerId, Resource allocateResource, FiCaSchedulerApp schedulerApp,
+      SchedulerNode allocateNode, SchedulerNode reservedNode,
+      RMContext rmContext, RMContainer reservedContainer) {
+    Container container = Container.newInstance(
+        ContainerId.newContainerId(schedulerApp.getApplicationAttemptId(), containerId),
+        allocateNode.getNodeID(), allocateNode.getHttpAddress(), allocateResource,
+        Priority.newInstance(0), null);
+    RMContainer rmContainer = new RMContainerImpl(container, SchedulerRequestKey
+        .create(ResourceRequest
+            .newInstance(Priority.newInstance(0), "*", allocateResource, 1)),
+        schedulerApp.getApplicationAttemptId(), allocateNode.getNodeID(), "user",
+        rmContext);
+    SchedulerContainer allocateContainer =
+        new SchedulerContainer(schedulerApp, allocateNode, rmContainer, "", true);
+    SchedulerContainer reservedSchedulerContainer =
+        new SchedulerContainer(schedulerApp, reservedNode, reservedContainer, "",
+            false);
+    List<SchedulerContainer> toRelease = new ArrayList<>();
+    toRelease.add(reservedSchedulerContainer);
+    ContainerAllocationProposal allocateFromReservedProposal =
+        new ContainerAllocationProposal(allocateContainer, toRelease, null,
+            NodeType.OFF_SWITCH, NodeType.OFF_SWITCH,
+            SchedulingMode.RESPECT_PARTITION_EXCLUSIVITY, allocateResource);
+    List<ContainerAllocationProposal> allocateProposals = new ArrayList<>();
+    allocateProposals.add(allocateFromReservedProposal);
+    return new ResourceCommitRequest(allocateProposals, null, null);
+  }
+
   private void keepNMHeartbeat(List<MockNM> mockNMs, int interval) {
     if (nmHeartbeatThread != null) {
       nmHeartbeatThread.setShouldStop();


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org


[7/7] hadoop git commit: YARN-8418. App local logs could leaked if log aggregation fails to initialize for the app. (Bibin A Chundatt via wangda)

Posted by wa...@apache.org.
YARN-8418. App local logs could leaked if log aggregation fails to initialize for the app. (Bibin A Chundatt via wangda)

Change-Id: I29a23ca4b219b48c92e7975cd44cddb8b0e04104
(cherry picked from commit 4b540bbfcf02d828052999215c6135603d98f5db)
(cherry picked from commit 7b552c9d722ed243cb6e17122edcecda62f54e68)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/8596a0eb
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/8596a0eb
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/8596a0eb

Branch: refs/heads/branch-3.1.1
Commit: 8596a0ebe8c8a862c57d291669e56aacfc4d411b
Parents: 17625e4
Author: Wangda Tan <wa...@apache.org>
Authored: Tue Jul 31 12:07:51 2018 -0700
Committer: Wangda Tan <wa...@apache.org>
Committed: Tue Jul 31 12:18:11 2018 -0700

----------------------------------------------------------------------
 .../LogAggregationFileController.java           |  7 ++
 .../nodemanager/NodeStatusUpdaterImpl.java      |  1 +
 .../containermanager/ContainerManager.java      |  1 +
 .../containermanager/ContainerManagerImpl.java  | 13 ++-
 .../logaggregation/AppLogAggregator.java        |  8 ++
 .../logaggregation/AppLogAggregatorImpl.java    | 15 ++++
 .../logaggregation/LogAggregationService.java   | 83 ++++++++++++++++----
 .../containermanager/loghandler/LogHandler.java |  7 ++
 .../loghandler/NonAggregatingLogHandler.java    |  9 +++
 .../loghandler/event/LogHandlerEventType.java   |  4 +-
 .../event/LogHandlerTokenUpdatedEvent.java      | 26 ++++++
 .../nodemanager/DummyContainerManager.java      |  7 ++
 .../TestLogAggregationService.java              | 34 +++++---
 13 files changed, 187 insertions(+), 28 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/8596a0eb/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/logaggregation/filecontroller/LogAggregationFileController.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/logaggregation/filecontroller/LogAggregationFileController.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/logaggregation/filecontroller/LogAggregationFileController.java
index 5ac89e9..25d012e 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/logaggregation/filecontroller/LogAggregationFileController.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-common/src/main/java/org/apache/hadoop/yarn/logaggregation/filecontroller/LogAggregationFileController.java
@@ -43,11 +43,14 @@ import org.apache.hadoop.fs.FileSystem;
 import org.apache.hadoop.fs.Path;
 import org.apache.hadoop.fs.permission.FsPermission;
 import org.apache.hadoop.io.IOUtils;
+import org.apache.hadoop.ipc.RemoteException;
 import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hadoop.security.token.SecretManager;
 import org.apache.hadoop.yarn.api.records.ApplicationAccessType;
 import org.apache.hadoop.yarn.api.records.ApplicationId;
 import org.apache.hadoop.yarn.api.records.NodeId;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
+import org.apache.hadoop.yarn.exceptions.YarnException;
 import org.apache.hadoop.yarn.exceptions.YarnRuntimeException;
 import org.apache.hadoop.yarn.logaggregation.LogAggregationUtils;
 import org.apache.hadoop.yarn.webapp.View.ViewContext;
@@ -363,6 +366,10 @@ public abstract class LogAggregationFileController {
         }
       });
     } catch (Exception e) {
+      if (e instanceof RemoteException) {
+        throw new YarnRuntimeException(((RemoteException) e)
+            .unwrapRemoteException(SecretManager.InvalidToken.class));
+      }
       throw new YarnRuntimeException(e);
     }
   }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/8596a0eb/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java
index 8154723..faf7adb 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/NodeStatusUpdaterImpl.java
@@ -1135,6 +1135,7 @@ public class NodeStatusUpdaterImpl extends AbstractService implements
             if (systemCredentials != null && !systemCredentials.isEmpty()) {
               ((NMContext) context).setSystemCrendentialsForApps(
                   parseCredentials(systemCredentials));
+              context.getContainerManager().handleCredentialUpdate();
             }
             List<org.apache.hadoop.yarn.api.records.Container>
                 containersToUpdate = response.getContainersToUpdate();

http://git-wip-us.apache.org/repos/asf/hadoop/blob/8596a0eb/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManager.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManager.java
index 2aeb245..356c2e0 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManager.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManager.java
@@ -44,4 +44,5 @@ public interface ContainerManager extends ServiceStateChangeListener,
 
   ContainerScheduler getContainerScheduler();
 
+  void handleCredentialUpdate();
 }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/8596a0eb/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java
index ad63720..b107200 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/ContainerManagerImpl.java
@@ -21,6 +21,7 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager;
 import com.google.common.annotations.VisibleForTesting;
 import com.google.protobuf.ByteString;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.container.UpdateContainerTokenEvent;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerTokenUpdatedEvent;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.scheduler.ContainerSchedulerEvent;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -170,7 +171,6 @@ import java.nio.ByteBuffer;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.Collection;
-import java.util.EnumSet;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
@@ -214,6 +214,7 @@ public class ContainerManagerImpl extends CompositeService implements
   protected final AsyncDispatcher dispatcher;
 
   private final DeletionService deletionService;
+  private LogHandler logHandler;
   private boolean serviceStopped = false;
   private final ReadLock readLock;
   private final WriteLock writeLock;
@@ -292,7 +293,7 @@ public class ContainerManagerImpl extends CompositeService implements
   @Override
   public void serviceInit(Configuration conf) throws Exception {
 
-    LogHandler logHandler =
+    logHandler =
       createLogHandler(conf, this.context, this.deletionService);
     addIfService(logHandler);
     dispatcher.register(LogHandlerEventType.class, logHandler);
@@ -1904,4 +1905,12 @@ public class ContainerManagerImpl extends CompositeService implements
   public ContainerScheduler getContainerScheduler() {
     return this.containerScheduler;
   }
+
+  @Override
+  public void handleCredentialUpdate() {
+    Set<ApplicationId> invalidApps = logHandler.getInvalidTokenApps();
+    if (!invalidApps.isEmpty()) {
+      dispatcher.getEventHandler().handle(new LogHandlerTokenUpdatedEvent());
+    }
+  }
 }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/8596a0eb/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/AppLogAggregator.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/AppLogAggregator.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/AppLogAggregator.java
index 0178699..93436fa 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/AppLogAggregator.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/AppLogAggregator.java
@@ -18,6 +18,8 @@
 
 package org.apache.hadoop.yarn.server.nodemanager.containermanager.logaggregation;
 
+import org.apache.hadoop.security.Credentials;
+import org.apache.hadoop.security.UserGroupInformation;
 import org.apache.hadoop.yarn.server.api.ContainerLogContext;
 
 public interface AppLogAggregator extends Runnable {
@@ -29,4 +31,10 @@ public interface AppLogAggregator extends Runnable {
   void finishLogAggregation();
 
   void disableLogAggregation();
+
+  void enableLogAggregation();
+
+  boolean isAggregationEnabled();
+
+  UserGroupInformation updateCredentials(Credentials cred);
 }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/8596a0eb/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/AppLogAggregatorImpl.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/AppLogAggregatorImpl.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/AppLogAggregatorImpl.java
index c7e06ff..0470e73 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/AppLogAggregatorImpl.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/AppLogAggregatorImpl.java
@@ -534,6 +534,16 @@ public class AppLogAggregatorImpl implements AppLogAggregator {
     this.logAggregationDisabled = true;
   }
 
+  @Override
+  public void enableLogAggregation() {
+    this.logAggregationDisabled = false;
+  }
+
+  @Override
+  public boolean isAggregationEnabled() {
+    return !logAggregationDisabled;
+  }
+
   @Private
   @VisibleForTesting
   // This is only used for testing.
@@ -616,6 +626,11 @@ public class AppLogAggregatorImpl implements AppLogAggregator {
     return this.userUgi;
   }
 
+  public UserGroupInformation updateCredentials(Credentials cred) {
+    this.userUgi.addCredentials(cred);
+    return userUgi;
+  }
+
   @Private
   @VisibleForTesting
   public int getLogAggregationTimes() {

http://git-wip-us.apache.org/repos/asf/hadoop/blob/8596a0eb/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/LogAggregationService.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/LogAggregationService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/LogAggregationService.java
index 4938939..8ea97d1 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/LogAggregationService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/LogAggregationService.java
@@ -20,10 +20,14 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.logaggregatio
 
 import java.io.IOException;
 import java.util.Map;
+import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ConcurrentMap;
 import java.util.concurrent.ExecutorService;
 import java.util.concurrent.TimeUnit;
+
+import org.apache.hadoop.security.token.SecretManager;
+
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
@@ -58,6 +62,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.eve
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerContainerFinishedEvent;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerEvent;
 
+
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.util.concurrent.ThreadFactoryBuilder;
 
@@ -83,6 +88,9 @@ public class LogAggregationService extends AbstractService implements
 
   private final ConcurrentMap<ApplicationId, AppLogAggregator> appLogAggregators;
 
+  // Holds applications whose aggregation is disable due to invalid Token
+  private final Set<ApplicationId> invalidTokenApps;
+
   @VisibleForTesting
   ExecutorService threadPool;
   
@@ -95,6 +103,7 @@ public class LogAggregationService extends AbstractService implements
     this.dirsHandler = dirsHandler;
     this.appLogAggregators =
         new ConcurrentHashMap<ApplicationId, AppLogAggregator>();
+    this.invalidTokenApps = ConcurrentHashMap.newKeySet();
   }
 
   protected void serviceInit(Configuration conf) throws Exception {
@@ -224,8 +233,8 @@ public class LogAggregationService extends AbstractService implements
       userUgi.addCredentials(credentials);
     }
 
-    LogAggregationFileController logAggregationFileController
-        = getLogAggregationFileController(getConfig());
+    LogAggregationFileController logAggregationFileController =
+        getLogAggregationFileController(getConfig());
     logAggregationFileController.verifyAndCreateRemoteLogDir();
     // New application
     final AppLogAggregator appLogAggregator =
@@ -245,14 +254,16 @@ public class LogAggregationService extends AbstractService implements
       logAggregationFileController.createAppDir(user, appId, userUgi);
     } catch (Exception e) {
       appLogAggregator.disableLogAggregation();
+
+      // add to disabled aggregators if due to InvalidToken
+      if (e.getCause() instanceof SecretManager.InvalidToken) {
+        invalidTokenApps.add(appId);
+      }
       if (!(e instanceof YarnRuntimeException)) {
         appDirException = new YarnRuntimeException(e);
       } else {
         appDirException = (YarnRuntimeException)e;
       }
-      appLogAggregators.remove(appId);
-      closeFileSystems(userUgi);
-      throw appDirException;
     }
 
     // TODO Get the user configuration for the list of containers that need log
@@ -270,6 +281,10 @@ public class LogAggregationService extends AbstractService implements
       }
     };
     this.threadPool.execute(aggregatorWrapper);
+
+    if (appDirException != null) {
+      throw appDirException;
+    }
   }
 
   protected void closeFileSystems(final UserGroupInformation userUgi) {
@@ -307,17 +322,20 @@ public class LogAggregationService extends AbstractService implements
 
     // App is complete. Finish up any containers' pending log aggregation and
     // close the application specific logFile.
-
-    AppLogAggregator aggregator = this.appLogAggregators.get(appId);
-    if (aggregator == null) {
-      LOG.warn("Log aggregation is not initialized for " + appId
-          + ", did it fail to start?");
-      this.dispatcher.getEventHandler().handle(
-          new ApplicationEvent(appId,
-              ApplicationEventType.APPLICATION_LOG_HANDLING_FAILED));
-      return;
+    try {
+      AppLogAggregator aggregator = this.appLogAggregators.get(appId);
+      if (aggregator == null) {
+        LOG.warn("Log aggregation is not initialized for " + appId
+            + ", did it fail to start?");
+        this.dispatcher.getEventHandler().handle(new ApplicationEvent(appId,
+            ApplicationEventType.APPLICATION_LOG_HANDLING_FAILED));
+        return;
+      }
+      aggregator.finishLogAggregation();
+    } finally {
+      // Remove invalid Token Apps
+      invalidTokenApps.remove(appId);
     }
-    aggregator.finishLogAggregation();
   }
 
   @Override
@@ -344,12 +362,47 @@ public class LogAggregationService extends AbstractService implements
             (LogHandlerAppFinishedEvent) event;
         stopApp(appFinishedEvent.getApplicationId());
         break;
+      case LOG_AGG_TOKEN_UPDATE:
+        checkAndEnableAppAggregators();
+        break;
       default:
         ; // Ignore
     }
 
   }
 
+  private void checkAndEnableAppAggregators() {
+    for (ApplicationId appId : invalidTokenApps) {
+      try {
+        AppLogAggregator aggregator = appLogAggregators.get(appId);
+        if (aggregator != null) {
+          Credentials credentials =
+              context.getSystemCredentialsForApps().get(appId);
+          if (credentials != null) {
+            // Create the app dir again with
+            LogAggregationFileController logAggregationFileController =
+                getLogAggregationFileController(getConfig());
+            UserGroupInformation userUgi =
+                aggregator.updateCredentials(credentials);
+            logAggregationFileController
+                .createAppDir(userUgi.getShortUserName(), appId, userUgi);
+            aggregator.enableLogAggregation();
+          }
+          invalidTokenApps.remove(appId);
+          LOG.info("LogAggregation enabled for application {}", appId);
+        }
+      } catch (Exception e) {
+        //Ignore exception
+        LOG.warn("Enable aggregators failed {}", appId);
+      }
+    }
+  }
+
+  @Override
+  public Set<ApplicationId> getInvalidTokenApps() {
+    return invalidTokenApps;
+  }
+
   @VisibleForTesting
   public ConcurrentMap<ApplicationId, AppLogAggregator> getAppLogAggregators() {
     return this.appLogAggregators;

http://git-wip-us.apache.org/repos/asf/hadoop/blob/8596a0eb/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/loghandler/LogHandler.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/loghandler/LogHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/loghandler/LogHandler.java
index 6eb3fb4..459fdf4 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/loghandler/LogHandler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/loghandler/LogHandler.java
@@ -18,9 +18,16 @@
 
 package org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler;
 
+import org.apache.hadoop.yarn.api.records.ApplicationId;
 import org.apache.hadoop.yarn.event.EventHandler;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerEvent;
 
+
+
+import java.util.Set;
+
 public interface LogHandler extends EventHandler<LogHandlerEvent> {
   public void handle(LogHandlerEvent event);
+
+  public Set<ApplicationId> getInvalidTokenApps();
 }
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/hadoop/blob/8596a0eb/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/loghandler/NonAggregatingLogHandler.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/loghandler/NonAggregatingLogHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/loghandler/NonAggregatingLogHandler.java
index 9c43dde..d66aa12 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/loghandler/NonAggregatingLogHandler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/loghandler/NonAggregatingLogHandler.java
@@ -19,8 +19,12 @@ package org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler;
 
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.Collections;
+import java.util.HashMap;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.ScheduledThreadPoolExecutor;
 import java.util.concurrent.ThreadFactory;
@@ -204,6 +208,11 @@ public class NonAggregatingLogHandler extends AbstractService implements
     }
   }
 
+  @Override
+  public Set<ApplicationId> getInvalidTokenApps() {
+    return Collections.emptySet();
+  }
+
   ScheduledThreadPoolExecutor createScheduledThreadPoolExecutor(
       Configuration conf) {
     ThreadFactory tf =

http://git-wip-us.apache.org/repos/asf/hadoop/blob/8596a0eb/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/loghandler/event/LogHandlerEventType.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/loghandler/event/LogHandlerEventType.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/loghandler/event/LogHandlerEventType.java
index 684d6b2..ec477c2 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/loghandler/event/LogHandlerEventType.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/loghandler/event/LogHandlerEventType.java
@@ -19,5 +19,7 @@
 package org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event;
 
 public enum LogHandlerEventType {
-  APPLICATION_STARTED, CONTAINER_FINISHED, APPLICATION_FINISHED
+  APPLICATION_STARTED,
+  CONTAINER_FINISHED,
+  APPLICATION_FINISHED, LOG_AGG_TOKEN_UPDATE
 }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/8596a0eb/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/loghandler/event/LogHandlerTokenUpdatedEvent.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/loghandler/event/LogHandlerTokenUpdatedEvent.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/loghandler/event/LogHandlerTokenUpdatedEvent.java
new file mode 100644
index 0000000..772a463
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/loghandler/event/LogHandlerTokenUpdatedEvent.java
@@ -0,0 +1,26 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event;
+
+public class LogHandlerTokenUpdatedEvent extends LogHandlerEvent {
+
+  public LogHandlerTokenUpdatedEvent() {
+    super(LogHandlerEventType.LOG_AGG_TOKEN_UPDATE);
+  }
+
+}

http://git-wip-us.apache.org/repos/asf/hadoop/blob/8596a0eb/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/DummyContainerManager.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/DummyContainerManager.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/DummyContainerManager.java
index b5cb43b..feabeb1 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/DummyContainerManager.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/DummyContainerManager.java
@@ -24,6 +24,8 @@ import org.slf4j.LoggerFactory;
 
 import java.io.IOException;
 import java.util.Collection;
+import java.util.Collections;
+import java.util.Set;
 
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.fs.Path;
@@ -187,6 +189,11 @@ public class DummyContainerManager extends ContainerManagerImpl {
             // Ignore
           }
       }
+
+      @Override
+      public Set<ApplicationId> getInvalidTokenApps() {
+        return Collections.emptySet();
+      }
     };
   }
 

http://git-wip-us.apache.org/repos/asf/hadoop/blob/8596a0eb/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/TestLogAggregationService.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/TestLogAggregationService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/TestLogAggregationService.java
index 05cdd49..dd83d99 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/TestLogAggregationService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/containermanager/logaggregation/TestLogAggregationService.java
@@ -73,6 +73,7 @@ import org.apache.hadoop.fs.permission.FsPermission;
 import org.apache.hadoop.io.Text;
 import org.apache.hadoop.security.Credentials;
 import org.apache.hadoop.security.UserGroupInformation;
+import org.apache.hadoop.security.token.SecretManager;
 import org.apache.hadoop.security.token.Token;
 import org.apache.hadoop.test.GenericTestUtils;
 import org.apache.hadoop.yarn.api.protocolrecords.StartContainerRequest;
@@ -128,6 +129,7 @@ import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.Tes
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerAppFinishedEvent;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerAppStartedEvent;
 import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerContainerFinishedEvent;
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.loghandler.event.LogHandlerTokenUpdatedEvent;
 import org.apache.hadoop.yarn.server.nodemanager.executor.DeletionAsUserContext;
 import org.apache.hadoop.yarn.server.utils.BuilderUtils;
 import org.apache.hadoop.yarn.util.ConverterUtils;
@@ -823,7 +825,8 @@ public class TestLogAggregationService extends BaseContainerManagerTest {
         .getFileControllerForWrite();
     LogAggregationFileController spyLogAggregationFileFormat =
         spy(logAggregationFileFormat);
-    Exception e = new RuntimeException("KABOOM!");
+    Exception e =
+        new YarnRuntimeException(new SecretManager.InvalidToken("KABOOM!"));
     doThrow(e).when(spyLogAggregationFileFormat)
         .createAppDir(any(String.class), any(ApplicationId.class),
             any(UserGroupInformation.class));
@@ -862,29 +865,40 @@ public class TestLogAggregationService extends BaseContainerManagerTest {
     };
     checkEvents(appEventHandler, expectedEvents, false,
         "getType", "getApplicationID", "getDiagnostic");
-
+    Assert.assertEquals(logAggregationService.getInvalidTokenApps().size(), 1);
     // verify trying to collect logs for containers/apps we don't know about
     // doesn't blow up and tear down the NM
     logAggregationService.handle(new LogHandlerContainerFinishedEvent(
         BuilderUtils.newContainerId(4, 1, 1, 1),
         ContainerType.APPLICATION_MASTER, 0));
     dispatcher.await();
+
+    AppLogAggregator appAgg =
+        logAggregationService.getAppLogAggregators().get(appId);
+    Assert.assertFalse("Aggregation should be disabled",
+        appAgg.isAggregationEnabled());
+
+    // Enabled aggregation
+    logAggregationService.handle(new LogHandlerTokenUpdatedEvent());
+    dispatcher.await();
+
+    appAgg =
+        logAggregationService.getAppLogAggregators().get(appId);
+    Assert.assertFalse("Aggregation should be enabled",
+        appAgg.isAggregationEnabled());
+
+    // Check disabled apps are cleared
+    Assert.assertEquals(0, logAggregationService.getInvalidTokenApps().size());
+
     logAggregationService.handle(new LogHandlerAppFinishedEvent(
         BuilderUtils.newApplicationId(1, 5)));
     dispatcher.await();
 
     logAggregationService.stop();
     assertEquals(0, logAggregationService.getNumAggregators());
-    // local log dir shouldn't be deleted given log aggregation cannot
-    // continue due to aggregated log dir creation failure on remoteFS.
-    FileDeletionTask deletionTask = new FileDeletionTask(spyDelSrvc, user,
-        null, null);
-    verify(spyDelSrvc, never()).delete(deletionTask);
+    verify(spyDelSrvc).delete(any(FileDeletionTask.class));
     verify(logAggregationService).closeFileSystems(
         any(UserGroupInformation.class));
-    // make sure local log dir is not deleted in case log aggregation
-    // service cannot be initiated.
-    assertTrue(appLogDir.exists());
   }
 
   private void writeContainerLogs(File appLogDir, ContainerId containerId,


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org


[6/7] hadoop git commit: YARN-8508. Release GPU resource for killed container. Contributed by Chandni Singh

Posted by wa...@apache.org.
YARN-8508.  Release GPU resource for killed container.
            Contributed by Chandni Singh

(cherry picked from commit ed9d60e888d0acfd748fda7f66249f5b79a3ed6d)
(cherry picked from commit c2c3eee69c8b389525fbde800e057dbcb2fc643e)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/17625e40
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/17625e40
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/17625e40

Branch: refs/heads/branch-3.1.1
Commit: 17625e40f61ac31fc87f4a74d66388dbe508d33d
Parents: 096c139
Author: Eric Yang <ey...@apache.org>
Authored: Fri Jul 27 19:33:58 2018 -0400
Committer: Wangda Tan <wa...@apache.org>
Committed: Tue Jul 31 12:06:54 2018 -0700

----------------------------------------------------------------------
 .../nodemanager/LinuxContainerExecutor.java     | 34 ++++++++++----------
 .../nodemanager/TestLinuxContainerExecutor.java |  9 +++++-
 2 files changed, 25 insertions(+), 18 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/17625e40/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java
index 03b88a4..4253f2f 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/LinuxContainerExecutor.java
@@ -573,15 +573,7 @@ public class LinuxContainerExecutor extends ContainerExecutor {
       return handleExitCode(e, container, containerId);
     } finally {
       resourcesHandler.postExecute(containerId);
-
-      try {
-        if (resourceHandlerChain != null) {
-          resourceHandlerChain.postComplete(containerId);
-        }
-      } catch (ResourceHandlerException e) {
-        LOG.warn("ResourceHandlerChain.postComplete failed for " +
-            "containerId: " + containerId + ". Exception: " + e);
-      }
+      postComplete(containerId);
     }
 
     return 0;
@@ -721,14 +713,7 @@ public class LinuxContainerExecutor extends ContainerExecutor {
       return super.reacquireContainer(ctx);
     } finally {
       resourcesHandler.postExecute(containerId);
-      if (resourceHandlerChain != null) {
-        try {
-          resourceHandlerChain.postComplete(containerId);
-        } catch (ResourceHandlerException e) {
-          LOG.warn("ResourceHandlerChain.postComplete failed for " +
-              "containerId: " + containerId + " Exception: " + e);
-        }
-      }
+      postComplete(containerId);
     }
   }
 
@@ -798,6 +783,8 @@ public class LinuxContainerExecutor extends ContainerExecutor {
       logOutput(e.getOutput());
       throw new IOException("Error in reaping container "
           + container.getContainerId().toString() + " exit = " + retCode, e);
+    } finally {
+      postComplete(container.getContainerId());
     }
     return true;
   }
@@ -968,4 +955,17 @@ public class LinuxContainerExecutor extends ContainerExecutor {
       LOG.warn("Unable to remove docker container: " + containerId);
     }
   }
+
+  @VisibleForTesting
+  void postComplete(final ContainerId containerId) {
+    try {
+      if (resourceHandlerChain != null) {
+        LOG.debug("{} post complete", containerId);
+        resourceHandlerChain.postComplete(containerId);
+      }
+    } catch (ResourceHandlerException e) {
+      LOG.warn("ResourceHandlerChain.postComplete failed for " +
+          "containerId: {}. Exception: ", containerId, e);
+    }
+  }
 }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/17625e40/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
index ddbf3b9..6d77fc4 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/TestLinuxContainerExecutor.java
@@ -25,11 +25,14 @@ import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertTrue;
 import static org.junit.Assert.fail;
+import static org.mockito.Matchers.anyObject;
 import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.spy;
 import static org.mockito.Mockito.times;
 import static org.mockito.Mockito.verify;
 import static org.mockito.Mockito.when;
 
+import org.apache.hadoop.yarn.server.nodemanager.containermanager.linux.runtime.LinuxContainerRuntime;
 import org.apache.hadoop.yarn.server.nodemanager.executor.ContainerReapContext;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -40,6 +43,7 @@ import java.io.IOException;
 import java.io.PrintWriter;
 import java.net.InetSocketAddress;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.EnumSet;
 import java.util.HashMap;
 import java.util.HashSet;
@@ -667,12 +671,15 @@ public class TestLinuxContainerExecutor {
   @Test
   public void testReapContainer() throws Exception {
     Container container = mock(Container.class);
-    LinuxContainerExecutor lce = mock(LinuxContainerExecutor.class);
+    LinuxContainerRuntime containerRuntime = mock(LinuxContainerRuntime.class);
+    LinuxContainerExecutor lce = spy(new LinuxContainerExecutor(
+        containerRuntime));
     ContainerReapContext.Builder builder =  new ContainerReapContext.Builder();
     builder.setContainer(container).setUser("foo");
     ContainerReapContext ctx = builder.build();
     lce.reapContainer(ctx);
     verify(lce, times(1)).reapContainer(ctx);
+    verify(lce, times(1)).postComplete(anyObject());
   }
 
   @Test


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org


[4/7] hadoop git commit: YARN-8545. Return allocated resource to RM for failed container. Contributed by Chandni Singh

Posted by wa...@apache.org.
YARN-8545.  Return allocated resource to RM for failed container.
            Contributed by Chandni Singh

(cherry picked from commit 40fad32824d2f8f960c779d78357e62103453da0)
(cherry picked from commit 177f6045ac4ae6e2dbae2e04da8c9cebb5da8748)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/bae40602
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/bae40602
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/bae40602

Branch: refs/heads/branch-3.1.1
Commit: bae40602fa7aa0c30e7305626acf6c01cde0a129
Parents: 49795e9
Author: Eric Yang <ey...@apache.org>
Authored: Thu Jul 26 18:22:57 2018 -0400
Committer: Wangda Tan <wa...@apache.org>
Committed: Tue Jul 31 12:06:39 2018 -0700

----------------------------------------------------------------------
 .../hadoop/yarn/service/ServiceScheduler.java   |  3 +-
 .../yarn/service/component/Component.java       | 42 +++++++++++---------
 .../component/instance/ComponentInstance.java   | 21 +++++++---
 .../instance/ComponentInstanceEvent.java        |  2 +
 .../containerlaunch/ContainerLaunchService.java | 12 ++++--
 .../hadoop/yarn/service/MockServiceAM.java      | 34 +++++++++++++++-
 .../hadoop/yarn/service/TestServiceAM.java      | 35 ++++++++++++++++
 .../yarn/service/component/TestComponent.java   |  3 +-
 .../instance/TestComponentInstance.java         | 26 ++++++------
 9 files changed, 135 insertions(+), 43 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/bae40602/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/ServiceScheduler.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/ServiceScheduler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/ServiceScheduler.java
index d3e8e4f..cfaf356 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/ServiceScheduler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/ServiceScheduler.java
@@ -687,7 +687,8 @@ public class ServiceScheduler extends CompositeService {
         }
         ComponentEvent event =
             new ComponentEvent(instance.getCompName(), CONTAINER_COMPLETED)
-                .setStatus(status).setInstance(instance);
+                .setStatus(status).setInstance(instance)
+                .setContainerId(containerId);
         dispatcher.getEventHandler().handle(event);
       }
     }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/bae40602/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/Component.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/Component.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/Component.java
index a1ee796..aaa23da 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/Component.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/Component.java
@@ -19,6 +19,7 @@
 package org.apache.hadoop.yarn.service.component;
 
 import com.google.common.annotations.VisibleForTesting;
+import com.google.common.base.Preconditions;
 import org.apache.hadoop.yarn.api.records.Container;
 import org.apache.hadoop.yarn.api.records.ContainerStatus;
 import org.apache.hadoop.yarn.api.records.ExecutionType;
@@ -518,10 +519,10 @@ public class Component implements EventHandler<ComponentEvent> {
   private static class ContainerCompletedTransition extends BaseTransition {
     @Override
     public void transition(Component component, ComponentEvent event) {
-
+      Preconditions.checkNotNull(event.getContainerId());
       component.updateMetrics(event.getStatus());
       component.dispatcher.getEventHandler().handle(
-          new ComponentInstanceEvent(event.getStatus().getContainerId(), STOP)
+          new ComponentInstanceEvent(event.getContainerId(), STOP)
               .setStatus(event.getStatus()));
 
       ComponentRestartPolicy restartPolicy =
@@ -784,28 +785,33 @@ public class Component implements EventHandler<ComponentEvent> {
   }
 
   private void updateMetrics(ContainerStatus status) {
-    switch (status.getExitStatus()) {
-    case SUCCESS:
-      componentMetrics.containersSucceeded.incr();
-      scheduler.getServiceMetrics().containersSucceeded.incr();
-      return;
-    case PREEMPTED:
-      componentMetrics.containersPreempted.incr();
-      scheduler.getServiceMetrics().containersPreempted.incr();
-      break;
-    case DISKS_FAILED:
-      componentMetrics.containersDiskFailure.incr();
-      scheduler.getServiceMetrics().containersDiskFailure.incr();
-      break;
-    default:
-      break;
+    //when a container preparation fails while building launch context, then
+    //the container status may not exist.
+    if (status != null) {
+      switch (status.getExitStatus()) {
+        case SUCCESS:
+          componentMetrics.containersSucceeded.incr();
+          scheduler.getServiceMetrics().containersSucceeded.incr();
+          return;
+        case PREEMPTED:
+          componentMetrics.containersPreempted.incr();
+          scheduler.getServiceMetrics().containersPreempted.incr();
+          break;
+        case DISKS_FAILED:
+          componentMetrics.containersDiskFailure.incr();
+          scheduler.getServiceMetrics().containersDiskFailure.incr();
+          break;
+        default:
+          break;
+      }
     }
 
     // containersFailed include preempted, disks_failed etc.
     componentMetrics.containersFailed.incr();
     scheduler.getServiceMetrics().containersFailed.incr();
 
-    if (Apps.shouldCountTowardsNodeBlacklisting(status.getExitStatus())) {
+    if (status != null && Apps.shouldCountTowardsNodeBlacklisting(
+        status.getExitStatus())) {
       String host = scheduler.getLiveInstances().get(status.getContainerId())
           .getNodeId().getHost();
       failureTracker.incNodeFailure(host);

http://git-wip-us.apache.org/repos/asf/hadoop/blob/bae40602/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/instance/ComponentInstance.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/instance/ComponentInstance.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/instance/ComponentInstance.java
index 529596d..44ae1e7 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/instance/ComponentInstance.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/instance/ComponentInstance.java
@@ -76,6 +76,8 @@ public class ComponentInstance implements EventHandler<ComponentInstanceEvent>,
     Comparable<ComponentInstance> {
   private static final Logger LOG =
       LoggerFactory.getLogger(ComponentInstance.class);
+  private static final String FAILED_BEFORE_LAUNCH_DIAG =
+      "failed before launch";
 
   private  StateMachine<ComponentInstanceState, ComponentInstanceEventType,
       ComponentInstanceEvent> stateMachine;
@@ -236,7 +238,8 @@ public class ComponentInstance implements EventHandler<ComponentInstanceEvent>,
 
   @VisibleForTesting
   static void handleComponentInstanceRelaunch(
-      ComponentInstance compInstance, ComponentInstanceEvent event) {
+      ComponentInstance compInstance, ComponentInstanceEvent event,
+      boolean failureBeforeLaunch) {
     Component comp = compInstance.getComponent();
 
     // Do we need to relaunch the service?
@@ -252,8 +255,10 @@ public class ComponentInstance implements EventHandler<ComponentInstanceEvent>,
               + ": {} completed. Reinsert back to pending list and requested " +
               "a new container." + System.lineSeparator() +
               " exitStatus={}, diagnostics={}.",
-          event.getContainerId(), event.getStatus().getExitStatus(),
-          event.getStatus().getDiagnostics());
+          event.getContainerId(), failureBeforeLaunch ? null :
+              event.getStatus().getExitStatus(),
+          failureBeforeLaunch ? FAILED_BEFORE_LAUNCH_DIAG :
+              event.getStatus().getDiagnostics());
     } else {
       // When no relaunch, update component's #succeeded/#failed
       // instances.
@@ -292,8 +297,8 @@ public class ComponentInstance implements EventHandler<ComponentInstanceEvent>,
 
       Component comp = compInstance.component;
       String containerDiag =
-          compInstance.getCompInstanceId() + ": " + event.getStatus()
-              .getDiagnostics();
+          compInstance.getCompInstanceId() + ": " + (failedBeforeLaunching ?
+              FAILED_BEFORE_LAUNCH_DIAG : event.getStatus().getDiagnostics());
       compInstance.diagnostics.append(containerDiag + System.lineSeparator());
       compInstance.cancelContainerStatusRetriever();
       if (compInstance.getState().equals(ComponentInstanceState.UPGRADING)) {
@@ -307,6 +312,9 @@ public class ComponentInstance implements EventHandler<ComponentInstanceEvent>,
       boolean shouldFailService = false;
 
       final ServiceScheduler scheduler = comp.getScheduler();
+      scheduler.getAmRMClient().releaseAssignedContainer(
+          event.getContainerId());
+
       // Check if it exceeds the failure threshold, but only if health threshold
       // monitor is not enabled
       if (!comp.isHealthThresholdMonitorEnabled()
@@ -347,7 +355,8 @@ public class ComponentInstance implements EventHandler<ComponentInstanceEvent>,
 
       // According to component restart policy, handle container restart
       // or finish the service (if all components finished)
-      handleComponentInstanceRelaunch(compInstance, event);
+      handleComponentInstanceRelaunch(compInstance, event,
+          failedBeforeLaunching);
 
       if (shouldFailService) {
         scheduler.getTerminationHandler().terminate(-1);

http://git-wip-us.apache.org/repos/asf/hadoop/blob/bae40602/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/instance/ComponentInstanceEvent.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/instance/ComponentInstanceEvent.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/instance/ComponentInstanceEvent.java
index 707b034..889da6e 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/instance/ComponentInstanceEvent.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/component/instance/ComponentInstanceEvent.java
@@ -18,6 +18,7 @@
 
 package org.apache.hadoop.yarn.service.component.instance;
 
+import com.google.common.base.Preconditions;
 import org.apache.hadoop.yarn.api.records.ContainerId;
 import org.apache.hadoop.yarn.api.records.ContainerStatus;
 import org.apache.hadoop.yarn.event.AbstractEvent;
@@ -32,6 +33,7 @@ public class ComponentInstanceEvent
   public ComponentInstanceEvent(ContainerId containerId,
       ComponentInstanceEventType componentInstanceEventType) {
     super(componentInstanceEventType);
+    Preconditions.checkNotNull(containerId);
     this.id = containerId;
   }
 

http://git-wip-us.apache.org/repos/asf/hadoop/blob/bae40602/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/containerlaunch/ContainerLaunchService.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/containerlaunch/ContainerLaunchService.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/containerlaunch/ContainerLaunchService.java
index 084c721..f674e0d 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/containerlaunch/ContainerLaunchService.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/main/java/org/apache/hadoop/yarn/service/containerlaunch/ContainerLaunchService.java
@@ -22,8 +22,11 @@ import com.google.common.base.Preconditions;
 import org.apache.hadoop.conf.Configuration;
 import org.apache.hadoop.service.AbstractService;
 import org.apache.hadoop.yarn.api.records.Container;
+import org.apache.hadoop.yarn.api.records.ContainerStatus;
 import org.apache.hadoop.yarn.service.ServiceContext;
 import org.apache.hadoop.yarn.service.api.records.Artifact;
+import org.apache.hadoop.yarn.service.component.ComponentEvent;
+import org.apache.hadoop.yarn.service.component.ComponentEventType;
 import org.apache.hadoop.yarn.service.component.instance.ComponentInstance;
 import org.apache.hadoop.yarn.service.provider.ProviderService;
 import org.apache.hadoop.yarn.service.provider.ProviderFactory;
@@ -116,9 +119,12 @@ public class ContainerLaunchService extends AbstractService{
                   launcher.completeContainerLaunch(), true);
         }
       } catch (Exception e) {
-        LOG.error(instance.getCompInstanceId()
-            + ": Failed to launch container. ", e);
-
+        LOG.error("{}: Failed to launch container.",
+            instance.getCompInstanceId(), e);
+        ComponentEvent event = new ComponentEvent(instance.getCompName(),
+            ComponentEventType.CONTAINER_COMPLETED)
+            .setInstance(instance).setContainerId(container.getId());
+        context.scheduler.getDispatcher().getEventHandler().handle(event);
       }
     }
   }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/bae40602/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/MockServiceAM.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/MockServiceAM.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/MockServiceAM.java
index 4a75aef..729287c 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/MockServiceAM.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/MockServiceAM.java
@@ -68,6 +68,7 @@ import java.util.Iterator;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.TimeoutException;
 
@@ -99,6 +100,8 @@ public class MockServiceAM extends ServiceMaster {
   private Map<ContainerId, ContainerStatus> containerStatuses =
       new ConcurrentHashMap<>();
 
+  private Set<ContainerId> releasedContainers = ConcurrentHashMap.newKeySet();
+
   private Credentials amCreds;
 
   public MockServiceAM(Service service) {
@@ -223,6 +226,13 @@ public class MockServiceAM extends ServiceMaster {
             return response;
           }
 
+          @Override
+          public synchronized void releaseAssignedContainer(
+              ContainerId containerId) {
+            releasedContainers.add(containerId);
+            super.releaseAssignedContainer(containerId);
+          }
+
           @Override public void unregisterApplicationMaster(
               FinalApplicationStatus appStatus, String appMessage,
               String appTrackingUrl) {
@@ -288,7 +298,7 @@ public class MockServiceAM extends ServiceMaster {
   }
 
   /**
-   *
+   * Creates a mock container and container ID and feeds to the component.
    * @param service The service for the component
    * @param id The id for the container
    * @param compName The component to which the container is fed
@@ -297,6 +307,18 @@ public class MockServiceAM extends ServiceMaster {
   public Container feedContainerToComp(Service service, int id,
       String compName) {
     ContainerId containerId = createContainerId(id);
+    return feedContainerToComp(service, containerId, compName);
+  }
+
+  /**
+   * Feeds the container to the component.
+   * @param service The service for the component
+   * @param containerId container id
+   * @param compName The component to which the container is fed
+   * @return
+   */
+  public Container feedContainerToComp(Service service, ContainerId containerId,
+      String compName) {
     Container container = createContainer(containerId, compName);
     synchronized (feedContainers) {
       feedContainers.add(container);
@@ -423,4 +445,14 @@ public class MockServiceAM extends ServiceMaster {
     }
     return ByteBuffer.wrap(dob.getData(), 0, dob.getLength());
   }
+
+  /**
+   * Waits for the container to get released
+   * @param containerId           ContainerId
+   */
+  public void waitForContainerToRelease(ContainerId containerId)
+      throws TimeoutException, InterruptedException {
+    GenericTestUtils.waitFor(() -> releasedContainers.contains(containerId),
+        1000, 9990000);
+  }
 }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/bae40602/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/TestServiceAM.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/TestServiceAM.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/TestServiceAM.java
index e9478f0..21e93fa 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/TestServiceAM.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/TestServiceAM.java
@@ -18,6 +18,7 @@
 
 package org.apache.hadoop.yarn.service;
 
+import com.google.common.base.Supplier;
 import com.google.common.collect.ImmutableMap;
 import org.apache.commons.io.FileUtils;
 import org.apache.curator.test.TestingCluster;
@@ -391,4 +392,38 @@ public class TestServiceAM extends ServiceTestUtils{
         .equals("newer.host"), 2000, 200000);
     am.stop();
   }
+
+  // Test to verify that the containers are released and the
+  // component instance is added to the pending queue when building the launch
+  // context fails.
+  @Test(timeout = 9990000)
+  public void testContainersReleasedWhenPreLaunchFails()
+      throws Exception {
+    ApplicationId applicationId = ApplicationId.newInstance(
+        System.currentTimeMillis(), 1);
+    Service exampleApp = new Service();
+    exampleApp.setId(applicationId.toString());
+    exampleApp.setVersion("v1");
+    exampleApp.setName("testContainersReleasedWhenPreLaunchFails");
+
+    Component compA = createComponent("compa", 1, "pwd");
+    Artifact artifact = new Artifact();
+    artifact.setType(Artifact.TypeEnum.TARBALL);
+    compA.artifact(artifact);
+    exampleApp.addComponent(compA);
+
+    MockServiceAM am = new MockServiceAM(exampleApp);
+    am.init(conf);
+    am.start();
+
+    ContainerId containerId = am.createContainerId(1);
+
+    // allocate a container
+    am.feedContainerToComp(exampleApp, containerId, "compa");
+    am.waitForContainerToRelease(containerId);
+
+    Assert.assertEquals(1,
+        am.getComponent("compa").getPendingInstances().size());
+    am.stop();
+  }
 }

http://git-wip-us.apache.org/repos/asf/hadoop/blob/bae40602/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/component/TestComponent.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/component/TestComponent.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/component/TestComponent.java
index d7c15ec..a3b86a7 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/component/TestComponent.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/component/TestComponent.java
@@ -196,7 +196,8 @@ public class TestComponent {
           org.apache.hadoop.yarn.api.records.ContainerState.COMPLETE,
           "successful", 0);
       comp.handle(new ComponentEvent(comp.getName(),
-          ComponentEventType.CONTAINER_COMPLETED).setStatus(containerStatus));
+          ComponentEventType.CONTAINER_COMPLETED).setStatus(containerStatus)
+          .setContainerId(instanceContainer.getId()));
       componentInstance.handle(
           new ComponentInstanceEvent(componentInstance.getContainer().getId(),
               ComponentInstanceEventType.STOP).setStatus(containerStatus));

http://git-wip-us.apache.org/repos/asf/hadoop/blob/bae40602/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/component/instance/TestComponentInstance.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/component/instance/TestComponentInstance.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/component/instance/TestComponentInstance.java
index 26e8c93..95d774a 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/component/instance/TestComponentInstance.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-applications/hadoop-yarn-services/hadoop-yarn-services-core/src/test/java/org/apache/hadoop/yarn/service/component/instance/TestComponentInstance.java
@@ -245,7 +245,7 @@ public class TestComponentInstance {
         comp.getAllComponentInstances().iterator().next();
 
     ComponentInstance.handleComponentInstanceRelaunch(componentInstance,
-        componentInstanceEvent);
+        componentInstanceEvent, false);
 
     verify(comp, never()).markAsSucceeded(any(ComponentInstance.class));
     verify(comp, never()).markAsFailed(any(ComponentInstance.class));
@@ -262,7 +262,7 @@ public class TestComponentInstance {
     componentInstance = comp.getAllComponentInstances().iterator().next();
     containerStatus.setExitStatus(1);
     ComponentInstance.handleComponentInstanceRelaunch(componentInstance,
-        componentInstanceEvent);
+        componentInstanceEvent, false);
     verify(comp, never()).markAsSucceeded(any(ComponentInstance.class));
     verify(comp, never()).markAsFailed(any(ComponentInstance.class));
     verify(comp, times(1)).reInsertPendingInstance(
@@ -286,7 +286,7 @@ public class TestComponentInstance {
     when(comp.getNumSucceededInstances()).thenReturn(new Long(1));
 
     ComponentInstance.handleComponentInstanceRelaunch(componentInstance,
-        componentInstanceEvent);
+        componentInstanceEvent, false);
     verify(comp, times(1)).markAsSucceeded(any(ComponentInstance.class));
     verify(comp, never()).markAsFailed(any(ComponentInstance.class));
     verify(comp, times(0)).reInsertPendingInstance(
@@ -304,7 +304,7 @@ public class TestComponentInstance {
 
     when(comp.getNumFailedInstances()).thenReturn(new Long(1));
     ComponentInstance.handleComponentInstanceRelaunch(componentInstance,
-        componentInstanceEvent);
+        componentInstanceEvent, false);
 
     verify(comp, never()).markAsSucceeded(any(ComponentInstance.class));
     verify(comp, times(1)).markAsFailed(any(ComponentInstance.class));
@@ -323,7 +323,7 @@ public class TestComponentInstance {
     componentInstance = comp.getAllComponentInstances().iterator().next();
     containerStatus.setExitStatus(1);
     ComponentInstance.handleComponentInstanceRelaunch(componentInstance,
-        componentInstanceEvent);
+        componentInstanceEvent, false);
     verify(comp, never()).markAsSucceeded(any(ComponentInstance.class));
     verify(comp, never()).markAsFailed(any(ComponentInstance.class));
     verify(comp, times(1)).reInsertPendingInstance(
@@ -340,7 +340,7 @@ public class TestComponentInstance {
     componentInstance = comp.getAllComponentInstances().iterator().next();
     containerStatus.setExitStatus(1);
     ComponentInstance.handleComponentInstanceRelaunch(componentInstance,
-        componentInstanceEvent);
+        componentInstanceEvent, false);
     verify(comp, never()).markAsSucceeded(any(ComponentInstance.class));
     verify(comp, times(1)).markAsFailed(any(ComponentInstance.class));
     verify(comp, times(0)).reInsertPendingInstance(
@@ -363,7 +363,7 @@ public class TestComponentInstance {
     containerStatus.setExitStatus(1);
     ComponentInstance commponentInstance = iter.next();
     ComponentInstance.handleComponentInstanceRelaunch(commponentInstance,
-        componentInstanceEvent);
+        componentInstanceEvent, false);
 
     verify(comp, never()).markAsSucceeded(any(ComponentInstance.class));
     verify(comp, never()).markAsFailed(any(ComponentInstance.class));
@@ -404,7 +404,7 @@ public class TestComponentInstance {
       when(component2Instance.getComponent().getNumFailedInstances())
           .thenReturn(new Long(failed2Instances.size()));
       ComponentInstance.handleComponentInstanceRelaunch(component2Instance,
-          componentInstanceEvent);
+          componentInstanceEvent, false);
     }
 
     Map<String, ComponentInstance> failed1Instances = new HashMap<>();
@@ -418,7 +418,7 @@ public class TestComponentInstance {
       when(component1Instance.getComponent().getNumFailedInstances())
           .thenReturn(new Long(failed1Instances.size()));
       ComponentInstance.handleComponentInstanceRelaunch(component1Instance,
-          componentInstanceEvent);
+          componentInstanceEvent, false);
     }
 
     verify(comp, never()).markAsSucceeded(any(ComponentInstance.class));
@@ -458,7 +458,7 @@ public class TestComponentInstance {
       when(component2Instance.getComponent().getNumSucceededInstances())
           .thenReturn(new Long(succeeded2Instances.size()));
       ComponentInstance.handleComponentInstanceRelaunch(component2Instance,
-          componentInstanceEvent);
+          componentInstanceEvent, false);
     }
 
     Map<String, ComponentInstance> succeeded1Instances = new HashMap<>();
@@ -471,7 +471,7 @@ public class TestComponentInstance {
       when(component1Instance.getComponent().getNumSucceededInstances())
           .thenReturn(new Long(succeeded1Instances.size()));
       ComponentInstance.handleComponentInstanceRelaunch(component1Instance,
-          componentInstanceEvent);
+          componentInstanceEvent, false);
     }
 
     verify(comp, times(2)).markAsSucceeded(any(ComponentInstance.class));
@@ -500,7 +500,7 @@ public class TestComponentInstance {
 
     for (ComponentInstance component2Instance : component2Instances) {
       ComponentInstance.handleComponentInstanceRelaunch(component2Instance,
-          componentInstanceEvent);
+          componentInstanceEvent, false);
     }
 
     succeeded1Instances = new HashMap<>();
@@ -511,7 +511,7 @@ public class TestComponentInstance {
       when(component1Instance.getComponent().getSucceededInstances())
           .thenReturn(succeeded1Instances.values());
       ComponentInstance.handleComponentInstanceRelaunch(component1Instance,
-          componentInstanceEvent);
+          componentInstanceEvent, false);
     }
 
     verify(comp, times(2)).markAsSucceeded(any(ComponentInstance.class));


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org


[2/7] hadoop git commit: YARN-8301. Added YARN service upgrade instructions. Contributed by Chandni Singh

Posted by wa...@apache.org.
YARN-8301.  Added YARN service upgrade instructions.
            Contributed by Chandni Singh

(cherry picked from commit 10014a4d88f239d3c072e51bc0739cba1fca9406)
(cherry picked from commit 4f2a129f2e2326ca28659d93b412cf8649ed5025)


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/7cb37ed6
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/7cb37ed6
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/7cb37ed6

Branch: refs/heads/branch-3.1.1
Commit: 7cb37ed66bc830af41b8fb87207023e74ac5386f
Parents: 0c3d3bc
Author: Eric Yang <ey...@apache.org>
Authored: Fri Jul 20 19:46:35 2018 -0400
Committer: Wangda Tan <wa...@apache.org>
Committed: Tue Jul 31 12:06:23 2018 -0700

----------------------------------------------------------------------
 .../src/site/markdown/yarn-service/Overview.md  |   4 +-
 .../markdown/yarn-service/ServiceUpgrade.md     | 197 +++++++++++++++++++
 2 files changed, 198 insertions(+), 3 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/7cb37ed6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/yarn-service/Overview.md
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/yarn-service/Overview.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/yarn-service/Overview.md
index 8e2bf9a..041b0ee 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/yarn-service/Overview.md
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/yarn-service/Overview.md
@@ -56,6 +56,4 @@ The benefits of combining these workloads are two-fold:
 * [Registry DNS](RegistryDNS.html): Deep dives into the Registry DNS internals.
 * [Examples](Examples.html): List some example service definitions (`Yarnfile`).
 * [Configurations](Configurations.html): Describes how to configure the custom services on YARN.
-
-
- 
+* [Service Upgrade](ServiceUpgrade.html): Describes how to upgrade a YARN service which is an experimental feature.

http://git-wip-us.apache.org/repos/asf/hadoop/blob/7cb37ed6/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/yarn-service/ServiceUpgrade.md
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/yarn-service/ServiceUpgrade.md b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/yarn-service/ServiceUpgrade.md
new file mode 100644
index 0000000..839be22
--- /dev/null
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-site/src/site/markdown/yarn-service/ServiceUpgrade.md
@@ -0,0 +1,197 @@
+<!---
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License. See accompanying LICENSE file.
+-->
+
+# Service Upgrade (Experimental Feature - Tech Preview)
+
+Yarn service provides a way of upgrading/downgrading long running applications without
+shutting down the application to minimize the downtime during this process. This is
+an experimental feature which is currently not enabled by default.
+
+## Overview
+
+Upgrading a Yarn Service is a 3 steps (or 2 steps when auto-finalization of
+upgrade is chosen) process:
+
+1. Initiate service upgrade.\
+This step involves providing the service spec of the newer version of the service.
+Once, the service upgrade is initiated, the state of the service is changed to
+`UPGRADING`.
+
+2. Upgrade component instances.\
+This step involves triggering upgrade of individual component instance.
+By providing an API to upgrade at instance level, users can orchestrate upgrade
+of the entire service in any order which is relevant for the service.\
+In addition, there are APIs to upgrade multiple instances, all instances of a
+component, and all instances of multiple components.
+
+3. Finalize upgrade.\
+This step involves finalization of upgrade. With an explicit step to finalize the
+upgrade, users have a chance to cancel current upgrade in progress. When the
+user chose to cancel, the service will make the best effort to revert to the
+previous version.\
+\
+When the upgrade is finalized, the old service definition is
+overwritten by the new service definition and the service state changes to `STABLE`.\
+A service can be auto-finalized when the upgrade is initialized with
+`-autoFinalize` option. With auto-finalization, when all the component-instances of
+the service have been upgraded, finalization will be performed automatically by the
+service framework.\
+\
+**NOTE**: Cancel of upgrade is not implemented yet.
+
+## Upgrade Example
+This example shows upgrade of sleeper service. Below is the sleeper service
+definition
+
+```
+{
+  "name": "sleeper-service",
+  "components" :
+    [
+      {
+        "name": "sleeper",
+        "version": "1.0.0",
+        "number_of_containers": 1,
+        "launch_command": "sleep 900000",
+        "resource": {
+          "cpus": 1,
+          "memory": "256"
+       }
+      }
+    ]
+}
+```
+Assuming, user launched an instance of sleeper service named as `my-sleeper`:
+```
+{
+  "components":
+    [
+      {
+        "configuration": {...},
+        "containers":
+          [
+            {
+              "bare_host": "0.0.0.0",
+              "component_instance_name": "sleeper-0",
+              "hostname": "example.local",
+              "id": "container_1531508836237_0002_01_000002",
+              "ip": "0.0.0.0",
+              "launch_time": 1531941023675,
+              "state": "READY"
+            },
+            {
+              "bare_host": "0.0.0.0",
+              "component_instance_name": "sleeper-1",
+              "hostname": "example.local",
+              "id": "container_1531508836237_0002_01_000003",
+              "ip": "0.0.0.0",
+              "launch_time": 1531941024680,
+              "state": "READY"
+            }
+          ],
+        "dependencies": [],
+        "launch_command": "sleep 900000",
+        "name": "sleeper",
+        "number_of_containers": 2,
+        "quicklinks": [],
+        "resource": {...},
+        "restart_policy": "ALWAYS",
+        "run_privileged_container": false,
+        "state": "STABLE"
+      }
+    ],
+  "configuration": {...},
+  "id": "application_1531508836237_0002",
+  "kerberos_principal": {},
+  "lifetime": -1,
+  "name": "my-sleeper",
+  "quicklinks": {},
+  "state": "STABLE",
+  "version": "1.0.0"
+}
+```
+
+### Enable Service Upgrade
+Below is the configuration in `yarn-site.xml` required for enabling service
+upgrade.
+
+```
+  <property>
+    <name>yarn.service.upgrade.enabled</name>
+    <value>true</value>
+  </property>
+```
+
+### Initiate Upgrade
+User can initiate upgrade using the below command:
+```
+yarn app -upgrade ${service_name} -initate ${path_to_new_service_def_file} [-autoFinalize]
+```
+
+e.g. To upgrade `my-sleeper` to sleep for *1200000* instead of *900000*, the user
+can upgrade the service to version 1.0.1. Below is the service definition for
+version 1.0.1 of sleeper-service:
+
+```
+{
+  "components" :
+    [
+      {
+        "name": "sleeper",
+        "version": "1.0.1",
+        "number_of_containers": 1,
+        "launch_command": "sleep 1200000",
+        "resource": {
+          "cpus": 1,
+          "memory": "256"
+        }
+      }
+    ]
+}
+```
+The command below initiates the upgrade to version 1.0.1.
+```
+yarn app -upgrade my-sleeper -initiate sleeper_v101.json
+```
+
+### Upgrade Instance
+User can upgrade a component instance using the below command:
+```
+yarn app -upgrade ${service_name} -instances ${comma_separated_list_of_instance_names}
+```
+e.g. The command below upgrades `sleeper-0` and `sleeper-1` instances of `my-service`:
+```
+yarn app -upgrade my-sleeper -instances sleeper-0,sleeper-1
+```
+
+### Upgrade Component
+User can upgrade a component, that is, all the instances of a component with
+one command:
+```
+yarn app -upgrade ${service_name} -components ${comma_separated_list_of_component_names}
+```
+e.g. The command below upgrades all the instances of `sleeper` component of `my-service`:
+```
+yarn app -ugrade my-sleeper -components sleeper
+```
+
+### Finalize Upgrade
+User must finalize the upgrade using the below command (since autoFinalize was not specified during initiate):
+```
+yarn app -upgrade ${service_name} -finalize
+```
+e.g. The command below finalizes the upgrade of `my-sleeper`:
+```
+yarn app -upgrade my-sleeper -finalize
+```


---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org