You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by jl...@apache.org on 2015/02/10 18:27:59 UTC

hadoop git commit: YARN-2809. Implement workaround for linux kernel panic when removing cgroup. Contributed by Nathan Roberts

Repository: hadoop
Updated Branches:
  refs/heads/trunk 4eb5f7fa3 -> 3f5431a22


YARN-2809. Implement workaround for linux kernel panic when removing cgroup. Contributed by Nathan Roberts


Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/3f5431a2
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/3f5431a2
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/3f5431a2

Branch: refs/heads/trunk
Commit: 3f5431a22fcef7e3eb9aceeefe324e5b7ac84049
Parents: 4eb5f7f
Author: Jason Lowe <jl...@apache.org>
Authored: Tue Feb 10 17:27:21 2015 +0000
Committer: Jason Lowe <jl...@apache.org>
Committed: Tue Feb 10 17:27:21 2015 +0000

----------------------------------------------------------------------
 hadoop-yarn-project/CHANGES.txt                 |  3 +
 .../hadoop/yarn/conf/YarnConfiguration.java     | 11 ++-
 .../util/CgroupsLCEResourcesHandler.java        | 73 +++++++++++++++---
 .../util/TestCgroupsLCEResourcesHandler.java    | 80 +++++++++++++++-----
 4 files changed, 136 insertions(+), 31 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/hadoop/blob/3f5431a2/hadoop-yarn-project/CHANGES.txt
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt
index 5a3a505..ee58b24 100644
--- a/hadoop-yarn-project/CHANGES.txt
+++ b/hadoop-yarn-project/CHANGES.txt
@@ -530,6 +530,9 @@ Release 2.7.0 - UNRELEASED
     YARN-3090. DeletionService can silently ignore deletion task failures
     (Varun Saxena via jlowe)
 
+    YARN-2809. Implement workaround for linux kernel panic when removing
+    cgroup (Nathan Roberts via jlowe)
+
 Release 2.6.0 - 2014-11-18
 
   INCOMPATIBLE CHANGES

http://git-wip-us.apache.org/repos/asf/hadoop/blob/3f5431a2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index 6904543..05c6cbf 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -1002,7 +1002,16 @@ public class YarnConfiguration extends Configuration {
 
   public static final long DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT =
       1000;
-  
+
+  /**
+   * Delay, in milliseconds, between attempts to remove a Linux cgroup.
+   */
+  public static final String NM_LINUX_CONTAINER_CGROUPS_DELETE_DELAY =
+      NM_PREFIX + "linux-container-executor.cgroups.delete-delay-ms";
+
+  public static final long DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_DELAY =
+      20;
+
   /** 
   /* The Windows group that the windows-secure-container-executor should run as.
   */

http://git-wip-us.apache.org/repos/asf/hadoop/blob/3f5431a2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java
index a832a7a..ffa17ac 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java
@@ -22,6 +22,7 @@ import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
+import java.io.FileReader;
 import java.io.IOException;
 import java.io.InputStreamReader;
 import java.io.OutputStreamWriter;
@@ -37,6 +38,7 @@ import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 
 import com.google.common.annotations.VisibleForTesting;
+
 import org.apache.commons.io.FileUtils;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
@@ -75,6 +77,7 @@ public class CgroupsLCEResourcesHandler implements LCEResourcesHandler {
   private final Map<String, String> controllerPaths; // Controller -> path
 
   private long deleteCgroupTimeout;
+  private long deleteCgroupDelay;
   // package private for testing purposes
   Clock clock;
 
@@ -108,6 +111,9 @@ public class CgroupsLCEResourcesHandler implements LCEResourcesHandler {
     this.deleteCgroupTimeout = conf.getLong(
         YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT,
         YarnConfiguration.DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT);
+    this.deleteCgroupDelay =
+        conf.getLong(YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_DELETE_DELAY,
+            YarnConfiguration.DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_DELAY);
     // remove extra /'s at end or start of cgroupPrefix
     if (cgroupPrefix.charAt(0) == '/') {
       cgroupPrefix = cgroupPrefix.substring(1);
@@ -271,23 +277,71 @@ public class CgroupsLCEResourcesHandler implements LCEResourcesHandler {
     }
   }
 
+  /*
+   * Utility routine to print first line from cgroup tasks file
+   */
+  private void logLineFromTasksFile(File cgf) {
+    String str;
+    if (LOG.isDebugEnabled()) {
+      try (BufferedReader inl =
+            new BufferedReader(new InputStreamReader(new FileInputStream(cgf
+              + "/tasks"), "UTF-8"))) {
+        if ((str = inl.readLine()) != null) {
+          LOG.debug("First line in cgroup tasks file: " + cgf + " " + str);
+        }
+      } catch (IOException e) {
+        LOG.warn("Failed to read cgroup tasks file. ", e);
+      }
+    }
+  }
+
+  /**
+   * If tasks file is empty, delete the cgroup.
+   *
+   * @param cgf file object referring to the cgroup to be deleted
+   * @return Boolean indicating whether cgroup was deleted
+   */
+  @VisibleForTesting
+  boolean checkAndDeleteCgroup(File cgf) throws InterruptedException {
+    boolean deleted = false;
+    // FileInputStream in = null;
+    try (FileInputStream in = new FileInputStream(cgf + "/tasks")) {
+      if (in.read() == -1) {
+        /*
+         * "tasks" file is empty, sleep a bit more and then try to delete the
+         * cgroup. Some versions of linux will occasionally panic due to a race
+         * condition in this area, hence the paranoia.
+         */
+        Thread.sleep(deleteCgroupDelay);
+        deleted = cgf.delete();
+        if (!deleted) {
+          LOG.warn("Failed attempt to delete cgroup: " + cgf);
+        }
+      } else {
+        logLineFromTasksFile(cgf);
+      }
+    } catch (IOException e) {
+      LOG.warn("Failed to read cgroup tasks file. ", e);
+    }
+    return deleted;
+  }
+
   @VisibleForTesting
   boolean deleteCgroup(String cgroupPath) {
-    boolean deleted;
-    
+    boolean deleted = false;
+
     if (LOG.isDebugEnabled()) {
       LOG.debug("deleteCgroup: " + cgroupPath);
     }
-
     long start = clock.getTime();
     do {
-      deleted = new File(cgroupPath).delete();
-      if (!deleted) {
-        try {
-          Thread.sleep(20);
-        } catch (InterruptedException ex) {
-          // NOP        
+      try {
+        deleted = checkAndDeleteCgroup(new File(cgroupPath));
+        if (!deleted) {
+          Thread.sleep(deleteCgroupDelay);
         }
+      } catch (InterruptedException ex) {
+        // NOP
       }
     } while (!deleted && (clock.getTime() - start) < deleteCgroupTimeout);
 
@@ -295,7 +349,6 @@ public class CgroupsLCEResourcesHandler implements LCEResourcesHandler {
       LOG.warn("Unable to delete cgroup at: " + cgroupPath +
           ", tried to delete for " + deleteCgroupTimeout + "ms");
     }
-
     return deleted;
   }
 

http://git-wip-us.apache.org/repos/asf/hadoop/blob/3f5431a2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/util/TestCgroupsLCEResourcesHandler.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/util/TestCgroupsLCEResourcesHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/util/TestCgroupsLCEResourcesHandler.java
index d0bceee..4e35169 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/util/TestCgroupsLCEResourcesHandler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/util/TestCgroupsLCEResourcesHandler.java
@@ -26,6 +26,8 @@ import org.junit.Assert;
 import org.apache.hadoop.yarn.conf.YarnConfiguration;
 import org.apache.hadoop.yarn.util.Clock;
 import org.junit.Test;
+import org.junit.After;
+import org.junit.Before;
 import org.mockito.Mockito;
 
 import java.io.*;
@@ -35,6 +37,7 @@ import java.util.UUID;
 import java.util.concurrent.CountDownLatch;
 
 public class TestCgroupsLCEResourcesHandler {
+  static File cgroupDir = null;
 
   static class MockClock implements Clock {
     long time;
@@ -43,6 +46,51 @@ public class TestCgroupsLCEResourcesHandler {
       return time;
     }
   }
+
+  @Before
+  public void setUp() throws Exception {
+    cgroupDir =
+        new File(System.getProperty("test.build.data",
+            System.getProperty("java.io.tmpdir", "target")), this.getClass()
+            .getName());
+    FileUtils.deleteQuietly(cgroupDir);
+  }
+
+  @After
+  public void tearDown() throws Exception {
+    FileUtils.deleteQuietly(cgroupDir);
+  }
+
+  @Test
+  public void testcheckAndDeleteCgroup() throws Exception {
+    CgroupsLCEResourcesHandler handler = new CgroupsLCEResourcesHandler();
+    handler.setConf(new YarnConfiguration());
+    handler.initConfig();
+
+    FileUtils.deleteQuietly(cgroupDir);
+    // Test 0
+    // tasks file not present, should return false
+    Assert.assertFalse(handler.checkAndDeleteCgroup(cgroupDir));
+
+    File tfile = new File(cgroupDir.getAbsolutePath(), "tasks");
+    FileOutputStream fos = FileUtils.openOutputStream(tfile);
+    File fspy = Mockito.spy(cgroupDir);
+
+    // Test 1, tasks file is empty
+    // tasks file has no data, should return true
+    Mockito.stub(fspy.delete()).toReturn(true);
+    Assert.assertTrue(handler.checkAndDeleteCgroup(fspy));
+
+    // Test 2, tasks file has data
+    fos.write("1234".getBytes());
+    fos.close();
+    // tasks has data, would not be able to delete, should return false
+    Assert.assertFalse(handler.checkAndDeleteCgroup(fspy));
+    FileUtils.deleteQuietly(cgroupDir);
+
+  }
+
+  // Verify deleteCgroup times out if the "tasks" file contains data
   @Test
   public void testDeleteCgroup() throws Exception {
     final MockClock clock = new MockClock();
@@ -51,13 +99,15 @@ public class TestCgroupsLCEResourcesHandler {
     handler.setConf(new YarnConfiguration());
     handler.initConfig();
     handler.clock = clock;
-    
-    //file exists
-    File file = new File("target", UUID.randomUUID().toString());
-    new FileOutputStream(file).close();
-    Assert.assertTrue(handler.deleteCgroup(file.getPath()));
 
-    //file does not exists, timing out
+    FileUtils.deleteQuietly(cgroupDir);
+
+    // Create a non-empty tasks file
+    File tfile = new File(cgroupDir.getAbsolutePath(), "tasks");
+    FileOutputStream fos = FileUtils.openOutputStream(tfile);
+    fos.write("1234".getBytes());
+    fos.close();
+
     final CountDownLatch latch = new CountDownLatch(1);
     new Thread() {
       @Override
@@ -73,8 +123,8 @@ public class TestCgroupsLCEResourcesHandler {
       }
     }.start();
     latch.await();
-    file = new File("target", UUID.randomUUID().toString());
-    Assert.assertFalse(handler.deleteCgroup(file.getPath()));
+    Assert.assertFalse(handler.deleteCgroup(cgroupDir.getAbsolutePath()));
+    FileUtils.deleteQuietly(cgroupDir);
   }
 
   static class MockLinuxContainerExecutor extends LinuxContainerExecutor {
@@ -122,7 +172,6 @@ public class TestCgroupsLCEResourcesHandler {
     handler.initConfig();
 
     // create mock cgroup
-    File cgroupDir = createMockCgroup();
     File cgroupMountDir = createMockCgroupMount(cgroupDir);
 
     // create mock mtab
@@ -202,18 +251,10 @@ public class TestCgroupsLCEResourcesHandler {
     Assert.assertEquals(-1, ret[1]);
   }
 
-  private File createMockCgroup() throws IOException {
-    File cgroupDir = new File("target", UUID.randomUUID().toString());
-    if (!cgroupDir.mkdir()) {
-      String message = "Could not create dir " + cgroupDir.getAbsolutePath();
-      throw new IOException(message);
-    }
-    return cgroupDir;
-  }
-
   private File createMockCgroupMount(File cgroupDir) throws IOException {
     File cgroupMountDir = new File(cgroupDir.getAbsolutePath(), "hadoop-yarn");
-    if (!cgroupMountDir.mkdir()) {
+    FileUtils.deleteQuietly(cgroupDir);
+    if (!cgroupMountDir.mkdirs()) {
       String message =
           "Could not create dir " + cgroupMountDir.getAbsolutePath();
       throw new IOException(message);
@@ -253,7 +294,6 @@ public class TestCgroupsLCEResourcesHandler {
     handler.initConfig();
 
     // create mock cgroup
-    File cgroupDir = createMockCgroup();
     File cgroupMountDir = createMockCgroupMount(cgroupDir);
 
     // create mock mtab