You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by jl...@apache.org on 2015/02/10 18:27:59 UTC
hadoop git commit: YARN-2809. Implement workaround for linux kernel
panic when removing cgroup. Contributed by Nathan Roberts
Repository: hadoop
Updated Branches:
refs/heads/trunk 4eb5f7fa3 -> 3f5431a22
YARN-2809. Implement workaround for linux kernel panic when removing cgroup. Contributed by Nathan Roberts
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/3f5431a2
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/3f5431a2
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/3f5431a2
Branch: refs/heads/trunk
Commit: 3f5431a22fcef7e3eb9aceeefe324e5b7ac84049
Parents: 4eb5f7f
Author: Jason Lowe <jl...@apache.org>
Authored: Tue Feb 10 17:27:21 2015 +0000
Committer: Jason Lowe <jl...@apache.org>
Committed: Tue Feb 10 17:27:21 2015 +0000
----------------------------------------------------------------------
hadoop-yarn-project/CHANGES.txt | 3 +
.../hadoop/yarn/conf/YarnConfiguration.java | 11 ++-
.../util/CgroupsLCEResourcesHandler.java | 73 +++++++++++++++---
.../util/TestCgroupsLCEResourcesHandler.java | 80 +++++++++++++++-----
4 files changed, 136 insertions(+), 31 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hadoop/blob/3f5431a2/hadoop-yarn-project/CHANGES.txt
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/CHANGES.txt b/hadoop-yarn-project/CHANGES.txt
index 5a3a505..ee58b24 100644
--- a/hadoop-yarn-project/CHANGES.txt
+++ b/hadoop-yarn-project/CHANGES.txt
@@ -530,6 +530,9 @@ Release 2.7.0 - UNRELEASED
YARN-3090. DeletionService can silently ignore deletion task failures
(Varun Saxena via jlowe)
+ YARN-2809. Implement workaround for linux kernel panic when removing
+ cgroup (Nathan Roberts via jlowe)
+
Release 2.6.0 - 2014-11-18
INCOMPATIBLE CHANGES
http://git-wip-us.apache.org/repos/asf/hadoop/blob/3f5431a2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
index 6904543..05c6cbf 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-api/src/main/java/org/apache/hadoop/yarn/conf/YarnConfiguration.java
@@ -1002,7 +1002,16 @@ public class YarnConfiguration extends Configuration {
public static final long DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT =
1000;
-
+
+ /**
+ * Delay, in milliseconds, between attempts to remove linux cgroup.
+ */
+ public static final String NM_LINUX_CONTAINER_CGROUPS_DELETE_DELAY =
+ NM_PREFIX + "linux-container-executor.cgroups.delete-delay-ms";
+
+ public static final long DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_DELAY =
+ 20;
+
/**
/* The Windows group that the windows-secure-container-executor should run as.
*/
http://git-wip-us.apache.org/repos/asf/hadoop/blob/3f5431a2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java
index a832a7a..ffa17ac 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/java/org/apache/hadoop/yarn/server/nodemanager/util/CgroupsLCEResourcesHandler.java
@@ -22,6 +22,7 @@ import java.io.BufferedReader;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
+import java.io.FileReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
@@ -37,6 +38,7 @@ import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.google.common.annotations.VisibleForTesting;
+
import org.apache.commons.io.FileUtils;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
@@ -75,6 +77,7 @@ public class CgroupsLCEResourcesHandler implements LCEResourcesHandler {
private final Map<String, String> controllerPaths; // Controller -> path
private long deleteCgroupTimeout;
+ private long deleteCgroupDelay;
// package private for testing purposes
Clock clock;
@@ -108,6 +111,9 @@ public class CgroupsLCEResourcesHandler implements LCEResourcesHandler {
this.deleteCgroupTimeout = conf.getLong(
YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT,
YarnConfiguration.DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_TIMEOUT);
+ this.deleteCgroupDelay =
+ conf.getLong(YarnConfiguration.NM_LINUX_CONTAINER_CGROUPS_DELETE_DELAY,
+ YarnConfiguration.DEFAULT_NM_LINUX_CONTAINER_CGROUPS_DELETE_DELAY);
// remove extra /'s at end or start of cgroupPrefix
if (cgroupPrefix.charAt(0) == '/') {
cgroupPrefix = cgroupPrefix.substring(1);
@@ -271,23 +277,71 @@ public class CgroupsLCEResourcesHandler implements LCEResourcesHandler {
}
}
+ /*
+ * Utility routine to print first line from cgroup tasks file
+ */
+ private void logLineFromTasksFile(File cgf) {
+ String str;
+ if (LOG.isDebugEnabled()) {
+ try (BufferedReader inl =
+ new BufferedReader(new InputStreamReader(new FileInputStream(cgf
+ + "/tasks"), "UTF-8"))) {
+ if ((str = inl.readLine()) != null) {
+ LOG.debug("First line in cgroup tasks file: " + cgf + " " + str);
+ }
+ } catch (IOException e) {
+ LOG.warn("Failed to read cgroup tasks file. ", e);
+ }
+ }
+ }
+
+ /**
+ * If tasks file is empty, delete the cgroup.
+ *
+ * @param cgf file object referring to the cgroup to be deleted
+ * @return Boolean indicating whether cgroup was deleted
+ */
+ @VisibleForTesting
+ boolean checkAndDeleteCgroup(File cgf) throws InterruptedException {
+ boolean deleted = false;
+ // FileInputStream in = null;
+ try (FileInputStream in = new FileInputStream(cgf + "/tasks")) {
+ if (in.read() == -1) {
+ /*
+ * "tasks" file is empty, sleep a bit more and then try to delete the
+ * cgroup. Some versions of linux will occasionally panic due to a race
+ * condition in this area, hence the paranoia.
+ */
+ Thread.sleep(deleteCgroupDelay);
+ deleted = cgf.delete();
+ if (!deleted) {
+ LOG.warn("Failed attempt to delete cgroup: " + cgf);
+ }
+ } else {
+ logLineFromTasksFile(cgf);
+ }
+ } catch (IOException e) {
+ LOG.warn("Failed to read cgroup tasks file. ", e);
+ }
+ return deleted;
+ }
+
@VisibleForTesting
boolean deleteCgroup(String cgroupPath) {
- boolean deleted;
-
+ boolean deleted = false;
+
if (LOG.isDebugEnabled()) {
LOG.debug("deleteCgroup: " + cgroupPath);
}
-
long start = clock.getTime();
do {
- deleted = new File(cgroupPath).delete();
- if (!deleted) {
- try {
- Thread.sleep(20);
- } catch (InterruptedException ex) {
- // NOP
+ try {
+ deleted = checkAndDeleteCgroup(new File(cgroupPath));
+ if (!deleted) {
+ Thread.sleep(deleteCgroupDelay);
}
+ } catch (InterruptedException ex) {
+ // NOP
}
} while (!deleted && (clock.getTime() - start) < deleteCgroupTimeout);
@@ -295,7 +349,6 @@ public class CgroupsLCEResourcesHandler implements LCEResourcesHandler {
LOG.warn("Unable to delete cgroup at: " + cgroupPath +
", tried to delete for " + deleteCgroupTimeout + "ms");
}
-
return deleted;
}
http://git-wip-us.apache.org/repos/asf/hadoop/blob/3f5431a2/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/util/TestCgroupsLCEResourcesHandler.java
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/util/TestCgroupsLCEResourcesHandler.java b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/util/TestCgroupsLCEResourcesHandler.java
index d0bceee..4e35169 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/util/TestCgroupsLCEResourcesHandler.java
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/test/java/org/apache/hadoop/yarn/server/nodemanager/util/TestCgroupsLCEResourcesHandler.java
@@ -26,6 +26,8 @@ import org.junit.Assert;
import org.apache.hadoop.yarn.conf.YarnConfiguration;
import org.apache.hadoop.yarn.util.Clock;
import org.junit.Test;
+import org.junit.After;
+import org.junit.Before;
import org.mockito.Mockito;
import java.io.*;
@@ -35,6 +37,7 @@ import java.util.UUID;
import java.util.concurrent.CountDownLatch;
public class TestCgroupsLCEResourcesHandler {
+ static File cgroupDir = null;
static class MockClock implements Clock {
long time;
@@ -43,6 +46,51 @@ public class TestCgroupsLCEResourcesHandler {
return time;
}
}
+
+ @Before
+ public void setUp() throws Exception {
+ cgroupDir =
+ new File(System.getProperty("test.build.data",
+ System.getProperty("java.io.tmpdir", "target")), this.getClass()
+ .getName());
+ FileUtils.deleteQuietly(cgroupDir);
+ }
+
+ @After
+ public void tearDown() throws Exception {
+ FileUtils.deleteQuietly(cgroupDir);
+ }
+
+ @Test
+ public void testcheckAndDeleteCgroup() throws Exception {
+ CgroupsLCEResourcesHandler handler = new CgroupsLCEResourcesHandler();
+ handler.setConf(new YarnConfiguration());
+ handler.initConfig();
+
+ FileUtils.deleteQuietly(cgroupDir);
+ // Test 0
+ // tasks file not present, should return false
+ Assert.assertFalse(handler.checkAndDeleteCgroup(cgroupDir));
+
+ File tfile = new File(cgroupDir.getAbsolutePath(), "tasks");
+ FileOutputStream fos = FileUtils.openOutputStream(tfile);
+ File fspy = Mockito.spy(cgroupDir);
+
+ // Test 1, tasks file is empty
+ // tasks file has no data, should return true
+ Mockito.stub(fspy.delete()).toReturn(true);
+ Assert.assertTrue(handler.checkAndDeleteCgroup(fspy));
+
+ // Test 2, tasks file has data
+ fos.write("1234".getBytes());
+ fos.close();
+ // tasks has data, would not be able to delete, should return false
+ Assert.assertFalse(handler.checkAndDeleteCgroup(fspy));
+ FileUtils.deleteQuietly(cgroupDir);
+
+ }
+
+ // Verify DeleteCgroup times out if "tasks" file contains data
@Test
public void testDeleteCgroup() throws Exception {
final MockClock clock = new MockClock();
@@ -51,13 +99,15 @@ public class TestCgroupsLCEResourcesHandler {
handler.setConf(new YarnConfiguration());
handler.initConfig();
handler.clock = clock;
-
- //file exists
- File file = new File("target", UUID.randomUUID().toString());
- new FileOutputStream(file).close();
- Assert.assertTrue(handler.deleteCgroup(file.getPath()));
- //file does not exists, timing out
+ FileUtils.deleteQuietly(cgroupDir);
+
+ // Create a non-empty tasks file
+ File tfile = new File(cgroupDir.getAbsolutePath(), "tasks");
+ FileOutputStream fos = FileUtils.openOutputStream(tfile);
+ fos.write("1234".getBytes());
+ fos.close();
+
final CountDownLatch latch = new CountDownLatch(1);
new Thread() {
@Override
@@ -73,8 +123,8 @@ public class TestCgroupsLCEResourcesHandler {
}
}.start();
latch.await();
- file = new File("target", UUID.randomUUID().toString());
- Assert.assertFalse(handler.deleteCgroup(file.getPath()));
+ Assert.assertFalse(handler.deleteCgroup(cgroupDir.getAbsolutePath()));
+ FileUtils.deleteQuietly(cgroupDir);
}
static class MockLinuxContainerExecutor extends LinuxContainerExecutor {
@@ -122,7 +172,6 @@ public class TestCgroupsLCEResourcesHandler {
handler.initConfig();
// create mock cgroup
- File cgroupDir = createMockCgroup();
File cgroupMountDir = createMockCgroupMount(cgroupDir);
// create mock mtab
@@ -202,18 +251,10 @@ public class TestCgroupsLCEResourcesHandler {
Assert.assertEquals(-1, ret[1]);
}
- private File createMockCgroup() throws IOException {
- File cgroupDir = new File("target", UUID.randomUUID().toString());
- if (!cgroupDir.mkdir()) {
- String message = "Could not create dir " + cgroupDir.getAbsolutePath();
- throw new IOException(message);
- }
- return cgroupDir;
- }
-
private File createMockCgroupMount(File cgroupDir) throws IOException {
File cgroupMountDir = new File(cgroupDir.getAbsolutePath(), "hadoop-yarn");
- if (!cgroupMountDir.mkdir()) {
+ FileUtils.deleteQuietly(cgroupDir);
+ if (!cgroupMountDir.mkdirs()) {
String message =
"Could not create dir " + cgroupMountDir.getAbsolutePath();
throw new IOException(message);
@@ -253,7 +294,6 @@ public class TestCgroupsLCEResourcesHandler {
handler.initConfig();
// create mock cgroup
- File cgroupDir = createMockCgroup();
File cgroupMountDir = createMockCgroupMount(cgroupDir);
// create mock mtab