You are viewing a plain text version of this content. The canonical link for it is here.
Posted to common-commits@hadoop.apache.org by ep...@apache.org on 2017/08/02 16:00:39 UTC
hadoop git commit: YARN-6846. Nodemanager can fail to fully delete
application local directories when applications are killed. Contributed by
Jason Lowe.
Repository: hadoop
Updated Branches:
refs/heads/trunk 5e4434f62 -> 48899134d
YARN-6846. Nodemanager can fail to fully delete application local directories when applications are killed. Contributed by Jason Lowe.
Project: http://git-wip-us.apache.org/repos/asf/hadoop/repo
Commit: http://git-wip-us.apache.org/repos/asf/hadoop/commit/48899134
Tree: http://git-wip-us.apache.org/repos/asf/hadoop/tree/48899134
Diff: http://git-wip-us.apache.org/repos/asf/hadoop/diff/48899134
Branch: refs/heads/trunk
Commit: 48899134d2a77935a821072b5388ab1b1b7b399c
Parents: 5e4434f
Author: Eric Payne <ep...@apache.org>
Authored: Wed Aug 2 10:59:33 2017 -0500
Committer: Eric Payne <ep...@apache.org>
Committed: Wed Aug 2 10:59:33 2017 -0500
----------------------------------------------------------------------
.../impl/container-executor.c | 39 ++++++---
.../test/test-container-executor.c | 89 +++++++++++++++++++-
2 files changed, 116 insertions(+), 12 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/hadoop/blob/48899134/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c
index 5070d62..99f7b56 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/impl/container-executor.c
@@ -1837,7 +1837,7 @@ static int rmdir_as_nm(const char* path) {
int user_gid = getegid();
int ret = change_effective_user(nm_uid, nm_gid);
if (ret == 0) {
- if (rmdir(path) != 0) {
+ if (rmdir(path) != 0 && errno != ENOENT) {
fprintf(LOGFILE, "rmdir of %s failed - %s\n", path, strerror(errno));
ret = -1;
}
@@ -1882,7 +1882,7 @@ static int unlink_helper(int dirfd, const char *name, int flags) {
} else {
ret = unlink(name);
}
- if (ret >= 0) {
+ if (ret >= 0 || errno == ENOENT) {
return 0;
}
return errno;
@@ -1919,7 +1919,7 @@ static int is_symlink_helper(int dirfd, const char *name)
static int recursive_unlink_helper(int dirfd, const char *name,
const char* fullpath)
{
- int fd = -1, ret = 0;
+ int fd = -1, ret = 0, unlink_err = 0;
DIR *dfd = NULL;
struct stat stat;
@@ -1928,6 +1928,10 @@ static int recursive_unlink_helper(int dirfd, const char *name,
ret = is_symlink_helper(dirfd, name);
if (ret < 0) {
// is_symlink_helper failed.
+ if (ret == -ENOENT) {
+ ret = 0;
+ goto done;
+ }
ret = -ret;
fprintf(LOGFILE, "is_symlink_helper(%s) failed: %s\n",
fullpath, strerror(ret));
@@ -1949,6 +1953,10 @@ static int recursive_unlink_helper(int dirfd, const char *name,
if (fd == -EACCES) {
ret = chmod_helper(dirfd, name, 0700);
if (ret) {
+ if (ret == ENOENT) {
+ ret = 0;
+ goto done;
+ }
fprintf(LOGFILE, "chmod(%s) failed: %s\n", fullpath, strerror(ret));
goto done;
}
@@ -1956,11 +1964,19 @@ static int recursive_unlink_helper(int dirfd, const char *name,
}
if (fd < 0) {
ret = -fd;
+ if (ret == ENOENT) {
+ ret = 0;
+ goto done;
+ }
fprintf(LOGFILE, "error opening %s: %s\n", fullpath, strerror(ret));
goto done;
}
if (fstat(fd, &stat) < 0) {
ret = errno;
+ if (ret == ENOENT) {
+ ret = 0;
+ goto done;
+ }
fprintf(LOGFILE, "failed to stat %s: %s\n", fullpath, strerror(ret));
goto done;
}
@@ -1974,6 +1990,10 @@ static int recursive_unlink_helper(int dirfd, const char *name,
dfd = fdopendir(fd);
if (!dfd) {
ret = errno;
+ if (ret == ENOENT) {
+ ret = 0;
+ goto done;
+ }
fprintf(LOGFILE, "fopendir(%s) failed: %s\n", fullpath, strerror(ret));
goto done;
}
@@ -1985,7 +2005,7 @@ static int recursive_unlink_helper(int dirfd, const char *name,
de = readdir(dfd);
if (!de) {
ret = errno;
- if (ret) {
+ if (ret && ret != ENOENT) {
fprintf(LOGFILE, "readdir(%s) failed: %s\n", fullpath, strerror(ret));
goto done;
}
@@ -2003,10 +2023,10 @@ static int recursive_unlink_helper(int dirfd, const char *name,
ret = ENOMEM;
goto done;
}
- ret = recursive_unlink_helper(fd, de->d_name, new_fullpath);
+ int rc = recursive_unlink_helper(fd, de->d_name, new_fullpath);
free(new_fullpath);
- if (ret) {
- goto done;
+ if (rc && !unlink_err) {
+ unlink_err = rc;
}
}
if (dirfd != -1) {
@@ -2017,7 +2037,7 @@ static int recursive_unlink_helper(int dirfd, const char *name,
}
}
}
- ret = 0;
+ ret = unlink_err;
done:
if (fd >= 0) {
close(fd);
@@ -2048,9 +2068,6 @@ static int delete_path(const char *full_path,
return PATH_TO_DELETE_IS_NULL;
}
ret = recursive_unlink_children(full_path);
- if (ret == ENOENT) {
- return 0;
- }
if (ret != 0) {
fprintf(LOGFILE, "Error while deleting %s: %d (%s)\n",
full_path, ret, strerror(ret));
http://git-wip-us.apache.org/repos/asf/hadoop/blob/48899134/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/test-container-executor.c
----------------------------------------------------------------------
diff --git a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/test-container-executor.c b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/test-container-executor.c
index b7d0e44..cf5f119 100644
--- a/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/test-container-executor.c
+++ b/hadoop-yarn-project/hadoop-yarn/hadoop-yarn-server/hadoop-yarn-server-nodemanager/src/main/native/container-executor/test/test-container-executor.c
@@ -368,7 +368,7 @@ void test_delete_app() {
sprintf(buffer, "chmod 000 %s/who/let", container_dir);
run(buffer);
- // delete container directory
+ // delete application directory
int ret = delete_as_user(yarn_username, app_dir, NULL);
if (ret != 0) {
printf("FAIL: return code from delete_as_user is %d\n", ret);
@@ -390,6 +390,13 @@ void test_delete_app() {
printf("FAIL: accidently deleted file %s\n", dont_touch);
exit(1);
}
+ // verify attempt to delete a nonexistent directory does not fail
+ ret = delete_as_user(yarn_username, app_dir, NULL);
+ if (ret != 0) {
+ printf("FAIL: return code from delete_as_user is %d\n", ret);
+ exit(1);
+ }
+
free(app_dir);
free(container_dir);
free(dont_touch);
@@ -975,6 +982,83 @@ static void expect_type(const char *path, int mode) {
}
}
+static void test_delete_race_internal() {
+ char* app_dir = get_app_directory(TEST_ROOT "/local-2", yarn_username, "app_1");
+ char* container_dir = get_container_work_directory(TEST_ROOT "/local-2",
+ yarn_username, "app_1", "container_1");
+ char buffer[100000];
+
+ sprintf(buffer, "mkdir -p %s/a/b/c/d", container_dir);
+ run(buffer);
+ int i;
+ for (i = 0; i < 100; ++i) {
+ sprintf(buffer, "%s/a/f%d", container_dir, i);
+ touch_or_die(buffer);
+ sprintf(buffer, "%s/a/b/f%d", container_dir, i);
+ touch_or_die(buffer);
+ sprintf(buffer, "%s/a/b/c/f%d", container_dir, i);
+ touch_or_die(buffer);
+ sprintf(buffer, "%s/a/b/c/d/f%d", container_dir, i);
+ touch_or_die(buffer);
+ }
+
+ pid_t child = fork();
+ if (child == -1) {
+ printf("FAIL: fork failed\n");
+ exit(1);
+ } else if (child == 0) {
+ // delete container directory
+ char * dirs[] = {app_dir, 0};
+ int ret = delete_as_user(yarn_username, "container_1" , dirs);
+ if (ret != 0) {
+ printf("FAIL: return code from delete_as_user is %d\n", ret);
+ exit(1);
+ }
+ exit(0);
+ } else {
+ // delete application directory
+ int ret = delete_as_user(yarn_username, app_dir, NULL);
+ int status = 0;
+ if (waitpid(child, &status, 0) == -1) {
+ printf("FAIL: waitpid %" PRId64 " failed - %s\n", (int64_t)child, strerror(errno));
+ exit(1);
+ }
+ if (!WIFEXITED(status)) {
+ printf("FAIL: child %" PRId64 " didn't exit - %d\n", (int64_t)child, status);
+ exit(1);
+ }
+ if (WEXITSTATUS(status) != 0) {
+ printf("FAIL: child %" PRId64 " exited with bad status %d\n",
+ (int64_t)child, WEXITSTATUS(status));
+ exit(1);
+ }
+ if (ret != 0) {
+ printf("FAIL: return code from delete_as_user is %d\n", ret);
+ exit(1);
+ }
+ }
+
+ // check to make sure the app directory is gone
+ if (access(app_dir, R_OK) == 0) {
+ printf("FAIL: didn't delete the directory - %s\n", app_dir);
+ exit(1);
+ }
+
+ free(app_dir);
+ free(container_dir);
+}
+
+void test_delete_race() {
+ if (initialize_user(yarn_username, local_dirs)) {
+ printf("FAIL: failed to initialize user %s\n", yarn_username);
+ exit(1);
+ }
+ int i;
+ for (i = 0; i < 100; ++i) {
+ test_delete_race_internal();
+ }
+}
+
int recursive_unlink_children(const char *name);
void test_recursive_unlink_children() {
@@ -1204,6 +1288,9 @@ int main(int argc, char **argv) {
printf("\nTesting delete_app()\n");
test_delete_app();
+ printf("\nTesting delete race\n");
+ test_delete_race();
+
printf("\nTesting is_feature_enabled()\n");
test_is_feature_enabled();
---------------------------------------------------------------------
To unsubscribe, e-mail: common-commits-unsubscribe@hadoop.apache.org
For additional commands, e-mail: common-commits-help@hadoop.apache.org