You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by ao...@apache.org on 2018/08/06 11:19:54 UTC
[ambari] branch branch-2.7 updated: AMBARI-24399. Components start
failing with 'Holder DFSClient_NONMAPREDUCE does not have any open files'
while adding Namespace (aonishuk)
This is an automated email from the ASF dual-hosted git repository.
aonishuk pushed a commit to branch branch-2.7
in repository https://gitbox.apache.org/repos/asf/ambari.git
The following commit(s) were added to refs/heads/branch-2.7 by this push:
new 44384bf AMBARI-24399. Components start failing with 'Holder DFSClient_NONMAPREDUCE does not have any open files' while adding Namespace (aonishuk)
44384bf is described below
commit 44384bf1e682c4c0bbfcb8881a0590dc93bc4414
Author: Andrew Onishuk <ao...@hortonworks.com>
AuthorDate: Mon Aug 6 13:20:42 2018 +0300
AMBARI-24399. Components start failing with 'Holder DFSClient_NONMAPREDUCE does not have any open files' while adding Namespace (aonishuk)
---
.../libraries/providers/hdfs_resource.py | 28 ++++++++++++++++++----
1 file changed, 23 insertions(+), 5 deletions(-)
diff --git a/ambari-common/src/main/python/resource_management/libraries/providers/hdfs_resource.py b/ambari-common/src/main/python/resource_management/libraries/providers/hdfs_resource.py
index a7b43c7..37febf7 100644
--- a/ambari-common/src/main/python/resource_management/libraries/providers/hdfs_resource.py
+++ b/ambari-common/src/main/python/resource_management/libraries/providers/hdfs_resource.py
@@ -58,9 +58,16 @@ RESOURCE_TO_JSON_FIELDS = {
}
EXCEPTIONS_TO_RETRY = {
- # "ExceptionName": (try_count, try_sleep_seconds)
- "LeaseExpiredException": (20, 6),
- "RetriableException": (20, 6),
+ # ("ExceptionName"): ("required text fragment", try_count, try_sleep_seconds)
+
+ #
+ # Happens when multiple nodes try to put the same file at the same time.
+ # Needs a longer retry time, to wait for the other nodes to succeed.
+ #
+ "FileNotFoundException": (" does not have any open files", 6, 30),
+
+ "LeaseExpiredException": ("", 20, 6),
+ "RetriableException": ("", 20, 6),
}
class HdfsResourceJar:
@@ -173,6 +180,11 @@ class WebHDFSCallException(Fail):
return self.result_message["RemoteException"]["exception"]
return None
+ def get_exception_text(self):
+ if isinstance(self.result_message, dict) and "RemoteException" in self.result_message and "message" in self.result_message["RemoteException"]:
+ return self.result_message["RemoteException"]["message"]
+ return None
+
class WebHDFSUtil:
def __init__(self, hdfs_site, nameservice, run_user, security_enabled, logoutput=None):
self.is_https_enabled = is_https_enabled_in_hdfs(hdfs_site['dfs.http.policy'], hdfs_site['dfs.https.enable'])
@@ -199,9 +211,15 @@ class WebHDFSUtil:
return self._run_command(*args, **kwargs)
except WebHDFSCallException as ex:
exception_name = ex.get_exception_name()
+ exception_text = ex.get_exception_text()
if exception_name in EXCEPTIONS_TO_RETRY:
- try_count, try_sleep = EXCEPTIONS_TO_RETRY[exception_name]
- last_exception = ex
+
+ required_text, try_count, try_sleep = EXCEPTIONS_TO_RETRY[exception_name]
+
+ if not required_text or (exception_text and required_text in exception_text):
+ last_exception = ex
+ else:
+ raise
else:
raise