You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by ao...@apache.org on 2018/08/06 11:19:54 UTC

[ambari] branch branch-2.7 updated: AMBARI-24399. Components start failing with 'Holder DFSClient_NONMAPREDUCE does not have any open files' while adding Namespace (aonishuk)

This is an automated email from the ASF dual-hosted git repository.

aonishuk pushed a commit to branch branch-2.7
in repository https://gitbox.apache.org/repos/asf/ambari.git


The following commit(s) were added to refs/heads/branch-2.7 by this push:
     new 44384bf  AMBARI-24399. Components start failing with 'Holder DFSClient_NONMAPREDUCE does not have any open files' while adding Namespace  (aonishuk)
44384bf is described below

commit 44384bf1e682c4c0bbfcb8881a0590dc93bc4414
Author: Andrew Onishuk <ao...@hortonworks.com>
AuthorDate: Mon Aug 6 13:20:42 2018 +0300

    AMBARI-24399. Components start failing with 'Holder DFSClient_NONMAPREDUCE does not have any open files' while adding Namespace  (aonishuk)
---
 .../libraries/providers/hdfs_resource.py           | 28 ++++++++++++++++++----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/ambari-common/src/main/python/resource_management/libraries/providers/hdfs_resource.py b/ambari-common/src/main/python/resource_management/libraries/providers/hdfs_resource.py
index a7b43c7..37febf7 100644
--- a/ambari-common/src/main/python/resource_management/libraries/providers/hdfs_resource.py
+++ b/ambari-common/src/main/python/resource_management/libraries/providers/hdfs_resource.py
@@ -58,9 +58,16 @@ RESOURCE_TO_JSON_FIELDS = {
 }
 
 EXCEPTIONS_TO_RETRY = {
-  # "ExceptionName": (try_count, try_sleep_seconds)
-  "LeaseExpiredException": (20, 6),
-  "RetriableException": (20, 6),
+  # "ExceptionName": ("required text fragment", try_count, try_sleep_seconds)
+
+  # Happens when multiple nodes try to put the same file at the same time.
+  # Needs a longer retry time, to wait for the other nodes to succeed.
+  # NOTE: this must stay a '#' comment. A bare string literal placed here is
+  # concatenated with the "FileNotFoundException" key that follows it.
+  "FileNotFoundException": (" does not have any open files", 6, 30),
+
+  "LeaseExpiredException": ("", 20, 6),
+  "RetriableException": ("", 20, 6),
 }
 
 class HdfsResourceJar:
@@ -173,6 +180,11 @@ class WebHDFSCallException(Fail):
       return self.result_message["RemoteException"]["exception"]
     return None
 
+  def get_exception_text(self):  # returns the RemoteException "message" value, or None
+    if isinstance(self.result_message, dict) and "RemoteException" in self.result_message and "message" in self.result_message["RemoteException"]:
+      return self.result_message["RemoteException"]["message"]
+    return None  # result_message is not a dict, or has no RemoteException "message" entry
+
 class WebHDFSUtil:
   def __init__(self, hdfs_site, nameservice, run_user, security_enabled, logoutput=None):
     self.is_https_enabled = is_https_enabled_in_hdfs(hdfs_site['dfs.http.policy'], hdfs_site['dfs.https.enable'])
@@ -199,9 +211,15 @@ class WebHDFSUtil:
       return self._run_command(*args, **kwargs)
     except WebHDFSCallException as ex:
       exception_name = ex.get_exception_name()
+      exception_text = ex.get_exception_text()
       if exception_name in EXCEPTIONS_TO_RETRY:
-        try_count, try_sleep = EXCEPTIONS_TO_RETRY[exception_name]
-        last_exception = ex
+
+        required_text, try_count, try_sleep = EXCEPTIONS_TO_RETRY[exception_name]
+
+        if not required_text or (exception_text and required_text in exception_text):
+          last_exception = ex
+        else:
+          raise
       else:
         raise