You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by ds...@apache.org on 2015/11/18 18:54:40 UTC

ambari git commit: AMBARI-13905 Add error logs to script alerts when there are failures (dsen)

Repository: ambari
Updated Branches:
  refs/heads/trunk 6e7c75bb0 -> 58b598a5b


AMBARI-13905 Add error logs to script alerts when there are failures (dsen)


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/58b598a5
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/58b598a5
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/58b598a5

Branch: refs/heads/trunk
Commit: 58b598a5b453e31a30101b27a25f8e18b52c2083
Parents: 6e7c75b
Author: Dmytro Sen <ds...@apache.org>
Authored: Wed Nov 18 19:54:30 2015 +0200
Committer: Dmytro Sen <ds...@apache.org>
Committed: Wed Nov 18 19:54:30 2015 +0200

----------------------------------------------------------------------
 .../python/ambari_agent/alerts/script_alert.py  | 32 +++++++++++++-------
 .../package/alerts/alert_checkpoint_time.py     |  7 +++--
 .../package/alerts/alert_ha_namenode_health.py  |  5 ++-
 .../package/alerts/alert_upgrade_finalized.py   |  8 ++---
 .../package/alerts/alert_hive_metastore.py      | 20 ++++++------
 .../package/alerts/alert_hive_thrift_port.py    | 22 ++++++++------
 .../package/alerts/alert_webhcat_server.py      | 23 ++++++++------
 .../package/alerts/alert_nodemanager_health.py  | 15 ++++++---
 .../alerts/alert_nodemanagers_summary.py        | 13 ++++----
 9 files changed, 87 insertions(+), 58 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/58b598a5/ambari-agent/src/main/python/ambari_agent/alerts/script_alert.py
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/python/ambari_agent/alerts/script_alert.py b/ambari-agent/src/main/python/ambari_agent/alerts/script_alert.py
index 97c3b64..e8d0125 100644
--- a/ambari-agent/src/main/python/ambari_agent/alerts/script_alert.py
+++ b/ambari-agent/src/main/python/ambari_agent/alerts/script_alert.py
@@ -27,21 +27,21 @@ from resource_management.core.environment import Environment
 from resource_management.core.logger import Logger
 from ambari_agent import Constants
 
-logger = logging.getLogger(__name__)
+logger = logging.getLogger("ambari_alerts")
 
 class ScriptAlert(BaseAlert):
   def __init__(self, alert_meta, alert_source_meta, config):
     """ ScriptAlert reporting structure is output from the script itself """
-    
+
     alert_source_meta['reporting'] = {
       'ok': { 'text': '{0}' },
       'warning': { 'text': '{0}' },
       'critical': { 'text': '{0}' },
       'unknown': { 'text': '{0}' }
     }
-    
+
     super(ScriptAlert, self).__init__(alert_meta, alert_source_meta)
-    
+
     self.config = config
     self.path = None
     self.stacks_dir = None
@@ -49,10 +49,10 @@ class ScriptAlert(BaseAlert):
     self.host_scripts_dir = None
     self.path_to_script = None
     self.parameters = {}
-    
+
     if 'path' in alert_source_meta:
       self.path = alert_source_meta['path']
-      
+
     if 'common_services_directory' in alert_source_meta:
       self.common_services_dir = alert_source_meta['common_services_directory']
 
@@ -101,12 +101,22 @@ class ScriptAlert(BaseAlert):
       if matchObj:
         basedir = matchObj.group(1)
         with Environment(basedir, tmp_dir=Constants.AGENT_TMP_DIR, logger=logging.getLogger('ambari_alerts')) as env:
-          return cmd_module.execute(configurations, self.parameters, self.host_name)
+          result = cmd_module.execute(configurations, self.parameters, self.host_name)
       else:
-        return cmd_module.execute(configurations, self.parameters, self.host_name)
+        result = cmd_module.execute(configurations, self.parameters, self.host_name)
+
+      loggerMsg = "[Alert][{0}] Failed with result {2}: {3}".format(
+        self.get_name(), self.path_to_script, result[0], result[1])
+
+      if result[0] == self.RESULT_CRITICAL:
+        logger.error(loggerMsg)
+      elif result[0] == self.RESULT_WARNING or result[0] == self.RESULT_UNKNOWN:
+        logger.debug(loggerMsg)
+
+      return result
     else:
       return (self.RESULT_UNKNOWN, ["Unable to execute script {0}".format(self.path)])
-    
+
 
   def _load_source(self):
     if self.path is None and self.stack_path is None and self.host_scripts_dir is None:
@@ -114,7 +124,7 @@ class ScriptAlert(BaseAlert):
 
     paths = self.path.split('/')
     self.path_to_script = self.path
-    
+
     # if the path doesn't exist and stacks dir is defined, try that
     if not os.path.exists(self.path_to_script) and self.stacks_dir is not None:
       self.path_to_script = os.path.join(self.stacks_dir, *paths)
@@ -127,7 +137,7 @@ class ScriptAlert(BaseAlert):
     if not os.path.exists(self.path_to_script) and self.host_scripts_dir is not None:
       self.path_to_script = os.path.join(self.host_scripts_dir, *paths)
 
-    # if the path can't be evaluated, throw exception      
+    # if the path can't be evaluated, throw exception
     if not os.path.exists(self.path_to_script) or not os.path.isfile(self.path_to_script):
       raise Exception(
         "Unable to find '{0}' as an absolute path or part of {1} or {2}".format(self.path,

http://git-wip-us.apache.org/repos/asf/ambari/blob/58b598a5/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py
index 548a838..ef389cd 100644
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py
@@ -22,6 +22,7 @@ import time
 import urllib2
 import ambari_simplejson as json # simplejson is much faster comparing to Python 2.6 json module and has the same functions set.
 import logging
+import traceback
 
 from resource_management.libraries.functions.curl_krb_request import curl_krb_request
 from resource_management.core.environment import Environment
@@ -52,7 +53,7 @@ SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}'
 SMOKEUSER_KEY = "{{cluster-env/smokeuser}}"
 EXECUTABLE_SEARCH_PATHS = '{{kerberos-env/executable_search_paths}}'
 
-logger = logging.getLogger()
+logger = logging.getLogger('ambari_alerts')
 
 def get_tokens():
   """
@@ -191,8 +192,8 @@ def execute(configurations={}, parameters={}, host_name=None):
     elif (transaction_difference > int(checkpoint_tx)) and (float(delta) / int(checkpoint_period)*100 >= int(percent_warning)):
       result_code = 'WARNING'
 
-  except Exception, e:
-    label = str(e)
+  except:
+    label = traceback.format_exc()
     result_code = 'UNKNOWN'
         
   return ((result_code, [label]))

http://git-wip-us.apache.org/repos/asf/ambari/blob/58b598a5/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py
index bfc518f..7cd5591 100644
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py
@@ -21,6 +21,7 @@ limitations under the License.
 import urllib2
 import ambari_simplejson as json # simplejson is much faster comparing to Python 2.6 json module and has the same functions set.
 import logging
+import traceback
 
 from resource_management.libraries.functions.curl_krb_request import curl_krb_request
 from resource_management.core.environment import Environment
@@ -48,7 +49,8 @@ EXECUTABLE_SEARCH_PATHS = '{{kerberos-env/executable_search_paths}}'
 CONNECTION_TIMEOUT_KEY = 'connection.timeout'
 CONNECTION_TIMEOUT_DEFAULT = 5.0
 
-logger = logging.getLogger()
+LOGGER_EXCEPTION_MESSAGE = "[Alert] NameNode High Availability Health on {0} fails:"
+logger = logging.getLogger('ambari_alerts')
 
 def get_tokens():
   """
@@ -167,6 +169,7 @@ def execute(configurations={}, parameters={}, host_name=None):
         else:
           unknown_namenodes.append(value)
       except:
+        logger.exception(LOGGER_EXCEPTION_MESSAGE.format(host_name))
         unknown_namenodes.append(value)
 
   # now that the request is done, determine if this host is the host that

http://git-wip-us.apache.org/repos/asf/ambari/blob/58b598a5/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_upgrade_finalized.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_upgrade_finalized.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_upgrade_finalized.py
index be72327..6e8945c 100644
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_upgrade_finalized.py
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_upgrade_finalized.py
@@ -21,6 +21,7 @@ limitations under the License.
 import urllib2
 import ambari_simplejson as json # simplejson is much faster comparing to Python 2.6 json module and has the same functions set.
 import logging
+import traceback
 
 from resource_management.libraries.functions.curl_krb_request import curl_krb_request, CONNECTION_TIMEOUT_DEFAULT
 from resource_management.core.environment import Environment
@@ -34,8 +35,7 @@ KERBEROS_PRINCIPAL = '{{hdfs-site/dfs.web.authentication.kerberos.principal}}'
 SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}'
 SMOKEUSER_KEY = "{{cluster-env/smokeuser}}"
 EXECUTABLE_SEARCH_PATHS = '{{kerberos-env/executable_search_paths}}'
-
-logger = logging.getLogger()
+logger = logging.getLogger('ambari_alerts')
 
 def get_tokens():
   """
@@ -138,8 +138,8 @@ def execute(configurations={}, parameters={}, host_name=None):
       label = "HDFS cluster is not finalized"
       result_code = 'CRITICAL'
 
-  except Exception, e:
-    label = str(e)
+  except:
+    label = traceback.format_exc()
     result_code = 'UNKNOWN'
 
   return ((result_code, [label]))

http://git-wip-us.apache.org/repos/asf/ambari/blob/58b598a5/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py
index 7f23d31..861c48e 100644
--- a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py
+++ b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py
@@ -21,6 +21,8 @@ limitations under the License.
 import os
 import socket
 import time
+import traceback
+import logging
 
 from resource_management.libraries.functions import format
 from resource_management.libraries.functions import get_kinit_path
@@ -30,7 +32,6 @@ from ambari_commons.os_family_impl import OsFamilyFuncImpl, OsFamilyImpl
 
 OK_MESSAGE = "Metastore OK - Hive command took {0:.3f}s"
 CRITICAL_MESSAGE = "Metastore on {0} failed ({1})"
-
 SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}'
 SMOKEUSER_KEYTAB_KEY = '{{cluster-env/smokeuser_keytab}}'
 SMOKEUSER_PRINCIPAL_KEY = '{{cluster-env/smokeuser_principal_name}}'
@@ -60,6 +61,7 @@ HIVE_BIN_DIR_LEGACY = '/usr/lib/hive/bin'
 
 HADOOPUSER_KEY = '{{cluster-env/hadoop.user.name}}'
 HADOOPUSER_DEFAULT = 'hadoop'
+logger = logging.getLogger('ambari_alerts')
 
 @OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT)
 def get_tokens():
@@ -177,12 +179,12 @@ def execute(configurations={}, parameters={}, host_name=None):
 
       result_code = 'OK'
       label = OK_MESSAGE.format(total_time)
-    except Exception, exception:
+    except:
       result_code = 'CRITICAL'
-      label = CRITICAL_MESSAGE.format(host_name, str(exception))
+      label = CRITICAL_MESSAGE.format(host_name, traceback.format_exc())
 
-  except Exception, e:
-    label = str(e)
+  except:
+    label = traceback.format_exc()
     result_code = 'UNKNOWN'
 
   return ((result_code, [label]))
@@ -236,11 +238,11 @@ def execute(configurations={}, parameters={}, host_name=None):
       total_time = time.time() - start_time
       result_code = 'OK'
       label = OK_MESSAGE.format(total_time)
-    except Exception, exception:
+    except:
       result_code = 'CRITICAL'
-      label = CRITICAL_MESSAGE.format(host_name, str(exception))
-  except Exception, e:
-    label = str(e)
+      label = CRITICAL_MESSAGE.format(host_name, traceback.format_exc())
+  except:
+    label = traceback.format_exc()
     result_code = 'UNKNOWN'
 
   return ((result_code, [label]))
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/ambari/blob/58b598a5/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_thrift_port.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_thrift_port.py b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_thrift_port.py
index 8d8bd58..a04c2a6 100644
--- a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_thrift_port.py
+++ b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_thrift_port.py
@@ -21,6 +21,8 @@ limitations under the License.
 import os
 import socket
 import time
+import logging
+import traceback
 from resource_management.libraries.functions import hive_check
 from resource_management.libraries.functions import format
 from resource_management.libraries.functions import get_kinit_path
@@ -66,6 +68,8 @@ SMOKEUSER_DEFAULT = 'ambari-qa'
 HADOOPUSER_KEY = '{{cluster-env/hadoop.user.name}}'
 HADOOPUSER_DEFAULT = 'hadoop'
 
+logger = logging.getLogger('ambari_alerts')
+
 @OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT)
 def get_tokens():
   """
@@ -189,12 +193,12 @@ def execute(configurations={}, parameters={}, host_name=None):
       result_code = 'OK'
       total_time = time.time() - start_time
       label = OK_MESSAGE.format(total_time, port)
-    except Exception, exception:
+    except:
       result_code = 'CRITICAL'
-      label = CRITICAL_MESSAGE.format(host_name, port, str(exception))
+      label = CRITICAL_MESSAGE.format(host_name, port, traceback.format_exc())
 
-  except Exception, e:
-    label = str(e)
+  except:
+    label = traceback.format_exc()
     result_code = 'UNKNOWN'
 
   return (result_code, [label])
@@ -252,11 +256,11 @@ def execute(configurations={}, parameters={}, host_name=None):
       total_time = time.time() - start_time
       result_code = 'OK'
       label = OK_MESSAGE.format(total_time, port)
-    except Exception, exception:
+    except:
       result_code = 'CRITICAL'
-      label = CRITICAL_MESSAGE.format(host_name, port, str(exception))
-  except Exception, e:
-    label = str(e)
+      label = CRITICAL_MESSAGE.format(host_name, port, traceback.format_exc())
+  except:
+    label = traceback.format_exc()
     result_code = 'UNKNOWN'
 
-  return (result_code, [label])
\ No newline at end of file
+  return (result_code, [label])

http://git-wip-us.apache.org/repos/asf/ambari/blob/58b598a5/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py
index 2c44b36..1e95703 100644
--- a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py
+++ b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py
@@ -22,6 +22,8 @@ import ambari_simplejson as json # simplejson is much faster comparing to Python
 import socket
 import time
 import urllib2
+import traceback
+import logging
 
 from resource_management.core.environment import Environment
 from resource_management.core.resources import Execute
@@ -37,8 +39,8 @@ RESULT_CODE_CRITICAL = "CRITICAL"
 RESULT_CODE_UNKNOWN = "UNKNOWN"
 
 OK_MESSAGE = "WebHCat status was OK ({0:.3f}s response from {1})"
-CRITICAL_CONNECTION_MESSAGE = "Connection failed to {0}"
-CRITICAL_HTTP_MESSAGE = "HTTP {0} response from {1}"
+CRITICAL_CONNECTION_MESSAGE = "Connection failed to {0} + \n{1}"
+CRITICAL_HTTP_MESSAGE = "HTTP {0} response from {1} \n{2}"
 CRITICAL_WEBHCAT_STATUS_MESSAGE = 'WebHCat returned an unexpected status of "{0}"'
 CRITICAL_WEBHCAT_UNKNOWN_JSON_MESSAGE = "Unable to determine WebHCat health from unexpected JSON response"
 
@@ -71,6 +73,7 @@ SMOKEUSER_PRINCIPAL_DEFAULT = 'ambari-qa@EXAMPLE.COM'
 
 # default smoke user
 SMOKEUSER_DEFAULT = 'ambari-qa'
+logger = logging.getLogger('ambari_alerts')
 
 def get_tokens():
   """
@@ -162,12 +165,12 @@ def execute(configurations={}, parameters={}, host_name=None):
 
       # 0 indicates no connection
       if response_code == 0:
-        label = CRITICAL_CONNECTION_MESSAGE.format(query_url)
+        label = CRITICAL_CONNECTION_MESSAGE.format(query_url, traceback.format_exc())
         return (RESULT_CODE_CRITICAL, [label])
 
       # any other response aside from 200 is a problem
       if response_code != 200:
-        label = CRITICAL_HTTP_MESSAGE.format(response_code, query_url)
+        label = CRITICAL_HTTP_MESSAGE.format(response_code, query_url, traceback.format_exc())
         return (RESULT_CODE_CRITICAL, [label])
 
       # now that we have the http status and it was 200, get the content
@@ -176,8 +179,8 @@ def execute(configurations={}, parameters={}, host_name=None):
                                                       False, "WebHCat Server Status", smokeuser,
                                                       connection_timeout=curl_connection_timeout)
       json_response = json.loads(stdout)
-    except Exception, exception:
-      return (RESULT_CODE_CRITICAL, [str(exception)])
+    except:
+      return (RESULT_CODE_CRITICAL, [traceback.format_exc()])
   else:
     url_response = None
 
@@ -189,10 +192,10 @@ def execute(configurations={}, parameters={}, host_name=None):
 
       json_response = json.loads(url_response.read())
     except urllib2.HTTPError as httpError:
-      label = CRITICAL_HTTP_MESSAGE.format(httpError.code, query_url)
+      label = CRITICAL_HTTP_MESSAGE.format(httpError.code, query_url, traceback.format_exc())
       return (RESULT_CODE_CRITICAL, [label])
     except:
-      label = CRITICAL_CONNECTION_MESSAGE.format(query_url)
+      label = CRITICAL_CONNECTION_MESSAGE.format(query_url, traceback.format_exc())
       return (RESULT_CODE_CRITICAL, [label])
     finally:
       if url_response is not None:
@@ -204,14 +207,14 @@ def execute(configurations={}, parameters={}, host_name=None):
 
   # if status is not in the response, we can't do any check; return CRIT
   if 'status' not in json_response:
-    return (RESULT_CODE_CRITICAL, [CRITICAL_WEBHCAT_UNKNOWN_JSON_MESSAGE])
+    return (RESULT_CODE_CRITICAL, [CRITICAL_WEBHCAT_UNKNOWN_JSON_MESSAGE + str(json_response)])
 
 
   # URL response received, parse it
   try:
     webhcat_status = json_response['status']
   except:
-    return (RESULT_CODE_CRITICAL, [CRITICAL_WEBHCAT_UNKNOWN_JSON_MESSAGE])
+    return (RESULT_CODE_CRITICAL, [CRITICAL_WEBHCAT_UNKNOWN_JSON_MESSAGE + "\n" + traceback.format_exc()])
 
 
   # proper JSON received, compare against known value

http://git-wip-us.apache.org/repos/asf/ambari/blob/58b598a5/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py b/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py
index 912969a..ef5e6b3 100644
--- a/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py
+++ b/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py
@@ -21,6 +21,8 @@ limitations under the License.
 import ambari_simplejson as json # simplejson is much faster comparing to Python 2.6 json module and has the same functions set.
 import socket
 import urllib2
+import logging
+import traceback
 from ambari_commons import OSCheck
 from ambari_commons.inet_utils import resolve_address
 from resource_management.libraries.functions.curl_krb_request import curl_krb_request
@@ -36,7 +38,7 @@ YARN_HTTP_POLICY_KEY = '{{yarn-site/yarn.http.policy}}'
 
 OK_MESSAGE = 'NodeManager Healthy'
 CRITICAL_CONNECTION_MESSAGE = 'Connection failed to {0} ({1})'
-CRITICAL_HTTP_STATUS_MESSAGE = 'HTTP {0} returned from {1} ({2})'
+CRITICAL_HTTP_STATUS_MESSAGE = 'HTTP {0} returned from {1} ({2}) \n{3}'
 CRITICAL_NODEMANAGER_STATUS_MESSAGE = 'NodeManager returned an unexpected status of "{0}"'
 CRITICAL_NODEMANAGER_UNKNOWN_JSON_MESSAGE = 'Unable to determine NodeManager health from unexpected JSON response'
 
@@ -51,6 +53,9 @@ NODEMANAGER_DEFAULT_PORT = 8042
 CONNECTION_TIMEOUT_KEY = 'connection.timeout'
 CONNECTION_TIMEOUT_DEFAULT = 5.0
 
+LOGGER_EXCEPTION_MESSAGE = "[Alert] NodeManager Health on {0} fails:"
+logger = logging.getLogger('ambari_alerts')
+
 def get_tokens():
   """
   Returns a tuple of tokens in the format {{site/property}} that will be used
@@ -161,11 +166,11 @@ def execute(configurations={}, parameters={}, host_name=None):
       json_response = json.loads(url_response.read())
   except urllib2.HTTPError, httpError:
     label = CRITICAL_HTTP_STATUS_MESSAGE.format(str(httpError.code), query,
-      str(httpError))
+      str(httpError), traceback.format_exc())
 
     return (RESULT_CODE_CRITICAL, [label])
-  except Exception, exception:
-    label = CRITICAL_CONNECTION_MESSAGE.format(query, str(exception))
+  except:
+    label = CRITICAL_CONNECTION_MESSAGE.format(query, traceback.format_exc())
     return (RESULT_CODE_CRITICAL, [label])
 
   # URL response received, parse it
@@ -176,7 +181,7 @@ def execute(configurations={}, parameters={}, host_name=None):
     # convert boolean to string
     node_healthy = str(node_healthy)
   except:
-    return (RESULT_CODE_CRITICAL, [query])
+    return (RESULT_CODE_CRITICAL, [query + "\n" + traceback.format_exc()])
   finally:
     if url_response is not None:
       try:

http://git-wip-us.apache.org/repos/asf/ambari/blob/58b598a5/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py b/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py
index 11f29fd..b9a7094 100644
--- a/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py
+++ b/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py
@@ -21,6 +21,7 @@ limitations under the License.
 import urllib2
 import ambari_simplejson as json # simplejson is much faster comparing to Python 2.6 json module and has the same functions set.
 import logging
+import traceback
 
 from ambari_commons.urllib_handlers import RefreshHeaderProcessor
 from resource_management.libraries.functions.curl_krb_request import curl_krb_request
@@ -42,7 +43,8 @@ EXECUTABLE_SEARCH_PATHS = '{{kerberos-env/executable_search_paths}}'
 CONNECTION_TIMEOUT_KEY = 'connection.timeout'
 CONNECTION_TIMEOUT_DEFAULT = 5.0
 
-logger = logging.getLogger()
+LOGGER_EXCEPTION_MESSAGE = "[Alert] NodeManager Health Summary on {0} fails:"
+logger = logging.getLogger('ambari_alerts')
 
 def get_tokens():
   """
@@ -133,9 +135,8 @@ def execute(configurations={}, parameters={}, host_name=None):
         live_nodemanagers = json.loads(url_response_json["beans"][0]["LiveNodeManagers"])
       except ValueError, error:
         convert_to_json_failed = True
-        if logger.isEnabledFor(logging.DEBUG):
-          logger.exception("[Alert][{0}] Convert response to json failed or json doesn't contain needed data: {1}".
-          format("NodeManager Health Summary", str(error)))
+        logger.exception("[Alert][{0}] Convert response to json failed or json doesn't contain needed data: {1}".
+        format("NodeManager Health Summary", str(error)))
 
       if convert_to_json_failed:
         response_code, error_msg, time_millis  = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal,
@@ -169,8 +170,8 @@ def execute(configurations={}, parameters={}, host_name=None):
       else:
         label = ERROR_LABEL.format(unhealthy_count, 's', 'are')
 
-  except Exception, e:
-    label = str(e)
+  except:
+    label = traceback.format_exc()
     result_code = 'UNKNOWN'
 
   return (result_code, [label])