You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by ds...@apache.org on 2015/11/18 18:54:40 UTC
ambari git commit: AMBARI-13905 Add error logs to script alerts when
there are failures (dsen)
Repository: ambari
Updated Branches:
refs/heads/trunk 6e7c75bb0 -> 58b598a5b
AMBARI-13905 Add error logs to script alerts when there are failures (dsen)
Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/58b598a5
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/58b598a5
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/58b598a5
Branch: refs/heads/trunk
Commit: 58b598a5b453e31a30101b27a25f8e18b52c2083
Parents: 6e7c75b
Author: Dmytro Sen <ds...@apache.org>
Authored: Wed Nov 18 19:54:30 2015 +0200
Committer: Dmytro Sen <ds...@apache.org>
Committed: Wed Nov 18 19:54:30 2015 +0200
----------------------------------------------------------------------
.../python/ambari_agent/alerts/script_alert.py | 32 +++++++++++++-------
.../package/alerts/alert_checkpoint_time.py | 7 +++--
.../package/alerts/alert_ha_namenode_health.py | 5 ++-
.../package/alerts/alert_upgrade_finalized.py | 8 ++---
.../package/alerts/alert_hive_metastore.py | 20 ++++++------
.../package/alerts/alert_hive_thrift_port.py | 22 ++++++++------
.../package/alerts/alert_webhcat_server.py | 23 ++++++++------
.../package/alerts/alert_nodemanager_health.py | 15 ++++++---
.../alerts/alert_nodemanagers_summary.py | 13 ++++----
9 files changed, 87 insertions(+), 58 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/ambari/blob/58b598a5/ambari-agent/src/main/python/ambari_agent/alerts/script_alert.py
----------------------------------------------------------------------
diff --git a/ambari-agent/src/main/python/ambari_agent/alerts/script_alert.py b/ambari-agent/src/main/python/ambari_agent/alerts/script_alert.py
index 97c3b64..e8d0125 100644
--- a/ambari-agent/src/main/python/ambari_agent/alerts/script_alert.py
+++ b/ambari-agent/src/main/python/ambari_agent/alerts/script_alert.py
@@ -27,21 +27,21 @@ from resource_management.core.environment import Environment
from resource_management.core.logger import Logger
from ambari_agent import Constants
-logger = logging.getLogger(__name__)
+logger = logging.getLogger("ambari_alerts")
class ScriptAlert(BaseAlert):
def __init__(self, alert_meta, alert_source_meta, config):
""" ScriptAlert reporting structure is output from the script itself """
-
+
alert_source_meta['reporting'] = {
'ok': { 'text': '{0}' },
'warning': { 'text': '{0}' },
'critical': { 'text': '{0}' },
'unknown': { 'text': '{0}' }
}
-
+
super(ScriptAlert, self).__init__(alert_meta, alert_source_meta)
-
+
self.config = config
self.path = None
self.stacks_dir = None
@@ -49,10 +49,10 @@ class ScriptAlert(BaseAlert):
self.host_scripts_dir = None
self.path_to_script = None
self.parameters = {}
-
+
if 'path' in alert_source_meta:
self.path = alert_source_meta['path']
-
+
if 'common_services_directory' in alert_source_meta:
self.common_services_dir = alert_source_meta['common_services_directory']
@@ -101,12 +101,22 @@ class ScriptAlert(BaseAlert):
if matchObj:
basedir = matchObj.group(1)
with Environment(basedir, tmp_dir=Constants.AGENT_TMP_DIR, logger=logging.getLogger('ambari_alerts')) as env:
- return cmd_module.execute(configurations, self.parameters, self.host_name)
+ result = cmd_module.execute(configurations, self.parameters, self.host_name)
else:
- return cmd_module.execute(configurations, self.parameters, self.host_name)
+ result = cmd_module.execute(configurations, self.parameters, self.host_name)
+
+ loggerMsg = "[Alert][{0}] Failed with result {2}: {3}".format(
+ self.get_name(), self.path_to_script, result[0], result[1])
+
+ if result[0] == self.RESULT_CRITICAL:
+ logger.error(loggerMsg)
+ elif result[0] == self.RESULT_WARNING or result[0] == self.RESULT_UNKNOWN:
+ logger.debug(loggerMsg)
+
+ return result
else:
return (self.RESULT_UNKNOWN, ["Unable to execute script {0}".format(self.path)])
-
+
def _load_source(self):
if self.path is None and self.stack_path is None and self.host_scripts_dir is None:
@@ -114,7 +124,7 @@ class ScriptAlert(BaseAlert):
paths = self.path.split('/')
self.path_to_script = self.path
-
+
# if the path doesn't exist and stacks dir is defined, try that
if not os.path.exists(self.path_to_script) and self.stacks_dir is not None:
self.path_to_script = os.path.join(self.stacks_dir, *paths)
@@ -127,7 +137,7 @@ class ScriptAlert(BaseAlert):
if not os.path.exists(self.path_to_script) and self.host_scripts_dir is not None:
self.path_to_script = os.path.join(self.host_scripts_dir, *paths)
- # if the path can't be evaluated, throw exception
+ # if the path can't be evaluated, throw exception
if not os.path.exists(self.path_to_script) or not os.path.isfile(self.path_to_script):
raise Exception(
"Unable to find '{0}' as an absolute path or part of {1} or {2}".format(self.path,
http://git-wip-us.apache.org/repos/asf/ambari/blob/58b598a5/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py
index 548a838..ef389cd 100644
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_checkpoint_time.py
@@ -22,6 +22,7 @@ import time
import urllib2
import ambari_simplejson as json # simplejson is much faster comparing to Python 2.6 json module and has the same functions set.
import logging
+import traceback
from resource_management.libraries.functions.curl_krb_request import curl_krb_request
from resource_management.core.environment import Environment
@@ -52,7 +53,7 @@ SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}'
SMOKEUSER_KEY = "{{cluster-env/smokeuser}}"
EXECUTABLE_SEARCH_PATHS = '{{kerberos-env/executable_search_paths}}'
-logger = logging.getLogger()
+logger = logging.getLogger('ambari_alerts')
def get_tokens():
"""
@@ -191,8 +192,8 @@ def execute(configurations={}, parameters={}, host_name=None):
elif (transaction_difference > int(checkpoint_tx)) and (float(delta) / int(checkpoint_period)*100 >= int(percent_warning)):
result_code = 'WARNING'
- except Exception, e:
- label = str(e)
+ except:
+ label = traceback.format_exc()
result_code = 'UNKNOWN'
return ((result_code, [label]))
http://git-wip-us.apache.org/repos/asf/ambari/blob/58b598a5/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py
index bfc518f..7cd5591 100644
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_ha_namenode_health.py
@@ -21,6 +21,7 @@ limitations under the License.
import urllib2
import ambari_simplejson as json # simplejson is much faster comparing to Python 2.6 json module and has the same functions set.
import logging
+import traceback
from resource_management.libraries.functions.curl_krb_request import curl_krb_request
from resource_management.core.environment import Environment
@@ -48,7 +49,8 @@ EXECUTABLE_SEARCH_PATHS = '{{kerberos-env/executable_search_paths}}'
CONNECTION_TIMEOUT_KEY = 'connection.timeout'
CONNECTION_TIMEOUT_DEFAULT = 5.0
-logger = logging.getLogger()
+LOGGER_EXCEPTION_MESSAGE = "[Alert] NameNode High Availability Health on {0} fails:"
+logger = logging.getLogger('ambari_alerts')
def get_tokens():
"""
@@ -167,6 +169,7 @@ def execute(configurations={}, parameters={}, host_name=None):
else:
unknown_namenodes.append(value)
except:
+ logger.exception(LOGGER_EXCEPTION_MESSAGE.format(host_name))
unknown_namenodes.append(value)
# now that the request is done, determine if this host is the host that
http://git-wip-us.apache.org/repos/asf/ambari/blob/58b598a5/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_upgrade_finalized.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_upgrade_finalized.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_upgrade_finalized.py
index be72327..6e8945c 100644
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_upgrade_finalized.py
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_upgrade_finalized.py
@@ -21,6 +21,7 @@ limitations under the License.
import urllib2
import ambari_simplejson as json # simplejson is much faster comparing to Python 2.6 json module and has the same functions set.
import logging
+import traceback
from resource_management.libraries.functions.curl_krb_request import curl_krb_request, CONNECTION_TIMEOUT_DEFAULT
from resource_management.core.environment import Environment
@@ -34,8 +35,7 @@ KERBEROS_PRINCIPAL = '{{hdfs-site/dfs.web.authentication.kerberos.principal}}'
SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}'
SMOKEUSER_KEY = "{{cluster-env/smokeuser}}"
EXECUTABLE_SEARCH_PATHS = '{{kerberos-env/executable_search_paths}}'
-
-logger = logging.getLogger()
+logger = logging.getLogger('ambari_alerts')
def get_tokens():
"""
@@ -138,8 +138,8 @@ def execute(configurations={}, parameters={}, host_name=None):
label = "HDFS cluster is not finalized"
result_code = 'CRITICAL'
- except Exception, e:
- label = str(e)
+ except:
+ label = traceback.format_exc()
result_code = 'UNKNOWN'
return ((result_code, [label]))
http://git-wip-us.apache.org/repos/asf/ambari/blob/58b598a5/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py
index 7f23d31..861c48e 100644
--- a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py
+++ b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py
@@ -21,6 +21,8 @@ limitations under the License.
import os
import socket
import time
+import traceback
+import logging
from resource_management.libraries.functions import format
from resource_management.libraries.functions import get_kinit_path
@@ -30,7 +32,6 @@ from ambari_commons.os_family_impl import OsFamilyFuncImpl, OsFamilyImpl
OK_MESSAGE = "Metastore OK - Hive command took {0:.3f}s"
CRITICAL_MESSAGE = "Metastore on {0} failed ({1})"
-
SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}'
SMOKEUSER_KEYTAB_KEY = '{{cluster-env/smokeuser_keytab}}'
SMOKEUSER_PRINCIPAL_KEY = '{{cluster-env/smokeuser_principal_name}}'
@@ -60,6 +61,7 @@ HIVE_BIN_DIR_LEGACY = '/usr/lib/hive/bin'
HADOOPUSER_KEY = '{{cluster-env/hadoop.user.name}}'
HADOOPUSER_DEFAULT = 'hadoop'
+logger = logging.getLogger('ambari_alerts')
@OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT)
def get_tokens():
@@ -177,12 +179,12 @@ def execute(configurations={}, parameters={}, host_name=None):
result_code = 'OK'
label = OK_MESSAGE.format(total_time)
- except Exception, exception:
+ except:
result_code = 'CRITICAL'
- label = CRITICAL_MESSAGE.format(host_name, str(exception))
+ label = CRITICAL_MESSAGE.format(host_name, traceback.format_exc())
- except Exception, e:
- label = str(e)
+ except:
+ label = traceback.format_exc()
result_code = 'UNKNOWN'
return ((result_code, [label]))
@@ -236,11 +238,11 @@ def execute(configurations={}, parameters={}, host_name=None):
total_time = time.time() - start_time
result_code = 'OK'
label = OK_MESSAGE.format(total_time)
- except Exception, exception:
+ except:
result_code = 'CRITICAL'
- label = CRITICAL_MESSAGE.format(host_name, str(exception))
- except Exception, e:
- label = str(e)
+ label = CRITICAL_MESSAGE.format(host_name, traceback.format_exc())
+ except:
+ label = traceback.format_exc()
result_code = 'UNKNOWN'
return ((result_code, [label]))
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/ambari/blob/58b598a5/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_thrift_port.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_thrift_port.py b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_thrift_port.py
index 8d8bd58..a04c2a6 100644
--- a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_thrift_port.py
+++ b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_thrift_port.py
@@ -21,6 +21,8 @@ limitations under the License.
import os
import socket
import time
+import logging
+import traceback
from resource_management.libraries.functions import hive_check
from resource_management.libraries.functions import format
from resource_management.libraries.functions import get_kinit_path
@@ -66,6 +68,8 @@ SMOKEUSER_DEFAULT = 'ambari-qa'
HADOOPUSER_KEY = '{{cluster-env/hadoop.user.name}}'
HADOOPUSER_DEFAULT = 'hadoop'
+logger = logging.getLogger('ambari_alerts')
+
@OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT)
def get_tokens():
"""
@@ -189,12 +193,12 @@ def execute(configurations={}, parameters={}, host_name=None):
result_code = 'OK'
total_time = time.time() - start_time
label = OK_MESSAGE.format(total_time, port)
- except Exception, exception:
+ except:
result_code = 'CRITICAL'
- label = CRITICAL_MESSAGE.format(host_name, port, str(exception))
+ label = CRITICAL_MESSAGE.format(host_name, port, traceback.format_exc())
- except Exception, e:
- label = str(e)
+ except:
+ label = traceback.format_exc()
result_code = 'UNKNOWN'
return (result_code, [label])
@@ -252,11 +256,11 @@ def execute(configurations={}, parameters={}, host_name=None):
total_time = time.time() - start_time
result_code = 'OK'
label = OK_MESSAGE.format(total_time, port)
- except Exception, exception:
+ except:
result_code = 'CRITICAL'
- label = CRITICAL_MESSAGE.format(host_name, port, str(exception))
- except Exception, e:
- label = str(e)
+ label = CRITICAL_MESSAGE.format(host_name, port, traceback.format_exc())
+ except:
+ label = traceback.format_exc()
result_code = 'UNKNOWN'
- return (result_code, [label])
\ No newline at end of file
+ return (result_code, [label])
http://git-wip-us.apache.org/repos/asf/ambari/blob/58b598a5/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py
index 2c44b36..1e95703 100644
--- a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py
+++ b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_webhcat_server.py
@@ -22,6 +22,8 @@ import ambari_simplejson as json # simplejson is much faster comparing to Python
import socket
import time
import urllib2
+import traceback
+import logging
from resource_management.core.environment import Environment
from resource_management.core.resources import Execute
@@ -37,8 +39,8 @@ RESULT_CODE_CRITICAL = "CRITICAL"
RESULT_CODE_UNKNOWN = "UNKNOWN"
OK_MESSAGE = "WebHCat status was OK ({0:.3f}s response from {1})"
-CRITICAL_CONNECTION_MESSAGE = "Connection failed to {0}"
-CRITICAL_HTTP_MESSAGE = "HTTP {0} response from {1}"
+CRITICAL_CONNECTION_MESSAGE = "Connection failed to {0} + \n{1}"
+CRITICAL_HTTP_MESSAGE = "HTTP {0} response from {1} \n{2}"
CRITICAL_WEBHCAT_STATUS_MESSAGE = 'WebHCat returned an unexpected status of "{0}"'
CRITICAL_WEBHCAT_UNKNOWN_JSON_MESSAGE = "Unable to determine WebHCat health from unexpected JSON response"
@@ -71,6 +73,7 @@ SMOKEUSER_PRINCIPAL_DEFAULT = 'ambari-qa@EXAMPLE.COM'
# default smoke user
SMOKEUSER_DEFAULT = 'ambari-qa'
+logger = logging.getLogger('ambari_alerts')
def get_tokens():
"""
@@ -162,12 +165,12 @@ def execute(configurations={}, parameters={}, host_name=None):
# 0 indicates no connection
if response_code == 0:
- label = CRITICAL_CONNECTION_MESSAGE.format(query_url)
+ label = CRITICAL_CONNECTION_MESSAGE.format(query_url, traceback.format_exc())
return (RESULT_CODE_CRITICAL, [label])
# any other response aside from 200 is a problem
if response_code != 200:
- label = CRITICAL_HTTP_MESSAGE.format(response_code, query_url)
+ label = CRITICAL_HTTP_MESSAGE.format(response_code, query_url, traceback.format_exc())
return (RESULT_CODE_CRITICAL, [label])
# now that we have the http status and it was 200, get the content
@@ -176,8 +179,8 @@ def execute(configurations={}, parameters={}, host_name=None):
False, "WebHCat Server Status", smokeuser,
connection_timeout=curl_connection_timeout)
json_response = json.loads(stdout)
- except Exception, exception:
- return (RESULT_CODE_CRITICAL, [str(exception)])
+ except:
+ return (RESULT_CODE_CRITICAL, [traceback.format_exc()])
else:
url_response = None
@@ -189,10 +192,10 @@ def execute(configurations={}, parameters={}, host_name=None):
json_response = json.loads(url_response.read())
except urllib2.HTTPError as httpError:
- label = CRITICAL_HTTP_MESSAGE.format(httpError.code, query_url)
+ label = CRITICAL_HTTP_MESSAGE.format(httpError.code, query_url, traceback.format_exc())
return (RESULT_CODE_CRITICAL, [label])
except:
- label = CRITICAL_CONNECTION_MESSAGE.format(query_url)
+ label = CRITICAL_CONNECTION_MESSAGE.format(query_url, traceback.format_exc())
return (RESULT_CODE_CRITICAL, [label])
finally:
if url_response is not None:
@@ -204,14 +207,14 @@ def execute(configurations={}, parameters={}, host_name=None):
# if status is not in the response, we can't do any check; return CRIT
if 'status' not in json_response:
- return (RESULT_CODE_CRITICAL, [CRITICAL_WEBHCAT_UNKNOWN_JSON_MESSAGE])
+ return (RESULT_CODE_CRITICAL, [CRITICAL_WEBHCAT_UNKNOWN_JSON_MESSAGE + str(json_response)])
# URL response received, parse it
try:
webhcat_status = json_response['status']
except:
- return (RESULT_CODE_CRITICAL, [CRITICAL_WEBHCAT_UNKNOWN_JSON_MESSAGE])
+ return (RESULT_CODE_CRITICAL, [CRITICAL_WEBHCAT_UNKNOWN_JSON_MESSAGE + "\n" + traceback.format_exc()])
# proper JSON received, compare against known value
http://git-wip-us.apache.org/repos/asf/ambari/blob/58b598a5/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py b/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py
index 912969a..ef5e6b3 100644
--- a/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py
+++ b/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanager_health.py
@@ -21,6 +21,8 @@ limitations under the License.
import ambari_simplejson as json # simplejson is much faster comparing to Python 2.6 json module and has the same functions set.
import socket
import urllib2
+import logging
+import traceback
from ambari_commons import OSCheck
from ambari_commons.inet_utils import resolve_address
from resource_management.libraries.functions.curl_krb_request import curl_krb_request
@@ -36,7 +38,7 @@ YARN_HTTP_POLICY_KEY = '{{yarn-site/yarn.http.policy}}'
OK_MESSAGE = 'NodeManager Healthy'
CRITICAL_CONNECTION_MESSAGE = 'Connection failed to {0} ({1})'
-CRITICAL_HTTP_STATUS_MESSAGE = 'HTTP {0} returned from {1} ({2})'
+CRITICAL_HTTP_STATUS_MESSAGE = 'HTTP {0} returned from {1} ({2}) \n{3}'
CRITICAL_NODEMANAGER_STATUS_MESSAGE = 'NodeManager returned an unexpected status of "{0}"'
CRITICAL_NODEMANAGER_UNKNOWN_JSON_MESSAGE = 'Unable to determine NodeManager health from unexpected JSON response'
@@ -51,6 +53,9 @@ NODEMANAGER_DEFAULT_PORT = 8042
CONNECTION_TIMEOUT_KEY = 'connection.timeout'
CONNECTION_TIMEOUT_DEFAULT = 5.0
+LOGGER_EXCEPTION_MESSAGE = "[Alert] NodeManager Health on {0} fails:"
+logger = logging.getLogger('ambari_alerts')
+
def get_tokens():
"""
Returns a tuple of tokens in the format {{site/property}} that will be used
@@ -161,11 +166,11 @@ def execute(configurations={}, parameters={}, host_name=None):
json_response = json.loads(url_response.read())
except urllib2.HTTPError, httpError:
label = CRITICAL_HTTP_STATUS_MESSAGE.format(str(httpError.code), query,
- str(httpError))
+ str(httpError), traceback.format_exc())
return (RESULT_CODE_CRITICAL, [label])
- except Exception, exception:
- label = CRITICAL_CONNECTION_MESSAGE.format(query, str(exception))
+ except:
+ label = CRITICAL_CONNECTION_MESSAGE.format(query, traceback.format_exc())
return (RESULT_CODE_CRITICAL, [label])
# URL response received, parse it
@@ -176,7 +181,7 @@ def execute(configurations={}, parameters={}, host_name=None):
# convert boolean to string
node_healthy = str(node_healthy)
except:
- return (RESULT_CODE_CRITICAL, [query])
+ return (RESULT_CODE_CRITICAL, [query + "\n" + traceback.format_exc()])
finally:
if url_response is not None:
try:
http://git-wip-us.apache.org/repos/asf/ambari/blob/58b598a5/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py b/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py
index 11f29fd..b9a7094 100644
--- a/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py
+++ b/ambari-server/src/main/resources/common-services/YARN/2.1.0.2.0/package/alerts/alert_nodemanagers_summary.py
@@ -21,6 +21,7 @@ limitations under the License.
import urllib2
import ambari_simplejson as json # simplejson is much faster comparing to Python 2.6 json module and has the same functions set.
import logging
+import traceback
from ambari_commons.urllib_handlers import RefreshHeaderProcessor
from resource_management.libraries.functions.curl_krb_request import curl_krb_request
@@ -42,7 +43,8 @@ EXECUTABLE_SEARCH_PATHS = '{{kerberos-env/executable_search_paths}}'
CONNECTION_TIMEOUT_KEY = 'connection.timeout'
CONNECTION_TIMEOUT_DEFAULT = 5.0
-logger = logging.getLogger()
+LOGGER_EXCEPTION_MESSAGE = "[Alert] NodeManager Health Summary on {0} fails:"
+logger = logging.getLogger('ambari_alerts')
def get_tokens():
"""
@@ -133,9 +135,8 @@ def execute(configurations={}, parameters={}, host_name=None):
live_nodemanagers = json.loads(url_response_json["beans"][0]["LiveNodeManagers"])
except ValueError, error:
convert_to_json_failed = True
- if logger.isEnabledFor(logging.DEBUG):
- logger.exception("[Alert][{0}] Convert response to json failed or json doesn't contain needed data: {1}".
- format("NodeManager Health Summary", str(error)))
+ logger.exception("[Alert][{0}] Convert response to json failed or json doesn't contain needed data: {1}".
+ format("NodeManager Health Summary", str(error)))
if convert_to_json_failed:
response_code, error_msg, time_millis = curl_krb_request(env.tmp_dir, kerberos_keytab, kerberos_principal,
@@ -169,8 +170,8 @@ def execute(configurations={}, parameters={}, host_name=None):
else:
label = ERROR_LABEL.format(unhealthy_count, 's', 'are')
- except Exception, e:
- label = str(e)
+ except:
+ label = traceback.format_exc()
result_code = 'UNKNOWN'
return (result_code, [label])