You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by ds...@apache.org on 2016/02/11 18:47:01 UTC
[1/2] ambari git commit: AMBARI-14800 Alerts: HDFS alerts based on
AMS metrics (dsen)
Repository: ambari
Updated Branches:
refs/heads/branch-2.2 644d4a2c3 -> f99db40c0
AMBARI-14800 Alerts: HDFS alerts based on AMS metrics (dsen)
Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/749821e4
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/749821e4
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/749821e4
Branch: refs/heads/branch-2.2
Commit: 749821e4ea1f88e9b971bc24e81c037bac9bc1c8
Parents: 644d4a2
Author: Dmytro Sen <ds...@apache.org>
Authored: Tue Jan 26 20:03:26 2016 +0200
Committer: Dmytro Sen <ds...@apache.org>
Committed: Thu Feb 11 19:44:28 2016 +0200
----------------------------------------------------------------------
.../common-services/HDFS/2.1.0.2.0/alerts.json | 464 +++++++++++++++++++
.../package/alerts/alert_metrics_deviation.py | 357 ++++++++++++++
2 files changed, 821 insertions(+)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/ambari/blob/749821e4/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json
index 1eda00f..bba6c11 100644
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json
@@ -520,6 +520,470 @@
}
]
}
+ },
+ {
+ "name": "increase_nn_heap_usage_hourly",
+ "label": "Hourly increase in NN heap usage",
+ "description": "This service-level alert is triggered if the NN heap usage deviation has grown beyond the specified threshold within a given time interval.",
+ "interval": 5,
+ "scope": "ANY",
+ "enabled": false,
+ "source": {
+ "type": "SCRIPT",
+ "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
+ "parameters": [
+ {
+ "name": "mergeHaMetrics",
+ "display_name": "Whether active and standby NameNodes metrics should be merged",
+ "value": "false",
+ "type": "STRING",
+ "description": "Whether active and standby NameNodes metrics should be merged."
+ },
+ {
+ "name": "interval",
+ "display_name": "Time interval in minutes",
+ "value": 60,
+ "type": "NUMERIC",
+ "description": "Time interval in minutes."
+ },
+ {
+ "name": "appId",
+ "display_name": "AMS application id",
+ "value": "NAMENODE",
+ "type": "STRING",
+ "description": "The application id used to retrieve the metric."
+ },
+ {
+ "name": "metricName",
+ "display_name": "Metric Name",
+ "value": "jvm.JvmMetrics.MemHeapUsedM",
+ "type": "STRING",
+ "description": "The metric to monitor."
+ },
+ {
+ "name": "metric.deviation.warning.threshold",
+ "display_name": "The standard deviation threshold above which a warning is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 100,
+ "threshold": "WARNING"
+ },
+ {
+ "name": "metric.deviation.critical.threshold",
+ "display_name": "The standard deviation threshold above which a critical alert is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 200,
+ "threshold": "CRITICAL"
+ }
+ ]
+ }
+ },
+ {
+ "name": "namenode_service_rpc_latency_hourly",
+ "label": "Hourly Service-RPC latency",
+ "description": "This service-level alert is triggered if the Service-RPC latency deviation has grown beyond the specified threshold within a given time interval.",
+ "interval": 5,
+ "scope": "ANY",
+ "enabled": false,
+ "source": {
+ "type": "SCRIPT",
+ "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
+ "parameters": [
+ {
+ "name": "mergeHaMetrics",
+ "display_name": "Whether active and standby NameNodes metrics should be merged",
+ "value": "false",
+ "type": "STRING",
+ "description": "Whether active and standby NameNodes metrics should be merged."
+ },
+ {
+ "name": "interval",
+ "display_name": "Time interval in minutes",
+ "value": 60,
+ "type": "NUMERIC",
+ "description": "Time interval in minutes."
+ },
+ {
+ "name": "appId",
+ "display_name": "AMS application id",
+ "value": "NAMENODE",
+ "type": "STRING",
+ "description": "The application id used to retrieve the metric."
+ },
+ {
+ "name": "metricName",
+ "display_name": "Metric Name",
+ "value": "rpc.rpc.RpcProcessingTimeAvgTime",
+ "type": "STRING",
+ "description": "The metric to monitor."
+ },
+ {
+ "name": "metric.deviation.warning.threshold",
+ "display_name": "The standard deviation threshold above which a warning is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 100,
+ "threshold": "WARNING"
+ },
+ {
+ "name": "metric.deviation.critical.threshold",
+ "display_name": "The standard deviation threshold above which a critical alert is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 200,
+ "threshold": "CRITICAL"
+ }
+ ]
+ }
+ },
+ {
+ "name": "namenode_increase_in_storage_capacity_usage_hourly",
+ "label": "Hourly increase in storage capacity usage",
+ "description": "This service-level alert is triggered if the increase in storage capacity usage deviation has grown beyond the specified threshold within a given time interval.",
+ "interval": 5,
+ "scope": "ANY",
+ "enabled": false,
+ "source": {
+ "type": "SCRIPT",
+ "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
+ "parameters": [
+ {
+ "name": "mergeHaMetrics",
+ "display_name": "Whether active and standby NameNodes metrics should be merged",
+ "value": "true",
+ "type": "STRING",
+ "description": "Whether active and standby NameNodes metrics should be merged."
+ },
+ {
+ "name": "interval",
+ "display_name": "Time interval in minutes",
+ "value": 60,
+ "type": "NUMERIC",
+ "description": "Time interval in minutes."
+ },
+ {
+ "name": "appId",
+ "display_name": "AMS application id",
+ "value": "NAMENODE",
+ "type": "STRING",
+ "description": "The application id used to retrieve the metric."
+ },
+ {
+ "name": "metricName",
+ "display_name": "Metric Name",
+ "value": "dfs.FSNamesystem.CapacityUsed",
+ "type": "STRING",
+ "description": "The metric to monitor."
+ },
+ {
+ "name": "metric.deviation.warning.threshold",
+ "display_name": "The standard deviation threshold above which a warning is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 100,
+ "threshold": "WARNING"
+ },
+ {
+ "name": "metric.deviation.critical.threshold",
+ "display_name": "The standard deviation threshold above which a critical alert is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 200,
+ "threshold": "CRITICAL"
+ }
+ ]
+ }
+ },
+ {
+ "name": "increase_nn_heap_usage_daily",
+ "label": "Daily increase in NN heap usage",
+ "description": "This service-level alert is triggered if the NN heap usage deviation has grown beyond the specified threshold within a given time interval.",
+ "interval": 480,
+ "scope": "ANY",
+ "enabled": false,
+ "source": {
+ "type": "SCRIPT",
+ "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
+ "parameters": [
+ {
+ "name": "mergeHaMetrics",
+ "display_name": "Whether active and standby NameNodes metrics should be merged",
+ "value": "false",
+ "type": "STRING",
+ "description": "Whether active and standby NameNodes metrics should be merged."
+ },
+ {
+ "name": "interval",
+ "display_name": "Time interval in minutes",
+ "value": 1440,
+ "type": "NUMERIC",
+ "description": "Time interval in minutes."
+ },
+ {
+ "name": "appId",
+ "display_name": "AMS application id",
+ "value": "NAMENODE",
+ "type": "STRING",
+ "description": "The application id used to retrieve the metric."
+ },
+ {
+ "name": "metricName",
+ "display_name": "Metric Name",
+ "value": "jvm.JvmMetrics.MemHeapUsedM",
+ "type": "STRING",
+ "description": "The metric to monitor."
+ },
+ {
+ "name": "metric.deviation.warning.threshold",
+ "display_name": "The standard deviation threshold above which a warning is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 100,
+ "threshold": "WARNING"
+ },
+ {
+ "name": "metric.deviation.critical.threshold",
+ "display_name": "The standard deviation threshold above which a critical alert is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 200,
+ "threshold": "CRITICAL"
+ }
+ ]
+ }
+ },
+ {
+ "name": "namenode_service_rpc_latency_daily",
+ "label": "Daily Service-RPC latency",
+ "description": "This service-level alert is triggered if the Service-RPC latency deviation has grown beyond the specified threshold within a given time interval.",
+ "interval": 480,
+ "scope": "ANY",
+ "enabled": false,
+ "source": {
+ "type": "SCRIPT",
+ "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
+ "parameters": [
+ {
+ "name": "mergeHaMetrics",
+ "display_name": "Whether active and standby NameNodes metrics should be merged",
+ "value": "false",
+ "type": "STRING",
+ "description": "Whether active and standby NameNodes metrics should be merged."
+ },
+ {
+ "name": "interval",
+ "display_name": "Time interval in minutes",
+ "value": 1440,
+ "type": "NUMERIC",
+ "description": "Time interval in minutes."
+ },
+ {
+ "name": "appId",
+ "display_name": "AMS application id",
+ "value": "NAMENODE",
+ "type": "STRING",
+ "description": "The application id used to retrieve the metric."
+ },
+ {
+ "name": "metricName",
+ "display_name": "Metric Name",
+ "value": "rpc.rpc.RpcProcessingTimeAvgTime",
+ "type": "STRING",
+ "description": "The metric to monitor."
+ },
+ {
+ "name": "metric.deviation.warning.threshold",
+ "display_name": "The standard deviation threshold above which a warning is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 100,
+ "threshold": "WARNING"
+ },
+ {
+ "name": "metric.deviation.critical.threshold",
+ "display_name": "The standard deviation threshold above which a critical alert is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 200,
+ "threshold": "CRITICAL"
+ }
+ ]
+ }
+ },
+ {
+ "name": "namenode_increase_in_storage_capacity_usage_daily",
+ "label": "Daily increase in storage capacity usage",
+ "description": "This service-level alert is triggered if the increase in storage capacity usage deviation has grown beyond the specified threshold within a given time interval.",
+ "interval": 480,
+ "scope": "ANY",
+ "enabled": false,
+ "source": {
+ "type": "SCRIPT",
+ "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
+ "parameters": [
+ {
+ "name": "mergeHaMetrics",
+ "display_name": "Whether active and standby NameNodes metrics should be merged",
+ "value": "true",
+ "type": "STRING",
+ "description": "Whether active and standby NameNodes metrics should be merged."
+ },
+ {
+ "name": "interval",
+ "display_name": "Time interval in minutes",
+ "value": 1440,
+ "type": "NUMERIC",
+ "description": "Time interval in minutes."
+ },
+ {
+ "name": "appId",
+ "display_name": "AMS application id",
+ "value": "NAMENODE",
+ "type": "STRING",
+ "description": "The application id used to retrieve the metric."
+ },
+ {
+ "name": "metricName",
+ "display_name": "Metric Name",
+ "value": "dfs.FSNamesystem.CapacityUsed",
+ "type": "STRING",
+ "description": "The metric to monitor."
+ },
+ {
+ "name": "metric.deviation.warning.threshold",
+ "display_name": "The standard deviation threshold above which a warning is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 100,
+ "threshold": "WARNING"
+ },
+ {
+ "name": "metric.deviation.critical.threshold",
+ "display_name": "The standard deviation threshold above which a critical alert is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 200,
+ "threshold": "CRITICAL"
+ }
+ ]
+ }
+ },
+ {
+ "name": "increase_nn_heap_usage_weekly",
+ "label": "Weekly increase in NN heap usage",
+ "description": "This service-level alert is triggered if the NN heap usage deviation has grown beyond the specified threshold within a given time interval.",
+ "interval": 1440,
+ "scope": "ANY",
+ "enabled": false,
+ "source": {
+ "type": "SCRIPT",
+ "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
+ "parameters": [
+ {
+ "name": "mergeHaMetrics",
+ "display_name": "Whether active and standby NameNodes metrics should be merged",
+ "value": "false",
+ "type": "STRING",
+ "description": "Whether active and standby NameNodes metrics should be merged."
+ },
+ {
+ "name": "interval",
+ "display_name": "Time interval in minutes",
+ "value": 10080,
+ "type": "NUMERIC",
+ "description": "Time interval in minutes."
+ },
+ {
+ "name": "appId",
+ "display_name": "AMS application id",
+ "value": "NAMENODE",
+ "type": "STRING",
+ "description": "The application id used to retrieve the metric."
+ },
+ {
+ "name": "metricName",
+ "display_name": "Metric Name",
+ "value": "jvm.JvmMetrics.MemHeapUsedM",
+ "type": "STRING",
+ "description": "The metric to monitor."
+ },
+ {
+ "name": "metric.deviation.warning.threshold",
+ "display_name": "The standard deviation threshold above which a warning is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 100,
+ "threshold": "WARNING"
+ },
+ {
+ "name": "metric.deviation.critical.threshold",
+ "display_name": "The standard deviation threshold above which a critical alert is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 200,
+ "threshold": "CRITICAL"
+ }
+ ]
+ }
+ },
+ {
+ "name": "namenode_increase_in_storage_capacity_usage_weekly",
+ "label": "Weekly increase in storage capacity usage",
+ "description": "This service-level alert is triggered if the increase in storage capacity usage deviation has grown beyond the specified threshold within a given time interval.",
+ "interval": 1440,
+ "scope": "ANY",
+ "enabled": false,
+ "source": {
+ "type": "SCRIPT",
+ "path": "HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py",
+ "parameters": [
+ {
+ "name": "mergeHaMetrics",
+ "display_name": "Whether active and standby NameNodes metrics should be merged",
+ "value": "true",
+ "type": "STRING",
+ "description": "Whether active and standby NameNodes metrics should be merged."
+ },
+ {
+ "name": "interval",
+ "display_name": "Time interval in minutes",
+ "value": 10080,
+ "type": "NUMERIC",
+ "description": "Time interval in minutes."
+ },
+ {
+ "name": "appId",
+ "display_name": "AMS application id",
+ "value": "NAMENODE",
+ "type": "STRING",
+ "description": "The application id used to retrieve the metric."
+ },
+ {
+ "name": "metricName",
+ "display_name": "Metric Name",
+ "value": "dfs.FSNamesystem.CapacityUsed",
+ "type": "STRING",
+ "description": "The metric to monitor."
+ },
+ {
+ "name": "metric.deviation.warning.threshold",
+ "display_name": "The standard deviation threshold above which a warning is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 100,
+ "threshold": "WARNING"
+ },
+ {
+ "name": "metric.deviation.critical.threshold",
+ "display_name": "The standard deviation threshold above which a critical alert is produced.",
+ "type": "PERCENT",
+ "units": "%",
+ "value": 200,
+ "threshold": "CRITICAL"
+ }
+ ]
+ }
}
],
"SECONDARY_NAMENODE": [
http://git-wip-us.apache.org/repos/asf/ambari/blob/749821e4/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py
new file mode 100644
index 0000000..217f3b8
--- /dev/null
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py
@@ -0,0 +1,357 @@
+#!/usr/bin/env python
+
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+import httplib
+
+import json
+import logging
+from math import sqrt
+import urllib
+import time
+import urllib2
+from resource_management import Environment, curl_krb_request
+
+RESULT_STATE_OK = 'OK'
+RESULT_STATE_CRITICAL = 'CRITICAL'
+RESULT_STATE_WARNING = 'WARNING'
+RESULT_STATE_UNKNOWN = 'UNKNOWN'
+RESULT_STATE_SKIPPED = 'SKIPPED'
+
+HDFS_NN_STATE_ACTIVE = 'active'
+HDFS_NN_STATE_STANDBY = 'standby'
+
+HDFS_SITE_KEY = '{{hdfs-site}}'
+NAMESERVICE_KEY = '{{hdfs-site/dfs.nameservices}}'
+NN_HTTP_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.http-address}}'
+NN_HTTPS_ADDRESS_KEY = '{{hdfs-site/dfs.namenode.https-address}}'
+DFS_POLICY_KEY = '{{hdfs-site/dfs.http.policy}}'
+
+KERBEROS_KEYTAB = '{{hdfs-site/dfs.web.authentication.kerberos.keytab}}'
+KERBEROS_PRINCIPAL = '{{hdfs-site/dfs.web.authentication.kerberos.principal}}'
+SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}'
+SMOKEUSER_KEY = '{{cluster-env/smokeuser}}'
+EXECUTABLE_SEARCH_PATHS = '{{kerberos-env/executable_search_paths}}'
+
+METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY = '{{ams-site/timeline.metrics.service.webapp.address}}'
+
+CONNECTION_TIMEOUT_KEY = 'connection.timeout'
+CONNECTION_TIMEOUT_DEFAULT = 5.0
+
+MERGE_HA_METRICS_PARAM_KEY = 'mergeHaMetrics'
+MERGE_HA_METRICS_PARAM_DEFAULT = False
+METRIC_NAME_PARAM_KEY = 'metricName'
+METRIC_NAME_PARAM_DEFAULT = ''
+APP_ID_PARAM_KEY = 'appId'
+APP_ID_PARAM_DEFAULT = 'NAMENODE'
+INTERVAL_PARAM_KEY = 'interval'
+INTERVAL_PARAM_DEFAULT = 60
+DEVIATION_CRITICAL_THRESHOLD_KEY = 'metric.deviation.critical.threshold'
+DEVIATION_CRITICAL_THRESHOLD_DEFAULT = 10
+DEVIATION_WARNING_THRESHOLD_KEY = 'metric.deviation.warning.threshold'
+DEVIATION_WARNING_THRESHOLD_DEFAULT = 5
+
+AMS_METRICS_GET_URL = "/ws/v1/timeline/metrics?%s"
+
+logger = logging.getLogger()
+
+def get_tokens():
+ """
+ Returns a tuple of tokens in the format {{site/property}} that will be used
+ to build the dictionary passed into execute
+ """
+ return (HDFS_SITE_KEY, NAMESERVICE_KEY, NN_HTTP_ADDRESS_KEY, DFS_POLICY_KEY,
+ EXECUTABLE_SEARCH_PATHS, NN_HTTPS_ADDRESS_KEY, SMOKEUSER_KEY,
+ KERBEROS_KEYTAB, KERBEROS_PRINCIPAL, SECURITY_ENABLED_KEY,
+ METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY)
+
+def execute(configurations={}, parameters={}, host_name=None):
+ """
+ Returns a tuple containing the result code and a pre-formatted result label
+
+ Keyword arguments:
+ configurations : a mapping of configuration key to value
+ parameters : a mapping of script parameter key to value
+ host_name : the name of this host where the alert is running
+
+ :type configurations dict
+ :type parameters dict
+ :type host_name str
+ """
+ hostnames = host_name
+ current_time = int(time.time()) * 1000
+
+ # parse script arguments
+ connection_timeout = CONNECTION_TIMEOUT_DEFAULT
+ if CONNECTION_TIMEOUT_KEY in parameters:
+ connection_timeout = float(parameters[CONNECTION_TIMEOUT_KEY])
+
+ merge_ha_metrics = MERGE_HA_METRICS_PARAM_DEFAULT
+ if MERGE_HA_METRICS_PARAM_KEY in parameters:
+ merge_ha_metrics = parameters[MERGE_HA_METRICS_PARAM_KEY].lower() == 'true'
+
+ metric_name = METRIC_NAME_PARAM_DEFAULT
+ if METRIC_NAME_PARAM_KEY in parameters:
+ metric_name = parameters[METRIC_NAME_PARAM_KEY]
+
+ app_id = APP_ID_PARAM_DEFAULT
+ if APP_ID_PARAM_KEY in parameters:
+ app_id = parameters[APP_ID_PARAM_KEY]
+
+ interval = INTERVAL_PARAM_DEFAULT
+ if INTERVAL_PARAM_KEY in parameters:
+ interval = int(parameters[INTERVAL_PARAM_KEY])
+
+ warning_threshold = DEVIATION_WARNING_THRESHOLD_DEFAULT
+ if DEVIATION_WARNING_THRESHOLD_KEY in parameters:
+ warning_threshold = int(parameters[DEVIATION_WARNING_THRESHOLD_KEY])
+
+ critical_threshold = DEVIATION_CRITICAL_THRESHOLD_DEFAULT
+ if DEVIATION_CRITICAL_THRESHOLD_KEY in parameters:
+ critical_threshold = int(parameters[DEVIATION_CRITICAL_THRESHOLD_KEY])
+
+ #parse configuration
+ if configurations is None:
+ return (RESULT_STATE_UNKNOWN, ['There were no configurations supplied to the script.'])
+
+ # hdfs-site is required
+ if not HDFS_SITE_KEY in configurations:
+ return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])
+
+ # ams-site/timeline.metrics.service.webapp.address is required
+ if not METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY in configurations:
+ return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY)])
+ else:
+ collector_webapp_address = configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY].split(":")
+ if valid_collector_webapp_address(collector_webapp_address):
+ collector_host = collector_webapp_address[0]
+ collector_port = int(collector_webapp_address[1])
+ else:
+ return (RESULT_STATE_UNKNOWN, ['{0} value should be set as "fqdn_hostname:port", but set to {1}'.format(METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY, configurations[METRICS_COLLECTOR_WEBAPP_ADDRESS_KEY])])
+
+ # if namenode alert and HA mode
+ if NAMESERVICE_KEY in configurations and app_id.lower() == 'namenode':
+ # hdfs-site is required
+ if not HDFS_SITE_KEY in configurations:
+ return (RESULT_STATE_UNKNOWN, ['{0} is a required parameter for the script'.format(HDFS_SITE_KEY)])
+
+ if SMOKEUSER_KEY in configurations:
+ smokeuser = configurations[SMOKEUSER_KEY]
+
+ executable_paths = None
+ if EXECUTABLE_SEARCH_PATHS in configurations:
+ executable_paths = configurations[EXECUTABLE_SEARCH_PATHS]
+
+ # parse script arguments
+ security_enabled = False
+ if SECURITY_ENABLED_KEY in configurations:
+ security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'
+
+ kerberos_keytab = None
+ if KERBEROS_KEYTAB in configurations:
+ kerberos_keytab = configurations[KERBEROS_KEYTAB]
+
+ kerberos_principal = None
+ if KERBEROS_PRINCIPAL in configurations:
+ kerberos_principal = configurations[KERBEROS_PRINCIPAL]
+ kerberos_principal = kerberos_principal.replace('_HOST', host_name)
+
+ # determine whether or not SSL is enabled
+ is_ssl_enabled = False
+ if DFS_POLICY_KEY in configurations:
+ dfs_policy = configurations[DFS_POLICY_KEY]
+ if dfs_policy == "HTTPS_ONLY":
+ is_ssl_enabled = True
+
+ name_service = configurations[NAMESERVICE_KEY]
+ hdfs_site = configurations[HDFS_SITE_KEY]
+
+ # look for dfs.ha.namenodes.foo
+ nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service
+ if not nn_unique_ids_key in hdfs_site:
+ return (RESULT_STATE_UNKNOWN, ['Unable to find unique NameNode alias key {0}'.format(nn_unique_ids_key)])
+
+ namenode_http_fragment = 'dfs.namenode.http-address.{0}.{1}'
+ jmx_uri_fragment = "http://{0}/jmx?qry=Hadoop:service=NameNode,name=*"
+
+ if is_ssl_enabled:
+ namenode_http_fragment = 'dfs.namenode.https-address.{0}.{1}'
+ jmx_uri_fragment = "https://{0}/jmx?qry=Hadoop:service=NameNode,name=*"
+
+ # now we have something like 'nn1,nn2,nn3,nn4'
+ # turn it into dfs.namenode.[property].[dfs.nameservices].[nn_unique_id]
+ # ie dfs.namenode.http-address.hacluster.nn1
+ namenodes = []
+ active_namenodes = []
+ nn_unique_ids = hdfs_site[nn_unique_ids_key].split(',')
+ for nn_unique_id in nn_unique_ids:
+ key = namenode_http_fragment.format(name_service,nn_unique_id)
+
+ if key in hdfs_site:
+ # use str() to ensure that unicode strings do not have the u' in them
+ value = str(hdfs_site[key])
+ namenode = str(hdfs_site[key]).split(":")[0]
+
+ namenodes.append(namenode)
+ try:
+ jmx_uri = jmx_uri_fragment.format(value)
+ if kerberos_principal is not None and kerberos_keytab is not None and security_enabled:
+ env = Environment.get_instance()
+
+ # curl requires an integer timeout
+ curl_connection_timeout = int(connection_timeout)
+ state_response, error_msg, time_millis = curl_krb_request(env.tmp_dir,
+ kerberos_keytab, kerberos_principal, jmx_uri,"ha_nn_health", executable_paths, False,
+ "NameNode High Availability Health", smokeuser, connection_timeout=curl_connection_timeout)
+
+ state = _get_ha_state_from_json(state_response)
+ else:
+ state_response = get_jmx(jmx_uri, connection_timeout)
+ state = _get_ha_state_from_json(state_response)
+
+ if state == HDFS_NN_STATE_ACTIVE:
+ active_namenodes.append(namenode)
+ except:
+ logger.exception("Unable to determine active NameNode")
+
+
+ if merge_ha_metrics:
+ hostnames = ",".join(namenodes)
+ # run only on active NN, no need to run the same requests from the standby NameNodes
+ if host_name not in active_namenodes:
+ return (RESULT_STATE_SKIPPED, ['Another host will report this alert'])
+
+ get_metrics_parameters = {
+ "metricNames": metric_name,
+ "appId": app_id,
+ "hostname": hostnames,
+ "startTime": current_time - interval*60*1000,
+ "endTime": current_time,
+ "grouped": "true",
+ }
+
+ encoded_get_metrics_parameters = urllib.urlencode(get_metrics_parameters)
+
+ try:
+ conn = httplib.HTTPConnection(collector_host, int(collector_port),
+ timeout=connection_timeout)
+ conn.request("GET", AMS_METRICS_GET_URL % encoded_get_metrics_parameters)
+ response = conn.getresponse()
+ data = response.read()
+ conn.close()
+ except Exception:
+ return (RESULT_STATE_UNKNOWN, ["Unable to retrieve metrics from AMS."])
+
+ if response.status != 200:
+ return (RESULT_STATE_UNKNOWN, ["Unable to retrieve metrics from AMS."])
+
+ data_json = json.loads(data)
+ metrics = []
+ # will get large standard deviation for multiple hosts,
+ # if host1 reports small local values, but host2 reports large local values
+ for metrics_data in data_json["metrics"]:
+ metrics += metrics_data["metrics"].values()
+
+ if not metrics or len(metrics) < 2:
+ return (RESULT_STATE_UNKNOWN, ["Unable to calculate the standard deviation for {0} datapoints".format(len(metrics))])
+
+ mean = calculate_mean(metrics)
+ stddev = calulate_sample_std_deviation(metrics)
+
+ try:
+ deviation_percent = stddev/mean*100
+ except ZeroDivisionError:
+ # should not be a case for this alert
+ return (RESULT_STATE_UNKNOWN, ["Unable to calculate the standard deviation percentage. The mean value is 0"])
+
+ logger.debug("""
+ AMS request parameters - {0}
+ AMS response - {1}
+ Mean - {2}
+ Standard deviation - {3}
+ Percentage standard deviation - {4}
+ """.format(encoded_get_metrics_parameters, data_json, mean, stddev, deviation_percent))
+
+ if deviation_percent > critical_threshold:
+ return (RESULT_STATE_CRITICAL,['CRITICAL. Percentage standard deviation value {0}% is beyond the critical threshold of {1}%'.format("%.2f" % deviation_percent, "%.2f" % critical_threshold)])
+ if deviation_percent > warning_threshold:
+ return (RESULT_STATE_WARNING,['WARNING. Percentage standard deviation value {0}% is beyond the warning threshold of {1}%'.format("%.2f" % deviation_percent, "%.2f" % warning_threshold)])
+ return (RESULT_STATE_OK,['OK. Percentage standard deviation value is {0}%'.format("%.2f" % deviation_percent)])
+
+def calulate_sample_std_deviation(lst):
+ """calculates standard deviation"""
+ mean = calculate_mean(lst)
+ variance = sum([(element-mean)**2 for element in lst]) / (len(lst) - 1)
+ return sqrt(variance)
+
+def calculate_mean(lst):
+ """calculates mean"""
+ return sum(lst) / len(lst)
+
+def valid_collector_webapp_address(webapp_address):
+ if len(webapp_address) == 2 \
+ and webapp_address[0] != '127.0.0.1' \
+ and webapp_address[0] != '0.0.0.0' \
+ and webapp_address[1].isdigit():
+ return True
+
+ return False
+
+def get_jmx(query, connection_timeout):
+ response = None
+
+ try:
+ response = urllib2.urlopen(query, timeout=connection_timeout)
+ json_data = response.read()
+ return json_data
+ except Exception:
+ return {"beans": {}}
+ finally:
+ if response is not None:
+ try:
+ response.close()
+ except:
+ pass
+
+def _get_ha_state_from_json(string_json):
+ """
+ Searches through the specified JSON string looking for either the HDP 2.0 or 2.1+ HA state
+ enumerations.
+ :param string_json: the string JSON
+ :return: the value of the HA state (active, standby, etc)
+ """
+ json_data = json.loads(string_json)
+ jmx_beans = json_data["beans"]
+
+ # look for HDP 2.1+ first
+ for jmx_bean in jmx_beans:
+ if "name" not in jmx_bean:
+ continue
+
+ jmx_bean_name = jmx_bean["name"]
+ if jmx_bean_name == "Hadoop:service=NameNode,name=NameNodeStatus" and "State" in jmx_bean:
+ return jmx_bean["State"]
+
+ # look for HDP 2.0 last
+ for jmx_bean in jmx_beans:
+ if "name" not in jmx_bean:
+ continue
+
+ jmx_bean_name = jmx_bean["name"]
+ if jmx_bean_name == "Hadoop:service=NameNode,name=FSNamesystem":
+ return jmx_bean["tag.HAState"]
[2/2] ambari git commit: AMBARI-14800 Alerts: HDFS alerts based on
AMS metrics (additional patch) (dsen)
Posted by ds...@apache.org.
AMBARI-14800 Alerts: HDFS alerts based on AMS metrics (additional patch) (dsen)
Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/f99db40c
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/f99db40c
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/f99db40c
Branch: refs/heads/branch-2.2
Commit: f99db40c09bae674f605505bf85394c9be3d5561
Parents: 749821e
Author: Dmytro Sen <ds...@apache.org>
Authored: Thu Feb 11 19:37:11 2016 +0200
Committer: Dmytro Sen <ds...@apache.org>
Committed: Thu Feb 11 19:44:44 2016 +0200
----------------------------------------------------------------------
.../common-services/HDFS/2.1.0.2.0/alerts.json | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/ambari/blob/f99db40c/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json
index bba6c11..2a6229c 100644
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/alerts.json
@@ -523,7 +523,7 @@
},
{
"name": "increase_nn_heap_usage_hourly",
- "label": "Hourly increase in NN heap usage",
+ "label": "NameNode Heap Usage (Hourly)",
"description": "This service-level alert is triggered if the NN heap usage deviation has grown beyond the specified threshold within a given time interval.",
"interval": 5,
"scope": "ANY",
@@ -581,7 +581,7 @@
},
{
"name": "namenode_service_rpc_latency_hourly",
- "label": "Hourly Service-RPC latency",
+ "label": "NameNode RPC Latency (Hourly)",
"description": "This service-level alert is triggered if the Service-RPC latency deviation has grown beyond the specified threshold within a given time interval.",
"interval": 5,
"scope": "ANY",
@@ -639,7 +639,7 @@
},
{
"name": "namenode_increase_in_storage_capacity_usage_hourly",
- "label": "Hourly increase in storage capacity usage",
+ "label": "HDFS Storage Capacity Usage (Hourly)",
"description": "This service-level alert is triggered if the increase in storage capacity usage deviation has grown beyond the specified threshold within a given time interval.",
"interval": 5,
"scope": "ANY",
@@ -697,7 +697,7 @@
},
{
"name": "increase_nn_heap_usage_daily",
- "label": "Daily increase in NN heap usage",
+ "label": "NameNode Heap Usage (Daily)",
"description": "This service-level alert is triggered if the NN heap usage deviation has grown beyond the specified threshold within a given time interval.",
"interval": 480,
"scope": "ANY",
@@ -755,7 +755,7 @@
},
{
"name": "namenode_service_rpc_latency_daily",
- "label": "Daily Service-RPC latency",
+ "label": "NameNode RPC Latency (Daily)",
"description": "This service-level alert is triggered if the Service-RPC latency deviation has grown beyond the specified threshold within a given time interval.",
"interval": 480,
"scope": "ANY",
@@ -813,7 +813,7 @@
},
{
"name": "namenode_increase_in_storage_capacity_usage_daily",
- "label": "Daily increase in storage capacity usage",
+ "label": "HDFS Storage Capacity Usage (Daily)",
"description": "This service-level alert is triggered if the increase in storage capacity usage deviation has grown beyond the specified threshold within a given time interval.",
"interval": 480,
"scope": "ANY",
@@ -871,7 +871,7 @@
},
{
"name": "increase_nn_heap_usage_weekly",
- "label": "Weekly increase in NN heap usage",
+ "label": "NameNode Heap Usage (Weekly)",
"description": "This service-level alert is triggered if the NN heap usage deviation has grown beyond the specified threshold within a given time interval.",
"interval": 1440,
"scope": "ANY",
@@ -929,7 +929,7 @@
},
{
"name": "namenode_increase_in_storage_capacity_usage_weekly",
- "label": "Weekly increase in storage capacity usage",
+ "label": "HDFS Storage Capacity Usage (Weekly)",
"description": "This service-level alert is triggered if the increase in storage capacity usage deviation has grown beyond the specified threshold within a given time interval.",
"interval": 1440,
"scope": "ANY",