You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by ad...@apache.org on 2018/04/10 08:29:15 UTC
[ambari] branch trunk updated: AMBARI-23445. Refactor metrics_alert
to support multiple nameservices (#904)
This is an automated email from the ASF dual-hosted git repository.
adoroszlai pushed a commit to branch trunk
in repository https://gitbox.apache.org/repos/asf/ambari.git
The following commit(s) were added to refs/heads/trunk by this push:
new 09c959d AMBARI-23445. Refactor metrics_alert to support multiple nameservices (#904)
09c959d is described below
commit 09c959dba7622d0428931e87262f35a580fa3c14
Author: majorendre <34...@users.noreply.github.com>
AuthorDate: Tue Apr 10 10:29:12 2018 +0200
AMBARI-23445. Refactor metrics_alert to support multiple nameservices (#904)
---
.../libraries/functions/namenode_ha_utils.py | 30 +++++-
.../package/alerts/alert_metrics_deviation.py | 20 +++-
.../2.0.6/HDFS/test_alert_metrics_deviation.py | 112 +++++++++++++++++++--
3 files changed, 150 insertions(+), 12 deletions(-)
diff --git a/ambari-common/src/main/python/resource_management/libraries/functions/namenode_ha_utils.py b/ambari-common/src/main/python/resource_management/libraries/functions/namenode_ha_utils.py
index 4d51e69..0d2cd3f 100644
--- a/ambari-common/src/main/python/resource_management/libraries/functions/namenode_ha_utils.py
+++ b/ambari-common/src/main/python/resource_management/libraries/functions/namenode_ha_utils.py
@@ -27,7 +27,8 @@ from resource_management.libraries.functions.hdfs_utils import is_https_enabled_
__all__ = ["get_namenode_states", "get_active_namenode",
- "get_property_for_active_namenodes", "get_property_for_active_namenode", "get_nameservices"]
+ "get_property_for_active_namenodes", "get_property_for_active_namenode", "get_nameservices",
+ "get_name_service_by_hostname"]
HDFS_NN_STATE_ACTIVE = 'active'
HDFS_NN_STATE_STANDBY = 'standby'
@@ -355,3 +356,30 @@ def get_initial_active_namenodes(hadoop_env):
return frozenset(setting.split(','))
return frozenset()
+
+
+def get_name_service_by_hostname(hdfs_site, host_name):
+ """
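+ Finds the name service to which the given name node belongs in an HA or federated setup.
+ :param hdfs_site: the hdfs-site config dictionary; must contain 'dfs.internal.nameservices'
+ :param host_name: the name node host, matched against the host part of each dfs.namenode.rpc-address.<ns>.<nn>; can be None if there is only 1 name service
+ :return: the name service; implicitly None if no name node of any name service matches host_name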
+ """
+ # A name service must be configured here: this function assumes at least an HA setup
+ name_services_string = hdfs_site['dfs.internal.nameservices']
+ if not name_services_string:
+ raise Fail('Not a HA setup')
+ name_services = name_services_string.split(',')
+ if len(name_services) == 1:
+ return name_services[0]
+
+ if not host_name:
+ raise ValueError('Host name required when using namenode federation')
+
+ for ns in name_services:
+ ha_name_nodes = hdfs_site['dfs.ha.namenodes.{0}'.format(ns)].split(',')
+ for nn in ha_name_nodes:
+ nn_rpc_port = hdfs_site['dfs.namenode.rpc-address.{0}.{1}'.format(ns,nn)]
+ nn_rpc = nn_rpc_port.split(':')[0]
+ if nn_rpc == host_name:
+ return ns
diff --git a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py
index 3f8eb2e..fe2e9fc 100644
--- a/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py
+++ b/ambari-server/src/main/resources/common-services/HDFS/2.1.0.2.0/package/alerts/alert_metrics_deviation.py
@@ -33,6 +33,7 @@ from ambari_commons.aggregate_functions import sample_standard_deviation, mean
from resource_management.libraries.functions.curl_krb_request import curl_krb_request
from resource_management.libraries.functions.curl_krb_request import DEFAULT_KERBEROS_KINIT_TIMER_MS
from resource_management.libraries.functions.curl_krb_request import KERBEROS_KINIT_TIMER_PARAMETER
+from resource_management.libraries.functions.namenode_ha_utils import get_name_service_by_hostname
from ambari_commons.ambari_metrics_helper import select_metric_collector_for_sink
from ambari_agent.AmbariConfig import AmbariConfig
@@ -233,7 +234,7 @@ def execute(configurations={}, parameters={}, host_name=None):
kinit_timer_ms = parameters.get(KERBEROS_KINIT_TIMER_PARAMETER, DEFAULT_KERBEROS_KINIT_TIMER_MS)
- name_service = configurations[NAMESERVICE_KEY]
+ name_service = get_name_service_by_hostname(hdfs_site, host_name)
# look for dfs.ha.namenodes.foo
nn_unique_ids_key = 'dfs.ha.namenodes.' + name_service
@@ -276,8 +277,7 @@ def execute(configurations={}, parameters={}, host_name=None):
state = _get_ha_state_from_json(state_response)
else:
- state_response = get_jmx(jmx_uri, connection_timeout)
- state = _get_ha_state_from_json(state_response)
+ state = _get_state_from_jmx(jmx_uri, connection_timeout)
if state == HDFS_NN_STATE_ACTIVE:
active_namenodes.append(namenode)
@@ -319,13 +319,14 @@ def execute(configurations={}, parameters={}, host_name=None):
metric_truststore_ca_certs)
metric_collector_https_enabled = str(configurations[AMS_HTTP_POLICY]) == "HTTPS_ONLY"
+ _ssl_version = _get_ssl_version()
try:
conn = network.get_http_connection(
collector_host,
int(collector_port),
metric_collector_https_enabled,
ca_certs,
- ssl_version=AmbariConfig.get_resolved_config().get_force_https_protocol_value()
+ ssl_version=_ssl_version
)
conn.request("GET", AMS_METRICS_GET_URL % encoded_get_metrics_parameters)
response = conn.getresponse()
@@ -427,6 +428,16 @@ def valid_collector_webapp_address(webapp_address):
return False
+def _get_state_from_jmx(jmx_uri, connection_timeout):
+ state_response = get_jmx(jmx_uri, connection_timeout)
+ state = _get_ha_state_from_json(state_response)
+ return state
+
+
+def _get_ssl_version():
+ return AmbariConfig.get_resolved_config().get_force_https_protocol_value()
+
+
def get_jmx(query, connection_timeout):
response = None
@@ -484,3 +495,4 @@ def _coerce_to_integer(value):
return int(value)
except ValueError:
return int(float(value))
+
diff --git a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_alert_metrics_deviation.py b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_alert_metrics_deviation.py
index 09e8886..22f6d2b 100644
--- a/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_alert_metrics_deviation.py
+++ b/ambari-server/src/test/python/stacks/2.0.6/HDFS/test_alert_metrics_deviation.py
@@ -39,6 +39,14 @@ RESULT_STATE_CRITICAL = "CRITICAL"
RESULT_STATE_UNKNOWN = "UNKNOWN"
RESULT_STATE_SKIPPED = "SKIPPED"
+def dummy_get_state_from_jmx(jmx_uri, connection_timeout):
+ if 'c6401' in jmx_uri or 'c6403' in jmx_uri:
+ return 'active'
+ return 'standby'
+
+def dummy_get_ssl_version():
+ return 3
+
class TestAlertMetricsDeviation(RMFTestCase):
def setUp(self):
@@ -127,7 +135,94 @@ class TestAlertMetricsDeviation(RMFTestCase):
}
self.make_alert_tests(configs, conn_mock)
- def make_alert_tests(self, configs, conn_mock):
+ @patch.object(ambari_metrics_helper, 'get_metric_collectors_from_properties_file', new = MagicMock(return_value='c6401.ambari.apache.org:6188'))
+ @patch("httplib.HTTPConnection")
+ def test_alert_ha(self, conn_mock):
+ configs = {
+ '{{hdfs-site/dfs.namenode.https-address}}': 'c6401.ambari.apache.org:50470',
+ '{{hdfs-site/dfs.http.policy}}': 'HTTP_ONLY',
+ '{{ams-site/timeline.metrics.service.webapp.address}}': '0.0.0.0:6188',
+ '{{ams-site/timeline.metrics.service.http.policy}}' : 'HTTP_ONLY',
+ '{{hdfs-site/dfs.namenode.http-address}}': 'c6401.ambari.apache.org:50070',
+ '{{cluster-env/security_enabled}}': 'false',
+ '{{cluster-env/smokeuser}}': 'ambari-qa',
+ '{{hdfs-site/dfs.internal.nameservices}}': 'ns',
+ '{{hdfs-site}}': {
+ 'dfs.internal.nameservices': 'ns',
+ 'dfs.nameservices': 'ns',
+ 'dfs.ha.namenodes.ns': 'nn1,nn2',
+ 'dfs.datanode.address.ns.nn1': '0.0.0.0:50010',
+ 'dfs.namenode.rpc-address.ns.nn1': 'c6401.ambari.apache.org:8020',
+ 'dfs.namenode.https-address.ns.nn1': 'c6401.ambari.apache.org:50470',
+ 'dfs.namenode.http-address.ns.nn1': 'c6401.ambari.apache.org:50070',
+ 'dfs.datanode.https.address.ns.nn1': '0.0.0.0:50475',
+ 'dfs.namenode.secondary.http-address.ns.nn1': 'c6401.ambari.apache.org:50090',
+ 'dfs.datanode.http.address.ns.nn1': '0.0.0.0:50075',
+ 'dfs.namenode.rpc-address.ns.nn2': 'c6402.ambari.apache.org:8020',
+ 'dfs.namenode.https-address.ns.nn2': 'c6402.ambari.apache.org:50470',
+ 'dfs.namenode.http-address.ns.nn2': 'c6402.ambari.apache.org:50070',
+ 'dfs.datanode.https.address.ns.nn2': 'c6402.ambari.apache.org:50475',
+ 'dfs.namenode.secondary.http-address.ns.nn2': 'c6402.ambari.apache.org:50090',
+ 'dfs.datanode.http.address.ns.nn2': 'c6402.ambari.apache.org:50075',
+ 'dfs.http.policy': 'HTTP_ONLY',
+ 'dfs.journalnode.https-address': '0.0.0.0:8481',
+ 'dfs.journalnode.http-address': '0.0.0.0:8480',
+ }
+ }
+ self.make_alert_tests(configs, conn_mock)
+
+ @patch.object(ambari_metrics_helper, 'get_metric_collectors_from_properties_file', new = MagicMock(return_value='c6401.ambari.apache.org:6188'))
+ @patch("httplib.HTTPConnection")
+ def test_alert_federation(self, conn_mock):
+ configs = {
+ '{{hdfs-site/dfs.namenode.https-address}}': 'c6401.ambari.apache.org:50470',
+ '{{hdfs-site/dfs.http.policy}}': 'HTTP_ONLY',
+ '{{ams-site/timeline.metrics.service.webapp.address}}': '0.0.0.0:6188',
+ '{{ams-site/timeline.metrics.service.http.policy}}' : 'HTTP_ONLY',
+ '{{hdfs-site/dfs.namenode.http-address}}': 'c6401.ambari.apache.org:50070',
+ '{{cluster-env/security_enabled}}': 'false',
+ '{{cluster-env/smokeuser}}': 'ambari-qa',
+ '{{hdfs-site/dfs.internal.nameservices}}': 'ns1,ns2',
+ '{{hdfs-site}}': {
+ 'dfs.internal.nameservices': 'ns1,ns2',
+ 'dfs.nameservices': 'ns1,ns2',
+ 'dfs.ha.namenodes.ns1': 'nn1,nn2',
+ 'dfs.datanode.address.ns1.nn1': 'c6401.ambari.apache.org:50010',
+ 'dfs.namenode.rpc-address.ns1.nn1': 'c6401.ambari.apache.org:8020',
+ 'dfs.namenode.https-address.ns1.nn1': 'c6401.ambari.apache.org:50470',
+ 'dfs.namenode.http-address.ns1.nn1': 'c6401.ambari.apache.org:50070',
+ 'dfs.datanode.https.address.ns1.nn1': 'c6401.ambari.apache.org:50475',
+ 'dfs.namenode.secondary.http-address.ns1.nn1': 'c6401.ambari.apache.org:50090',
+ 'dfs.datanode.http.address.ns1.nn1': '0.0.0.0:50075',
+ 'dfs.namenode.rpc-address.ns1.nn2': 'c6402.ambari.apache.org:8020',
+ 'dfs.namenode.https-address.ns1.nn2': 'c6402.ambari.apache.org:50470',
+ 'dfs.namenode.http-address.ns1.nn2': 'c6402.ambari.apache.org:50070',
+ 'dfs.datanode.https.address.ns1.nn2': 'c6402.ambari.apache.org:50475',
+ 'dfs.namenode.secondary.http-address.ns1.nn2': 'c6402.ambari.apache.org:50090',
+ 'dfs.datanode.http.address.ns1.nn2': 'c6402.ambari.apache.org:50075',
+
+ 'dfs.ha.namenodes.ns2': 'nn3,nn4',
+ 'dfs.namenode.rpc-address.ns2.nn3': 'c6403.ambari.apache.org:8020',
+ 'dfs.namenode.https-address.ns2.nn3': 'c6403.ambari.apache.org:50470',
+ 'dfs.namenode.http-address.ns2.nn3': 'c6403.ambari.apache.org:50070',
+ 'dfs.datanode.https.address.ns2.nn3': 'c6403.ambari.apache.org:50475',
+ 'dfs.namenode.secondary.http-address.ns2.nn3': 'c6403.ambari.apache.org:50090',
+ 'dfs.datanode.http.address.ns2.nn3': '0.0.0.0:50075',
+ 'dfs.namenode.rpc-address.ns2.nn4': 'c6404.ambari.apache.org:8020',
+ 'dfs.namenode.https-address.ns2.nn4': 'c6404.ambari.apache.org:50470',
+ 'dfs.namenode.http-address.ns2.nn4': 'c6404.ambari.apache.org:50070',
+ 'dfs.datanode.https.address.ns2.nn4': 'c6404.ambari.apache.org:50475',
+ 'dfs.namenode.secondary.http-address.ns2.nn4': 'c6404.ambari.apache.org:50090',
+ 'dfs.datanode.http.address.ns2.nn4': 'c6404.ambari.apache.org:50075',
+ 'dfs.http.policy': 'HTTP_ONLY',
+ 'dfs.journalnode.https-address': '0.0.0.0:8481',
+ 'dfs.journalnode.http-address': '0.0.0.0:8480',
+ }
+ }
+ self.make_alert_tests(configs, conn_mock, 'c6401.ambari.apache.org')
+
+
+ def make_alert_tests(self, configs, conn_mock, _host_name = None):
connection = MagicMock()
response = MagicMock()
response.status = 200
@@ -135,29 +230,32 @@ class TestAlertMetricsDeviation(RMFTestCase):
conn_mock.return_value = connection
response.read.return_value = '{"metrics":[{"metricname":"metric1","metrics":{"1459966360838":1,"1459966370838":3}}]}'
+ alert._get_state_from_jmx = dummy_get_state_from_jmx
+ alert._get_ssl_version = dummy_get_ssl_version
+
# OK, but no datapoints above the minimum threshold
- [status, messages] = alert.execute(configurations=configs, parameters=parameters)
+ [status, messages] = alert.execute(configurations=configs, parameters=parameters, host_name=_host_name)
self.assertEqual(status, RESULT_STATE_OK)
self.assertTrue(messages is not None and len(messages) == 1)
self.assertEquals('There were no data points above the minimum threshold of 30 seconds',messages[0])
# Unable to calculate the standard deviation for 1 data point
response.read.return_value = '{"metrics":[{"metricname":"metric1","metrics":{"1459966360838":40000}}]}'
- [status, messages] = alert.execute(configurations=configs, parameters=parameters)
+ [status, messages] = alert.execute(configurations=configs, parameters=parameters, host_name=_host_name)
self.assertEqual(status, RESULT_STATE_SKIPPED)
self.assertTrue(messages is not None and len(messages) == 1)
self.assertEquals('There are not enough data points to calculate the standard deviation (1 sampled)', messages[0])
# OK
response.read.return_value = '{"metrics":[{"metricname":"metric1","metrics":{"1459966360838":40000,"1459966370838":50000}}]}'
- [status, messages] = alert.execute(configurations=configs, parameters=parameters)
+ [status, messages] = alert.execute(configurations=configs, parameters=parameters, host_name=_host_name)
self.assertEqual(status, RESULT_STATE_OK)
self.assertTrue(messages is not None and len(messages) == 1)
self.assertEquals('The variance for this alert is 7071ms which is within 100% of the 45000ms average (45000ms is the limit)', messages[0])
# Warning
response.read.return_value = '{"metrics":[{"metricname":"metric1","metrics":{"1459966360838":40000,"1459966370838":1000000}}]}'
- [status, messages] = alert.execute(configurations=configs, parameters=parameters)
+ [status, messages] = alert.execute(configurations=configs, parameters=parameters, host_name=_host_name)
self.assertEqual(status, RESULT_STATE_WARNING)
self.assertTrue(messages is not None and len(messages) == 1)
self.assertEquals('The variance for this alert is 678823ms which is 131% of the 520000ms average (520000ms is the limit)', messages[0])
@@ -165,14 +263,14 @@ class TestAlertMetricsDeviation(RMFTestCase):
# HTTP request to AMS failed
response.read.return_value = ''
response.status = 501
- [status, messages] = alert.execute(configurations=configs, parameters=parameters)
+ [status, messages] = alert.execute(configurations=configs, parameters=parameters, host_name=_host_name)
self.assertEqual(status, RESULT_STATE_UNKNOWN)
self.assertTrue(messages is not None and len(messages) == 1)
self.assertEquals('Unable to retrieve metrics from the Ambari Metrics service.', messages[0])
# Unable to connect to AMS
conn_mock.side_effect = Exception('Unable to connect to AMS')
- [status, messages] = alert.execute(configurations=configs, parameters=parameters)
+ [status, messages] = alert.execute(configurations=configs, parameters=parameters, host_name=_host_name)
self.assertEqual(status, RESULT_STATE_UNKNOWN)
self.assertTrue(messages is not None and len(messages) == 1)
self.assertEquals('Unable to retrieve metrics from the Ambari Metrics service.', messages[0])
--
To stop receiving notification emails like this one, please contact
adoroszlai@apache.org.