You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by sw...@apache.org on 2016/04/28 11:42:22 UTC
ambari git commit: AMBARI-16149. Support for LLAP alert in Ambari.
Also fixes Hive Metastore alert failure.
Repository: ambari
Updated Branches:
refs/heads/trunk 9bef76ba0 -> 82bea1cbf
AMBARI-16149. Support for LLAP alert in Ambari. Also fixes Hive Metastore alert failure.
Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/82bea1cb
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/82bea1cb
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/82bea1cb
Branch: refs/heads/trunk
Commit: 82bea1cbfde826d13579b516f19e730e051adf72
Parents: 9bef76b
Author: Swapan Shridhar <ss...@hortonworks.com>
Authored: Wed Apr 27 18:17:41 2016 -0700
Committer: Swapan Shridhar <ss...@hortonworks.com>
Committed: Thu Apr 28 02:41:45 2016 -0700
----------------------------------------------------------------------
.../common-services/HIVE/0.12.0.2.0/alerts.json | 47 ++++
.../package/alerts/alert_hive_metastore.py | 22 +-
.../package/alerts/alert_llap_app_status.py | 213 +++++++++++++++++++
.../0.12.0.2.0/package/scripts/params_linux.py | 1 +
.../HIVE/configuration/hive-interactive-env.xml | 6 +
5 files changed, 280 insertions(+), 9 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/ambari/blob/82bea1cb/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json
index 9f0466c..0fad732 100644
--- a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json
+++ b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json
@@ -146,6 +146,53 @@
}
]
}
+ },
+ {
+ "name": "llap_application",
+ "label": "LLAP Application",
+ "description": "This alert is triggered if the LLAP Application cannot be determined to be up and responding to requests.",
+ "interval": 3,
+ "scope": "ANY",
+ "enabled": true,
+ "source": {
+ "type": "SCRIPT",
+ "path": "HIVE/0.12.0.2.0/package/alerts/alert_llap_app_status.py",
+ "parameters": [
+ {
+ "name": "check.command.timeout",
+ "display_name": "Command Timeout",
+ "value": 15.0,
+ "type": "NUMERIC",
+ "description": "The maximum time before check command will be killed by timeout",
+ "units": "seconds",
+ "threshold": "CRITICAL"
+ },
+ {
+ "name": "default.hive.user",
+ "display_name": "Default HIVE User",
+ "value": "hive",
+ "type": "STRING",
+ "description": "The user that will run the Hive commands if not specified in cluster-env",
+ "visibility": "HIDDEN"
+ },
+ {
+ "name": "default.hive.principal",
+ "display_name": "Default HIVE Principal",
+ "value": "hive@EXAMPLE.COM",
+ "type": "STRING",
+ "description": "The principal to use when retrieving the kerberos ticket if not specified in cluster-env",
+ "visibility": "HIDDEN"
+ },
+ {
+ "name": "default.hive.keytab",
+ "display_name": "Default HIVE Keytab",
+ "value": "/etc/security/keytabs/hive.llap.zk.sm.keytab",
+ "type": "STRING",
+ "description": "The keytab to use when retrieving the kerberos ticket if not specified in cluster-env.",
+ "visibility": "HIDDEN"
+ }
+ ]
+ }
}
],
"WEBHCAT_SERVER": [
http://git-wip-us.apache.org/repos/asf/ambari/blob/82bea1cb/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py
index a556410..e02ed5a 100644
--- a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py
+++ b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py
@@ -31,9 +31,6 @@ from resource_management.core.resources import Execute
from ambari_commons.os_check import OSConst
from ambari_commons.os_family_impl import OsFamilyFuncImpl, OsFamilyImpl
-import params
-
-stack_root = params.stack_root
OK_MESSAGE = "Metastore OK - Hive command took {0:.3f}s"
CRITICAL_MESSAGE = "Metastore on {0} failed ({1})"
@@ -58,10 +55,10 @@ SMOKEUSER_PRINCIPAL_DEFAULT = 'ambari-qa@EXAMPLE.COM'
SMOKEUSER_SCRIPT_PARAM_KEY = 'default.smoke.user'
SMOKEUSER_DEFAULT = 'ambari-qa'
-HIVE_CONF_DIR = format("{stack_root}/current/hive-metastore/conf/conf.server")
+STACK_ROOT = '{{cluster-env/stack_root}}'
+
HIVE_CONF_DIR_LEGACY = '/etc/hive/conf.server'
-HIVE_BIN_DIR = format("{stack_root}/current/hive-metastore/bin")
HIVE_BIN_DIR_LEGACY = '/usr/lib/hive/bin'
CHECK_COMMAND_TIMEOUT_KEY = 'check.command.timeout'
@@ -69,6 +66,7 @@ CHECK_COMMAND_TIMEOUT_DEFAULT = 60.0
HADOOPUSER_KEY = '{{cluster-env/hadoop.user.name}}'
HADOOPUSER_DEFAULT = 'hadoop'
+
logger = logging.getLogger('ambari_alerts')
@OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT)
@@ -78,7 +76,8 @@ def get_tokens():
to build the dictionary passed into execute
"""
return (SECURITY_ENABLED_KEY,SMOKEUSER_KEYTAB_KEY,SMOKEUSER_PRINCIPAL_KEY,
- HIVE_METASTORE_URIS_KEY, SMOKEUSER_KEY, KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY)
+ HIVE_METASTORE_URIS_KEY, SMOKEUSER_KEY, KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY,
+ STACK_ROOT)
@OsFamilyFuncImpl(os_family=OSConst.WINSRV_FAMILY)
def get_tokens():
@@ -174,9 +173,14 @@ def execute(configurations={}, parameters={}, host_name=None):
conf_dir = HIVE_CONF_DIR_LEGACY
bin_dir = HIVE_BIN_DIR_LEGACY
- if os.path.exists(HIVE_CONF_DIR):
- conf_dir = HIVE_CONF_DIR
- bin_dir = HIVE_BIN_DIR
+
+ if STACK_ROOT in configurations:
+ hive_conf_dir = configurations[STACK_ROOT] + format("/current/hive-metastore/conf/conf.server")
+ hive_bin_dir = configurations[STACK_ROOT] + format("/current/hive-metastore/bin")
+
+ if os.path.exists(hive_conf_dir):
+ conf_dir = hive_conf_dir
+ bin_dir = hive_bin_dir
cmd = format("export HIVE_CONF_DIR='{conf_dir}' ; "
"hive --hiveconf hive.metastore.uris={metastore_uri}\
http://git-wip-us.apache.org/repos/asf/ambari/blob/82bea1cb/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_llap_app_status.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_llap_app_status.py b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_llap_app_status.py
new file mode 100644
index 0000000..b18c366
--- /dev/null
+++ b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_llap_app_status.py
@@ -0,0 +1,213 @@
+#!/usr/bin/env python
+
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements. See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership. The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License. You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import time
+import logging
+import traceback
+import json
+import subprocess
+
+from resource_management.libraries.functions import format
+from resource_management.libraries.functions import get_kinit_path
+from ambari_commons.os_check import OSConst
+from ambari_commons.os_family_impl import OsFamilyFuncImpl, OsFamilyImpl
+from resource_management.core import shell
+from resource_management.core.resources import Execute
+from resource_management.core import global_lock
+
+
+# Alert label templates (str.format positional placeholders).
+# {0} = app state, {1} = elapsed seconds
+OK_MESSAGE = "APP is in : '{0}' state. Check took {1:.3f}s"
+# {0} = app state, {1} = live instances, {2} = desired instances, {3} = elapsed seconds
+MESSAGE_WITH_STATE_AND_INSTANCES = "APP is in : '{0}' state. Instances 'live' : {1}, 'desired' : {2}. Check took {3:.3f}s"
+# {0} = app state, {1} = elapsed seconds
+CRITICAL_MESSAGE_WITH_STATE = "APP is in : '{0}' state. Check took {1:.3f}s"
+# {0} = elapsed seconds
+CRITICAL_MESSAGE = "APP information couldn't be retrieved. Check took {0:.3f}s"
+
+# Result codes returned as the first element of the alert result tuple.
+CRITICAL_RESULT_CODE = 'CRITICAL'
+OK_RESULT_CODE = 'OK'
+# NOTE(review): identifier is spelled "UKNOWN"; the value itself is 'UNKNOWN'.
+UKNOWN_STATUS_CODE = 'UNKNOWN'
+
+
+# Names of the form '{{site/property}}' are configuration tokens (see get_tokens()).
+SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}'
+
+HIVE_PRINCIPAL_KEY = '{{hive-interactive-site/hive.llap.zk.sm.principal}}'
+# Despite the _DEFAULT suffix this is the *name* of a script parameter
+# ('default.hive.principal' in the alert definition), not a principal.
+HIVE_PRINCIPAL_DEFAULT = 'default.hive.principal'
+
+HIVE_PRINCIPAL_KEYTAB_KEY = '{{hive-interactive-site/hive.llap.zk.sm.keytab.file}}'
+# Likewise the *name* of the 'default.hive.keytab' script parameter, not a path.
+HIVE_PRINCIPAL_KEYTAB_DEFAULT = 'default.hive.keytab'
+
+HIVE_AUTHENTICATION_DEFAULT = 'NOSASL'
+
+HIVE_USER_KEY = '{{hive-env/hive_user}}'
+# NOTE(review): this is a script-parameter name, not a user name, and the
+# llap_application alert definition declares 'default.hive.user' rather than
+# 'default.smoke.user' -- confirm which parameter is intended.
+HIVE_USER_DEFAULT = 'default.smoke.user'
+
+STACK_ROOT = '{{cluster-env/stack_root}}'
+STACK_ROOT_DEFAULT = "/usr/hdp"
+
+LLAP_APP_NAME_KEY = '{{hive-interactive-env/llap_app_name}}'
+LLAP_APP_NAME_DEFAULT = 'llap0'
+
+# The configured Kerberos executable search paths, if any
+KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY = '{{kerberos-env/executable_search_paths}}'
+
+
+# Script parameter holding the status-command timeout, and its fallback in seconds.
+CHECK_COMMAND_TIMEOUT_KEY = 'check.command.timeout'
+CHECK_COMMAND_TIMEOUT_DEFAULT = 15.0
+
+
+
+logger = logging.getLogger('ambari_alerts')
+
+@OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT)
+def get_tokens():
+ """
+ Returns a tuple of tokens in the format {{site/property}} that will be used
+ to build the dictionary passed into execute
+ """
+ return (SECURITY_ENABLED_KEY, KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY, HIVE_PRINCIPAL_KEY, HIVE_PRINCIPAL_KEYTAB_KEY,
+ HIVE_USER_KEY, STACK_ROOT, LLAP_APP_NAME_KEY)
+
+
+@OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT)
+def execute(configurations={}, parameters={}, host_name=None):
+ """
+ Returns a tuple containing the result code and a pre-formatted result label
+
+ Keyword arguments:
+ configurations (dictionary): a mapping of configuration key to value
+ parameters (dictionary): a mapping of script parameter key to value
+ host_name (string): the name of this host where the alert is running
+ """
+
+ if configurations is None:
+ return ('UNKNOWN', ['There were no configurations supplied to the script.'])
+
+ result_code = None
+
+ try:
+ security_enabled = False
+ if SECURITY_ENABLED_KEY in configurations:
+ security_enabled = str(configurations[SECURITY_ENABLED_KEY]).upper() == 'TRUE'
+
+ check_command_timeout = CHECK_COMMAND_TIMEOUT_DEFAULT
+ if CHECK_COMMAND_TIMEOUT_KEY in configurations:
+ check_command_timeout = int(parameters[CHECK_COMMAND_TIMEOUT_KEY])
+
+ hive_user = HIVE_USER_DEFAULT
+ if HIVE_USER_KEY in configurations:
+ hive_user = configurations[HIVE_USER_KEY]
+
+ llap_app_name = LLAP_APP_NAME_DEFAULT
+ if LLAP_APP_NAME_KEY in configurations:
+ llap_app_name = configurations[LLAP_APP_NAME_KEY]
+
+ if security_enabled:
+ llap_principal = HIVE_PRINCIPAL_DEFAULT
+ if HIVE_PRINCIPAL_KEY in configurations:
+ llap_principal = configurations[HIVE_PRINCIPAL_KEY]
+
+ llap_keytab = HIVE_PRINCIPAL_KEYTAB_DEFAULT
+ if HIVE_PRINCIPAL_KEYTAB_KEY in configurations:
+ llap_keytab = configurations[HIVE_PRINCIPAL_KEYTAB_KEY]
+
+ # Get the configured Kerberos executable search paths, if any
+ if KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY in configurations:
+ kerberos_executable_search_paths = configurations[KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY]
+ else:
+ kerberos_executable_search_paths = None
+
+ kinit_path_local = get_kinit_path(kerberos_executable_search_paths)
+ kinitcmd=format("{kinit_path_local} -kt {llap_keytab} {llap_principal}; ")
+
+ # prevent concurrent kinit
+ kinit_lock = global_lock.get_lock(global_lock.LOCK_TYPE_KERBEROS)
+ kinit_lock.acquire()
+ try:
+ Execute(kinitcmd, user=hive_user,#status_params.hive_user,
+ path=["/bin/", "/usr/bin/", "/usr/lib/hive/bin/", "/usr/sbin/"],
+ timeout=10)
+ finally:
+ kinit_lock.release()
+
+
+
+ start_time = time.time()
+ if STACK_ROOT in configurations:
+ llap_status_cmd = configurations[STACK_ROOT] + format("/current/hive-server2-hive2/bin/hive --service llapstatus --name {llap_app_name}")
+ else:
+ llap_status_cmd = format("/usr/hdp/current/hive-server2-hive2/bin/hive --service llapstatus --name {llap_app_name}")
+
+ code, output, error = shell.checked_call(llap_status_cmd, user=hive_user, stderr=subprocess.PIPE,
+ timeout=check_command_timeout,
+ logoutput=False)
+ llap_app_info = json.loads(output)
+
+ if llap_app_info is None or 'state' not in llap_app_info:
+ alert_label = traceback.format_exc()
+ result_code = UKNOWN_STATUS_CODE
+ return (result_code, [alert_label])
+
+ if llap_app_info['state'].upper() in ['RUNNING_ALL']:
+ result_code = OK_RESULT_CODE
+ total_time = time.time() - start_time
+ alert_label = OK_MESSAGE.format(llap_app_info['state'], total_time)
+ elif llap_app_info['state'].upper() in ['RUNNING_PARTIAL']:
+ live_instances = 0
+ desired_instances = 0
+ percentInstancesUp = 0
+ percent_desired_instances_to_be_up = 80
+ # Get 'live' and 'desired' instances
+ if 'liveInstances' not in llap_app_info or 'desiredInstances' not in llap_app_info:
+ result_code = CRITICAL_RESULT_CODE
+ total_time = time.time() - start_time
+ alert_label = CRITICAL_MESSAGE_WITH_STATE.format(llap_app_info['state'], total_time)
+ return (result_code, [alert_label])
+
+ live_instances = llap_app_info['liveInstances']
+ desired_instances = llap_app_info['desiredInstances']
+ if live_instances < 0 or desired_instances <= 0:
+ result_code = CRITICAL_RESULT_CODE
+ total_time = time.time() - start_time
+ alert_label = MESSAGE_WITH_STATE_AND_INSTANCES.format(llap_app_info['state'], total_time)
+ return (result_code, [alert_label])
+
+ percentInstancesUp = float(live_instances) / desired_instances * 100
+ if percentInstancesUp >= percent_desired_instances_to_be_up:
+ result_code = OK_RESULT_CODE
+ total_time = time.time() - start_time
+ alert_label = MESSAGE_WITH_STATE_AND_INSTANCES.format(llap_app_info['state'],
+ llap_app_info['liveInstances'],
+ llap_app_info['desiredInstances'],
+ total_time)
+ else:
+ result_code = CRITICAL_RESULT_CODE
+ total_time = time.time() - start_time
+ alert_label = MESSAGE_WITH_STATE_AND_INSTANCES.format(llap_app_info['state'],
+ llap_app_info['liveInstances'],
+ llap_app_info['desiredInstances'],
+ total_time)
+ else:
+ result_code = CRITICAL_RESULT_CODE
+ total_time = time.time() - start_time
+ alert_label = CRITICAL_MESSAGE_WITH_STATE.format(llap_app_info['state'], total_time)
+ except:
+ alert_label = traceback.format_exc()
+ traceback.format_exc()
+ result_code = UKNOWN_STATUS_CODE
+ return (result_code, [alert_label])
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/ambari/blob/82bea1cb/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/scripts/params_linux.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/scripts/params_linux.py b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/scripts/params_linux.py
index 22e1b55..a4f5378 100644
--- a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/scripts/params_linux.py
+++ b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/scripts/params_linux.py
@@ -559,6 +559,7 @@ if has_hive_interactive:
llap_log_level = config['configurations']['hive-interactive-env']['llap_log_level']
hive_llap_io_mem_size = config['configurations']['hive-interactive-site']['hive.llap.io.memory.size']
llap_heap_size = config['configurations']['hive-interactive-env']['llap_heap_size']
+ llap_app_name = config['configurations']['hive-interactive-env']['llap_app_name']
if security_enabled:
hive_llap_keytab_file = config['configurations']['hive-interactive-site']['hive.llap.zk.sm.keytab.file']
hive_headless_keytab = config['configurations']['hive-interactive-site']['hive.llap.zk.sm.principal']
http://git-wip-us.apache.org/repos/asf/ambari/blob/82bea1cb/ambari-server/src/main/resources/stacks/HDP/2.5/services/HIVE/configuration/hive-interactive-env.xml
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.5/services/HIVE/configuration/hive-interactive-env.xml b/ambari-server/src/main/resources/stacks/HDP/2.5/services/HIVE/configuration/hive-interactive-env.xml
index aad9c47..a4d39e1 100644
--- a/ambari-server/src/main/resources/stacks/HDP/2.5/services/HIVE/configuration/hive-interactive-env.xml
+++ b/ambari-server/src/main/resources/stacks/HDP/2.5/services/HIVE/configuration/hive-interactive-env.xml
@@ -184,6 +184,12 @@
<description>LLAP app logging level</description>
<display-name>LLAP app logging level</display-name>
</property>
+ <property>
+ <name>llap_app_name</name>
+ <value>llap0</value>
+ <description>LLAP app name</description>
+ <display-name>LLAP app name</display-name>
+ </property>
<!-- hive-env.sh -->
<property>