You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@ambari.apache.org by sw...@apache.org on 2016/04/28 11:42:22 UTC

ambari git commit: AMBARI-16149. Support for LLAP alert in Ambari. Also fixes Hive Metastore alert failure.

Repository: ambari
Updated Branches:
  refs/heads/trunk 9bef76ba0 -> 82bea1cbf


AMBARI-16149. Support for LLAP alert in Ambari. Also fixes Hive Metastore alert failure.


Project: http://git-wip-us.apache.org/repos/asf/ambari/repo
Commit: http://git-wip-us.apache.org/repos/asf/ambari/commit/82bea1cb
Tree: http://git-wip-us.apache.org/repos/asf/ambari/tree/82bea1cb
Diff: http://git-wip-us.apache.org/repos/asf/ambari/diff/82bea1cb

Branch: refs/heads/trunk
Commit: 82bea1cbfde826d13579b516f19e730e051adf72
Parents: 9bef76b
Author: Swapan Shridhar <ss...@hortonworks.com>
Authored: Wed Apr 27 18:17:41 2016 -0700
Committer: Swapan Shridhar <ss...@hortonworks.com>
Committed: Thu Apr 28 02:41:45 2016 -0700

----------------------------------------------------------------------
 .../common-services/HIVE/0.12.0.2.0/alerts.json |  47 ++++
 .../package/alerts/alert_hive_metastore.py      |  22 +-
 .../package/alerts/alert_llap_app_status.py     | 213 +++++++++++++++++++
 .../0.12.0.2.0/package/scripts/params_linux.py  |   1 +
 .../HIVE/configuration/hive-interactive-env.xml |   6 +
 5 files changed, 280 insertions(+), 9 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/ambari/blob/82bea1cb/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json
index 9f0466c..0fad732 100644
--- a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json
+++ b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/alerts.json
@@ -146,6 +146,53 @@
             }
           ]
         }
+      },
+      {
+        "name": "llap_application",
+        "label": "LLAP Application",
+        "description": "This alert is triggered if the LLAP Application cannot be determined to be up and responding to requests.",
+        "interval": 3,
+        "scope": "ANY",
+        "enabled": true,
+        "source": {
+          "type": "SCRIPT",
+          "path": "HIVE/0.12.0.2.0/package/alerts/alert_llap_app_status.py",
+          "parameters": [
+            {
+              "name": "check.command.timeout",
+              "display_name": "Command Timeout",
+              "value": 15.0,
+              "type": "NUMERIC",
+              "description": "The maximum time before check command will be killed by timeout",
+              "units": "seconds",
+              "threshold": "CRITICAL"
+            },
+            {
+              "name": "default.hive.user",
+              "display_name": "Default HIVE User",
+              "value": "hive",
+              "type": "STRING",
+              "description": "The user that will run the Hive commands if not specified in cluster-env",
+              "visibility": "HIDDEN"
+            },
+            {
+              "name": "default.hive.principal",
+              "display_name": "Default HIVE Principal",
+              "value": "hive@EXAMPLE.COM",
+              "type": "STRING",
+              "description": "The principal to use when retrieving the kerberos ticket if not specified in cluster-env",
+              "visibility": "HIDDEN"
+            },
+            {
+              "name": "default.hive.keytab",
+              "display_name": "Default HIVE Keytab",
+              "value": "/etc/security/keytabs/hive.llap.zk.sm.keytab",
+              "type": "STRING",
+              "description": "The keytab to use when retrieving the kerberos ticket if not specified in cluster-env.",
+              "visibility": "HIDDEN"
+            }
+          ]
+        }
       }
     ],
     "WEBHCAT_SERVER": [

http://git-wip-us.apache.org/repos/asf/ambari/blob/82bea1cb/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py
index a556410..e02ed5a 100644
--- a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py
+++ b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_hive_metastore.py
@@ -31,9 +31,6 @@ from resource_management.core.resources import Execute
 from ambari_commons.os_check import OSConst
 from ambari_commons.os_family_impl import OsFamilyFuncImpl, OsFamilyImpl
 
-import params
-
-stack_root = params.stack_root
 
 OK_MESSAGE = "Metastore OK - Hive command took {0:.3f}s"
 CRITICAL_MESSAGE = "Metastore on {0} failed ({1})"
@@ -58,10 +55,10 @@ SMOKEUSER_PRINCIPAL_DEFAULT = 'ambari-qa@EXAMPLE.COM'
 SMOKEUSER_SCRIPT_PARAM_KEY = 'default.smoke.user'
 SMOKEUSER_DEFAULT = 'ambari-qa'
 
-HIVE_CONF_DIR = format("{stack_root}/current/hive-metastore/conf/conf.server")
+STACK_ROOT = '{{cluster-env/stack_root}}'
+
 HIVE_CONF_DIR_LEGACY = '/etc/hive/conf.server'
 
-HIVE_BIN_DIR = format("{stack_root}/current/hive-metastore/bin")
 HIVE_BIN_DIR_LEGACY = '/usr/lib/hive/bin'
 
 CHECK_COMMAND_TIMEOUT_KEY = 'check.command.timeout'
@@ -69,6 +66,7 @@ CHECK_COMMAND_TIMEOUT_DEFAULT = 60.0
 
 HADOOPUSER_KEY = '{{cluster-env/hadoop.user.name}}'
 HADOOPUSER_DEFAULT = 'hadoop'
+
 logger = logging.getLogger('ambari_alerts')
 
 @OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT)
@@ -78,7 +76,8 @@ def get_tokens():
   to build the dictionary passed into execute
   """
   return (SECURITY_ENABLED_KEY,SMOKEUSER_KEYTAB_KEY,SMOKEUSER_PRINCIPAL_KEY,
-    HIVE_METASTORE_URIS_KEY, SMOKEUSER_KEY, KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY)
+    HIVE_METASTORE_URIS_KEY, SMOKEUSER_KEY, KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY,
+    STACK_ROOT)
 
 @OsFamilyFuncImpl(os_family=OSConst.WINSRV_FAMILY)
 def get_tokens():
@@ -174,9 +173,14 @@ def execute(configurations={}, parameters={}, host_name=None):
     conf_dir = HIVE_CONF_DIR_LEGACY
     bin_dir = HIVE_BIN_DIR_LEGACY
 
-    if os.path.exists(HIVE_CONF_DIR):
-      conf_dir = HIVE_CONF_DIR
-      bin_dir = HIVE_BIN_DIR
+
+    if STACK_ROOT in configurations:
+      hive_conf_dir = configurations[STACK_ROOT] + format("/current/hive-metastore/conf/conf.server")
+      hive_bin_dir = configurations[STACK_ROOT] + format("/current/hive-metastore/bin")
+
+      if os.path.exists(hive_conf_dir):
+        conf_dir = hive_conf_dir
+        bin_dir = hive_bin_dir
 
     cmd = format("export HIVE_CONF_DIR='{conf_dir}' ; "
                  "hive --hiveconf hive.metastore.uris={metastore_uri}\

http://git-wip-us.apache.org/repos/asf/ambari/blob/82bea1cb/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_llap_app_status.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_llap_app_status.py b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_llap_app_status.py
new file mode 100644
index 0000000..b18c366
--- /dev/null
+++ b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/alerts/alert_llap_app_status.py
@@ -0,0 +1,213 @@
+#!/usr/bin/env python
+
+"""
+Licensed to the Apache Software Foundation (ASF) under one
+or more contributor license agreements.  See the NOTICE file
+distributed with this work for additional information
+regarding copyright ownership.  The ASF licenses this file
+to you under the Apache License, Version 2.0 (the
+"License"); you may not use this file except in compliance
+with the License.  You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import time
+import logging
+import traceback
+import json
+import subprocess
+
+from resource_management.libraries.functions import format
+from resource_management.libraries.functions import get_kinit_path
+from ambari_commons.os_check import OSConst
+from ambari_commons.os_family_impl import OsFamilyFuncImpl, OsFamilyImpl
+from resource_management.core import shell
+from resource_management.core.resources import Execute
+from resource_management.core import global_lock
+
+
+# alert label templates; the last placeholder is always the elapsed check time
+OK_MESSAGE = "APP is in : '{0}' state. Check took {1:.3f}s"
+MESSAGE_WITH_STATE_AND_INSTANCES = "APP is in : '{0}' state. Instances 'live' : {1}, 'desired' : {2}. Check took {3:.3f}s"
+CRITICAL_MESSAGE_WITH_STATE = "APP is in : '{0}' state. Check took {1:.3f}s"
+CRITICAL_MESSAGE = "APP information couldn't be retrieved. Check took {0:.3f}s"
+
+# result codes
+CRITICAL_RESULT_CODE = 'CRITICAL'
+OK_RESULT_CODE = 'OK'
+UKNOWN_STATUS_CODE = 'UNKNOWN'
+
+SECURITY_ENABLED_KEY = '{{cluster-env/security_enabled}}'
+
+# the *_DEFAULT fallbacks mirror the default.hive.* parameter values declared in alerts.json
+HIVE_PRINCIPAL_KEY = '{{hive-interactive-site/hive.llap.zk.sm.principal}}'
+HIVE_PRINCIPAL_DEFAULT = 'hive@EXAMPLE.COM'
+
+HIVE_PRINCIPAL_KEYTAB_KEY = '{{hive-interactive-site/hive.llap.zk.sm.keytab.file}}'
+HIVE_PRINCIPAL_KEYTAB_DEFAULT = '/etc/security/keytabs/hive.llap.zk.sm.keytab'
+
+HIVE_AUTHENTICATION_DEFAULT = 'NOSASL'
+
+HIVE_USER_KEY = '{{hive-env/hive_user}}'
+HIVE_USER_DEFAULT = 'hive'
+
+STACK_ROOT = '{{cluster-env/stack_root}}'
+STACK_ROOT_DEFAULT = "/usr/hdp"
+
+LLAP_APP_NAME_KEY = '{{hive-interactive-env/llap_app_name}}'
+LLAP_APP_NAME_DEFAULT = 'llap0'
+
+# The configured Kerberos executable search paths, if any
+KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY = '{{kerberos-env/executable_search_paths}}'
+
+# script parameter holding the maximum run time, in seconds, of the status command
+CHECK_COMMAND_TIMEOUT_KEY = 'check.command.timeout'
+CHECK_COMMAND_TIMEOUT_DEFAULT = 15.0
+
+
+logger = logging.getLogger('ambari_alerts')
+
+@OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT)
+def get_tokens():
+  """
+  Provides the {{site/property}} tokens used to build the dictionary passed into execute
+  """
+  tokens = (SECURITY_ENABLED_KEY, KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY, HIVE_PRINCIPAL_KEY,
+            HIVE_PRINCIPAL_KEYTAB_KEY, HIVE_USER_KEY, STACK_ROOT, LLAP_APP_NAME_KEY)
+  return tokens
+
+
+@OsFamilyFuncImpl(os_family=OsFamilyImpl.DEFAULT)
+def execute(configurations={}, parameters={}, host_name=None):
+  """
+  Runs 'hive --service llapstatus' for the configured LLAP application and maps
+  the reported state to an alert result.
+
+  Returns a tuple containing the result code and a pre-formatted result label:
+  OK for RUNNING_ALL, or for RUNNING_PARTIAL with at least 80% of the desired
+  instances live; CRITICAL for any other reported state; UNKNOWN when the
+  status could not be obtained or parsed.
+
+  Keyword arguments:
+  configurations (dictionary): a mapping of configuration key to value
+  parameters (dictionary): a mapping of script parameter key to value
+  host_name (string): the name of this host where the alert is running
+  """
+
+  if configurations is None:
+    return ('UNKNOWN', ['There were no configurations supplied to the script.'])
+
+  if parameters is None:
+    parameters = {}
+
+  # names of the script parameters declared for this alert in alerts.json
+  hive_user_param = 'default.hive.user'
+  hive_principal_param = 'default.hive.principal'
+  hive_keytab_param = 'default.hive.keytab'
+  # share of the desired instances that must be live for RUNNING_PARTIAL to be OK
+  percent_desired_instances_to_be_up = 80
+
+  result_code = None
+
+  try:
+    security_enabled = str(configurations.get(SECURITY_ENABLED_KEY, False)).upper() == 'TRUE'
+
+    # the timeout is a script parameter, so it is looked up in parameters;
+    # float() accepts both numeric values and strings such as '15.0'
+    check_command_timeout = CHECK_COMMAND_TIMEOUT_DEFAULT
+    if CHECK_COMMAND_TIMEOUT_KEY in parameters:
+      check_command_timeout = float(parameters[CHECK_COMMAND_TIMEOUT_KEY])
+
+    # cluster configuration wins, then the script parameter, then the constant
+    hive_user = configurations.get(HIVE_USER_KEY,
+                                   parameters.get(hive_user_param, HIVE_USER_DEFAULT))
+    llap_app_name = configurations.get(LLAP_APP_NAME_KEY, LLAP_APP_NAME_DEFAULT)
+    stack_root = configurations.get(STACK_ROOT, STACK_ROOT_DEFAULT)
+
+    if security_enabled:
+      llap_principal = configurations.get(HIVE_PRINCIPAL_KEY,
+                                          parameters.get(hive_principal_param, HIVE_PRINCIPAL_DEFAULT))
+      llap_keytab = configurations.get(HIVE_PRINCIPAL_KEYTAB_KEY,
+                                       parameters.get(hive_keytab_param, HIVE_PRINCIPAL_KEYTAB_DEFAULT))
+
+      # Get the configured Kerberos executable search paths, if any
+      kerberos_executable_search_paths = configurations.get(KERBEROS_EXECUTABLE_SEARCH_PATHS_KEY)
+
+      kinit_path_local = get_kinit_path(kerberos_executable_search_paths)
+      # format() fills the braces from the local variables of the same name
+      kinitcmd = format("{kinit_path_local} -kt {llap_keytab} {llap_principal}; ")
+
+      # prevent concurrent kinit
+      kinit_lock = global_lock.get_lock(global_lock.LOCK_TYPE_KERBEROS)
+      kinit_lock.acquire()
+      try:
+        Execute(kinitcmd, user=hive_user,
+                path=["/bin/", "/usr/bin/", "/usr/lib/hive/bin/", "/usr/sbin/"],
+                timeout=10)
+      finally:
+        kinit_lock.release()
+
+    # the reported duration covers the status check only, not the kinit above
+    start_time = time.time()
+    llap_status_cmd = format("{stack_root}/current/hive-server2-hive2/bin/hive --service llapstatus --name {llap_app_name}")
+
+    code, output, error = shell.checked_call(llap_status_cmd, user=hive_user, stderr=subprocess.PIPE,
+                                             timeout=check_command_timeout,
+                                             logoutput=False)
+    # the command output is parsed as a JSON document
+    llap_app_info = json.loads(output)
+
+    if llap_app_info is None or 'state' not in llap_app_info:
+      # no exception is active here, so report a readable message instead of an empty traceback
+      total_time = time.time() - start_time
+      alert_label = CRITICAL_MESSAGE.format(total_time)
+      return (UKNOWN_STATUS_CODE, [alert_label])
+
+    app_state = llap_app_info['state']
+
+    if app_state.upper() == 'RUNNING_ALL':
+      result_code = OK_RESULT_CODE
+      total_time = time.time() - start_time
+      alert_label = OK_MESSAGE.format(app_state, total_time)
+    elif app_state.upper() == 'RUNNING_PARTIAL':
+      # Get 'live' and 'desired' instances
+      if 'liveInstances' not in llap_app_info or 'desiredInstances' not in llap_app_info:
+        total_time = time.time() - start_time
+        alert_label = CRITICAL_MESSAGE_WITH_STATE.format(app_state, total_time)
+        return (CRITICAL_RESULT_CODE, [alert_label])
+
+      live_instances = llap_app_info['liveInstances']
+      desired_instances = llap_app_info['desiredInstances']
+      if live_instances < 0 or desired_instances <= 0:
+        # the template has four placeholders; all of them must be supplied
+        total_time = time.time() - start_time
+        alert_label = MESSAGE_WITH_STATE_AND_INSTANCES.format(app_state, live_instances,
+                                                              desired_instances, total_time)
+        return (CRITICAL_RESULT_CODE, [alert_label])
+
+      # OK only when a large enough share of the desired instances is live
+      percent_instances_up = float(live_instances) / desired_instances * 100
+      if percent_instances_up >= percent_desired_instances_to_be_up:
+        result_code = OK_RESULT_CODE
+      else:
+        result_code = CRITICAL_RESULT_CODE
+      total_time = time.time() - start_time
+      alert_label = MESSAGE_WITH_STATE_AND_INSTANCES.format(app_state, live_instances,
+                                                            desired_instances, total_time)
+    else:
+      # every other reported state is treated as CRITICAL
+      result_code = CRITICAL_RESULT_CODE
+      total_time = time.time() - start_time
+      alert_label = CRITICAL_MESSAGE_WITH_STATE.format(app_state, total_time)
+  except Exception:
+    # any failure inside the try block is reported as UNKNOWN together with its traceback
+    alert_label = traceback.format_exc()
+    result_code = UKNOWN_STATUS_CODE
+  return (result_code, [alert_label])
\ No newline at end of file

http://git-wip-us.apache.org/repos/asf/ambari/blob/82bea1cb/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/scripts/params_linux.py
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/scripts/params_linux.py b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/scripts/params_linux.py
index 22e1b55..a4f5378 100644
--- a/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/scripts/params_linux.py
+++ b/ambari-server/src/main/resources/common-services/HIVE/0.12.0.2.0/package/scripts/params_linux.py
@@ -559,6 +559,7 @@ if has_hive_interactive:
   llap_log_level = config['configurations']['hive-interactive-env']['llap_log_level']
   hive_llap_io_mem_size = config['configurations']['hive-interactive-site']['hive.llap.io.memory.size']
   llap_heap_size = config['configurations']['hive-interactive-env']['llap_heap_size']
+  llap_app_name = config['configurations']['hive-interactive-env']['llap_app_name']
   if security_enabled:
     hive_llap_keytab_file = config['configurations']['hive-interactive-site']['hive.llap.zk.sm.keytab.file']
     hive_headless_keytab = config['configurations']['hive-interactive-site']['hive.llap.zk.sm.principal']

http://git-wip-us.apache.org/repos/asf/ambari/blob/82bea1cb/ambari-server/src/main/resources/stacks/HDP/2.5/services/HIVE/configuration/hive-interactive-env.xml
----------------------------------------------------------------------
diff --git a/ambari-server/src/main/resources/stacks/HDP/2.5/services/HIVE/configuration/hive-interactive-env.xml b/ambari-server/src/main/resources/stacks/HDP/2.5/services/HIVE/configuration/hive-interactive-env.xml
index aad9c47..a4d39e1 100644
--- a/ambari-server/src/main/resources/stacks/HDP/2.5/services/HIVE/configuration/hive-interactive-env.xml
+++ b/ambari-server/src/main/resources/stacks/HDP/2.5/services/HIVE/configuration/hive-interactive-env.xml
@@ -184,6 +184,12 @@
     <description>LLAP app logging level</description>
     <display-name>LLAP app logging level</display-name>
   </property>
+  <property>
+    <name>llap_app_name</name>
+    <value>llap0</value>
+    <description>LLAP app name</description>
+    <display-name>LLAP app name</display-name>
+  </property>
 
   <!-- hive-env.sh -->
   <property>