You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@bigtop.apache.org by kw...@apache.org on 2018/08/02 18:26:59 UTC

bigtop git commit: BIGTOP-3047: juju: nagios support for zookeeper

Repository: bigtop
Updated Branches:
  refs/heads/master 4cfd51332 -> 05f14ffa5


BIGTOP-3047: juju: nagios support for zookeeper

Closes #372

Signed-off-by: Kevin W Monroe <ke...@canonical.com>


Project: http://git-wip-us.apache.org/repos/asf/bigtop/repo
Commit: http://git-wip-us.apache.org/repos/asf/bigtop/commit/05f14ffa
Tree: http://git-wip-us.apache.org/repos/asf/bigtop/tree/05f14ffa
Diff: http://git-wip-us.apache.org/repos/asf/bigtop/diff/05f14ffa

Branch: refs/heads/master
Commit: 05f14ffa56196f2aba92662155df1a076c17d1f2
Parents: 4cfd513
Author: Kevin W Monroe <ke...@canonical.com>
Authored: Thu Aug 2 13:25:38 2018 -0500
Committer: Kevin W Monroe <ke...@canonical.com>
Committed: Thu Aug 2 13:25:38 2018 -0500

----------------------------------------------------------------------
 .../charm/zookeeper/layer-zookeeper/config.yaml |  16 +
 .../layer-zookeeper/files/check_zookeeper.py    | 356 +++++++++++++++++++
 .../charm/zookeeper/layer-zookeeper/layer.yaml  |   2 +
 .../zookeeper/layer-zookeeper/metadata.yaml     |   6 +
 .../layer-zookeeper/reactive/zookeeper.py       | 102 +++++-
 5 files changed, 481 insertions(+), 1 deletion(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/bigtop/blob/05f14ffa/bigtop-packages/src/charm/zookeeper/layer-zookeeper/config.yaml
----------------------------------------------------------------------
diff --git a/bigtop-packages/src/charm/zookeeper/layer-zookeeper/config.yaml b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/config.yaml
index df9af76..63f566c 100644
--- a/bigtop-packages/src/charm/zookeeper/layer-zookeeper/config.yaml
+++ b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/config.yaml
@@ -23,3 +23,19 @@ options:
       snapRetainCount most recent snapshots and the corresponding
       transaction logs in the dataDir and dataLogDir respectively
       and deletes the rest. Defaults to 3. Minimum value is 3.
+  nagios_context:
+    default: "juju"
+    type: string
+    description: |
+      Used by the nrpe subordinate charms.
+      A string that will be prepended to instance name to set the host name
+      in nagios. So for instance the hostname would be something like:
+          juju-myservice-0
+      If you're running multiple environments with the same services in them
+      this allows you to differentiate between them.
+  nagios_servicegroups:
+    default: ""
+    type: string
+    description: |
+      A comma-separated list of nagios servicegroups.
+      If left empty, the nagios_context will be used as the servicegroup

http://git-wip-us.apache.org/repos/asf/bigtop/blob/05f14ffa/bigtop-packages/src/charm/zookeeper/layer-zookeeper/files/check_zookeeper.py
----------------------------------------------------------------------
diff --git a/bigtop-packages/src/charm/zookeeper/layer-zookeeper/files/check_zookeeper.py b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/files/check_zookeeper.py
new file mode 100644
index 0000000..923ccef
--- /dev/null
+++ b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/files/check_zookeeper.py
@@ -0,0 +1,356 @@
+#! /usr/bin/env python
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+""" Check Zookeeper Cluster
+
+Generic monitoring script that could be used with multiple platforms (Ganglia, Nagios, Cacti).
+
+It requires ZooKeeper 3.4.0 or greater. The script needs the 'mntr' 4letter word
+command (patch ZOOKEEPER-744), which has now been committed to the trunk.
+The script also works with ZooKeeper 3.3.x but in a limited way.
+
+Taken from https://github.com/andreisavu/zookeeper-monitoring/
+
+"""
+
+import sys
+import socket
+import logging
+import re
+import subprocess
+
+from StringIO import StringIO
+from optparse import OptionParser, OptionGroup
+
+__version__ = (0, 1, 0)
+
+log = logging.getLogger()
+logging.basicConfig(level=logging.ERROR)
+
+class NagiosHandler(object):
+
+    @classmethod
+    def register_options(cls, parser):
+        group = OptionGroup(parser, 'Nagios specific options')
+
+        group.add_option('-w', '--warning', dest='warning')
+        group.add_option('-c', '--critical', dest='critical')
+
+        parser.add_option_group(group)
+
+    def analyze(self, opts, cluster_stats):
+        try:
+            warning = int(opts.warning)
+            critical = int(opts.critical)
+
+        except (TypeError, ValueError):
+            print >>sys.stderr, 'Invalid values for "warning" and "critical".'
+            return 2
+
+        if opts.key is None:
+            print >>sys.stderr, 'You should specify a key name.'
+            return 2
+
+        warning_state, critical_state, values = [], [], []
+        for host, stats in cluster_stats.items():
+            if opts.key in stats:
+
+                value = stats[opts.key]
+                values.append('%s=%s;%s;%s' % (host, value, warning, critical))
+
+                if warning >= value > critical or warning <= value < critical:
+                    warning_state.append(host)
+
+                elif (warning < critical and critical <= value) or (warning > critical and critical >= value):
+                    critical_state.append(host)
+
+        values = ' '.join(values)
+        if critical_state:
+            print 'Critical "%s" %s!|%s' % (opts.key, ', '.join(critical_state), values)
+            return 2
+
+        elif warning_state:
+            print 'Warning "%s" %s!|%s' % (opts.key, ', '.join(warning_state), values)
+            return 1
+
+        else:
+            print 'Ok "%s"!|%s' % (opts.key, values)
+            return 0
+
+class CactiHandler(object):
+
+    @classmethod
+    def register_options(cls, parser):
+        group = OptionGroup(parser, 'Cacti specific options')
+
+        group.add_option('-l', '--leader', dest='leader',
+            action="store_true", help="only query the cluster leader")
+
+        parser.add_option_group(group)
+
+    def analyze(self, opts, cluster_stats):
+        if opts.key is None:
+            print >>sys.stderr, 'The key name is mandatory.'
+            return 1
+
+        if opts.leader is True:
+            try:
+                leader = [x for x in cluster_stats.values() \
+                    if x.get('zk_server_state', '') == 'leader'][0]
+
+            except IndexError:
+                print >>sys.stderr, 'No leader found.'
+                return 3
+
+            if opts.key in leader:
+                print leader[opts.key]
+                return 0
+
+            else:
+                print >>sys.stderr, 'Unknown key: "%s"' % opts.key
+                return 2
+        else:
+            for host, stats in cluster_stats.items():
+                if opts.key not in stats:
+                    continue
+
+                host = host.replace(':', '_')
+                print '%s:%s' % (host, stats[opts.key]),
+
+
+class GangliaHandler(object):
+
+    @classmethod
+    def register_options(cls, parser):
+        group = OptionGroup(parser, 'Ganglia specific options')
+
+        group.add_option('-g', '--gmetric', dest='gmetric',
+            default='/usr/bin/gmetric', help='ganglia gmetric binary '\
+            'location: /usr/bin/gmetric')
+
+        parser.add_option_group(group)
+
+    def call(self, *args, **kwargs):
+        subprocess.call(*args, **kwargs)
+
+    def analyze(self, opts, cluster_stats):
+        if len(cluster_stats) != 1:
+            print >>sys.stderr, 'Only allowed to monitor a single node.'
+            return 1
+
+        for host, stats in cluster_stats.items():
+            for k, v in stats.items():
+                try:
+                    self.call([opts.gmetric, '-n', k, '-v', str(int(v)), '-t', 'uint32'])
+                except (TypeError, ValueError):
+                    pass
+
+class ZooKeeperServer(object):
+
+    def __init__(self, host='localhost', port='2181', timeout=1):
+        self._address = (host, int(port))
+        self._timeout = timeout
+
+    def get_stats(self):
+        """ Get ZooKeeper server stats as a map """
+        data = self._send_cmd('mntr')
+        if data:
+            return self._parse(data)
+        else:
+            data = self._send_cmd('stat')
+            return self._parse_stat(data)
+
+    def _create_socket(self):
+        return socket.socket()
+
+    def _send_cmd(self, cmd):
+        """ Send a 4letter word command to the server """
+        s = self._create_socket()
+        s.settimeout(self._timeout)
+
+        s.connect(self._address)
+        s.send(cmd)
+
+        data = s.recv(2048)
+        s.close()
+
+        return data
+
+    def _parse(self, data):
+        """ Parse the output from the 'mntr' 4letter word command """
+        h = StringIO(data)
+
+        result = {}
+        for line in h.readlines():
+            try:
+                key, value = self._parse_line(line)
+                result[key] = value
+            except ValueError:
+                pass # ignore broken lines
+
+        return result
+
+    def _parse_stat(self, data):
+        """ Parse the output from the 'stat' 4letter word command """
+        h = StringIO(data)
+
+        result = {}
+
+        version = h.readline()
+        if version:
+            result['zk_version'] = version[version.index(':')+1:].strip()
+
+        # skip all lines until we find the empty one
+        while h.readline().strip(): pass
+
+        for line in h.readlines():
+            m = re.match('Latency min/avg/max: (\d+)/(\d+)/(\d+)', line)
+            if m is not None:
+                result['zk_min_latency'] = int(m.group(1))
+                result['zk_avg_latency'] = int(m.group(2))
+                result['zk_max_latency'] = int(m.group(3))
+                continue
+
+            m = re.match('Received: (\d+)', line)
+            if m is not None:
+                result['zk_packets_received'] = int(m.group(1))
+                continue
+
+            m = re.match('Sent: (\d+)', line)
+            if m is not None:
+                result['zk_packets_sent'] = int(m.group(1))
+                continue
+
+            m = re.match('Outstanding: (\d+)', line)
+            if m is not None:
+                result['zk_outstanding_requests'] = int(m.group(1))
+                continue
+
+            m = re.match('Mode: (.*)', line)
+            if m is not None:
+                result['zk_server_state'] = m.group(1)
+                continue
+
+            m = re.match('Node count: (\d+)', line)
+            if m is not None:
+                result['zk_znode_count'] = int(m.group(1))
+                continue
+
+        return result
+
+    def _parse_line(self, line):
+        try:
+            key, value = map(str.strip, line.split('\t'))
+        except ValueError:
+            raise ValueError('Found invalid line: %s' % line)
+
+        if not key:
+            raise ValueError('The key is mandatory and should not be empty')
+
+        try:
+            value = int(value)
+        except (TypeError, ValueError):
+            pass
+
+        return key, value
+
+def main():
+    opts, args = parse_cli()
+
+    cluster_stats = get_cluster_stats(opts.servers)
+    if opts.output is None:
+        dump_stats(cluster_stats)
+        return 0
+
+    handler = create_handler(opts.output)
+    if handler is None:
+        log.error('undefined handler: %s' % opts.output)
+        sys.exit(1)
+
+    return handler.analyze(opts, cluster_stats)
+
+def create_handler(name):
+    """ Return an instance of a platform specific analyzer """
+    try:
+        return globals()['%sHandler' % name.capitalize()]()
+    except KeyError:
+        return None
+
+def get_all_handlers():
+    """ Get a list containing all the platform specific analyzers """
+    return [NagiosHandler, CactiHandler, GangliaHandler]
+
+def dump_stats(cluster_stats):
+    """ Dump cluster statistics in an user friendly format """
+    for server, stats in cluster_stats.items():
+        print 'Server:', server
+
+        for key, value in stats.items():
+            print "%30s" % key, ' ', value
+        print
+
+def get_cluster_stats(servers):
+    """ Get stats for all the servers in the cluster """
+    stats = {}
+    for host, port in servers:
+        try:
+            zk = ZooKeeperServer(host, port)
+            stats["%s:%s" % (host, port)] = zk.get_stats()
+
+        except socket.error, e:
+            # ignore because the cluster can still work even
+            # if some servers fail completely
+
+            # this error should be also visible in a variable
+            # exposed by the server in the statistics
+
+            logging.info('unable to connect to server '\
+                '"%s" on port "%s"' % (host, port))
+
+    return stats
+
+
+def get_version():
+    return '.'.join(map(str, __version__))
+
+
+def parse_cli():
+    parser = OptionParser(usage='./check_zookeeper.py <options>', version=get_version())
+
+    parser.add_option('-s', '--servers', dest='servers',
+        help='a list of SERVERS', metavar='SERVERS')
+
+    parser.add_option('-o', '--output', dest='output',
+        help='output HANDLER: nagios, ganglia, cacti', metavar='HANDLER')
+
+    parser.add_option('-k', '--key', dest='key')
+
+    for handler in get_all_handlers():
+        handler.register_options(parser)
+
+    opts, args = parser.parse_args()
+
+    if opts.servers is None:
+        parser.error('The list of servers is mandatory')
+
+    opts.servers = [s.split(':') for s in opts.servers.split(',')]
+
+    return (opts, args)
+
+
+if __name__ == '__main__':
+    sys.exit(main())

http://git-wip-us.apache.org/repos/asf/bigtop/blob/05f14ffa/bigtop-packages/src/charm/zookeeper/layer-zookeeper/layer.yaml
----------------------------------------------------------------------
diff --git a/bigtop-packages/src/charm/zookeeper/layer-zookeeper/layer.yaml b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/layer.yaml
index 7f6ee76..e52afc8 100644
--- a/bigtop-packages/src/charm/zookeeper/layer-zookeeper/layer.yaml
+++ b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/layer.yaml
@@ -4,6 +4,8 @@ includes:
   - 'layer:leadership'
   - 'interface:zookeeper-quorum'
   - 'interface:zookeeper'
+  - 'interface:nrpe-external-master'
+  - 'interface:local-monitors'
 options:
   apache-bigtop-base:
     ports:

http://git-wip-us.apache.org/repos/asf/bigtop/blob/05f14ffa/bigtop-packages/src/charm/zookeeper/layer-zookeeper/metadata.yaml
----------------------------------------------------------------------
diff --git a/bigtop-packages/src/charm/zookeeper/layer-zookeeper/metadata.yaml b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/metadata.yaml
index a563775..36dce42 100644
--- a/bigtop-packages/src/charm/zookeeper/layer-zookeeper/metadata.yaml
+++ b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/metadata.yaml
@@ -14,6 +14,12 @@ tags: []
 provides:
   zookeeper:
     interface: zookeeper
+  nrpe-external-master:
+    interface: nrpe-external-master
+    scope: container
+  local-monitors:
+    interface: local-monitors
+    scope: container
 peers:
   zkpeer:
     interface: zookeeper-quorum

http://git-wip-us.apache.org/repos/asf/bigtop/blob/05f14ffa/bigtop-packages/src/charm/zookeeper/layer-zookeeper/reactive/zookeeper.py
----------------------------------------------------------------------
diff --git a/bigtop-packages/src/charm/zookeeper/layer-zookeeper/reactive/zookeeper.py b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/reactive/zookeeper.py
index 0cf11c2..5af4c5d 100644
--- a/bigtop-packages/src/charm/zookeeper/layer-zookeeper/reactive/zookeeper.py
+++ b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/reactive/zookeeper.py
@@ -19,8 +19,108 @@ from charmhelpers.core import hookenv
 from charms.layer.apache_bigtop_base import get_package_version
 from charms.layer.bigtop_zookeeper import Zookeeper
 from charms.leadership import leader_set, leader_get
-from charms.reactive import set_state, when, when_not, is_state
+from charms.reactive import (
+    hook,
+    is_state,
+    remove_state,
+    set_state,
+    when,
+    when_not
+)
 from charms.reactive.helpers import data_changed
+import shutil
+import os
+
+
+@when('local-monitors.available')
+def local_monitors_available(nagios):
+    setup_nagios(nagios)
+
+
+@when('nrpe-external-master.available')
+def nrpe_external_master_available(nagios):
+    setup_nagios(nagios)
+
+
+def setup_nagios(nagios):
+    """Register the ZooKeeper metric checks on a nagios/nrpe relation.
+
+    Each check runs check_zookeeper.py with --key set to the check name, so
+    the name must match a ZooKeeper stat key exactly: the script reports Ok
+    for a key it cannot find in the stats.
+
+    :param nagios: relation endpoint providing add_check() and updated().
+    """
+    config = hookenv.config()
+    unit_name = hookenv.local_unit()
+    checks = [
+        {
+            'name': 'zk_open_file_descriptor_count',
+            'description': 'ZK_Open_File_Descriptors_Count',
+            'warn': 500, 'crit': 800
+        },
+        {
+            'name': 'zk_ephemerals_count',
+            'description': 'ZK_Ephemerals_Count',
+            'warn': 10000, 'crit': 100000
+        },
+        {
+            'name': 'zk_avg_latency',
+            'description': 'ZK_Avg_Latency',
+            'warn': 500, 'crit': 1000
+        },
+        {
+            'name': 'zk_max_latency',
+            'description': 'ZK_Max_Latency',
+            'warn': 1000, 'crit': 2000
+        },
+        {
+            'name': 'zk_min_latency',
+            'description': 'ZK_Min_Latency',
+            'warn': 500, 'crit': 1000
+        },
+        {
+            'name': 'zk_outstanding_requests',
+            'description': 'ZK_Outstanding_Requests',
+            'warn': 20, 'crit': 50
+        },
+        {
+            'name': 'zk_watch_count',
+            'description': 'ZK_Watch_Count',
+            'warn': 100, 'crit': 500
+        },
+    ]
+    check_cmd = ['/usr/local/lib/nagios/plugins/check_zookeeper.py',
+                 '-o', 'nagios', '-s', 'localhost:2181']
+    for check in checks:
+        nagios.add_check(check_cmd + ['--key', check['name'],
+                                      '-w', str(check['warn']),
+                                      '-c', str(check['crit'])],
+                         name=check['name'],
+                         description=check['description'],
+                         context=config["nagios_context"],
+                         servicegroups=config["nagios_servicegroups"],
+                         unit=unit_name)
+    nagios.updated()
+
+
+@hook('upgrade-charm')
+def nrpe_helper_upgrade_charm():
+    # Make sure the nrpe handler will get replaced at charm upgrade
+    remove_state('zookeeper.nrpe_helper.installed')
+
+
+@when('zookeeper.nrpe_helper.registered')
+@when_not('zookeeper.nrpe_helper.installed')
+def install_nrpe_helper():
+    dst_dir = '/usr/local/lib/nagios/plugins/'
+    if not os.path.exists(dst_dir):
+        os.makedirs(dst_dir)
+    src = '{}/files/check_zookeeper.py'.format(hookenv.charm_dir())
+    dst = '{}/check_zookeeper.py'.format(dst_dir)
+    shutil.copy(src, dst)
+    os.chmod(dst, 0o755)
+    set_state('zookeeper.nrpe_helper.installed')
 
 
 @when('bigtop.available')