You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@bigtop.apache.org by kw...@apache.org on 2018/08/02 18:26:59 UTC
bigtop git commit: BIGTOP-3047: juju: nagios support for zookeeper
Repository: bigtop
Updated Branches:
refs/heads/master 4cfd51332 -> 05f14ffa5
BIGTOP-3047: juju: nagios support for zookeeper
Closes #372
Signed-off-by: Kevin W Monroe <ke...@canonical.com>
Project: http://git-wip-us.apache.org/repos/asf/bigtop/repo
Commit: http://git-wip-us.apache.org/repos/asf/bigtop/commit/05f14ffa
Tree: http://git-wip-us.apache.org/repos/asf/bigtop/tree/05f14ffa
Diff: http://git-wip-us.apache.org/repos/asf/bigtop/diff/05f14ffa
Branch: refs/heads/master
Commit: 05f14ffa56196f2aba92662155df1a076c17d1f2
Parents: 4cfd513
Author: Kevin W Monroe <ke...@canonical.com>
Authored: Thu Aug 2 13:25:38 2018 -0500
Committer: Kevin W Monroe <ke...@canonical.com>
Committed: Thu Aug 2 13:25:38 2018 -0500
----------------------------------------------------------------------
.../charm/zookeeper/layer-zookeeper/config.yaml | 16 +
.../layer-zookeeper/files/check_zookeeper.py | 356 +++++++++++++++++++
.../charm/zookeeper/layer-zookeeper/layer.yaml | 2 +
.../zookeeper/layer-zookeeper/metadata.yaml | 6 +
.../layer-zookeeper/reactive/zookeeper.py | 102 +++++-
5 files changed, 481 insertions(+), 1 deletion(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/bigtop/blob/05f14ffa/bigtop-packages/src/charm/zookeeper/layer-zookeeper/config.yaml
----------------------------------------------------------------------
diff --git a/bigtop-packages/src/charm/zookeeper/layer-zookeeper/config.yaml b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/config.yaml
index df9af76..63f566c 100644
--- a/bigtop-packages/src/charm/zookeeper/layer-zookeeper/config.yaml
+++ b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/config.yaml
@@ -23,3 +23,19 @@ options:
snapRetainCount most recent snapshots and the corresponding
transaction logs in the dataDir and dataLogDir respectively
and deletes the rest. Defaults to 3. Minimum value is 3.
+ nagios_context:
+ default: "juju"
+ type: string
+ description: |
+ Used by the nrpe subordinate charms.
+ A string that will be prepended to instance name to set the host name
+ in nagios. So for instance the hostname would be something like:
+ juju-myservice-0
+ If you're running multiple environments with the same services in them
+ this allows you to differentiate between them.
+ nagios_servicegroups:
+ default: ""
+ type: string
+ description: |
+ A comma-separated list of nagios servicegroups.
+ If left empty, the nagios_context will be used as the servicegroup
http://git-wip-us.apache.org/repos/asf/bigtop/blob/05f14ffa/bigtop-packages/src/charm/zookeeper/layer-zookeeper/files/check_zookeeper.py
----------------------------------------------------------------------
diff --git a/bigtop-packages/src/charm/zookeeper/layer-zookeeper/files/check_zookeeper.py b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/files/check_zookeeper.py
new file mode 100644
index 0000000..923ccef
--- /dev/null
+++ b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/files/check_zookeeper.py
@@ -0,0 +1,356 @@
+#! /usr/bin/env python
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+""" Check Zookeeper Cluster
+
+Generic monitoring script that could be used with multiple platforms (Ganglia, Nagios, Cacti).
+
+It requires ZooKeeper 3.4.0 or greater. The script needs the 'mntr' four-letter-word
+command (patch ZOOKEEPER-744), which has since been committed to trunk.
+The script also works with ZooKeeper 3.3.x, but in a limited way.
+
+Taken from https://github.com/andreisavu/zookeeper-monitoring/
+
+"""
+
+import sys
+import socket
+import logging
+import re
+import subprocess
+
+from StringIO import StringIO
+from optparse import OptionParser, OptionGroup
+
+__version__ = (0, 1, 0)
+
+log = logging.getLogger()
+logging.basicConfig(level=logging.ERROR)
+
+class NagiosHandler(object):
+    """Evaluate one ZooKeeper statistic against Nagios thresholds.
+
+    analyze() returns a Nagios plugin status code:
+    0 (OK), 1 (WARNING), 2 (CRITICAL) or 3 (UNKNOWN).
+    """
+
+    @classmethod
+    def register_options(cls, parser):
+        """Register the -w/--warning and -c/--critical options on parser."""
+        group = OptionGroup(parser, 'Nagios specific options')
+
+        group.add_option('-w', '--warning', dest='warning')
+        group.add_option('-c', '--critical', dest='critical')
+
+        parser.add_option_group(group)
+
+    def analyze(self, opts, cluster_stats):
+        """Check opts.key on every server and print a Nagios status line.
+
+        opts must provide 'warning', 'critical' and 'key'; cluster_stats
+        maps "host:port" to that server's statistics dict.
+
+        When warning < critical, larger values are worse. When
+        warning > critical the comparison is inverted, so smaller values
+        are worse.
+
+        Returns 2 for invalid thresholds, a missing key name or any server
+        in the critical range; otherwise 1 if any server is in the warning
+        range; 3 if no server reported the key at all; 0 otherwise.
+        """
+        try:
+            warning = int(opts.warning)
+            critical = int(opts.critical)
+
+        except (TypeError, ValueError):
+            print >>sys.stderr, 'Invalid values for "warning" and "critical".'
+            return 2
+
+        if opts.key is None:
+            print >>sys.stderr, 'You should specify a key name.'
+            return 2
+
+        warning_state, critical_state, values = [], [], []
+        for host, stats in cluster_stats.items():
+            if opts.key in stats:
+
+                value = stats[opts.key]
+                # Nagios performance data: label=value;warn;crit
+                values.append('%s=%s;%s;%s' % (host, value, warning, critical))
+
+                if warning >= value > critical or warning <= value < critical:
+                    warning_state.append(host)
+
+                elif (warning < critical and critical <= value) or (warning > critical and critical >= value):
+                    critical_state.append(host)
+
+        if not values:
+            # Nothing was measured (no server answered, or none exposes this
+            # key), so the state is unknown. Reporting "Ok" here would hide a
+            # dead server or a misspelled key.
+            print 'Unknown "%s": not reported by any server!' % opts.key
+            return 3
+
+        values = ' '.join(values)
+        if critical_state:
+            print 'Critical "%s" %s!|%s' % (opts.key, ', '.join(critical_state), values)
+            return 2
+
+        elif warning_state:
+            print 'Warning "%s" %s!|%s' % (opts.key, ', '.join(warning_state), values)
+            return 1
+
+        else:
+            print 'Ok "%s"!|%s' % (opts.key, values)
+            return 0
+
+class CactiHandler(object):
+    """Print the value of one statistic in a form usable by Cacti."""
+
+    @classmethod
+    def register_options(cls, parser):
+        """Register the Cacti-only -l/--leader switch on parser."""
+        cacti_group = OptionGroup(parser, 'Cacti specific options')
+        cacti_group.add_option(
+            '-l', '--leader', dest='leader', action="store_true",
+            help="only query the cluster leader")
+        parser.add_option_group(cacti_group)
+
+    def analyze(self, opts, cluster_stats):
+        """Print opts.key for the leader only, or for every server.
+
+        With opts.leader set, prints the leader's value and returns 0,
+        returns 3 when no server reports itself as leader, and 2 when the
+        leader does not expose the key. Without it, prints "host_port:value"
+        pairs on one line and returns None. Returns 1 when no key is given.
+        """
+        if opts.key is None:
+            print >>sys.stderr, 'The key name is mandatory.'
+            return 1
+
+        if opts.leader is not True:
+            for address, node_stats in cluster_stats.items():
+                if opts.key in node_stats:
+                    # ':' separates host from value below, so it is replaced
+                    # inside the address itself.
+                    print '%s:%s' % (address.replace(':', '_'), node_stats[opts.key]),
+            return
+
+        leaders = [node_stats for node_stats in cluster_stats.values()
+                   if node_stats.get('zk_server_state', '') == 'leader']
+        if not leaders:
+            print >>sys.stderr, 'No leader found.'
+            return 3
+
+        leader = leaders[0]
+        if opts.key not in leader:
+            print >>sys.stderr, 'Unknown key: "%s"' % opts.key
+            return 2
+
+        print leader[opts.key]
+        return 0
+
+class GangliaHandler(object):
+    """Publish the statistics of a single node through gmetric."""
+
+    @classmethod
+    def register_options(cls, parser):
+        """Register the -g/--gmetric option (path of the gmetric binary)."""
+        ganglia_group = OptionGroup(parser, 'Ganglia specific options')
+        ganglia_group.add_option(
+            '-g', '--gmetric', dest='gmetric', default='/usr/bin/gmetric',
+            help='ganglia gmetric binary location: /usr/bin/gmetric')
+        parser.add_option_group(ganglia_group)
+
+    def call(self, *args, **kwargs):
+        """Forward to subprocess.call; its result is discarded."""
+        subprocess.call(*args, **kwargs)
+
+    def analyze(self, opts, cluster_stats):
+        """Send every integer statistic of the single monitored node.
+
+        Returns 1 unless cluster_stats holds exactly one server; otherwise
+        runs opts.gmetric once per statistic and returns None.
+        """
+        if len(cluster_stats) != 1:
+            print >>sys.stderr, 'Only allowed to monitor a single node.'
+            return 1
+
+        for node_stats in cluster_stats.values():
+            for metric, raw_value in node_stats.items():
+                try:
+                    command = [opts.gmetric, '-n', metric,
+                               '-v', str(int(raw_value)), '-t', 'uint32']
+                    self.call(command)
+                except (TypeError, ValueError):
+                    # Statistics that are not integers are skipped.
+                    pass
+
+class ZooKeeperServer(object):
+    """Query a single ZooKeeper server using four-letter-word commands."""
+
+    # 'stat' output lines that carry one integer, with the key each is
+    # stored under in the result map.
+    _STAT_INT_FIELDS = (
+        (r'Received: (\d+)', 'zk_packets_received'),
+        (r'Sent: (\d+)', 'zk_packets_sent'),
+        (r'Outstanding: (\d+)', 'zk_outstanding_requests'),
+        (r'Node count: (\d+)', 'zk_znode_count'),
+    )
+
+    def __init__(self, host='localhost', port='2181', timeout=1):
+        """Store the server address and the socket timeout.
+
+        port may be a string or an int; it is converted with int().
+        """
+        self._address = (host, int(port))
+        self._timeout = timeout
+
+    def get_stats(self):
+        """ Get ZooKeeper server stats as a map """
+        data = self._send_cmd('mntr')
+        if data:
+            return self._parse(data)
+        else:
+            # No answer to 'mntr': fall back to the 'stat' command.
+            data = self._send_cmd('stat')
+            return self._parse_stat(data)
+
+    def _create_socket(self):
+        """Return a new, unconnected socket."""
+        return socket.socket()
+
+    def _send_cmd(self, cmd):
+        """ Send a 4letter word command to the server
+
+        Returns the complete reply as a string. The reply is read until the
+        peer closes the connection, so an answer delivered in several
+        chunks is not truncated. The socket is closed even when connecting,
+        sending or receiving fails.
+        """
+        s = self._create_socket()
+        try:
+            s.settimeout(self._timeout)
+
+            s.connect(self._address)
+            s.sendall(cmd)
+
+            chunks = []
+            while True:
+                try:
+                    chunk = s.recv(2048)
+                except socket.timeout:
+                    if not chunks:
+                        raise
+                    # The peer went quiet after sending data: keep what we
+                    # already received instead of discarding it.
+                    break
+                if not chunk:
+                    break
+                chunks.append(chunk)
+        finally:
+            s.close()
+
+        return ''.join(chunks)
+
+    def _parse(self, data):
+        """ Parse the output from the 'mntr' 4letter word command """
+        h = StringIO(data)
+
+        result = {}
+        for line in h.readlines():
+            try:
+                key, value = self._parse_line(line)
+                result[key] = value
+            except ValueError:
+                pass # ignore broken lines
+
+        return result
+
+    def _parse_stat(self, data):
+        """ Parse the output from the 'stat' 4letter word command """
+        h = StringIO(data)
+
+        result = {}
+
+        version = h.readline()
+        # The version is whatever follows the first ':' on the first line.
+        # A first line without ':' (for example an error message) is skipped
+        # rather than raising ValueError.
+        if ':' in version:
+            result['zk_version'] = version[version.index(':')+1:].strip()
+
+        # skip all lines until we find the empty one
+        while h.readline().strip(): pass
+
+        for line in h.readlines():
+            m = re.match(r'Latency min/avg/max: (\d+)/(\d+)/(\d+)', line)
+            if m is not None:
+                result['zk_min_latency'] = int(m.group(1))
+                result['zk_avg_latency'] = int(m.group(2))
+                result['zk_max_latency'] = int(m.group(3))
+                continue
+
+            m = re.match(r'Mode: (.*)', line)
+            if m is not None:
+                result['zk_server_state'] = m.group(1)
+                continue
+
+            for pattern, key in self._STAT_INT_FIELDS:
+                m = re.match(pattern, line)
+                if m is not None:
+                    result[key] = int(m.group(1))
+                    break
+
+        return result
+
+    def _parse_line(self, line):
+        """Split one tab-separated 'mntr' line into (key, value).
+
+        The value is converted to int when possible and otherwise kept as
+        a string. Raises ValueError for lines that do not have exactly two
+        tab-separated fields or that have an empty key.
+        """
+        try:
+            key, value = map(str.strip, line.split('\t'))
+        except ValueError:
+            raise ValueError('Found invalid line: %s' % line)
+
+        if not key:
+            raise ValueError('The key is mandatory and should not be empty')
+
+        try:
+            value = int(value)
+        except (TypeError, ValueError):
+            pass
+
+        return key, value
+
+def main():
+    """Collect the cluster statistics and hand them to the chosen output.
+
+    Without -o/--output the statistics are dumped and 0 is returned;
+    otherwise the result of the handler's analyze() is returned. An
+    unknown handler name is logged and the process exits with status 1.
+    """
+    opts, args = parse_cli()
+    cluster_stats = get_cluster_stats(opts.servers)
+
+    if opts.output is not None:
+        handler = create_handler(opts.output)
+        if handler is None:
+            log.error('undefined handler: %s' % opts.output)
+            sys.exit(1)
+        return handler.analyze(opts, cluster_stats)
+
+    dump_stats(cluster_stats)
+    return 0
+
+def create_handler(name):
+    """ Return an instance of a platform specific analyzer
+
+    The class is looked up among the module globals as the capitalized
+    name followed by 'Handler'; None is returned when there is no such
+    global.
+    """
+    class_name = '%sHandler' % name.capitalize()
+    try:
+        handler = globals()[class_name]()
+    except KeyError:
+        handler = None
+    return handler
+
+def get_all_handlers():
+    """ List every platform specific analyzer class defined in this module """
+    handlers = [NagiosHandler, CactiHandler, GangliaHandler]
+    return handlers
+
+def dump_stats(cluster_stats):
+ """ Dump cluster statistics in a user-friendly format """
+ # cluster_stats maps a server label to that server's statistics dict.
+ for server, stats in cluster_stats.items():
+ print 'Server:', server
+
+ # Keys are right-aligned in a 30-character column.
+ for key, value in stats.items():
+ print "%30s" % key, ' ', value
+ # Bare print emits an empty line.
+ # NOTE(review): indentation was lost in this copy; presumably this runs
+ # once per server - confirm against the repository.
+ print
+
+def get_cluster_stats(servers):
+    """ Get stats for all the servers in the cluster
+
+    servers is an iterable of (host, port) pairs. The result maps
+    "host:port" to the statistics of every server that could be queried.
+    """
+    collected = {}
+    for host, port in servers:
+        try:
+            server = ZooKeeperServer(host, port)
+            collected["%s:%s" % (host, port)] = server.get_stats()
+        except socket.error:
+            # A failing server is skipped on purpose: the cluster can
+            # still work even if some servers fail completely, and the
+            # failure should also show up in the statistics exposed by
+            # the remaining servers.
+            logging.info('unable to connect to server "%s" on port "%s"' % (host, port))
+
+    return collected
+
+
+def get_version():
+    """Return __version__ as a dotted string."""
+    return '.'.join(str(part) for part in __version__)
+
+
+def parse_cli():
+    """Parse the command line and return (opts, args).
+
+    opts.servers is converted from the comma-separated SERVERS string into
+    a list of [host, port] pairs. An entry given without ":port" gets port
+    '2181', so it no longer fails when the pairs are unpacked later.
+    Exits through parser.error() when no servers are given.
+    """
+    parser = OptionParser(usage='./check_zookeeper.py <options>', version=get_version())
+
+    parser.add_option('-s', '--servers', dest='servers',
+        help='a list of SERVERS', metavar='SERVERS')
+
+    parser.add_option('-o', '--output', dest='output',
+        help='output HANDLER: nagios, ganglia, cacti', metavar='HANDLER')
+
+    parser.add_option('-k', '--key', dest='key')
+
+    # Let every output handler contribute its own option group.
+    for handler in get_all_handlers():
+        handler.register_options(parser)
+
+    opts, args = parser.parse_args()
+
+    if opts.servers is None:
+        parser.error('The list of servers is mandatory')
+
+    servers = []
+    for entry in opts.servers.split(','):
+        host, _, port = entry.partition(':')
+        # Same default port as ZooKeeperServer.__init__.
+        servers.append([host, port or '2181'])
+    opts.servers = servers
+
+    return (opts, args)
+
+
+if __name__ == '__main__':
+ sys.exit(main())
http://git-wip-us.apache.org/repos/asf/bigtop/blob/05f14ffa/bigtop-packages/src/charm/zookeeper/layer-zookeeper/layer.yaml
----------------------------------------------------------------------
diff --git a/bigtop-packages/src/charm/zookeeper/layer-zookeeper/layer.yaml b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/layer.yaml
index 7f6ee76..e52afc8 100644
--- a/bigtop-packages/src/charm/zookeeper/layer-zookeeper/layer.yaml
+++ b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/layer.yaml
@@ -4,6 +4,8 @@ includes:
- 'layer:leadership'
- 'interface:zookeeper-quorum'
- 'interface:zookeeper'
+ - 'interface:nrpe-external-master'
+ - 'interface:local-monitors'
options:
apache-bigtop-base:
ports:
http://git-wip-us.apache.org/repos/asf/bigtop/blob/05f14ffa/bigtop-packages/src/charm/zookeeper/layer-zookeeper/metadata.yaml
----------------------------------------------------------------------
diff --git a/bigtop-packages/src/charm/zookeeper/layer-zookeeper/metadata.yaml b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/metadata.yaml
index a563775..36dce42 100644
--- a/bigtop-packages/src/charm/zookeeper/layer-zookeeper/metadata.yaml
+++ b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/metadata.yaml
@@ -14,6 +14,12 @@ tags: []
provides:
zookeeper:
interface: zookeeper
+ nrpe-external-master:
+ interface: nrpe-external-master
+ scope: container
+ local-monitors:
+ interface: local-monitors
+ scope: container
peers:
zkpeer:
interface: zookeeper-quorum
http://git-wip-us.apache.org/repos/asf/bigtop/blob/05f14ffa/bigtop-packages/src/charm/zookeeper/layer-zookeeper/reactive/zookeeper.py
----------------------------------------------------------------------
diff --git a/bigtop-packages/src/charm/zookeeper/layer-zookeeper/reactive/zookeeper.py b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/reactive/zookeeper.py
index 0cf11c2..5af4c5d 100644
--- a/bigtop-packages/src/charm/zookeeper/layer-zookeeper/reactive/zookeeper.py
+++ b/bigtop-packages/src/charm/zookeeper/layer-zookeeper/reactive/zookeeper.py
@@ -19,8 +19,108 @@ from charmhelpers.core import hookenv
from charms.layer.apache_bigtop_base import get_package_version
from charms.layer.bigtop_zookeeper import Zookeeper
from charms.leadership import leader_set, leader_get
-from charms.reactive import set_state, when, when_not, is_state
+from charms.reactive import (
+ hook,
+ is_state,
+ remove_state,
+ set_state,
+ when,
+ when_not
+)
from charms.reactive.helpers import data_changed
+import shutil
+import os
+
+
+@when('local-monitors.available')
+def local_monitors_available(nagios):
+ """Register the ZooKeeper checks once 'local-monitors.available' is set."""
+ setup_nagios(nagios)
+
+
+@when('nrpe-external-master.available')
+def nrpe_external_master_available(nagios):
+ """Register the ZooKeeper checks once 'nrpe-external-master.available' is set."""
+ setup_nagios(nagios)
+
+
+def setup_nagios(nagios):
+    """Register one NRPE check per monitored ZooKeeper statistic.
+
+    Every check runs check_zookeeper.py in nagios mode against
+    localhost:2181 with the warning/critical thresholds listed below.
+    The nagios context and servicegroups come from the charm config.
+
+    :param nagios: relation endpoint providing add_check() and updated().
+    """
+    config = hookenv.config()
+    unit_name = hookenv.local_unit()
+    checks = [
+        {
+            # 'name' keeps its historical, truncated spelling so the
+            # registered check is not renamed. 'key' is the statistic that
+            # is actually looked up; with the truncated spelling the plugin
+            # never found it and the check could never alert.
+            'name': 'zk_open_file_descriptor_coun',
+            'key': 'zk_open_file_descriptor_count',
+            'description': 'ZK_Open_File_Descriptors_Count',
+            'warn': 500,
+            'crit': 800
+        },
+        {
+            'name': 'zk_ephemerals_count',
+            'description': 'ZK_Ephemerals_Count',
+            'warn': 10000,
+            'crit': 100000
+        },
+        {
+            'name': 'zk_avg_latency',
+            'description': 'ZK_Avg_Latency',
+            'warn': 500,
+            'crit': 1000
+        },
+        {
+            'name': 'zk_max_latency',
+            'description': 'ZK_Max_Latency',
+            'warn': 1000,
+            'crit': 2000
+        },
+        {
+            'name': 'zk_min_latency',
+            'description': 'ZK_Min_Latency',
+            'warn': 500,
+            'crit': 1000
+        },
+        {
+            'name': 'zk_outstanding_requests',
+            'description': 'ZK_Outstanding_Requests',
+            'warn': 20,
+            'crit': 50
+        },
+        {
+            'name': 'zk_watch_count',
+            'description': 'ZK_Watch_Count',
+            'warn': 100,
+            'crit': 500
+        },
+    ]
+    check_cmd = ['/usr/local/lib/nagios/plugins/check_zookeeper.py',
+                 '-o', 'nagios', '-s', 'localhost:2181']
+    for check in checks:
+        # The statistic to query defaults to the check name.
+        key = check.get('key', check['name'])
+        nagios.add_check(check_cmd + ['--key', key,
+                                      '-w', str(check['warn']),
+                                      '-c', str(check['crit'])],
+                         name=check['name'],
+                         description=check['description'],
+                         context=config["nagios_context"],
+                         servicegroups=config["nagios_servicegroups"],
+                         unit=unit_name
+                         )
+    nagios.updated()
+
+
+@hook('upgrade-charm')
+def nrpe_helper_upgrade_charm():
+ """Clear the 'zookeeper.nrpe_helper.installed' state on upgrade-charm."""
+ # Make sure the nrpe handler will get replaced at charm upgrade
+ remove_state('zookeeper.nrpe_helper.installed')
+
+
+@when('zookeeper.nrpe_helper.registered')
+@when_not('zookeeper.nrpe_helper.installed')
+def install_nrpe_helper():
+    """Copy check_zookeeper.py from the charm into the Nagios plugin dir.
+
+    The plugin directory is created when missing, the copy is made
+    executable (mode 0755) and 'zookeeper.nrpe_helper.installed' is set.
+    """
+    plugin_dir = '/usr/local/lib/nagios/plugins/'
+    if not os.path.exists(plugin_dir):
+        os.makedirs(plugin_dir)
+
+    source = '{}/files/check_zookeeper.py'.format(hookenv.charm_dir())
+    target = '{}/check_zookeeper.py'.format(plugin_dir)
+    shutil.copy(source, target)
+    os.chmod(target, 0o755)
+
+    set_state('zookeeper.nrpe_helper.installed')
@when('bigtop.available')