You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@accumulo.apache.org by ec...@apache.org on 2015/10/20 23:03:19 UTC

accumulo git commit: ACCUMULO-4034 re-write the agitator to include zookeeper stop/start

Repository: accumulo
Updated Branches:
  refs/heads/master 39f14472b -> 43f4464b3


ACCUMULO-4034 re-write the agitator to include zookeeper stop/start


Project: http://git-wip-us.apache.org/repos/asf/accumulo/repo
Commit: http://git-wip-us.apache.org/repos/asf/accumulo/commit/43f4464b
Tree: http://git-wip-us.apache.org/repos/asf/accumulo/tree/43f4464b
Diff: http://git-wip-us.apache.org/repos/asf/accumulo/diff/43f4464b

Branch: refs/heads/master
Commit: 43f4464b38d9c380ed882069da0e76579b0db7a4
Parents: 39f1447
Author: Eric C. Newton <er...@gmail.com>
Authored: Tue Oct 20 17:03:05 2015 -0400
Committer: Eric C. Newton <er...@gmail.com>
Committed: Tue Oct 20 17:03:05 2015 -0400

----------------------------------------------------------------------
 test/system/agitator/.gitignore           |   3 +
 test/system/agitator/README.md            |  39 +++++
 test/system/agitator/agitator.ini.example |  48 +++++
 test/system/agitator/agitator.py          | 234 +++++++++++++++++++++++++
 test/system/agitator/hosts.example        |  16 ++
 5 files changed, 340 insertions(+)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/accumulo/blob/43f4464b/test/system/agitator/.gitignore
----------------------------------------------------------------------
diff --git a/test/system/agitator/.gitignore b/test/system/agitator/.gitignore
new file mode 100644
index 0000000..3429b01
--- /dev/null
+++ b/test/system/agitator/.gitignore
@@ -0,0 +1,3 @@
+*~
+*.ini
+*.pyc

http://git-wip-us.apache.org/repos/asf/accumulo/blob/43f4464b/test/system/agitator/README.md
----------------------------------------------------------------------
diff --git a/test/system/agitator/README.md b/test/system/agitator/README.md
new file mode 100644
index 0000000..fdff65b
--- /dev/null
+++ b/test/system/agitator/README.md
@@ -0,0 +1,39 @@
+<!--
+Licensed to the Apache Software Foundation (ASF) under one or more
+contributor license agreements.  See the NOTICE file distributed with
+this work for additional information regarding copyright ownership.
+The ASF licenses this file to You under the Apache License, Version 2.0
+(the "License"); you may not use this file except in compliance with
+the License.  You may obtain a copy of the License at 
+ 
+    http://www.apache.org/licenses/LICENSE-2.0
+ 
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+-->
+
+Agitator: randomly kill processes
+===========================
+
+The agitator is used to randomly select processes for termination during
+system tests.
+
+Configure the agitator using the example agitator.ini file provided.
+
+Create a list of hosts to be agitated:
+
+	$ cp ../../../conf/slaves hosts
+	$ echo master >> hosts
+	$ echo namenode >> hosts
+
+The agitator can be used to kill and restart any part of the Accumulo
+ecosystem: zookeepers, namenode, datanodes, tablet servers and master.
+You can choose to agitate them all with "--all":
+
+	$ ./agitator.py --all --hosts=hosts --config=agitator.ini --log DEBUG
+
+You will need to be able to ssh, without passwords, to all your hosts as 
+the user that can kill and start the services.

http://git-wip-us.apache.org/repos/asf/accumulo/blob/43f4464b/test/system/agitator/agitator.ini.example
----------------------------------------------------------------------
diff --git a/test/system/agitator/agitator.ini.example b/test/system/agitator/agitator.ini.example
new file mode 100644
index 0000000..825f0ed
--- /dev/null
+++ b/test/system/agitator/agitator.ini.example
@@ -0,0 +1,48 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+[DEFAULT]
+install=%(env.pwd)s/../../../..
+user=%(env.user)s
+
+[agitator]
+kill=kill -9
+ssh=ssh -q -t -A -o StrictHostKeyChecking=no
+sleep=300
+sleep.restart=30
+sleep.jitter=30
+
+[accumulo]
+home=%(install)s/accumulo
+tserver.kill.min=1
+tserver.kill.max=1
+tserver.frequency=0.8
+master.kill.min=1
+master.kill.max=1
+master.frequency=0.1
+
+[hadoop]
+home=%(install)s/hadoop
+bin=%(home)s/bin
+datanode.frequency=0.8
+datanode.kill.min=1
+datanode.kill.max=1
+namenode.frequency=0.05
+namenode.kill.min=1
+namenode.kill.max=1
+
+[zookeeper]
+home=%(install)s/zookeeper
+frequency=0.05

http://git-wip-us.apache.org/repos/asf/accumulo/blob/43f4464b/test/system/agitator/agitator.py
----------------------------------------------------------------------
diff --git a/test/system/agitator/agitator.py b/test/system/agitator/agitator.py
new file mode 100755
index 0000000..92b0b28
--- /dev/null
+++ b/test/system/agitator/agitator.py
@@ -0,0 +1,234 @@
+#! /usr/bin/python
+
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import random
+import logging
+import ConfigParser
+
+# add the environment variables as default settings
+import os
+# every environment variable NAME becomes a default option called env.NAME
+defaults = dict(('env.' + name, value) for name, value in os.environ.iteritems())
+config = ConfigParser.ConfigParser(defaults)
+
+# things you can do to a particular kind of process
+class Proc:
+   """Base description of one kind of process the agitator can kill and restart.
+
+   Subclasses set ``program`` and override ``start`` and ``find``.  The
+   remaining methods provide defaults plus the ssh plumbing used to run
+   commands on a remote host.
+   """
+   program = 'Unknown'
+   # per-round probability that this kind of process is targeted
+   _frequencyToKill = 1.0
+
+   def start(self, host):
+      """Start the process on host; the base implementation does nothing."""
+      pass
+
+   def find(self, host):
+      """Find the process on host; the base implementation returns None."""
+      pass
+
+   def numberToKill(self):
+      """Return the (minimum, maximum) number of hosts to hit in one round."""
+      return (1, 1)
+
+   def frequencyToKill(self):
+      """Return the per-round probability of killing this kind of process."""
+      return self._frequencyToKill
+
+   def user(self):
+      """Return the 'user' option of the config section named after program."""
+      return config.get(self.program, 'user')
+
+   def kill(self, host, pid):
+      """Run the configured kill command against pid on host.
+
+      A non-zero exit status is logged as a warning and otherwise ignored.
+      """
+      kill = config.get('agitator', 'kill').split()
+      code, stdout, stderr = self.runOn(host, kill + [pid])
+      if code != 0:
+         # Log and carry on: logging.warn() returns None, so it must not be
+         # used as the operand of "raise" (that fails with a TypeError).
+         logging.warn("Unable to kill %d on %s (%s)", pid, host, stderr)
+
+   def runOn(self, host, cmd):
+      """Run cmd (a list of arguments) on host over ssh as self.user().
+
+      Returns the (returncode, stdout, stderr) tuple produced by run().
+      """
+      ssh = config.get('agitator', 'ssh').split()
+      return self.run(ssh + ["%s@%s" % (self.user(), host)] + cmd)
+
+   def run(self, cmd):
+      """Run cmd locally and return (returncode, stdout, stderr).
+
+      Every element of cmd is converted with str().  Non-empty stdout is
+      logged at DEBUG and non-empty stderr at ERROR.  A non-zero exit
+      status is logged as an error but does not raise.
+      """
+      import subprocess
+      cmd = [str(arg) for arg in cmd]
+      logging.debug('Running %s', cmd)
+      p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+      stdout, stderr = p.communicate()
+      if stdout.strip():
+         logging.debug("%s", stdout.strip())
+      if stderr.strip():
+         logging.error("%s", stderr.strip())
+      if p.returncode != 0:
+         logging.error("Problem running %s", ' '.join(cmd))
+      return p.returncode, stdout, stderr
+
+   def __repr__(self):
+      return self.program
+
+class Zookeeper(Proc):
+   """The ZooKeeper server, configured from the [zookeeper] section."""
+   program = 'zookeeper'
+
+   def __init__(self):
+      # Must be stored on self and parsed as a float: a bare local (or a
+      # string value) would leave the class default of 1.0 in effect.
+      self._frequencyToKill = config.getfloat(self.program, 'frequency')
+
+   def start(self, host):
+      """Run <home>/bin/zkServer.sh start on host."""
+      self.runOn(host, [config.get(self.program, 'home') + '/bin/zkServer.sh start'])
+
+   def find(self, host):
+      """Return the pids (as ints) that pgrep reports for QuorumPeerMain on host."""
+      code, stdout, stderr = self.runOn(host, ['pgrep -f [Q]uorumPeerMain || true'])
+      return [int(line) for line in stdout.split("\n") if line]
+
+class Hadoop(Proc):
+   """An HDFS daemon (named by program), configured from the [hadoop] section."""
+   section = 'hadoop'
+
+   def __init__(self, program):
+      self.program = program
+      # options are named <program>.frequency, <program>.kill.min, <program>.kill.max
+      prefix = program + '.'
+      self._frequencyToKill = config.getfloat(self.section, prefix + 'frequency')
+      self.minimumToKill = config.getint(self.section, prefix + 'kill.min')
+      self.maximumToKill = config.getint(self.section, prefix + 'kill.max')
+
+   def start(self, host):
+      """Launch "<bin>/hdfs <program>" on host under nohup, in the background."""
+      hdfs = config.get(self.section, 'bin') + "/hdfs"
+      self.runOn(host, ['nohup', hdfs, self.program, '&'])
+
+   def find(self, host):
+      """Return the pids (as ints) pgrep reports for this program on host."""
+      command = "pgrep -f 'proc[_]%s' || true" % (self.program,)
+      _, output, _ = self.runOn(host, [command])
+      return [int(pid) for pid in output.split("\n") if pid]
+
+   def numberToKill(self):
+      """Return the configured (kill.min, kill.max) pair."""
+      return (self.minimumToKill, self.maximumToKill)
+
+   def user(self):
+      """Return the 'user' option of this class's config section."""
+      return config.get(self.section, 'user')
+
+class Accumulo(Hadoop):
+   """An Accumulo server (named by program), configured from the [accumulo] section."""
+   section = 'accumulo'
+
+   def start(self, host):
+      """Launch "<home>/bin/accumulo <program>" on host under nohup, in the background."""
+      launcher = config.get(self.section, 'home') + '/bin/accumulo'
+      self.runOn(host, ['nohup', launcher, self.program, '&'])
+
+   def find(self, host):
+      """Return the pids (as ints) pgrep reports for this program on host."""
+      command = "pgrep -f 'app[=]%s' || true" % self.program
+      _, output, _ = self.runOn(host, [command])
+      return [int(pid) for pid in output.split("\n") if pid]
+
+def fail(msg):
+   """Log msg at CRITICAL level, then terminate with exit status 1."""
+   logging.critical(msg)
+   raise SystemExit(1)
+
+def jitter(n):
+   """Return a random value in the range [0, n) shifted down by n / 2."""
+   half = n / 2
+   return n * random.random() - half
+
+def sleep(n):
+   """Log and then pause for n seconds; do nothing unless n is positive."""
+   if not n > 0:
+      return
+   logging.info("Sleeping %.2f", n)
+   import time
+   time.sleep(n)
+
+def agitate(hosts, procs):
+   """Loop forever, randomly killing and later restarting processes.
+
+   hosts -- list of host names to search for victims
+   procs -- list of Proc instances describing what may be killed
+
+   Each round restarts whatever the previous round killed, sleeps, and
+   then, for every proc that wins its dice roll, kills one pid on each of
+   a random selection of the hosts running it.  Sleep times are read from
+   the [agitator] config section.  This function does not return.
+   """
+   starters = []
+
+   logging.info("Agitating %s on %d hosts", procs, len(hosts))
+
+   section = 'agitator'
+
+   # repeatedly...
+   while True:
+      if starters:
+         # start up services that were previously killed
+         t = max(0, config.getfloat(section, 'sleep.restart') + jitter(config.getfloat(section, 'sleep.jitter')))
+         sleep(t)
+         for host, proc in starters:
+            logging.info('Starting %s on %s', proc, host)
+            proc.start(host)
+         starters = []
+
+      # wait some time
+      t = max(0, config.getfloat(section, 'sleep') + jitter(config.getfloat(section, 'sleep.jitter')))
+      sleep(t)
+
+      # for some processes
+      for p in procs:
+
+         # roll dice: should it be killed?
+         if random.random() < p.frequencyToKill():
+
+            # find them; only hosts with at least one pid become candidates
+            # TODO: in parallel
+            candidates = {}
+            for host in hosts:
+               pids = p.find(host)
+               if pids:
+                  candidates[host] = pids
+
+            # how many?  never more than the number of hosts running it
+            minKill, maxKill = p.numberToKill()
+            count = min(random.randrange(minKill, maxKill + 1), len(candidates))
+
+            # pick the victims
+            doomedHosts = random.sample(list(candidates), count)
+
+            # kill them
+            logging.info("Killing %s on %s", p, doomedHosts)
+            for doomedHost in doomedHosts:
+               # every candidate entry holds a non-empty pid list
+               pid = random.choice(candidates[doomedHost])
+               # report the host being hit, not the last host scanned above
+               logging.info("Killing %s (%d) on %s", p, pid, doomedHost)
+               p.kill(doomedHost, pid)
+               # remember to restart them later
+               starters.append((doomedHost, p))
+
+def main():
+   """Parse the command line, load the config and host list, then agitate."""
+   import argparse
+   parser = argparse.ArgumentParser(description='Kill random processes')
+   parser.add_argument('--log', help='set the log level', default='INFO')
+   # one on/off switch per kind of process
+   switches = (
+      ('--namenodes', 'randomly kill namenodes'),
+      ('--datanodes', 'randomly kill datanodes'),
+      ('--tservers', 'randomly kill tservers'),
+      ('--masters', 'randomly kill masters'),
+      ('--zookeepers', 'randomly kill zookeepers'),
+   )
+   for option, text in switches:
+      parser.add_argument(option, help=text, action="store_true")
+   parser.add_argument('--all',
+                       help='kill any of the tservers, masters, datanodes, namenodes or zookeepers',
+                       action='store_true')
+   for option in ('--hosts', '--config'):
+      parser.add_argument(option, type=argparse.FileType('r'), required=True)
+   args = parser.parse_args()
+
+   config.readfp(args.config)
+
+   # an unrecognised level name leaves logging unconfigured
+   logLevel = getattr(logging, args.log.upper(), None)
+   if isinstance(logLevel, int):
+      logging.basicConfig(level=logLevel)
+
+   # every proc is constructed (and so reads its config) whether or not
+   # it was selected on the command line
+   choices = [
+      (args.namenodes, Hadoop('namenode')),
+      (args.datanodes, Hadoop('datanode')),
+      (args.tservers, Accumulo('tserver')),
+      (args.masters, Accumulo('master')),
+      (args.zookeepers, Zookeeper()),
+   ]
+   procs = [proc for selected, proc in choices if selected or args.all]
+   if not procs:
+      fail("No processes to agitate!\n")
+
+   # skip blank lines and lines whose first non-blank character is '#'
+   stripped = [raw.strip() for raw in args.hosts.readlines()]
+   hosts = [name for name in stripped if name and name[0] != '#']
+   if not hosts:
+      fail('No hosts to agitate!\n')
+
+   agitate(hosts, procs)
+
+if __name__ == '__main__':
+   main()

http://git-wip-us.apache.org/repos/asf/accumulo/blob/43f4464b/test/system/agitator/hosts.example
----------------------------------------------------------------------
diff --git a/test/system/agitator/hosts.example b/test/system/agitator/hosts.example
new file mode 100644
index 0000000..63fb8bb
--- /dev/null
+++ b/test/system/agitator/hosts.example
@@ -0,0 +1,16 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements.  See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License.  You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+localhost