You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@slider.apache.org by sm...@apache.org on 2014/10/12 02:57:59 UTC

[11/50] git commit: SLIDER-341. Add a window based failure count for auto-start to limit indefinite attempt

SLIDER-341. Add a window based failure count for auto-start to limit indefinite attempt


Project: http://git-wip-us.apache.org/repos/asf/incubator-slider/repo
Commit: http://git-wip-us.apache.org/repos/asf/incubator-slider/commit/961e1704
Tree: http://git-wip-us.apache.org/repos/asf/incubator-slider/tree/961e1704
Diff: http://git-wip-us.apache.org/repos/asf/incubator-slider/diff/961e1704

Branch: refs/heads/feature/SLIDER-481_allow_dedicated_handling_of_exports
Commit: 961e17043f589c4096dce896d64a582dd000dd4e
Parents: d8b36ca
Author: Sumit Mohanty <sm...@hortonworks.com>
Authored: Mon Oct 6 20:47:44 2014 -0700
Committer: Sumit Mohanty <sm...@hortonworks.com>
Committed: Mon Oct 6 20:47:44 2014 -0700

----------------------------------------------------------------------
 slider-agent/conf/agent.ini                     |  1 +
 .../src/main/python/agent/AgentConfig.py        | 14 +++++
 .../src/main/python/agent/Controller.py         | 35 ++++++++++-
 .../src/test/python/agent/TestController.py     | 63 ++++++++++++++++++++
 slider-agent/src/test/python/agent/TestMain.py  | 37 ++++++++++++
 5 files changed, 148 insertions(+), 2 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/961e1704/slider-agent/conf/agent.ini
----------------------------------------------------------------------
diff --git a/slider-agent/conf/agent.ini b/slider-agent/conf/agent.ini
index 7b9d57d..48113e3 100644
--- a/slider-agent/conf/agent.ini
+++ b/slider-agent/conf/agent.ini
@@ -43,6 +43,7 @@ log_level=INFO
 [command]
 max_retries=2
 sleep_between_retries=1
+auto_restart=5,5
 
 [security]
 

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/961e1704/slider-agent/src/main/python/agent/AgentConfig.py
----------------------------------------------------------------------
diff --git a/slider-agent/src/main/python/agent/AgentConfig.py b/slider-agent/src/main/python/agent/AgentConfig.py
index e45ba23..86925b1 100644
--- a/slider-agent/src/main/python/agent/AgentConfig.py
+++ b/slider-agent/src/main/python/agent/AgentConfig.py
@@ -61,6 +61,7 @@ log_level=INFO
 [command]
 max_retries=2
 sleep_between_retries=1
+auto_restart=5,5
 
 [security]
 keysdir=security/keys
@@ -109,6 +110,8 @@ class AgentConfig:
   # agent version file
   VERSION_FILE = "version_file"
 
+  AUTO_RESTART = "auto_restart"
+
   FOLDER_MAPPING = {
     APP_PACKAGE_DIR: "WORK",
     APP_INSTALL_DIR: "WORK",
@@ -164,6 +167,17 @@ class AgentConfig:
       return ""
     return command
 
+  # return max, window - max failures within window minutes
+  def getErrorWindow(self):
+    window = config.get(AgentConfig.COMMAND_SECTION, AgentConfig.AUTO_RESTART)
+    if window != None:
+      parts = window.split(',')
+      if len(parts) == 2:
+        if parts[0].isdigit() and parts[1].isdigit():
+          return (int(parts[0]), int(parts[1]))
+      pass
+    return (0, 0)
+
   def set(self, category, name, value):
     global config
     return config.set(category, name, value)

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/961e1704/slider-agent/src/main/python/agent/Controller.py
----------------------------------------------------------------------
diff --git a/slider-agent/src/main/python/agent/Controller.py b/slider-agent/src/main/python/agent/Controller.py
index 11db21c..77f932c 100644
--- a/slider-agent/src/main/python/agent/Controller.py
+++ b/slider-agent/src/main/python/agent/Controller.py
@@ -27,6 +27,7 @@ import time
 import threading
 import urllib2
 import pprint
+import math
 from random import randint
 
 from AgentConfig import AgentConfig
@@ -86,7 +87,8 @@ class Controller(threading.Thread):
     self.statusCommand = None
     self.failureCount = 0
     self.heartBeatRetryCount = 0
-    self.autoRestart = False
+    self.autoRestartFailures = 0
+    self.autoRestartTrackingSince = 0
 
 
   def __del__(self):
@@ -275,7 +277,7 @@ class Controller(threading.Thread):
           stored_command = self.actionQueue.customServiceOrchestrator.stored_command
           if len(stored_command) > 0:
             auto_start_command = self.create_start_command(stored_command)
-            if auto_start_command:
+            if auto_start_command and self.shouldAutoRestart():
               logger.info("Automatically adding a start command.")
               logger.debug("Auto start command: " + pprint.pformat(auto_start_command))
               self.updateStateBasedOnCommand([auto_start_command], False)
@@ -486,6 +488,35 @@ class Controller(threading.Thread):
             return {'exitstatus': 1, 'log': err_msg}
 
 
+  # Basic window that only counts failures till the window duration expires
+  def shouldAutoRestart(self):
+    max, window = self.config.getErrorWindow()
+    if max <= 0 or window <= 0:
+      return True
+
+    seconds_now = time.time()
+    if self.autoRestartTrackingSince == 0:
+      self.autoRestartTrackingSince = seconds_now
+      self.autoRestartFailures = 1
+      return True
+
+    self.autoRestartFailures += 1
+    minutes = math.floor((seconds_now - self.autoRestartTrackingSince) / 60)
+    if self.autoRestartFailures > max:
+      logger.info("Auto restart not allowed due to " + str(self.autoRestartFailures) + " failures in " + str(minutes) +
+                  " minutes. Max restarts allowed is " + str(max) + " in " + str(window) + " minutes.")
+      return False
+
+    if minutes > window:
+      logger.info("Resetting window as number of minutes passed is " + str(minutes))
+      self.autoRestartTrackingSince = seconds_now
+      self.autoRestartFailures = 1
+      return True
+    return True
+
+    pass
+
+
 def main(argv=None):
   # Allow Ctrl-C
   signal.signal(signal.SIGINT, signal.SIG_DFL)

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/961e1704/slider-agent/src/test/python/agent/TestController.py
----------------------------------------------------------------------
diff --git a/slider-agent/src/test/python/agent/TestController.py b/slider-agent/src/test/python/agent/TestController.py
index 401d69a..02b0d0e 100644
--- a/slider-agent/src/test/python/agent/TestController.py
+++ b/slider-agent/src/test/python/agent/TestController.py
@@ -25,6 +25,7 @@ import unittest, threading
 from agent import Controller, ActionQueue
 from agent import hostname
 import sys
+import time
 from Controller import AGENT_AUTO_RESTART_EXIT_CODE
 from Controller import State
 from AgentConfig import AgentConfig
@@ -255,6 +256,68 @@ class TestController(unittest.TestCase):
     self.assertTrue(os_exit_mock.call_args[0][0] == AGENT_AUTO_RESTART_EXIT_CODE)
 
 
+  @patch("time.time")
+  def test_failure_window(self, mock_time):
+    config = AgentConfig("", "")
+    original_config = config.get(AgentConfig.COMMAND_SECTION, AgentConfig.AUTO_RESTART)
+    config.set(AgentConfig.COMMAND_SECTION, AgentConfig.AUTO_RESTART, '2,1')
+    ## The behavior of side_effect is different when you run tests in command line and when you do it through IDE
+    ## So few extra items are there in the list
+    mock_time.side_effect = [200, 500, 500]
+    controller5 = Controller.Controller(config)
+
+    try:
+      self.assertTrue(controller5.shouldAutoRestart())
+      self.assertTrue(controller5.shouldAutoRestart())
+    finally:
+      config.set(AgentConfig.COMMAND_SECTION, AgentConfig.AUTO_RESTART, original_config)
+
+
+  @patch("time.time")
+  def test_failure_window(self, mock_time):
+    config = AgentConfig("", "")
+    original_config = config.get(AgentConfig.COMMAND_SECTION, AgentConfig.AUTO_RESTART)
+    config.set(AgentConfig.COMMAND_SECTION, AgentConfig.AUTO_RESTART, '3,1')
+    ## The behavior of side_effect is different when you run tests in command line and when you do it through IDE
+    ## So few extra items are there in the list
+    mock_time.side_effect = [200, 210, 220, 230, 240, 250]
+    controller5 = Controller.Controller(config)
+
+    try:
+      self.assertTrue(controller5.shouldAutoRestart())
+      self.assertTrue(controller5.shouldAutoRestart())
+      self.assertTrue(controller5.shouldAutoRestart())
+      self.assertFalse(controller5.shouldAutoRestart())
+    finally:
+      config.set(AgentConfig.COMMAND_SECTION, AgentConfig.AUTO_RESTART, original_config)
+
+
+  def test_failure_window2(self):
+    config = MagicMock()
+    config.getErrorWindow.return_value = (0, 0)
+    controller = Controller.Controller(config)
+
+    self.assertTrue(controller.shouldAutoRestart())
+
+    config.getErrorWindow.return_value = (0, 1)
+    self.assertTrue(controller.shouldAutoRestart())
+
+    config.getErrorWindow.return_value = (1, 0)
+    self.assertTrue(controller.shouldAutoRestart())
+
+    config.getErrorWindow.return_value = (-1, -1)
+    self.assertTrue(controller.shouldAutoRestart())
+
+    config.getErrorWindow.return_value = (1, 1)
+    self.assertTrue(controller.shouldAutoRestart())
+
+    #second failure within a minute
+    self.assertFalse(controller.shouldAutoRestart())
+
+    #do not reset unless window expires
+    self.assertFalse(controller.shouldAutoRestart())
+
+
   @patch("urllib2.urlopen")
   def test_sendRequest(self, requestMock):
 

http://git-wip-us.apache.org/repos/asf/incubator-slider/blob/961e1704/slider-agent/src/test/python/agent/TestMain.py
----------------------------------------------------------------------
diff --git a/slider-agent/src/test/python/agent/TestMain.py b/slider-agent/src/test/python/agent/TestMain.py
index e73a05a..7c0036b 100644
--- a/slider-agent/src/test/python/agent/TestMain.py
+++ b/slider-agent/src/test/python/agent/TestMain.py
@@ -312,6 +312,43 @@ class TestMain(unittest.TestCase):
       AgentConfig_set_mock.assert_any_call("server", "zk_reg_path", "/registry/org-apache-slider/cl1")
 
 
+  def test_config1(self):
+    config = AgentConfig("", "")
+    (max, window) = config.getErrorWindow()
+    self.assertEqual(max, 5)
+    self.assertEqual(window, 5)
+
+    config.set(AgentConfig.COMMAND_SECTION, AgentConfig.AUTO_RESTART, '')
+    (max, window) = config.getErrorWindow()
+    self.assertEqual(max, 0)
+    self.assertEqual(window, 0)
+
+    config.set(AgentConfig.COMMAND_SECTION, AgentConfig.AUTO_RESTART, '33')
+    (max, window) = config.getErrorWindow()
+    self.assertEqual(max, 0)
+    self.assertEqual(window, 0)
+
+    config.set(AgentConfig.COMMAND_SECTION, AgentConfig.AUTO_RESTART, '-4,-6')
+    (max, window) = config.getErrorWindow()
+    self.assertEqual(max, 0)
+    self.assertEqual(window, 0)
+
+    config.set(AgentConfig.COMMAND_SECTION, AgentConfig.AUTO_RESTART, 'wd,er')
+    (max, window) = config.getErrorWindow()
+    self.assertEqual(max, 0)
+    self.assertEqual(window, 0)
+
+    config.set(AgentConfig.COMMAND_SECTION, AgentConfig.AUTO_RESTART, '2,20')
+    (max, window) = config.getErrorWindow()
+    self.assertEqual(max, 2)
+    self.assertEqual(window, 20)
+
+    config.set(AgentConfig.COMMAND_SECTION, AgentConfig.AUTO_RESTART, ' 2, 30')
+    (max, window) = config.getErrorWindow()
+    self.assertEqual(max, 0)
+    self.assertEqual(window, 0)
+
+
 if __name__ == "__main__":
   logging.basicConfig(format='%(asctime)s %(message)s', level=logging.DEBUG)
   unittest.main()
\ No newline at end of file