You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by de...@apache.org on 2016/11/10 13:35:33 UTC
svn commit: r1769115 -
/uima/uima-ducc/trunk/src/main/admin/tools/ducc_watcher
Author: degenaro
Date: Thu Nov 10 13:35:32 2016
New Revision: 1769115
URL: http://svn.apache.org/viewvc?rev=1769115&view=rev
Log:
UIMA-5053 DUCC ducc_watcher optional admin script to determine status and send notifications
- report down whenever something is newly down, but report should contain all down things
- only report up when all things are up
- honor --agents flag to determine if agents participate in the reporting decision
Modified:
uima/uima-ducc/trunk/src/main/admin/tools/ducc_watcher
Modified: uima/uima-ducc/trunk/src/main/admin/tools/ducc_watcher
URL: http://svn.apache.org/viewvc/uima/uima-ducc/trunk/src/main/admin/tools/ducc_watcher?rev=1769115&r1=1769114&r2=1769115&view=diff
==============================================================================
--- uima/uima-ducc/trunk/src/main/admin/tools/ducc_watcher (original)
+++ uima/uima-ducc/trunk/src/main/admin/tools/ducc_watcher Thu Nov 10 13:35:32 2016
@@ -106,6 +106,9 @@ class DuccHtmlParser(HTMLParser):
name = 'ducc_watcher'
+webserver = 'Webserver'
+head_daemons = [ 'Orchestrator', 'ResourceManager', 'Database', 'Broker', 'ProcessManager', 'ServiceManager', webserver ]
+
flag_info = True
flag_trace = False
logger = None
@@ -261,17 +264,19 @@ def validate_target(options):
global port
global target
global ducc_url
+ protocol = 'http://'
+ servlet = '/ducc-servlet/classic-system-daemons-data'
if(options.target == None):
error('required "target" not specified')
exit(1)
target = options.target
if(':' not in target):
target = target+':'+str(port)
- if(target.startswith('http://')):
- target = target.replace('http://','',1)
- ducc_url = 'http://'+target+'/ducc-servlet/classic-system-daemons-data'
+ if(target.startswith(protocol)):
+ target = target.replace(protocol,'',1)
+ ducc_url = protocol+target+servlet
debug('target: '+ducc_url)
-
+
# list of e-mail recipients, if any
def validate_email_list(options):
global email_list
@@ -312,6 +317,43 @@ def parse_cmdline():
# -a
validate_agents(options)
+# determine if named daemon is one of the head node ones
+def is_head(key):
+    """Tell whether the named daemon is one of the head node daemons.
+
+    key -- daemon name, as used for the keys of the state dictionaries
+
+    Returns True when key is listed in head_daemons, otherwise False.
+    """
+    global head_daemons
+    return key in head_daemons
+
+# get rid of noise. remove if
+# 1. state is unknown
+# 2. if is agent and agents are not wanted
+def filter(state_dict):
+    """Return a new dictionary holding only the entries worth reporting.
+
+    An entry is left out when its state is 'unknown', or when it is not a
+    head node daemon and flag_agents is not set.
+
+    state_dict -- dictionary of daemon name to state; it is not modified
+
+    NOTE: this name shadows the builtin filter() within this script.
+    """
+    global flag_agents
+    kept = {}
+    for daemon, state in state_dict.items():
+        if(state == 'unknown'):
+            continue
+        # head node daemons always count; anything else only when wanted
+        if(is_head(daemon) or flag_agents):
+            kept[daemon] = state
+    return kept
+
+# summarize state of all ducc daemons
+def summarize(state_dict):
+    """Collapse the per-daemon states into a single 'up' or 'down'.
+
+    state_dict -- dictionary of daemon name to state
+
+    Returns 'down' when state_dict has fewer entries than there are head
+    daemons, or when any entry has a state other than 'up'; else 'up'.
+    """
+    global head_daemons
+    if(len(state_dict) < len(head_daemons)):
+        return 'down'
+    for state in state_dict.values():
+        if(state != 'up'):
+            return 'down'
+    return 'up'
+
# read previous daemons state
def read_state_previous():
global state_dict_previous
@@ -322,6 +364,8 @@ def read_state_previous():
s = f.read()
state_dict_previous = ast.literal_eval(s)
debug('state_previous(read): '+str(state_dict_previous))
+ state_dict_previous = filter(state_dict_previous)
+ debug('state_previous(filter): '+str(state_dict_previous))
except Exception,e:
error('unable to read state from '+state_file)
exception(e)
@@ -352,6 +396,7 @@ def fetch_state_current():
global flag_agents
global state_dict_current
global ducc_url
+ global webserver
state_dict_current = {}
try:
import urllib2
@@ -370,9 +415,12 @@ def fetch_state_current():
status = daemons[daemon]
trace(daemon+':'+' '+status+' ')
state_dict_current[daemon] = status
+ debug('state_current(read): '+str(state_dict_current))
+ state_dict_current = filter(state_dict_current)
+ debug('state_current(filter): '+str(state_dict_current))
except Exception,e:
# for WS status to down whenever contact fails
- daemon = 'Webserver'
+ daemon = webserver
status = 'unreachable'
state_dict_current[daemon] = status
error('unable to fetch data from '+ducc_url)
@@ -383,21 +431,23 @@ def fetch_state_current():
def determine_state_changes():
global state_dict_current
global state_dict_previous
- global state_dict_changes
- state_dict_changes = {}
+ global state_dict_not_up
+ global state_changes_count
+ state_changes_count = 0
+ state_dict_not_up = {}
for key in state_dict_current:
state_current = state_dict_current.get(key, '?')
- state_previous = state_dict_previous.get(key, '?')
- if(state_current == state_previous):
+ if(state_current == 'up'):
pass
else:
- info(key+' '+'from'+' '+state_previous+' '+'to'+' '+state_current)
- if(state_current == 'up'):
+ state_dict_not_up[key] = state_current
+ state_previous = state_dict_previous.get(key, '?')
+ if(state_current == state_previous):
pass
else:
- state_dict_changes[key] = state_current
- info(key+' '+state_current)
-
+ state_changes_count = state_changes_count + 1
+ info(key+' '+'from'+' '+state_previous+' '+'to'+' '+state_current)
+
# send email
def email(text):
global name
@@ -423,12 +473,71 @@ def email(text):
exception(e)
return
+
+# check if all head node daemons are reported
+def is_all_head_daemons():
+    """Check whether every head node daemon appears in the current state.
+
+    Reads the globals state_dict_current and head_daemons.  Stops at the
+    first head daemon missing from state_dict_current and logs it at
+    info level.
+
+    Returns True when all head daemons are present, otherwise False.
+    """
+    global state_dict_current
+    global head_daemons
+    debug('states: '+str(state_dict_current))
+    debug('daemons: '+str(head_daemons))
+    # start from True so the result is defined even for an empty list and
+    # can only be lowered, never reset, once a daemon is found missing
+    retVal = True
+    for daemon in head_daemons:
+        if(daemon in state_dict_current):
+            debug(daemon+' reporting')
+        else:
+            info(daemon+' not reporting')
+            retVal = False
+            break
+    debug('all head daemons: '+str(retVal))
+    return retVal
+
+# check if only webserver is reported
+def is_only_webserver():
+    """Check whether the webserver is the sole entry in the current state.
+
+    Reads the globals state_dict_current and webserver.
+
+    Returns True when state_dict_current holds exactly one entry and that
+    entry is the webserver, otherwise False.
+    """
+    global state_dict_current
+    global webserver
+    retVal = len(state_dict_current) == 1 and webserver in state_dict_current
+    if(retVal):
+        debug(webserver+' only reporting')
+    debug('webserver only: '+str(retVal))
+    return retVal
+
+# not reportable when ducc boot is in progress
+def is_reportable():
+    """Decide whether the current state is complete enough to report.
+
+    Returns True when only the webserver is present in the current state,
+    or, failing that, when all head node daemons are present; otherwise
+    False.  The second check is made only if the first one fails.
+    """
+    return is_only_webserver() or is_all_head_daemons()
+
# e-mail state changes, if any
def email_state_changes():
- global state_dict_changes
- if(len(state_dict_changes) > 0):
- email(str(state_dict_changes))
-
+ global state_dict_current
+ global state_dict_previous
+ global state_dict_not_up
+ global state_changes_count
+ if(is_reportable()):
+ if(state_changes_count > 0):
+ info('state_changes(count): '+str(state_changes_count))
+ email(str(state_dict_not_up))
+ else:
+ debug('state_changes(count): '+str(state_changes_count))
+ sum_cur = summarize(state_dict_current)
+ sum_prv = summarize(state_dict_previous)
+ if(sum_cur == sum_prv):
+ debug('state_current(summary): '+str(sum_cur))
+ debug('state_previous(summary): '+str(sum_prv))
+ else:
+ info('state_current(summary): '+str(sum_cur))
+ info('state_previous(summary): '+str(sum_prv))
+ if(sum_cur == 'up'):
+ email('All daemons up')
+
# check for newly down DUCC daemons
def main(argv):
global logger
@@ -438,9 +547,9 @@ def main(argv):
parse_cmdline()
read_state_previous()
fetch_state_current()
- determine_state_changes()
- update_state_previous()
+ determine_state_changes()
email_state_changes()
+ update_state_previous()
-if __name__ == "__main__":
+if __name__ == '__main__':
main(sys.argv[1:])