You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cloudstack.apache.org by ja...@apache.org on 2013/11/25 10:44:24 UTC

git commit: updated refs/heads/master to 0be4a68

Updated Branches:
  refs/heads/master ab2c38c05 -> 0be4a685e


CLOUDSTACK-5164 Unmonit for 30 minutes for a failed process


Project: http://git-wip-us.apache.org/repos/asf/cloudstack/repo
Commit: http://git-wip-us.apache.org/repos/asf/cloudstack/commit/0be4a685
Tree: http://git-wip-us.apache.org/repos/asf/cloudstack/tree/0be4a685
Diff: http://git-wip-us.apache.org/repos/asf/cloudstack/diff/0be4a685

Branch: refs/heads/master
Commit: 0be4a685e8cb0caedf670e45075f1b4e52237f5c
Parents: ab2c38c
Author: Jayapal <ja...@apache.org>
Authored: Mon Nov 25 14:58:12 2013 +0530
Committer: Jayapal <ja...@apache.org>
Committed: Mon Nov 25 15:12:48 2013 +0530

----------------------------------------------------------------------
 .../config/opt/cloud/bin/monitor_service.sh     |   2 +-
 .../debian/config/root/monitorServices.py       | 199 +++++++++++++++----
 2 files changed, 161 insertions(+), 40 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/cloudstack/blob/0be4a685/systemvm/patches/debian/config/opt/cloud/bin/monitor_service.sh
----------------------------------------------------------------------
diff --git a/systemvm/patches/debian/config/opt/cloud/bin/monitor_service.sh b/systemvm/patches/debian/config/opt/cloud/bin/monitor_service.sh
index c4d99d2..51b6923 100755
--- a/systemvm/patches/debian/config/opt/cloud/bin/monitor_service.sh
+++ b/systemvm/patches/debian/config/opt/cloud/bin/monitor_service.sh
@@ -64,7 +64,7 @@ crontab -l | grep -v  monitorServices.py | crontab -
 create_config $config
 
 #add cron job
-(crontab -l ;echo -e "SHELL=/bin/bash\nPATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin\n */1 * * * * /usr/bin/python /root/monitorServices.py") | crontab -
+(crontab -l ;echo -e "SHELL=/bin/bash\nPATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin\n */3 * * * * /usr/bin/python /root/monitorServices.py") | crontab -
 
 
 unlock_exit 0 $lock $locked

http://git-wip-us.apache.org/repos/asf/cloudstack/blob/0be4a685/systemvm/patches/debian/config/root/monitorServices.py
----------------------------------------------------------------------
diff --git a/systemvm/patches/debian/config/root/monitorServices.py b/systemvm/patches/debian/config/root/monitorServices.py
index 2cec672..4e1b7e0 100755
--- a/systemvm/patches/debian/config/root/monitorServices.py
+++ b/systemvm/patches/debian/config/root/monitorServices.py
@@ -19,14 +19,13 @@
 
 
 
-__author__ = 'jayapalreddy'
 
 from ConfigParser import SafeConfigParser
 from subprocess import *
 from os import path
 import time
+import os
 
-monitor_log='/var/log/monitor.log'
 class StatusCodes:
     SUCCESS      = 0
     FAILED       = 1
@@ -35,42 +34,58 @@ class StatusCodes:
     STOPPED      = 4
     STARTING     = 5
 
-class log:
+class Log:
     INFO = 'INFO'
     ALERT = 'ALERT'
     CRIT  = 'CRIT'
     NOTIF = 'NOTIF'
 
-
+class Config:
+    MONIT_AFTER_MINS = 30
+    SLEEP_SEC = 1
+    RETRY_ITERATIONS = 10
+    RETRY_FOR_RESTART = 5
+    MONITOR_LOG = '/var/log/monitor.log'
+    UNMONIT_PS_FILE = '/etc/unmonit_psList.txt'
 
 
 def getConfig( config_file_path = "/etc/monitor.conf" ):
+    """
+    Reads the process configuration from the config file.
+    Config file contains the processes to be monitored.
+
+    """
     process_dict = {}
     parser = SafeConfigParser()
     parser.read( config_file_path )
 
-    #print 'Read values:\n'
 
     for section in parser.sections():
-        #   print section
         process_dict[section] = {}
 
         for name, value in parser.items(section):
             process_dict[section][name] = value
-#           print '  %s = %r' % (name, value)
+#           printd (" %s = %r" % (name, value))
 
     return  process_dict
 
 def printd (msg):
+    """
+    prints the debug messages
+    """
 
+    #for debug
+    #print msg
     return 0
 
-    f= open(monitor_log,'r+')
+    f= open(Config.MONITOR_LOG,'r+')
     f.seek(0, 2)
     f.write(str(msg)+"\n")
     f.close()
 
 def raisealert(severity, msg, process_name=None):
+    """ Writes the alert message"""
+
     #timeStr=str(time.ctime())
     if process_name is not None:
         log = '['+severity +']'+" " + '['+process_name+']' + " " + msg +"\n"
@@ -82,9 +97,12 @@ def raisealert(severity, msg, process_name=None):
 
 
 def isPidMatchPidFile(pidfile, pids):
+    """ Compares the running process pids with the pid stored in the pid file.
+        If the process has multiple pids, a match on any one of them succeeds.
+    """
 
     if pids is None or isinstance(pids,list) != True or len(pids) == 0:
-        print "Invalid Arguments"
+        printd ("Invalid Arguments")
         return StatusCodes.FAILED
     if not path.isfile(pidfile):
         #It seems there is no pid file for this service
@@ -100,12 +118,18 @@ def isPidMatchPidFile(pidfile, pids):
 
 
     inp = fd.read()
+
+    if not inp:
+        fd.close()
+        return StatusCodes.FAILED
+
     printd("file content "+str(inp))
     printd(pids)
     tocheck_pid  =  inp.strip()
     for item in pids:
         if str(tocheck_pid) ==  item.strip():
             printd("pid file matched")
+            fd.close()
             return StatusCodes.SUCCESS
 
     fd.close()
@@ -114,19 +138,22 @@ def isPidMatchPidFile(pidfile, pids):
 
 
 def checkProcessStatus( process ):
+    """
+    Check the process running status, if not running tries to restart
+    """
     process_name = process.get('processname')
     service_name = process.get('servicename')
     pidfile = process.get('pidfile')
     #temp_out = None
     restartFailed=False
-    pidFileMatched=1
+    pidFileMatched=False
+    pids=''
     cmd=''
     if process_name is None:
-        print "\n Invalid Process Name"
+        printd ("\n Invalid Process Name")
         return StatusCodes.INVALID_INP
     else:
-        msg="checking the process " + process_name
-        printd(msg)
+        printd("checking the process " + process_name)
         cmd = 'pidof ' + process_name
         printd(cmd)
         #cmd = 'service ' + process_name + ' status'
@@ -136,20 +163,19 @@ def checkProcessStatus( process ):
 
     #check there is only one pid or not
     if exitStatus == 0:
+        pids = temp_out.split(' ')
         msg="pids: " +temp_out;
         printd(msg)
-        pids = temp_out.split(' ')
 
         #there is more than one process so match the pid file
-        #if not matched set pidFileMatched=0
+        #if not matched set pidFileMatched=False
         printd("Checking pid file")
         if isPidMatchPidFile(pidfile, pids) == StatusCodes.SUCCESS:
-            pidFileMatched = 1;
+            pidFileMatched = True;
         else:
-            pidFileMatched = 0;
+            pidFileMatched = False;
 
-    printd(pidFileMatched)
-    if exitStatus == 0 and pidFileMatched == 1:
+    if exitStatus == 0 and pidFileMatched == True:
         printd("The process is running ....")
         return  StatusCodes.RUNNING
     else:
@@ -157,28 +183,28 @@ def checkProcessStatus( process ):
         msg="The process " + process_name +" is not running trying recover "
         printd(msg)
         #Retry the process state for few seconds
-        for i in range(1,10):
+        for i in range(1, Config.RETRY_ITERATIONS):
             pout = Popen(cmd, shell=True, stdout=PIPE)
             exitStatus = pout.wait()
             temp_out = pout.communicate()[0]
 
-            if i < 5: # this is just for trying few more times
+            if i < Config.RETRY_FOR_RESTART: # this is just for trying few more times
                 if exitStatus == 0:
                     pids = temp_out.split(' ')
 
                     if isPidMatchPidFile(pidfile, pids) == StatusCodes.SUCCESS:
-                        pidFileMatched = 1;
+                        pidFileMatched = True;
                         printd("pid file is matched ...")
-                        raisealert(log.ALERT, "The process detected as running", process_name)
+                        raisealert(Log.ALERT, "The process detected as running", process_name)
                         break
                     else:
                         printd("pid file is not matched ...")
-                        pidFileMatched = 0;
+                        pidFileMatched = False;
+                        time.sleep(Config.SLEEP_SEC)
                         continue
-                    time.sleep(1)
             else:
                 msg="The process " +process_name+" is not running trying recover "
-                raisealert(log.INFO,process_name,msg)
+                raisealert(Log.INFO,process_name,msg)
 
                 if service_name == 'apache2':
                     # Killing apache2 process with this the main service will not start
@@ -189,7 +215,7 @@ def checkProcessStatus( process ):
 
                 cmd = 'service ' + service_name + ' restart'
 
-                time.sleep(1)
+                time.sleep(Config.SLEEP_SEC)
                 #return_val= check_call(cmd , shell=True)
 
                 cout = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT)
@@ -198,37 +224,135 @@ def checkProcessStatus( process ):
                 if return_val == 0:
                     printd("The process" + process_name +" recovered successfully ")
                     msg="The process " +process_name+" is recovered successfully "
-                    raisealert(log.INFO,msg,process_name)
+                    raisealert(Log.INFO,msg,process_name)
 
                     break;
                 else:
                     #retry restarting the process for few tries
                     printd("process restart failing trying again ....")
                     restartFailed=True
-                    time.sleep(1)
+                    time.sleep(Config.SLEEP_SEC)
                     continue
         #for end here
 
         if restartFailed == True:
             msg="The process %s recover failed "%process_name
-            raisealert(log.ALERT,process_name,msg)
+            raisealert(Log.ALERT,process_name,msg)
 
             printd("Restart failed after number of retries")
             return StatusCodes.STOPPED
 
     return  StatusCodes.RUNNING
 
-def raiseAlert( process_name ):
-    print "process name %s is raised "%process_name
 
 def monitProcess( processes_info ):
+    """
+    Monitors the processes that were read from the config file
+    """
     if len( processes_info ) == 0:
-        print "Invalid Input"
+        printd("Invalid Input")
         return  StatusCodes.INVALID_INP
+
+    dict_unmonit={}
+    umonit_update={}
+
+    if not path.isfile(Config.UNMONIT_PS_FILE):
+        printd('Unmonit File not exist')
+    else:
+        #load the dictionary with unmonit process list
+        dict_unmonit = loadPsFromUnMonitFile()
+
+    #time for noting process down time
+    csec = repr(time.time()).split('.')[0]
+
+    unMonitPs=False
+
     for process,properties in processes_info.items():
+        #skip the process if it was marked unmonitored less than Config.MONIT_AFTER_MINS minutes ago
+        printd ("checking the process %s \n" %process)
+
+        if not is_emtpy(dict_unmonit):
+            if dict_unmonit.has_key(process):
+                ts = dict_unmonit[process]
+                printd("Time difference=%s" %str(int(csec) - int(ts)))
+                tmin = (int(csec) - int(ts) )/60
+
+                if ( int(csec) - int(ts) )/60 < Config.MONIT_AFTER_MINS:
+                    raisealert(Log.ALERT, "The %s get monitor after %s minutes " %(process, Config.MONIT_AFTER_MINS))
+                    printd('process will be monitored after %s min' %(str(int(Config.MONIT_AFTER_MINS) - tmin)))
+                    unMonitPs=True
+                    continue
+
         if checkProcessStatus( properties) != StatusCodes.RUNNING:
-            print "\n Process %s is not Running"%process
+            printd( "\n Process %s is not Running"%process)
+            #add this process into unmonit list
+            printd ("updating the process for unmonit %s\n" %process)
+            umonit_update[process]=csec
+
+
+    #if dict is not empty write to file else delete it
+    if not is_emtpy(umonit_update):
+        writePsListToUnmonitFile(umonit_update)
+    else:
+        if is_emtpy(umonit_update) and unMonitPs == False:
+            #delete the file if it is there
+            if path.isfile(Config.UNMONIT_PS_FILE):
+                printd("Removing the file %s" %Config.UNMONIT_PS_FILE)
+                os.remove(Config.UNMONIT_PS_FILE)
+
+
+
+def loadPsFromUnMonitFile():
 
+    dict_unmonit = {}
+
+    try:
+        fd = open(Config.UNMONIT_PS_FILE)
+    except:
+        printd("Failed to open file %s " %(Config.UNMONIT_PS_FILE))
+        return StatusCodes.FAILED
+
+    ps = fd.read()
+
+    if not ps:
+        printd("File %s content is empty " %Config.UNMONIT_PS_FILE)
+        return StatusCodes.FAILED
+
+    printd(ps)
+    plist = ps.split(',')
+    plist.remove('')
+    for i in plist:
+        dict_unmonit[i.split(':')[0]] = i.split(':')[1]
+
+    fd.close();
+
+    return dict_unmonit;
+
+
+def writePsListToUnmonitFile(umonit_update):
+    printd("Write updated unmonit list to file")
+    line=''
+    for i in umonit_update:
+        line+=str(i)+":"+str(umonit_update[i])+','
+    printd(line)
+    try:
+        fd=open(Config.UNMONIT_PS_FILE,'w')
+    except:
+        printd("Failed to open file %s " %Config.UNMONIT_PS_FILE)
+        return StatusCodes.FAILED
+
+    fd.write(line);
+    fd.close()
+
+
+def is_emtpy(struct):
+    """
+    Checks whether the given struct is empty or not
+    """
+    if struct:
+        return False
+    else:
+        return True
 
 def main():
     '''
@@ -238,14 +362,11 @@ def main():
     printd("monitoring started")
     temp_dict  = getConfig()
 
-    '''
-    Step2: Get Previous Run Log
-    '''
 
     '''
-    Step3: Monitor and Raise Alert
+    Step2: Monitor and Raise Alert
     '''
-    #raisealert(log.INFO, 'Monit started')
+    #raisealert(Log.INFO, 'Monit started')
     monitProcess( temp_dict )