You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@cloudstack.apache.org by ja...@apache.org on 2013/11/25 10:44:24 UTC
git commit: updated refs/heads/master to 0be4a68
Updated Branches:
refs/heads/master ab2c38c05 -> 0be4a685e
CLOUDSTACK-5164: Suspend monitoring (unmonit) of a failed process for 30 minutes
Project: http://git-wip-us.apache.org/repos/asf/cloudstack/repo
Commit: http://git-wip-us.apache.org/repos/asf/cloudstack/commit/0be4a685
Tree: http://git-wip-us.apache.org/repos/asf/cloudstack/tree/0be4a685
Diff: http://git-wip-us.apache.org/repos/asf/cloudstack/diff/0be4a685
Branch: refs/heads/master
Commit: 0be4a685e8cb0caedf670e45075f1b4e52237f5c
Parents: ab2c38c
Author: Jayapal <ja...@apache.org>
Authored: Mon Nov 25 14:58:12 2013 +0530
Committer: Jayapal <ja...@apache.org>
Committed: Mon Nov 25 15:12:48 2013 +0530
----------------------------------------------------------------------
.../config/opt/cloud/bin/monitor_service.sh | 2 +-
.../debian/config/root/monitorServices.py | 199 +++++++++++++++----
2 files changed, 161 insertions(+), 40 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/cloudstack/blob/0be4a685/systemvm/patches/debian/config/opt/cloud/bin/monitor_service.sh
----------------------------------------------------------------------
diff --git a/systemvm/patches/debian/config/opt/cloud/bin/monitor_service.sh b/systemvm/patches/debian/config/opt/cloud/bin/monitor_service.sh
index c4d99d2..51b6923 100755
--- a/systemvm/patches/debian/config/opt/cloud/bin/monitor_service.sh
+++ b/systemvm/patches/debian/config/opt/cloud/bin/monitor_service.sh
@@ -64,7 +64,7 @@ crontab -l | grep -v monitorServices.py | crontab -
create_config $config
#add cron job
-(crontab -l ;echo -e "SHELL=/bin/bash\nPATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin\n */1 * * * * /usr/bin/python /root/monitorServices.py") | crontab -
+(crontab -l ;echo -e "SHELL=/bin/bash\nPATH=/usr/local/sbin:/usr/local/bin:/sbin:/bin:/usr/sbin:/usr/bin\n */3 * * * * /usr/bin/python /root/monitorServices.py") | crontab -
unlock_exit 0 $lock $locked
http://git-wip-us.apache.org/repos/asf/cloudstack/blob/0be4a685/systemvm/patches/debian/config/root/monitorServices.py
----------------------------------------------------------------------
diff --git a/systemvm/patches/debian/config/root/monitorServices.py b/systemvm/patches/debian/config/root/monitorServices.py
index 2cec672..4e1b7e0 100755
--- a/systemvm/patches/debian/config/root/monitorServices.py
+++ b/systemvm/patches/debian/config/root/monitorServices.py
@@ -19,14 +19,13 @@
-__author__ = 'jayapalreddy'
from ConfigParser import SafeConfigParser
from subprocess import *
from os import path
import time
+import os
-monitor_log='/var/log/monitor.log'
class StatusCodes:
SUCCESS = 0
FAILED = 1
@@ -35,42 +34,58 @@ class StatusCodes:
STOPPED = 4
STARTING = 5
-class log:
+class Log:
INFO = 'INFO'
ALERT = 'ALERT'
CRIT = 'CRIT'
NOTIF = 'NOTIF'
-
+class Config:
+ MONIT_AFTER_MINS = 30
+ SLEEP_SEC = 1
+ RETRY_ITERATIONS = 10
+ RETRY_FOR_RESTART = 5
+ MONITOR_LOG = '/var/log/monitor.log'
+ UNMONIT_PS_FILE = '/etc/unmonit_psList.txt'
def getConfig( config_file_path = "/etc/monitor.conf" ):
+ """
+ Reads the process configuration from the config file.
+ Config file contains the processes to be monitored.
+
+ """
process_dict = {}
parser = SafeConfigParser()
parser.read( config_file_path )
- #print 'Read values:\n'
for section in parser.sections():
- # print section
process_dict[section] = {}
for name, value in parser.items(section):
process_dict[section][name] = value
-# print ' %s = %r' % (name, value)
+# printd (" %s = %r" % (name, value))
return process_dict
def printd (msg):
+ """
+ prints the debug messages
+ """
+ #for debug
+ #print msg
return 0
- f= open(monitor_log,'r+')
+ f= open(Config.MONITOR_LOG,'r+')
f.seek(0, 2)
f.write(str(msg)+"\n")
f.close()
def raisealert(severity, msg, process_name=None):
+ """ Writes the alert message"""
+
#timeStr=str(time.ctime())
if process_name is not None:
log = '['+severity +']'+" " + '['+process_name+']' + " " + msg +"\n"
@@ -82,9 +97,12 @@ def raisealert(severity, msg, process_name=None):
def isPidMatchPidFile(pidfile, pids):
+ """ Compares the running process pid with the pid in pid file.
+ If a process with multiple pids then it matches with pid file
+ """
if pids is None or isinstance(pids,list) != True or len(pids) == 0:
- print "Invalid Arguments"
+ printd ("Invalid Arguments")
return StatusCodes.FAILED
if not path.isfile(pidfile):
#It seems there is no pid file for this service
@@ -100,12 +118,18 @@ def isPidMatchPidFile(pidfile, pids):
inp = fd.read()
+
+ if not inp:
+ fd.close()
+ return StatusCodes.FAILED
+
printd("file content "+str(inp))
printd(pids)
tocheck_pid = inp.strip()
for item in pids:
if str(tocheck_pid) == item.strip():
printd("pid file matched")
+ fd.close()
return StatusCodes.SUCCESS
fd.close()
@@ -114,19 +138,22 @@ def isPidMatchPidFile(pidfile, pids):
def checkProcessStatus( process ):
+ """
+ Check the process running status, if not running tries to restart
+ """
process_name = process.get('processname')
service_name = process.get('servicename')
pidfile = process.get('pidfile')
#temp_out = None
restartFailed=False
- pidFileMatched=1
+ pidFileMatched=False
+ pids=''
cmd=''
if process_name is None:
- print "\n Invalid Process Name"
+ printd ("\n Invalid Process Name")
return StatusCodes.INVALID_INP
else:
- msg="checking the process " + process_name
- printd(msg)
+ printd("checking the process " + process_name)
cmd = 'pidof ' + process_name
printd(cmd)
#cmd = 'service ' + process_name + ' status'
@@ -136,20 +163,19 @@ def checkProcessStatus( process ):
#check there is only one pid or not
if exitStatus == 0:
+ pids = temp_out.split(' ')
msg="pids: " +temp_out;
printd(msg)
- pids = temp_out.split(' ')
#there is more than one process so match the pid file
- #if not matched set pidFileMatched=0
+ #if not matched set pidFileMatched=False
printd("Checking pid file")
if isPidMatchPidFile(pidfile, pids) == StatusCodes.SUCCESS:
- pidFileMatched = 1;
+ pidFileMatched = True;
else:
- pidFileMatched = 0;
+ pidFileMatched = False;
- printd(pidFileMatched)
- if exitStatus == 0 and pidFileMatched == 1:
+ if exitStatus == 0 and pidFileMatched == True:
printd("The process is running ....")
return StatusCodes.RUNNING
else:
@@ -157,28 +183,28 @@ def checkProcessStatus( process ):
msg="The process " + process_name +" is not running trying recover "
printd(msg)
#Retry the process state for few seconds
- for i in range(1,10):
+ for i in range(1, Config.RETRY_ITERATIONS):
pout = Popen(cmd, shell=True, stdout=PIPE)
exitStatus = pout.wait()
temp_out = pout.communicate()[0]
- if i < 5: # this is just for trying few more times
+ if i < Config.RETRY_FOR_RESTART: # this is just for trying few more times
if exitStatus == 0:
pids = temp_out.split(' ')
if isPidMatchPidFile(pidfile, pids) == StatusCodes.SUCCESS:
- pidFileMatched = 1;
+ pidFileMatched = True;
printd("pid file is matched ...")
- raisealert(log.ALERT, "The process detected as running", process_name)
+ raisealert(Log.ALERT, "The process detected as running", process_name)
break
else:
printd("pid file is not matched ...")
- pidFileMatched = 0;
+ pidFileMatched = False;
+ time.sleep(Config.SLEEP_SEC)
continue
- time.sleep(1)
else:
msg="The process " +process_name+" is not running trying recover "
- raisealert(log.INFO,process_name,msg)
+ raisealert(Log.INFO,process_name,msg)
if service_name == 'apache2':
# Killing apache2 process with this the main service will not start
@@ -189,7 +215,7 @@ def checkProcessStatus( process ):
cmd = 'service ' + service_name + ' restart'
- time.sleep(1)
+ time.sleep(Config.SLEEP_SEC)
#return_val= check_call(cmd , shell=True)
cout = Popen(cmd, shell=True, stdout=PIPE, stderr=STDOUT)
@@ -198,37 +224,135 @@ def checkProcessStatus( process ):
if return_val == 0:
printd("The process" + process_name +" recovered successfully ")
msg="The process " +process_name+" is recovered successfully "
- raisealert(log.INFO,msg,process_name)
+ raisealert(Log.INFO,msg,process_name)
break;
else:
#retry restarting the process for few tries
printd("process restart failing trying again ....")
restartFailed=True
- time.sleep(1)
+ time.sleep(Config.SLEEP_SEC)
continue
#for end here
if restartFailed == True:
msg="The process %s recover failed "%process_name
- raisealert(log.ALERT,process_name,msg)
+ raisealert(Log.ALERT,process_name,msg)
printd("Restart failed after number of retries")
return StatusCodes.STOPPED
return StatusCodes.RUNNING
-def raiseAlert( process_name ):
- print "process name %s is raised "%process_name
def monitProcess( processes_info ):
+ """
+ Monitors the processes which got from the config file
+ """
if len( processes_info ) == 0:
- print "Invalid Input"
+ printd("Invalid Input")
return StatusCodes.INVALID_INP
+
+ dict_unmonit={}
+ umonit_update={}
+
+ if not path.isfile(Config.UNMONIT_PS_FILE):
+ printd('Unmonit File not exist')
+ else:
+ #load the dictionary with unmonit process list
+ dict_unmonit = loadPsFromUnMonitFile()
+
+ #time for noting process down time
+ csec = repr(time.time()).split('.')[0]
+
+ unMonitPs=False
+
for process,properties in processes_info.items():
+ #skip the process it its time stamp less than Config.MONIT_AFTER_MINS
+ printd ("checking the process %s \n" %process)
+
+ if not is_emtpy(dict_unmonit):
+ if dict_unmonit.has_key(process):
+ ts = dict_unmonit[process]
+ printd("Time difference=%s" %str(int(csec) - int(ts)))
+ tmin = (int(csec) - int(ts) )/60
+
+ if ( int(csec) - int(ts) )/60 < Config.MONIT_AFTER_MINS:
+ raisealert(Log.ALERT, "The %s get monitor after %s minutes " %(process, Config.MONIT_AFTER_MINS))
+ printd('process will be monitored after %s min' %(str(int(Config.MONIT_AFTER_MINS) - tmin)))
+ unMonitPs=True
+ continue
+
if checkProcessStatus( properties) != StatusCodes.RUNNING:
- print "\n Process %s is not Running"%process
+ printd( "\n Process %s is not Running"%process)
+ #add this process into unmonit list
+ printd ("updating the process for unmonit %s\n" %process)
+ umonit_update[process]=csec
+
+
+ #if dict is not empty write to file else delete it
+ if not is_emtpy(umonit_update):
+ writePsListToUnmonitFile(umonit_update)
+ else:
+ if is_emtpy(umonit_update) and unMonitPs == False:
+ #delete file it is there
+ if path.isfile(Config.UNMONIT_PS_FILE):
+ printd("Removing the file %s" %Config.UNMONIT_PS_FILE)
+ os.remove(Config.UNMONIT_PS_FILE)
+
+
+
+def loadPsFromUnMonitFile():
+ dict_unmonit = {}
+
+ try:
+ fd = open(Config.UNMONIT_PS_FILE)
+ except:
+ printd("Failed to open file %s " %(Config.UNMONIT_PS_FILE))
+ return StatusCodes.FAILED
+
+ ps = fd.read()
+
+ if not ps:
+ printd("File %s content is empty " %Config.UNMONIT_PS_FILE)
+ return StatusCodes.FAILED
+
+ printd(ps)
+ plist = ps.split(',')
+ plist.remove('')
+ for i in plist:
+ dict_unmonit[i.split(':')[0]] = i.split(':')[1]
+
+ fd.close();
+
+ return dict_unmonit;
+
+
+def writePsListToUnmonitFile(umonit_update):
+ printd("Write updated unmonit list to file")
+ line=''
+ for i in umonit_update:
+ line+=str(i)+":"+str(umonit_update[i])+','
+ printd(line)
+ try:
+ fd=open(Config.UNMONIT_PS_FILE,'w')
+ except:
+ printd("Failed to open file %s " %Config.UNMONIT_PS_FILE)
+ return StatusCodes.FAILED
+
+ fd.write(line);
+ fd.close()
+
+
+def is_emtpy(struct):
+ """
+ Checks wether the given struct is empty or not
+ """
+ if struct:
+ return False
+ else:
+ return True
def main():
'''
@@ -238,14 +362,11 @@ def main():
printd("monitoring started")
temp_dict = getConfig()
- '''
- Step2: Get Previous Run Log
- '''
'''
- Step3: Monitor and Raise Alert
+ Step2: Monitor and Raise Alert
'''
- #raisealert(log.INFO, 'Monit started')
+ #raisealert(Log.INFO, 'Monit started')
monitProcess( temp_dict )