You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mesos.apache.org by be...@apache.org on 2011/06/05 06:57:55 UTC
svn commit: r1131689 - in /incubator/mesos/trunk/frameworks/torque:
test_date_sleep_date_3node.qsub torquesched.py
Author: benh
Date: Sun Jun 5 04:57:55 2011
New Revision: 1131689
URL: http://svn.apache.org/viewvc?rev=1131689&view=rev
Log:
Removed ugly torque hack, added sleep in hopes to eliminate non deterministic framework failure at startup because pbs_server doesn't get started completely.
Modified:
incubator/mesos/trunk/frameworks/torque/test_date_sleep_date_3node.qsub
incubator/mesos/trunk/frameworks/torque/torquesched.py
Modified: incubator/mesos/trunk/frameworks/torque/test_date_sleep_date_3node.qsub
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/frameworks/torque/test_date_sleep_date_3node.qsub?rev=1131689&r1=1131688&r2=1131689&view=diff
==============================================================================
--- incubator/mesos/trunk/frameworks/torque/test_date_sleep_date_3node.qsub (original)
+++ incubator/mesos/trunk/frameworks/torque/test_date_sleep_date_3node.qsub Sun Jun 5 04:57:55 2011
@@ -11,7 +11,7 @@
date
#wait 120 seconds
-sleep 10
+sleep 120
#print the time and date again
date
Modified: incubator/mesos/trunk/frameworks/torque/torquesched.py
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/frameworks/torque/torquesched.py?rev=1131689&r1=1131688&r2=1131689&view=diff
==============================================================================
--- incubator/mesos/trunk/frameworks/torque/torquesched.py (original)
+++ incubator/mesos/trunk/frameworks/torque/torquesched.py Sun Jun 5 04:57:55 2011
@@ -20,7 +20,12 @@ from socket import gethostname
PBS_SERVER_FILE = "/var/spool/torque/server_name"
EVENT_LOG_FILE = "log_fw_utilization.txt"
+LOG_FILE = "scheduler_log.txt"
+SCHEDULER_ITERATION = 2 #number of seconds torque waits before looping through
+ #the queue to try to match resources to jobs. default is
+ #10min (ie 600) but we want it to be low so jobs will run
+ #as soon as the framework has acquired enough resources
SAFE_ALLOCATION = {"cpus":5,"mem":134217728} #just set statically for now, 128MB
MIN_SLOT_SIZE = {"cpus":"1","mem":1073741824} #1GB
@@ -81,11 +86,6 @@ class MyScheduler(nexus.Scheduler):
self.servers[self.id] = offer.host
self.regComputeNode(offer.host)
self.numToRegister -= 1
- #HUGE HACK HERE. THIS IS BAD!
- if self.numToRegister == 0:# and len(torquelib.getActiveJobs()) == 1:
- #submit job that will fail because it is asking for too many resources
- time.sleep(8)
- Popen("echo date | qsub -l nodes=1", shell=True)
self.id += 1
driverlog.info("writing logfile")
eventlog.info(len(self.servers))
@@ -139,8 +139,8 @@ class MyScheduler(nexus.Scheduler):
monitorlog.info("We still have to kill %d of the %d compute nodes which master is tracking" % (toKill, len(self.servers)))
monitorlog.info("unregistering node " + str(hostname))
self.unregComputeNode(hostname)
- eventlog.info(len(sched.servers))
self.servers.pop(tid)
+ eventlog.info(len(sched.servers))
toKill = toKill - 1
monitorlog.info("killing corresponding task with tid %d" % tid)
self.driver.killTask(tid)
@@ -211,6 +211,7 @@ if __name__ == "__main__":
#these lines might not be necessary since we hacked the torque fifo scheduler
Popen("qmgr -c \"set queue batch resources_max.nodect=%s\"" % SAFE_ALLOCATION["cpus"], shell=True)
Popen("qmgr -c \"set server resources_max.nodect=%s\"" % SAFE_ALLOCATION["cpus"], shell=True)
+ Popen("qmgr -c \"set server scheduler_iteration=%s\"" % SCHEDULER_ITERATION, shell=True)
outp = Popen("qmgr -c \"l queue batch\"", shell=True, stdout=PIPE).stdout
for l in outp:
@@ -219,7 +220,7 @@ if __name__ == "__main__":
monitorlog.info("RE-killing pbs_server for resources_available setting to take effect")
#Popen("/etc/init.d/pbs_server start", shell=True)
Popen("qterm", shell=True)
- #time.sleep(1)
+ time.sleep(1)
monitorlog.info("RE-starting pbs_server for resources_available setting to take effect")
Popen("pbs_server", shell=True)