Posted to commits@mesos.apache.org by be...@apache.org on 2011/06/05 06:58:19 UTC

svn commit: r1131692 - in /incubator/mesos/trunk: frameworks/torque/start_pbs_mom.py frameworks/torque/test_date_sleep_date_2node.qsub frameworks/torque/torquesched.py src/ec2/deploy.karmic64/root/nexus-ec2/setup-torque

Author: benh
Date: Sun Jun  5 04:58:19 2011
New Revision: 1131692

URL: http://svn.apache.org/viewvc?rev=1131692&view=rev
Log:
updates, combined with two hacks to the torque source code; the torque fw is now functional, though not yet integrated with mpi

Modified:
    incubator/mesos/trunk/frameworks/torque/start_pbs_mom.py
    incubator/mesos/trunk/frameworks/torque/test_date_sleep_date_2node.qsub
    incubator/mesos/trunk/frameworks/torque/torquesched.py
    incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/setup-torque

Modified: incubator/mesos/trunk/frameworks/torque/start_pbs_mom.py
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/frameworks/torque/start_pbs_mom.py?rev=1131692&r1=1131691&r2=1131692&view=diff
==============================================================================
--- incubator/mesos/trunk/frameworks/torque/start_pbs_mom.py (original)
+++ incubator/mesos/trunk/frameworks/torque/start_pbs_mom.py Sun Jun  5 04:58:19 2011
@@ -8,7 +8,7 @@ import atexit
 from subprocess import *
 
 PBS_MOM_CONF_FILE = "/var/spool/torque/mom_priv/config"
-PBS_SERVER_FILE = "/var/spool/torque/server_name"
+PBS_SERVER_NAME_FILE = "/var/spool/torque/server_name"
 
 def cleanup():
   try:
@@ -31,42 +31,42 @@ class MyExecutor(nexus.Executor):
   def launchTask(self, driver, task):
     print "Running task %d" % task.taskId
     
-    print "checking pbs_mom conf file " + PBS_MOM_CONF_FILE + " is it a file? "\
-           + str(os.path.isfile(PBS_MOM_CONF_FILE))
     #TODO: if config file exists, check to see that it is correct
     #      (right now we overwrite it no matter what)
-    if not os.path.isfile(PBS_MOM_CONF_FILE):
-      print PBS_MOM_CONF_FILE + " file not found, about to create it"
-    else:
-      print "about to overwrite file " + PBS_MOM_CONF_FILE + " to update "#\
-#            + "pbs_server on this node"
 
-    #overwrite $(TORQUECFG)/server_name file with fqdn of pbs_server
-    FILE = open(PBS_SERVER_FILE,'w')
-    FILE.write(self.pbs_server_fqdn)
-    FILE.close()
+    #print "checking pbs_mom conf file " + PBS_MOM_CONF_FILE + " is it a file? "\
+    #       + str(os.path.isfile(PBS_MOM_CONF_FILE))
+    #if not os.path.isfile(PBS_MOM_CONF_FILE):
+    #  print PBS_MOM_CONF_FILE + " file not found, about to create it"
+    #else:
+    #  print "about to overwrite file " + PBS_MOM_CONF_FILE + " to update "#\
+    #         + "pbs_server on this node"
 
     #print "adding line to conf file: $pbsserver " + self.pbs_server_ip + "\n"
     #FILE = open(PBS_MOM_CONF_FILE,'w')
     #FILE.write("$pbsserver " + self.pbs_server_ip + "\n")
     #FILE.write("$logevent 255 #bitmap of which events to log\n")
-
     #FILE.close()
-   
+
     #print "overwrote pbs_mom config file, its contents now are:"
     #FILE = open(PBS_MOM_CONF_FILE,'r')
     #for line in FILE: print line + "\n"
     #FILE.close()
 
-    #try killing pbs_mom in case we changed the config
-    rval = Popen("momctl -s",shell=True).wait()
-    print "rval of momctl -s command was " + str(rval)
-    if rval != 0:
-      print "tried to kill pbs_mom, but momctl -s command failed, prob because no mom was running"
-    else:
-      time.sleep(1) #not sure if necessary, but wait a sec to be sure the mom lock file is deleted 
+    #overwrite $(TORQUECFG)/server_name file with fqdn of pbs_server
+    #FILE = open(PBS_SERVER_NAME_FILE,'w')
+    #FILE.write(self.pbs_server_fqdn)
+    #FILE.close()
+
+    ##try killing pbs_mom in case we changed the config
+    #rval = Popen("momctl -s",shell=True).wait()
+    #print "rval of momctl -s command was " + str(rval)
+    #if rval != 0:
+    #  print "tried to kill pbs_mom, but momctl -s command failed, prob because no mom was running"
+    #else:
+    #  time.sleep(1) #not sure if necessary, but wait a sec to be sure the mom lock file is deleted 
 
-    #run pbs_mom
+    ##run pbs_mom
     print "running pbs_mom on compute node"
     Popen("pbs_mom", shell=True)
 
@@ -77,7 +77,7 @@ class MyExecutor(nexus.Executor):
 
   def shutdown(self, driver):
     print "shutdown"
-    cleanup()
+    #cleanup()
 
   def error(self, driver, code, message):
     print "Error: %s" % message

Modified: incubator/mesos/trunk/frameworks/torque/test_date_sleep_date_2node.qsub
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/frameworks/torque/test_date_sleep_date_2node.qsub?rev=1131692&r1=1131691&r2=1131692&view=diff
==============================================================================
--- incubator/mesos/trunk/frameworks/torque/test_date_sleep_date_2node.qsub (original)
+++ incubator/mesos/trunk/frameworks/torque/test_date_sleep_date_2node.qsub Sun Jun  5 04:58:19 2011
@@ -1,17 +1,16 @@
-#!/bin/sh
+#!/bin/bash
 #
-#This is an example script example.sh
-#
-#These commands set up the Grid Environment for your job:
-#PBS -N date_sleep_date_test_job
 #PBS -l nodes=2
-#PBS -q batch 
 
-#print the time and date
-date
+/bin/cat $PBS_NODEFILE
+
+echo "Print out the hostname and date"
+/bin/hostname
+/bin/date
+#PBS -q batch 
 
 #wait 120 seconds
-sleep 10
+sleep 240
 
 #print the time and date again
 date
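
As a usage note, the updated test script would be submitted with qsub and watched with qstat; note that the "#PBS -q batch" line now sits after the first executable command, so TORQUE should treat it as an ordinary comment rather than a directive. A hedged sketch of submitting and polling the job, in the same subprocess style the framework uses elsewhere (the filename comes from this commit; the polling loop is illustrative only):

from subprocess import Popen, PIPE
import time

# Submit the two-node test job; qsub prints the new job id on stdout.
jobid = Popen("qsub test_date_sleep_date_2node.qsub", shell=True, stdout=PIPE).stdout.read().strip()
print "submitted job " + jobid

# Poll the queue until the job no longer shows up in qstat output.
while True:
  out = Popen("qstat", shell=True, stdout=PIPE).stdout.read()
  if jobid.split(".")[0] not in out:
    break
  time.sleep(5)
print "job finished"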

Modified: incubator/mesos/trunk/frameworks/torque/torquesched.py
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/frameworks/torque/torquesched.py?rev=1131692&r1=1131691&r2=1131692&view=diff
==============================================================================
--- incubator/mesos/trunk/frameworks/torque/torquesched.py (original)
+++ incubator/mesos/trunk/frameworks/torque/torquesched.py Sun Jun  5 04:58:19 2011
@@ -23,10 +23,11 @@ EVENT_LOG_FILE = "log_fw_utilization.txt
 LOG_FILE = "scheduler_log.txt"
 
 SCHEDULER_ITERATION = 2 #number of seconds torque waits before looping through
-                        #the queue to try to match resources to jobs. default is
-                        #10min (ie 600) but we want it to be low so jobs will run
-                        #as soon as the framework has acquired enough resources
-SAFE_ALLOCATION = {"cpus":5,"mem":134217728} #just set statically for now, 128MB
+                        #the queue to try to match resources to jobs. default
+                        #is 10min (ie 600) but we want it to be low so jobs 
+                        #will run as soon as the framework has acquired enough
+                        #resources
+SAFE_ALLOCATION = {"cpus":10,"mem":134217728} #just set statically for now, 128MB
 MIN_SLOT_SIZE = {"cpus":"1","mem":1073741824} #1GB
 
 eventlog = logging.getLogger("event_logger")
@@ -129,7 +130,7 @@ class MyScheduler(nexus.Scheduler):
     toKill = (len(self.servers)-1)
     
     monitorlog.debug("getting and filtering list of nodes using torquelib")
-    noJobs = lambda x: x.status.has_key("jobs") == False or (x.status.has_key("jobs") == True and x.status["jobs"] == "")
+    noJobs = lambda x: x.state != "job-exclusive"
     inactiveNodes = map(lambda x: x.name,filter(noJobs, torquelib.getNodes()))
     monitorlog.debug("victim pool of inactive nodes:")
     for inode in inactiveNodes:
@@ -190,49 +191,50 @@ if __name__ == "__main__":
   fqdn = socket.getfqdn()
   ip = socket.gethostbyname(gethostname())
 
-  monitorlog.info("running killall pbs_server")
-  Popen("killall pbs_server", shell=True)
-  time.sleep(1)
-
-  monitorlog.info("writing $(TORQUECFG)/server_name file with fqdn of pbs_server: " + fqdn)
-  FILE = open(PBS_SERVER_FILE,'w')
-  FILE.write(fqdn)
-  FILE.close()
+  #monitorlog.info("running killall pbs_server")
+  #Popen("killall pbs_server", shell=True)
+  #time.sleep(1)
+
+  #monitorlog.info("writing $(TORQUECFG)/server_name file with fqdn of pbs_server: " + fqdn)
+  #Popen("touch %s" % PBS_SERVER_FILE, shell=True)
+  #FILE = open(PBS_SERVER_FILE,'w')
+  #FILE.write(fqdn)
+  #FILE.close()
 
-  monitorlog.info("starting pbs_server")
+  #monitorlog.info("starting pbs_server")
   #Popen("/etc/init.d/pbs_server start", shell=True)
-  Popen("pbs_server", shell=True)
-  time.sleep(2)
-
-  monitorlog.info("running command: qmgr -c \"set queue batch resources_available.nodes=%s\"" % SAFE_ALLOCATION["cpus"])
-  Popen("qmgr -c \"set queue batch resources_available.nodect=%s\"" % SAFE_ALLOCATION["cpus"], shell=True)
-  Popen("qmgr -c \"set server resources_available.nodect=%s\"" % SAFE_ALLOCATION["cpus"], shell=True)
-
-  #these lines might not be necessary since we hacked the torque fifo scheduler
-  Popen("qmgr -c \"set queue batch resources_max.nodect=%s\"" % SAFE_ALLOCATION["cpus"], shell=True)
-  Popen("qmgr -c \"set server resources_max.nodect=%s\"" % SAFE_ALLOCATION["cpus"], shell=True)
-  Popen("qmgr -c \"set server scheduler_iteration=%s\"" % SCHEDULER_ITERATION, shell=True)
-
-  outp = Popen("qmgr -c \"l queue batch\"", shell=True, stdout=PIPE).stdout
-  for l in outp:
-    monitorlog.info(l)
-
-  monitorlog.info("RE-killing pbs_server for resources_available setting to take effect")
-  #Popen("/etc/init.d/pbs_server start", shell=True)
-  Popen("qterm", shell=True)
-  time.sleep(1)
-
-  monitorlog.info("RE-starting pbs_server for resources_available setting to take effect")
-  Popen("pbs_server", shell=True)
-  monitorlog.debug("qmgr list queue settings: ")
-  output = Popen("qmgr -c 'l q batch'", shell=True, stdout=PIPE).stdout
-  for line in output:
-    monitorlog.debug(line)
-
-  monitorlog.info("running killall pbs_sched")
-  Popen("killall pbs_sched", shell=True)
+  #Popen("pbs_server", shell=True)
   #time.sleep(2)
 
+ # monitorlog.info("running command: qmgr -c \"set queue batch resources_available.nodes=%s\"" % SAFE_ALLOCATION["cpus"])
+ # Popen("qmgr -c \"set queue batch resources_available.nodect=%s\"" % SAFE_ALLOCATION["cpus"], shell=True)
+ # Popen("qmgr -c \"set server resources_available.nodect=%s\"" % SAFE_ALLOCATION["cpus"], shell=True)
+
+ # #these lines might not be necessary since we hacked the torque fifo scheduler
+ # Popen("qmgr -c \"set queue batch resources_max.nodect=%s\"" % SAFE_ALLOCATION["cpus"], shell=True)
+ # Popen("qmgr -c \"set server resources_max.nodect=%s\"" % SAFE_ALLOCATION["cpus"], shell=True)
+ # Popen("qmgr -c \"set server scheduler_iteration=%s\"" % SCHEDULER_ITERATION, shell=True)
+
+ # outp = Popen("qmgr -c \"l queue batch\"", shell=True, stdout=PIPE).stdout
+ # for l in outp:
+ #   monitorlog.info(l)
+
+ # monitorlog.info("RE-killing pbs_server for resources_available setting to take effect")
+ # #Popen("/etc/init.d/pbs_server start", shell=True)
+ # Popen("qterm", shell=True)
+ # time.sleep(1)
+
+ # monitorlog.info("RE-starting pbs_server for resources_available setting to take effect")
+ #Popen("pbs_server", shell=True)
+ # monitorlog.debug("qmgr list queue settings: ")
+ # output = Popen("qmgr -c 'l q batch'", shell=True, stdout=PIPE).stdout
+ # for line in output:
+ #   monitorlog.debug(line)
+
+ # monitorlog.info("running killall pbs_sched")
+ # Popen("killall pbs_sched", shell=True)
+ # #time.sleep(2)
+
   monitorlog.info("starting pbs_scheduler")
   #Popen("/etc/init.d/pbs_sched start", shell=True)
   Popen("pbs_sched", shell=True)

Modified: incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/setup-torque
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/setup-torque?rev=1131692&r1=1131691&r2=1131692&view=diff
==============================================================================
--- incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/setup-torque (original)
+++ incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/setup-torque Sun Jun  5 04:58:19 2011
@@ -6,44 +6,53 @@ SLAVES_FILE="/root/nexus-ec2/slaves"
 MASTER="`cat master`"
 SLAVES="`cat $SLAVES_FILE`"
 
+SCHEDULER_ITERATION=5
+
 #These seem to be broken, i.e. missing directories after install
 #ssh $MASTER "apt-get install -y torque-server"
 #ssh $MASTER "apt-get install -y torque-scheduler"
 #ssh $MASTER "apt-get install -y torque-client"
 
 #install torque: download/unzip torque
-function installmaster {
+function installtorque {
 	pushd ~
 	echo "downloading and installing torque on master"
 	#wget http://www.clusterresources.com/downloads/torque/torque-2.4.7.tar.gz
 	wget http://nexus.berkeley.edu/torque-2.4.7.tar.gz
 	tar xzf torque-2.4.7.tar.gz
-	cd torque-2.4.7
-	./configure --prefix=/usr/local --with-debug --disable-gcc-warnings
+	pushd torque-2.4.7
+	./configure --prefix=/usr
 	make -j8
 	make install
+	popd;popd
+}
+
+function setuptorque {
+	pushd ~/torque-2.4.7
 	echo "running ldconfig on master"
 	ldconfig
-        ./torque.setup root #localhost # Note: sets some defaults for batch queue
-
-	echo "copying init.d control scripts to master"
-	cp contrib/init.d/debian.pbs_mom /etc/init.d/pbs_mom
-	cp contrib/init.d/debian.pbs_sched /etc/init.d/pbs_sched
-	cp contrib/init.d/debian.pbs_server /etc/init.d/pbs_server
+        #./torque.setup root # Note: sets some defaults for batch queue
+	qterm
+        yes|./torque.setup root localhost # Note: sets some defaults for batch queue
 
-	popd
 	#WARNING: allow root to qsub for debug purposes only, may be dangerous
 	qmgr -c 'set server acl_roots+=root@*' #allow root to submit jobs
-	qmgr -c 's s allow_node_submit=true' #other hosts can submit too
-	NUM_SLAVES=`cat slaves|wc -l`
-        qmgr -c "s queue batch resources_max.nodect=$NUM_SLAVES"
-        qmgr -c "s queue batch resources_available.nodect=$NUM_SLAVES" #the framework should update this on its own and the server has to be restarted after this
-        qterm
+	qmgr -c "set server scheduler_iteration=$SCHEDULER_ITERATION"
+	#qmgr -c 's s allow_node_submit=true' #other hosts can submit too
+
+	NUM_SLAVES=`cat ~/nexus-ec2/slaves|wc -l`
+	#the server be restarted after this
+	qmgr -c "set queue batch resources_available.nodect=$NUM_SLAVES"
+	#qmgr -c "set server resources_available.nodect=$NUM_SLAVES"
+	qterm
+        pbs_server
 
 	touch ~/.rhosts
 	echo `hostname` |cat >> ~/.rhosts
 	echo `hostname -f` |cat >> ~/.rhosts
 	echo localhost |cat >> ~/.rhosts
+
+	popd
 }
 
 function installslaves {
@@ -59,18 +68,14 @@ function installslaves {
 	cp torque-package-mom-linux-x86_64.sh /nfs/torque/torque-package-clients-linux-x86_64.sh
 
 	echo "installing torque mom and clients package on slaves"
-	dsh -f $SLAVES_FILE /nfs/torque/torque-package-mom-linux-x86_64.sh --install
-	dsh -f $SLAVES_FILE /nfs/torque/torque-package-clients-linux-x86_64.sh --install
-
-	echo "copying pbs_mom init.d control script to slaves"
-	mkdir /nfs/torque/init.d
-	cp contrib/init.d/debian.pbs_mom /nfs/torque/init.d/debian.pbs_mom
-	dsh -f $SLAVES_FILE cp /nfs/torque/init.d/debian.pbs_mom /etc/init.d/pbs_mom
+	for i in `cat $SLAVES_FILE`; do ssh $i /nfs/torque/torque-package-mom-linux-x86_64.sh --install; ldconfig; done
+	for i in `cat $SLAVES_FILE`; do ssh $i /nfs/torque/torque-package-clients-linux-x86_64.sh --install; ldconfig; done
 
 	echo "Running ldconfig on slaves"
 	dsh -f $SLAVES_FILE ldconfig
 	popd
 }
 
-installmaster
-installslaves
+#installtorque
+setuptorque
+#installslaves
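
The qmgr settings in setup-torque only take effect once pbs_server is restarted, which is why the script now ends the queue configuration with qterm followed by pbs_server. A hedged sketch of the same reconfigure-and-restart sequence in the subprocess style torquesched.py uses (the command strings appear in this commit; NUM_SLAVES and the sleeps are illustrative):

from subprocess import Popen
import time

SCHEDULER_ITERATION = 5
NUM_SLAVES = 10  # illustrative; setup-torque derives this from the slaves file

# Lower the scheduling interval and advertise the node count to the batch queue.
Popen("qmgr -c \"set server scheduler_iteration=%s\"" % SCHEDULER_ITERATION, shell=True).wait()
Popen("qmgr -c \"set queue batch resources_available.nodect=%s\"" % NUM_SLAVES, shell=True).wait()

# resources_available only takes effect after the server is restarted.
Popen("qterm", shell=True).wait()
time.sleep(1)
Popen("pbs_server", shell=True)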