Posted to commits@mesos.apache.org by be...@apache.org on 2011/06/05 06:58:19 UTC
svn commit: r1131692 - in /incubator/mesos/trunk:
frameworks/torque/start_pbs_mom.py
frameworks/torque/test_date_sleep_date_2node.qsub
frameworks/torque/torquesched.py
src/ec2/deploy.karmic64/root/nexus-ec2/setup-torque
Author: benh
Date: Sun Jun 5 04:58:19 2011
New Revision: 1131692
URL: http://svn.apache.org/viewvc?rev=1131692&view=rev
Log:
updates, combined with two hacks to the torque source code; the torque fw is now functional, though not yet integrated with mpi
Modified:
incubator/mesos/trunk/frameworks/torque/start_pbs_mom.py
incubator/mesos/trunk/frameworks/torque/test_date_sleep_date_2node.qsub
incubator/mesos/trunk/frameworks/torque/torquesched.py
incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/setup-torque
Modified: incubator/mesos/trunk/frameworks/torque/start_pbs_mom.py
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/frameworks/torque/start_pbs_mom.py?rev=1131692&r1=1131691&r2=1131692&view=diff
==============================================================================
--- incubator/mesos/trunk/frameworks/torque/start_pbs_mom.py (original)
+++ incubator/mesos/trunk/frameworks/torque/start_pbs_mom.py Sun Jun 5 04:58:19 2011
@@ -8,7 +8,7 @@ import atexit
from subprocess import *
PBS_MOM_CONF_FILE = "/var/spool/torque/mom_priv/config"
-PBS_SERVER_FILE = "/var/spool/torque/server_name"
+PBS_SERVER_NAME_FILE = "/var/spool/torque/server_name"
def cleanup():
try:
@@ -31,42 +31,42 @@ class MyExecutor(nexus.Executor):
def launchTask(self, driver, task):
print "Running task %d" % task.taskId
- print "checking pbs_mom conf file " + PBS_MOM_CONF_FILE + " is it a file? "\
- + str(os.path.isfile(PBS_MOM_CONF_FILE))
#TODO: if config file exists, check to see that it is correct
# (right now we overwrite it no matter what)
- if not os.path.isfile(PBS_MOM_CONF_FILE):
- print PBS_MOM_CONF_FILE + " file not found, about to create it"
- else:
- print "about to overwrite file " + PBS_MOM_CONF_FILE + " to update "#\
-# + "pbs_server on this node"
- #overwrite $(TORQUECFG)/server_name file with fqdn of pbs_server
- FILE = open(PBS_SERVER_FILE,'w')
- FILE.write(self.pbs_server_fqdn)
- FILE.close()
+ #print "checking pbs_mom conf file " + PBS_MOM_CONF_FILE + " is it a file? "\
+ # + str(os.path.isfile(PBS_MOM_CONF_FILE))
+ #if not os.path.isfile(PBS_MOM_CONF_FILE):
+ # print PBS_MOM_CONF_FILE + " file not found, about to create it"
+ #else:
+ # print "about to overwrite file " + PBS_MOM_CONF_FILE + " to update "#\
+ # + "pbs_server on this node"
#print "adding line to conf file: $pbsserver " + self.pbs_server_ip + "\n"
#FILE = open(PBS_MOM_CONF_FILE,'w')
#FILE.write("$pbsserver " + self.pbs_server_ip + "\n")
#FILE.write("$logevent 255 #bitmap of which events to log\n")
-
#FILE.close()
-
+
#print "overwrote pbs_mom config file, its contents now are:"
#FILE = open(PBS_MOM_CONF_FILE,'r')
#for line in FILE: print line + "\n"
#FILE.close()
- #try killing pbs_mom in case we changed the config
- rval = Popen("momctl -s",shell=True).wait()
- print "rval of momctl -s command was " + str(rval)
- if rval != 0:
- print "tried to kill pbs_mom, but momctl -s command failed, prob because no mom was running"
- else:
- time.sleep(1) #not sure if necessary, but wait a sec to be sure the mom lock file is deleted
+ #overwrite $(TORQUECFG)/server_name file with fqdn of pbs_server
+ #FILE = open(PBS_SERVER_NAME_FILE,'w')
+ #FILE.write(self.pbs_server_fqdn)
+ #FILE.close()
+
+ ##try killing pbs_mom in case we changed the config
+ #rval = Popen("momctl -s",shell=True).wait()
+ #print "rval of momctl -s command was " + str(rval)
+ #if rval != 0:
+ # print "tried to kill pbs_mom, but momctl -s command failed, prob because no mom was running"
+ #else:
+ # time.sleep(1) #not sure if necessary, but wait a sec to be sure the mom lock file is deleted
- #run pbs_mom
+ ##run pbs_mom
print "running pbs_mom on compute node"
Popen("pbs_mom", shell=True)
@@ -77,7 +77,7 @@ class MyExecutor(nexus.Executor):
def shutdown(self, driver):
print "shutdown"
- cleanup()
+ #cleanup()
def error(self, driver, code, message):
print "Error: %s" % message
Modified: incubator/mesos/trunk/frameworks/torque/test_date_sleep_date_2node.qsub
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/frameworks/torque/test_date_sleep_date_2node.qsub?rev=1131692&r1=1131691&r2=1131692&view=diff
==============================================================================
--- incubator/mesos/trunk/frameworks/torque/test_date_sleep_date_2node.qsub (original)
+++ incubator/mesos/trunk/frameworks/torque/test_date_sleep_date_2node.qsub Sun Jun 5 04:58:19 2011
@@ -1,17 +1,16 @@
-#!/bin/sh
+#!/bin/bash
#
-#This is an example script example.sh
-#
-#These commands set up the Grid Environment for your job:
-#PBS -N date_sleep_date_test_job
#PBS -l nodes=2
-#PBS -q batch
-#print the time and date
-date
+/bin/cat $PBS_NODEFILE
+
+echo "Print out the hostname and date"
+/bin/hostname
+/bin/date
+#PBS -q batch
#wait 120 seconds
-sleep 10
+sleep 240
#print the time and date again
date
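
Two things worth noting in the new script: the relocated "#PBS -q batch" now sits after the first executable command, and Torque stops scanning for directives at that point, so the queue directive is ignored; also, the "#wait 120 seconds" comment is out of step with the new "sleep 240". A hedged sketch of driving this test from Python, using only the standard qsub/qstat CLI (the parsing is illustrative, not part of this commit):

    from subprocess import Popen, PIPE

    # qsub prints the new job id on stdout.
    out = Popen("qsub test_date_sleep_date_2node.qsub",
                shell=True, stdout=PIPE).stdout
    jobid = out.read().strip()
    print "submitted job %s" % jobid

    # qstat -n lists the nodes assigned to the job; with nodes=2 it
    # should show two entries for the length of the sleep.
    Popen("qstat -n %s" % jobid, shell=True).wait()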
Modified: incubator/mesos/trunk/frameworks/torque/torquesched.py
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/frameworks/torque/torquesched.py?rev=1131692&r1=1131691&r2=1131692&view=diff
==============================================================================
--- incubator/mesos/trunk/frameworks/torque/torquesched.py (original)
+++ incubator/mesos/trunk/frameworks/torque/torquesched.py Sun Jun 5 04:58:19 2011
@@ -23,10 +23,11 @@ EVENT_LOG_FILE = "log_fw_utilization.txt
LOG_FILE = "scheduler_log.txt"
SCHEDULER_ITERATION = 2 #number of seconds torque waits before looping through
- #the queue to try to match resources to jobs. default is
- #10min (ie 600) but we want it to be low so jobs will run
- #as soon as the framework has acquired enough resources
-SAFE_ALLOCATION = {"cpus":5,"mem":134217728} #just set statically for now, 128MB
+ #the queue to try to match resources to jobs. default
+ #is 10min (ie 600) but we want it to be low so jobs
+ #will run as soon as the framework has acquired enough
+ #resources
+SAFE_ALLOCATION = {"cpus":10,"mem":134217728} #just set statically for now, 128MB
MIN_SLOT_SIZE = {"cpus":"1","mem":1073741824} #1GB
eventlog = logging.getLogger("event_logger")
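
The memory constants are plain powers of two (134217728 bytes is 128MB and 1073741824 bytes is 1GB, matching the inline comments); note also that MIN_SLOT_SIZE stores cpus as the string "1" while mem is an int, so consumers have to normalize types. A two-line sanity check:

    # Byte values behind the static allocation constants.
    assert 128 * 1024**2 == 134217728   # SAFE_ALLOCATION["mem"], 128MB
    assert 1024**3 == 1073741824        # MIN_SLOT_SIZE["mem"], 1GB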
@@ -129,7 +130,7 @@ class MyScheduler(nexus.Scheduler):
toKill = (len(self.servers)-1)
monitorlog.debug("getting and filtering list of nodes using torquelib")
- noJobs = lambda x: x.status.has_key("jobs") == False or (x.status.has_key("jobs") == True and x.status["jobs"] == "")
+ noJobs = lambda x: x.state != "job-exclusive"
inactiveNodes = map(lambda x: x.name,filter(noJobs, torquelib.getNodes()))
monitorlog.debug("victim pool of inactive nodes:")
for inode in inactiveNodes:
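
The new predicate keys off the node's Torque state rather than probing the "jobs" attribute. pbsnodes states include "free", "job-exclusive", "offline", and "down", so anything that is not job-exclusive (including down or offline nodes) lands in the victim pool. A minimal sketch of the filter, assuming torquelib.getNodes() returns objects with the name and state fields used above:

    # Idle victims: every node whose state is not "job-exclusive".
    noJobs = lambda x: x.state != "job-exclusive"
    inactiveNodes = [n.name for n in torquelib.getNodes() if noJobs(n)]
    for inode in inactiveNodes:
        monitorlog.debug(inode)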
@@ -190,49 +191,50 @@ if __name__ == "__main__":
fqdn = socket.getfqdn()
ip = socket.gethostbyname(gethostname())
- monitorlog.info("running killall pbs_server")
- Popen("killall pbs_server", shell=True)
- time.sleep(1)
-
- monitorlog.info("writing $(TORQUECFG)/server_name file with fqdn of pbs_server: " + fqdn)
- FILE = open(PBS_SERVER_FILE,'w')
- FILE.write(fqdn)
- FILE.close()
+ #monitorlog.info("running killall pbs_server")
+ #Popen("killall pbs_server", shell=True)
+ #time.sleep(1)
+
+ #monitorlog.info("writing $(TORQUECFG)/server_name file with fqdn of pbs_server: " + fqdn)
+ #Popen("touch %s" % PBS_SERVER_FILE, shell=True)
+ #FILE = open(PBS_SERVER_FILE,'w')
+ #FILE.write(fqdn)
+ #FILE.close()
- monitorlog.info("starting pbs_server")
+ #monitorlog.info("starting pbs_server")
#Popen("/etc/init.d/pbs_server start", shell=True)
- Popen("pbs_server", shell=True)
- time.sleep(2)
-
- monitorlog.info("running command: qmgr -c \"set queue batch resources_available.nodes=%s\"" % SAFE_ALLOCATION["cpus"])
- Popen("qmgr -c \"set queue batch resources_available.nodect=%s\"" % SAFE_ALLOCATION["cpus"], shell=True)
- Popen("qmgr -c \"set server resources_available.nodect=%s\"" % SAFE_ALLOCATION["cpus"], shell=True)
-
- #these lines might not be necessary since we hacked the torque fifo scheduler
- Popen("qmgr -c \"set queue batch resources_max.nodect=%s\"" % SAFE_ALLOCATION["cpus"], shell=True)
- Popen("qmgr -c \"set server resources_max.nodect=%s\"" % SAFE_ALLOCATION["cpus"], shell=True)
- Popen("qmgr -c \"set server scheduler_iteration=%s\"" % SCHEDULER_ITERATION, shell=True)
-
- outp = Popen("qmgr -c \"l queue batch\"", shell=True, stdout=PIPE).stdout
- for l in outp:
- monitorlog.info(l)
-
- monitorlog.info("RE-killing pbs_server for resources_available setting to take effect")
- #Popen("/etc/init.d/pbs_server start", shell=True)
- Popen("qterm", shell=True)
- time.sleep(1)
-
- monitorlog.info("RE-starting pbs_server for resources_available setting to take effect")
- Popen("pbs_server", shell=True)
- monitorlog.debug("qmgr list queue settings: ")
- output = Popen("qmgr -c 'l q batch'", shell=True, stdout=PIPE).stdout
- for line in output:
- monitorlog.debug(line)
-
- monitorlog.info("running killall pbs_sched")
- Popen("killall pbs_sched", shell=True)
+ #Popen("pbs_server", shell=True)
#time.sleep(2)
+ # monitorlog.info("running command: qmgr -c \"set queue batch resources_available.nodes=%s\"" % SAFE_ALLOCATION["cpus"])
+ # Popen("qmgr -c \"set queue batch resources_available.nodect=%s\"" % SAFE_ALLOCATION["cpus"], shell=True)
+ # Popen("qmgr -c \"set server resources_available.nodect=%s\"" % SAFE_ALLOCATION["cpus"], shell=True)
+
+ # #these lines might not be necessary since we hacked the torque fifo scheduler
+ # Popen("qmgr -c \"set queue batch resources_max.nodect=%s\"" % SAFE_ALLOCATION["cpus"], shell=True)
+ # Popen("qmgr -c \"set server resources_max.nodect=%s\"" % SAFE_ALLOCATION["cpus"], shell=True)
+ # Popen("qmgr -c \"set server scheduler_iteration=%s\"" % SCHEDULER_ITERATION, shell=True)
+
+ # outp = Popen("qmgr -c \"l queue batch\"", shell=True, stdout=PIPE).stdout
+ # for l in outp:
+ # monitorlog.info(l)
+
+ # monitorlog.info("RE-killing pbs_server for resources_available setting to take effect")
+ # #Popen("/etc/init.d/pbs_server start", shell=True)
+ # Popen("qterm", shell=True)
+ # time.sleep(1)
+
+ # monitorlog.info("RE-starting pbs_server for resources_available setting to take effect")
+ #Popen("pbs_server", shell=True)
+ # monitorlog.debug("qmgr list queue settings: ")
+ # output = Popen("qmgr -c 'l q batch'", shell=True, stdout=PIPE).stdout
+ # for line in output:
+ # monitorlog.debug(line)
+
+ # monitorlog.info("running killall pbs_sched")
+ # Popen("killall pbs_sched", shell=True)
+ # #time.sleep(2)
+
monitorlog.info("starting pbs_scheduler")
#Popen("/etc/init.d/pbs_sched start", shell=True)
Popen("pbs_sched", shell=True)
Modified: incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/setup-torque
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/setup-torque?rev=1131692&r1=1131691&r2=1131692&view=diff
==============================================================================
--- incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/setup-torque (original)
+++ incubator/mesos/trunk/src/ec2/deploy.karmic64/root/nexus-ec2/setup-torque Sun Jun 5 04:58:19 2011
@@ -6,44 +6,53 @@ SLAVES_FILE="/root/nexus-ec2/slaves"
MASTER="`cat master`"
SLAVES="`cat $SLAVES_FILE`"
+SCHEDULER_ITERATION=5
+
#These seem to be broken, i.e. missing directories after install
#ssh $MASTER "apt-get install -y torque-server"
#ssh $MASTER "apt-get install -y torque-scheduler"
#ssh $MASTER "apt-get install -y torque-client"
#install torque: download/unzip torque
-function installmaster {
+function installtorque {
pushd ~
echo "downloading and installing torque on master"
#wget http://www.clusterresources.com/downloads/torque/torque-2.4.7.tar.gz
wget http://nexus.berkeley.edu/torque-2.4.7.tar.gz
tar xzf torque-2.4.7.tar.gz
- cd torque-2.4.7
- ./configure --prefix=/usr/local --with-debug --disable-gcc-warnings
+ pushd torque-2.4.7
+ ./configure --prefix=/usr
make -j8
make install
+ popd;popd
+}
+
+function setuptorque {
+ pushd ~/torque-2.4.7
echo "running ldconfig on master"
ldconfig
- ./torque.setup root #localhost # Note: sets some defaults for batch queue
-
- echo "copying init.d control scripts to master"
- cp contrib/init.d/debian.pbs_mom /etc/init.d/pbs_mom
- cp contrib/init.d/debian.pbs_sched /etc/init.d/pbs_sched
- cp contrib/init.d/debian.pbs_server /etc/init.d/pbs_server
+ #./torque.setup root # Note: sets some defaults for batch queue
+ qterm
+ yes|./torque.setup root localhost # Note: sets some defaults for batch queue
- popd
#WARNING: allow root to qsub for debug purposes only, may be dangerous
qmgr -c 'set server acl_roots+=root@*' #allow root to submit jobs
- qmgr -c 's s allow_node_submit=true' #other hosts can submit too
- NUM_SLAVES=`cat slaves|wc -l`
- qmgr -c "s queue batch resources_max.nodect=$NUM_SLAVES"
- qmgr -c "s queue batch resources_available.nodect=$NUM_SLAVES" #the framework should update this on its own and the server has to be restarted after this
- qterm
+ qmgr -c "set server scheduler_iteration=$SCHEDULER_ITERATION"
+ #qmgr -c 's s allow_node_submit=true' #other hosts can submit too
+
+ NUM_SLAVES=`cat ~/nexus-ec2/slaves|wc -l`
+ #the server must be restarted after this
+ qmgr -c "set queue batch resources_available.nodect=$NUM_SLAVES"
+ #qmgr -c "set server resources_available.nodect=$NUM_SLAVES"
+ qterm
+ pbs_server
touch ~/.rhosts
echo `hostname` |cat >> ~/.rhosts
echo `hostname -f` |cat >> ~/.rhosts
echo localhost |cat >> ~/.rhosts
+
+ popd
}
function installslaves {
@@ -59,18 +68,14 @@ function installslaves {
cp torque-package-mom-linux-x86_64.sh /nfs/torque/torque-package-clients-linux-x86_64.sh
echo "installing torque mom and clients package on slaves"
- dsh -f $SLAVES_FILE /nfs/torque/torque-package-mom-linux-x86_64.sh --install
- dsh -f $SLAVES_FILE /nfs/torque/torque-package-clients-linux-x86_64.sh --install
-
- echo "copying pbs_mom init.d control script to slaves"
- mkdir /nfs/torque/init.d
- cp contrib/init.d/debian.pbs_mom /nfs/torque/init.d/debian.pbs_mom
- dsh -f $SLAVES_FILE cp /nfs/torque/init.d/debian.pbs_mom /etc/init.d/pbs_mom
+ for i in `cat $SLAVES_FILE`; do ssh $i /nfs/torque/torque-package-mom-linux-x86_64.sh --install; ldconfig; done
+ for i in `cat $SLAVES_FILE`; do ssh $i /nfs/torque/torque-package-clients-linux-x86_64.sh --install; ldconfig; done
echo "Running ldconfig on slaves"
dsh -f $SLAVES_FILE ldconfig
popd
}
-installmaster
-installslaves
+#installtorque
+setuptorque
+#installslaves
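
One caveat in the new install loop: in "ssh $i ...; ldconfig", the ldconfig after the semicolon runs on the master each iteration, not on the slave; it is the later "dsh -f $SLAVES_FILE ldconfig" that actually refreshes the slaves. (The unchanged cp line above also appears to copy the mom package under the clients package name, so both installs unpack the mom package.) A hedged Python sketch that keeps both steps inside a single remote invocation, with the slave list and package paths taken from this script:

    from subprocess import Popen

    SLAVES_FILE = "/root/nexus-ec2/slaves"
    PACKAGES = ["/nfs/torque/torque-package-mom-linux-x86_64.sh",
                "/nfs/torque/torque-package-clients-linux-x86_64.sh"]

    for slave in open(SLAVES_FILE).read().split():
        for pkg in PACKAGES:
            # Run the installer and ldconfig in the same remote shell so
            # ldconfig executes on the slave, not the master.
            Popen(["ssh", slave, "%s --install && ldconfig" % pkg]).wait()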