Posted to commits@mesos.apache.org by be...@apache.org on 2011/06/05 07:00:28 UTC

svn commit: r1131710 - in /incubator/mesos/trunk/frameworks/torque: hpl-48node.qsub hpl-8node.qsub torquesched.py

Author: benh
Date: Sun Jun  5 05:00:27 2011
New Revision: 1131710

URL: http://svn.apache.org/viewvc?rev=1131710&view=rev
Log:
Fixed a bug that caused the framework to release all nodes (killing all tasks) every time unregNNodes was called.

Added:
    incubator/mesos/trunk/frameworks/torque/hpl-48node.qsub
    incubator/mesos/trunk/frameworks/torque/hpl-8node.qsub
Modified:
    incubator/mesos/trunk/frameworks/torque/torquesched.py
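
The substantive fix is in the last hunk of torquesched.py below: the old code ignored the numNodes argument and set toKill to len(self.servers)-1 unconditionally, so every call to unregNNodes released all nodes but one and killed their tasks. A minimal sketch of the corrected logic, assuming self.servers maps node names to torquelib node objects with a .state field (as the job-exclusive filter in the diff suggests); releaseNode() is a hypothetical stand-in for the scheduler's real node-release path, not part of the framework's API:

    def unregNNodes(self, numNodes):
        # Keep at least one node registered with the Torque server, but
        # never release more nodes than the caller asked for. The old
        # code used len(self.servers)-1 unconditionally.
        toKill = min(numNodes, len(self.servers) - 1)

        # Skip nodes that are running jobs, mirroring the scheduler's
        # own filter: noJobs = lambda x: x.state != "job-exclusive"
        idle = [s for s in self.servers.values() if s.state != "job-exclusive"]
        for node in idle[:toKill]:
            self.releaseNode(node)  # hypothetical helper, illustration only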

Added: incubator/mesos/trunk/frameworks/torque/hpl-48node.qsub
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/frameworks/torque/hpl-48node.qsub?rev=1131710&view=auto
==============================================================================
--- incubator/mesos/trunk/frameworks/torque/hpl-48node.qsub (added)
+++ incubator/mesos/trunk/frameworks/torque/hpl-48node.qsub Sun Jun  5 05:00:27 2011
@@ -0,0 +1,6 @@
+#! /usr/bin/env sh 
+#PBS -l nodes=48
+#PBS -N 48_node_hpl_job 
+
+cd /nfs/hpl/48node/ 
+nmpiexec -n 48 /nfs/hpl/48node/xhpl

Added: incubator/mesos/trunk/frameworks/torque/hpl-8node.qsub
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/frameworks/torque/hpl-8node.qsub?rev=1131710&view=auto
==============================================================================
--- incubator/mesos/trunk/frameworks/torque/hpl-8node.qsub (added)
+++ incubator/mesos/trunk/frameworks/torque/hpl-8node.qsub Sun Jun  5 05:00:27 2011
@@ -0,0 +1,6 @@
+#! /usr/bin/env sh 
+#PBS -l nodes=8
+#PBS -N 8_node_hpl_job
+
+cd /nfs/hpl/8node/ 
+nmpiexec -n 8 /nfs/hpl/8node/xhpl
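
Both scripts are standard Torque/PBS job files: #PBS -l nodes=N requests N nodes and #PBS -N sets the job name, after which the script changes into the HPL working directory and launches xhpl across the allocation. For reference, such a script is handed to the Torque server with qsub; a minimal sketch of submitting one programmatically (the script name comes from this commit, everything else is an assumption about the deployment):

    import subprocess

    # qsub prints the new job's id (e.g. "42.server-hostname") on stdout.
    job_id = subprocess.check_output(["qsub", "hpl-8node.qsub"]).decode().strip()
    print("submitted HPL job %s" % job_id)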

Modified: incubator/mesos/trunk/frameworks/torque/torquesched.py
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/frameworks/torque/torquesched.py?rev=1131710&r1=1131709&r2=1131710&view=diff
==============================================================================
--- incubator/mesos/trunk/frameworks/torque/torquesched.py (original)
+++ incubator/mesos/trunk/frameworks/torque/torquesched.py Sun Jun  5 05:00:27 2011
@@ -20,14 +20,14 @@ from socket import gethostname
 
 PBS_SERVER_FILE = "/var/spool/torque/server_name"
 EVENT_LOG_FILE = "log_fw_utilization.txt"
-LOG_FILE = "scheduler_log.txt"
+LOG_FILE = "log.txt"
 
 SCHEDULER_ITERATION = 2 #number of seconds torque waits before looping through
                         #the queue to try to match resources to jobs. default
                         #is 10min (ie 600) but we want it to be low so jobs 
                         #will run as soon as the framework has acquired enough
                         #resources
-SAFE_ALLOCATION = {"cpus":10,"mem":134217728} #just set statically for now, 128MB
+SAFE_ALLOCATION = {"cpus":48,"mem":134217728} #just set statically for now, 128MB
 MIN_SLOT_SIZE = {"cpus":"1","mem":1073741824} #1GB
 
 eventlog = logging.getLogger("event_logger")
@@ -36,11 +36,18 @@ fh = logging.FileHandler(EVENT_LOG_FILE,
 fh.setFormatter(logging.Formatter("%(asctime)s %(message)s"))
 eventlog.addHandler(fh)
 
+ch = logging.StreamHandler()
+fh = logging.FileHandler(LOG_FILE,"w")
+
 driverlog = logging.getLogger("driver_logger")
-driverlog.setLevel(logging.INFO)
+driverlog.setLevel(logging.DEBUG)
+driverlog.addHandler(fh)
+driverlog.addHandler(ch)
 
 monitorlog = logging.getLogger("monitor_logger")
-monitorlog.setLevel(logging.INFO)
+monitorlog.setLevel(logging.DEBUG)
+monitorlog.addHandler(fh)
+monitorlog.addHandler(ch)
 
 class MyScheduler(nexus.Scheduler):
   def __init__(self, ip):
@@ -127,7 +134,7 @@ class MyScheduler(nexus.Scheduler):
     monitorlog.debug("unregNNodes called with arg %d" % numNodes)
     if numNodes > len(self.servers)-1:
       monitorlog.debug("... however, only unregistering %d nodes, leaving one alive" % (len(self.servers)-1))
-    toKill = (len(self.servers)-1)
+    toKill = min(numNodes, len(self.servers)-1)
     
     monitorlog.debug("getting and filtering list of nodes using torquelib")
     noJobs = lambda x: x.state != "job-exclusive"