Posted to commits@mesos.apache.org by be...@apache.org on 2011/06/05 07:02:39 UTC
svn commit: r1131730 - in /incubator/mesos/trunk/frameworks/torque:
hpl-24node.qsub hpl-48node.qsub hpl-8node-small.qsub hpl-8node.qsub
torquesched.py
Author: benh
Date: Sun Jun 5 05:02:38 2011
New Revision: 1131730
URL: http://svn.apache.org/viewvc?rev=1131730&view=rev
Log:
Now scales down to zero slaves instead of one. Adding HPLinpack qsub scripts.
Added:
incubator/mesos/trunk/frameworks/torque/hpl-24node.qsub
incubator/mesos/trunk/frameworks/torque/hpl-8node-small.qsub
- copied, changed from r1131729, incubator/mesos/trunk/frameworks/torque/hpl-8node.qsub
Modified:
incubator/mesos/trunk/frameworks/torque/hpl-48node.qsub
incubator/mesos/trunk/frameworks/torque/hpl-8node.qsub
incubator/mesos/trunk/frameworks/torque/torquesched.py
Added: incubator/mesos/trunk/frameworks/torque/hpl-24node.qsub
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/frameworks/torque/hpl-24node.qsub?rev=1131730&view=auto
==============================================================================
--- incubator/mesos/trunk/frameworks/torque/hpl-24node.qsub (added)
+++ incubator/mesos/trunk/frameworks/torque/hpl-24node.qsub Sun Jun 5 05:02:38 2011
@@ -0,0 +1,6 @@
+#! /usr/bin/env sh
+#PBS -l nodes=24
+#PBS -N 24_node_hpl_job
+
+cd /nfs/hpl/24node/
+mpiexec -n 24 /nfs/hpl/24node/xhpl
Modified: incubator/mesos/trunk/frameworks/torque/hpl-48node.qsub
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/frameworks/torque/hpl-48node.qsub?rev=1131730&r1=1131729&r2=1131730&view=diff
==============================================================================
--- incubator/mesos/trunk/frameworks/torque/hpl-48node.qsub (original)
+++ incubator/mesos/trunk/frameworks/torque/hpl-48node.qsub Sun Jun 5 05:02:38 2011
@@ -3,4 +3,4 @@
#PBS -N 48_node_hpl_job
cd /nfs/hpl/48node/
-nmpiexec -n 48 /nfs/hpl/48node/xhpl
+mpiexec -n 48 /nfs/hpl/48node/xhpl
Copied: incubator/mesos/trunk/frameworks/torque/hpl-8node-small.qsub (from r1131729, incubator/mesos/trunk/frameworks/torque/hpl-8node.qsub)
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/frameworks/torque/hpl-8node-small.qsub?p2=incubator/mesos/trunk/frameworks/torque/hpl-8node-small.qsub&p1=incubator/mesos/trunk/frameworks/torque/hpl-8node.qsub&r1=1131729&r2=1131730&rev=1131730&view=diff
==============================================================================
--- incubator/mesos/trunk/frameworks/torque/hpl-8node.qsub (original)
+++ incubator/mesos/trunk/frameworks/torque/hpl-8node-small.qsub Sun Jun 5 05:02:38 2011
@@ -3,4 +3,4 @@
#PBS -N 8_node_hpl_job
cd /nfs/hpl/8node/
-nmpiexec -n 8 /nfs/hpl/8node/xhpl
+mpiexec -n 8 /nfs/hpl/8node/xhpl
Modified: incubator/mesos/trunk/frameworks/torque/hpl-8node.qsub
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/frameworks/torque/hpl-8node.qsub?rev=1131730&r1=1131729&r2=1131730&view=diff
==============================================================================
--- incubator/mesos/trunk/frameworks/torque/hpl-8node.qsub (original)
+++ incubator/mesos/trunk/frameworks/torque/hpl-8node.qsub Sun Jun 5 05:02:38 2011
@@ -3,4 +3,4 @@
#PBS -N 8_node_hpl_job
cd /nfs/hpl/8node/
-nmpiexec -n 8 /nfs/hpl/8node/xhpl
+mpiexec -n 8 /nfs/hpl/8node/xhpl
Modified: incubator/mesos/trunk/frameworks/torque/torquesched.py
URL: http://svn.apache.org/viewvc/incubator/mesos/trunk/frameworks/torque/torquesched.py?rev=1131730&r1=1131729&r2=1131730&view=diff
==============================================================================
--- incubator/mesos/trunk/frameworks/torque/torquesched.py (original)
+++ incubator/mesos/trunk/frameworks/torque/torquesched.py Sun Jun 5 05:02:38 2011
@@ -10,7 +10,7 @@ import threading
import re
import socket
import torquelib
-import datetime
+import time
import logging
import logging.handlers
@@ -29,6 +29,7 @@ SCHEDULER_ITERATION = 2 #number of secon
#resources
SAFE_ALLOCATION = {"cpus":48,"mem":134217728} #just set statically for now, 128MB
MIN_SLOT_SIZE = {"cpus":"1","mem":1073741824} #1GB
+MIN_SLOTS_HELD = 0 #keep at least this many slots even if none are needed
eventlog = logging.getLogger("event_logger")
eventlog.setLevel(logging.DEBUG)
@@ -36,18 +37,23 @@ fh = logging.FileHandler(EVENT_LOG_FILE,
fh.setFormatter(logging.Formatter("%(asctime)s %(message)s"))
eventlog.addHandler(fh)
+#Something special about this file makes logging not work normally
+#I think it might be swig? the StreamHandler prints at DEBUG level
+#even though I setLevel to INFO
ch = logging.StreamHandler()
+ch.setLevel(logging.INFO)
fh = logging.FileHandler(LOG_FILE,"w")
+fh.setLevel(logging.DEBUG)
driverlog = logging.getLogger("driver_logger")
driverlog.setLevel(logging.DEBUG)
driverlog.addHandler(fh)
-driverlog.addHandler(ch)
+#driverlog.addHandler(ch)
monitorlog = logging.getLogger("monitor_logger")
monitorlog.setLevel(logging.DEBUG)
monitorlog.addHandler(fh)
-monitorlog.addHandler(ch)
+#monitorlog.addHandler(ch)
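The hunk above gives each handler its own level, so the console handler stays at INFO while the file handler keeps full DEBUG output, and the console handler is dropped from the driver and monitor loggers. A minimal standalone sketch of that pattern (logger name and file path here are illustrative, not from the commit): a record must first pass the logger's level, then each handler's level, so two handlers on the same logger can filter differently.

    import logging
    import sys

    log = logging.getLogger("example_logger")
    log.setLevel(logging.DEBUG)                  # logger lets everything through

    console = logging.StreamHandler(sys.stderr)
    console.setLevel(logging.INFO)               # console shows INFO and above only

    filelog = logging.FileHandler("example.log", "w")
    filelog.setLevel(logging.DEBUG)              # file keeps full DEBUG detail

    log.addHandler(console)
    log.addHandler(filelog)

    log.debug("written to example.log only")
    log.info("written to example.log and printed to the console")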
class MyScheduler(nexus.Scheduler):
def __init__(self, ip):
@@ -57,7 +63,7 @@ class MyScheduler(nexus.Scheduler):
self.ip = ip
self.servers = {}
self.overloaded = False
- self.numToRegister = 1
+ self.numToRegister = MIN_SLOTS_HELD
def getExecutorInfo(self, driver):
execPath = os.path.join(os.getcwd(), "start_pbs_mom.sh")
@@ -96,7 +102,7 @@ class MyScheduler(nexus.Scheduler):
self.numToRegister -= 1
self.id += 1
driverlog.info("writing logfile")
- eventlog.info(len(self.servers))
+ eventlog.info("%d %d" % (time.time(),len(self.servers)))
driverlog.info("done writing logfile")
driverlog.info("self.id now set to " + str(self.id))
#print "---"
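The eventlog change in this hunk switches from logging a bare server count to a "<seconds> <count>" pair; the "%d" conversion renders the float returned by time.time() as whole seconds. A hedged illustration of the record format (the slave count below is made up):

    import time

    num_servers = 3                               # hypothetical slave count
    line = "%d %d" % (time.time(), num_servers)   # e.g. "1307250158 3"
    print(line)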
@@ -132,9 +138,9 @@ class MyScheduler(nexus.Scheduler):
#unreg up to N random compute nodes, leave at least one
def unregNNodes(self, numNodes):
monitorlog.debug("unregNNodes called with arg %d" % numNodes)
- if numNodes > len(self.servers)-1:
- monitorlog.debug("... however, only unregistering %d nodes, leaving one alive" % (len(self.servers)-1))
- toKill = min(numNodes,len(self.servers)-1))
+ if numNodes > len(self.servers)-MIN_SLOTS_HELD:
+ monitorlog.debug("... however, only unregistering %d nodes, leaving one alive" % (len(self.servers)-MIN_SLOTS_HELD))
+ toKill = min(numNodes,len(self.servers)-MIN_SLOTS_HELD)
monitorlog.debug("getting and filtering list of nodes using torquelib")
noJobs = lambda x: x.state != "job-exclusive"
@@ -143,12 +149,12 @@ class MyScheduler(nexus.Scheduler):
for inode in inactiveNodes:
monitorlog.debug(inode)
for tid, hostname in self.servers.items():
- if len(self.servers) > 1 and toKill > 0 and hostname in inactiveNodes:
+ if len(self.servers) > MIN_SLOTS_HELD and toKill > 0 and hostname in inactiveNodes:
monitorlog.info("We still have to kill %d of the %d compute nodes which master is tracking" % (toKill, len(self.servers)))
monitorlog.info("unregistering node " + str(hostname))
self.unregComputeNode(hostname)
self.servers.pop(tid)
- eventlog.info(len(sched.servers))
+ eventlog.info("%d %d" % (time.time(),len(self.servers)))
toKill = toKill - 1
monitorlog.info("killing corresponding task with tid %d" % tid)
self.driver.killTask(tid)
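Taken together, the torquesched.py changes replace the hard-coded floor of one slave with the MIN_SLOTS_HELD constant (set to 0), so unregNNodes may now release every idle node. A self-contained sketch of that scale-down rule (the function and variable names below are hypothetical, not taken from the scheduler):

    # Sketch of the scale-down rule introduced by this commit: with
    # MIN_SLOTS_HELD = 0 the framework may release every idle slave,
    # whereas the previous floor of 1 always kept one alive.
    MIN_SLOTS_HELD = 0

    def nodes_to_release(requested, registered, idle_hostnames):
        """Return the idle hostnames that may be unregistered."""
        # Never drop below the configured floor of held slots.
        to_kill = min(requested, registered - MIN_SLOTS_HELD)
        victims = []
        for hostname in idle_hostnames:
            if to_kill <= 0 or registered - len(victims) <= MIN_SLOTS_HELD:
                break
            victims.append(hostname)
            to_kill -= 1
        return victims

    # Example: 4 registered slaves, 3 of them idle, asked to release 5.
    print(nodes_to_release(5, 4, ["n1", "n2", "n3"]))   # ['n1', 'n2', 'n3']

With the old floor of one, the last slave would have been held back even when no Torque jobs were queued; with a floor of zero the cluster can shrink all the way down and re-register slaves as offers arrive.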