You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by ch...@apache.org on 2015/04/09 21:13:16 UTC
svn commit: r1672463 - in
/uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm:
JobManagerConverter.java scheduler/NodePool.java
scheduler/NodepoolScheduler.java scheduler/RmJob.java
Author: challngr
Date: Thu Apr 9 19:13:16 2015
New Revision: 1672463
URL: http://svn.apache.org/r1672463
Log:
UIMA-4327 Don't reallocate for failed AP or Service.
Modified:
uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/JobManagerConverter.java
uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/NodePool.java
uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/NodepoolScheduler.java
uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/RmJob.java
Modified: uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/JobManagerConverter.java
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/JobManagerConverter.java?rev=1672463&r1=1672462&r2=1672463&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/JobManagerConverter.java (original)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/JobManagerConverter.java Thu Apr 9 19:13:16 2015
@@ -743,6 +743,14 @@ public class JobManagerConverter
throw new SchedulingException(jobid, "Process completion arrives for share " + s.toString() +
" but job " + jobid + "cannot be found.");
}
+
+ switch ( l.getDuccType() ) { // UIMA-4326, if not a job-job, the job must not get reallocations
+ case Job:
+ break;
+ default:
+ j.markComplete();
+ }
+
scheduler.signalCompletion(j, s);
logger.info(methodName, jobid,
String.format("Process %5s", p.getPID()),
@@ -786,7 +794,7 @@ public class JobManagerConverter
);
} else {
if ( (pr.getPID() == null) && (pl.getPID() != null) ) {
- logger.trace(methodName, jobid,
+ logger.trace(methodName, jobid,
String.format("Process %5s", pl.getPID()),
"PID assignement for share", shareL);
}
@@ -826,11 +834,19 @@ public class JobManagerConverter
// logger.debug(methodName, jobid, "Process update to process ", pid, "mem", mem, "state", state, "is assigned for share", s.toString());
} else if ( pl.isComplete() ) {
+ IRmJob j = scheduler.getJob(jobid);
if ( s != null ) { // in some final states the share is already gone, not an error (e.g. Stopped)
- IRmJob j = scheduler.getJob(jobid);
- scheduler.signalCompletion(j, s);
+ scheduler.signalCompletion(j, s); // signal the **process** (not job) is complete
logger.info(methodName, jobid, "Process", pl.getPID(), " completed due to state", state);
}
+
+ switch ( l.getDuccType() ) { // UIMA-4326, if not a job-job, the job must not get reallocations
+ case Job:
+ break;
+ default:
+ j.markComplete();
+ }
+
} else {
logger.info(methodName, jobid, "Process", pl.getPID(), "ignoring update because of state", state);
}
Modified: uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/NodePool.java
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/NodePool.java?rev=1672463&r1=1672462&r2=1672463&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/NodePool.java (original)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/NodePool.java Thu Apr 9 19:13:16 2015
@@ -54,6 +54,9 @@ class NodePool
HashMap<Node, Machine> offlineMachines = new HashMap<Node, Machine>();
HashMap<Integer, HashMap<Node, Machine>> machinesByOrder = new HashMap<Integer, HashMap<Node, Machine>>(); // All schedulable machines, not necessarily free
HashMap<String, Machine> machinesByName = new HashMap<String, Machine>(); // by name, for nodepool support
+ HashMap<String, Machine> deadByName = new HashMap<String, Machine>(); // anything we move to offline or unresponsive,
+ // kept under the same name we used for it, because
+ // the domain part of the name sometimes gets in the way
HashMap<String, Machine> machinesByIp = new HashMap<String, Machine>(); // by IP, for nodepool support
HashMap<Share, Share> allShares = new HashMap<Share, Share>();
@@ -1004,6 +1007,22 @@ class NodePool
break;
}
+ switch ( j.getDuccType() ) {
+ case Reservation:
+ // UIMA-3614. Only actual reservation is left intact
+ logger.info(methodName, null, "Nodepool:", id, "Host dead/offline:", m.getId(), "Not purging", j.getDuccType());
+ break;
+
+ case Service:
+ case Pop:
+ j.markComplete(); // UIMA-4327 Must avoid reallocation, these guys are toast if they get purged.
+ logger.info(methodName, null, "Nodepool:", id, "Host dead/offline:", m.getId(), "Mark service/pop completed.");
+ // NO BREAK, must fall through
+ case Job:
+ default:
+ break;
+ }
+
logger.info(methodName, j.getId(), "Nodepool:", id, "Purge", j.getDuccType(), "on dead/offline:", m.getId());
j.shrinkByOne(s);
nPendingByOrder[order]++;
@@ -1041,7 +1060,17 @@ class NodePool
// name? see resolve() in Scheduler.java.
boolean hasNode(String n)
{
- return machinesByName.containsKey(n);
+ if ( machinesByName.containsKey(n) ) return true;
+
+ // If not we have to search the offline machines and the unresponsive machines which are
+ // keyed differently. This is really ugly but hard to fix at this point, so cope.
+ for ( Node node : offlineMachines.keySet() ) {
+ if ( node.getNodeIdentity().getName().equals(n) ) return true;
+ }
+ for ( Node node : unresponsiveMachines.keySet() ) {
+ if ( node.getNodeIdentity().getName().equals(n) ) return true;
+ }
+ return false;
}
String varyoff(String node)
Modified: uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/NodepoolScheduler.java
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/NodepoolScheduler.java?rev=1672463&r1=1672462&r2=1672463&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/NodepoolScheduler.java (original)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/NodepoolScheduler.java Thu Apr 9 19:13:16 2015
@@ -1333,6 +1333,10 @@ public class NodepoolScheduler
logger.info(methodName, j.getId(), "Counting shares for", j.getShortType() + "." + j.getId(), "in class", rc.getName());
NodePool np = rc.getNodepool();
+ if ( j.isCompleted() ) {
+ return;
+ }
+
if ( j.countNShares() > 0 ) { // only 1 allowed, UIMA-4275
// already accounted for as well, since it is a non-preemptable share
logger.info(methodName, j.getId(), "[stable]", "assigned", j.countNShares(), "processes, ",
@@ -1395,6 +1399,10 @@ public class NodepoolScheduler
continue;
}
+ if ( j.isCompleted() ) { // UIMA-4327 - reinstated, if this gets set we aren't allowed to expand any more
+ continue;
+ }
+
int order = j.getShareOrder();
int count = j.countNSharesGiven();
@@ -1571,6 +1579,10 @@ public class NodepoolScheduler
continue;
}
+ if ( j.isCompleted() ) { // UIMA-4327 - reinstated, if this gets set we aren't allowed to expand any more
+ continue;
+ }
+
try {
np.findMachines(j, rc);
} catch (Exception e) {
Modified: uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/RmJob.java
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/RmJob.java?rev=1672463&r1=1672462&r2=1672463&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/RmJob.java (original)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/RmJob.java Thu Apr 9 19:13:16 2015
@@ -180,11 +180,8 @@ public class RmJob
}
/**
- * For non-preemptable, remember max alloc has been reached, so we don't try to expand if
- * used for a job-job.
- *
- * For preemptable, must remember the job completed for defrag, because it could stick
- * around a while after completion.
+ * For preemptable, must remember the job completed so we don't accidentally re-expand it. This can
+ * happen in defrag and maybe in various races with OR state.
*/
public void markComplete()
{