You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by ch...@apache.org on 2015/04/09 21:13:16 UTC

svn commit: r1672463 - in /uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm: JobManagerConverter.java scheduler/NodePool.java scheduler/NodepoolScheduler.java scheduler/RmJob.java

Author: challngr
Date: Thu Apr  9 19:13:16 2015
New Revision: 1672463

URL: http://svn.apache.org/r1672463
Log:
UIMA-4327 Don't reallocate for failed AP or Service.

Modified:
    uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/JobManagerConverter.java
    uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/NodePool.java
    uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/NodepoolScheduler.java
    uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/RmJob.java

Modified: uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/JobManagerConverter.java
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/JobManagerConverter.java?rev=1672463&r1=1672462&r2=1672463&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/JobManagerConverter.java (original)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/JobManagerConverter.java Thu Apr  9 19:13:16 2015
@@ -743,6 +743,14 @@ public class JobManagerConverter
                 throw new SchedulingException(jobid, "Process completion arrives for share " + s.toString() +
                                               " but job " + jobid + "cannot be found.");
             }
+
+            switch ( l.getDuccType() ) {        // UIMA-4327, if not a job-job, the job must not get reallocations
+                case Job:
+                    break;
+                default:
+                    j.markComplete();
+            }
+
             scheduler.signalCompletion(j, s);
             logger.info(methodName, jobid, 
                          String.format("Process %5s", p.getPID()),
@@ -786,7 +794,7 @@ public class JobManagerConverter
                              );
             } else {
                 if ( (pr.getPID() == null) && (pl.getPID() != null) ) {
-                    logger.trace(methodName, jobid, 
+                      logger.trace(methodName, jobid, 
                                 String.format("Process %5s", pl.getPID()),
                                 "PID assignement for share", shareL);
                 }
@@ -826,11 +834,19 @@ public class JobManagerConverter
                 // logger.debug(methodName, jobid, "Process update to process ", pid, "mem", mem, "state", state, "is assigned for share", s.toString());
 
             } else if ( pl.isComplete() ) {
+                IRmJob j = scheduler.getJob(jobid);
                 if ( s != null ) {              // in some final states the share is already gone, not an error (e.g. Stopped)
-                    IRmJob j = scheduler.getJob(jobid);
-                    scheduler.signalCompletion(j, s);
+                    scheduler.signalCompletion(j, s);          // signal the **process** (not job) is complete
                     logger.info(methodName, jobid, "Process", pl.getPID(), " completed due to state", state);
                 }
+
+                switch ( l.getDuccType() ) {        // UIMA-4327, if not a job-job, the job must not get reallocations
+                    case Job:
+                        break;
+                    default:
+                        j.markComplete();
+                }
+                
             } else {
                 logger.info(methodName, jobid, "Process", pl.getPID(), "ignoring update because of state", state);
             }

Modified: uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/NodePool.java
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/NodePool.java?rev=1672463&r1=1672462&r2=1672463&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/NodePool.java (original)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/NodePool.java Thu Apr  9 19:13:16 2015
@@ -54,6 +54,9 @@ class NodePool
     HashMap<Node, Machine> offlineMachines                   = new HashMap<Node, Machine>();
     HashMap<Integer, HashMap<Node, Machine>> machinesByOrder = new HashMap<Integer, HashMap<Node, Machine>>(); // All schedulable machines, not necessarily free
     HashMap<String, Machine>                 machinesByName  = new HashMap<String, Machine>();                 // by name, for nodepool support
+    HashMap<String, Machine>                 deadByName      = new HashMap<String, Machine>();                 // anything we move to offline or unresponsive,
+                                                                                                               // but with the same name we used, because
+                                                                                                               // sometimes stupid domain gets in the way
     HashMap<String, Machine>                 machinesByIp    = new HashMap<String, Machine>();                 // by IP, for nodepool support
 
     HashMap<Share, Share>                    allShares       = new HashMap<Share, Share>();
@@ -1004,6 +1007,22 @@ class NodePool
                     break;
                 }
 
+                switch ( j.getDuccType() ) {
+                    case Reservation:
+                    // UIMA-3614.  Only actual reservation is left intact
+                    logger.info(methodName, null, "Nodepool:", id, "Host dead/offline:", m.getId(), "Not purging", j.getDuccType());
+                    break;
+
+                    case Service:                        
+                    case Pop:
+                        j.markComplete();      // UIMA-4327 Must avoid reallocation, these guys are toast if they get purged.
+                        logger.info(methodName, null, "Nodepool:", id, "Host dead/offline:", m.getId(), "Mark service/pop completed.");
+                        // NO BREAK, must fall through
+                    case Job:
+                    default:
+                        break;
+                }
+
                 logger.info(methodName, j.getId(), "Nodepool:", id, "Purge", j.getDuccType(), "on dead/offline:", m.getId());
                 j.shrinkByOne(s);
                 nPendingByOrder[order]++;
@@ -1041,7 +1060,17 @@ class NodePool
     // name?  see resolve() in Scheduler.java.
     boolean hasNode(String n)
     {
-        return machinesByName.containsKey(n);
+        if ( machinesByName.containsKey(n) ) return true;
+
+        // If not, we have to search the offline machines and the unresponsive machines, which are
+        // keyed by Node rather than by name.  This is really ugly but hard to fix at this point, so cope.
+        for ( Node node : offlineMachines.keySet() ) {
+            if ( node.getNodeIdentity().getName().equals(n) ) return true;
+        }
+        for ( Node node : unresponsiveMachines.keySet() ) {
+            if ( node.getNodeIdentity().getName().equals(n) ) return true;
+        }
+        return false;
     }
 
     String varyoff(String node)

Modified: uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/NodepoolScheduler.java
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/NodepoolScheduler.java?rev=1672463&r1=1672462&r2=1672463&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/NodepoolScheduler.java (original)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/NodepoolScheduler.java Thu Apr  9 19:13:16 2015
@@ -1333,6 +1333,10 @@ public class NodepoolScheduler
         logger.info(methodName, j.getId(), "Counting shares for", j.getShortType() + "." + j.getId(), "in class", rc.getName());
         NodePool np = rc.getNodepool();
 
+        if ( j.isCompleted() ) {
+            return;
+        }
+
         if ( j.countNShares() > 0 ) {                  // only 1 allowed, UIMA-4275
             // already accounted for as well, since it is a non-preemptable share
             logger.info(methodName, j.getId(), "[stable]", "assigned", j.countNShares(), "processes, ", 
@@ -1395,6 +1399,10 @@ public class NodepoolScheduler
                     continue;
                 }
 
+                if ( j.isCompleted() ) {                    // UIMA-4327 - reinstated, if this gets set we aren't allowed to expand any more
+                    continue;
+                }
+
                 int order = j.getShareOrder();
                 int count = j.countNSharesGiven();
 
@@ -1571,6 +1579,10 @@ public class NodepoolScheduler
                     continue;
                 }
 
+                if ( j.isCompleted() ) {                 // UIMA-4327 - reinstated, if this gets set we aren't allowed to expand any more
+                    continue;
+                }
+
                 try {
                     np.findMachines(j, rc);
                 } catch (Exception e) {

Modified: uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/RmJob.java
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/RmJob.java?rev=1672463&r1=1672462&r2=1672463&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/RmJob.java (original)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/RmJob.java Thu Apr  9 19:13:16 2015
@@ -180,11 +180,8 @@ public class RmJob
     }
 
     /**
-     * For non-preemptable, remember max alloc has been reached, so we don't try to expand if 
-     * used for a job-job.
-     * 
-     * For preemptable, must remember the job completed for defrag, because it could stick 
-     * around a while after completion.
+     * For preemptable, must remember the job completed so we don't accidentally reexpand it.  Can
+     * happen in defrag and maybe various races with OR state.
      */
     public void markComplete()
     {