Posted to commits@uima.apache.org by ch...@apache.org on 2015/02/13 21:55:46 UTC

svn commit: r1659681 - in /uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler: Machine.java NodepoolScheduler.java

Author: challngr
Date: Fri Feb 13 20:55:45 2015
New Revision: 1659681

URL: http://svn.apache.org/r1659681
Log:
UIMA-4142 Defrag updates.

Modified:
    uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/Machine.java
    uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/NodepoolScheduler.java

Modified: uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/Machine.java
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/Machine.java?rev=1659681&r1=1659680&r2=1659681&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/Machine.java (original)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/Machine.java Fri Feb 13 20:55:45 2015
@@ -48,9 +48,14 @@ public class Machine
     //    - virtual_share_order is reset to share_order at the start of every scheduling cycle.  It
     //      represents the *potential* shares in this machine.  As a rule, once we give out shares on
     //      this machine we'll try to not move them around but eviction happens, and this helps us
-    //      keep track of what we *could* give away on this machine.
+    //      keep track of what we *could* give away on this machine.  It represents the logical capacity
+    //      of the machine, that is, true capacity, less shares given to orchestrator, less shares that
+    //      we might be giving away this scheduling cycle.
+    //
     //    - shares_left tracks exactly the number of shares that are physically available to give away
-    //      without preemption.
+    //      without preemption. This is updated when a share is assigned, or when it is returned. It
+    //      represents the true capacity of the machine at this moment, less the shares that have been
+    //      given to the orchestrator.
     //
     // Throughout much of the scheduling cycle these guys will tend to track each other, and at the end
     // of a cycle they should probably be the same, but they may diverge if shares are given out that we

Modified: uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/NodepoolScheduler.java
URL: http://svn.apache.org/viewvc/uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/NodepoolScheduler.java?rev=1659681&r1=1659680&r2=1659681&view=diff
==============================================================================
--- uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/NodepoolScheduler.java (original)
+++ uima/sandbox/uima-ducc/trunk/uima-ducc-rm/src/main/java/org/apache/uima/ducc/rm/scheduler/NodepoolScheduler.java Fri Feb 13 20:55:45 2015
@@ -20,6 +20,7 @@ package org.apache.uima.ducc.rm.schedule
 
 import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collection;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashMap;
@@ -1485,7 +1486,7 @@ public class NodepoolScheduler
                     continue;
                 }
 
-                if ( j.isCompleted() ) {                    // UIMA-3614 - may have been purged, don't give it more
+                if ( j.isCompleted() ) {                 // UIMA-3614 - may have been purged, don't give it more
                     continue;
                 }
 
@@ -1656,16 +1657,29 @@ public class NodepoolScheduler
         return np.containsMachine(m);           // can we get to the candidate share from 'needy's np?
     }
 
+    // /**
+    //  * Discover whether the potential job is able or unable to supply shares to a needy job because of nodepool restrictions.
+    //  */
+    // boolean compatibleNodepools(IRmJob potential, IRmJob needy)
+    // {
+    //     ResourceClass prc = potential.getResourceClass();
+    //     ResourceClass nrc = needy.getResourceClass();
+
+    //     NodePool pp = prc.getNodepool();
+    //     NodePool np = nrc.getNodepool();
+
+    //     return np.containsSubpool(pp) || pp.containsSubpool(np);
+    // }
+
     /**
-     * Discover whether the potential job is able or unable to supply shares to a needy job because of nodepool restrictions.
+     * Discover whether the potential resource class is able or unable to supply shares to jobs in a needy class because of nodepool restrictions.
      */
-    boolean compatibleNodepools(IRmJob potential, IRmJob needy)
+    boolean compatibleNodepools(ResourceClass potential, IRmJob needy)
     {
-        ResourceClass prc = potential.getResourceClass();
         ResourceClass nrc = needy.getResourceClass();
 
+        NodePool pp = potential.getNodepool();
         NodePool np = nrc.getNodepool();
-        NodePool pp = prc.getNodepool();
 
         return np.containsSubpool(pp) || pp.containsSubpool(np);
     }
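The compatibility test above says two resource classes can trade shares only when one class's nodepool contains the other, i.e. they lie on the same branch of the pool tree. A self-contained sketch of that idea, using a hypothetical Pool type rather than the DUCC NodePool API:

    import java.util.ArrayList;
    import java.util.List;

    class Pool {
        final String name;
        final List<Pool> children = new ArrayList<Pool>();

        Pool(String name) { this.name = name; }

        Pool addChild(Pool p) { children.add(p); return p; }

        // true if 'other' is this pool or any descendant of it
        boolean containsSubpool(Pool other) {
            if (other == this) return true;
            for (Pool c : children) {
                if (c.containsSubpool(other)) return true;
            }
            return false;
        }

        // mirrors the check above: compatible iff either pool contains the other
        static boolean compatible(Pool a, Pool b) {
            return a.containsSubpool(b) || b.containsSubpool(a);
        }

        public static void main(String[] args) {
            Pool global = new Pool("global");
            Pool left   = global.addChild(new Pool("left"));
            Pool right  = global.addChild(new Pool("right"));
            Pool leaf   = left.addChild(new Pool("left-child"));

            System.out.println(compatible(left, leaf));    // true  - same branch of the tree
            System.out.println(compatible(left, right));   // false - sibling pools cannot donate
            System.out.println(compatible(global, right)); // true  - global contains everything
        }
    }
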
@@ -1741,9 +1755,10 @@ public class NodepoolScheduler
      *         evicted a process smaller than is needed, because there was already some free space on
      *         the machine.
      */
-    int takeFromTheRich(IRmJob nj, int needed,
-                            TreeMap<User, User> users_by_wealth,
-                            HashMap<User, TreeMap<IRmJob, IRmJob>> jobs_by_user)
+    int takeFromTheRich(IRmJob nj, 
+                        int needed,
+                        TreeMap<User, User> users_by_wealth,
+                        HashMap<User, TreeMap<IRmJob, IRmJob>> jobs_by_user)
     {
     	String methodName = "takeFromTheRich";
         // 1. Collect all machines that have shares, which if evicted, would make enough space
@@ -1760,8 +1775,7 @@ public class NodepoolScheduler
         //    a) have given what is needed
         //    b) nothing left to give
 
-        // Map<Share, Share>     exemptShares = new HashMap<Share, Share>(); // not eligible for various reasons
-        Map<IRmJob, IRmJob>   candidateJobs = new HashMap<IRmJob, IRmJob>();
+        Map<IRmJob,  IRmJob>  candidateJobs    = new HashMap<IRmJob,  IRmJob>();
         Map<Machine, Machine> eligibleMachines = new TreeMap<Machine, Machine>(new EligibleMachineSorter());
 
         for ( TreeMap<IRmJob, IRmJob> jobs : jobs_by_user.values() ) {
@@ -1771,45 +1785,61 @@ public class NodepoolScheduler
         int given = 0;
         int orderNeeded = nj.getShareOrder();
         
-        ResourceClass cl = nj.getResourceClass();
-        String npname = cl.getNodepoolName();
-        NodePool np = globalNodepool.getSubpool(npname);
+        ResourceClass cl     = nj.getResourceClass();               // needy job's resource class
+        String        npname = cl.getNodepoolName();                // name of the class
+        NodePool      np     = globalNodepool.getSubpool(npname);   // job's nodepool
         Map<Node, Machine> machines = np.getAllMachines();          // everything here is a candidate, nothing else is
+                                                                    //   this is the machines in the pool, and all the
+                                                                    //   subpools
 
-        for ( Machine m : machines.values() ) {
-            if ( m.getShareOrder() < orderNeeded ) {
-                logger.trace(methodName, nj.getId(), "Bypass ", m.getId(), ": too small for request of order", orderNeeded); 
-                logger.info(methodName, nj.getId(), "Bypass ", m.getId(), ": too small for request of order", orderNeeded); 
-                continue;
-            }
+        // Here we filter the machines, looking for those that *might* be able to satisfy the defrag.  At the
+        // end this set of machines is eligibleMachines.
+        machine_loop : 
+            for ( Machine m : machines.values() ) {
 
-            // if the job is a reservation the machine size has to match
-            if ( nj.isReservation() && ( m.getShareOrder() != orderNeeded )) {
-                logger.trace(methodName, nj.getId(), "Bypass ", m.getId(), ": reservation requires exact match for order", orderNeeded);
-                logger.info(methodName, nj.getId(), "Bypass ", m.getId(), ": reservation requires exact match for order", orderNeeded);
-                continue;
-            }
+                if ( m.getShareOrder() < orderNeeded ) {                // nope, too small
+                    logger.trace(methodName, nj.getId(), "Bypass ", m.getId(), ": too small for request of order", orderNeeded); 
+                    continue;
+                }
+
+                // if the job is a reservation the machine size has to match, and the machine must be clearable
+                if ( nj.isReservation() ) {
+                    if ( m.getShareOrder() != orderNeeded ) {
+                        logger.trace(methodName, nj.getId(), "Bypass ", m.getId(), ": reservation requires exact match for order", orderNeeded);
+                        continue;
+                    }
+                    // machine must be clearable as well
+                    Collection<Share> shares = m.getActiveShares().values();
+                    for ( Share s : shares ) {
+                        if ( ! candidateJobs.containsKey(s.getJob()) ) {
+                            logger.trace(methodName, nj.getId(), "Bypass ", m.getId(), ": for reservation, machine contains non-candidate job", s.getJob().getId());
+                            continue machine_loop;
+                        }
+                    }
+                
+                }
 
-            Map<Share, Share> as = m.getActiveShares();
-            int g = m.getVirtualShareOrder();
-            for ( Share s : as.values() ) {
-                IRmJob j = s.getJob();
-                if ( s.isForceable() && candidateJobs.containsKey(j) ) {
-                    g += j.getShareOrder();
+                Map<Share, Share> as = m.getActiveShares();            // everything allocated here
+                int g = m.getVirtualShareOrder();                      // g is space that we might be able to make after defrag:
+                //    free space + freeable-from-candidates
+                for ( Share s : as.values() ) {
+                    IRmJob j = s.getJob();
+                    if ( s.isForceable() && candidateJobs.containsKey(j) ) {  // evictable, and a candidate for reclamation by defrag
+                        g += j.getShareOrder();
+                    }
+                }
+
+                if ( g >= orderNeeded ) {                              // if it's usable by the job, it's a candidate
+                    logger.info(methodName, nj.getId(), "Candidate machine:", m.getId());
+                    eligibleMachines.put(m, m);
+                } else {
+                    logger.info(methodName, nj.getId(), "Not a candidate, insufficient free space + candidate shares:", m.getId());
                 }
             }
-            if ( g >= orderNeeded ) {
-                logger.trace(methodName, nj.getId(), "Candidate machine:", m.getId());
-                logger.info(methodName, nj.getId(), "Candidate machine:", m.getId());
-                eligibleMachines.put(m, m);
-            } else {
-                // (a) the share is not forceable (non-preemptbable, or already being removed), or
-                // (b) the share is not owned by a rich job
-                logger.trace(methodName, nj.getId(), "Not a candidate, insufficient rich jobs:", m.getId());
-                logger.info(methodName, nj.getId(), "Not a candidate, insufficient rich jobs:", m.getId());
-            }
-        }
+        
+        // Now eligibleMachines is the set of candidate machines for defrag
 
+        // All-or-nothing policy, can we satisfy the reservation with defrag?  If not, we're done.
         if ( nj.isReservation() && ( eligibleMachines.size() < needed ) ) {
             // if we can't clear enough for the reservation we have to wait.  Very unlikely, but not impossible.
             logger.info(methodName, nj.getId(), "Found insufficient machines (", eligibleMachines.size(), "for reservation. Not clearing.");
@@ -1824,7 +1854,7 @@ public class NodepoolScheduler
             }
         logger.info(methodName, nj.getId(), "Eligible machines:", buf.toString());
 
-        // first part done
+        // first part done, we know where to look.
 
         // Now just bop through the machines until either we can't find anything, or we find everything.
         int given_per_round = 0;
@@ -1846,15 +1876,14 @@ public class NodepoolScheduler
                 sh.addAll(m.getActiveShares().values());
                 Collections.sort(sh, new ShareByWealthSorter());
 
-                g = m.getVirtualShareOrder();
+                g = m.getVirtualShareOrder();         // ( free space at this point )
                 List<Share> potentialShares     = new ArrayList<Share>();
                 for ( Share s : sh ) {
                     IRmJob j = s.getJob();
                     User u = j.getUser();
                     
                     if ( s.isForceable() ) {
-                        TreeMap<IRmJob, IRmJob> potentialJobs = jobs_by_user.get(u);
-                        if ( (potentialJobs != null) && ( potentialJobs.containsKey(j) ) ) {
+                        if ( candidateJobs.containsKey(j) ) {
                             g += s.getShareOrder();
                             if ( s.getShareOrder() == orderNeeded ) {
                                 potentialShares.add(0, s);    // exact matches first
@@ -1897,7 +1926,7 @@ public class NodepoolScheduler
                     eligibleMachines.put(m, m);
                 }
 
-                // and also must track how many processes we ma made space for
+                // and also must track how many processes we made space for
                given = given + (g / orderNeeded);    // at least one, or else we have a bug 
                 logger.debug(methodName, nj.getId(), "LOOPEND: given[", given, "] g[", g, "] orderNeeded[", orderNeeded, "]");
             }
@@ -1916,113 +1945,170 @@ public class NodepoolScheduler
         }
 
         //
-        // Put candidate donors into a map, ordered by "most able to be generous".
-        // Candidates must not be needy, must be initialized already, be in compatible nodepools, and have sufficient shares to give.
+        // Search for candidate donors and order by "most able to be generous".  Nodepools must be compatible.
+        //
+        // If the priority of the needy job is the same or better, the candidates must not be needy, must be initialized already,
+        //     and have sufficient shares to give.
+        // 
+        // If the priority of the needy job is better, we keep track of the rich vs the poor jobs and possibly perform a second
+        //     pass that includes poor jobs, if we can't get enough from the rich.
         //
-
         for ( IRmJob nj : needy.keySet() ) {
-            TreeMap<IRmJob, IRmJob> candidates = new TreeMap<IRmJob, IRmJob>(new FragmentationSorter());
+            int priority_needy = nj.getSchedulingPriority();
+            TreeMap<IRmJob, IRmJob> rich_candidates = new TreeMap<IRmJob, IRmJob>(new FragmentationSorter());  // first class candidates, they're rich and available
+            TreeMap<IRmJob, IRmJob> poor_candidates = new TreeMap<IRmJob, IRmJob>(new FragmentationSorter());  // clearing for better priority job, we only use this if it's
+                                                                                                               // impossible to clear from the rich candidates
+
             for ( ResourceClass rc : resourceClasses.values() ) {
                 
                 if ( rc.getPolicy() == Policy.RESERVE )     continue;          // exempt from preemption
                 if ( rc.getPolicy() == Policy.FIXED_SHARE ) continue;          // exempt from preemption
 
+                if ( ! compatibleNodepools(rc, nj) ) {
+                    logger.debug(methodName, nj.getId(), "Skipping class", rc.getName(), "vs job class", nj.getResourceClass().getName(), "because of incompatible nodepools.");
+                    continue;
+                }
+
+                int priority_candidate = rc.getPriority();
+                boolean use_expanded_pool = false;      // a better-priority job is allowed to look at poor jobs if it can't be satisfied from the rich
+
+                if ( priority_needy > priority_candidate ) {  // Greater means worse 
+                    logger.debug(methodName, nj.getId(), "Jobs in class", rc.getName(), "are not candidates because better priority: [", 
+                                 priority_candidate, "vs", priority_needy, "]");
+                    continue;
+                }
+
+                if ( priority_needy < priority_candidate ) {   // less means better
+                    logger.debug(methodName, nj.getId(), "Needy job has better priority than jobs in class", rc.getName(), "[", 
+                                 priority_candidate, "vs", priority_needy, "]. Using expanded pool.");
+                    use_expanded_pool = true;
+                }
+
                 HashMap<IRmJob, IRmJob> jobs = rc.getAllJobs();
                 for ( IRmJob j : jobs.values() ) {
                     int nshares = j.countNShares();
                     int qshares = nshares * j.getShareOrder();
 
-                    if ( nj.isReservation() && (nj.getSchedulingPriority() <= j.getSchedulingPriority()) ) {
-                        if ( nshares == 0 ) {
-                            logger.debug(methodName, nj.getId(), "Job", j.getId(), "is not a candidate because it has no share.");
-                            continue;
-                        } 
-                        // We could end up evictin really needy stuff - hopefully not, but these guys are Top Men so there.
-                        logger.debug(methodName, nj.getId(), "Reservation priority override on candidate selection.");
-                    } else {
-                        if ( needy.containsKey(j) ) {                            // if needy it's not a candidate
-                            logger.debug(methodName, nj.getId(), "Job", j.getId(), "is not a candidate because it's needy.");
-                            continue;
+                    if ( nshares == 0 ) {
+                        logger.debug(methodName, nj.getId(), "Job", j.getId(), "is not a candidate because it has no share.");
+                        continue;
+                    } 
+
+                    if ( needy.containsKey(j) ) {
+                        if ( use_expanded_pool ) {
+                            logger.debug(methodName, nj.getId(), "Job", j.getId(), "is a backup candidate because it's needy.");
+                            poor_candidates.put(j, j);
+                        } else {
+                            logger.debug(methodName, nj.getId(), "Job", j.getId(), "is a not a candidate because it's needy.");
                         }
-                        
-                        if ( ! j.isInitialized() ) {
+                        continue;
+                    }
+                    
+                    if ( ! j.isInitialized() ) {
+                        if ( use_expanded_pool ) {
+                            logger.debug(methodName, nj.getId(), "Job", j.getId(), "is a backup candidate because it's not initialized yet.");
+                            poor_candidates.put(j, j);
+                        } else {
                             logger.debug(methodName, nj.getId(), "Job", j.getId(), "is not a candidate because it's not initialized yet.");
-                            continue;                                            // if not initialized its not a candidate
-                        }
-                        
-                        //
-                        // Need at least one potential candidate of worse or equal priority
-                        //
-                        
-                        if ( j.getSchedulingPriority() < nj.getSchedulingPriority() ) {
-                            logger.debug(methodName, nj.getId(), "Job", j.getId(), "is not a candidate because it has better priority.");
-                            continue;
-                        }
-                        
-                        if ( ! compatibleNodepools(j, nj) ) {
-                            logger.debug(methodName, nj.getId(), "Job", j.getId(), "is not a candidate because of incompatible nodepools.");
-                            continue;
-                        }
-                        
-                        if ( nshares < fragmentationThreshold ) {
-                            // If you're already below the threshold then you're safe, unless we're clearing for a reservation.
-                            logger.debug(methodName, nj.getId(), "Job", j.getId(), "is not a candidate because not enough processes[", nshares, "] qshares[", qshares, "]");
-                            continue;
                         }
+                        continue;
                     }
                     
+                    if ( nshares < fragmentationThreshold ) {
+                        if ( use_expanded_pool ) {
+                            logger.debug(methodName, nj.getId(), "Job", j.getId(), "is a backup candidate because below frag threshold. nshares[", nshares, "] qshares[", qshares, "] threshold[", fragmentationThreshold, "]");
+                            poor_candidates.put(j, j);
+                        } else {
+                            logger.debug(methodName, nj.getId(), "Job", j.getId(), "is not a candidate because below frag threshold. nshares[", nshares, "] qshares[", qshares, "] threshold[", fragmentationThreshold, "]");
+                        }
+                        continue;
+                    }
+
                     logger.debug(methodName, nj.getId(), "Job", j.getId(), "is a candidate with processes[", nshares, "] qshares[", qshares, "]");
-                    candidates.put(j, j);
+                    rich_candidates.put(j, j);
                 }
             }
 
-            //
-            // Collect total wealth and order the wealthy by spondulix
-            //
-            HashMap<User, Integer> shares_by_user = new HashMap<User, Integer>();                                // use this to track user's wealth
-            HashMap<User, TreeMap<IRmJob, IRmJob>> jobs_by_user = new HashMap<User, TreeMap<IRmJob, IRmJob>>();  // use this to track where the wealth originates
-            for ( IRmJob j : candidates.values() ) {
-                User u = j.getUser();
-                
-                if ( shares_by_user.get(u) == null ) {
-                    shares_by_user.put(u, 0);
-                }
-                shares_by_user.put(u, shares_by_user.get(u) + (j.countNShares() * j.getShareOrder()));
-                
-                TreeMap<IRmJob, IRmJob> ujobs = jobs_by_user.get(u);
-                if ( ujobs == null ) {
-                    ujobs = new TreeMap<IRmJob, IRmJob>(new JobByShareSorter()); // orders by largest number of assigned shares
-                    jobs_by_user.put(u, ujobs);
-                }
-                ujobs.put(j, j);
-            }
+            HashMap<User, TreeMap<IRmJob, IRmJob>> jobs_by_user = new HashMap<User, TreeMap<IRmJob, IRmJob>>();  // use this to track where the wealth originates
             TreeMap<User, User> users_by_wealth = new TreeMap<User, User>(new UserByWealthSorter()); // orders users by wealth
-                                                                                                     // and tracks their fat jobs
-            for ( User u : shares_by_user.keySet() ) {
-                u.setShareWealth(shares_by_user.get(u));       // qshares
-                users_by_wealth.put(u, u);
-            }
 
-            //
-            // Try stealing shares from 'users_by_wealth' until the needy
-            // job has met its fragmentation threshold, or until we decide its impossible to do so.
-            //
+            collectWealth(rich_candidates, users_by_wealth, jobs_by_user);
 
             int needed = needy.get(nj);      // this was adjusted to a reasonable level in the caller
             logger.debug(methodName, nj.getId(), "Needy job looking for", needed, "more processes of O[", nj.getShareOrder(), "]");
 
-            // while ( ( needed > 0 ) && takeFromTheRichX(nj, users_by_wealth, jobs_by_user) ) {
+            //
+            // Try stealing shares from the "rich" candidates first.
+            //
             needed -= takeFromTheRich(nj, needed, users_by_wealth, jobs_by_user);
             if ( needed <= 0 ) {
                 // This can go <0 if total space freed + unused space on a node adds up to >1 share.
                // It's simplest to just not sweat it and call it satisfied.
                 logger.info(methodName, nj.getId(), "Satisfied needs of job by taking from the rich.");
-            } else {
-                logger.info(methodName, nj.getId(), "Could not get enough from the rich. Asked for", needy.get(nj), "still needing", needed);
+                continue;
+            }
+
+            //
+            // The needy job had sufficient priority that we built up a list of emergency-backup jobs to evict.
+            //
+            if ( poor_candidates.size() > 0) {
+                logger.info(methodName, nj.getId(), "Could not clear sufficient space from rich candidates.  Retrying with all candidates.");
+                jobs_by_user.clear();
+                users_by_wealth.clear();
+                rich_candidates.putAll(poor_candidates);
+                collectWealth(rich_candidates, users_by_wealth, jobs_by_user);
+
+                needed -= takeFromTheRich(nj, needed, users_by_wealth, jobs_by_user);
+                if ( needed <= 0 ) {
+                    // This can go <0 if total space freed + unused space on a node adds up to >1 share.
+                    // It's simplest to just not sweat it and call it satisfied.
+                    logger.info(methodName, nj.getId(), "Satisfied needs of job by taking from all candidates.");
+                    continue;
+                }
+            }
+            logger.info(methodName, nj.getId(), "Could not get enough from the rich. Asked for", needy.get(nj), "still needing", needed);
+        }
+    }
+
+    
+    void collectWealth(TreeMap<IRmJob, IRmJob> candidates, TreeMap<User, User> users_by_wealth, HashMap<User, TreeMap<IRmJob, IRmJob>> jobs_by_user)
+    {
+        // Candidates are ordered by the FragmentationSorter
+        //   - most over pure fair share
+        //   - then most currently allocated
+
+        // user_by_wealth is ordered by the UserByWealthSorter
+        //   - ordered by most wealth - actual qshares over all jobs
+
+        //
+        // Collect total wealth and order the wealthy by spondulix
+        //
+        HashMap<User, Integer> shares_by_user = new HashMap<User, Integer>();                                // use this to track user's wealth
+        
+        for ( IRmJob j : candidates.values() ) {
+            User u = j.getUser();
+            
+            if ( shares_by_user.get(u) == null ) {
+                shares_by_user.put(u, 0);
+            }
+            shares_by_user.put(u, shares_by_user.get(u) + (j.countNShares() * j.getShareOrder()));
+            
+            TreeMap<IRmJob, IRmJob> ujobs = jobs_by_user.get(u);
+            if ( ujobs == null ) {
+                ujobs = new TreeMap<IRmJob, IRmJob>(new JobByShareSorter()); // orders by largest number of assigned shares
+                jobs_by_user.put(u, ujobs);
             }
+            ujobs.put(j, j);
+        }
+        
+        // set each user's total share wealth and order the users by it
+        for ( User u : shares_by_user.keySet() ) {
+            u.setShareWealth(shares_by_user.get(u));       // qshares
+            users_by_wealth.put(u, u);
         }
     }
 
+
     void getNodepools(NodePool top, List<NodePool> nodepools)
     {        
         for ( NodePool np : top.getChildren().values()) {
@@ -2106,7 +2192,9 @@ public class NodepoolScheduler
                 int counted = 0;
                 switch ( rc.getPolicy() ) {
                     case FAIR_SHARE:
-                        counted = j.countNSharesGiven();       // fair share allocation
+                        counted = j.countNSharesGiven();       // fair share allocation, accounting for
+                                                               // ramp-up, various caps, etc.  could be more, could be less than
+                                                               // the "pure" fair share.
                         break;
                     default:
                         counted = j.countInstances();          // fixed, all, or nothing
@@ -2120,7 +2208,9 @@ public class NodepoolScheduler
                 if ( j.getSchedulingPolicy() == Policy.FAIR_SHARE ) {   // cap on frag threshold
                     if ( current >= fragmentationThreshold ) { 
                         needed = 0;
-                    } else if ( needed < 0 ) {
+                    } else if ( current >= j.getPureFairShare() ) {     // more than our pure share, we're not needy
+                        needed = 0;
+                    } else if ( needed < 0 ) {                          // more than our count, likely we are evicting
                         needed = 0;
                     } else if ( needed > 0) {
                         needed = Math.min(needed, fragmentationThreshold);
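For fair-share jobs, the clamping above amounts to a small pure function over the job's current allocation, its pure fair share, and the fragmentation threshold. A sketch, with a hypothetical helper name and 'needed' being the shortfall computed earlier in the method:

    // Sketch of the fair-share clamping shown above (hypothetical helper, not DUCC code).
    static int clampNeed(int needed, int current, int pureFairShare, int fragmentationThreshold) {
        if (current >= fragmentationThreshold) return 0;   // already at or over the frag threshold
        if (current >= pureFairShare)          return 0;   // holding at least its pure share: not needy
        if (needed  <  0)                      return 0;   // over its count, likely evicting
        if (needed  >  0)                      return Math.min(needed, fragmentationThreshold);
        return 0;
    }

For example, a job currently holding 1 process, with a pure fair share of 4 and a shortfall of 5, would be recorded as needing only 2 when the fragmentation threshold is 2.
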
@@ -2219,15 +2309,7 @@ public class NodepoolScheduler
                 int available = nmach[order];
                 int to_remove = 0;
 
-                //if ( j.getSchedulingPolicy() == Policy.FAIR_SHARE ) {
-                   // Preference is given during expinsion in next cycle because usually, if
-                   // we Took From The Rich, we took from older jobs, which would normally
-                   // have priority for available resources.
-                   //
-                   // We don't need to include the non-preemptable jobs here, they're handled
-                   // well enough in their normal what-of code.
-                   needyJobs.put(j, j);
-                //}
+                needyJobs.put(j, j);
                 
                 if ( available >= needed ) {
                     needed = 0;
@@ -2563,7 +2645,7 @@ public class NodepoolScheduler
 
             // pure fair-share
             int p1 = j1.getPureFairShare();    // qshares
-            int p2 = j2.getPureFairShare();
+            int p2 = j2.getPureFairShare(); 
 
             // actual current allocation
             int c1 = j1.countNShares() * j1.getShareOrder();  // to qshares
@@ -2625,6 +2707,16 @@ public class NodepoolScheduler
 
     //
     // Sort machines for defrag.
+
+    // 1st choice: any machine with free space F and a candidate job j of order O(j) such
+    //    that F + O(j) == O(nj).
+    //    Tiebreaker on j is wealth W: W(j1) > W(j2).
+
+    // 2nd choice: any machine with a candidate job of the same order as the needy job.
+    //    Secondary sort: candidate job A is richer than candidate job B.
+    //
+    // Tiebreak between choices 1 and 2: W(j1) > W(j2) - choose the host whose job is richest.
+    //
     // a) machines with richest users first
     // b) largest machine second
     //
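The sorter code itself is not part of this diff; the last two criteria (richest resident user first, then largest machine) can be pictured with a minimal comparator, using assumed accessor names rather than the actual Machine/EligibleMachineSorter API:

    import java.util.Comparator;

    // Partial sketch of criteria (a) and (b) above: richer resident user first, then larger machine.
    // The accessor names on MachineInfo are assumed for illustration only.
    interface MachineInfo {
        int richestUserWealth();   // qshares held by the wealthiest user with shares on this machine
        int shareOrder();          // machine capacity in quantum shares
    }

    class MachineSorterSketch implements Comparator<MachineInfo> {
        public int compare(MachineInfo m1, MachineInfo m2) {
            int byWealth = Integer.compare(m2.richestUserWealth(), m1.richestUserWealth()); // richest first
            if (byWealth != 0) return byWealth;
            return Integer.compare(m2.shareOrder(), m1.shareOrder());                       // largest second
        }
    }
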