You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by sc...@apache.org on 2015/11/02 18:15:23 UTC

svn commit: r1712088 - in /uima/uimaj/branches/experiment-v3-jcas/uimaj-core/src: main/java/org/apache/uima/internal/util/IntHashSet.java test/java/org/apache/uima/internal/util/IntHashSetTest.java

Author: schor
Date: Mon Nov  2 17:15:23 2015
New Revision: 1712088

URL: http://svn.apache.org/viewvc?rev=1712088&view=rev
Log:
[UIMA-4682] reuse removed slots, add tests

Modified:
    uima/uimaj/branches/experiment-v3-jcas/uimaj-core/src/main/java/org/apache/uima/internal/util/IntHashSet.java
    uima/uimaj/branches/experiment-v3-jcas/uimaj-core/src/test/java/org/apache/uima/internal/util/IntHashSetTest.java

Modified: uima/uimaj/branches/experiment-v3-jcas/uimaj-core/src/main/java/org/apache/uima/internal/util/IntHashSet.java
URL: http://svn.apache.org/viewvc/uima/uimaj/branches/experiment-v3-jcas/uimaj-core/src/main/java/org/apache/uima/internal/util/IntHashSet.java?rev=1712088&r1=1712087&r2=1712088&view=diff
==============================================================================
--- uima/uimaj/branches/experiment-v3-jcas/uimaj-core/src/main/java/org/apache/uima/internal/util/IntHashSet.java (original)
+++ uima/uimaj/branches/experiment-v3-jcas/uimaj-core/src/main/java/org/apache/uima/internal/util/IntHashSet.java Mon Nov  2 17:15:23 2015
@@ -22,7 +22,7 @@ package org.apache.uima.internal.util;
 import java.util.Arrays;
 import java.util.NoSuchElementException;
 
-import org.apache.uima.jcas.impl.JCasHashMap;
+import org.apache.uima.util.Misc;
 
 /**
  * A set of non-zero ints. 
@@ -55,6 +55,10 @@ public class IntHashSet implements Posit
   
   public static final int SIZE_NEEDING_4_BYTES = 256 * 256 - 2;  
   public static final float DEFAULT_LOAD_FACTOR = 0.66F;
+  
+  private static final int REMOVED4 = Integer.MIN_VALUE;
+  private static final int REMOVED2 = Short.MIN_VALUE;
+  
   // set to true to collect statistics for tuning
   // you have to also put a call to showHistogram() at the end of the run
   private static final boolean TUNE = false;
@@ -79,6 +83,8 @@ public class IntHashSet implements Posit
   private int size; // number of elements in the table
   private int nbrRemoved; // number of removed elements (coded as removed)
   
+  private int removedPositionToUse = -1;
+  
   // offset only used with keys2.  values stored are key - offset
   //   intent is to have them fit into short data type.
   //   If the offset is 100,000, then the keys range from 100000 - 32766 to 100000 + 32767, includes "0"
@@ -227,7 +233,7 @@ public class IntHashSet implements Posit
         // switch to 4
         if (TUNE) {System.out.println("Switching to 4 byte keys, Capacity increasing from " + oldCapacity + " to " + newCapacity);}
         for (short adjKey : oldKeys) {
-          if (adjKey != 0 && adjKey != Short.MIN_VALUE) {
+          if (adjKey != 0 && adjKey != REMOVED2) {
             addInner4(getRawFromAdjKey(adjKey));
           }
         }
@@ -235,7 +241,7 @@ public class IntHashSet implements Posit
       } else {
         if (TUNE) {System.out.println("Keeping 2 byte keys, Capacity increasing from " + oldCapacity + " to " + newCapacity);}
         for (short adjKey : oldKeys) {
-          if (adjKey != 0 && adjKey != Short.MIN_VALUE) {
+          if (adjKey != 0 && adjKey != REMOVED2) {
             addInner2(adjKey);
           }
         }
@@ -246,7 +252,7 @@ public class IntHashSet implements Posit
       if (TUNE) {System.out.println("Capacity increasing from " + oldCapacity + " to " + newCapacity);}
       newTableKeepSize(newCapacity, true);
       for (int key : oldKeys) {
-        if (key != 0 && key != Integer.MIN_VALUE) {
+        if (key != 0 && key != REMOVED4) {
           addInner4(key);
         }
       }
@@ -321,25 +327,35 @@ public class IntHashSet implements Posit
   * @return the probeAddr in keys array - might have a 0 value, or the key value if found
   */
   private int findPosition(final int adjKey) {
+    return findPosition(adjKey, false);
+  } 
+  
+  private int findPosition(final int adjKey, final boolean includeRemoved) {
     if (adjKey == 0) {
       throw new IllegalArgumentException("0 is an invalid key");
     }
 
-    final int hash = JCasHashMap.hashInt(adjKey);
+    final int hash = Misc.hashInt(adjKey);
     int nbrProbes = 1;
     int probeDelta = 1;
     int probeAddr;
+    removedPositionToUse = -1;
     
     if (keys4 == null) {
       final short[] localKeys2 = keys2;
       final int bitMask = localKeys2.length - 1;
       probeAddr = hash & bitMask;
-      while (true) {
+
+       while (true) {
         final int testKey = localKeys2[probeAddr];
+        if (includeRemoved && (removedPositionToUse == -1) && (testKey == REMOVED2)) {
+          nbrRemoved --;  // because caller will use this as a slot for adding
+          removedPositionToUse = probeAddr;
+        }
         if (testKey == 0 || testKey == adjKey) {
-          break;
+           break;
         }
-        nbrProbes++;
+        nbrProbes++; 
         probeAddr = bitMask & (probeAddr + (probeDelta++));
       }
     } else {
@@ -348,6 +364,10 @@ public class IntHashSet implements Posit
       probeAddr = hash & bitMask;
       while (true) {
         final int testKey = localKeys4[probeAddr];
+        if (includeRemoved && (removedPositionToUse == -1) && (testKey == REMOVED4)) {
+          nbrRemoved --;  // because caller will use this as a slot for adding
+          removedPositionToUse = probeAddr;
+        }
         if (testKey == 0 || testKey == adjKey) {
           break;
         }
@@ -430,7 +450,7 @@ public class IntHashSet implements Posit
     final short[] oldKeys = keys2;
     newTableKeepSize(getCapacity(), true);  // make a 4 table. same size
     for (short adjKey : oldKeys) {
-      if (adjKey != 0 && adjKey != Short.MIN_VALUE) {
+      if (adjKey != 0 && adjKey != REMOVED2) {
         addInner4(getRawFromAdjKey(adjKey));
       }
     } 
@@ -459,34 +479,36 @@ public class IntHashSet implements Posit
     }
     
     if (keys4 != null) {
-      return find4(rawKey);
+      return find4AndAddIfMissing(rawKey);
       
       // short keys
     } else {
       int adjKey = getAdjKey(rawKey);
       if (isAdjKeyOutOfRange(adjKey)) {
         switchTo4byte();
-        return find4(rawKey);
+        return find4AndAddIfMissing(rawKey);
         
         // key in range
       } else {
-        final int i = findPosition(adjKey);
+        final int i = findPosition(adjKey, true);  // reuse removed slots
         if (keys2[i] == adjKey) {
           return false;
         }
-        keys2[i] = (short) adjKey;
+        
+        keys2[ (removedPositionToUse != -1) ? removedPositionToUse : i ] = (short) adjKey;
         incrementSize();
         return true;
       }
     }
   }
   
-  private boolean find4(int rawKey) {
-    final int i = findPosition(rawKey);
+  private boolean find4AndAddIfMissing(int rawKey) {
+    removedPositionToUse = -1;
+    final int i = findPosition(rawKey, true);
     if (keys4[i] == rawKey) {
       return false;
     }
-    keys4[i] = rawKey;
+    keys4[ (removedPositionToUse == -1) ? i : removedPositionToUse] = rawKey;
     incrementSize();
     return true;
   }
@@ -552,9 +574,9 @@ public class IntHashSet implements Posit
     }
     
     if (keys4 == null) {
-      keys2[pos] = Short.MIN_VALUE;
+      keys2[pos] = REMOVED2;
     } else {
-      keys4[pos] = Integer.MIN_VALUE;
+      keys4[pos] = REMOVED4;
     }
     
     size--;
@@ -633,13 +655,13 @@ public class IntHashSet implements Posit
     final int adjKey;
     if (keys4 == null) {
       adjKey = keys2[index];
-      if (adjKey == 0 || adjKey == Short.MIN_VALUE) {
+      if (adjKey == 0 || adjKey == REMOVED2) {
         return 0;  // null, not present
       }
       return getRawFromAdjKey(adjKey);
     } else {
       adjKey = keys4[index];
-      if (adjKey == 0 || adjKey == Integer.MIN_VALUE) {
+      if (adjKey == 0 || adjKey == REMOVED4) {
         return 0;  // null, not present
       }
       return adjKey;
@@ -663,7 +685,7 @@ public class IntHashSet implements Posit
           return pos;
         }
         int v = get(pos);
-        if (v != 0 && v != Short.MIN_VALUE) {
+        if (v != 0 && v != REMOVED2) {
           return pos;
         }
         pos++;
@@ -675,7 +697,7 @@ public class IntHashSet implements Posit
           return pos;
         }
         int v = get(pos);
-        if (v != 0 && v != Integer.MIN_VALUE) {
+        if (v != 0 && v != REMOVED4) {
           return pos;
         }
         pos ++;
@@ -699,7 +721,7 @@ public class IntHashSet implements Posit
         return pos;
       }
       int v = get(pos);
-      if (v != 0 && v != ( (keys4 == null) ? Short.MIN_VALUE : Integer.MIN_VALUE)) {
+      if (v != 0 && v != ( (keys4 == null) ? REMOVED2 : REMOVED4)) {
         return pos;
       }
       pos --;
@@ -801,13 +823,13 @@ public class IntHashSet implements Posit
   public void bulkAddTo(IntVector v) {
     if (null == keys4) {
       for (int k : keys2) {
-        if (k != 0 && k != Short.MIN_VALUE) {
+        if (k != 0 && k != REMOVED2) {
           v.add(getRawFromAdjKey(k));
         }
       }
     } else {
       for (int k : keys4) {
-        if (k != 0 && k != Integer.MIN_VALUE) {
+        if (k != 0 && k != REMOVED4) {
           v.add(k);
         }
       }

Modified: uima/uimaj/branches/experiment-v3-jcas/uimaj-core/src/test/java/org/apache/uima/internal/util/IntHashSetTest.java
URL: http://svn.apache.org/viewvc/uima/uimaj/branches/experiment-v3-jcas/uimaj-core/src/test/java/org/apache/uima/internal/util/IntHashSetTest.java?rev=1712088&r1=1712087&r2=1712088&view=diff
==============================================================================
--- uima/uimaj/branches/experiment-v3-jcas/uimaj-core/src/test/java/org/apache/uima/internal/util/IntHashSetTest.java (original)
+++ uima/uimaj/branches/experiment-v3-jcas/uimaj-core/src/test/java/org/apache/uima/internal/util/IntHashSetTest.java Mon Nov  2 17:15:23 2015
@@ -20,6 +20,7 @@
 package org.apache.uima.internal.util;
 
 import java.util.Arrays;
+import java.util.Random;
 
 import junit.framework.TestCase;
 
@@ -124,7 +125,62 @@ public class IntHashSetTest extends Test
     }
   }
   
+  public void testAddIntoRemovedSlot() {
+    for (int i = 1; i < 100; i++) {
+      ihs.add(i);
+    }
+    
+    /** Test with 2 byte numbers */
+    checkRemovedReuse(true);
+    
+    ihs = new IntHashSet();
+    for (int i = 1; i < 99; i++) {
+      ihs.add(i);
+    }
+    ihs.add(100000);  // force 4 byte
+    checkRemovedReuse(false);
+  }
   
+  private void checkRemovedReuse(boolean is2) {
+    Random r = new Random();
+    assertTrue(ihs.getSpaceUsedInWords() == ((is2) ? 128 : 256));
+    for (int i = 0; i < 100000; i++) {
+      int v = 1 + r.nextInt(100 + (i % 30000));
+      ihs.remove(v);
+      assertTrue(!(ihs.contains(v)));
+      v = 1 + r.nextInt(100 + (i % 30000));
+      ihs.add(v);
+      assertTrue(ihs.contains(v));
+    }
+    assertTrue(ihs.getSpaceUsedInWords() == ((is2) ? 16384 : 32768) );
+    
+    for (int i = ((is2) ? 16384 : 32768); i > 128; i = i / 2) {
+      ihs.clear();
+      assertTrue(ihs.getSpaceUsedInWords() == ((is2 || i == 32768) ? i : i/2));
+      ihs.clear();
+      assertTrue(ihs.getSpaceUsedInWords() == ((is2 || i == 32768) ? i : i/2));
+    }
+    ihs.clear();
+    
+    assertTrue(ihs.getSpaceUsedInWords() == (is2 ? 128 : 64));
+    for (int i = 1; i < ((is2) ? 100 : 99); i++) {
+      ihs.add(i);
+    }
+    if (!is2) {
+      ihs.add(100000);
+    }
+    
+    assertTrue(ihs.getSpaceUsedInWords() == ((is2) ? 128 : 256));
+    for (int i = 0; i < 1000; i++) {
+      int v = 1 + r.nextInt(100);
+      ihs.remove(v);
+      assertTrue(!(ihs.contains(v)));
+      ihs.add(v);
+      assertTrue(ihs.contains(v));
+    }
+    assertTrue(ihs.getSpaceUsedInWords() == ((is2) ? 128 : 256));
+    
+  }
  
   
 }