You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by sc...@apache.org on 2014/11/03 23:15:42 UTC
svn commit: r1636458 [1/2] - in /uima/uimaj/trunk/uimaj-core/src:
main/java/org/apache/uima/cas/impl/ main/java/org/apache/uima/internal/util/
test/java/org/apache/uima/internal/util/ test/java/org/apache/uima/jcas/test/
Author: schor
Date: Mon Nov 3 22:15:41 2014
New Revision: 1636458
URL: http://svn.apache.org/r1636458
Log:
[UIMA-4061] PositiveIntSet enhancement, use for FSBagIndex.
Added:
uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/PositiveIntSet_impl.java
- copied, changed from r1634141, uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/PositiveIntSet.java
Modified:
uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/FSBagIndex.java
uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/ListUtils.java
uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/IntBitSet.java
uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/IntHashSet.java
uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/IntSet.java
uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/IntVector.java
uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/PositiveIntSet.java
uima/uimaj/trunk/uimaj-core/src/test/java/org/apache/uima/internal/util/IntBitSetTest.java
uima/uimaj/trunk/uimaj-core/src/test/java/org/apache/uima/internal/util/IntHashSetTest.java
uima/uimaj/trunk/uimaj-core/src/test/java/org/apache/uima/internal/util/PositiveIntSetTest.java
uima/uimaj/trunk/uimaj-core/src/test/java/org/apache/uima/jcas/test/JCasTest.java
Modified: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/FSBagIndex.java
URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/FSBagIndex.java?rev=1636458&r1=1636457&r2=1636458&view=diff
==============================================================================
--- uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/FSBagIndex.java (original)
+++ uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/FSBagIndex.java Mon Nov 3 22:15:41 2014
@@ -28,12 +28,16 @@ import org.apache.uima.internal.util.Com
import org.apache.uima.internal.util.IntComparator;
import org.apache.uima.internal.util.IntPointerIterator;
import org.apache.uima.internal.util.IntVector;
+import org.apache.uima.internal.util.PositiveIntSet;
+import org.apache.uima.internal.util.PositiveIntSet_impl;
/**
* Used for UIMA FS Bag Indexes
* Uses IntVector to hold values of FSs
*/
public class FSBagIndex extends FSLeafIndexImpl {
+
+ private static boolean USE_POSITIVE_INT_SET = false;
private class IntVectorIterator implements ComparableIntPointerIterator, LowLevelIterator {
@@ -46,7 +50,7 @@ public class FSBagIndex extends FSLeafIn
private int[] detectIllegalIndexUpdates; // shared copy with Index Repository
private int typeCode;
-
+
public boolean isConcurrentModification() {
return this.modificationSnapshot != this.detectIllegalIndexUpdates[this.typeCode];
}
@@ -57,7 +61,7 @@ public class FSBagIndex extends FSLeafIn
private IntVectorIterator() {
super();
- this.itPos = 0;
+ moveToFirst();
}
private IntVectorIterator(IntComparator comp) {
@@ -66,30 +70,44 @@ public class FSBagIndex extends FSLeafIn
}
public boolean isValid() {
- return ((this.itPos >= 0) && (this.itPos < FSBagIndex.this.index.size()));
+ if (USE_POSITIVE_INT_SET) {
+ return FSBagIndex.this.indexP.isValid(this.itPos);
+ } else {
+ return (this.itPos >=0) && (this.itPos < FSBagIndex.this.index.size());
+ }
}
public void moveToFirst() {
- this.itPos = 0;
+ this.itPos = (USE_POSITIVE_INT_SET) ?
+ FSBagIndex.this.indexP.moveToFirst() :
+ 0;
}
public void moveToLast() {
- this.itPos = FSBagIndex.this.index.size() - 1;
+ this.itPos = (USE_POSITIVE_INT_SET) ?
+ FSBagIndex.this.indexP.moveToLast() :
+ FSBagIndex.this.index.size() - 1;
}
public void moveToNext() {
- ++this.itPos;
+ this.itPos = (USE_POSITIVE_INT_SET) ?
+ FSBagIndex.this.indexP.moveToNext(itPos) :
+ itPos + 1;
}
public void moveToPrevious() {
- --this.itPos;
+ this.itPos = (USE_POSITIVE_INT_SET) ?
+ FSBagIndex.this.indexP.moveToPrevious(itPos) :
+ itPos - 1;
}
public int ll_get() {
if (!isValid()) {
throw new NoSuchElementException();
}
- return FSBagIndex.this.index.get(this.itPos);
+ return (USE_POSITIVE_INT_SET) ?
+ FSBagIndex.this.indexP.get(this.itPos) :
+ FSBagIndex.this.index.get(this.itPos);
}
/**
@@ -159,13 +177,15 @@ public class FSBagIndex extends FSLeafIn
// The index, a vector of FS references.
private IntVector index;
+
+ final private PositiveIntSet indexP = USE_POSITIVE_INT_SET ? new PositiveIntSet_impl() : null;
private int initialSize;
FSBagIndex(CASImpl cas, Type type, int initialSize, int indexType) {
super(cas, type, indexType);
this.initialSize = initialSize;
- this.index = new IntVector(initialSize);
+ this.index = USE_POSITIVE_INT_SET ? null : new IntVector(initialSize);
}
/**
@@ -185,21 +205,30 @@ public class FSBagIndex extends FSLeafIn
return super.init(newComp);
}
+ // not used (2014)
IntVector getVector() {
return this.index;
}
public void flush() {
// done this way to reset to initial size if it grows
- if (this.index.size() > this.initialSize) {
- this.index = new IntVector(this.initialSize);
+ if (USE_POSITIVE_INT_SET) {
+ indexP.clear();
} else {
- this.index.removeAllElements();
+ if (this.index.size() > this.initialSize) {
+ this.index = new IntVector(this.initialSize);
+ } else {
+ this.index.removeAllElements();
+ }
}
}
public final boolean insert(int fs) {
- this.index.add(fs);
+ if (USE_POSITIVE_INT_SET) {
+ indexP.add(fs);
+ } else {
+ index.add(fs);
+ }
return true;
}
@@ -217,14 +246,18 @@ public class FSBagIndex extends FSLeafIn
// // return binarySearch(this.index.getArray(), ele, 0, this.index.size());
// }
private final int find(int ele) {
- final int[] array = this.index.getArray();
- final int max = this.index.size();
- for (int i = 0; i < max; i++) {
- if (ele == array[i]) {
- return i;
+ if (USE_POSITIVE_INT_SET) {
+ return indexP.find(ele);
+ } else {
+ final int[] array = this.index.getArray();
+ final int max = this.index.size();
+ for (int i = 0; i < max; i++) {
+ if (ele == array[i]) {
+ return i;
+ }
}
+ return -1;
}
- return -1;
}
public int compare(int fs1, int fs2) {
@@ -309,10 +342,18 @@ public class FSBagIndex extends FSLeafIn
* @see org.apache.uima.cas.FSIndex#contains(FeatureStructure)
*/
public boolean contains(FeatureStructure fs) {
- return (find(((FeatureStructureImpl) fs).getAddress()) >= 0);
+ final int addr = ((FeatureStructureImpl) fs).getAddress();
+ return USE_POSITIVE_INT_SET ?
+ indexP.contains(addr) :
+ (find(addr) >= 0);
}
public FeatureStructure find(FeatureStructure fs) {
+ if (USE_POSITIVE_INT_SET) {
+ final FeatureStructureImpl fsi = (FeatureStructureImpl) fs;
+ final int addr = fsi.getAddress();
+ return (indexP.contains(addr)) ? fsi.getCASImpl().createFS(addr) : null;
+ }
// Cast to implementation.
FeatureStructureImpl fsi = (FeatureStructureImpl) fs;
// Use internal find method.
@@ -329,7 +370,7 @@ public class FSBagIndex extends FSLeafIn
* @see org.apache.uima.cas.FSIndex#size()
*/
public int size() {
- return this.index.size();
+ return USE_POSITIVE_INT_SET ? indexP.size() : index.size();
}
/**
@@ -340,9 +381,13 @@ public class FSBagIndex extends FSLeafIn
}
public void remove(int fsRef) {
- final int pos = this.index.indexOfOptimizeAscending(fsRef);
- if (pos >= 0) {
- this.index.remove(pos);
+ if (USE_POSITIVE_INT_SET) {
+ indexP.remove(fsRef);
+ } else {
+ final int pos = this.index.indexOfOptimizeAscending(fsRef);
+ if (pos >= 0) {
+ this.index.remove(pos);
+ }
}
}
Modified: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/ListUtils.java
URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/ListUtils.java?rev=1636458&r1=1636457&r2=1636458&view=diff
==============================================================================
--- uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/ListUtils.java (original)
+++ uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/cas/impl/ListUtils.java Mon Nov 3 22:15:41 2014
@@ -26,7 +26,7 @@ import java.util.ListIterator;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.impl.XmiSerializationSharedData.OotsElementData;
-import org.apache.uima.internal.util.PositiveIntSet;
+import org.apache.uima.internal.util.PositiveIntSet_impl;
import org.apache.uima.internal.util.IntVector;
import org.apache.uima.internal.util.XmlAttribute;
import org.apache.uima.util.Level;
@@ -194,7 +194,7 @@ public class ListUtils {
}
public int getLength(int type, int addr, int neListType, int tailFeat) {
- final PositiveIntSet visited = new PositiveIntSet();
+ final PositiveIntSet_impl visited = new PositiveIntSet_impl();
foundCycle = false;
// first count length of list so we can allocate array
int length = 0;
@@ -235,7 +235,7 @@ public class ListUtils {
final int headFeat = getHeadFeatCode(type);
final int tailFeat = getTailFeatCode(type);
final int neListType = getNeListType(type);
- final PositiveIntSet visited = new PositiveIntSet();
+ final PositiveIntSet_impl visited = new PositiveIntSet_impl();
while (curNode != CASImpl.NULL) {
final int curNodeType = cas.getHeapValue(curNode);
@@ -385,7 +385,7 @@ public class ListUtils {
int currLength = this.getLength(this.neIntListType, addr);
int curNode = addr;
int prevNode = 0;
- final PositiveIntSet visited = new PositiveIntSet();
+ final PositiveIntSet_impl visited = new PositiveIntSet_impl();
boolean foundCycle = false;
int i =0;
@@ -461,7 +461,7 @@ public class ListUtils {
int currLength = this.getLength(this.neFloatListType, addr);
int curNode = addr;
int prevNode = 0;
- final PositiveIntSet visited = new PositiveIntSet();
+ final PositiveIntSet_impl visited = new PositiveIntSet_impl();
boolean foundCycle = false;
int i =0;
@@ -536,7 +536,7 @@ public class ListUtils {
int first = addr;
int currLength = this.getLength(this.neFsListType, addr);
boolean foundCycle = false;
- final PositiveIntSet visited = new PositiveIntSet();
+ final PositiveIntSet_impl visited = new PositiveIntSet_impl();
int curNode = addr;
int prevNode = 0;
@@ -614,7 +614,7 @@ public class ListUtils {
public int updateStringList(int addr, List<String> stringValues) throws SAXException {
int first = addr;
boolean foundCycle = false;
- final PositiveIntSet visited = new PositiveIntSet();
+ final PositiveIntSet_impl visited = new PositiveIntSet_impl();
int curNode = addr;
int prevNode = 0;
int currLength = this.getLength(this.neStringListType, addr);
Modified: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/IntBitSet.java
URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/IntBitSet.java?rev=1636458&r1=1636457&r2=1636458&view=diff
==============================================================================
--- uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/IntBitSet.java (original)
+++ uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/IntBitSet.java Mon Nov 3 22:15:41 2014
@@ -36,7 +36,7 @@ import java.util.NoSuchElementException;
* If using offset, you must add ints in a range equal to or above the offset
*
*/
-public class IntBitSet {
+public class IntBitSet implements PositiveIntSet {
private final BitSet set;
@@ -83,14 +83,15 @@ public class IntBitSet {
this(maxInt, 0);
}
- public IntBitSet(int maxInt, int offset) {
- set = new BitSet(Math.max(1, maxInt));
+ public IntBitSet(int maxAdjKey, int offset) {
+ set = new BitSet(Math.max(1, maxAdjKey));
this.offset = offset;
}
/**
* empty the IntBitSet
*/
+ @Override
public void clear() {
set.clear();
size = 0;
@@ -101,15 +102,22 @@ public class IntBitSet {
* @param key
* @return
*/
+ @Override
public boolean contains(int key) {
return (key == 0) ? false : set.get(key - offset);
}
+
+ @Override
+ public int find(int element) {
+ return contains(element) ? element : -1;
+ }
/**
*
* @param key
* @return true if this set did not already contain the specified element
*/
+ @Override
public boolean add(int original_key) {
if (original_key < offset) {
throw new IllegalArgumentException("key must be positive, but was " + original_key);
@@ -130,6 +138,7 @@ public class IntBitSet {
* @param key -
* @return true if this key was removed, false if not present
*/
+ @Override
public boolean remove(int original_key) {
final int key = original_key - offset;
final boolean prev = set.get(key);
@@ -145,6 +154,7 @@ public class IntBitSet {
*
* @return the number of elements in this set
*/
+ @Override
public int size() {
return size; // bit set cardinality() is slow
}
@@ -170,6 +180,12 @@ public class IntBitSet {
return set.length() - 1 + offset;
}
+ @Override
+ public int get(int position) {
+ assert(set.get(position));
+ return position + offset;
+ }
+
private class IntBitSetIterator implements IntListIterator {
protected int curKey;
@@ -221,7 +237,38 @@ public class IntBitSet {
}
- public IntBitSetIterator getIterator() {
+ @Override
+ public IntBitSetIterator iterator() {
return new IntBitSetIterator();
}
+
+ @Override
+ public int moveToFirst() {
+ return set.nextSetBit(0);
+ }
+
+ @Override
+ public int moveToLast() {
+ return set.length() - 1;
+ }
+
+ @Override
+ public int moveToNext(int position) {
+ return set.nextSetBit(position + 1);
+ }
+
+ @Override
+ public int moveToPrevious(int position) {
+ return set.previousSetBit(position - 1);
+ }
+
+ /**
+ * This impl depends on position always pointing to a valid (== non 0)
+ * element of the set, when it should be valid
+ */
+ @Override
+ public boolean isValid(int position) {
+ return (position >= 0) && set.get(position);
+ }
+
}
Modified: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/IntHashSet.java
URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/IntHashSet.java?rev=1636458&r1=1636457&r2=1636458&view=diff
==============================================================================
--- uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/IntHashSet.java (original)
+++ uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/IntHashSet.java Mon Nov 3 22:15:41 2014
@@ -31,10 +31,14 @@ import org.apache.uima.jcas.impl.JCasHas
* based on Int2IntHashMap
* This impl is for use in a single thread case only
*
- * Supports shrinking (reallocating the big table)
+ * Supports shrinking (reallocating the big table)
+ *
+ * Supports representing ints as "short" 2byte values if possible,
+ * together with an offset amount.
*
+ * Automatically switches to full int representation if needed
*/
-public class IntHashSet {
+public class IntHashSet implements PositiveIntSet {
public static final float DEFAULT_LOAD_FACTOR = 0.66F;
// set to true to collect statistics for tuning
@@ -58,22 +62,43 @@ public class IntHashSet {
private int maxProbe = 0;
private int sizeWhichTriggersExpansion;
- private int size; // number of elements in the table
+ private int size; // number of elements in the table
+
+ // offset only used with keys2. values stored are key - offset
+ // intent is to have them fit into short data type.
+ // - value of 0 reserved; if value is 0 - means no entry
+ // store values at or below the offset are stored as 1 less to avoid mapping to "0".
+ private int offset;
- private int [] keys;
+ private int [] keys4;
+ private short[] keys2;
private boolean secondTimeShrinkable = false;
+ // these are true values (before any offset adjustment)
private int mostPositive = Integer.MIN_VALUE;
private int mostNegative = Integer.MAX_VALUE;
public IntHashSet() {
- this(16);
+ this(12, 0);
}
public IntHashSet(int initialCapacity) {
+ this(initialCapacity, 0);
+ }
+
+ /**
+ *
+ * @param initialCapacity - you can add this many before expansion
+ * @param offset - for values in the short range, the amount to subtract
+ * before storing.
+ * If == MIN_VALUE, then force 4 byte ints
+ */
+ public IntHashSet(int initialCapacity, int offset) {
this.initialCapacity = initialCapacity;
- newTableKeepSize(initialCapacity);
+ boolean force4 = offset == Integer.MIN_VALUE;
+ this.offset = force4 ? 0 : offset;
+ newTableKeepSize(tableSpace(initialCapacity), force4);
size = 0;
if (TUNE) {
histogram = new int[200];
@@ -100,6 +125,12 @@ public class IntHashSet {
return tableSpace(numberOfElements, loadFactor);
}
+ /**
+ *
+ * @param numberOfElements
+ * @param factor
+ * @return capacity of the main table (either 2 byte or 4 byte entries)
+ */
public static int tableSpace(int numberOfElements, Float factor) {
if (numberOfElements < 0) {
throw new IllegalArgumentException("must be > 0");
@@ -108,9 +139,20 @@ public class IntHashSet {
return Math.max(16, nextHigherPowerOf2(capacity));
}
- private void newTableKeepSize(int capacity) {
- capacity = Math.max(16, nextHigherPowerOf2(capacity));
- keys = new int[capacity];
+ private void newTableKeepSize(int capacity, boolean make4) {
+ if (!make4) {
+ capacity = Math.max(4, nextHigherPowerOf2(capacity));
+ make4 = (capacity >= 65536);
+ }
+ // don't use short values unless
+ // the number of items is < 65536
+ if (make4) {
+ keys4 = new int[capacity];
+ keys2 = null;
+ } else {
+ keys2 = new short[capacity];
+ keys4 = null;
+ }
sizeWhichTriggersExpansion = (int)(capacity * loadFactor);
}
@@ -130,32 +172,57 @@ public class IntHashSet {
}
public int getSpaceUsedInWords() {
- return keys.length; // 8 is overhead for this class excluding any Java object overhead
+ return (keys4 != null) ? keys4.length : (keys2.length >> 1);
}
public static int getSpaceOverheadInWords() {
- return 8;
+ return 11;
+ }
+
+ private int getCapacity() {
+ return (null == keys4) ? keys2.length : keys4.length;
}
private void increaseTableCapacity() {
- final int [] oldKeys = keys;
- final int oldCapacity = oldKeys.length;
- int newCapacity = 2 * oldCapacity;
+ final int oldCapacity = getCapacity();
+ final int newCapacity = oldCapacity << 1;
- if (TUNE) {
- System.out.println("Capacity increasing from " + oldCapacity + " to " + newCapacity);
- }
- newTableKeepSize(newCapacity);
- for (int key : oldKeys) {
- if (key != 0) {
- addInner(key);
+ if (keys2 != null) {
+ final short[] oldKeys = keys2;
+ newTableKeepSize(newCapacity, false);
+ if (newCapacity > 32768) {
+ // switch to 4
+ if (TUNE) {System.out.println("Switching to 4 byte keys, Capacity increasing from " + oldCapacity + " to " + newCapacity);}
+ for (short adjKey : oldKeys) {
+ if (adjKey != 0) {
+ addInner4(adjKey + offset);
+ }
+ }
+
+ } else {
+ if (TUNE) {System.out.println("Keeping 2 byte keys, Capacity increasing from " + oldCapacity + " to " + newCapacity);}
+ for (short adjKey : oldKeys) {
+ if (adjKey != 0) {
+ addInner2(adjKey);
+ }
+ }
+ }
+
+ } else {
+ final int [] oldKeys = keys4;
+ if (TUNE) {System.out.println("Capacity increasing from " + oldCapacity + " to " + newCapacity);}
+ newTableKeepSize(newCapacity, false);
+ for (int key : oldKeys) {
+ if (key != 0) {
+ addInner4(key);
+ }
}
}
}
// called by clear
private void newTable(int capacity) {
- newTableKeepSize(capacity);
+ newTableKeepSize(capacity, false);
size = 0;
resetHistogram();
}
@@ -168,6 +235,19 @@ public class IntHashSet {
}
}
+ private void resetArray() {
+ if (keys4 == null) {
+ Arrays.fill(keys2, (short)0);
+ } else {
+ Arrays.fill(keys4, 0);
+ }
+ mostPositive = Integer.MIN_VALUE;
+ mostNegative = Integer.MAX_VALUE;
+ resetHistogram();
+ size = 0;
+ }
+
+ @Override
public void clear() {
// see if size is less than the 1/2 size that triggers expansion
if (size < (sizeWhichTriggersExpansion >>> 1)) {
@@ -175,26 +255,22 @@ public class IntHashSet {
// this is done to avoid thrashing around the threshold
if (secondTimeShrinkable) {
secondTimeShrinkable = false;
- final int newCapacity = Math.max(initialCapacity, keys.length >>> 1);
- if (newCapacity < keys.length) {
+ final int currentCapacity = getCapacity();
+ final int newCapacity = Math.max(initialCapacity, currentCapacity >>> 1);
+ if (newCapacity < currentCapacity) {
newTable(newCapacity); // shrink table by 50%
} else { // don't shrink below minimum
- Arrays.fill(keys, 0);
+ resetArray();
}
- size = 0;
- resetHistogram();
return;
+
} else {
secondTimeShrinkable = true;
}
} else {
secondTimeShrinkable = false; // reset this to require 2 triggers in a row
}
- size = 0;
- Arrays.fill(keys, 0);
- resetHistogram();
- mostPositive = Integer.MIN_VALUE;
- mostNegative = Integer.MAX_VALUE;
+ resetArray();
}
/**
@@ -204,24 +280,25 @@ public class IntHashSet {
* current value of keys[position] would be 0.
*
* if the key is found, then keys[position] == key
- * @param key -
+ * @param adjKey -
* @return the probeAddr in keys array - might have a 0 value, or the key value if found
*/
- private int find(final int key) {
- if (key == 0) {
+ private int findPosition(final int adjKey) {
+ if (adjKey == 0) {
throw new IllegalArgumentException("0 is an invalid key");
}
- final int hash = JCasHashMap.hashInt(key);
+ final int hash = JCasHashMap.hashInt(adjKey);
int nbrProbes = 1;
- final int[] localKeys = keys;
- final int bitMask = localKeys.length - 1;
+ final int[] localKeys4 = keys4;
+ final short[] localKeys2 = keys2;
+ final int bitMask = ((localKeys4 == null) ? localKeys2.length : localKeys4.length) - 1;
int probeAddr = hash & bitMask;
int probeDelta = 1;
while (true) {
- final int testKey = localKeys[probeAddr];
- if (testKey == 0 || testKey == key) {
+ final int testKey = (localKeys4 != null) ? localKeys4[probeAddr] : localKeys2[probeAddr];
+ if (testKey == 0 || testKey == adjKey) {
break;
}
nbrProbes++;
@@ -240,72 +317,141 @@ public class IntHashSet {
return probeAddr;
}
- public boolean contains(int key) {
- return (key == 0) ? false : keys[find(key)] == key;
+ @Override
+ public boolean contains(int rawKey) {
+ if (keys4 == null) {
+ final int adjKey = getAdjKey(rawKey);
+ return (rawKey == 0) ? false : keys2[findPosition(adjKey)] == adjKey;
+ }
+ return (rawKey == 0) ? false : keys4[findPosition(rawKey)] == rawKey;
+ }
+
+ @Override
+ public int find(int rawKey) {
+ final int adjKey = getAdjKey(rawKey);
+ final int pos = findPosition(adjKey);
+ if (keys4 == null) {
+ return (keys2[pos] == adjKey) ? pos : -1;
+ } else {
+ return (keys4[pos] == adjKey) ? pos : -1;
+ }
}
+ private int getAdjKey(int rawKey) {
+ if (keys4 != null) {
+ return rawKey;
+ }
+ int adjKey = rawKey - offset;
+ if (adjKey <= 0) {
+ adjKey --; // because 0 is reserved for null
+ }
+ if ((adjKey > 32767) || (adjKey < -32768)) {
+ // convert to 4 byte because values can't be offset and fit in a short
+ final short[] oldKeys = keys2;
+ newTableKeepSize(getCapacity(), true); // make a 4 table
+ // next fails to convert short to int
+// System.arraycopy(oldKeys, 0, keys4, 0, oldKeys.length);
+ int i = 0;
+ for (int v : oldKeys) {
+ keys4[i++] = v;
+ }
+ return rawKey; // now using 4 byte table
+ } else {
+ return adjKey;
+ }
+ }
+
/**
*
* @param key
* @return true if this set did not already contain the specified element
*/
- public boolean add(int key) {
- if (key == 0) {
- throw new IllegalArgumentException("0 not allowed as a key to add");
- }
+ @Override
+ public boolean add(int rawKey) {
+ final int adjKey = getAdjKey(rawKey);
if (size == 0) {
- mostPositive = mostNegative = key;
+ mostPositive = mostNegative = rawKey;
} else {
- if (key > mostPositive) {
- mostPositive = key;
+ if (rawKey > mostPositive) {
+ mostPositive = rawKey;
}
- if (key < mostNegative) {
- mostNegative = key;
+ if (rawKey < mostNegative) {
+ mostNegative = rawKey;
}
}
- final int i = find(key);
- if (keys[i] == 0) {
- keys[i] = key;
+ final int i = findPosition(adjKey);
+ if (keys4 == null) {
+ if (keys2[i] == 0) {
+ keys2[i] = (short) adjKey;
+ incrementSize();
+ return true;
+ }
+ return false;
+ }
+ if (keys4[i] == 0) {
+ keys4[i] = adjKey;
incrementSize();
return true;
}
return false;
}
-
+
/**
* used for increasing table size
- * @param key
+ * @param rawKey
*/
- private void addInner(int key) {
- final int i = find(key);
- assert(keys[i] == 0);
- keys[i] = key;
+ private void addInner4(int rawKey) {
+ final int i = findPosition(rawKey);
+ assert(keys4[i] == 0);
+ keys4[i] = rawKey;
}
-
+
+ private void addInner2(short adjKey) {
+ final int i = findPosition(adjKey);
+ assert(keys2[i] == 0);
+ keys2[i] = adjKey;
+ }
+
/**
* mostPositive and mostNegative are not updated
* for removes. So these values may be inaccurate,
* but mostPositive is always >= actual most positive,
* and mostNegative is always <= actual most negative.
- * @param key -
+ * No conversion from int to short
+ * @param rawKey -
* @return true if the key was present
*/
- public boolean remove(int key) {
- final int i = find(key);
- if (keys[i] != 0) {
- keys[i] = 0;
+ @Override
+ public boolean remove(int rawKey) {
+ final int adjKey = getAdjKey(rawKey);
+ final int i = findPosition(adjKey);
+ boolean removed = false;
+ if (keys4 == null) {
+ if (keys2[i] != 0) {
+ keys2[i] = 0;
+ removed = true;
+ }
+ } else {
+ if (keys4[i] != 0) {
+ keys4[i] = 0;
+ removed = true;
+ }
+ }
+
+ if (removed) {
size--;
- if (key == mostPositive) {
+ if (rawKey == mostPositive) {
mostPositive --; // a weak adjustment
}
- if (key == mostNegative) {
+ if (rawKey == mostNegative) {
mostNegative ++; // a weak adjustment
}
return true;
- }
+ }
return false;
}
-
+
+ @Override
public int size() {
return size;
}
@@ -345,14 +491,27 @@ public class IntHashSet {
System.out.println(i + ": " + histogram[i]);
}
- System.out.println("bytes / entry = " + (float) (keys.length) * 4 / size());
- System.out.format("size = %,d, prevExpansionTriggerSize = %,d, next = %,d%n",
+ if (keys4 == null) {
+ System.out.println("bytes / entry = " + (float) (keys2.length) * 2 / size());
+ System.out.format("size = %,d, prevExpansionTriggerSize = %,d, next = %,d%n",
size(),
- (int) ((keys.length >>> 1) * loadFactor),
- (int) (keys.length * loadFactor));
+ (int) ((keys2.length >>> 1) * loadFactor),
+ (int) (keys2.length * loadFactor));
+ } else {
+ System.out.println("bytes / entry = " + (float) (keys4.length) * 4 / size());
+ System.out.format("size = %,d, prevExpansionTriggerSize = %,d, next = %,d%n",
+ size(),
+ (int) ((keys4.length >>> 1) * loadFactor),
+ (int) (keys4.length * loadFactor));
+ }
}
}
+ @Override
+ public int get(int index) {
+ return offset + ((keys4 == null) ? keys2[index] : keys4[index]);
+ }
+
/**
* advance pos until it points to a non 0 or is 1 past end
* @param pos
@@ -363,12 +522,12 @@ public class IntHashSet {
pos = 0;
}
- final int max = keys.length;
+ final int max = getCapacity();
while (true) {
if (pos >= max) {
return pos;
}
- if (keys[pos] != 0) {
+ if (get(pos) != 0) {
return pos;
}
pos ++;
@@ -381,7 +540,7 @@ public class IntHashSet {
* @return
*/
private int moveToPreviousFilled(int pos) {
- final int max = keys.length;
+ final int max = getCapacity();
if (pos > max) {
pos = max - 1;
}
@@ -390,7 +549,7 @@ public class IntHashSet {
if (pos < 0) {
return pos;
}
- if (keys[pos] != 0) {
+ if (get(pos) != 0) {
return pos;
}
pos --;
@@ -407,14 +566,14 @@ public class IntHashSet {
public final boolean hasNext() {
curPosition = moveToNextFilled(curPosition);
- return curPosition < keys.length;
+ return curPosition < getCapacity();
}
public final int next() {
if (!hasNext()) {
throw new NoSuchElementException();
}
- return keys[curPosition++];
+ return get(curPosition++);
}
/**
@@ -432,14 +591,14 @@ public class IntHashSet {
if (!hasPrevious()) {
throw new NoSuchElementException();
}
- return keys[curPosition--];
+ return get(curPosition--);
}
/**
* @see org.apache.uima.internal.util.IntListIterator#moveToEnd()
*/
public void moveToEnd() {
- curPosition = keys.length - 1;
+ curPosition = getCapacity() - 1;
}
/**
@@ -448,11 +607,36 @@ public class IntHashSet {
public void moveToStart() {
curPosition = 0;
}
- }
-
+ }
- public IntListIterator getIterator() {
+ @Override
+ public IntListIterator iterator() {
return new IntHashSetIterator();
}
+ @Override
+ public int moveToFirst() {
+ return (size() == 0) ? -1 : moveToNextFilled(0);
+ }
+
+ @Override
+ public int moveToLast() {
+ return (size() == 0) ? -1 : moveToPreviousFilled(getCapacity() -1);
+ }
+
+ @Override
+ public int moveToNext(int position) {
+ final int n = moveToNextFilled(position + 1);
+ return (n == getCapacity()) ? -1 : n;
+ }
+
+ @Override
+ public int moveToPrevious(int position) {
+ return moveToPreviousFilled(position - 1);
+ }
+
+ @Override
+ public boolean isValid(int position) {
+ return (position >= 0) && (position < getCapacity());
+ }
}
Modified: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/IntSet.java
URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/IntSet.java?rev=1636458&r1=1636457&r2=1636458&view=diff
==============================================================================
--- uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/IntSet.java (original)
+++ uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/IntSet.java Mon Nov 3 22:15:41 2014
@@ -19,19 +19,33 @@
package org.apache.uima.internal.util;
+import java.util.NoSuchElementException;
+
/**
* This class implements a set of integers. It does not implement the <code>Set</code> interface
* for performance reasons, though methods with the same name are equivalent.
*
+ * This does not implement shorts + offset, like the IntHashSet does
+ * because by the time that might be of interest, we would switch to
+ * IntHashSet to get ~O(1) operations including contains.
+ *
*/
-public class IntSet {
+public class IntSet implements PositiveIntSet {
/** The data. */
- private IntVector data;
+ private IntVector iVec;
/** Creates a new instance of this set. */
public IntSet() {
- this.data = new IntVector();
+ this.iVec = new IntVector();
+ }
+
+ /**
+ *
+ * @param capacity allocate enough space to hold at least this before expanding
+ */
+ public IntSet(int capacity) {
+ this.iVec = new IntVector(capacity);
}
/**
@@ -42,9 +56,10 @@ public class IntSet {
* @return <code>true</code> if this set did not already contain this element,
* <code>false</code> otherwise.
*/
+ @Override
public boolean add(int element) {
- if (!this.data.contains(element)) {
- this.data.add(element);
+ if (!this.iVec.contains(element)) {
+ this.iVec.add(element);
return true;
}
return false;
@@ -58,23 +73,31 @@ public class IntSet {
* @return <code>true</code> if the element is contained in this set, <code>false</code>
* otherwise.
*/
+ @Override
public boolean contains(int element) {
- return this.data.contains(element);
+ return this.iVec.contains(element);
+ }
+
+ @Override
+ public int find(int element) {
+ return this.iVec.indexOf(element);
}
/** @return the size of this set. */
+ @Override
public int size() {
- return this.data.size();
+ return this.iVec.size();
}
/** @return the <code>n</code>-th element in this set. */
+ @Override
public int get(int n) {
- return this.data.get(n);
+ return this.iVec.get(n);
}
/** Removes the <code>n</code>-th element in this set. */
- public void remove(int n) {
- this.data.remove(n);
+ public void removeElementAt(int n) {
+ this.iVec.remove(n);
}
/**
@@ -102,14 +125,14 @@ public class IntSet {
int sum1 = 0;
int sum2 = 0;
for (int i = 0; i < size; i++) {
- sum1 += this.data.get(i);
+ sum1 += this.iVec.get(i);
sum2 += s.get(i);
}
if (sum1 != sum2)
return false;
for (int i = 0; i < size; i++) {
- if (!s.contains(this.data.get(i)))
+ if (!s.contains(this.iVec.get(i)))
return false;
}
return true;
@@ -118,17 +141,120 @@ public class IntSet {
}
public int hashCode() {
- if (this.data == null) {
+ if (this.iVec == null) {
return 0;
}
int sum = 0;
for (int i = 0; i < this.size(); i++) {
- sum += this.data.get(i);
+ sum += this.iVec.get(i);
}
return sum;
}
public int indexOf(int element) {
- return this.data.indexOf(element);
+ return this.iVec.indexOf(element);
}
+
+ @Override
+ public void clear() {
+ iVec.removeAllElements(); // doesn't reallocate
+ }
+
+ @Override
+ public boolean remove(int key) {
+ int i = iVec.indexOf(key);
+ if (i != -1) {
+ iVec.remove(i);
+ }
+ return i != -1;
+ }
+
+ private class IntSetIterator implements IntListIterator {
+
+ protected int pos = 0;
+
+ protected IntSetIterator() {}
+
+ @Override
+ public final boolean hasNext() {
+ return (pos >= 0 && pos < size());
+ }
+
+ @Override
+ public final int next() {
+ if (!hasNext()) {
+ throw new NoSuchElementException();
+ }
+ return iVec.get(pos++);
+ }
+
+ /**
+ * @see org.apache.uima.internal.util.IntListIterator#hasPrevious()
+ */
+ @Override
+ public boolean hasPrevious() {
+ final int posm1 = pos - 1;
+ return (posm1 >= 0 && posm1 < size());
+ }
+
+ /**
+ * @see org.apache.uima.internal.util.IntListIterator#previous()
+ */
+ @Override
+ public int previous() {
+ if (!hasPrevious()) {
+ throw new NoSuchElementException();
+ }
+ return iVec.get(pos--);
+ }
+
+ /**
+ * @see org.apache.uima.internal.util.IntListIterator#moveToEnd()
+ */
+ @Override
+ public void moveToEnd() {
+ pos = size() - 1;
+ }
+
+ /**
+ * @see org.apache.uima.internal.util.IntListIterator#moveToStart()
+ */
+ @Override
+ public void moveToStart() {
+ pos = 0;
+ }
+
+ }
+
+ @Override
+ public IntSetIterator iterator() {
+ return new IntSetIterator();
+ }
+
+ @Override
+ public int moveToFirst() {
+ return (size() == 0) ? -1 : 0;
+ }
+
+ @Override
+ public int moveToLast() {
+ return size() -1;
+ }
+
+ @Override
+ public int moveToNext(int position) {
+ return (size() == (position + 1)) ? -1 : position + 1;
+ }
+
+ @Override
+ public int moveToPrevious(int position) {
+ return position - 1;
+ }
+
+ @Override
+ public boolean isValid(int position) {
+ // TODO Auto-generated method stub
+ return (position >= 0) && (position < size());
+ }
+
}
Modified: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/IntVector.java
URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/IntVector.java?rev=1636458&r1=1636457&r2=1636458&view=diff
==============================================================================
--- uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/IntVector.java (original)
+++ uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/IntVector.java Mon Nov 3 22:15:41 2014
@@ -290,14 +290,15 @@ public class IntVector implements Serial
* @return The position, or <code>-1</code> if it doesn't exist.
*/
public int position(int elem) {
- int i = 0;
- while (i < this.pos) {
- if (this.array[i] == elem) {
- return i;
- }
- ++i;
- }
- return -1;
+ return indexOf(elem);
+// int i = 0;
+// while (i < this.pos) {
+// if (this.array[i] == elem) {
+// return i;
+// }
+// ++i;
+// }
+// return -1;
}
/**
Modified: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/PositiveIntSet.java
URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/PositiveIntSet.java?rev=1636458&r1=1636457&r2=1636458&view=diff
==============================================================================
--- uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/PositiveIntSet.java (original)
+++ uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/PositiveIntSet.java Mon Nov 3 22:15:41 2014
@@ -16,387 +16,85 @@
* specific language governing permissions and limitations
* under the License.
*/
-
package org.apache.uima.internal.util;
-import java.util.Arrays;
-import java.util.NoSuchElementException;
-
-
+public interface PositiveIntSet {
-/**
- * An set of non-zero ints.
- *
- * Uses 2 1/2 different representations depending on the ratio of the max int stored and the total number of ints stored.
- * For large sparse ints, it uses the IntHashSet
- *
- * For more dense ints, it uses bitSets or bitSets with an offset
- *
- * It switches when adding new members if the other representation is sufficiently smaller
- */
-public class PositiveIntSet {
-
- // number of words a inthashset has to be better than intbitset to cause switch
- private static final int HYSTERESIS = 16;
-
- // Extra space in bitset for initial capacity - can add an int up to that size
- // 64 *4 is 8 words, = 256.
- private static final int BIT_SET_OVERALLOCATE = 64 * 4;
-
- // Extra space in hashset for initial capacity - can add a minimum of that many more members
- // without hitting the hash-set doubling.
- private static final int HASH_SET_OVERALLOCATE = 8;
- // Extra space in hashset for initial capacity - when creating from bitmap set - multiplier of existing size
- private static final int HASH_SET_OVERALLOCATE_MULTIPLIER = 1; // bit shift right = divide by 2 = 50% of existing capacity
-
- private IntBitSet intBitSet = null;
-
- private IntHashSet intHashSet = null;
-
- //package private for test case
- boolean isBitSet = true;
-
/**
- * Set false once we find we have to reduce the bit offset
+ * remove all members of the set
*/
- //package private for test case
- boolean useOffset = true;
-
- public IntListIterator getUnorderedIterator() {
- if (isBitSet) {
- return intBitSet.getIterator();
- }
- return intHashSet.getIterator();
- }
-
- public IntListIterator getOrderedIterator() {
- if (isBitSet) {
- return intBitSet.getIterator();
- }
-
- int[] allValues = new int[size()];
- IntListIterator it = intHashSet.getIterator();
- int i = 0;
- while (it.hasNext()) {
- allValues[i++] = it.next();
- }
- Arrays.sort(allValues);
- return new IntListIteratorOverArray(allValues);
- }
-
- private class IntListIteratorOverArray implements IntListIterator {
- private int[] a;
- private int pos;
-
- IntListIteratorOverArray(int[] a) {
- this.a = a;
- pos = 0;
- }
-
- @Override
- public boolean hasNext() {
- return pos >= 0 && pos < a.length;
- }
-
- @Override
- public int next() throws NoSuchElementException {
- if (hasNext()) {
- return a[pos++];
- }
- throw new NoSuchElementException();
- }
+ void clear();
- @Override
- public boolean hasPrevious() {
- return pos >= 0 && pos < a.length;
- }
-
- @Override
- public int previous() {
- if (hasPrevious()) {
- return a[pos--];
- }
- throw new NoSuchElementException();
- }
-
- @Override
- public void moveToStart() {
- pos = 0;
- }
+ /**
+ * @param key -
+ * @return true if key is in the set
+ */
+ boolean contains(int key);
- @Override
- public void moveToEnd() {
- pos = a.length - 1;
- }
- }
-
- public void clear() {
- if (isBitSet) {
- intBitSet.clear();
- } else {
- intHashSet.clear();
- }
- }
-
- public boolean contains(int key) {
- if (isBitSet) {
- return intBitSet.contains(key);
- } else {
- return intHashSet.contains(key);
- }
- }
-
/**
*
* @param key -
* @return true if this set did not already contain the specified element
*/
- public boolean add(int key) {
- maybeSwitchRepresentation(key);
- if (isBitSet) {
- return intBitSet.add(key);
- } else {
- return intHashSet.add(key);
- }
- }
+ boolean add(int key);
/**
- * When a key is added which is lower than the offset, we need to adjust the whole int set table.
- * Although an array copy could do this, we don't have access to the bitset impl.
- *
- * So we just set the offset to 0, and either convert the whole thing to a inthashset or copy all the bits to a new
- * bit set.
*
- * @param key
+ * @param key -
+ * @return true if the set had this element before the remove
*/
- private void adjustBitSetForLowerOffset(int key) {
- final int largestInt = intBitSet.getLargestMenber();
- final int bitSetSpaceNeeded = getBitSetSpaceFromMaxInt(largestInt, 0); // doesn't use offset
- final int hashSetSpaceNeeded = getHashSetSpace(intBitSet.size());
-
- if (hashSetSpaceNeeded < (bitSetSpaceNeeded - HYSTERESIS)) {
- switchToHashSet(size());
- } else {
- IntBitSet bs = new IntBitSet(largestInt + BIT_SET_OVERALLOCATE);
- IntListIterator it = intBitSet.getIterator();
- while (it.hasNext()) {
- bs.add(it.next());
- }
- intBitSet = bs;
- }
- useOffset = false; // stop using because we have evidence it isn't appropriate
- }
-
- public void add(int[] vs) {
- for (int i : vs) {
- add(i);
- }
- }
-
- public boolean remove(int key) {
- if (isBitSet) {
- return intBitSet.remove(key);
- } else {
- return intHashSet.remove(key);
- }
- }
+ boolean remove(int key);
- public int size() {
- if (isBitSet) {
- return intBitSet.size();
- } else {
- return intHashSet.size();
- }
- }
-
- public int[] toUnorderedIntArray() {
- final int[] a = new int[size()];
- IntListIterator it = getUnorderedIterator();
- int i = 0;
- while (it.hasNext()) {
- a[i++] = it.next();
- }
- return a;
- }
+ /**
+ * @return number of elements in the set
+ */
+ int size();
- public int[] toOrderedIntArray() {
- int [] a = toUnorderedIntArray();
- if (isBitSet) {
- return a;
- }
- Arrays.sort(a);
- return a;
- }
+ /**
+ * @return an iterator (may be ordered or unordered) over the members of the set
+ */
+ IntListIterator iterator();
- /**
- * Only called if key won't fit in existing hashset allocation
- *
- * Using existing bitset, compute what its size() (capacity) would be if the
- * key were added; include the overhead of the intbitset class.
- *
- * long words required = larger of double the current size, or the number of words required
+ /**
+ * @param element an item which may be in the set
+ * @return -1 if the item is not in the set, or a position value that can be used with iterators to start at that item.
+ */
+ int find(int element);
- * @param maxKeyLessOffset
- * @return number of words needed to store the highest value
+ /**
+ * For FSBagIndex low level iterator use
+ * @param position - get the element at this position
+ * @return the element
*/
- private int getBitSetSpaceFromMaxInt(int maxKeyLessOffset, int spaceUsed) {
- int w64 = 1 + (maxKeyLessOffset >> 6); // 64 bits per long, add one because 0 to 63 takes one word)
- spaceUsed = spaceUsed >> 5; // in # of long words, * 2 because the alloc doubles the space
-
- int newSpace = Math.max(spaceUsed, w64) << 1; // size is in 32 bit words at the end
- // e.g., 31 key => 1 word, 32 key => 2 words
- return newSpace + 2; // 2 more for IntBitSet object overhead
- }
+ int get(int position);
/**
- * Only called if key won't fit in existing allocation
- *
- * returns new hash table size ( usually double) + overhead
- *
- * @param numberOfEntries
- * @return
+ * For FSBagIndex low level iterator use
+ * @return the position of the first element, or -1;
*/
- private int getHashSetSpace() {
- return intHashSet.getSpaceUsedInWords() * 2 + IntHashSet.getSpaceOverheadInWords();
- }
+ int moveToFirst();
/**
- * When converting from bitset to hash set, the initial hash set should be
- * big enough so that it takes a while before it is doubled-expanded.
- *
- * @param existingSize
- * @return
+ * For FSBagIndex low level iterator use
+ * @return the position of the last element, or -1;
*/
- private int getHashSetOverAllocateSize(int existingSize) {
- return existingSize + (existingSize >> HASH_SET_OVERALLOCATE_MULTIPLIER) + HASH_SET_OVERALLOCATE;
- }
+ int moveToLast();
/**
- * For the case where the intHashSet doesn't yet exist
- * @param numberOfElements -
- * @return the size in words
- */
- private int getHashSetSpace(int numberOfElements) {
- return IntHashSet.tableSpace(numberOfElements, IntHashSet.DEFAULT_LOAD_FACTOR) +
- IntHashSet.getSpaceOverheadInWords();
- }
+ * For FSBagIndex low level iterator use
+ * @return the position of the next element, or -1;
+ */
+ int moveToNext(int position);
/**
- * logic for switching representation
- *
- * BitSet preferred - is faster, can be more compact, and permits ordered iteration
- * HashSet used when BitSet is too space inefficient.
- *
- * Avoid switching if the new key being added won't increase the representation size
- * - for hash: if number of elements +1 won't cause doubling of the hash table
- * - for bitset: if key will fit in existing "capacity"
- *
- * Compute space needed after adding
- * - bitset: array doubles
- * - hashset: array doubles
- *
- * Preventing jitters back and forth:
- * - removes don't cause switching
- * - compare against other size including its non-linear space jumps.
+ * For FSBagIndex low level iterator use
+ * @return the position of the next element, or -1;
*/
-
- private void maybeSwitchRepresentation(int key) {
- if (isBitSet) {
- /********************
- * Bit Set
- ********************/
-
- /*********
- * Size is 0,
- * use this opportunity to set the offset
- * unless that has been a bad strategy for this
- * instance already
- ********/
- if (intBitSet == null) {
- if (useOffset) {
-
- // initialize bitSetOffset to the key, minus some amount to allow some storage of previous values
- // a minimum of 63, a maximum of 512 == 16 words, when key is > 4K (128 words)
- final int spaceForLesserKeys = Math.min(1023, Math.max(63, key >> 3));
- final int offset = Math.max(0, key - spaceForLesserKeys);
- intBitSet = new IntBitSet(BIT_SET_OVERALLOCATE + key - offset, offset);
- // because of offset, won't have to switch to hash for this key
- //
- // force an initial allocation of this sufficient to keep
- // a) the space below, plus
- // b)
- return;
- } else {
- intBitSet = new IntBitSet(BIT_SET_OVERALLOCATE + key);
- // don't return, see if we should immediately switch to hash representation
- }
- }
-
- final int offset = intBitSet.getOffset();
-
- if (key < offset) {
- adjustBitSetForLowerOffset(key);
- return;
- }
-
- // return if key fits in existing allocation
- final int spaceUsed = intBitSet.getSpaceUsed_in_bits();
- if ((key - offset) < spaceUsed) {
- return;
- }
+ int moveToPrevious(int position);
- // space used in words (32 bit words)
- final int bitSetSpaceNeeded = getBitSetSpaceFromMaxInt(key - offset, spaceUsed);
- final int hashSetOverAllocateSize = getHashSetOverAllocateSize(intBitSet.size() + 1);
- final int hashSetOverAllocateSpace = getHashSetSpace(hashSetOverAllocateSize);
- // switch if hashmap space plus hysteresis would be < bitmap space
- if (hashSetOverAllocateSpace < (bitSetSpaceNeeded - HYSTERESIS)) {
- switchToHashSet(hashSetOverAllocateSize);
- }
- return;
- }
-
- /********************
- * Hash Set
- ********************/
- // maybe switch to IntBitSet
- // space used in words (32 bit words)
- // don't bother to include the new element - it might not be "added"
- // if it was already there, and only serves to decrease hysteresis a bit
-
- if (intHashSet.wontExpand()) {
- return;
- }
-
- final int hashSetSpaceNeeded = getHashSetSpace();
-
- final int maxInt = intHashSet.getMostPositive();
- final int offset = useOffset ? intHashSet.getMostNegative() : 0;
- final int bitSetSpaceNeeded = getBitSetSpaceFromMaxInt(BIT_SET_OVERALLOCATE + maxInt - offset, 0);
-
- if (bitSetSpaceNeeded < (hashSetSpaceNeeded - HYSTERESIS)) {
- switchToBitSet(maxInt, offset);
- }
- }
-
- private void switchToHashSet(int size) {
- isBitSet = false;
- intHashSet = new IntHashSet(size);
- IntListIterator it = intBitSet.getIterator();
- while (it.hasNext()) {
- intHashSet.add(it.next());
- }
- intBitSet = null;
- }
-
- private void switchToBitSet(int maxInt, int offset) {
- isBitSet = true;
- intBitSet = new IntBitSet(BIT_SET_OVERALLOCATE + maxInt - offset, offset);
- IntListIterator it = intHashSet.getIterator();
- while (it.hasNext()) {
- intBitSet.add(it.next());
- }
- intHashSet = null;
- }
-
-}
+ /**
+ * For FSBagIndex low level iterator use
+ * @return true if the position is between the first and last element inclusive.
+ */
+ boolean isValid(int position);
+}
\ No newline at end of file
Copied: uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/PositiveIntSet_impl.java (from r1634141, uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/PositiveIntSet.java)
URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/PositiveIntSet_impl.java?p2=uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/PositiveIntSet_impl.java&p1=uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/PositiveIntSet.java&r1=1634141&r2=1636458&rev=1636458&view=diff
==============================================================================
--- uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/PositiveIntSet.java (original)
+++ uima/uimaj/trunk/uimaj-core/src/main/java/org/apache/uima/internal/util/PositiveIntSet_impl.java Mon Nov 3 22:15:41 2014
@@ -25,36 +25,66 @@ import java.util.NoSuchElementException;
/**
- * An set of non-zero ints.
+ * An set of non-zero integers, ability to iterate over them (possibly in a sorted way),
+ * with O(1) operations for adding, removing, and testing for contains.
*
- * Uses 2 1/2 different representations depending on the ratio of the max int stored and the total number of ints stored.
- * For large sparse ints, it uses the IntHashSet
- *
- * For more dense ints, it uses bitSets or bitSets with an offset
+ * Optimized for adding mostly increasing integers (with some space for less-than-first-one-added).
+ * - major uses: indexes of Feature Structures.
+ * Optimize for small sets
+ *
+ * Graceful degradation for completely random integer operations on large sets;
+ * keep O(1) operations, at the expense of extra space (< 3 x size).
+ *
+ * Uses several different representations depending on the range of ints stored and the total number of ints stored.
+ *
+ * Sizes: Tiny, medium, large
+ * Ranges: semi-knowable, unknowable;
+ * if semi-knowable, dense, small-range (< 65K), large-range
*
- * It switches when adding new members if the other representation is sufficiently smaller
+ * For all sizes,
+ * if dense, use IntBitSet (with offset)
+ *
+ * else
+ * For large, (implies large-range, too) use IntHashSet
+ *
+ * For medium,
+ * if small-range < 65K, use IntHashSet with offset
+ * else use IntHashSet
+ *
+ * Arrange switching between representations
+ * to occur infrequently, especially as cost of switching (size) grows
+ * Arrange checking for switching to occur infrequently, taking into account how underlying data structures grow
+ * (which is often by doubling)
+ * Switch when adding new member(s) if alternative representation is sufficiently smaller
+ *
*/
-public class PositiveIntSet {
+public class PositiveIntSet_impl implements PositiveIntSet {
// number of words a inthashset has to be better than intbitset to cause switch
private static final int HYSTERESIS = 16;
- // Extra space in bitset for initial capacity - can add an int up to that size
+ private static final int INT_SET_MAX_SIZE = 16;
+
+ private static final int HASH_SET_SHORT_MAX_SIZE = (1 << 16) - 1; // 65535
+
+ // Extra space in bitset for expansion - can add an int (- offset) up to that size
+ // Initial space = 2 words = 1 long.
// 64 *4 is 8 words, = 256.
- private static final int BIT_SET_OVERALLOCATE = 64 * 4;
+ private static final int BIT_SET_OVERALLOCATE = 64 * 4; // 8 words minimum
- // Extra space in hashset for initial capacity - can add a minimum of that many more members
- // without hitting the hash-set doubling.
- private static final int HASH_SET_OVERALLOCATE = 8;
// Extra space in hashset for initial capacity - when creating from bitmap set - multiplier of existing size
- private static final int HASH_SET_OVERALLOCATE_MULTIPLIER = 1; // bit shift right = divide by 2 = 50% of existing capacity
-
- private IntBitSet intBitSet = null;
+ private static final int HASH_SET_OVERALLOCATE_DIVIDER_SHIFT = 1; // bit shift right = divide by 2 = 50% of existing capacity
- private IntHashSet intHashSet = null;
+ private static final int[] EMPTY_INT_ARRAY = new int[0];
+
+ private PositiveIntSet intSet; // one of 3 representations: IntBitSet, IntHashSet, IntSet (based on IntVector)
//package private for test case
- boolean isBitSet = true;
+ boolean isBitSet = false;
+ boolean isIntSet = false;
+ boolean isHashSet = false;
+
+ boolean secondTimeShrinkable = false;
/**
* Set false once we find we have to reduce the bit offset
@@ -62,20 +92,90 @@ public class PositiveIntSet {
//package private for test case
boolean useOffset = true;
+ public PositiveIntSet_impl() {
+ this(0, 0, 0);
+ }
+
+ /**
+ * Set up a Positive Bit Set
+ * @param initialSize - if 0, don't allocate yet, wait for first add.
+ * If isBitSetDense, then this number is interpreted as
+ * the first int to be added, typically with an offset.
+ * The next two params are used only if initialSize is not 0.
+ * @param estMin - the estimated minimum int value to be added
+ * @param estMax - the estimated max int value to be added
+ * @param isBitSetDense - true if we should use a bit set
+ * @param offsetFlag - a number >= 0, the offset to use, or -1 to automatically
+ * calculate the offset based on the initialSize
+ */
+ public PositiveIntSet_impl(int initialSize, int estMin, int estMax) {
+ if (initialSize == 0) {
+ isBitSet = true; // try first as bit set with offset perhaps
+ return; // delay allocation until we know what the 1st int will be
+ }
+
+ if ((initialSize < 0) ||
+ (estMin < 0) ||
+ (estMax < estMin)) {
+ throw new IllegalArgumentException();
+ }
+
+ estMax = Math.max(initialSize, estMax); // because its a positive int set
+
+
+ // offsets are different for bit set and hash set
+ // for bit set, it's set to the estMin - capped 12.5% from the estMin (cap is 16 words)
+ // for hash set, if the size > 65K, it's ignored
+ // it's set to 3/4 up the "range" of est max - est min
+
+ int offsetBitSet = estimatedBitSetOffset(estMin, -1);
+
+ int offsetHashSet = getHashSetOffset(estMax, estMin);
+
+ int bitSetWordsNeeded = getBitSetSpaceFromRange(estMax - offsetBitSet, 0);
+ int hashSetWordsNeeded = getHashSetSpace(initialSize, estMax, estMin);
+
+ // no HYSTERESIS in test below - pick absolute smaller for initial alloc
+ if (bitSetWordsNeeded < hashSetWordsNeeded) {
+ allocateIntBitSet(estMax, estMin, offsetBitSet);
+ isBitSet = true;
+ return;
+ }
+
+ // choose between IntHashSet and IntSet based on size
+ if (initialSize <= INT_SET_MAX_SIZE) {
+ intSet = new IntSet(initialSize);
+ isIntSet = true;
+ return;
+ }
+
+ intSet = new IntHashSet(initialSize, offsetHashSet);
+ isHashSet = true;
+
+ }
+
+ @Override
+ public IntListIterator iterator() {
+ return getUnorderedIterator();
+ }
+
public IntListIterator getUnorderedIterator() {
- if (isBitSet) {
- return intBitSet.getIterator();
+ if (null == intSet) {
+ intSet = new IntSet(0);
}
- return intHashSet.getIterator();
+ return intSet.iterator();
}
public IntListIterator getOrderedIterator() {
if (isBitSet) {
- return intBitSet.getIterator();
+ if (null == intSet) {
+ return new IntListIteratorOverArray(EMPTY_INT_ARRAY);
+ }
+ return intSet.iterator();
}
int[] allValues = new int[size()];
- IntListIterator it = intHashSet.getIterator();
+ IntListIterator it = intSet.iterator();
int i = 0;
while (it.hasNext()) {
allValues[i++] = it.next();
@@ -130,36 +230,74 @@ public class PositiveIntSet {
}
}
+ /* (non-Javadoc)
+ * @see org.apache.uima.internal.util.PositiveIntSetI#clear()
+ */
+ @Override
public void clear() {
- if (isBitSet) {
- intBitSet.clear();
- } else {
- intHashSet.clear();
+ if (null != intSet) {
+ if (isBitSet) {
+ if (secondTimeShrinkable) {
+ secondTimeShrinkable = false;
+ IntBitSet ib = (IntBitSet) intSet;
+ int max = ib.getLargestMenber();
+ int offset = ib.getOffset();
+ intSet = new IntBitSet((max - offset) >> 1, offset);
+ } else {
+ intSet.clear();
+ secondTimeShrinkable = true;
+ }
+ } else {
+ intSet.clear();
+ }
}
}
+ /* (non-Javadoc)
+ * @see org.apache.uima.internal.util.PositiveIntSetI#contains(int)
+ */
+ @Override
public boolean contains(int key) {
- if (isBitSet) {
- return intBitSet.contains(key);
- } else {
- return intHashSet.contains(key);
- }
+ return (null != intSet) ? intSet.contains(key) : false;
}
- /**
- *
- * @param key -
- * @return true if this set did not already contain the specified element
+ @Override
+ public int find(int element) {
+ return (null != intSet) ? intSet.find(element) : -1;
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.uima.internal.util.PositiveIntSetI#add(int)
*/
+ @Override
public boolean add(int key) {
maybeSwitchRepresentation(key);
- if (isBitSet) {
- return intBitSet.add(key);
- } else {
- return intHashSet.add(key);
- }
+ return intSet.add(key);
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.uima.internal.util.PositiveIntSetI#remove(int)
+ */
+ @Override
+ public boolean remove(int key) {
+ return (null != intSet) ? intSet.remove(key) : false;
}
+ /* (non-Javadoc)
+ * @see org.apache.uima.internal.util.PositiveIntSetI#size()
+ */
+ @Override
+ public int size() {
+ return (null != intSet) ? intSet.size() : 0;
+ }
+
+// public void add(int[] vs) {
+// for (int i : vs) {
+// add(i);
+// }
+// }
+
+
/**
* When a key is added which is lower than the offset, we need to adjust the whole int set table.
* Although an array copy could do this, we don't have access to the bitset impl.
@@ -170,43 +308,36 @@ public class PositiveIntSet {
* @param key
*/
private void adjustBitSetForLowerOffset(int key) {
- final int largestInt = intBitSet.getLargestMenber();
- final int bitSetSpaceNeeded = getBitSetSpaceFromMaxInt(largestInt, 0); // doesn't use offset
- final int hashSetSpaceNeeded = getHashSetSpace(intBitSet.size());
+ final IntBitSet intBitSet = (IntBitSet) intSet;
+ final int size = intBitSet.size() + 1;
+ final int largestInt = intBitSet.getLargestMenber(); // not called if key > largest member
+ final int bitSetSpaceNeeded = getBitSetSpaceFromRange(largestInt, 0); // doesn't use offset
+ final int hashSetSpaceNeeded = getHashSetSpace(size, largestInt, key); // computes using load factor,
+ useOffset = false; // stop using because we have evidence it isn't appropriate
if (hashSetSpaceNeeded < (bitSetSpaceNeeded - HYSTERESIS)) {
- switchToHashSet(size());
+ int offset = getHashSetOffset(largestInt, key);
+ switchFromBitSet(size, offset);
+
+ // keep as bit set - it's smaller, but drop the offset
} else {
- IntBitSet bs = new IntBitSet(largestInt + BIT_SET_OVERALLOCATE);
- IntListIterator it = intBitSet.getIterator();
+ IntListIterator it = intSet.iterator();
+ allocateIntBitSet(largestInt, key, 0);
while (it.hasNext()) {
- bs.add(it.next());
+ intSet.add(it.next());
}
- intBitSet = bs;
- }
- useOffset = false; // stop using because we have evidence it isn't appropriate
- }
-
- public void add(int[] vs) {
- for (int i : vs) {
- add(i);
}
}
- public boolean remove(int key) {
- if (isBitSet) {
- return intBitSet.remove(key);
- } else {
- return intHashSet.remove(key);
- }
- }
-
- public int size() {
- if (isBitSet) {
- return intBitSet.size();
- } else {
- return intHashSet.size();
- }
+ private static int getHashSetOffset(int estMax, int estMin) {
+ int range = estMax - estMin;
+ assert (range >= 0);
+ return (range > HASH_SET_SHORT_MAX_SIZE) ?
+ Integer.MIN_VALUE : // signal to force use of 4 byte ints
+ // make the offset such that the 0 point is 3/4 of the way
+ // toward the top of the range, because there's more likelyhood
+ // of expansion in that direction
+ estMax - (range >> 2);
}
public int[] toUnorderedIntArray() {
@@ -229,26 +360,37 @@ public class PositiveIntSet {
}
/**
- * Only called if key won't fit in existing hashset allocation
+ * Only called if key won't fit in existing IntHashSet, IntSet or IntBitSet allocation
*
- * Using existing bitset, compute what its size() (capacity) would be if the
- * key were added; include the overhead of the intbitset class.
+ * If IntBitSet (spaceUsed != 0) compute what its size() (capacity) would be if the
+ * key were added (causing doubling); include the overhead of the intbitset class.
*
- * long words required = larger of double the current size, or the number of words required
+ * long words required = larger of double the current size, or the number of long words required
- * @param maxKeyLessOffset
- * @return number of words needed to store the highest value
+ * @param adjKey - the range of bits needed
+ * @param spaceUsed - the number of bits currently allocated (if currently a bit set), else 0
+ * @return number of words needed to store the range,
*/
- private int getBitSetSpaceFromMaxInt(int maxKeyLessOffset, int spaceUsed) {
- int w64 = 1 + (maxKeyLessOffset >> 6); // 64 bits per long, add one because 0 to 63 takes one word)
- spaceUsed = spaceUsed >> 5; // in # of long words, * 2 because the alloc doubles the space
+ private static int getBitSetSpaceFromRange(int adjKey, int spaceUsed) {
+ int w64 = 1 + (adjKey >> 6); // 64 bits per long, add one because 0 to 63 takes one word)
+ spaceUsed = spaceUsed >> (6 - 1); // in # of long words, * 2 because the alloc doubles the space
- int newSpace = Math.max(spaceUsed, w64) << 1; // size is in 32 bit words at the end
+ int newSpace = Math.max(spaceUsed, w64) << 1; // <<1 to convert to # of words from # of longs
// e.g., 31 key => 1 word, 32 key => 2 words
return newSpace + 2; // 2 more for IntBitSet object overhead
}
/**
+ *
+ * @param size including new element to be added
+ * @return number of words including overhead to store that many elements
+ * unless is bigger than INT_SET_MAX_SIZE, then return MAX_VALUE
+ */
+ private static int getIntSetSpace(int size) {
+ return (size < INT_SET_MAX_SIZE) ? size + 4 : Integer.MAX_VALUE;
+ }
+
+ /**
* Only called if key won't fit in existing allocation
*
* returns new hash table size ( usually double) + overhead
@@ -257,28 +399,40 @@ public class PositiveIntSet {
* @return
*/
private int getHashSetSpace() {
- return intHashSet.getSpaceUsedInWords() * 2 + IntHashSet.getSpaceOverheadInWords();
+ return ((IntHashSet)intSet).getSpaceUsedInWords() * 2 + IntHashSet.getSpaceOverheadInWords();
}
/**
+ * For the case where the intHashSet doesn't yet exist
+ * @param numberOfElements -
+ * @param estMax - the largest int to be stored
+ * @param estMin - the smallest int to be stored
+ * @return the size in words
+ */
+ private static int getHashSetSpace(int numberOfElements, int estMax, int estMin) {
+ boolean isShort;
+ if (numberOfElements > HASH_SET_SHORT_MAX_SIZE) {
+ isShort = false;
+ } else {
+ isShort = (estMax - estMin) < HASH_SET_SHORT_MAX_SIZE;
+ }
+ int numberOfTableElements = IntHashSet.tableSpace(numberOfElements, IntHashSet.DEFAULT_LOAD_FACTOR);
+ return (numberOfTableElements >> ((isShort) ? 1 : 0))
+ + IntHashSet.getSpaceOverheadInWords();
+ }
+
+ /**
* When converting from bitset to hash set, the initial hash set should be
* big enough so that it takes a while before it is doubled-expanded.
*
+ * Reduce overallocation for small sizes because the cost to switch is low, and the % overallocate
+ * could be a large.
+ *
* @param existingSize
* @return
*/
- private int getHashSetOverAllocateSize(int existingSize) {
- return existingSize + (existingSize >> HASH_SET_OVERALLOCATE_MULTIPLIER) + HASH_SET_OVERALLOCATE;
- }
-
- /**
- * For the case where the intHashSet doesn't yet exist
- * @param numberOfElements -
- * @return the size in words
- */
- private int getHashSetSpace(int numberOfElements) {
- return IntHashSet.tableSpace(numberOfElements, IntHashSet.DEFAULT_LOAD_FACTOR) +
- IntHashSet.getSpaceOverheadInWords();
+ private static int getHashSetOverAllocateSize(int existingSize) {
+ return existingSize + (existingSize >> HASH_SET_OVERALLOCATE_DIVIDER_SHIFT);
}
/**
@@ -299,104 +453,265 @@ public class PositiveIntSet {
* - removes don't cause switching
* - compare against other size including its non-linear space jumps.
*/
-
- private void maybeSwitchRepresentation(int key) {
+
+ private void maybeSwitchRepresentation(int newKey) {
if (isBitSet) {
- /********************
- * Bit Set
- ********************/
-
+ handleBitSet(newKey);
+ return;
+ }
+ if (isIntSet) {
+ handleIntSet(newKey);
+ return;
+ }
+ handleHashSet(newKey);
+ }
+
+
+ /********************
+ * Bit Set
+ ********************/
+ private void handleBitSet(int newKey) {
+ IntBitSet intBitSet = (IntBitSet) intSet;
+
+ if (intBitSet == null) {
/*********
* Size is 0,
* use this opportunity to set the offset
* unless that has been a bad strategy for this
* instance already
********/
- if (intBitSet == null) {
- if (useOffset) {
-
- // initialize bitSetOffset to the key, minus some amount to allow some storage of previous values
- // a minimum of 63, a maximum of 512 == 16 words, when key is > 4K (128 words)
- final int spaceForLesserKeys = Math.min(1023, Math.max(63, key >> 3));
- final int offset = Math.max(0, key - spaceForLesserKeys);
- intBitSet = new IntBitSet(BIT_SET_OVERALLOCATE + key - offset, offset);
- // because of offset, won't have to switch to hash for this key
- //
- // force an initial allocation of this sufficient to keep
- // a) the space below, plus
- // b)
- return;
- } else {
- intBitSet = new IntBitSet(BIT_SET_OVERALLOCATE + key);
- // don't return, see if we should immediately switch to hash representation
- }
- }
-
- final int offset = intBitSet.getOffset();
-
- if (key < offset) {
- adjustBitSetForLowerOffset(key);
- return;
- }
-
- // return if key fits in existing allocation
- final int spaceUsed = intBitSet.getSpaceUsed_in_bits();
- if ((key - offset) < spaceUsed) {
+ boolean guaranteedFits = allocateIntBitSet(newKey, newKey, -1);
+ intBitSet = (IntBitSet) intSet;
+ if (guaranteedFits) {
return;
}
+ }
+
+ final int offset = intBitSet.getOffset();
+
+ if (newKey < offset) {
+ // may change representation; always abandons this bit set
+ adjustBitSetForLowerOffset(newKey);
+ return;
+ }
+
+ // return if newKey fits in existing allocation
+ final int spaceUsed = intBitSet.getSpaceUsed_in_bits();
+ final int adjKey = newKey - offset;
+ if (adjKey < spaceUsed) {
+ return;
+ }
+
+
+ // this newKey doesn't fit in existing bit set allocation; it's too large
+ // figure out if we should expand this bit set or switch representations
- // space used in words (32 bit words)
- final int bitSetSpaceNeeded = getBitSetSpaceFromMaxInt(key - offset, spaceUsed);
- final int hashSetOverAllocateSize = getHashSetOverAllocateSize(intBitSet.size() + 1);
- final int hashSetOverAllocateSpace = getHashSetSpace(hashSetOverAllocateSize);
- // switch if hashmap space plus hysteresis would be < bitmap space
- if (hashSetOverAllocateSpace < (bitSetSpaceNeeded - HYSTERESIS)) {
- switchToHashSet(hashSetOverAllocateSize);
- }
- return;
- }
+ final int bitSetSpaceNeeded = getBitSetSpaceFromRange(adjKey, spaceUsed); // in 32 bit words
+ final int sizeNeeded = intBitSet.size() + 1;
- /********************
- * Hash Set
- ********************/
+ if (getIntSetSpace(sizeNeeded) < bitSetSpaceNeeded) {
+ switchToIntSet(sizeNeeded);
+ return;
+ }
+
+ // when computing hashset size, overallocate by 50% to avoid immediately doubling
+ final int hashSetOverAllocateSize = getHashSetOverAllocateSize(sizeNeeded);
+ final int hashSetOverAllocateSpace = getHashSetSpace(hashSetOverAllocateSize, newKey, offset);
+ // switch if hashmap space plus hysteresis would be < bitmap space
+ if (hashSetOverAllocateSpace < (bitSetSpaceNeeded - HYSTERESIS)) {
+ switchToHashSet(hashSetOverAllocateSize, getHashSetOffset(newKey, offset));
+ }
+ return;
+ }
+
+ /********************
+ * IntSet *
+ ********************/
+ private void handleIntSet(int newKey) {
+ IntSet is = (IntSet) intSet;
+ final int size = is.size() + 1;
+ if (size < INT_SET_MAX_SIZE) {
+ return;
+ }
+
+
+ // switch to bit or IntHash
+
+ // first, compute max and min values
+ int mostPos = Integer.MIN_VALUE;
+ int mostNeg = Integer.MAX_VALUE;
+ for (int i = 0; i < size - 1; i++) {
+ final int v = is.get(i);
+ if (v > mostPos) { mostPos = v;}
+ if (v < mostNeg) { mostNeg = v;}
+ }
+ // include the newKey in the computation of these
+ if (newKey > mostPos) { mostPos = newKey;}
+ if (newKey < mostNeg) { mostNeg = newKey;}
+
+ // make a bit set or hash set.
+ // have 16 members, so the hash set is approx 32 + 11 words = 43.
+ // bit set is approx 2 + range / 32
+ final int bitSetSpace = getBitSetSpaceFromRange(mostPos - mostNeg, 0);
+ final int hashSetOverAllocSize = getHashSetOverAllocateSize(size);
+ final int hashSetSpace = getHashSetSpace(hashSetOverAllocSize, mostPos, mostNeg);
+ if (bitSetSpace < hashSetSpace) {
+ switchToBitSet(mostPos, mostNeg, estimatedBitSetOffset(mostNeg, -1));
+ return;
+ }
+ switchToHashSet(hashSetOverAllocSize, getHashSetOffset(mostPos, mostNeg));
+ }
+
+ /********************
+ * Hash Set
+ ********************/
+ private void handleHashSet(int newKey) {
// maybe switch to IntBitSet
// space used in words (32 bit words)
// don't bother to include the new element - it might not be "added"
// if it was already there, and only serves to decrease hysteresis a bit
-
+ final IntHashSet intHashSet = (IntHashSet) intSet;
if (intHashSet.wontExpand()) {
return;
}
- final int hashSetSpaceNeeded = getHashSetSpace();
+ final int hashSetSpaceNeeded = getHashSetSpace(); //computes the doubline of space
final int maxInt = intHashSet.getMostPositive();
final int offset = useOffset ? intHashSet.getMostNegative() : 0;
- final int bitSetSpaceNeeded = getBitSetSpaceFromMaxInt(BIT_SET_OVERALLOCATE + maxInt - offset, 0);
+ final int bitSetSpaceNeeded = getBitSetSpaceFromRange(BIT_SET_OVERALLOCATE + maxInt - offset, 0);
if (bitSetSpaceNeeded < (hashSetSpaceNeeded - HYSTERESIS)) {
- switchToBitSet(maxInt, offset);
+ switchToBitSet(maxInt, intHashSet.getMostNegative(), offset);
}
}
+
+ /**
+ * Allocate a new IntBitSet
+ * @param estMax the largest int to store in the bit set - determines the initial size
+ * @param estMin this is a lower bound on the value that can be in the bit set
+ * @param offsetSpec this is the offset to use, or -1, to calculate the offset from
+ * the initial_size (assuming a small amount of adds will be for
+ * ints somewhat less than the initial_size entry (being treated
+ * as the first int to be added)
+ * @return true if allocated with offset, implying guarantee the estMax key fits.
+ */
+ private boolean allocateIntBitSet(int estMax, int estMin, int offsetSpec) {
+ isBitSet = true;
+ isHashSet = isIntSet = false;
+ if (useOffset) {
+
+ // initialize bitSetOffset to the key, minus some amount to allow some storage of previous values
+ // a minimum of 63, a maximum of 512 == 16 words, when key is > 4K (128 words)
+ final int offset = estimatedBitSetOffset(estMin, offsetSpec);
+ intSet = new IntBitSet(BIT_SET_OVERALLOCATE + estMax - offset, offset);
+ // because of offset, won't have to switch to hash for this key
+ return true;
+ } else {
+ intSet = new IntBitSet(BIT_SET_OVERALLOCATE + estMax);
+ return false;
+ }
+ }
+
+ private static int estimatedBitSetOffset(int estMin, int offsetSpec) {
+ return // initialize bitSetOffset to the key, minus some amount to allow some storage of previous values
+ // a minimum of 63, a maximum of 512 == 16 words, when key is > 4K (128 words)
+ (offsetSpec == -1) ?
+ Math.max(0, estMin - Math.min(1023, Math.max(63, estMin >> 3))) :
+ offsetSpec;
+ }
+
+ /**
+ * switching from bit set to either IntSet or IntHashSet
+ * @param size - space needed including new item
+ * @param offset - for IntHashSet values kept as short, the offset subtracted from values before storing them
+ * offset == Integer.MIN_VALUE means use 4 byte ints
+ */
+ private void switchFromBitSet(int size, int offset) {
+ if (size < INT_SET_MAX_SIZE) {
+ switchToIntSet(size);
+ return;
+ }
+ switchToHashSet(size, offset);
+ }
+
+ /**
+ * switch to IntSet
+ * @param size number of elements to be able to store without expanding
+ */
+ private void switchToIntSet(int size) {
+ IntListIterator it = intSet.iterator();
+
+ intSet = new IntSet(size);
+ isIntSet = true;
+ isBitSet = isHashSet = false;
- private void switchToHashSet(int size) {
- isBitSet = false;
- intHashSet = new IntHashSet(size);
- IntListIterator it = intBitSet.getIterator();
while (it.hasNext()) {
- intHashSet.add(it.next());
+ intSet.add(it.next());
}
- intBitSet = null;
+ return;
}
- private void switchToBitSet(int maxInt, int offset) {
- isBitSet = true;
- intBitSet = new IntBitSet(BIT_SET_OVERALLOCATE + maxInt - offset, offset);
- IntListIterator it = intHashSet.getIterator();
+ /**
+ * Switch from any intSet impl to Hash Set
+ * @param size - the capacity - this many elements could be stored before swtiching
+ * @param offset - used only when the size < 65K, and is a value subtracted from elements before
+ * storing them, in an attempt to fit the results into just "short"s instead of "int"s
+ * if == MIN_VALUE, then force 4 byte ints
+ */
+ private void switchToHashSet(int size, int offset) {
+ IntListIterator it = intSet.iterator();
+
+ intSet = new IntHashSet(size);
+ isIntSet = isBitSet = false;
+ isHashSet = true;
+
+ while (it.hasNext()) {
+ intSet.add(it.next());
+ }
+ }
+
+ private void switchToBitSet(int estMax, int estMin, int offset) {
+ final IntListIterator it = intSet.iterator();
+
+ allocateIntBitSet(estMax, estMin, offset);
+
while (it.hasNext()) {
- intBitSet.add(it.next());
+ intSet.add(it.next());
+ }
+ }
+
+ @Override
+ public int get(int position) {
+ if (null == intSet) {
+ throw new NoSuchElementException();
}
- intHashSet = null;
+ return intSet.get(position);
+ }
+
+ @Override
+ public int moveToFirst() {
+ return (null == intSet) ? -1 : intSet.moveToFirst();
+ }
+
+ @Override
+ public int moveToLast() {
+ return (null == intSet) ? -1 : intSet.moveToLast();
+ }
+
+ @Override
+ public int moveToNext(int position) {
+ return (null == intSet) ? -1 : intSet.moveToNext(position);
+ }
+
+ @Override
+ public int moveToPrevious(int position) {
+ return (null == intSet) ? -1 : intSet.moveToPrevious(position);
}
-
+
+ @Override
+ public boolean isValid(int position) {
+ return (null == intSet) ? false : intSet.isValid(position);
+ }
+
}
Modified: uima/uimaj/trunk/uimaj-core/src/test/java/org/apache/uima/internal/util/IntBitSetTest.java
URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/test/java/org/apache/uima/internal/util/IntBitSetTest.java?rev=1636458&r1=1636457&r2=1636458&view=diff
==============================================================================
--- uima/uimaj/trunk/uimaj-core/src/test/java/org/apache/uima/internal/util/IntBitSetTest.java (original)
+++ uima/uimaj/trunk/uimaj-core/src/test/java/org/apache/uima/internal/util/IntBitSetTest.java Mon Nov 3 22:15:41 2014
@@ -37,7 +37,7 @@ public class IntBitSetTest extends TestC
ibs.add(15);
ibs.add(188);
- IntListIterator it = ibs.getIterator();
+ IntListIterator it = ibs.iterator();
assertTrue(it.hasNext());
assertEquals(15, it.next());
assertTrue(it.hasNext());
@@ -50,7 +50,7 @@ public class IntBitSetTest extends TestC
ibs.add(1015);
ibs.add(1188);
- it = ibs.getIterator();
+ it = ibs.iterator();
assertTrue(it.hasNext());
assertEquals(1015, it.next());
assertTrue(it.hasNext());
@@ -89,7 +89,7 @@ public class IntBitSetTest extends TestC
ibs.remove(188);
assertEquals(101, ibs.getLargestMenber());
assertEquals(2,ibs.size());
- IntListIterator it = ibs.getIterator();
+ IntListIterator it = ibs.iterator();
assertTrue(it.hasNext());
assertEquals(15, it.next());
assertTrue(it.hasNext());
@@ -113,4 +113,18 @@ public class IntBitSetTest extends TestC
assertEquals(2,ibs.size());
}
+
+ public void testIterator() {
+ ibs = new IntBitSet(63, 1000);
+ for (int i = 0; i < 10; i = i + 4) {
+ ibs.add(1000 + i);
+ }
+
+ IntListIterator it = ibs.iterator();
+ for (int i = 0; i < 10; i += 4) {
+ assertTrue(it.hasNext());
+ assertEquals(1000 + i, it.next());
+ }
+ assertFalse(it.hasNext());
+ }
}
Modified: uima/uimaj/trunk/uimaj-core/src/test/java/org/apache/uima/internal/util/IntHashSetTest.java
URL: http://svn.apache.org/viewvc/uima/uimaj/trunk/uimaj-core/src/test/java/org/apache/uima/internal/util/IntHashSetTest.java?rev=1636458&r1=1636457&r2=1636458&view=diff
==============================================================================
--- uima/uimaj/trunk/uimaj-core/src/test/java/org/apache/uima/internal/util/IntHashSetTest.java (original)
+++ uima/uimaj/trunk/uimaj-core/src/test/java/org/apache/uima/internal/util/IntHashSetTest.java Mon Nov 3 22:15:41 2014
@@ -64,7 +64,7 @@ public class IntHashSetTest extends Test
}
private int[] getSortedValues(IntHashSet s) {
- IntListIterator it = s.getIterator();
+ IntListIterator it = s.iterator();
int[] r = new int[s.size()];
int i = 0;
while (it.hasNext()) {
@@ -91,7 +91,7 @@ public class IntHashSetTest extends Test
public void testWontExpand() {
ihs = new IntHashSet(21);
- assertEquals(32, ihs.getSpaceUsedInWords());
+ assertEquals(16, ihs.getSpaceUsedInWords());
assertTrue(ihs.wontExpand(20));
assertFalse(ihs.wontExpand(21));
}