You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by sc...@apache.org on 2015/11/01 16:39:23 UTC
svn commit: r1711818 -
/uima/uimaj/branches/experiment-v3-jcas/uimaj-core/src/main/java/org/apache/uima/internal/util/ObjHashSet.java
Author: schor
Date: Sun Nov 1 15:39:23 2015
New Revision: 1711818
URL: http://svn.apache.org/viewvc?rev=1711818&view=rev
Log:
[UIMA-4664]a hash set that supports shrinking and a more efficient representation than typical Java library implementations.
Added:
uima/uimaj/branches/experiment-v3-jcas/uimaj-core/src/main/java/org/apache/uima/internal/util/ObjHashSet.java
Added: uima/uimaj/branches/experiment-v3-jcas/uimaj-core/src/main/java/org/apache/uima/internal/util/ObjHashSet.java
URL: http://svn.apache.org/viewvc/uima/uimaj/branches/experiment-v3-jcas/uimaj-core/src/main/java/org/apache/uima/internal/util/ObjHashSet.java?rev=1711818&view=auto
==============================================================================
--- uima/uimaj/branches/experiment-v3-jcas/uimaj-core/src/main/java/org/apache/uima/internal/util/ObjHashSet.java (added)
+++ uima/uimaj/branches/experiment-v3-jcas/uimaj-core/src/main/java/org/apache/uima/internal/util/ObjHashSet.java Sun Nov 1 15:39:23 2015
@@ -0,0 +1,559 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.internal.util;
+
+import java.lang.reflect.Array;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+import org.apache.uima.cas.FeatureStructure;
+import org.apache.uima.cas.impl.FeatureStructureImplC;
+import org.apache.uima.util.Misc;
+
+/**
+ * A set of Objects of type T
+
+ * This impl is for use in a single thread case only, when table is being updated.
+ * Multiple reader threads are OK if there's no writing.
+ *
+ * Supports shrinking (reallocating the big table)
+ *
+ * Removed objects replaced with special marker object in the table
+ * so find operations continue to work (they can't stop upon finding this object).
+ *
+ */
+public class ObjHashSet<T> implements Collection<T>{
+
+ public static final float DEFAULT_LOAD_FACTOR = 0.66F;
+ // set to true to collect statistics for tuning
+ // you have to also put a call to showHistogram() at the end of the run
+ private static final boolean TUNE = true;
+
+ private final static FeatureStructureImplC REMOVED = new FeatureStructureImplC();
+
+ private final Class<T> clazz; // for rieifying the T type
+
+ private final float loadFactor = DEFAULT_LOAD_FACTOR;
+
+ private final int initialCapacity;
+
+ private int histogram [];
+ private int maxProbe = 0;
+
+ private int sizeWhichTriggersExpansion;
+ private int size; // number of elements in the table
+ private int nbrRemoved; // number of removed elements (coded as removed)
+
+ // the actual Object table
+ private T [] keys;
+
+ private boolean secondTimeShrinkable = false;
+
+ public ObjHashSet(Class<T> clazz) {
+ this(12, clazz); // default initial size
+ }
+
+ /**
+ * @param initialCapacity - you can add this many before expansion
+ */
+ public ObjHashSet(int initialCapacity, Class<T> clazz) {
+ this.clazz = clazz;
+ this.initialCapacity = initialCapacity;
+ newTableKeepSize(initialCapacity);
+ size = 0;
+ if (TUNE) {
+ histogram = new int[200];
+ Arrays.fill(histogram, 0);
+ maxProbe = 0;
+ }
+ }
+
+ private void newTableKeepSize(int capacity) {
+ keys = (T[]) Array.newInstance(clazz, capacity);
+ sizeWhichTriggersExpansion = (int)(capacity * loadFactor);
+ nbrRemoved = 0;
+ }
+
+ private void incrementSize() {
+ if (size + nbrRemoved >= sizeWhichTriggersExpansion) {
+ increaseTableCapacity();
+ }
+ size++;
+ }
+
+// public boolean wontExpand() {
+// return wontExpand(1);
+// }
+//
+// public boolean wontExpand(int n) {
+// return (size + nbrRemoved + n) < sizeWhichTriggersExpansion;
+// }
+
+ public int getCapacity() {
+ return keys.length;
+ }
+
+ /**
+ * This may not increase the table capacity, but may just
+ * clean out the REMOVED items
+ */
+ private void increaseTableCapacity() {
+ final int oldCapacity = getCapacity();
+ // keep same capacity if just removing the "removed" markers would
+ // shrink the used slots to the same they would have been had there been no removed, and
+ // the capacity was doubled.
+ final int newCapacity = (nbrRemoved > size) ? oldCapacity : oldCapacity << 1;
+
+ final T [] oldKeys = keys;
+ if (TUNE) {System.out.println("Capacity increasing from " + oldCapacity + " to " + newCapacity);}
+ newTableKeepSize(newCapacity);
+ for (T key : oldKeys) {
+ if (key != null && key != REMOVED) {
+ addInner(key);
+ }
+ }
+ }
+
+ // called by clear
+ private void newTable(int capacity) {
+ newTableKeepSize(capacity);
+ resetTable();
+ }
+
+ private void resetHistogram() {
+ if (TUNE) {
+ histogram = new int[200];
+ Arrays.fill(histogram, 0);
+ maxProbe = 0;
+ }
+ }
+
+ private void resetArray() {
+ Arrays.fill(keys, null);
+ resetTable();
+ }
+
+ private void resetTable() {
+ resetHistogram();
+ size = 0;
+ }
+
+ @Override
+ public void clear() {
+ // see if size is less than the 1/2 size that triggers expansion
+ if (size < (sizeWhichTriggersExpansion >>> 1)) {
+ // if 2nd time then shrink by 50%
+ // this is done to avoid thrashing around the threshold
+ if (secondTimeShrinkable) {
+ secondTimeShrinkable = false;
+ final int currentCapacity = getCapacity();
+ final int newCapacity = Math.max(initialCapacity, currentCapacity >>> 1);
+ if (newCapacity < currentCapacity) {
+ newTable(newCapacity); // shrink table by 50%
+ } else { // don't shrink below minimum
+ resetArray();
+ }
+ return;
+
+ } else {
+ secondTimeShrinkable = true;
+ }
+ } else {
+ secondTimeShrinkable = false; // reset this to require 2 triggers in a row
+ }
+ resetArray();
+ }
+
+ /**
+ * returns a position in the key/value table
+ * if the key is not found, then the position will be to the
+ * place where the key would be put upon adding (without reusing REMOVED, and the
+ * current internal value of keys[position] would be 0.
+ *
+ * if the key is found, then keys[position] == key
+ * @param obj the object to find
+ * @return the probeAddr in keys array - might reference a slot holding null, or the key value if found
+ */
+ private int findPosition(final T obj) {
+ return findPosition(obj, false);
+ }
+
+ private int findPosition(final T obj, final boolean includeRemoved) {
+ if (obj == null) {
+ throw new IllegalArgumentException("null is an invalid key");
+ }
+
+ final int hash = Misc.hashInt(obj.hashCode());
+ int nbrProbes = 1;
+ int probeDelta = 1;
+ int probeAddr;
+
+ final T[] localKeys = keys;
+ final int bitMask = localKeys.length - 1;
+ probeAddr = hash & bitMask;
+ while (true) {
+ final T testKey = localKeys[probeAddr];
+ if (testKey == null || testKey.equals(obj) || (includeRemoved && testKey == REMOVED)) {
+ break;
+ }
+
+ nbrProbes++;
+ probeAddr = bitMask & (probeAddr + (probeDelta++));
+ }
+
+ if (TUNE) {
+ final int pv = histogram[nbrProbes];
+
+ histogram[nbrProbes] = 1 + pv;
+ if (maxProbe < nbrProbes) {
+ maxProbe = nbrProbes;
+ }
+ }
+
+ return probeAddr;
+ }
+
+ @Override
+ public boolean contains(Object obj) { // arg must be Object to fit Collection API
+ return (clazz.isAssignableFrom(obj.getClass())) ? (find((T) obj) != -1) : false;
+ }
+
+ /**
+ * @param obj the object to find in the table (if it is there)
+ * @return the position of obj in the table, or -1 if not in the table
+ */
+ public int find(T obj) {
+ if (obj == null || size == 0) {
+ return -1;
+ }
+
+ final int pos = findPosition(obj);
+ return (keys[pos] == obj) ? pos : -1;
+ }
+
+ /**
+ *
+ * @param obj - the object to add
+ * @return true if this set did not already contain the specified element
+ */
+ @Override
+ public boolean add(T obj) {
+ if (obj == null) {
+ throw new IllegalArgumentException("argument must be non-null");
+ }
+
+ final int i = findPosition(obj, true); // include REMOVED
+ if (keys[i] == obj) {
+ return false; // false if already present
+ }
+ if (keys[i] == REMOVED) {
+ nbrRemoved --;
+ assert (nbrRemoved >= 0);
+ }
+ keys[i] = obj;
+ incrementSize();
+ return true;
+ }
+
+ /**
+ * used for increasing table size
+ * @param rawKey
+ */
+ private void addInner(T obj) {
+ final int i = findPosition(obj);
+ assert(keys[i] == null);
+ keys[i] = obj;
+ }
+
+ /**
+ * Can't replace the item with a null because other keys that were
+ * stored in the table which previously collided with the removed item
+ * won't be found. UIMA-4204
+ * @param rawKey the value to remove
+ * @return true if the key was present
+ */
+ @Override
+ public boolean remove(Object obj) {
+ if (obj == null) {
+ return false;
+ }
+
+ final int pos = findPosition((T) obj); // null or equal obj
+
+ return (obj.equals(keys[pos])) ? removeAtPosition(pos) : false;
+ }
+
+ private boolean removeAtPosition(int pos) {
+ // found, remove it
+ keys[pos] = (T) REMOVED; // at runtime, this cast is a no-op
+ size--;
+ nbrRemoved ++;
+ return true;
+ }
+
+ @Override
+ public int size() {
+ return size;
+ }
+
+ public void showHistogram() {
+ if (TUNE) {
+ int sumI = 0;
+ for (int i : histogram) {
+ sumI += i;
+ }
+
+ System.out.format(
+ "Histogram of number of probes, loadfactor = %.1f, maxProbe=%,d nbr of find operations at last table size=%,d%n",
+ loadFactor, maxProbe, sumI);
+ for (int i = 0; i <= maxProbe; i++) {
+ if (i == maxProbe && histogram[i] == 0) {
+ System.out.println("huh?");
+ }
+ System.out.println(i + ": " + histogram[i]);
+ }
+
+ System.out.println("bytes / entry = " + (float) (keys.length) * 4 / size());
+ System.out.format("size = %,d, prevExpansionTriggerSize = %,d, next = %,d%n",
+ size(),
+ (int) ((keys.length >>> 1) * loadFactor),
+ (int) (keys.length * loadFactor));
+ }
+ }
+
+ /**
+ */
+ /**
+ * For iterator use
+ * @param index - a magic number returned by the internal find
+ * @return the T at that spot, or null if nothing there
+ */
+ public T get(int index) {
+ T obj = keys[index];
+ if (obj == null || obj == REMOVED) {
+ return null; // null, not present
+ }
+ return obj;
+ }
+
+ /**
+ * advance pos until it points to a non 0 or is 1 past end
+ * @param pos
+ * @return updated pos
+ */
+ public int moveToNextFilled(int pos) {
+ if (pos < 0) {
+ pos = 0;
+ }
+
+ final int max = getCapacity();
+ while (true) {
+ if (pos >= max) {
+ return pos;
+ }
+ T v = get(pos);
+ if (v != null && v != REMOVED) {
+ return pos;
+ }
+ pos ++;
+ }
+ }
+
+ /**
+ * decrement pos until it points to a non 0 or is -1
+ * @param pos
+ * @return updated pos
+ */
+ public int moveToPreviousFilled(int pos) {
+ final int max = getCapacity();
+ if (pos > max) {
+ pos = max - 1;
+ }
+
+ while (true) {
+ if (pos < 0) {
+ return pos;
+ }
+ T v = get(pos);
+ if (v != null && v != REMOVED) {
+ return pos;
+ }
+ pos --;
+ }
+ }
+
+ private class ObjHashSetIterator implements Iterator<T> {
+
+ protected int curPosition;
+
+ private ObjHashSetIterator() {
+ this.curPosition = 0;
+ }
+
+ @Override
+ public final boolean hasNext() {
+ curPosition = moveToNextFilled(curPosition);
+ return curPosition < getCapacity();
+ }
+
+ @Override
+ public final T next() {
+ if (!hasNext()) {
+ throw new NoSuchElementException();
+ }
+ return get(curPosition++);
+ }
+
+ @Override
+ public void remove() {
+ removeAtPosition(curPosition - 1); // only valid following a hasNext()
+ }
+ }
+
+
+ @Override
+ public Iterator<T> iterator() {
+ return new ObjHashSetIterator();
+ }
+
+ public int moveToFirst() {
+ return (size() == 0) ? -1 : moveToNextFilled(0);
+ }
+
+ public int moveToLast() {
+ return (size() == 0) ? -1 : moveToPreviousFilled(getCapacity() -1);
+ }
+
+ public int moveToNext(int position) {
+ if (position < 0) {
+ return position;
+ }
+ final int n = moveToNextFilled(position + 1);
+ return (n >= getCapacity()) ? -1 : n;
+ }
+
+ public int moveToPrevious(int position) {
+ if (position >= getCapacity()) {
+ return -1;
+ }
+ return moveToPreviousFilled(position - 1);
+ }
+
+ public boolean isValid(int position) {
+ return (position >= 0) && (position < getCapacity());
+ }
+
+ /**
+ * if the fs is in the set, the iterator should return it.
+ * if not, move to the first - just to return something.
+ * @param fs position to this fs
+ * @return the index if present, otherwise moveToNextFileed(0);
+ */
+ public int moveTo(FeatureStructure fs) {
+ if (clazz.isAssignableFrom(fs.getClass())) {
+ int pos = find((T)fs);
+ if (pos >= 0) {
+ return pos;
+ }
+ }
+ return moveToFirst();
+ }
+
+ @Override
+ public <T2> T2[] toArray(T2[] a) {
+ final int s = size();
+ if (s == 0) {
+ if (a.length >= 1) {
+ a[0] = null;
+ }
+ return a;
+ }
+
+ final T2[] r = (a.length >= s)? a : (T2[]) Array.newInstance(a.getClass(), s);
+ int pos = moveToFirst();
+ for (int i = 0; i < size; i ++) {
+ r[i] = (T2) get(pos);
+ pos = moveToNextFilled(pos + 1);
+ }
+ if (a.length > s) {
+ r[s] = null;
+ }
+ return r;
+ }
+
+ @Override
+ public T[] toArray() {
+ return toArray((T[]) Array.newInstance(clazz, size()));
+ }
+
+ /* (non-Javadoc)
+ * @see java.lang.Object#toString()
+ */
+ @Override
+ public String toString() {
+ return String
+ .format(
+ "ObjHashSet [loadFactor=%s, initialCapacity=%s, sizeWhichTriggersExpansion=%s, size=%s, secondTimeShrinkable=%s%n keys=%s]",
+ loadFactor, initialCapacity, sizeWhichTriggersExpansion, size, secondTimeShrinkable,
+ Arrays.toString(keys));
+ }
+
+ // Collection methods
+ @Override
+ public boolean isEmpty() {
+ return size == 0;
+ }
+
+ @Override
+ public boolean containsAll(Collection<?> c) {
+ c.stream().allMatch(item -> contains(item));
+ return false;
+ }
+
+ @Override
+ public boolean addAll(Collection<? extends T> c) {
+ boolean[] anyChanged = {false};
+ c.stream().forEach(item -> anyChanged[0] |= add(item));
+ return anyChanged[0];
+ }
+
+ @Override
+ public boolean removeAll(Collection<?> c) {
+ boolean[] anyChanged = {false};
+ c.stream().forEach(item -> anyChanged[0] |= remove(item));
+ return anyChanged[0];
+ }
+
+ @Override
+ public boolean retainAll(Collection<?> c) {
+ boolean anyChanged = false;
+ Iterator<T> it = iterator();
+ while (it.hasNext()) {
+ T v = it.next();
+ if (!c.contains(v)) {
+ anyChanged = true;
+ it.remove();
+ }
+ }
+ return anyChanged;
+ }
+}