You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@uima.apache.org by sc...@apache.org on 2015/11/01 16:39:23 UTC

svn commit: r1711818 - /uima/uimaj/branches/experiment-v3-jcas/uimaj-core/src/main/java/org/apache/uima/internal/util/ObjHashSet.java

Author: schor
Date: Sun Nov  1 15:39:23 2015
New Revision: 1711818

URL: http://svn.apache.org/viewvc?rev=1711818&view=rev
Log:
[UIMA-4664]a hash set that supports shrinking and a more efficient representation than typical Java library implementations.

Added:
    uima/uimaj/branches/experiment-v3-jcas/uimaj-core/src/main/java/org/apache/uima/internal/util/ObjHashSet.java

Added: uima/uimaj/branches/experiment-v3-jcas/uimaj-core/src/main/java/org/apache/uima/internal/util/ObjHashSet.java
URL: http://svn.apache.org/viewvc/uima/uimaj/branches/experiment-v3-jcas/uimaj-core/src/main/java/org/apache/uima/internal/util/ObjHashSet.java?rev=1711818&view=auto
==============================================================================
--- uima/uimaj/branches/experiment-v3-jcas/uimaj-core/src/main/java/org/apache/uima/internal/util/ObjHashSet.java (added)
+++ uima/uimaj/branches/experiment-v3-jcas/uimaj-core/src/main/java/org/apache/uima/internal/util/ObjHashSet.java Sun Nov  1 15:39:23 2015
@@ -0,0 +1,559 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+package org.apache.uima.internal.util;
+
+import java.lang.reflect.Array;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.NoSuchElementException;
+
+import org.apache.uima.cas.FeatureStructure;
+import org.apache.uima.cas.impl.FeatureStructureImplC;
+import org.apache.uima.util.Misc;
+
+/**
+ * A set of Objects of type T 
+
+ * This impl is for use in a single thread case only, when table is being updated.
+ * Multiple reader threads are OK if there's no writing.
+ * 
+ * Supports shrinking (reallocating the big table)
+ * 
+ * Removed objects replaced with special marker object in the table
+ * so find operations continue to work (they can't stop upon finding this object).
+ *
+ */
+public class ObjHashSet<T> implements Collection<T>{
+  
+  public static final float DEFAULT_LOAD_FACTOR = 0.66F;
+  // set to true to collect statistics for tuning
+  // you have to also put a call to showHistogram() at the end of the run
+  private static final boolean TUNE = true;
+
+  private final static FeatureStructureImplC REMOVED = new FeatureStructureImplC();
+
+  private final Class<T> clazz;  // for rieifying the T type
+   
+  private final float loadFactor = DEFAULT_LOAD_FACTOR;  
+  
+  private final int initialCapacity; 
+
+  private int histogram [];
+  private int maxProbe = 0;
+
+  private int sizeWhichTriggersExpansion;
+  private int size; // number of elements in the table
+  private int nbrRemoved; // number of removed elements (coded as removed)
+  
+  // the actual Object table
+  private T [] keys;
+
+  private boolean secondTimeShrinkable = false;
+  
+  public ObjHashSet(Class<T> clazz) {
+    this(12, clazz);  // default initial size
+  }
+    
+  /**
+   * @param initialCapacity - you can add this many before expansion
+   */
+  public ObjHashSet(int initialCapacity, Class<T> clazz) {
+    this.clazz = clazz;
+    this.initialCapacity = initialCapacity;
+    newTableKeepSize(initialCapacity);
+    size = 0;
+    if (TUNE) {
+      histogram = new int[200];
+      Arrays.fill(histogram, 0);
+      maxProbe = 0;
+    }
+  }
+  
+  private void newTableKeepSize(int capacity) {
+    keys = (T[]) Array.newInstance(clazz, capacity);
+    sizeWhichTriggersExpansion = (int)(capacity * loadFactor);
+    nbrRemoved = 0;
+  }
+
+  private void incrementSize() {
+    if (size + nbrRemoved >= sizeWhichTriggersExpansion) {
+      increaseTableCapacity();
+    }
+    size++;
+  }
+        
+//  public boolean wontExpand() {
+//    return wontExpand(1);
+//  }
+//  
+//  public boolean wontExpand(int n) {
+//    return (size + nbrRemoved + n) < sizeWhichTriggersExpansion;  
+//  }
+  
+  public int getCapacity() {
+    return keys.length;
+  }
+    
+  /**
+   * This may not increase the table capacity, but may just 
+   * clean out the REMOVED items
+   */
+  private void increaseTableCapacity() {
+    final int oldCapacity = getCapacity();
+    // keep same capacity if just removing the "removed" markers would 
+    // shrink the used slots to the same they would have been had there been no removed, and 
+    // the capacity was doubled.
+    final int newCapacity = (nbrRemoved > size) ? oldCapacity : oldCapacity << 1;
+    
+    final T [] oldKeys = keys;      
+    if (TUNE) {System.out.println("Capacity increasing from " + oldCapacity + " to " + newCapacity);}
+    newTableKeepSize(newCapacity);
+    for (T key : oldKeys) {
+      if (key != null && key != REMOVED) {
+        addInner(key);
+      }
+    }
+  }
+  
+  // called by clear
+  private void newTable(int capacity) {
+    newTableKeepSize(capacity);
+    resetTable();
+  }
+
+  private void resetHistogram() {
+    if (TUNE) {
+      histogram = new int[200];
+      Arrays.fill(histogram, 0);
+      maxProbe = 0;
+    }
+  }
+  
+  private void resetArray() {
+    Arrays.fill(keys, null);
+    resetTable();
+  }
+  
+  private void resetTable() {
+    resetHistogram();
+    size = 0;
+  }
+  
+  @Override
+  public void clear() {
+    // see if size is less than the 1/2 size that triggers expansion
+    if (size <  (sizeWhichTriggersExpansion >>> 1)) {
+      // if 2nd time then shrink by 50%
+      //   this is done to avoid thrashing around the threshold
+      if (secondTimeShrinkable) {
+        secondTimeShrinkable = false;
+        final int currentCapacity = getCapacity();
+        final int newCapacity = Math.max(initialCapacity, currentCapacity >>> 1);
+        if (newCapacity < currentCapacity) { 
+          newTable(newCapacity);  // shrink table by 50%
+        } else { // don't shrink below minimum
+          resetArray();
+        }
+        return;
+        
+      } else {
+        secondTimeShrinkable = true;
+      }
+    } else {
+      secondTimeShrinkable = false; // reset this to require 2 triggers in a row
+    }
+   resetArray();
+  }
+
+  /** 
+  * returns a position in the key/value table
+  *   if the key is not found, then the position will be to the
+  *   place where the key would be put upon adding (without reusing REMOVED, and the 
+  *   current internal value of keys[position] would be 0.
+  *   
+  *   if the key is found, then keys[position] == key
+  * @param obj the object to find
+  * @return the probeAddr in keys array - might reference a slot holding null, or the key value if found
+  */
+  private int findPosition(final T obj) {
+    return findPosition(obj, false);
+  }
+  
+  private int findPosition(final T obj, final boolean includeRemoved) {
+    if (obj == null) {
+      throw new IllegalArgumentException("null is an invalid key");
+    }
+
+    final int hash = Misc.hashInt(obj.hashCode());
+    int nbrProbes = 1;
+    int probeDelta = 1;
+    int probeAddr;
+    
+    final T[] localKeys = keys;
+    final int bitMask = localKeys.length - 1;
+    probeAddr = hash & bitMask;
+    while (true) {
+      final T testKey = localKeys[probeAddr];
+      if (testKey == null || testKey.equals(obj) || (includeRemoved && testKey == REMOVED)) { 
+        break;
+      }
+      
+      nbrProbes++;
+      probeAddr = bitMask & (probeAddr + (probeDelta++));
+    }
+
+    if (TUNE) {
+      final int pv = histogram[nbrProbes];
+
+      histogram[nbrProbes] = 1 + pv;
+      if (maxProbe < nbrProbes) {
+        maxProbe = nbrProbes;
+      }
+    }
+
+    return probeAddr;
+  }
+
+  @Override
+  public boolean contains(Object obj) {  // arg must be Object to fit Collection API
+    return (clazz.isAssignableFrom(obj.getClass())) ? (find((T) obj) != -1) : false;
+  }
+  
+  /**
+   * @param obj the object to find in the table (if it is there)
+   * @return the position of obj in the table, or -1 if not in the table
+   */
+  public int find(T obj) {
+    if (obj == null || size == 0) {
+      return -1;
+    }
+    
+    final int pos = findPosition(obj);
+    return (keys[pos] == obj) ? pos : -1;
+  }
+      
+  /**
+   * 
+   * @param obj - the object to add
+   * @return true if this set did not already contain the specified element
+   */
+  @Override
+  public boolean add(T obj) {
+    if (obj == null) {
+      throw new IllegalArgumentException("argument must be non-null");
+    }
+           
+    final int i = findPosition(obj, true);  // include REMOVED
+    if (keys[i] == obj) {
+      return false;  // false if already present
+    }
+    if (keys[i] == REMOVED) {
+      nbrRemoved --;
+      assert (nbrRemoved >= 0);
+    }
+    keys[i] = obj;
+    incrementSize();
+    return true;
+  }
+        
+  /**
+   * used for increasing table size
+   * @param rawKey
+   */
+  private void addInner(T obj) {
+    final int i = findPosition(obj);
+    assert(keys[i] == null);
+    keys[i] = obj;
+  }
+  
+  /**
+   * Can't replace the item with a null because other keys that were
+   * stored in the table which previously collided with the removed item
+   * won't be found.  UIMA-4204
+   * @param rawKey the value to remove
+   * @return true if the key was present
+   */
+  @Override
+  public boolean remove(Object obj) {
+    if (obj == null) {
+      return false;
+    }
+    
+    final int pos = findPosition((T) obj);  // null or equal obj
+    
+    return (obj.equals(keys[pos])) ? removeAtPosition(pos) : false;
+  } 
+  
+  private boolean removeAtPosition(int pos) {   
+    // found, remove it
+    keys[pos] = (T) REMOVED;  // at runtime, this cast is a no-op    
+    size--;
+    nbrRemoved ++;
+    return true;
+  }
+  
+  @Override
+  public int size() {
+    return size;
+  }
+  
+  public void showHistogram() {
+    if (TUNE) {
+      int sumI = 0;
+      for (int i : histogram) {
+        sumI += i;
+      }
+      
+      System.out.format(
+          "Histogram of number of probes, loadfactor = %.1f, maxProbe=%,d nbr of find operations at last table size=%,d%n",
+          loadFactor, maxProbe, sumI);
+      for (int i = 0; i <= maxProbe; i++) {
+        if (i == maxProbe && histogram[i] == 0) {
+          System.out.println("huh?");
+        }
+        System.out.println(i + ": " + histogram[i]);
+      }     
+      
+      System.out.println("bytes / entry = " + (float) (keys.length) * 4 / size());
+      System.out.format("size = %,d, prevExpansionTriggerSize = %,d, next = %,d%n",
+        size(),
+        (int) ((keys.length >>> 1) * loadFactor),
+        (int) (keys.length * loadFactor));        
+    }
+  }
+  
+  /**
+   */
+  /**
+   * For iterator use
+   * @param index - a magic number returned by the internal find
+   * @return the T at that spot, or null if nothing there 
+   */
+  public T get(int index) {
+    T obj = keys[index];
+    if (obj == null || obj == REMOVED) {
+      return null;  // null, not present
+    }
+    return obj;
+  }
+  
+  /**
+   * advance pos until it points to a non 0 or is 1 past end
+   * @param pos
+   * @return updated pos
+   */
+  public int moveToNextFilled(int pos) {
+    if (pos < 0) {
+      pos = 0;
+    }
+    
+    final int max = getCapacity();
+    while (true) {
+      if (pos >= max) {
+        return pos;
+      }
+      T v = get(pos);
+      if (v != null && v != REMOVED) {
+        return pos;
+      }
+      pos ++;
+    }
+  }
+   
+  /**
+   * decrement pos until it points to a non 0 or is -1
+   * @param pos
+   * @return updated pos
+   */
+  public int moveToPreviousFilled(int pos) {
+    final int max = getCapacity();
+    if (pos > max) {
+      pos = max - 1;
+    }
+    
+    while (true) {
+      if (pos < 0) {
+        return pos;
+      }
+      T v = get(pos);
+      if (v != null && v != REMOVED) {
+        return pos;
+      }
+      pos --;
+    }
+  }
+
+  private class ObjHashSetIterator implements Iterator<T> {
+
+    protected int curPosition;
+
+    private ObjHashSetIterator() {
+      this.curPosition = 0;
+    }
+
+    @Override
+    public final boolean hasNext() {
+      curPosition = moveToNextFilled(curPosition);
+      return curPosition < getCapacity();
+    }
+
+    @Override
+    public final T next() {
+      if (!hasNext()) {
+        throw new NoSuchElementException();
+      }
+      return get(curPosition++);
+    }
+
+    @Override
+    public void remove() {
+      removeAtPosition(curPosition - 1);  // only valid following a hasNext()
+    }   
+  }
+  
+    
+  @Override
+  public Iterator<T> iterator() {
+    return new ObjHashSetIterator();
+  }
+  
+  public int moveToFirst() {
+    return (size() == 0) ? -1 : moveToNextFilled(0);
+  }
+
+  public int moveToLast() {
+    return (size() == 0) ? -1 : moveToPreviousFilled(getCapacity() -1);
+  }
+
+  public int moveToNext(int position) {
+    if (position < 0) {
+      return position;
+    }
+    final int n = moveToNextFilled(position + 1); 
+    return (n >= getCapacity()) ? -1 : n;
+  }
+
+  public int moveToPrevious(int position) {
+    if (position >= getCapacity()) {
+      return -1;
+    }
+    return moveToPreviousFilled(position - 1);
+  }
+
+  public boolean isValid(int position) {
+    return (position >= 0) && (position < getCapacity());
+  }
+  
+  /**
+   * if the fs is in the set, the iterator should return it.
+   * if not, move to the first - just to return something.
+   * @param fs position to this fs
+   * @return the index if present, otherwise moveToNextFileed(0);
+   */
+  public int moveTo(FeatureStructure fs) {
+    if (clazz.isAssignableFrom(fs.getClass())) {
+      int pos = find((T)fs);
+      if (pos >= 0) {
+        return pos;
+      }
+    }
+    return moveToFirst(); 
+  }
+
+  @Override
+  public <T2> T2[] toArray(T2[] a) {
+    final int s = size();
+    if (s == 0) {
+      if (a.length >= 1) { 
+        a[0] = null;
+      }
+      return a;
+    }
+    
+    final T2[] r = (a.length >= s)? a : (T2[]) Array.newInstance(a.getClass(), s);
+    int pos = moveToFirst();
+    for (int i = 0; i < size; i ++) {
+      r[i] = (T2) get(pos);
+      pos = moveToNextFilled(pos + 1);
+    }
+    if (a.length > s) {
+      r[s] = null;
+    }
+    return r;
+  }
+
+  @Override
+  public T[] toArray() {
+    return toArray((T[]) Array.newInstance(clazz, size()));
+  }
+
+  /* (non-Javadoc)
+   * @see java.lang.Object#toString()
+   */
+  @Override
+  public String toString() {
+    return String
+        .format(
+            "ObjHashSet [loadFactor=%s, initialCapacity=%s, sizeWhichTriggersExpansion=%s, size=%s, secondTimeShrinkable=%s%n keys=%s]",
+            loadFactor, initialCapacity, sizeWhichTriggersExpansion, size, secondTimeShrinkable, 
+            Arrays.toString(keys));
+  }
+
+  // Collection methods
+  @Override
+  public boolean isEmpty() {
+    return size == 0;
+  }
+
+  @Override
+  public boolean containsAll(Collection<?> c) {
+    c.stream().allMatch(item -> contains(item));
+    return false;
+  }
+
+  @Override
+  public boolean addAll(Collection<? extends T> c) {
+    boolean[] anyChanged = {false};
+    c.stream().forEach(item -> anyChanged[0] |= add(item));
+    return anyChanged[0];
+  }
+
+  @Override
+  public boolean removeAll(Collection<?> c) {
+    boolean[] anyChanged = {false};
+    c.stream().forEach(item -> anyChanged[0] |= remove(item));
+    return anyChanged[0];
+  }
+
+  @Override
+  public boolean retainAll(Collection<?> c) {
+    boolean anyChanged = false;
+    Iterator<T> it = iterator();
+    while (it.hasNext()) {
+      T v = it.next();
+      if (!c.contains(v)) {
+        anyChanged = true;
+        it.remove();
+      }
+    }
+    return anyChanged;
+  }
+}