You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@mahout.apache.org by td...@apache.org on 2012/05/16 13:29:10 UTC

svn commit: r1339121 [4/4] - in /mahout/trunk/math: ./ src/main/java-templates/org/apache/mahout/math/function/ src/main/java-templates/org/apache/mahout/math/list/ src/main/java-templates/org/apache/mahout/math/map/ src/main/java-templates/org/apache/...

Added: mahout/trunk/math/src/main/java/org/apache/mahout/math/map/QuickOpenIntIntHashMap.java
URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/map/QuickOpenIntIntHashMap.java?rev=1339121&view=auto
==============================================================================
--- mahout/trunk/math/src/main/java/org/apache/mahout/math/map/QuickOpenIntIntHashMap.java (added)
+++ mahout/trunk/math/src/main/java/org/apache/mahout/math/map/QuickOpenIntIntHashMap.java Wed May 16 11:29:08 2012
@@ -0,0 +1,213 @@
+/*
+Copyright � 1999 CERN - European Organization for Nuclear Research.
+Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose 
+is hereby granted without fee, provided that the above copyright notice appear in all copies and 
+that both that copyright notice and this permission notice appear in supporting documentation. 
+CERN makes no representations about the suitability of this software for any purpose. 
+It is provided "as is" without expressed or implied warranty.
+*/
+package org.apache.mahout.math.map;
+
+/**
+ * Status: Experimental; Do not use for production yet. Hash map holding (key,value) associations of type
+ * <tt>(int-->int)</tt>; Automatically grows and shrinks as needed; Implemented using open addressing with double
+ * hashing. First see the <a href="package-summary.html">package summary</a> and javadoc <a
+ * href="package-tree.html">tree view</a> to get the broad picture.
+ *
+ * Implements open addressing with double hashing, using "Brent's variation". Brent's variation slows insertions a bit
+ * down (not much) but reduces probes (collisions) for successful searches, in particular for large load factors. (It
+ * does not improve unsuccessful searches.) See D. Knuth, Searching and Sorting, 3rd ed., p.533-545
+ *
+ * @author wolfgang.hoschek@cern.ch
+ * @version 1.0, 09/24/99
+ * @see java.util.HashMap
+ */
+class QuickOpenIntIntHashMap extends OpenIntIntHashMap {
+  //public int totalProbesSaved = 0; // benchmark only
+
+  /** Constructs an empty map with default capacity and default load factors. */
+  QuickOpenIntIntHashMap() {
+    this(defaultCapacity);
+  }
+
+  /**
+   * Constructs an empty map with the specified initial capacity and default load factors.
+   *
+   * @param initialCapacity the initial capacity of the map.
+   * @throws IllegalArgumentException if the initial capacity is less than zero.
+   */
+  QuickOpenIntIntHashMap(int initialCapacity) {
+    this(initialCapacity, defaultMinLoadFactor, defaultMaxLoadFactor);
+  }
+
+  /**
+   * Constructs an empty map with the specified initial capacity and the specified minimum and maximum load factor.
+   *
+   * @param initialCapacity the initial capacity.
+   * @param minLoadFactor   the minimum load factor.
+   * @param maxLoadFactor   the maximum load factor.
+   * @throws IllegalArgumentException if <tt>initialCapacity < 0 || (minLoadFactor < 0.0 || minLoadFactor >= 1.0) ||
+   *                                  (maxLoadFactor <= 0.0 || maxLoadFactor >= 1.0) || (minLoadFactor >=
+   *                                  maxLoadFactor)</tt>.
+   */
+  QuickOpenIntIntHashMap(int initialCapacity, double minLoadFactor, double maxLoadFactor) {
+    setUp(initialCapacity, minLoadFactor, maxLoadFactor);
+  }
+
+  /**
+   * Associates the given key with the given value. Replaces any old <tt>(key,someOtherValue)</tt> association, if
+   * existing.
+   *
+   * @param key   the key the value shall be associated with.
+   * @param value the value to be associated.
+   * @return <tt>true</tt> if the receiver did not already contain such a key; <tt>false</tt> if the receiver did
+   *         already contain such a key - the new value has now replaced the formerly associated value.
+   */
+  @Override
+  public boolean put(int key, int value) {
+    /*
+       This is open addressing with double hashing, using "Brent's variation".
+       Brent's variation slows insertions a bit down (not much) but reduces probes (collisions) for successful searches, in particular for large load factors.
+       (It does not improve unsuccessful searches.)
+       See D. Knuth, Searching and Sorting, 3rd ed., p.533-545
+
+       h1(key) = hash % M
+       h2(key) = decrement = Max(1, hash/M % M)
+       M is prime = capacity = table.length
+       probing positions are table[(h1-j*h2) % M] for j=0,1,...
+       (M and h2 could also be chosen differently, but h2 is required to be relative prime to M.)
+    */
+
+    int[] tab = table;
+    byte[] stat = state;
+    int length = tab.length;
+
+    int hash = HashFunctions.hash(key) & 0x7FFFFFFF;
+    int i = hash % length;
+    int decrement = (hash / length) % length;
+    if (decrement == 0) {
+      decrement = 1;
+    }
+
+    // stop if we find a removed or free slot, or if we find the key itself
+    // do NOT skip over removed slots (yes, open addressing is like that...)
+    //int comp = comparisons;
+    int t = 0;  // the number of probes
+    int p0 = i; // the first position to probe
+    while (stat[i] == FULL && tab[i] != key) {
+      t++;
+      i -= decrement;
+      //hashCollisions++;
+      if (i < 0) {
+        i += length;
+      }
+    }
+    if (stat[i] == FULL) {
+      // key already contained at slot i.
+      this.values[i] = value;
+      return false;
+    }
+    // not already contained, should be inserted at slot i.
+
+    if (this.distinct > this.highWaterMark) {
+      int newCapacity = chooseGrowCapacity(this.distinct + 1, this.minLoadFactor, this.maxLoadFactor);
+      rehash(newCapacity);
+      return put(key, value);
+    }
+
+    /*
+    Brent's variation does a local reorganization to reduce probes. It essentially means:
+    We test whether it is possible to move the association we probed first (table[p0]) out of the way.
+    If this is possible, it will reduce probes for the key to be inserted, since it takes its place; it gets hit earlier.
+    However, future probes for the key that we move out of the way will increase.
+    Thus we only move it out of the way, if we have a net gain, that is, if we save more probes than we loose.
+    For the first probe we safe more than we loose if the number of probes we needed was >=2 (t>=2).
+    If the first probe cannot be moved out of the way, we try the next probe (p1).
+    Now we safe more than we loose if t>=3.
+    We repeat this until we find that we cannot gain or that we can indeed move p(x) out of the way.
+
+    Note: Under the great majority of insertions t<=1, so the loop is entered very infrequently.
+    */
+    while (t > 1) {
+      int key0 = tab[p0];
+      hash = HashFunctions.hash(key0) & 0x7FFFFFFF;
+      decrement = (hash / length) % length;
+      if (decrement == 0) {
+        decrement = 1;
+      }
+      int pc = p0 - decrement; // pc = (p0-j*decrement) % M, j=1,2,..
+      if (pc < 0) {
+        pc += length;
+      }
+
+      if (stat[pc] != FREE) { // not a free slot, continue searching for free slot to move to, or break.
+        p0 = pc;
+        t--;
+      } else { // free or removed slot found, now move...
+        tab[pc] = key0;
+        stat[pc] = FULL;
+        values[pc] = values[p0];
+        i = p0; // prepare to insert: table[p0]=key
+        t = 0; // break loop
+      }
+    }
+
+    this.table[i] = key;
+    this.values[i] = value;
+    if (this.state[i] == FREE) {
+      this.freeEntries--;
+    }
+    this.state[i] = FULL;
+    this.distinct++;
+
+    if (this.freeEntries < 1) { //delta
+      int newCapacity = chooseGrowCapacity(this.distinct + 1, this.minLoadFactor, this.maxLoadFactor);
+      rehash(newCapacity);
+    }
+
+    return true;
+  }
+
+  /**
+   * Rehashes the contents of the receiver into a new table with a smaller or larger capacity. This method is called
+   * automatically when the number of keys in the receiver exceeds the high water mark or falls below the low water
+   * mark.
+   */
+  @Override
+  protected void rehash(int newCapacity) {
+    int oldCapacity = table.length;
+    //if (oldCapacity == newCapacity) return;
+
+    int[] oldTable = table;
+    int[] oldValues = values;
+    byte[] oldState = state;
+
+    int[] newTable = new int[newCapacity];
+    int[] newValues = new int[newCapacity];
+    byte[] newState = new byte[newCapacity];
+
+    this.lowWaterMark = chooseLowWaterMark(newCapacity, this.minLoadFactor);
+    this.highWaterMark = chooseHighWaterMark(newCapacity, this.maxLoadFactor);
+
+    this.table = newTable;
+    this.values = newValues;
+    this.state = newState;
+    this.freeEntries = newCapacity - this.distinct; // delta
+
+    int tmp = this.distinct;
+    this.distinct = Integer.MIN_VALUE; // switch of watermarks
+    for (int i = oldCapacity; i-- > 0;) {
+      if (oldState[i] == FULL) {
+        put(oldTable[i], oldValues[i]);
+        /*
+        int element = oldTable[i];
+        int index = indexOfInsertion(element);
+        newTable[index]=element;
+        newValues[index]=oldValues[i];
+        newState[index]=FULL;
+        */
+      }
+    }
+    this.distinct = tmp;
+  }
+}

Added: mahout/trunk/math/src/main/java/org/apache/mahout/math/map/package.html
URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/map/package.html?rev=1339121&view=auto
==============================================================================
--- mahout/trunk/math/src/main/java/org/apache/mahout/math/map/package.html (added)
+++ mahout/trunk/math/src/main/java/org/apache/mahout/math/map/package.html Wed May 16 11:29:08 2012
@@ -0,0 +1,290 @@
+<HTML>
+<BODY>
+Automatically growing and shrinking maps holding objects or primitive
+data types such as <tt>int</tt>, <tt>double</tt>, etc. Currently all maps are
+based upon hashing.
+<h2><a name="Overview"></a>1. Overview</h2>
+
+<p>The map package offers flexible object oriented abstractions modelling automatically
+  resizing maps. It is designed to be scalable in terms of performance and memory
+  requirements.</p>
+
+<p>Features include: </p>
+
+<p></p>
+<ul>
+  <li>Maps operating on objects as well as all primitive data types such as <code>int</code>,
+    <code>double</code>, etc.
+  </li>
+  <li>Compact representations</li>
+  <li>Support for quick access to associations</li>
+  <li>A number of general purpose map operations</li>
+</ul>
+<p>File-based I/O can be achieved through the standard Java built-in serialization
+  mechanism. All classes implement the {@link java.io.Serializable} interface.
+  However, the toolkit is entirely decoupled from advanced I/O. It provides data
+  structures and algorithms only.
+
+<p> This toolkit borrows some terminology from the Javasoft <a
+    href="http://www.javasoft.com/products/jdk/1.2/docs/guide/collections/index.html">
+  Collections framework</a> written by Josh Bloch and introduced in JDK 1.2.
+
+<h2>2. Introduction</h2>
+
+<p>A map is an associative container that manages a set of (key,value) pairs.
+  It is useful for implementing a collection of one-to-one mappings. A (key,value)
+  pair is called an <i>association</i>. A value can be looked up up via its key.
+  Associations can quickly be set, removed and retrieved. They are stored in a
+  hashing structure based on the hash code of their keys, which is obtained by
+  using a hash function. </p>
+
+<p> A map can, for example, contain <tt>Name-->Location</tt> associations like
+  <tt>{("Pete", "Geneva"), ("Steve", "Paris"), ("Robert", "New York")}</tt> used
+  in address books or <tt>Index-->Value</tt> mappings like <tt>{(0, 100), (3,
+    1000), (100000, 70)}</tt> representing sparse lists or matrices. For example
+  this could mean at index 0 we have a value of 100, at index 3 we have a value
+  of 1000, at index 1000000 we have a value of 70, and at all other indexes we
+  have a value of, say, zero. Another example is a map of IP addresses to domain
+  names (DNS). Maps can also be useful to represent<i> multi sets</i>, that is,
+  sets where elements can occur more than once. For multi sets one would have
+  <tt>Value-->Frequency</tt> mappings like <tt>{(100, 1), (50, 1000), (101, 3))}</tt>
+  meaning element 100 occurs 1 time, element 50 occurs 1000 times, element 101
+  occurs 3 times. Further, maps can also manage <tt>ObjectIdentifier-->Object</tt>
+  mappings like <tt>{(12, obj1), (7, obj2), (10000, obj3), (9, obj4)}</tt> used
+  in Object Databases.
+
+<p> A map cannot contain two or more <i>equal</i> keys; a key can map to at most
+  one value. However, more than one key can map to identical values. For primitive
+  data types "equality" of keys is defined as identity (operator <tt>==</tt>).
+  For maps using <tt>Object</tt> keys, the meaning of "equality" can be specified
+  by the user upon instance construction. It can either be defined to be identity
+  (operator <tt>==</tt>) or to be given by the method {@link java.lang.Object#equals(Object)}.
+  Associations of kind <tt>(AnyType,Object)</tt> can be of the form <tt>(AnyKey,null)
+  </tt>, i.e. values can be <tt>null</tt>.
+
+<p> The classes of this package make no guarantees as to the order of the elements
+  returned by iterators; in particular, they do not guarantee that the order will
+  remain constant over time.
+
+<h2></h2>
+<h4>Copying</h4>
+
+<p>
+
+<p>Any map can be copied. A copy is <i>equal</i> to the original but entirely
+  independent of the original. So changes in the copy are not reflected in the
+  original, and vice-versa.
+
+<h2>3. Package organization</h2>
+
+<p>For most primitive data types and for objects there exists a separate map version.
+  All versions are just the same, except that they operate on different data types.
+  Colt includes two kinds of implementations for maps: The two different implementations
+  are tagged <b>Chained</b> and <b>Open</b>.
+  Note: Chained is no more included. Wherever it is mentioned it is of historic interest only.</p>
+<ul>
+  <li><b>Chained</b> uses extendible separate chaining with chains holding unsorted
+    dynamically linked collision lists.
+  <li><b>Open</b> uses extendible open addressing with double hashing.
+</ul>
+<p>Class naming follows the schema <tt>&lt;Implementation&gt;&lt;KeyType&gt;&lt;ValueType&gt;HashMap</tt>.
+  For example, a {@link org.apache.mahout.math.map.OpenIntDoubleHashMap} holds <tt>(int-->double)</tt>
+  associations and is implemented with open addressing. A {@link org.apache.mahout.math.map.OpenIntObjectHashMap}
+  holds <tt>(int-->Object)</tt> associations and is implemented with open addressing.
+</p>
+
+<p>The classes for maps of a given (key,value) type are derived from a common
+  abstract base class tagged <tt>Abstract&lt;KeyType&gt;&lt;ValueType&gt;</tt><tt>Map</tt>.
+  For example, all maps operating on <tt>(int-->double)</tt> associations are
+  derived from {@link org.apache.mahout.math.map.AbstractIntDoubleMap}, which in turn is derived
+  from an abstract base class tying together all maps regardless of assocation
+  type, {@link org.apache.mahout.math.set.AbstractSet}. The abstract base classes provide skeleton
+  implementations for all but few methods. Experimental layouts (such as chaining,
+  open addressing, extensible hashing, red-black-trees, etc.) can easily be implemented
+  and inherit a rich set of functionality. Have a look at the javadoc <a href="package-tree.html">tree
+    view</a> to get the broad picture.</p>
+
+<h2>4. Example usage</h2>
+<TABLE>
+  <TD CLASS="PRE">
+      <PRE>
+int[]    keys   = {0    , 3     , 100000, 9   };
+double[] values = {100.0, 1000.0, 70.0  , 71.0};
+
+AbstractIntDoubleMap map = new OpenIntDoubleHashMap();
+
+// add several associations
+for (int i=0; i &lt; keys.length; i++) map.put(keys[i], values[i]);
+
+log.info("map="+map);
+log.info("size="+map.size());
+log.info(map.containsKey(3));
+log.info("get(3)="+map.get(3));
+log.info(map.containsKey(4));
+log.info("get(4)="+map.get(4));
+log.info(map.containsValue(71.0));
+log.info("keyOf(71.0)="+map.keyOf(71.0));
+
+// remove one association
+map.removeKey(3);
+log.info("\nmap="+map);
+log.info(map.containsKey(3));
+log.info("get(3)="+map.get(3));
+log.info(map.containsValue(1000.0));
+log.info("keyOf(1000.0)="+map.keyOf(1000.0));
+
+// clear
+map.clear();
+log.info("\nmap="+map);
+log.info("size="+map.size());
+</PRE>
+  </TD>
+</TABLE>
+
+yields the following output
+
+<TABLE>
+  <TD CLASS="PRE">
+<PRE>
+map=[0->100.0, 3->1000.0, 9->71.0, 100000->70.0]
+size=4
+true
+get(3)=1000.0
+false
+get(4)=0.0
+true
+keyOf(71.0)=9
+
+map=[0->100.0, 9->71.0, 100000->70.0]
+false
+get(3)=0.0
+false
+keyOf(1000.0)=-2147483648
+
+map=[]
+size=0
+</PRE>
+  </TD>
+</TABLE>
+<h2> 5. Notes </h2>
+
+<p>
+  Note that implementations are not synchronized.
+
+<p>
+  Choosing efficient parameters for hash maps is not always easy.
+  However, since parameters determine efficiency and memory requirements, here is a quick guide how to choose them.
+  If your use case does not heavily operate on hash maps but uses them just because they provide
+  convenient functionality, you can safely skip this section.
+  For those of you who care, read on.
+
+<p>
+
+  There are three parameters that can be customized upon map construction: <tt>initialCapacity</tt>,
+  <tt>minLoadFactor</tt> and <tt>maxLoadFactor</tt>.
+  The more memory one can afford, the faster a hash map.
+  The hash map's capacity is the maximum number of associations that can be added without needing to allocate new
+  internal memory.
+  A larger capacity means faster adding, searching and removing.
+  The <tt>initialCapacity</tt> corresponds to the capacity used upon instance construction.
+
+<p>
+  The <tt>loadFactor</tt> of a hash map measures the degree of "fullness".
+  It is given by the number of assocations (<tt>size()</tt>)
+  divided by the hash map capacity <tt>(0.0 &lt;= loadFactor &lt;= 1.0)</tt>.
+  The more associations are added, the larger the loadFactor and the more hash map performance degrades.
+  Therefore, when the loadFactor exceeds a customizable threshold (<tt>maxLoadFactor</tt>), the hash map is
+  automatically grown.
+  In such a way performance degradation can be avoided.
+  Similarly, when the loadFactor falls below a customizable threshold (<tt>minLoadFactor</tt>), the hash map is
+  automatically shrinked.
+  In such a way excessive memory consumption can be avoided.
+  Automatic resizing (both growing and shrinking) obeys the following invariant:
+
+<p>
+  <tt>capacity * minLoadFactor <= size() <= capacity * maxLoadFactor</tt>
+
+<p> The term <tt>capacity * minLoadFactor</tt> is called the <i>low water mark</i>,
+  <tt>capacity * maxLoadFactor</tt> is called the <i>high water mark</i>. In other
+  words, the number of associations may vary within the water mark constraints.
+  When it goes out of range, the map is automatically resized and memory consumption
+  changes proportionally.
+<ul>
+  <li>To tune for memory at the expense of performance, both increase <tt>minLoadFactor</tt> and <tt>maxLoadFactor</tt>.
+  <li>To tune for performance at the expense of memory, both decrease <tt>minLoadFactor</tt> and <tt>maxLoadFactor</tt>.
+    As as special case set <tt>minLoadFactor=0</tt> to avoid any automatic shrinking.
+</ul>
+Resizing large hash maps can be time consuming, <tt>O(size())</tt>, and should be avoided if possible (maintaining
+primes is not the reason).
+Unnecessary growing operations can be avoided if the number of associations is known before they are added, or can be
+estimated.<p>
+  In such a case good parameters are as follows:
+
+<p>
+  <i>For chaining:</i>
+  <br>Set the <tt>initialCapacity = 1.4*expectedSize</tt> or greater.
+  <br>Set the <tt>maxLoadFactor = 0.8</tt> or greater.
+
+<p>
+  <i>For open addressing:</i>
+  <br>Set the <tt>initialCapacity = 2*expectedSize</tt> or greater. Alternatively call <tt>ensureCapacity(...)</tt>.
+  <br>Set the <tt>maxLoadFactor = 0.5</tt>.
+  <br>Never set <tt>maxLoadFactor &gt; 0.55</tt>; open addressing exponentially slows down beyond that point.
+
+<p>
+  In this way the hash map will never need to grow and still stay fast.
+  It is never a good idea to set <tt>maxLoadFactor &lt; 0.1</tt>,
+  because the hash map would grow too often.
+  If it is entirelly unknown how many associations the application will use,
+  the default constructor should be used. The map will grow and shrink as needed.
+
+<p>
+
+  <b>Comparision of chaining and open addressing</b>
+
+<p> Chaining is faster than open addressing, when assuming unconstrained memory
+  consumption. Open addressing is more space efficient than chaining, because
+  it does not create entry objects but uses primitive arrays which are considerably
+  smaller. Entry objects consume significant amounts of memory compared to the
+  information they actually hold. Open addressing also poses no problems to the
+  garbage collector. In contrast, chaining can create millions of entry objects
+  which are linked; a nightmare for any garbage collector. In addition, entry
+  object creation is a bit slow. <br>
+  Therefore, with the same amount of memory, or even less memory, hash maps with
+  larger capacity can be maintained under open addressing, which yields smaller
+  loadFactors, which in turn keeps performance competitive with chaining. In our
+  benchmarks, using significantly less memory, open addressing usually is not
+  more than 1.2-1.5 times slower than chaining.
+
+<p><b>Further readings</b>:
+  <br>Knuth D., The Art of Computer Programming: Searching and Sorting, 3rd ed.
+  <br>Griswold W., Townsend G., The Design and Implementation of Dynamic Hashing for Sets and Tables in Icon, Software -
+  Practice and Experience, Vol. 23(4), 351-367 (April 1993).
+  <br>Larson P., Dynamic hash tables, Comm. of the ACM, 31, (4), 1988.
+
+<p>
+  <b>Performance:</b>
+
+<p>
+  Time complexity:
+  <br>The classes offer <i>expected</i> time complexity <tt>O(1)</tt> (i.e. constant time) for the basic operations
+  <tt>put</tt>, <tt>get</tt>, <tt>removeKey</tt>, <tt>containsKey</tt> and <tt>size</tt>,
+  assuming the hash function disperses the elements properly among the buckets.
+  Otherwise, pathological cases, although highly improbable, can occur, degrading performance to <tt>O(N)</tt> in the
+  worst case.
+  Operations <tt>containsValue</tt> and <tt>keyOf</tt> are <tt>O(N)</tt>.
+
+<p>
+  Memory requirements for <i>open addressing</i>:
+  <br>worst case: <tt>memory [bytes] = (1/minLoadFactor) * size() * (1 + sizeOf(key) + sizeOf(value))</tt>.
+  <br>best case: <tt>memory [bytes] = (1/maxLoadFactor) * size() * (1 + sizeOf(key) + sizeOf(value))</tt>.
+  Where <tt>sizeOf(int) = 4</tt>, <tt>sizeOf(double) = 8</tt>, <tt>sizeOf(Object) = 4</tt>, etc.
+  Thus, an <tt>OpenIntIntHashMap</tt> with minLoadFactor=0.25 and maxLoadFactor=0.5 and 1000000 associations uses
+  between 17 MB and 34 MB.
+  The same map with 1000 associations uses between 17 and 34 KB.
+
+<p>
+
+</BODY>
+</HTML>
+

Added: mahout/trunk/math/src/main/java/org/apache/mahout/math/package.html
URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/package.html?rev=1339121&view=auto
==============================================================================
--- mahout/trunk/math/src/main/java/org/apache/mahout/math/package.html (added)
+++ mahout/trunk/math/src/main/java/org/apache/mahout/math/package.html Wed May 16 11:29:08 2012
@@ -0,0 +1,5 @@
+<HTML>
+<BODY>
+Core base classes; Operations on primitive arrays such as sorting, partitioning and permuting.
+</BODY>
+</HTML>

Added: mahout/trunk/math/src/main/java/org/apache/mahout/math/set/AbstractSet.java
URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/set/AbstractSet.java?rev=1339121&view=auto
==============================================================================
--- mahout/trunk/math/src/main/java/org/apache/mahout/math/set/AbstractSet.java (added)
+++ mahout/trunk/math/src/main/java/org/apache/mahout/math/set/AbstractSet.java Wed May 16 11:29:08 2012
@@ -0,0 +1,188 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+/*
+Copyright 1999 CERN - European Organization for Nuclear Research.
+Permission to use, copy, modify, distribute and sell this software and its documentation for any purpose 
+is hereby granted without fee, provided that the above copyright notice appear in all copies and 
+that both that copyright notice and this permission notice appear in supporting documentation. 
+CERN makes no representations about the suitability of this software for any purpose. 
+It is provided "as is" without expressed or implied warranty.
+*/
+package org.apache.mahout.math.set;
+
+import org.apache.mahout.math.PersistentObject;
+import org.apache.mahout.math.map.PrimeFinder;
+
+public abstract class AbstractSet extends PersistentObject {
+  //public static boolean debug = false; // debug only
+
+  /** The number of distinct associations in the map; its "size()". */
+  protected int distinct;
+
+  /**
+   * The table capacity c=table.length always satisfies the invariant <tt>c * minLoadFactor <= s <= c *
+   * maxLoadFactor</tt>, where s=size() is the number of associations currently contained. The term "c * minLoadFactor"
+   * is called the "lowWaterMark", "c * maxLoadFactor" is called the "highWaterMark". In other words, the table capacity
+   * (and proportionally the memory used by this class) oscillates within these constraints. The terms are precomputed
+   * and cached to avoid recalculating them each time put(..) or removeKey(...) is called.
+   */
+  protected int lowWaterMark;
+  protected int highWaterMark;
+
+  /** The minimum load factor for the hashtable. */
+  protected double minLoadFactor;
+
+  /** The maximum load factor for the hashtable. */
+  protected double maxLoadFactor;
+
+  // these are public access for unit tests.
+  public static final int defaultCapacity = 277;
+  public static final double defaultMinLoadFactor = 0.2;
+  public static final double defaultMaxLoadFactor = 0.5;
+
+  /**
+   * Chooses a new prime table capacity optimized for growing that (approximately) satisfies the invariant <tt>c *
+   * minLoadFactor <= size <= c * maxLoadFactor</tt> and has at least one FREE slot for the given size.
+   */
+  protected int chooseGrowCapacity(int size, double minLoad, double maxLoad) {
+    return nextPrime(Math.max(size + 1, (int) ((4 * size / (3 * minLoad + maxLoad)))));
+  }
+
+  /**
+   * Returns new high water mark threshold based on current capacity and maxLoadFactor.
+   *
+   * @return int the new threshold.
+   */
+  protected int chooseHighWaterMark(int capacity, double maxLoad) {
+    return Math.min(capacity - 2, (int) (capacity * maxLoad)); //makes sure there is always at least one FREE slot
+  }
+
+  /**
+   * Returns new low water mark threshold based on current capacity and minLoadFactor.
+   *
+   * @return int the new threshold.
+   */
+  protected int chooseLowWaterMark(int capacity, double minLoad) {
+    return (int) (capacity * minLoad);
+  }
+
+  /**
+   * Chooses a new prime table capacity neither favoring shrinking nor growing, that (approximately) satisfies the
+   * invariant <tt>c * minLoadFactor <= size <= c * maxLoadFactor</tt> and has at least one FREE slot for the given
+   * size.
+   */
+  protected int chooseMeanCapacity(int size, double minLoad, double maxLoad) {
+    return nextPrime(Math.max(size + 1, (int) ((2 * size / (minLoad + maxLoad)))));
+  }
+
+  /**
+   * Chooses a new prime table capacity optimized for shrinking that (approximately) satisfies the invariant <tt>c *
+   * minLoadFactor <= size <= c * maxLoadFactor</tt> and has at least one FREE slot for the given size.
+   */
+  protected int chooseShrinkCapacity(int size, double minLoad, double maxLoad) {
+    return nextPrime(Math.max(size + 1, (int) ((4 * size / (minLoad + 3 * maxLoad)))));
+  }
+
+  /** Removes all (key,value) associations from the receiver. */
+  public abstract void clear();
+
+  /**
+   * Ensures that the receiver can hold at least the specified number of elements without needing to allocate new
+   * internal memory. If necessary, allocates new internal memory and increases the capacity of the receiver. <p> This
+   * method never need be called; it is for performance tuning only. Calling this method before <tt>put()</tt>ing a
+   * large number of associations boosts performance, because the receiver will grow only once instead of potentially
+   * many times. <p> <b>This default implementation does nothing.</b> Override this method if necessary.
+   *
+   * @param minCapacity the desired minimum capacity.
+   */
+  public void ensureCapacity(int minCapacity) {
+  }
+
+  /**
+   * Returns <tt>true</tt> if the receiver contains no (key,value) associations.
+   *
+   * @return <tt>true</tt> if the receiver contains no (key,value) associations.
+   */
+  public boolean isEmpty() {
+    return distinct == 0;
+  }
+
+  /**
+   * Returns a prime number which is <code>&gt;= desiredCapacity</code> and very close to <code>desiredCapacity</code>
+   * (within 11% if <code>desiredCapacity &gt;= 1000</code>).
+   *
+   * @param desiredCapacity the capacity desired by the user.
+   * @return the capacity which should be used for a hashtable.
+   */
+  protected int nextPrime(int desiredCapacity) {
+    return PrimeFinder.nextPrime(desiredCapacity);
+  }
+
+  /**
+   * Initializes the receiver. You will almost certainly need to override this method in subclasses to initialize the
+   * hash table.
+   *
+   * @param initialCapacity the initial capacity of the receiver.
+   * @param minLoadFactor   the minLoadFactor of the receiver.
+   * @param maxLoadFactor   the maxLoadFactor of the receiver.
+   * @throws IllegalArgumentException if <tt>initialCapacity < 0 || (minLoadFactor < 0.0 || minLoadFactor >= 1.0) ||
+   *                                  (maxLoadFactor <= 0.0 || maxLoadFactor >= 1.0) || (minLoadFactor >=
+   *                                  maxLoadFactor)</tt>.
+   */
+  protected void setUp(int initialCapacity, double minLoadFactor, double maxLoadFactor) {
+    if (initialCapacity < 0) {
+      throw new IllegalArgumentException("Initial Capacity must not be less than zero: " + initialCapacity);
+    }
+    if (minLoadFactor < 0.0 || minLoadFactor >= 1.0) {
+      throw new IllegalArgumentException("Illegal minLoadFactor: " + minLoadFactor);
+    }
+    if (maxLoadFactor <= 0.0 || maxLoadFactor >= 1.0) {
+      throw new IllegalArgumentException("Illegal maxLoadFactor: " + maxLoadFactor);
+    }
+    if (minLoadFactor >= maxLoadFactor) {
+      throw new IllegalArgumentException(
+          "Illegal minLoadFactor: " + minLoadFactor + " and maxLoadFactor: " + maxLoadFactor);
+    }
+  }
+
+  /**
+   * Returns the number of (key,value) associations currently contained.
+   *
+   * @return the number of (key,value) associations currently contained.
+   */
+  public int size() {
+    return distinct;
+  }
+
+  /**
+   * Trims the capacity of the receiver to be the receiver's current size. Releases any superfluous internal memory. An
+   * application can use this operation to minimize the storage of the receiver. <p> This default implementation does
+   * nothing. Override this method if necessary.
+   */
+  public void trimToSize() {
+  }
+  
+  protected static boolean equalsMindTheNull(Object a, Object b) {
+    if (a == null && b == null) {
+      return true;
+    }
+    if (a == null || b == null) {
+      return false;
+    }
+    return a.equals(b);
+  }
+}

Copied: mahout/trunk/math/src/main/java/org/apache/mahout/math/set/OpenHashSet.java (from r1339109, mahout/trunk/math/src/main/java-templates/org/apache/mahout/math/set/OpenKeyTypeHashSet.java.t)
URL: http://svn.apache.org/viewvc/mahout/trunk/math/src/main/java/org/apache/mahout/math/set/OpenHashSet.java?p2=mahout/trunk/math/src/main/java/org/apache/mahout/math/set/OpenHashSet.java&p1=mahout/trunk/math/src/main/java-templates/org/apache/mahout/math/set/OpenKeyTypeHashSet.java.t&r1=1339109&r2=1339121&rev=1339121&view=diff
==============================================================================
--- mahout/trunk/math/src/main/java-templates/org/apache/mahout/math/set/OpenKeyTypeHashSet.java.t (original)
+++ mahout/trunk/math/src/main/java/org/apache/mahout/math/set/OpenHashSet.java Wed May 16 11:29:08 2012
@@ -19,40 +19,37 @@
 
 package org.apache.mahout.math.set;
 
+import java.util.ArrayList;
 import java.util.Arrays;
+import java.util.Collection;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Set;
 
-import org.apache.mahout.math.function.${keyTypeCap}Procedure;
-import org.apache.mahout.math.list.${keyTypeCap}ArrayList;
-import org.apache.mahout.math.map.HashFunctions;
+import org.apache.mahout.math.function.ObjectProcedure;
 import org.apache.mahout.math.map.PrimeFinder;
 
 /**
-  * Open hash set of ${keyType} items;
+  * Open hashing alternative to java.util.HashSet.
  **/
-public class Open${keyTypeCap}HashSet extends Abstract${keyTypeCap}Set {
+public class OpenHashSet<T> extends AbstractSet implements Set<T>  {
   protected static final byte FREE = 0;
   protected static final byte FULL = 1;
   protected static final byte REMOVED = 2;
-#if (${keyTypeFloating} == 'true')
-#set ($noKeyComment = "${keyTypeCap}.NaN")
-  protected static final ${keyType} NO_KEY_VALUE = ${keyTypeCap}.NaN;
-#else
-#set ($noKeyComment = "0")
-  protected static final ${keyType} NO_KEY_VALUE = 0;
-#end
+  protected static final char NO_KEY_VALUE = 0;
 
   /** The hash table keys. */
-  protected ${keyType}[] table;
+  private Object[] table;
 
   /** The state of each hash table entry (FREE, FULL, REMOVED). */
-  protected byte[] state;
+  private byte[] state;
 
   /** The number of table entries in state==FREE. */
-  protected int freeEntries;
+  private int freeEntries;
 
 
   /** Constructs an empty map with default capacity and default load factors. */
-  public Open${keyTypeCap}HashSet() {
+  public OpenHashSet() {
     this(defaultCapacity);
   }
 
@@ -62,7 +59,7 @@ public class Open${keyTypeCap}HashSet ex
    * @param initialCapacity the initial capacity of the map.
    * @throws IllegalArgumentException if the initial capacity is less than zero.
    */
-  public Open${keyTypeCap}HashSet(int initialCapacity) {
+  public OpenHashSet(int initialCapacity) {
     this(initialCapacity, defaultMinLoadFactor, defaultMaxLoadFactor);
   }
 
@@ -76,7 +73,7 @@ public class Open${keyTypeCap}HashSet ex
    *                                  (maxLoadFactor <= 0.0 || maxLoadFactor >= 1.0) || (minLoadFactor >=
    *                                  maxLoadFactor)</tt>.
    */
-  public Open${keyTypeCap}HashSet(int initialCapacity, double minLoadFactor, double maxLoadFactor) {
+  public OpenHashSet(int initialCapacity, double minLoadFactor, double maxLoadFactor) {
     setUp(initialCapacity, minLoadFactor, maxLoadFactor);
   }
 
@@ -94,9 +91,10 @@ public class Open${keyTypeCap}HashSet ex
    *
    * @return a deep copy of the receiver.
    */
+  @SuppressWarnings("unchecked")
   @Override
   public Object clone() {
-    Open${keyTypeCap}HashSet copy = (Open${keyTypeCap}HashSet) super.clone();
+    OpenHashSet<T> copy = (OpenHashSet<T>) super.clone();
     copy.table = copy.table.clone();
     copy.state = copy.state.clone();
     return copy;
@@ -108,8 +106,9 @@ public class Open${keyTypeCap}HashSet ex
    * @return <tt>true</tt> if the receiver contains the specified key.
    */
   @Override
-  public boolean contains(${keyType} key) {
-    return indexOfKey(key) >= 0;
+  @SuppressWarnings("unchecked")
+  public boolean contains(Object key) {
+    return indexOfKey((T)key) >= 0;
   }
 
   /**
@@ -140,11 +139,11 @@ public class Open${keyTypeCap}HashSet ex
    *                  continues.
    * @return <tt>false</tt> if the procedure stopped before all keys where iterated over, <tt>true</tt> otherwise.
    */
-  @Override
-  public boolean forEachKey(${keyTypeCap}Procedure procedure) {
+  @SuppressWarnings("unchecked")
+  public boolean forEachKey(ObjectProcedure<T> procedure) {
     for (int i = table.length; i-- > 0;) {
       if (state[i] == FULL) {
-        if (!procedure.apply(table[i])) {
+        if (!procedure.apply((T)table[i])) {
           return false;
         }
       }
@@ -159,10 +158,12 @@ public class Open${keyTypeCap}HashSet ex
    *         at slot -index-1. If the returned index >= 0, then it is NOT already contained and should be inserted at
    *         slot index.
    */
-  protected int indexOfInsertion(${keyType} key) {
-    final int length = table.length;
+  protected int indexOfInsertion(T key) {
+    Object[] tab = table;
+    byte[] stat = state;
+    int length = tab.length;
 
-    final int hash = HashFunctions.hash(key) & 0x7FFFFFFF;
+    int hash = key.hashCode() & 0x7FFFFFFF;
     int i = hash % length;
     int decrement = hash % (length - 2); // double hashing, see http://www.eece.unm.edu/faculty/heileman/hash/node4.html
     //int decrement = (hash / length) % length;
@@ -172,7 +173,7 @@ public class Open${keyTypeCap}HashSet ex
 
     // stop if we find a removed or free slot, or if we find the key itself
     // do NOT skip over removed slots (yes, open addressing is like that...)
-    while (state[i] == FULL && table[i] != key) {
+    while (stat[i] == FULL && tab[i] != key) {
       i -= decrement;
       //hashCollisions++;
       if (i < 0) {
@@ -180,25 +181,25 @@ public class Open${keyTypeCap}HashSet ex
       }
     }
 
-    if (state[i] == REMOVED) {
+    if (stat[i] == REMOVED) {
       // stop if we find a free slot, or if we find the key itself.
       // do skip over removed slots (yes, open addressing is like that...)
       // assertion: there is at least one FREE slot.
-      final int j = i;
-      while (state[i] != FREE && (state[i] == REMOVED || table[i] != key)) {
+      int j = i;
+      while (stat[i] != FREE && (stat[i] == REMOVED || tab[i] != key)) {
         i -= decrement;
         //hashCollisions++;
         if (i < 0) {
           i += length;
         }
       }
-      if (state[i] == FREE) {
+      if (stat[i] == FREE) {
         i = j;
       }
     }
 
 
-    if (state[i] == FULL) {
+    if (stat[i] == FULL) {
       // key already contained at slot i.
       // return a negative number identifying the slot.
       return -i - 1;
@@ -212,10 +213,12 @@ public class Open${keyTypeCap}HashSet ex
    * @param key the key to be searched in the receiver.
    * @return the index where the key is contained in the receiver, returns -1 if the key was not found.
    */
-  protected int indexOfKey(${keyType} key) {
-    final int length = table.length;
+  protected int indexOfKey(T key) {
+    Object[] tab = table;
+    byte[] stat = state;
+    int length = tab.length;
 
-    final int hash = HashFunctions.hash(key) & 0x7FFFFFFF;
+    int hash = key.hashCode() & 0x7FFFFFFF;
     int i = hash % length;
     int decrement = hash % (length - 2); // double hashing, see http://www.eece.unm.edu/faculty/heileman/hash/node4.html
     //int decrement = (hash / length) % length;
@@ -225,7 +228,7 @@ public class Open${keyTypeCap}HashSet ex
 
     // stop if we find a free slot, or if we find the key itself.
     // do skip over removed slots (yes, open addressing is like that...)
-    while (state[i] != FREE && (state[i] == REMOVED || table[i] != key)) {
+    while (stat[i] != FREE && (stat[i] == REMOVED || (!key.equals(tab[i])))) {
       i -= decrement;
       //hashCollisions++;
       if (i < 0) {
@@ -233,7 +236,7 @@ public class Open${keyTypeCap}HashSet ex
       }
     }
 
-    if (state[i] == FREE) {
+    if (stat[i] == FREE) {
       return -1;
     } // not found
     return i; //found, return index where key is contained
@@ -241,39 +244,32 @@ public class Open${keyTypeCap}HashSet ex
 
   /**
    * Fills all keys contained in the receiver into the specified list. Fills the list, starting at index 0. After this
-   * call returns the specified list has a new size that equals <tt>this.size()</tt>. Iteration order is guaranteed to
-   * be <i>identical</i> to the order used by method {@link #forEachKey(${keyTypeCap}Procedure)}.
-   * <p> This method can be used
+   * call returns the specified list has a new size that equals <tt>this.size()</tt>. 
+   * This method can be used
    * to iterate over the keys of the receiver.
    *
    * @param list the list to be filled, can have any size.
    */
-  @Override
-  public void keys(${keyTypeCap}ArrayList list) {
-    list.setSize(distinct);
-    ${keyType} [] elements = list.elements();
-
-    int j = 0;
-    for (int i = table.length; i-- > 0;) {
-      if (state[i] == FULL) {
-        elements[j++] = table[i];
+  @SuppressWarnings("unchecked")
+  public void keys(List<T> list) {
+    list.clear();
+  
+
+    Object [] tab = table;
+    byte[] stat = state;
+
+    for (int i = tab.length; i-- > 0;) {
+      if (stat[i] == FULL) {
+        list.add((T)tab[i]);
       }
     }
   }
 
-  /**
-   * Associates the given key with the given value. Replaces any old <tt>(key,someOtherValue)</tt> association, if
-   * existing.
-   *
-   * @param key   the key the value shall be associated with.
-   * @return <tt>true</tt> if the receiver did not already contain such a key; <tt>false</tt> if the receiver did
-   *         already contain such a key - the new value has now replaced the formerly associated value.
-   */
+  @SuppressWarnings("unchecked")
   @Override
-  public boolean add(${keyType} key) {
-    int i = indexOfInsertion(key);
+  public boolean add(Object key) {
+    int i = indexOfInsertion((T)key);
     if (i < 0) { //already contained
-      //i = -i - 1;
       return false;
     }
 
@@ -293,8 +289,9 @@ public class Open${keyTypeCap}HashSet ex
     if (this.freeEntries < 1) { //delta
       int newCapacity = chooseGrowCapacity(this.distinct + 1, this.minLoadFactor, this.maxLoadFactor);
       rehash(newCapacity);
+      return add(key);
     }
-
+    
     return true;
   }
 
@@ -303,27 +300,30 @@ public class Open${keyTypeCap}HashSet ex
    * automatically when the number of keys in the receiver exceeds the high water mark or falls below the low water
    * mark.
    */
+  @SuppressWarnings("unchecked")
   protected void rehash(int newCapacity) {
     int oldCapacity = table.length;
     //if (oldCapacity == newCapacity) return;
 
-    ${keyType}[] oldTable = table;
+    Object[] oldTable = table;
     byte[] oldState = state;
 
-    this.table = new ${keyType}[newCapacity];
-    this.state = new byte[newCapacity];
+    Object[] newTable = new Object[newCapacity];
+    byte[] newState = new byte[newCapacity];
 
     this.lowWaterMark = chooseLowWaterMark(newCapacity, this.minLoadFactor);
     this.highWaterMark = chooseHighWaterMark(newCapacity, this.maxLoadFactor);
 
+    this.table = newTable;
+    this.state = newState;
     this.freeEntries = newCapacity - this.distinct; // delta
 
     for (int i = oldCapacity; i-- > 0;) {
       if (oldState[i] == FULL) {
-        ${keyType} element = oldTable[i];
-        int index = indexOfInsertion(element);
-        this.table[index] = element;
-        this.state[index] = FULL;
+        Object element = oldTable[i];
+        int index = indexOfInsertion((T)element);
+        newTable[index] = element;
+        newState[index] = FULL;
       }
     }
   }
@@ -334,9 +334,10 @@ public class Open${keyTypeCap}HashSet ex
    * @param key the key to be removed from the receiver.
    * @return <tt>true</tt> if the receiver contained the specified key, <tt>false</tt> otherwise.
    */
+  @SuppressWarnings("unchecked")
   @Override
-  public boolean remove(${keyType} key) {
-    int i = indexOfKey(key);
+  public boolean remove(Object key) {
+    int i = indexOfKey((T)key);
     if (i < 0) {
       return false;
     } // key not contained
@@ -371,7 +372,7 @@ public class Open${keyTypeCap}HashSet ex
       capacity = 1;
     } // open addressing needs at least one FREE slot at any time.
 
-    this.table = new ${keyType}[capacity];
+    this.table = new Object[capacity];
     this.state = new byte[capacity];
 
     // memory will be exhausted long before this pathological case happens, anyway.
@@ -413,11 +414,125 @@ public class Open${keyTypeCap}HashSet ex
    * @param minLoadFactor
    * @param maxLoadFactor
    */
-  protected void getInternalFactors(int[] capacity, 
+  void getInternalFactors(int[] capacity, 
       double[] minLoadFactor, 
       double[] maxLoadFactor) {
     capacity[0] = table.length;
     minLoadFactor[0] = this.minLoadFactor;
     maxLoadFactor[0] = this.maxLoadFactor;
   }
+
+  @Override
+  public boolean isEmpty() {
+    return size() == 0;
+  }
+  
+  /**
+   * OpenHashSet instances are only equal to other OpenHashSet instances, not to 
+   * any other collection. Hypothetically, we should check for and permit
+   * equals on other Sets.
+   */
+  @SuppressWarnings("unchecked")
+  public boolean equals(Object obj) {
+    if (obj == this) {
+      return true;
+    }
+
+    if (!(obj instanceof OpenHashSet)) {
+      return false;
+    }
+    final OpenHashSet<T> other = (OpenHashSet<T>) obj;
+    if (other.size() != size()) {
+      return false;
+    }
+
+    return
+        forEachKey(
+            new ObjectProcedure<T>() {
+              @Override
+              public boolean apply(T key) {
+                return other.contains(key);
+              }
+            }
+        );
+  }
+  
+  /**
+   * Implement the standard Java Collections iterator. Note that 'remove' is silently
+   * ineffectual here. This method is provided for convenience, only.
+   */
+  @Override
+  public Iterator<T> iterator() {
+    List<T> keyList = new ArrayList<T>();
+    keys(keyList);
+    return keyList.iterator();
+  }
+
+  @Override
+  public Object[] toArray() {
+    List<T> keyList = new ArrayList<T>();
+    keys(keyList);
+    return keyList.toArray();
+  }
+
+  @Override
+  public boolean addAll(Collection<? extends T> c) {
+    boolean anyAdded = false;
+    for(T o : c) {
+      boolean added = add(o);
+      anyAdded |= added;
+    }
+    return anyAdded;
+  }
+
+  @Override
+  public boolean containsAll(Collection<?> c) {
+    for (Object o : c) {
+      if (!contains(o)) {
+        return false;
+      }
+    }
+    return true;
+  }
+
+  @Override
+  public boolean removeAll(Collection<?> c) {
+    boolean anyRemoved = false;
+    for(Object o : c) {
+      boolean removed = remove(o);
+      anyRemoved |= removed;
+    }
+    return anyRemoved;
+  }
+
+  @Override
+  public boolean retainAll(Collection<?> c) {
+    final Collection<?> finalCollection = c;
+    final boolean[] modified = new boolean[1];
+    modified[0] = false;
+    forEachKey(new ObjectProcedure<T>() {
+
+      @Override
+      public boolean apply(T element) {
+        if (!finalCollection.contains(element)) {
+          remove(element);
+          modified[0] = true;
+        }
+        return true;
+      }});
+    return modified[0];
+  }
+
+  @Override
+  public <T2> T2[] toArray(T2[] a) {
+    List<T> keys = new ArrayList<T>();
+    keys(keys);
+    return keys.toArray(a);
+  }
+
+  public List<T> keys() {
+    List<T> keys = new ArrayList<T>();
+    keys(keys);
+    return keys;
+  }
 }