You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@xerces.apache.org by mr...@apache.org on 2004/07/31 00:10:55 UTC

cvs commit: xml-xerces/java/src/org/apache/xerces/util SymbolTable.java

mrglavas    2004/07/30 15:10:55

  Modified:    java/src/org/apache/xerces/util SymbolTable.java
  Log:
  JIRA Issue #998:
  http://nagoya.apache.org/jira/browse/XERCESJ-998
  
  Improve the performance of the symbol table by
  dynamically resizing it and rehashing it once it
  reaches some threshold. Previously the SymbolTable
  did not scale well, since the size of the table was
  fixed.  Neeraj had started looking at this a couple
  years ago.  Thanks to John Kim for contributing
  this performance improvement.
  
  Revision  Changes    Path
  1.9       +140 -16   xml-xerces/java/src/org/apache/xerces/util/SymbolTable.java
  
  Index: SymbolTable.java
  ===================================================================
  RCS file: /home/cvs/xml-xerces/java/src/org/apache/xerces/util/SymbolTable.java,v
  retrieving revision 1.8
  retrieving revision 1.9
  diff -u -r1.8 -r1.9
  --- SymbolTable.java	24 Feb 2004 23:15:53 -0000	1.8
  +++ SymbolTable.java	30 Jul 2004 22:10:54 -0000	1.9
  @@ -38,10 +38,40 @@
    *   characters are especially prone to this poor hashing behavior.
    *  </li>
    * </ul>
  + * 
  + * An instance of <code>SymbolTable</code> has two parameters that affect its
  + * performance: <i>initial capacity</i> and <i>load factor</i>.  The
  + * <i>capacity</i> is the number of <i>buckets</i> in the SymbolTable, and the
  + * <i>initial capacity</i> is simply the capacity at the time the SymbolTable
  + * is created.  Note that the SymbolTable is <i>open</i>: in the case of a "hash
  + * collision", a single bucket stores multiple entries, which must be searched
  + * sequentially.  The <i>load factor</i> is a measure of how full the SymbolTable
  + * is allowed to get before its capacity is automatically increased.
  + * When the number of entries in the SymbolTable exceeds the product of the load
  + * factor and the current capacity, the capacity is increased by calling the
  + * <code>rehash</code> method.<p>
    *
  + * Generally, the default load factor (.75) offers a good tradeoff between
  + * time and space costs.  Higher values decrease the space overhead but
  + * increase the time cost to look up an entry (which is reflected in most
  + * <tt>SymbolTable</tt> operations, including <tt>addSymbol</tt> and <tt>containsSymbol</tt>).<p>
  + *
  + * The initial capacity controls a tradeoff between wasted space and the
  + * need for <code>rehash</code> operations, which are time-consuming.
  + * No <code>rehash</code> operations will <i>ever</i> occur if the initial
  + * capacity is greater than the maximum number of entries the
  + * <tt>Hashtable</tt> will contain divided by its load factor.  However,
  + * setting the initial capacity too high can waste space.<p>
  + *
  + * If many entries are to be made into a <code>SymbolTable</code>, 
  + * creating it with a sufficiently large capacity may allow the 
  + * entries to be inserted more efficiently than letting it perform 
  + * automatic rehashing as needed to grow the table. <p>
  +
    * @see SymbolHash
    *
    * @author Andy Clark
  + * @author John Kim, IBM
    *
    * @version $Id$
    */
  @@ -61,22 +91,71 @@
       /** Buckets. */
       protected Entry[] fBuckets = null;
   
  -    // actual table size
  +    /** actual table size **/
       protected int fTableSize;
   
  +    /** The total number of entries in the hash table. */
  +    protected transient int fCount;
  +
  +    /** The table is rehashed when its size exceeds this threshold.  (The
  +     * value of this field is (int)(capacity * loadFactor).) */
  +    protected int fThreshold;
  +							 
  +    /** The load factor for the SymbolTable. */
  +    protected float fLoadFactor;
  +
       //
       // Constructors
       //
  -
  -    /** Constructs a symbol table with a default number of buckets. */
  -    public SymbolTable() {
  -        this(TABLE_SIZE);
  +    
  +    /**
  +     * Constructs a new, empty SymbolTable with the specified initial 
  +     * capacity and the specified load factor.
  +     *
  +     * @param      initialCapacity   the initial capacity of the SymbolTable.
  +     * @param      loadFactor        the load factor of the SymbolTable.
  +     * @throws     IllegalArgumentException  if the initial capacity is less
  +     *             than zero, or if the load factor is nonpositive.
  +     */
  +    public SymbolTable(int initialCapacity, float loadFactor) {
  +        
  +        if (initialCapacity < 0) {
  +            throw new IllegalArgumentException("Illegal Capacity: " + initialCapacity);
  +        }
  +        
  +        if (loadFactor <= 0 || Float.isNaN(loadFactor)) {
  +            throw new IllegalArgumentException("Illegal Load: " + loadFactor);
  +        }
  +        
  +        if (initialCapacity == 0) {
  +            initialCapacity = 1;
  +        }
  +        
  +        fLoadFactor = loadFactor;
  +        fTableSize = initialCapacity;
  +        fBuckets = new Entry[fTableSize];
  +        fThreshold = (int)(fTableSize * loadFactor);
  +        fCount = 0;
       }
   
  -    /** Constructs a symbol table with a specified number of buckets. */
  -    public SymbolTable(int tableSize) {
  -        fTableSize = tableSize;
  -        fBuckets = new Entry[fTableSize];
  +    /**
  +     * Constructs a new, empty SymbolTable with the specified initial capacity
  +     * and default load factor, which is <tt>0.75</tt>.
  +     *
  +     * @param     initialCapacity   the initial capacity of the hashtable.
  +     * @throws    IllegalArgumentException if the initial capacity is less
  +     *            than zero.
  +     */
  +    public SymbolTable(int initialCapacity) {
  +        this(initialCapacity, 0.75f);
  +    }
  +    
  +    /**
  +     * Constructs a new, empty SymbolTable with a default initial capacity (101)
  +     * and load factor, which is <tt>0.75</tt>. 
  +     */
  +    public SymbolTable() {
  +        this(TABLE_SIZE, 0.75f);
       }
   
       //
  @@ -92,7 +171,7 @@
        * @param symbol The new symbol.
        */
       public String addSymbol(String symbol) {
  -
  +        
           // search for identical symbol
           int bucket = hash(symbol) % fTableSize;
           int length = symbol.length();
  @@ -106,12 +185,19 @@
                   return entry.symbol;
               }
           }
  -
  +        
  +        if (fCount >= fThreshold) {
  +            // Rehash the table if the threshold is exceeded
  +            rehash();
  +            bucket = hash(symbol) % fTableSize;
  +        } 
  +        
           // create new entry
           Entry entry = new Entry(symbol, fBuckets[bucket]);
           fBuckets[bucket] = entry;
  +        ++fCount;
           return entry.symbol;
  -
  +        
       } // addSymbol(String):String
   
       /**
  @@ -125,7 +211,7 @@
        * @param length The length of the new symbol in the buffer.
        */
       public String addSymbol(char[] buffer, int offset, int length) {
  -
  +        
           // search for identical symbol
           int bucket = hash(buffer, offset, length) % fTableSize;
           OUTER: for (Entry entry = fBuckets[bucket]; entry != null; entry = entry.next) {
  @@ -138,12 +224,19 @@
                   return entry.symbol;
               }
           }
  -
  +        
  +        if (fCount >= fThreshold) {
  +            // Rehash the table if the threshold is exceeded
  +            rehash();
  +            bucket = hash(buffer, offset, length) % fTableSize;
  +        } 
  +        
           // add new entry
           Entry entry = new Entry(buffer, offset, length, fBuckets[bucket]);
           fBuckets[bucket] = entry;
  +        ++fCount;
           return entry.symbol;
  -
  +        
       } // addSymbol(char[],int,int):String
   
       /**
  @@ -185,6 +278,37 @@
           return code & 0x7FFFFFF;
   
       } // hash(char[],int,int):int
  +
  +    /**
  +     * Increases the capacity of and internally reorganizes this 
  +     * SymbolTable, in order to accommodate and access its entries more 
  +     * efficiently.  This method is called automatically when the 
  +     * number of keys in the SymbolTable exceeds this hashtable's capacity 
  +     * and load factor. 
  +     */
  +    protected void rehash() {
  +
  +        int oldCapacity = fBuckets.length;
  +        Entry[] oldTable = fBuckets;
  +
  +        int newCapacity = oldCapacity * 2 + 1;
  +        Entry[] newTable = new Entry[newCapacity];
  +
  +        fThreshold = (int)(newCapacity * fLoadFactor);
  +        fBuckets = newTable;
  +        fTableSize = fBuckets.length;
  +
  +        for (int i = oldCapacity ; i-- > 0 ;) {
  +            for (Entry old = oldTable[i] ; old != null ; ) {
  +                Entry e = old;
  +                old = old.next;
  +
  +                int index = hash(e.characters, 0, e.characters.length) % newCapacity;
  +                e.next = newTable[index];
  +                newTable[index] = e;
  +            }
  +        }
  +    }
   
       /**
        * Returns true if the symbol table already contains the specified
  
  
  

---------------------------------------------------------------------
To unsubscribe, e-mail: xerces-cvs-unsubscribe@xml.apache.org
For additional commands, e-mail: xerces-cvs-help@xml.apache.org