Posted to commits@lucene.apache.org by si...@apache.org on 2010/10/02 14:44:35 UTC

svn commit: r1003790 [1/2] - in /lucene/dev/trunk/lucene/src: java/org/apache/lucene/index/ java/org/apache/lucene/util/ test/org/apache/lucene/index/ test/org/apache/lucene/util/

Author: simonw
Date: Sat Oct  2 12:44:32 2010
New Revision: 1003790

URL: http://svn.apache.org/viewvc?rev=1003790&view=rev
Log:
LUCENE-2662: Refactored TermsHashPerField to utilize BytesRefHash

Added:
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/ByteBlockPool.java
      - copied, changed from r1002577, lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteBlockPool.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRefHash.java   (with props)
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/RecyclingByteBlockAllocator.java   (with props)
    lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/TestBytesRefHash.java   (with props)
    lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/TestRecyclingByteBlockAllocator.java   (with props)
Removed:
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteBlockPool.java
Modified:
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteSliceReader.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteSliceWriter.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerThread.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRef.java
    lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestByteSlices.java
    lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java

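For readers following the refactoring below, here is a minimal usage sketch of the new BytesRefHash. It is not part of the patch: the allocator wiring mirrors what DocumentsWriter sets up, the BytesRef is filled by hand purely for illustration, and the class name BytesRefHashExample is invented. add() hands back a non-negative ord the first time a value is seen and a negative value when it is already present, the same convention TermsHashPerField decodes with termID = (-termID)-1 further down.

    import java.util.concurrent.atomic.AtomicLong;
    import org.apache.lucene.util.ByteBlockPool;
    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.BytesRefHash;
    import org.apache.lucene.util.RecyclingByteBlockAllocator;

    public class BytesRefHashExample {
      public static void main(String[] args) {
        final AtomicLong bytesUsed = new AtomicLong(0);
        // Same wiring as DocumentsWriter: the allocator reports its allocations to bytesUsed.
        final ByteBlockPool pool = new ByteBlockPool(
            new RecyclingByteBlockAllocator(ByteBlockPool.BYTE_BLOCK_SIZE, Integer.MAX_VALUE, bytesUsed));
        final BytesRefHash hash = new BytesRefHash(pool);

        // Fill a BytesRef by hand (illustration only); bytes and length are public fields.
        final BytesRef term = new BytesRef(10);
        term.bytes[0] = (byte) 'f'; term.bytes[1] = (byte) 'o'; term.bytes[2] = (byte) 'o';
        term.length = 3;

        final int ord = hash.add(term);    // >= 0: first time this value is added
        final int again = hash.add(term);  // negative: (-again)-1 == ord, value already present

        System.out.println("ord=" + ord + " again=" + again + " size=" + hash.size());
      }
    }
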
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteSliceReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteSliceReader.java?rev=1003790&r1=1003789&r2=1003790&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteSliceReader.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteSliceReader.java Sat Oct  2 12:44:32 2010
@@ -21,6 +21,7 @@ import java.io.IOException;
 
 import org.apache.lucene.store.DataInput;
 import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.ByteBlockPool;
 
 /* IndexInput that knows how to read the byte slices written
  * by Posting and PostingVector.  We read the bytes in
@@ -48,16 +49,16 @@ final class ByteSliceReader extends Data
     this.endIndex = endIndex;
 
     level = 0;
-    bufferUpto = startIndex / DocumentsWriter.BYTE_BLOCK_SIZE;
-    bufferOffset = bufferUpto * DocumentsWriter.BYTE_BLOCK_SIZE;
+    bufferUpto = startIndex / ByteBlockPool.BYTE_BLOCK_SIZE;
+    bufferOffset = bufferUpto * ByteBlockPool.BYTE_BLOCK_SIZE;
     buffer = pool.buffers[bufferUpto];
-    upto = startIndex & DocumentsWriter.BYTE_BLOCK_MASK;
+    upto = startIndex & ByteBlockPool.BYTE_BLOCK_MASK;
 
     final int firstSize = ByteBlockPool.levelSizeArray[0];
 
     if (startIndex+firstSize >= endIndex) {
       // There is only this one slice to read
-      limit = endIndex & DocumentsWriter.BYTE_BLOCK_MASK;
+      limit = endIndex & ByteBlockPool.BYTE_BLOCK_MASK;
     } else
       limit = upto+firstSize-4;
   }
@@ -102,11 +103,11 @@ final class ByteSliceReader extends Data
     level = ByteBlockPool.nextLevelArray[level];
     final int newSize = ByteBlockPool.levelSizeArray[level];
 
-    bufferUpto = nextIndex / DocumentsWriter.BYTE_BLOCK_SIZE;
-    bufferOffset = bufferUpto * DocumentsWriter.BYTE_BLOCK_SIZE;
+    bufferUpto = nextIndex / ByteBlockPool.BYTE_BLOCK_SIZE;
+    bufferOffset = bufferUpto * ByteBlockPool.BYTE_BLOCK_SIZE;
 
     buffer = pool.buffers[bufferUpto];
-    upto = nextIndex & DocumentsWriter.BYTE_BLOCK_MASK;
+    upto = nextIndex & ByteBlockPool.BYTE_BLOCK_MASK;
 
     if (nextIndex + newSize >= endIndex) {
       // We are advancing to the final slice

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteSliceWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteSliceWriter.java?rev=1003790&r1=1003789&r2=1003790&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteSliceWriter.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteSliceWriter.java Sat Oct  2 12:44:32 2010
@@ -1,6 +1,7 @@
 package org.apache.lucene.index;
 
 import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.ByteBlockPool;
 
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -42,9 +43,9 @@ final class ByteSliceWriter extends Data
    * Set up the writer to write at address.
    */
   public void init(int address) {
-    slice = pool.buffers[address >> DocumentsWriter.BYTE_BLOCK_SHIFT];
+    slice = pool.buffers[address >> ByteBlockPool.BYTE_BLOCK_SHIFT];
     assert slice != null;
-    upto = address & DocumentsWriter.BYTE_BLOCK_MASK;
+    upto = address & ByteBlockPool.BYTE_BLOCK_MASK;
     offset0 = address;
     assert upto < slice.length;
   }

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java?rev=1003790&r1=1003789&r2=1003790&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java Sat Oct  2 12:44:32 2010
@@ -27,6 +27,8 @@ import java.util.Map;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map.Entry;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.concurrent.locks.ReentrantLock;
 
 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.document.Document;
@@ -41,8 +43,11 @@ import org.apache.lucene.store.Directory
 import org.apache.lucene.store.RAMFile;
 import org.apache.lucene.util.ArrayUtil;
 import org.apache.lucene.util.Constants;
+import org.apache.lucene.util.RecyclingByteBlockAllocator;
 import org.apache.lucene.util.ThreadInterruptedException;
 import org.apache.lucene.util.RamUsageEstimator;
+import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_MASK;
+import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
 
 /**
  * This class accepts multiple added documents and directly
@@ -113,6 +118,7 @@ import org.apache.lucene.util.RamUsageEs
 
 final class DocumentsWriter {
 
+  final AtomicLong bytesUsed = new AtomicLong(0);
   IndexWriter writer;
   Directory directory;
 
@@ -195,6 +201,7 @@ final class DocumentsWriter {
   /**
    * RAMFile buffer for DocWriters.
    */
+  @SuppressWarnings("serial")
   class PerDocBuffer extends RAMFile {
     
     /**
@@ -257,9 +264,12 @@ final class DocumentsWriter {
 
       final TermsHashConsumer termVectorsWriter = new TermVectorsTermsWriter(documentsWriter);
       final TermsHashConsumer freqProxWriter = new FreqProxTermsWriter();
-
-      final InvertedDocConsumer  termsHash = new TermsHash(documentsWriter, true, freqProxWriter,
-                                                           new TermsHash(documentsWriter, false, termVectorsWriter, null));
+      /*
+       * Nesting TermsHash instances here allows the secondary (TermVectors) consumer to share the
+       * interned postings via a shared ByteBlockPool. See TermsHashPerField for details.
+       */
+      final TermsHash termVectorsTermHash = new TermsHash(documentsWriter, false, termVectorsWriter, null);
+      final InvertedDocConsumer  termsHash = new TermsHash(documentsWriter, true, freqProxWriter, termVectorsTermHash);
       final NormsWriter normsWriter = new NormsWriter();
       final DocInverter docInverter = new DocInverter(termsHash, normsWriter);
       return new DocFieldProcessor(documentsWriter, docInverter);
@@ -637,7 +647,7 @@ final class DocumentsWriter {
       for(int i=0;i<threadStates.length;i++)
         threads.add(threadStates[i].consumer);
 
-      final long startNumBytesUsed = numBytesUsed;
+      final long startNumBytesUsed = bytesUsed();
       consumer.flush(threads, flushState);
 
       if (infoStream != null) {
@@ -963,7 +973,7 @@ final class DocumentsWriter {
 
   synchronized boolean deletesFull() {
     return (ramBufferSize != IndexWriterConfig.DISABLE_AUTO_FLUSH &&
-            (deletesInRAM.bytesUsed + deletesFlushed.bytesUsed + numBytesUsed) >= ramBufferSize) ||
+            (deletesInRAM.bytesUsed + deletesFlushed.bytesUsed + bytesUsed()) >= ramBufferSize) ||
       (maxBufferedDeleteTerms != IndexWriterConfig.DISABLE_AUTO_FLUSH &&
        ((deletesInRAM.size() + deletesFlushed.size()) >= maxBufferedDeleteTerms));
   }
@@ -1256,11 +1266,9 @@ final class DocumentsWriter {
   final SkipDocWriter skipDocWriter = new SkipDocWriter();
 
   long getRAMUsed() {
-    return numBytesUsed + deletesInRAM.bytesUsed + deletesFlushed.bytesUsed;
+    return bytesUsed() + deletesInRAM.bytesUsed + deletesFlushed.bytesUsed;
   }
 
-  long numBytesUsed;
-
   NumberFormat nf = NumberFormat.getInstance();
 
   // Coarse estimates used to measure RAM usage of buffered deletes
@@ -1295,63 +1303,12 @@ final class DocumentsWriter {
 
   /* Initial chunks size of the shared byte[] blocks used to
      store postings data */
-  final static int BYTE_BLOCK_SHIFT = 15;
-  final static int BYTE_BLOCK_SIZE = 1 << BYTE_BLOCK_SHIFT;
-  final static int BYTE_BLOCK_MASK = BYTE_BLOCK_SIZE - 1;
   final static int BYTE_BLOCK_NOT_MASK = ~BYTE_BLOCK_MASK;
 
   /* if you increase this, you must fix field cache impl for
    * getTerms/getTermsIndex requires <= 32768 */
   final static int MAX_TERM_LENGTH_UTF8 = BYTE_BLOCK_SIZE-2;
 
-  private class ByteBlockAllocator extends ByteBlockPool.Allocator {
-    final int blockSize;
-
-    ByteBlockAllocator(int blockSize) {
-      this.blockSize = blockSize;
-    }
-
-    ArrayList<byte[]> freeByteBlocks = new ArrayList<byte[]>();
-    
-    /* Allocate another byte[] from the shared pool */
-    @Override
-    byte[] getByteBlock() {
-      synchronized(DocumentsWriter.this) {
-        final int size = freeByteBlocks.size();
-        final byte[] b;
-        if (0 == size) {
-          b = new byte[blockSize];
-          numBytesUsed += blockSize;
-        } else
-          b = freeByteBlocks.remove(size-1);
-        return b;
-      }
-    }
-
-    /* Return byte[]'s to the pool */
-
-    @Override
-    void recycleByteBlocks(byte[][] blocks, int start, int end) {
-      synchronized(DocumentsWriter.this) {
-        for(int i=start;i<end;i++) {
-          freeByteBlocks.add(blocks[i]);
-          blocks[i] = null;
-        }
-      }
-    }
-
-    @Override
-    void recycleByteBlocks(List<byte[]> blocks) {
-      synchronized(DocumentsWriter.this) {
-        final int size = blocks.size();
-        for(int i=0;i<size;i++) {
-          freeByteBlocks.add(blocks.get(i));
-          blocks.set(i, null);
-        }
-      }
-    }
-  }
-
   /* Initial chunks size of the shared int[] blocks used to
      store postings data */
   final static int INT_BLOCK_SHIFT = 13;
@@ -1366,14 +1323,14 @@ final class DocumentsWriter {
     final int[] b;
     if (0 == size) {
       b = new int[INT_BLOCK_SIZE];
-      numBytesUsed += INT_BLOCK_SIZE*INT_NUM_BYTE;
+      bytesUsed.addAndGet(INT_BLOCK_SIZE*INT_NUM_BYTE);
     } else
       b = freeIntBlocks.remove(size-1);
     return b;
   }
 
-  synchronized void bytesUsed(long numBytes) {
-    numBytesUsed += numBytes;
+  private long bytesUsed() {
+    return bytesUsed.get();
   }
 
   /* Return int[]s to the pool */
@@ -1384,11 +1341,11 @@ final class DocumentsWriter {
     }
   }
 
-  ByteBlockAllocator byteBlockAllocator = new ByteBlockAllocator(BYTE_BLOCK_SIZE);
+  final RecyclingByteBlockAllocator byteBlockAllocator = new RecyclingByteBlockAllocator(BYTE_BLOCK_SIZE, Integer.MAX_VALUE, bytesUsed);
 
   final static int PER_DOC_BLOCK_SIZE = 1024;
 
-  final ByteBlockAllocator perDocAllocator = new ByteBlockAllocator(PER_DOC_BLOCK_SIZE);
+  final RecyclingByteBlockAllocator perDocAllocator = new RecyclingByteBlockAllocator(PER_DOC_BLOCK_SIZE, Integer.MAX_VALUE, bytesUsed);
 
   String toMB(long v) {
     return nf.format(v/1024./1024.);
@@ -1415,19 +1372,19 @@ final class DocumentsWriter {
       }
     
       deletesRAMUsed = deletesInRAM.bytesUsed+deletesFlushed.bytesUsed;
-      doBalance = numBytesUsed+deletesRAMUsed >= ramBufferSize;
+      doBalance = bytesUsed() +deletesRAMUsed >= ramBufferSize;
     }
 
     if (doBalance) {
 
       if (infoStream != null)
-        message("  RAM: now balance allocations: usedMB=" + toMB(numBytesUsed) +
+        message("  RAM: now balance allocations: usedMB=" + toMB(bytesUsed()) +
                 " vs trigger=" + toMB(ramBufferSize) +
                 " deletesMB=" + toMB(deletesRAMUsed) +
-                " byteBlockFree=" + toMB(byteBlockAllocator.freeByteBlocks.size()*BYTE_BLOCK_SIZE) +
-                " perDocFree=" + toMB(perDocAllocator.freeByteBlocks.size()*PER_DOC_BLOCK_SIZE));
+                " byteBlockFree=" + toMB(byteBlockAllocator.bytesUsed()) +
+                " perDocFree=" + toMB(perDocAllocator.bytesUsed()));
 
-      final long startBytesUsed = numBytesUsed + deletesRAMUsed;
+      final long startBytesUsed = bytesUsed() + deletesRAMUsed;
 
       int iter = 0;
 
@@ -1437,16 +1394,16 @@ final class DocumentsWriter {
 
       boolean any = true;
 
-      while(numBytesUsed+deletesRAMUsed > freeLevel) {
+      while(bytesUsed()+deletesRAMUsed > freeLevel) {
       
         synchronized(this) {
-          if (0 == perDocAllocator.freeByteBlocks.size() &&
-              0 == byteBlockAllocator.freeByteBlocks.size() &&
+          if (0 == perDocAllocator.numBufferedBlocks() &&
+              0 == byteBlockAllocator.numBufferedBlocks() &&
               0 == freeIntBlocks.size() && !any) {
             // Nothing else to free -- must flush now.
-            bufferIsFull = numBytesUsed+deletesRAMUsed > ramBufferSize;
+            bufferIsFull = bytesUsed()+deletesRAMUsed > ramBufferSize;
             if (infoStream != null) {
-              if (numBytesUsed+deletesRAMUsed > ramBufferSize)
+              if (bytesUsed()+deletesRAMUsed > ramBufferSize)
                 message("    nothing to free; now set bufferIsFull");
               else
                 message("    nothing to free");
@@ -1454,25 +1411,15 @@ final class DocumentsWriter {
             break;
           }
 
-          if ((0 == iter % 4) && byteBlockAllocator.freeByteBlocks.size() > 0) {
-            byteBlockAllocator.freeByteBlocks.remove(byteBlockAllocator.freeByteBlocks.size()-1);
-            numBytesUsed -= BYTE_BLOCK_SIZE;
+          if ((0 == iter % 4) && byteBlockAllocator.numBufferedBlocks() > 0) {
+            byteBlockAllocator.freeBlocks(1);
           }
-
           if ((1 == iter % 4) && freeIntBlocks.size() > 0) {
             freeIntBlocks.remove(freeIntBlocks.size()-1);
-            numBytesUsed -= INT_BLOCK_SIZE * INT_NUM_BYTE;
+            bytesUsed.addAndGet(-INT_BLOCK_SIZE * INT_NUM_BYTE);
           }
-
-          if ((2 == iter % 4) && perDocAllocator.freeByteBlocks.size() > 0) {
-            // Remove upwards of 32 blocks (each block is 1K)
-            for (int i = 0; i < 32; ++i) {
-              perDocAllocator.freeByteBlocks.remove(perDocAllocator.freeByteBlocks.size() - 1);
-              numBytesUsed -= PER_DOC_BLOCK_SIZE;
-              if (perDocAllocator.freeByteBlocks.size() == 0) {
-                break;
-              }
-            }
+          if ((2 == iter % 4) && perDocAllocator.numBufferedBlocks() > 0) {
+            perDocAllocator.freeBlocks(32); // Remove upwards of 32 blocks (each block is 1K)
           }
         }
 
@@ -1484,7 +1431,7 @@ final class DocumentsWriter {
       }
 
       if (infoStream != null)
-        message("    after free: freedMB=" + nf.format((startBytesUsed-numBytesUsed-deletesRAMUsed)/1024./1024.) + " usedMB=" + nf.format((numBytesUsed+deletesRAMUsed)/1024./1024.));
+        message("    after free: freedMB=" + nf.format((startBytesUsed-bytesUsed()-deletesRAMUsed)/1024./1024.) + " usedMB=" + nf.format((bytesUsed()+deletesRAMUsed)/1024./1024.));
     }
   }
 

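The net effect of the DocumentsWriter changes above: byte-block accounting moves from the synchronized numBytesUsed field into an AtomicLong shared with the allocators, so allocations can be tracked without synchronizing on DocumentsWriter. A rough sketch of that pattern follows; it ignores the buffered-block cap, the class name is invented, and it approximates rather than reproduces the committed RecyclingByteBlockAllocator.

    import java.util.ArrayList;
    import java.util.List;
    import java.util.concurrent.atomic.AtomicLong;
    import org.apache.lucene.util.ByteBlockPool;

    // Sketch only: the allocator and DocumentsWriter share one AtomicLong.
    class TrackingByteBlockAllocator extends ByteBlockPool.Allocator {
      private final AtomicLong bytesUsed;
      private final List<byte[]> freeBlocks = new ArrayList<byte[]>();

      TrackingByteBlockAllocator(int blockSize, AtomicLong bytesUsed) {
        super(blockSize);
        this.bytesUsed = bytesUsed;
      }

      @Override
      public synchronized byte[] getByteBlock() {
        if (freeBlocks.isEmpty()) {
          bytesUsed.addAndGet(blockSize);                // a newly allocated block is charged
          return new byte[blockSize];
        }
        return freeBlocks.remove(freeBlocks.size() - 1); // recycled blocks stay charged
      }

      @Override
      public synchronized void recycleByteBlocks(byte[][] blocks, int start, int end) {
        for (int i = start; i < end; i++) {
          freeBlocks.add(blocks[i]);                     // keep the block, and its bytes, buffered
          blocks[i] = null;
        }
      }

      public synchronized int numBufferedBlocks() {
        return freeBlocks.size();
      }

      public synchronized void freeBlocks(int num) {     // what balanceRAM() calls to release RAM
        for (int i = 0; i < num && !freeBlocks.isEmpty(); i++) {
          freeBlocks.remove(freeBlocks.size() - 1);
          bytesUsed.addAndGet(-blockSize);
        }
      }
    }

Reading the current usage is then a plain bytesUsed.get(), which is all the new private bytesUsed() helper above does.
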
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java?rev=1003790&r1=1003789&r2=1003790&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java Sat Oct  2 12:44:32 2010
@@ -19,6 +19,8 @@ package org.apache.lucene.index;
 
 import java.io.IOException;
 import java.util.Comparator;
+
+import org.apache.lucene.util.ByteBlockPool;
 import org.apache.lucene.util.BytesRef;
 
 import org.apache.lucene.index.FreqProxTermsWriterPerField.FreqProxPostingsArray;
@@ -50,7 +52,7 @@ final class FreqProxFieldMergeState {
 
   public FreqProxFieldMergeState(FreqProxTermsWriterPerField field, Comparator<BytesRef> termComp) {
     this.field = field;
-    this.numPostings = field.termsHashPerField.numPostings;
+    this.numPostings = field.termsHashPerField.bytesHash.size();
     this.bytePool = field.perThread.termsHashPerThread.bytePool;
     this.termIDs = field.termsHashPerField.sortPostings(termComp);
     this.postings = (FreqProxPostingsArray) field.termsHashPerField.postingsArray;

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java?rev=1003790&r1=1003789&r2=1003790&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java Sat Oct  2 12:44:32 2010
@@ -67,7 +67,7 @@ final class FreqProxTermsWriter extends 
 
       for (final TermsHashConsumerPerField i : fields) {
         final FreqProxTermsWriterPerField perField = (FreqProxTermsWriterPerField) i;
-        if (perField.termsHashPerField.numPostings > 0)
+        if (perField.termsHashPerField.bytesHash.size() > 0)
           allFields.add(perField);
       }
     }
@@ -116,7 +116,7 @@ final class FreqProxTermsWriter extends 
 
       for(int i=0;i<fields.length;i++) {
         TermsHashPerField perField = fields[i].termsHashPerField;
-        int numPostings = perField.numPostings;
+        int numPostings = perField.bytesHash.size();
         perField.reset();
         perField.shrinkHash(numPostings);
         fields[i].reset();

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java?rev=1003790&r1=1003789&r2=1003790&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java Sat Oct  2 12:44:32 2010
@@ -144,7 +144,7 @@ final class FreqProxTermsWriterPerField 
       }
     } else {
       if (docState.docID != postings.lastDocIDs[termID]) {
-        assert docState.docID > postings.lastDocIDs[termID];
+        assert docState.docID > postings.lastDocIDs[termID]:"id: "+docState.docID + " postings ID: "+ postings.lastDocIDs[termID] + " termID: "+termID;
         // Term not yet seen in the current doc but previously
         // seen in other doc(s) since the last flush
 

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java?rev=1003790&r1=1003789&r2=1003790&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java Sat Oct  2 12:44:32 2010
@@ -22,6 +22,7 @@ import java.io.IOException;
 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
 import org.apache.lucene.document.Fieldable;
 import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.ByteBlockPool;
 import org.apache.lucene.util.BytesRef;
 
 final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
@@ -80,7 +81,7 @@ final class TermVectorsTermsWriterPerFie
 
       assert perThread.doc.docID == docState.docID;
 
-      if (termsHashPerField.numPostings != 0) {
+      if (termsHashPerField.bytesHash.size() != 0) {
         // Only necessary if previous doc hit a
         // non-aborting exception while writing vectors in
         // this field:
@@ -106,7 +107,7 @@ final class TermVectorsTermsWriterPerFie
 
     assert docState.testPoint("TermVectorsTermsWriterPerField.finish start");
 
-    final int numPostings = termsHashPerField.numPostings;
+    final int numPostings = termsHashPerField.bytesHash.size();
 
     final BytesRef flushTerm = perThread.flushTerm;
 

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java?rev=1003790&r1=1003789&r2=1003790&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java Sat Oct  2 12:44:32 2010
@@ -18,15 +18,19 @@ package org.apache.lucene.index;
  */
 
 import java.io.IOException;
-import java.util.Arrays;
 import java.util.Comparator;
+import java.util.concurrent.atomic.AtomicLong;
 
 import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
 import org.apache.lucene.document.Fieldable;
+import org.apache.lucene.util.ByteBlockPool;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.BytesRefHash;
+import org.apache.lucene.util.BytesRefHash.BytesStartArray;
+import org.apache.lucene.util.BytesRefHash.MaxBytesLengthExceededException;
 
 final class TermsHashPerField extends InvertedDocConsumerPerField {
+  private static final int HASH_INIT_SIZE = 4;
 
   final TermsHashConsumerPerField consumer;
 
@@ -46,16 +50,12 @@ final class TermsHashPerField extends In
 
   final FieldInfo fieldInfo;
 
-  boolean postingsCompacted;
-  int numPostings;
-  private int postingsHashSize = 4;
-  private int postingsHashHalfSize = postingsHashSize/2;
-  private int postingsHashMask = postingsHashSize-1;
-  private int[] postingsHash;
+  // nocommit - how to communicate byte usage to DocumentsWriter?
+  final BytesRefHash bytesHash;
  
   ParallelPostingsArray postingsArray;
-  private final BytesRef utf8;
-  private Comparator<BytesRef> termComp;
+  private final BytesRef termBytesRef;
+  private final AtomicLong bytesUsed;
 
   public TermsHashPerField(DocInverterPerField docInverterPerField, final TermsHashPerThread perThread, final TermsHashPerThread nextPerThread, final FieldInfo fieldInfo) {
     this.perThread = perThread;
@@ -63,18 +63,15 @@ final class TermsHashPerField extends In
     bytePool = perThread.bytePool;
     termBytePool = perThread.termBytePool;
     docState = perThread.docState;
-
-    postingsHash = new int[postingsHashSize];
-    Arrays.fill(postingsHash, -1);
-    bytesUsed(postingsHashSize * RamUsageEstimator.NUM_BYTES_INT);
+    bytesUsed =  perThread.termsHash.trackAllocations?perThread.termsHash.docWriter.bytesUsed:new AtomicLong();
 
     fieldState = docInverterPerField.fieldState;
     this.consumer = perThread.consumer.addField(this, fieldInfo);
-    initPostingsArray();
-
+    PostingsBytesStartArray byteStarts = new PostingsBytesStartArray(this, bytesUsed);
+    bytesHash = new BytesRefHash(termBytePool, HASH_INIT_SIZE, byteStarts); 
     streamCount = consumer.getStreamCount();
     numPostingInt = 2*streamCount;
-    utf8 = perThread.utf8;
+    termBytesRef = perThread.termBytesRef;
     this.fieldInfo = fieldInfo;
     if (nextPerThread != null)
       nextPerField = (TermsHashPerField) nextPerThread.addField(docInverterPerField, fieldInfo);
@@ -82,48 +79,14 @@ final class TermsHashPerField extends In
       nextPerField = null;
   }
 
-  private void initPostingsArray() {
-    postingsArray = consumer.createPostingsArray(2);
-    bytesUsed(postingsArray.size * postingsArray.bytesPerPosting());
-  }
-
-  // sugar: just forwards to DW
-  private void bytesUsed(long size) {
-    if (perThread.termsHash.trackAllocations) {
-      perThread.termsHash.docWriter.bytesUsed(size);
-    }
-  }
-  
   void shrinkHash(int targetSize) {
-    assert postingsCompacted || numPostings == 0;
-
-    final int newSize = 4;
-    if (newSize != postingsHash.length) {
-      final long previousSize = postingsHash.length;
-      postingsHash = new int[newSize];
-      bytesUsed((newSize-previousSize)*RamUsageEstimator.NUM_BYTES_INT);
-      Arrays.fill(postingsHash, -1);
-      postingsHashSize = newSize;
-      postingsHashHalfSize = newSize/2;
-      postingsHashMask = newSize-1;
-    }
-
-    // Fully free the postings array on each flush:
-    if (postingsArray != null) {
-      bytesUsed(-postingsArray.bytesPerPosting() * postingsArray.size);
-      postingsArray = null;
-    }
+    // Fully free the bytesHash on each flush but keep the pool untouched
+    // bytesHash.clear will clear the ByteStartArray and in turn the ParallelPostingsArray too
+    bytesHash.clear(false); 
   }
 
   public void reset() {
-    if (!postingsCompacted)
-      compactPostings();
-    assert numPostings <= postingsHash.length;
-    if (numPostings > 0) {
-      Arrays.fill(postingsHash, 0, numPostings, -1);
-      numPostings = 0;
-    }
-    postingsCompacted = false;
+    bytesHash.clear(false);
     if (nextPerField != null)
       nextPerField.reset();
   }
@@ -134,12 +97,6 @@ final class TermsHashPerField extends In
     if (nextPerField != null)
       nextPerField.abort();
   }
-  
-  private final void growParallelPostingsArray() {
-    int oldSize = postingsArray.size;
-    this.postingsArray = this.postingsArray.grow();
-    bytesUsed(postingsArray.bytesPerPosting() * (postingsArray.size - oldSize));
-  }
 
   public void initReader(ByteSliceReader reader, int termID, int stream) {
     assert stream < streamCount;
@@ -151,139 +108,12 @@ final class TermsHashPerField extends In
                 ints[upto+stream]);
   }
 
-  private synchronized void compactPostings() {
-    int upto = 0;
-    for(int i=0;i<postingsHashSize;i++) {
-      if (postingsHash[i] != -1) {
-        if (upto < i) {
-          postingsHash[upto] = postingsHash[i];
-          postingsHash[i] = -1;
-        }
-        upto++;
-      }
-    }
-
-    assert upto == numPostings;
-    postingsCompacted = true;
-  }
 
   /** Collapse the hash table & sort in-place. */
   public int[] sortPostings(Comparator<BytesRef> termComp) {
-    this.termComp = termComp;
-    compactPostings();
-    quickSort(postingsHash, 0, numPostings-1);
-    return postingsHash;
+   return bytesHash.sort(termComp);
   }
 
-  void quickSort(int[] termIDs, int lo, int hi) {
-    if (lo >= hi)
-      return;
-    else if (hi == 1+lo) {
-      if (comparePostings(termIDs[lo], termIDs[hi]) > 0) {
-        final int tmp = termIDs[lo];
-        termIDs[lo] = termIDs[hi];
-        termIDs[hi] = tmp;
-      }
-      return;
-    }
-
-    int mid = (lo + hi) >>> 1;
-
-    if (comparePostings(termIDs[lo], termIDs[mid]) > 0) {
-      int tmp = termIDs[lo];
-      termIDs[lo] = termIDs[mid];
-      termIDs[mid] = tmp;
-    }
-
-    if (comparePostings(termIDs[mid], termIDs[hi]) > 0) {
-      int tmp = termIDs[mid];
-      termIDs[mid] = termIDs[hi];
-      termIDs[hi] = tmp;
-
-      if (comparePostings(termIDs[lo], termIDs[mid]) > 0) {
-        int tmp2 = termIDs[lo];
-        termIDs[lo] = termIDs[mid];
-        termIDs[mid] = tmp2;
-      }
-    }
-
-    int left = lo + 1;
-    int right = hi - 1;
-
-    if (left >= right)
-      return;
-
-    int partition = termIDs[mid];
-
-    for (; ;) {
-      while (comparePostings(termIDs[right], partition) > 0)
-        --right;
-
-      while (left < right && comparePostings(termIDs[left], partition) <= 0)
-        ++left;
-
-      if (left < right) {
-        int tmp = termIDs[left];
-        termIDs[left] = termIDs[right];
-        termIDs[right] = tmp;
-        --right;
-      } else {
-        break;
-      }
-    }
-
-    quickSort(termIDs, lo, left);
-    quickSort(termIDs, left + 1, hi);
-  }
-
-  /** Compares term text for two Posting instance and
-   *  returns -1 if p1 < p2; 1 if p1 > p2; else 0. */
-  int comparePostings(int term1, int term2) {
-
-    if (term1 == term2) {
-      // Our quicksort does this, eg during partition
-      return 0;
-    }
-
-    termBytePool.setBytesRef(perThread.tr1, postingsArray.textStarts[term1]);
-    termBytePool.setBytesRef(perThread.tr2, postingsArray.textStarts[term2]);
-
-    return termComp.compare(perThread.tr1, perThread.tr2);
-  }
-
-  /** Test whether the text for current RawPostingList p equals
-   *  current tokenText in utf8. */
-  private boolean postingEquals(final int termID) {
-    final int textStart = postingsArray.textStarts[termID];
-    final byte[] text = termBytePool.buffers[textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT];
-    assert text != null;
-
-    int pos = textStart & DocumentsWriter.BYTE_BLOCK_MASK;
-    
-    final int len;
-    if ((text[pos] & 0x80) == 0) {
-      // length is 1 byte
-      len = text[pos];
-      pos += 1;
-    } else {
-      // length is 2 bytes
-      len = (text[pos]&0x7f) + ((text[pos+1]&0xff)<<7);
-      pos += 2;
-    }
-
-    if (len == utf8.length) {
-      final byte[] utf8Bytes = utf8.bytes;
-      for(int tokenPos=0;tokenPos<utf8.length;pos++,tokenPos++) {
-        if (utf8Bytes[tokenPos] != text[pos]) {
-          return false;
-        }
-      }
-      return true;
-    } else {
-      return false;
-    }
-  }
-  
   private boolean doCall;
   private boolean doNextCall;
 
@@ -299,10 +129,7 @@ final class TermsHashPerField extends In
   @Override
   boolean start(Fieldable[] fields, int count) throws IOException {
     doCall = consumer.start(fields, count);
-    if (postingsArray == null) {
-      initPostingsArray();
-    }
-
+    bytesHash.reinit();
     if (nextPerField != null)
       doNextCall = nextPerField.start(fields, count);
     return doCall || doNextCall;
@@ -312,52 +139,15 @@ final class TermsHashPerField extends In
   // because token text has already been "interned" into
   // textStart, so we hash by textStart
   public void add(int textStart) throws IOException {
-    int code = textStart;
-
-    int hashPos = code & postingsHashMask;
-
-    assert !postingsCompacted;
-
-    // Locate RawPostingList in hash
-    int termID = postingsHash[hashPos];
-
-    if (termID != -1 && postingsArray.textStarts[termID] != textStart) {
-      // Conflict: keep searching different locations in
-      // the hash table.
-      final int inc = ((code>>8)+code)|1;
-      do {
-        code += inc;
-        hashPos = code & postingsHashMask;
-        termID = postingsHash[hashPos];
-      } while (termID != -1 && postingsArray.textStarts[termID] != textStart);
-    }
-
-    if (termID == -1) {
-
+    int termID = bytesHash.addByPoolOffset(textStart);
+    if (termID >= 0) {      // New posting
       // First time we are seeing this token since we last
       // flushed the hash.
-
-      // New posting
-      termID = numPostings++;
-      if (termID >= postingsArray.size) {
-        growParallelPostingsArray();
-      }
-
-      assert termID >= 0;
-
-      postingsArray.textStarts[termID] = textStart;
-          
-      assert postingsHash[hashPos] == -1;
-      postingsHash[hashPos] = termID;
-
-      if (numPostings == postingsHashHalfSize)
-        rehashPostings(2*postingsHashSize);
-
       // Init stream slices
       if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
         intPool.nextBuffer();
 
-      if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE)
+      if (ByteBlockPool.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE)
         bytePool.nextBuffer();
 
       intUptos = intPool.buffer;
@@ -375,6 +165,7 @@ final class TermsHashPerField extends In
       consumer.newTerm(termID);
 
     } else {
+      termID = (-termID)-1;
       int intStart = postingsArray.intStarts[termID];
       intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
       intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK;
@@ -386,105 +177,39 @@ final class TermsHashPerField extends In
   @Override
   void add() throws IOException {
 
-    assert !postingsCompacted;
-
     // We are first in the chain so we must "intern" the
     // term text into textStart address
-
     // Get the text & hash of this term.
-    int code = termAtt.toBytesRef(utf8);
-
-    int hashPos = code & postingsHashMask;
-
-    // Locate RawPostingList in hash
-    int termID = postingsHash[hashPos];
-
-    if (termID != -1 && !postingEquals(termID)) {
-      // Conflict: keep searching different locations in
-      // the hash table.
-      final int inc = ((code>>8)+code)|1;
-      do {
-        code += inc;
-        hashPos = code & postingsHashMask;
-        termID = postingsHash[hashPos];
-      } while (termID != -1 && !postingEquals(termID));
-    }
-
-    if (termID == -1) {
-
-      // First time we are seeing this token since we last
-      // flushed the hash.
-      final int textLen2 = 2+utf8.length;
-      if (textLen2 + bytePool.byteUpto > DocumentsWriter.BYTE_BLOCK_SIZE) {
-        // Not enough room in current block
-
-        if (utf8.length > DocumentsWriter.MAX_TERM_LENGTH_UTF8) {
-          // Just skip this term, to remain as robust as
-          // possible during indexing.  A TokenFilter
-          // can be inserted into the analyzer chain if
-          // other behavior is wanted (pruning the term
-          // to a prefix, throwing an exception, etc).
-          if (docState.maxTermPrefix == null) {
-            final int saved = utf8.length;
-            try {
-              utf8.length = Math.min(30, DocumentsWriter.MAX_TERM_LENGTH_UTF8);
-              docState.maxTermPrefix = utf8.toString();
-            } finally {
-              utf8.length = saved;
-            }
-          }
-
-          consumer.skippingLongTerm();
-          return;
+    int termID;
+    try{
+       termID = bytesHash.add(termBytesRef, termAtt.toBytesRef(termBytesRef));
+    }catch (MaxBytesLengthExceededException e) {
+      // Not enough room in current block
+      // Just skip this term, to remain as robust as
+      // possible during indexing.  A TokenFilter
+      // can be inserted into the analyzer chain if
+      // other behavior is wanted (pruning the term
+      // to a prefix, throwing an exception, etc).
+      if (docState.maxTermPrefix == null) {
+        final int saved = termBytesRef.length;
+        try {
+          termBytesRef.length = Math.min(30, DocumentsWriter.MAX_TERM_LENGTH_UTF8);
+          docState.maxTermPrefix = termBytesRef.toString();
+        } finally {
+          termBytesRef.length = saved;
         }
-        bytePool.nextBuffer();
-      }
-
-      // New posting
-      termID = numPostings++;
-      if (termID >= postingsArray.size) {
-        growParallelPostingsArray();
       }
-
-      assert termID != -1;
-      assert postingsHash[hashPos] == -1;
-
-      postingsHash[hashPos] = termID;
-
-      final byte[] text = bytePool.buffer;
-      final int textUpto = bytePool.byteUpto;
-      postingsArray.textStarts[termID] = textUpto + bytePool.byteOffset;
-
-      // We first encode the length, followed by the UTF8
-      // bytes.  Length is encoded as vInt, but will consume
-      // 1 or 2 bytes at most (we reject too-long terms,
-      // above).
-
-      // encode length @ start of bytes
-      if (utf8.length < 128) {
-        // 1 byte to store length
-        text[textUpto] = (byte) utf8.length;
-        bytePool.byteUpto += utf8.length + 1;
-        System.arraycopy(utf8.bytes, 0, text, textUpto+1, utf8.length);
-      } else {
-        // 2 byte to store length
-        text[textUpto] = (byte) (0x80 | (utf8.length & 0x7f));
-        text[textUpto+1] = (byte) ((utf8.length>>7) & 0xff);
-        bytePool.byteUpto += utf8.length + 2;
-        System.arraycopy(utf8.bytes, 0, text, textUpto+2, utf8.length);
-      }
-
-      if (numPostings == postingsHashHalfSize) {
-        rehashPostings(2*postingsHashSize);
-        bytesUsed(2*numPostings * RamUsageEstimator.NUM_BYTES_INT);
-      }
-
+      consumer.skippingLongTerm();
+      return;
+    }
+    if (termID >= 0) {// New posting
+      bytesHash.byteStart(termID);
       // Init stream slices
       if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE) {
         intPool.nextBuffer();
       }
 
-      if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) {
+      if (ByteBlockPool.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) {
         bytePool.nextBuffer();
       }
 
@@ -503,6 +228,7 @@ final class TermsHashPerField extends In
       consumer.newTerm(termID);
 
     } else {
+      termID = (-termID)-1;
       final int intStart = postingsArray.intStarts[termID];
       intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
       intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK;
@@ -518,9 +244,9 @@ final class TermsHashPerField extends In
 
   void writeByte(int stream, byte b) {
     int upto = intUptos[intUptoStart+stream];
-    byte[] bytes = bytePool.buffers[upto >> DocumentsWriter.BYTE_BLOCK_SHIFT];
+    byte[] bytes = bytePool.buffers[upto >> ByteBlockPool.BYTE_BLOCK_SHIFT];
     assert bytes != null;
-    int offset = upto & DocumentsWriter.BYTE_BLOCK_MASK;
+    int offset = upto & ByteBlockPool.BYTE_BLOCK_MASK;
     if (bytes[offset] != 0) {
       // End of slice; allocate a new one
       offset = bytePool.allocSlice(bytes, offset);
@@ -553,61 +279,51 @@ final class TermsHashPerField extends In
     if (nextPerField != null)
       nextPerField.finish();
   }
+  
+  private static final class PostingsBytesStartArray extends BytesStartArray {
 
-  /** Called when postings hash is too small (> 50%
-   *  occupied) or too large (< 20% occupied). */
-  void rehashPostings(final int newSize) {
-
-    final int newMask = newSize-1;
-
-    int[] newHash = new int[newSize];
-    Arrays.fill(newHash, -1);
-    for(int i=0;i<postingsHashSize;i++) {
-      int termID = postingsHash[i];
-      if (termID != -1) {
-        int code;
-        if (perThread.primary) {
-          final int textStart = postingsArray.textStarts[termID];
-          final int start = textStart & DocumentsWriter.BYTE_BLOCK_MASK;
-          final byte[] text = bytePool.buffers[textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT];
-          code = 0;
-
-          final int len;
-          int pos;
-          if ((text[start] & 0x80) == 0) {
-            // length is 1 byte
-            len = text[start];
-            pos = start+1;
-          } else {
-            len = (text[start]&0x7f) + ((text[start+1]&0xff)<<7);
-            pos = start+2;
-          }
-
-          final int endPos = pos+len;
-          while(pos < endPos) {
-            code = (code*31) + text[pos++];
-          }
-        } else {
-          code = postingsArray.textStarts[termID];
-        }
+    private final TermsHashPerField perField;
+    private final AtomicLong bytesUsed;
 
-        int hashPos = code & newMask;
-        assert hashPos >= 0;
-        if (newHash[hashPos] != -1) {
-          final int inc = ((code>>8)+code)|1;
-          do {
-            code += inc;
-            hashPos = code & newMask;
-          } while (newHash[hashPos] != -1);
-        }
-        newHash[hashPos] = termID;
+    private PostingsBytesStartArray(
+        TermsHashPerField perField, AtomicLong bytesUsed) {
+      this.perField = perField;
+      this.bytesUsed = bytesUsed;
+    }
+    
+    @Override
+    public int[] init() {
+      if(perField.postingsArray == null) { 
+        perField.postingsArray = perField.consumer.createPostingsArray(2);
+        bytesUsed.addAndGet(perField.postingsArray.size * perField.postingsArray.bytesPerPosting());
+      }
+      return perField.postingsArray.textStarts;
+    }
+
+    @Override
+    public int[] grow() {
+      ParallelPostingsArray postingsArray = perField.postingsArray;
+      final int oldSize = perField.postingsArray.size;
+      postingsArray = perField.postingsArray = postingsArray.grow();
+      bytesUsed
+          .addAndGet((postingsArray.bytesPerPosting() * (postingsArray.size - oldSize)));
+      return postingsArray.textStarts;
+    }
+
+    @Override
+    public int[] clear() {
+      if(perField.postingsArray != null) {
+        bytesUsed.addAndGet(-perField.postingsArray.size * perField.postingsArray.bytesPerPosting());
+        perField.postingsArray = null;
       }
+      return null;
     }
 
-    postingsHashMask = newMask;
-    postingsHash = newHash;
+    @Override
+    public AtomicLong bytesUsed() {
+      return bytesUsed;
+    }
 
-    postingsHashSize = newSize;
-    postingsHashHalfSize = newSize >> 1;
   }
+
 }

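The PostingsBytesStartArray above is what keeps the ParallelPostingsArray sized in lock-step with the hash: BytesRefHash calls init()/grow()/clear() and charges the AtomicLong those callbacks expose. As a hedged illustration of that BytesStartArray contract, a stand-alone implementation backed by a plain int[] might look like the sketch below. It is hypothetical and not part of the patch; the commit itself ships DirectBytesStartArray for the simple case.

    import java.util.concurrent.atomic.AtomicLong;
    import org.apache.lucene.util.ArrayUtil;
    import org.apache.lucene.util.BytesRefHash.BytesStartArray;
    import org.apache.lucene.util.RamUsageEstimator;

    // Sketch only: BytesRefHash drives the lifecycle of the int[] through these callbacks.
    final class SimpleBytesStartArray extends BytesStartArray {
      private final AtomicLong bytesUsed = new AtomicLong(0);
      private int[] bytesStart;

      @Override
      public int[] init() {
        bytesStart = new int[16];
        bytesUsed.addAndGet(bytesStart.length * RamUsageEstimator.NUM_BYTES_INT);
        return bytesStart;
      }

      @Override
      public int[] grow() {
        final int oldLength = bytesStart.length;
        bytesStart = ArrayUtil.grow(bytesStart, oldLength + 1);
        bytesUsed.addAndGet((bytesStart.length - oldLength) * RamUsageEstimator.NUM_BYTES_INT);
        return bytesStart;
      }

      @Override
      public int[] clear() {
        if (bytesStart != null) {
          bytesUsed.addAndGet(-bytesStart.length * RamUsageEstimator.NUM_BYTES_INT);
          bytesStart = null;
        }
        return null;
      }

      @Override
      public AtomicLong bytesUsed() {
        return bytesUsed;
      }
    }

BytesRefHash only ever sees the int[] returned from these callbacks, so whoever owns the array (here the sketch itself, in the patch the ParallelPostingsArray) controls its lifecycle and its RAM accounting.
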
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerThread.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerThread.java?rev=1003790&r1=1003789&r2=1003790&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerThread.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerThread.java Sat Oct  2 12:44:32 2010
@@ -17,8 +17,8 @@ package org.apache.lucene.index;
  * limitations under the License.
  */
 
+import org.apache.lucene.util.ByteBlockPool;
 import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.UnicodeUtil;
 
 import java.io.IOException;
 
@@ -26,20 +26,17 @@ final class TermsHashPerThread extends I
 
   final TermsHash termsHash;
   final TermsHashConsumerPerThread consumer;
-  final TermsHashPerThread nextPerThread;
+  final TermsHashPerThread nextPerThread; // the secondary is currently consumed by TermVectorsWriter 
+  // see secondary entry point in TermsHashPerField#add(int)
 
   final IntBlockPool intPool;
   final ByteBlockPool bytePool;
   final ByteBlockPool termBytePool;
+  
   final boolean primary;
   final DocumentsWriter.DocState docState;
-
-  // Used when comparing postings via termRefComp, in TermsHashPerField
-  final BytesRef tr1 = new BytesRef();
-  final BytesRef tr2 = new BytesRef();
-
-  // Used by perField:
-  final BytesRef utf8 = new BytesRef(10);
+  // Used by perField to obtain terms from the analysis chain
+  final BytesRef termBytesRef = new BytesRef(10);
 
   public TermsHashPerThread(DocInverterPerThread docInverterPerThread, final TermsHash termsHash, final TermsHash nextTermsHash, final TermsHashPerThread primaryPerThread) {
     docState = docInverterPerThread.docState;
@@ -48,21 +45,18 @@ final class TermsHashPerThread extends I
     this.consumer = termsHash.consumer.addThread(this);
 
     intPool = new IntBlockPool(termsHash.docWriter);
-    bytePool = new ByteBlockPool(termsHash.docWriter.byteBlockAllocator);
-
-    if (nextTermsHash != null) {
+    bytePool = new ByteBlockPool(termsHash.docWriter.byteBlockAllocator); // use the allocator from the docWriter which tracks the used bytes 
+    primary = nextTermsHash != null;
+    if (primary) {
       // We are primary
-      primary = true;
       termBytePool = bytePool;
+      nextPerThread = nextTermsHash.addThread(docInverterPerThread, this); // this will be the primaryPerThread in the secondary
+      assert nextPerThread != null;
     } else {
-      primary = false;
-      termBytePool = primaryPerThread.bytePool;
-    }
-
-    if (nextTermsHash != null)
-      nextPerThread = nextTermsHash.addThread(docInverterPerThread, this);
-    else
+      assert primaryPerThread != null;
+      termBytePool = primaryPerThread.bytePool; // we are secondary and share the byte pool with the primary 
       nextPerThread = null;
+    }
   }
 
   @Override
@@ -74,30 +68,25 @@ final class TermsHashPerThread extends I
   synchronized public void abort() {
     reset(true);
     consumer.abort();
-    if (nextPerThread != null)
+    if (primary)
       nextPerThread.abort();
   }
 
   @Override
   public void startDocument() throws IOException {
     consumer.startDocument();
-    if (nextPerThread != null)
+    if (primary)
       nextPerThread.consumer.startDocument();
   }
 
   @Override
   public DocumentsWriter.DocWriter finishDocument() throws IOException {
     final DocumentsWriter.DocWriter doc = consumer.finishDocument();
-
-    final DocumentsWriter.DocWriter doc2;
-    if (nextPerThread != null)
-      doc2 = nextPerThread.consumer.finishDocument();
-    else
-      doc2 = null;
+    final DocumentsWriter.DocWriter docFromSecondary = primary? nextPerThread.consumer.finishDocument():null;
     if (doc == null)
-      return doc2;
+      return docFromSecondary;
     else {
-      doc.setNext(doc2);
+      doc.setNext(docFromSecondary);
       return doc;
     }
   }
@@ -106,9 +95,5 @@ final class TermsHashPerThread extends I
   void reset(boolean recyclePostings) {
     intPool.reset();
     bytePool.reset();
-
-    if (primary) {
-      bytePool.reset();
-    }
   }
 }

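The primary/secondary wiring above is what lets the term-vectors chain avoid copying term bytes: the primary interns each term into the shared termBytePool, and the secondary only stores the resulting pool offset (the add(int) entry point in TermsHashPerField). A small stand-alone sketch of that sharing using the public BytesRefHash API from this commit; the class name, byte filling, and variable names are illustrative only.

    import java.util.concurrent.atomic.AtomicLong;
    import org.apache.lucene.util.ByteBlockPool;
    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.BytesRefHash;
    import org.apache.lucene.util.RecyclingByteBlockAllocator;

    public class SharedPoolExample {
      public static void main(String[] args) {
        final AtomicLong bytesUsed = new AtomicLong(0);
        final ByteBlockPool sharedPool = new ByteBlockPool(
            new RecyclingByteBlockAllocator(ByteBlockPool.BYTE_BLOCK_SIZE, Integer.MAX_VALUE, bytesUsed));
        final BytesRefHash primary = new BytesRefHash(sharedPool);
        final BytesRefHash secondary = new BytesRefHash(sharedPool); // same pool: term bytes exist once

        final BytesRef term = new BytesRef(10);
        term.bytes[0] = (byte) 'b'; term.bytes[1] = (byte) 'a'; term.bytes[2] = (byte) 'r';
        term.length = 3;

        final int primaryOrd = primary.add(term);               // interns the bytes into sharedPool
        final int textStart = primary.byteStart(primaryOrd);    // pool offset of the interned bytes
        final int secondaryOrd = secondary.addByPoolOffset(textStart); // keyed by offset, no copy
        System.out.println("primary=" + primaryOrd + " secondary=" + secondaryOrd);
      }
    }
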
Copied: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/ByteBlockPool.java (from r1002577, lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteBlockPool.java)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/ByteBlockPool.java?p2=lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/ByteBlockPool.java&p1=lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteBlockPool.java&r1=1002577&r2=1003790&rev=1003790&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteBlockPool.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/ByteBlockPool.java Sat Oct  2 12:44:32 2010
@@ -1,4 +1,4 @@
-package org.apache.lucene.index;
+package org.apache.lucene.util;
 
 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
@@ -34,26 +34,42 @@ package org.apache.lucene.index;
  * hit a non-zero byte. */
 
 import java.util.Arrays;
-import org.apache.lucene.util.BytesRef;
+
+
 import java.util.List;
 import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_REF;
-import org.apache.lucene.util.ArrayUtil;
 
-final class ByteBlockPool {
+public final class ByteBlockPool {
+  public final static int BYTE_BLOCK_SHIFT = 15;
+  public final static int BYTE_BLOCK_SIZE = 1 << BYTE_BLOCK_SHIFT;
+  public final static int BYTE_BLOCK_MASK = BYTE_BLOCK_SIZE - 1;
+
+  public abstract static class Allocator {
+    protected final int blockSize;
+
+    public Allocator(int blockSize) {
+      this.blockSize = blockSize;
+    }
 
-  abstract static class Allocator {
-    abstract void recycleByteBlocks(byte[][] blocks, int start, int end);
-    abstract void recycleByteBlocks(List<byte[]> blocks);
-    abstract byte[] getByteBlock();
+    public abstract void recycleByteBlocks(byte[][] blocks, int start, int end);
+
+    public void recycleByteBlocks(List<byte[]> blocks) {
+      final byte[][] b = blocks.toArray(new byte[blocks.size()][]);
+      recycleByteBlocks(b, 0, b.length);
+    }
+
+    public byte[] getByteBlock() {
+      return new byte[blockSize];
+    }
   }
 
   public byte[][] buffers = new byte[10][];
 
   int bufferUpto = -1;                        // Which buffer we are upto
-  public int byteUpto = DocumentsWriter.BYTE_BLOCK_SIZE;             // Where we are in head buffer
+  public int byteUpto = BYTE_BLOCK_SIZE;             // Where we are in head buffer
 
   public byte[] buffer;                              // Current head buffer
-  public int byteOffset = -DocumentsWriter.BYTE_BLOCK_SIZE;          // Current head offset
+  public int byteOffset = -BYTE_BLOCK_SIZE;          // Current head offset
 
   private final Allocator allocator;
 
@@ -95,11 +111,11 @@ final class ByteBlockPool {
     bufferUpto++;
 
     byteUpto = 0;
-    byteOffset += DocumentsWriter.BYTE_BLOCK_SIZE;
+    byteOffset += BYTE_BLOCK_SIZE;
   }
 
   public int newSlice(final int size) {
-    if (byteUpto > DocumentsWriter.BYTE_BLOCK_SIZE-size)
+    if (byteUpto > BYTE_BLOCK_SIZE-size)
       nextBuffer();
     final int upto = byteUpto;
     byteUpto += size;
@@ -112,9 +128,10 @@ final class ByteBlockPool {
   // is just a compact way to encode X+1 with a max.  Second
   // array is the length of each slice, ie first slice is 5
   // bytes, next slice is 14 bytes, etc.
-  final static int[] nextLevelArray = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9};
-  final static int[] levelSizeArray = {5, 14, 20, 30, 40, 40, 80, 80, 120, 200};
-  final static int FIRST_LEVEL_SIZE = levelSizeArray[0];
+  
+  public final static int[] nextLevelArray = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9};
+  public final static int[] levelSizeArray = {5, 14, 20, 30, 40, 40, 80, 80, 120, 200};
+  public final static int FIRST_LEVEL_SIZE = levelSizeArray[0];
 
   public int allocSlice(final byte[] slice, final int upto) {
 
@@ -123,7 +140,7 @@ final class ByteBlockPool {
     final int newSize = levelSizeArray[newLevel];
 
     // Maybe allocate another block
-    if (byteUpto > DocumentsWriter.BYTE_BLOCK_SIZE-newSize)
+    if (byteUpto > BYTE_BLOCK_SIZE-newSize)
       nextBuffer();
 
     final int newUpto = byteUpto;
@@ -150,9 +167,9 @@ final class ByteBlockPool {
 
   // Fill in a BytesRef from term's length & bytes encoded in
   // byte block
-  final BytesRef setBytesRef(BytesRef term, int textStart) {
-    final byte[] bytes = term.bytes = buffers[textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT];
-    int pos = textStart & DocumentsWriter.BYTE_BLOCK_MASK;
+  public final BytesRef setBytesRef(BytesRef term, int textStart) {
+    final byte[] bytes = term.bytes = buffers[textStart >> BYTE_BLOCK_SHIFT];
+    int pos = textStart & BYTE_BLOCK_MASK;
     if ((bytes[pos] & 0x80) == 0) {
       // length is 1 byte
       term.length = bytes[pos];

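For readers new to the slice machinery that ByteBlockPool now exposes publicly: newSlice() hands out a FIRST_LEVEL_SIZE slice terminated by a non-zero sentinel byte, and when a writer runs into that sentinel, allocSlice() relocates the tail into the next, larger level (the progression encoded by levelSizeArray/nextLevelArray). Below is a hedged sketch of the resulting write loop, essentially what ByteSliceWriter.writeByte does; the class name, payload, and variable names are illustrative only.

    import java.util.concurrent.atomic.AtomicLong;
    import org.apache.lucene.util.ByteBlockPool;
    import org.apache.lucene.util.RecyclingByteBlockAllocator;

    public class SliceWriteExample {
      public static void main(String[] args) {
        final AtomicLong bytesUsed = new AtomicLong(0);
        final ByteBlockPool pool = new ByteBlockPool(
            new RecyclingByteBlockAllocator(ByteBlockPool.BYTE_BLOCK_SIZE, Integer.MAX_VALUE, bytesUsed));
        pool.nextBuffer();                                               // make a first buffer current

        final int start = pool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE); // 5-byte first-level slice
        byte[] buffer = pool.buffer;
        int upto = start;

        final byte[] payload = {1, 2, 3, 4, 5, 6, 7, 8};                 // arbitrary example data
        for (byte b : payload) {
          if (buffer[upto] != 0) {                                       // hit the end-of-slice sentinel
            upto = pool.allocSlice(buffer, upto);                        // continue in a bigger slice
            buffer = pool.buffer;
          }
          buffer[upto++] = b;
        }
      }
    }
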
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRef.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRef.java?rev=1003790&r1=1003789&r2=1003790&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRef.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRef.java Sat Oct  2 12:44:32 2010
@@ -30,6 +30,7 @@ import java.io.IOException;
  *  @lucene.experimental */
 public final class BytesRef implements Comparable<BytesRef>, Externalizable {
 
+  static final int HASH_PRIME = 31;
   public static final byte[] EMPTY_BYTES = new byte[0]; 
 
   /** The contents of the BytesRef. Should never be {@code null}. */
@@ -182,11 +183,10 @@ public final class BytesRef implements C
    */
   @Override
   public int hashCode() {
-    final int prime = 31;
     int result = 0;
     final int end = offset + length;
     for(int i=offset;i<end;i++) {
-      result = prime * result + bytes[i];
+      result = HASH_PRIME * result + bytes[i];
     }
     return result;
   }

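The hashCode() change above is a pure constant extraction; the computed value is unchanged. For reference, the hash is the usual 31-based polynomial over the value bytes, which any code that recomputes the hash over bytes stored elsewhere (for example, bytes interned in a ByteBlockPool) has to reproduce. The helper below is illustrative only and not part of the patch.

    // Sketch only: recompute a BytesRef's hash by hand; the result matches BytesRef.hashCode().
    static int polynomialHash(org.apache.lucene.util.BytesRef term) {
      int result = 0;
      for (int i = term.offset; i < term.offset + term.length; i++) {
        result = 31 * result + term.bytes[i];  // 31 is the HASH_PRIME constant introduced above
      }
      return result;
    }
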
Added: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRefHash.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRefHash.java?rev=1003790&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRefHash.java (added)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRefHash.java Sat Oct  2 12:44:32 2010
@@ -0,0 +1,574 @@
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.concurrent.atomic.AtomicLong;
+
+import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_MASK;
+import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
+import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SHIFT;
+
+/**
+ * {@link BytesRefHash} is a special purpose hash-map like data-structure
+ * optimized for {@link BytesRef} instances. BytesRefHash maintains mappings of
+ * byte arrays to ordinals (Map&lt;BytesRef,int&gt;), storing the hashed bytes
+ * efficiently in continuous storage. The mapping to the ordinal is
+ * encapsulated inside {@link BytesRefHash} and is guaranteed to be increased
+ * for each added {@link BytesRef}.
+ * 
+ * <p>
+ * Note: The {@link BytesRef} instances passed to
+ * {@link #add(BytesRef)} must not be longer than
+ * {@link ByteBlockPool#BYTE_BLOCK_SIZE}-2. The internal storage is limited
+ * to 2GB total byte storage.
+ * </p>
+ * 
+ * @lucene.internal
+ */
+public final class BytesRefHash {
+
+  private final ByteBlockPool pool;
+  private int hashSize;
+  private int hashHalfSize;
+  private int hashMask;
+  private int count;
+  private int lastCount = -1;
+  private int[] ords;
+  private int[] bytesStart;
+  public static final int DEFAULT_CAPACITY = 16;
+  private final BytesStartArray bytesStartArray;
+  private AtomicLong bytesUsed;
+
+  /**
+   * Creates a new {@link BytesRefHash}
+   */
+  public BytesRefHash(ByteBlockPool pool) {
+    this(pool, DEFAULT_CAPACITY, new DirectBytesStartArray(DEFAULT_CAPACITY));
+  }
+
+  /**
+   * Creates a new {@link BytesRefHash}
+   */
+  public BytesRefHash(ByteBlockPool pool, int capacity,
+      BytesStartArray bytesStartArray) {
+    hashSize = capacity;
+    hashHalfSize = hashSize >> 1;
+    hashMask = hashSize - 1;
+    this.pool = pool;
+    ords = new int[hashSize];
+    Arrays.fill(ords, -1);
+    this.bytesStartArray = bytesStartArray;
+    bytesStart = bytesStartArray.init();
+    bytesUsed = bytesStartArray.bytesUsed();
+    bytesUsed.addAndGet(hashSize * RamUsageEstimator.NUM_BYTES_INT);
+  }
+
+  /**
+   * Returns the number of {@link BytesRef} values in this {@link BytesRefHash}.
+   * 
+   * @return the number of {@link BytesRef} values in this {@link BytesRefHash}.
+   */
+  public int size() {
+    return count;
+  }
+
+  /**
+   * Returns the {@link BytesRef} value for the given ord.
+   * <p>
+   * Note: the given ord must be non-negative and less than the current size
+   * ({@link #size()})
+   * </p>
+   * 
+   * @param ord
+   *          the ord
+   * 
+   * @return a BytesRef instance for the given ord
+   */
+  public BytesRef get(int ord) {
+    assert bytesStart != null : "bytesStart is null - not initialized";
+    assert ord < bytesStart.length: "ord exceeds byteStart len: " + bytesStart.length;
+    return pool.setBytesRef(scratch1, bytesStart[ord]);
+  }
+
+  /**
+   * Returns the ords array in arbitrary order. Valid ords occupy indices 0
+   * through {@link #size()} - 1.
+   * <p>
+   * Note: This is a destructive operation. {@link #clear()} must be called in
+   * order to reuse this {@link BytesRefHash} instance.
+   * </p>
+   */
+  public int[] compact() {
+    assert bytesStart != null : "Bytesstart is null - not initialized";
+    int upto = 0;
+    for (int i = 0; i < hashSize; i++) {
+      if (ords[i] != -1) {
+        if (upto < i) {
+          ords[upto] = ords[i];
+          ords[i] = -1;
+        }
+        upto++;
+      }
+    }
+
+    assert upto == count;
+    lastCount = count;
+    return ords;
+  }
+
+  /**
+   * Returns the ords array sorted by the referenced byte values.
+   * <p>
+   * Note: This is a destructive operation. {@link #clear()} must be called in
+   * order to reuse this {@link BytesRefHash} instance.
+   * </p>
+   * 
+   * @param comp
+   *          the {@link Comparator} used for sorting
+   */
+  public int[] sort(Comparator<BytesRef> comp) {
+    assert bytesStart != null : "Bytesstart is null - not initialized";
+    final int[] compact = compact();
+    quickSort(comp, compact, 0, count - 1);
+    return compact;
+  }
+
+  private void quickSort(Comparator<BytesRef> comp, int[] entries, int lo,
+      int hi) {
+    if (lo >= hi)
+      return;
+    if (hi == 1 + lo) {
+      if (compare(comp, entries[lo], entries[hi]) > 0) {
+        final int tmp = entries[lo];
+        entries[lo] = entries[hi];
+        entries[hi] = tmp;
+      }
+      return;
+    }
+    final int mid = (lo + hi) >>> 1;
+    if (compare(comp, entries[lo], entries[mid]) > 0) {
+      int tmp = entries[lo];
+      entries[lo] = entries[mid];
+      entries[mid] = tmp;
+    }
+
+    if (compare(comp, entries[mid], entries[hi]) > 0) {
+      int tmp = entries[mid];
+      entries[mid] = entries[hi];
+      entries[hi] = tmp;
+
+      if (compare(comp, entries[lo], entries[mid]) > 0) {
+        int tmp2 = entries[lo];
+        entries[lo] = entries[mid];
+        entries[mid] = tmp2;
+      }
+    }
+    int left = lo + 1;
+    int right = hi - 1;
+
+    if (left >= right)
+      return;
+
+    final int partition = entries[mid];
+
+    for (;;) {
+      while (compare(comp, entries[right], partition) > 0)
+        --right;
+
+      while (left < right && compare(comp, entries[left], partition) <= 0)
+        ++left;
+
+      if (left < right) {
+        final int tmp = entries[left];
+        entries[left] = entries[right];
+        entries[right] = tmp;
+        --right;
+      } else {
+        break;
+      }
+    }
+
+    quickSort(comp, entries, lo, left);
+    quickSort(comp, entries, left + 1, hi);
+  }
+
+  private final BytesRef scratch1 = new BytesRef();
+  private final BytesRef scratch2 = new BytesRef();
+
+  private boolean equals(int ord, BytesRef b) {
+    return pool.setBytesRef(scratch1, bytesStart[ord]).bytesEquals(b);
+  }
+
+  private int compare(Comparator<BytesRef> comp, int ord1, int ord2) {
+    assert bytesStart.length > ord1 && bytesStart.length > ord2;
+    return comp.compare(pool.setBytesRef(scratch1, bytesStart[ord1]),
+        pool.setBytesRef(scratch2, bytesStart[ord2]));
+  }
+
+  private boolean shrink(int targetSize) {
+    // Cannot use ArrayUtil.shrink because we require power
+    // of 2:
+    int newSize = hashSize;
+    while (newSize >= 8 && newSize / 4 > targetSize) {
+      newSize /= 2;
+    }
+    if (newSize != hashSize) {
+      bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT
+          * -(hashSize - newSize));
+      hashSize = newSize;
+      ords = new int[hashSize];
+      Arrays.fill(ords, -1);
+      hashHalfSize = newSize / 2;
+      hashMask = newSize - 1;
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  /**
+   * Clears the {@link BytesRefHash}. If <code>resetPool</code> is
+   * <code>true</code> the internally used {@link ByteBlockPool} is reset as
+   * well.
+   */
+  public void clear(boolean resetPool) {
+    lastCount = count;
+    count = 0;
+    if (resetPool)
+      pool.reset();
+    bytesStart = bytesStartArray.clear();
+    if (lastCount != -1 && shrink(lastCount)) {
+      // shrink clears the hash entries
+      return;
+    }
+    Arrays.fill(ords, -1);
+  }
+
+  public void clear() {
+    clear(true);
+  }
+
+  /**
+   * Adds a new {@link BytesRef}
+   * 
+   * @param bytes
+   *          the bytes to hash
+   * @return the ord the given bytes are hashed into if there was no mapping
+   *         for the given bytes, otherwise <code>(-(ord)-1)</code>. This
+   *         guarantees that the return value will always be &gt;= 0 if the
+   *         given bytes haven't been hashed before.
+   * 
+   * @throws MaxBytesLengthExceededException
+   *           if the given bytes are longer than
+   *           {@link ByteBlockPool#BYTE_BLOCK_SIZE}-2
+   */
+  public int add(BytesRef bytes) {
+    return add(bytes, bytes.hashCode());
+  }
+
+  /**
+   * Adds a new {@link BytesRef} with a pre-calculated hash code.
+   * 
+   * @param bytes
+   *          the bytes to hash
+   * @param code
+   *          the bytes hash code
+   * 
+   *          <p>
+   *          Hashcode is defined as:
+   * 
+   *          <pre>
+   * int hash = 0;
+   * for (int i = offset; i &lt; offset + length; i++) {
+   *   hash = 31 * hash + bytes[i];
+   * }
+   * </pre>
+   * 
+   * @return the ord the given bytes are hashed into if there was no mapping
+   *         for the given bytes, otherwise <code>(-(ord)-1)</code>. This
+   *         guarantees that the return value will always be &gt;= 0 if the
+   *         given bytes haven't been hashed before.
+   * 
+   * @throws MaxBytesLengthExceededException
+   *           if the given bytes are longer than
+   *           {@link ByteBlockPool#BYTE_BLOCK_SIZE}-2
+   */
+  public int add(BytesRef bytes, int code) {
+    assert bytesStart != null : "Bytesstart is null - not initialized";
+    final int length = bytes.length;
+    // final position
+    int hashPos = code & hashMask;
+    int e = ords[hashPos];
+    if (e != -1 && !equals(e, bytes)) {
+      // Conflict: keep searching different locations in
+      // the hash table.
+      final int inc = ((code >> 8) + code) | 1;
+      do {
+        code += inc;
+        hashPos = code & hashMask;
+        e = ords[hashPos];
+      } while (e != -1 && !equals(e, bytes));
+    }
+
+    if (e == -1) {
+      // new entry
+      final int len2 = 2 + bytes.length;
+      if (len2 + pool.byteUpto > BYTE_BLOCK_SIZE) {
+        if (len2 > BYTE_BLOCK_SIZE) {
+          throw new MaxBytesLengthExceededException("bytes can be at most "
+              + (BYTE_BLOCK_SIZE - 2) + " in length; got " + bytes.length);
+        }
+        pool.nextBuffer();
+      }
+      final byte[] buffer = pool.buffer;
+      final int bufferUpto = pool.byteUpto;
+      if (count >= bytesStart.length) {
+        bytesStart = bytesStartArray.grow();
+        assert count < bytesStart.length + 1 : "count: " + count + " len: "
+            + bytesStart.length;
+      }
+      e = count++;
+
+      bytesStart[e] = bufferUpto + pool.byteOffset;
+
+      // We first encode the length, followed by the
+      // bytes. Length is encoded as vInt, but will consume
+      // 1 or 2 bytes at most (we reject too-long terms,
+      // above).
+      if (length < 128) {
+        // 1 byte to store length
+        buffer[bufferUpto] = (byte) length;
+        pool.byteUpto += length + 1;
+        System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 1,
+            length);
+      } else {
+        // 2 bytes to store length
+        buffer[bufferUpto] = (byte) (0x80 | (length & 0x7f));
+        buffer[bufferUpto + 1] = (byte) ((length >> 7) & 0xff);
+        pool.byteUpto += length + 2;
+        System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 2,
+            length);
+      }
+      assert ords[hashPos] == -1;
+      ords[hashPos] = e;
+
+      if (count == hashHalfSize) {
+        rehash(2 * hashSize, true);
+      }
+      return e;
+    }
+    return -(e + 1);
+  }
+
+  public int addByPoolOffset(int offset) {
+    assert bytesStart != null : "Bytesstart is null - not initialized";
+    // final position
+    int code = offset;
+    int hashPos = offset & hashMask;
+    int e = ords[hashPos];
+    if (e != -1 && bytesStart[e] != offset) {
+      // Conflict: keep searching different locations in
+      // the hash table.
+      final int inc = ((code >> 8) + code) | 1;
+      do {
+        code += inc;
+        hashPos = code & hashMask;
+        e = ords[hashPos];
+      } while (e != -1 && bytesStart[e] != offset);
+    }
+    if (e == -1) {
+      // new entry
+      if (count >= bytesStart.length) {
+        bytesStart = bytesStartArray.grow();
+        assert count < bytesStart.length + 1 : "count: " + count + " len: "
+            + bytesStart.length;
+      }
+      e = count++;
+      bytesStart[e] = offset;
+      assert ords[hashPos] == -1;
+      ords[hashPos] = e;
+
+      if (count == hashHalfSize) {
+        rehash(2 * hashSize, false);
+      }
+      return e;
+    }
+    return -(e + 1);
+  }
+
+  /**
+   * Called when hash is too small (> 50% occupied) or too large (< 20%
+   * occupied).
+   */
+  private void rehash(final int newSize, boolean hashOnData) {
+    final int newMask = newSize - 1;
+    bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * (newSize));
+    final int[] newHash = new int[newSize];
+    Arrays.fill(newHash, -1);
+    for (int i = 0; i < hashSize; i++) {
+      final int e0 = ords[i];
+      if (e0 != -1) {
+        int code;
+        if (hashOnData) {
+          final int off = bytesStart[e0];
+          final int start = off & BYTE_BLOCK_MASK;
+          final byte[] bytes = pool.buffers[off >> BYTE_BLOCK_SHIFT];
+          code = 0;
+          final int len;
+          int pos;
+          if ((bytes[start] & 0x80) == 0) {
+            // length is 1 byte
+            len = bytes[start];
+            pos = start + 1;
+          } else {
+            len = (bytes[start] & 0x7f) + ((bytes[start + 1] & 0xff) << 7);
+            pos = start + 2;
+          }
+
+          final int endPos = pos + len;
+          while (pos < endPos) {
+            code = BytesRef.HASH_PRIME * code + bytes[pos++];
+          }
+        } else {
+          code = bytesStart[e0];
+        }
+
+        int hashPos = code & newMask;
+        assert hashPos >= 0;
+        if (newHash[hashPos] != -1) {
+          final int inc = ((code >> 8) + code) | 1;
+          do {
+            code += inc;
+            hashPos = code & newMask;
+          } while (newHash[hashPos] != -1);
+        }
+        newHash[hashPos] = e0;
+      }
+    }
+
+    hashMask = newMask;
+    bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * (-ords.length));
+    ords = newHash;
+    hashSize = newSize;
+    hashHalfSize = newSize / 2;
+  }
+
+  /**
+   * Reinitializes the {@link BytesRefHash} after a previous {@link #clear()}
+   * call. If {@link #clear()} has not been called previously, this method has
+   * no effect.
+   */
+  public void reinit() {
+    if (bytesStart == null)
+      bytesStart = bytesStartArray.init();
+  }
+
+  /**
+   * Returns the bytesStart offset into the internally used
+   * {@link ByteBlockPool} for the given ord
+   * 
+   * @param ord
+   *          the ord to look up
+   * @return the bytesStart offset into the internally used
+   *         {@link ByteBlockPool} for the given ord
+   */
+  public int byteStart(int ord) {
+    assert bytesStart != null : "Bytesstart is null - not initialized";
+    assert ord >= 0 && ord < count : ord;
+    return bytesStart[ord];
+  }
+
+  /**
+   * Thrown if a {@link BytesRef} exceeds the {@link BytesRefHash} limit of
+   * {@link ByteBlockPool#BYTE_BLOCK_SIZE}-2 bytes.
+   */
+  @SuppressWarnings("serial")
+  public static class MaxBytesLengthExceededException extends RuntimeException {
+    MaxBytesLengthExceededException(String message) {
+      super(message);
+    }
+  }
+
+  public abstract static class BytesStartArray {
+    /**
+     * Initializes the BytesStartArray. This call will allocate memory.
+     * 
+     * @return the initialized bytes start array
+     */
+    public abstract int[] init();
+
+    /**
+     * Grows the {@link BytesStartArray}
+     * 
+     * @return the grown array
+     */
+    public abstract int[] grow();
+
+    /**
+     * Clears the {@link BytesStartArray} and returns the cleared instance.
+     * 
+     * @return the cleared instance, this might be <code>null</code>
+     */
+    public abstract int[] clear();
+
+    /**
+     * An {@link AtomicLong} reference holding the number of bytes used by this
+     * {@link BytesStartArray}. The {@link BytesRefHash} uses this reference to
+     * track its memory usage.
+     * 
+     * @return an {@link AtomicLong} reference holding the number of bytes used
+     *         by this {@link BytesStartArray}.
+     */
+    public abstract AtomicLong bytesUsed();
+  }
+
+  static class DirectBytesStartArray extends BytesStartArray {
+
+    private final int initSize;
+    private int[] bytesStart;
+    private final AtomicLong bytesUsed = new AtomicLong(0);
+
+    DirectBytesStartArray(int initSize) {
+      this.initSize = initSize;
+    }
+
+    @Override
+    public int[] clear() {
+      return bytesStart = null;
+    }
+
+    @Override
+    public int[] grow() {
+      assert bytesStart != null;
+      return bytesStart = ArrayUtil.grow(bytesStart, bytesStart.length + 1);
+    }
+
+    @Override
+    public int[] init() {
+      return bytesStart = new int[ArrayUtil.oversize(initSize,
+          RamUsageEstimator.NUM_BYTES_INT)];
+    }
+
+    @Override
+    public AtomicLong bytesUsed() {
+      return bytesUsed;
+    }
+
+  }
+}

Propchange: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRefHash.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRefHash.java
------------------------------------------------------------------------------
    svn:keywords = Date Author Id Revision HeadURL

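A minimal usage sketch of the new BytesRefHash (not part of the committed
sources): it relies only on the constructors and methods added above, while
the sample terms and the unsigned-byte comparator are illustrative
assumptions.

    import java.util.Comparator;
    import org.apache.lucene.util.ByteBlockPool;
    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.BytesRefHash;
    import org.apache.lucene.util.RecyclingByteBlockAllocator;

    public class BytesRefHashDemo {
      public static void main(String[] args) throws Exception {
        ByteBlockPool pool = new ByteBlockPool(new RecyclingByteBlockAllocator());
        BytesRefHash hash = new BytesRefHash(pool);

        for (String term : new String[] { "banana", "apple", "apple", "cherry" }) {
          byte[] utf8 = term.getBytes("UTF-8");
          BytesRef ref = new BytesRef();
          ref.bytes = utf8;
          ref.offset = 0;
          ref.length = utf8.length;
          // add() returns the new ord, or -(ord)-1 if the bytes are already present
          int ord = hash.add(ref);
        }
        // the duplicate "apple" maps to its existing ord, so hash.size() == 3

        // sort() is destructive; clear() must be called before reusing the hash
        int[] sorted = hash.sort(new Comparator<BytesRef>() {
          public int compare(BytesRef a, BytesRef b) {
            int len = Math.min(a.length, b.length);
            for (int i = 0; i < len; i++) {
              int diff = (a.bytes[a.offset + i] & 0xff) - (b.bytes[b.offset + i] & 0xff);
              if (diff != 0) {
                return diff;
              }
            }
            return a.length - b.length;
          }
        });
        for (int i = 0; i < hash.size(); i++) {
          BytesRef term = hash.get(sorted[i]); // apple, banana, cherry (byte order)
        }
        hash.clear();
      }
    }
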
Added: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/RecyclingByteBlockAllocator.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/RecyclingByteBlockAllocator.java?rev=1003790&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/RecyclingByteBlockAllocator.java (added)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/RecyclingByteBlockAllocator.java Sat Oct  2 12:44:32 2010
@@ -0,0 +1,162 @@
+package org.apache.lucene.util;
+
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.apache.lucene.util.ByteBlockPool.Allocator;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A {@link ByteBlockPool.Allocator} implementation that recycles unused byte
+ * blocks in a buffer and reuses them in subsequent calls to
+ * {@link #getByteBlock()}.
+ * 
+ * @lucene.internal
+ */
+public final class RecyclingByteBlockAllocator extends ByteBlockPool.Allocator {
+  private byte[][] freeByteBlocks;
+  private final int maxBufferedBlocks;
+  private int freeBlocks = 0;
+  private final AtomicLong bytesUsed;
+  public static final int DEFAULT_BUFFERED_BLOCKS = 64;
+
+  /**
+   * Creates a new {@link RecyclingByteBlockAllocator}
+   * 
+   * @param blockSize
+   *          the block size in bytes
+   * @param maxBufferedBlocks
+   *          maximum number of buffered byte blocks
+   * @param bytesUsed
+   *          {@link AtomicLong} reference counting internally allocated bytes
+   */
+  public RecyclingByteBlockAllocator(int blockSize, int maxBufferedBlocks,
+      AtomicLong bytesUsed) {
+    super(blockSize);
+    freeByteBlocks = new byte[Math.min(10, maxBufferedBlocks)][];
+    this.maxBufferedBlocks = maxBufferedBlocks;
+    this.bytesUsed = bytesUsed;
+  }
+
+  /**
+   * Creates a new {@link RecyclingByteBlockAllocator} that tracks allocated
+   * bytes with a newly created {@link AtomicLong}.
+   * 
+   * @param blockSize
+   *          the block size in bytes
+   * @param maxBufferedBlocks
+   *          maximum number of buffered byte blocks
+   */
+  public RecyclingByteBlockAllocator(int blockSize, int maxBufferedBlocks) {
+    this(blockSize, maxBufferedBlocks, new AtomicLong());
+  }
+
+  /**
+   * Creates a new {@link RecyclingByteBlockAllocator} with a block size of
+   * {@link ByteBlockPool#BYTE_BLOCK_SIZE}
+   * ({@value ByteBlockPool#BYTE_BLOCK_SIZE}), a buffered blocks limit of
+   * {@link #DEFAULT_BUFFERED_BLOCKS} ({@value #DEFAULT_BUFFERED_BLOCKS}) and a
+   * newly created {@link AtomicLong} for tracking allocated bytes.
+   */
+  public RecyclingByteBlockAllocator() {
+    this(ByteBlockPool.BYTE_BLOCK_SIZE, DEFAULT_BUFFERED_BLOCKS, new AtomicLong());
+  }
+
+  @Override
+  public synchronized byte[] getByteBlock() {
+    if (freeBlocks == 0) {
+      bytesUsed.addAndGet(blockSize);
+      return new byte[blockSize];
+    }
+    final byte[] b = freeByteBlocks[--freeBlocks];
+    freeByteBlocks[freeBlocks] = null;
+    return b;
+  }
+
+  @Override
+  public synchronized void recycleByteBlocks(byte[][] blocks, int start, int end) {
+    final int numBlocks = Math.min(maxBufferedBlocks - freeBlocks, end - start);
+    final int size = freeBlocks + numBlocks;
+    if (size >= freeByteBlocks.length) {
+      final byte[][] newBlocks = new byte[ArrayUtil.oversize(size,
+          RamUsageEstimator.NUM_BYTES_OBJ_REF)][];
+      System.arraycopy(freeByteBlocks, 0, newBlocks, 0, freeBlocks);
+      freeByteBlocks = newBlocks;
+    }
+    final int stop = start + numBlocks;
+    for (int i = start; i < stop; i++) {
+      freeByteBlocks[freeBlocks++] = blocks[i];
+      blocks[i] = null;
+    }
+    for (int i = stop; i < end; i++) {
+      blocks[i] = null;
+    }
+    bytesUsed.addAndGet(-(end - stop) * blockSize);
+    assert bytesUsed.get() >= 0;
+  }
+
+  /**
+   * @return the number of currently buffered blocks
+   */
+  public synchronized int numBufferedBlocks() {
+    return freeBlocks;
+  }
+
+  /**
+   * @return the number of bytes currently allocated by this {@link Allocator}
+   */
+  public long bytesUsed() {
+    return bytesUsed.get();
+  }
+
+  /**
+   * @return the maximum number of buffered byte blocks
+   */
+  public int maxBufferedBlocks() {
+    return maxBufferedBlocks;
+  }
+
+  /**
+   * Removes the given number of byte blocks from the buffer if possible.
+   * 
+   * @param num
+   *          the number of byte blocks to remove
+   * @return the number of blocks actually removed
+   */
+  public synchronized int freeBlocks(int num) {
+    assert num >= 0;
+    final int stop;
+    final int count;
+    if (num > freeBlocks) {
+      stop = 0;
+      count = freeBlocks;
+    } else {
+      stop = freeBlocks - num;
+      count = num;
+    }
+    while (freeBlocks > stop) {
+      freeByteBlocks[--freeBlocks] = null;
+    }
+    bytesUsed.addAndGet(-count*blockSize);
+    assert bytesUsed.get() >= 0;
+    return count;
+  }
+}
\ No newline at end of file

Propchange: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/RecyclingByteBlockAllocator.java
------------------------------------------------------------------------------
    svn:eol-style = native

Propchange: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/RecyclingByteBlockAllocator.java
------------------------------------------------------------------------------
    svn:keywords = Date Author Id Revision HeadURL

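And a short sketch of the recycling behavior of the new allocator (again not
part of the commit; it exercises only the methods defined above, with an
illustrative block budget of 4):

    import org.apache.lucene.util.ByteBlockPool;
    import org.apache.lucene.util.RecyclingByteBlockAllocator;

    public class RecyclingAllocatorDemo {
      public static void main(String[] args) {
        RecyclingByteBlockAllocator allocator =
            new RecyclingByteBlockAllocator(ByteBlockPool.BYTE_BLOCK_SIZE, 4);

        byte[][] blocks = new byte[8][];
        for (int i = 0; i < blocks.length; i++) {
          blocks[i] = allocator.getByteBlock(); // nothing buffered yet, each call allocates
        }
        // bytesUsed() == 8 * BYTE_BLOCK_SIZE at this point

        // Hand all 8 blocks back: only maxBufferedBlocks (4) are kept for reuse,
        // the remaining 4 are dropped and subtracted from bytesUsed().
        allocator.recycleByteBlocks(blocks, 0, blocks.length);
        // allocator.numBufferedBlocks() == 4, bytesUsed() == 4 * BYTE_BLOCK_SIZE

        byte[] reused = allocator.getByteBlock(); // served from the buffer, no new allocation
        // allocator.numBufferedBlocks() == 3

        allocator.freeBlocks(2); // explicitly drop two buffered blocks
        // allocator.numBufferedBlocks() == 1, bytesUsed() == 2 * BYTE_BLOCK_SIZE
      }
    }
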
Modified: lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestByteSlices.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestByteSlices.java?rev=1003790&r1=1003789&r2=1003790&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestByteSlices.java (original)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestByteSlices.java Sat Oct  2 12:44:32 2010
@@ -14,44 +14,16 @@ package org.apache.lucene.index;
  * limitations under the License.
  */
 
-import java.util.ArrayList;
-import java.util.List;
+import java.util.concurrent.locks.ReentrantLock;
+
+import org.apache.lucene.util.ByteBlockPool;
 import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.RecyclingByteBlockAllocator;
 
 public class TestByteSlices extends LuceneTestCase {
 
-  private static class ByteBlockAllocator extends ByteBlockPool.Allocator {
-    ArrayList<byte[]> freeByteBlocks = new ArrayList<byte[]>();
-    
-    /* Allocate another byte[] from the shared pool */
-    @Override
-    synchronized byte[] getByteBlock() {
-      final int size = freeByteBlocks.size();
-      final byte[] b;
-      if (0 == size)
-        b = new byte[DocumentsWriter.BYTE_BLOCK_SIZE];
-      else
-        b =  freeByteBlocks.remove(size-1);
-      return b;
-    }
-
-    /* Return a byte[] to the pool */
-    @Override
-    synchronized void recycleByteBlocks(byte[][] blocks, int start, int end) {
-      for(int i=start;i<end;i++)
-        freeByteBlocks.add(blocks[i]);
-    }
-
-    @Override
-    synchronized void recycleByteBlocks(List<byte[]> blocks) {
-      final int size = blocks.size();
-      for(int i=0;i<size;i++)
-        freeByteBlocks.add(blocks.get(i));
-    }
-  }
-
   public void testBasic() throws Throwable {
-    ByteBlockPool pool = new ByteBlockPool(new ByteBlockAllocator());
+    ByteBlockPool pool = new ByteBlockPool(new RecyclingByteBlockAllocator(ByteBlockPool.BYTE_BLOCK_SIZE, Integer.MAX_VALUE));
 
     final int NUM_STREAM = 100 * RANDOM_MULTIPLIER;
 

Modified: lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java?rev=1003790&r1=1003789&r2=1003790&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java (original)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java Sat Oct  2 12:44:32 2010
@@ -77,6 +77,7 @@ import org.apache.lucene.util._TestUtil;
 import org.apache.lucene.util.ThreadInterruptedException;
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.Bits;
+import org.junit.Assume;
 
 public class TestIndexWriter extends LuceneTestCase {