Posted to commits@lucene.apache.org by si...@apache.org on 2010/10/02 14:44:35 UTC
svn commit: r1003790 [1/2] - in /lucene/dev/trunk/lucene/src:
java/org/apache/lucene/index/ java/org/apache/lucene/util/
test/org/apache/lucene/index/ test/org/apache/lucene/util/
Author: simonw
Date: Sat Oct 2 12:44:32 2010
New Revision: 1003790
URL: http://svn.apache.org/viewvc?rev=1003790&view=rev
Log:
LUCENE-2662: Refactored TermsHashPerField to utilize BytesRefHash
Added:
lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/ByteBlockPool.java
- copied, changed from r1002577, lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteBlockPool.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRefHash.java (with props)
lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/RecyclingByteBlockAllocator.java (with props)
lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/TestBytesRefHash.java (with props)
lucene/dev/trunk/lucene/src/test/org/apache/lucene/util/TestRecyclingByteBlockAllocator.java (with props)
Removed:
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteBlockPool.java
Modified:
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteSliceReader.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteSliceWriter.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerThread.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRef.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestByteSlices.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java
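For orientation before the diffs: a minimal sketch of the new BytesRefHash API as this patch uses it (add/get/size plus the negative-ord convention for already-seen bytes). The allocator choice and the sample term bytes are illustrative assumptions, not part of the commit:

    import java.util.concurrent.atomic.AtomicLong;
    import org.apache.lucene.util.ByteBlockPool;
    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.BytesRefHash;
    import org.apache.lucene.util.RecyclingByteBlockAllocator;

    public class BytesRefHashSketch {
      public static void main(String[] args) throws Exception {
        // Track pool allocations in an AtomicLong, as DocumentsWriter now does.
        final AtomicLong bytesUsed = new AtomicLong(0);
        ByteBlockPool pool = new ByteBlockPool(new RecyclingByteBlockAllocator(
            ByteBlockPool.BYTE_BLOCK_SIZE, Integer.MAX_VALUE, bytesUsed));
        BytesRefHash hash = new BytesRefHash(pool);

        BytesRef ref = new BytesRef();
        ref.bytes = "lucene".getBytes("UTF-8");
        ref.length = ref.bytes.length;

        int ord = hash.add(ref);   // >= 0: first time these bytes are seen
        int again = hash.add(ref); // < 0: already present, encoded as -(ord)-1
        assert again == -(ord + 1);

        BytesRef stored = hash.get(ord); // re-reads the interned bytes from the pool
        System.out.println(hash.size() + " unique entry, ord=" + ord
            + ", length=" + stored.length);
      }
    }

The same -(ord)-1 decoding appears twice in TermsHashPerField below, where the diff replaces the hand-rolled postings hash with calls into this class.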
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteSliceReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteSliceReader.java?rev=1003790&r1=1003789&r2=1003790&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteSliceReader.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteSliceReader.java Sat Oct 2 12:44:32 2010
@@ -21,6 +21,7 @@ import java.io.IOException;
import org.apache.lucene.store.DataInput;
import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.ByteBlockPool;
/* IndexInput that knows how to read the byte slices written
* by Posting and PostingVector. We read the bytes in
@@ -48,16 +49,16 @@ final class ByteSliceReader extends Data
this.endIndex = endIndex;
level = 0;
- bufferUpto = startIndex / DocumentsWriter.BYTE_BLOCK_SIZE;
- bufferOffset = bufferUpto * DocumentsWriter.BYTE_BLOCK_SIZE;
+ bufferUpto = startIndex / ByteBlockPool.BYTE_BLOCK_SIZE;
+ bufferOffset = bufferUpto * ByteBlockPool.BYTE_BLOCK_SIZE;
buffer = pool.buffers[bufferUpto];
- upto = startIndex & DocumentsWriter.BYTE_BLOCK_MASK;
+ upto = startIndex & ByteBlockPool.BYTE_BLOCK_MASK;
final int firstSize = ByteBlockPool.levelSizeArray[0];
if (startIndex+firstSize >= endIndex) {
// There is only this one slice to read
- limit = endIndex & DocumentsWriter.BYTE_BLOCK_MASK;
+ limit = endIndex & ByteBlockPool.BYTE_BLOCK_MASK;
} else
limit = upto+firstSize-4;
}
@@ -102,11 +103,11 @@ final class ByteSliceReader extends Data
level = ByteBlockPool.nextLevelArray[level];
final int newSize = ByteBlockPool.levelSizeArray[level];
- bufferUpto = nextIndex / DocumentsWriter.BYTE_BLOCK_SIZE;
- bufferOffset = bufferUpto * DocumentsWriter.BYTE_BLOCK_SIZE;
+ bufferUpto = nextIndex / ByteBlockPool.BYTE_BLOCK_SIZE;
+ bufferOffset = bufferUpto * ByteBlockPool.BYTE_BLOCK_SIZE;
buffer = pool.buffers[bufferUpto];
- upto = nextIndex & DocumentsWriter.BYTE_BLOCK_MASK;
+ upto = nextIndex & ByteBlockPool.BYTE_BLOCK_MASK;
if (nextIndex + newSize >= endIndex) {
// We are advancing to the final slice
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteSliceWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteSliceWriter.java?rev=1003790&r1=1003789&r2=1003790&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteSliceWriter.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteSliceWriter.java Sat Oct 2 12:44:32 2010
@@ -1,6 +1,7 @@
package org.apache.lucene.index;
import org.apache.lucene.store.DataOutput;
+import org.apache.lucene.util.ByteBlockPool;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -42,9 +43,9 @@ final class ByteSliceWriter extends Data
* Set up the writer to write at address.
*/
public void init(int address) {
- slice = pool.buffers[address >> DocumentsWriter.BYTE_BLOCK_SHIFT];
+ slice = pool.buffers[address >> ByteBlockPool.BYTE_BLOCK_SHIFT];
assert slice != null;
- upto = address & DocumentsWriter.BYTE_BLOCK_MASK;
+ upto = address & ByteBlockPool.BYTE_BLOCK_MASK;
offset0 = address;
assert upto < slice.length;
}
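The addressing scheme these writers rely on is unchanged; only the home of the constants moved. A global pool address splits into a buffer index (high bits) and an offset inside that buffer (low bits); a tiny illustration, with an arbitrary address value:

    import org.apache.lucene.util.ByteBlockPool;

    public class AddressSketch {
      public static void main(String[] args) {
        int address = 100000; // hypothetical global pool address
        int bufferIndex = address >> ByteBlockPool.BYTE_BLOCK_SHIFT; // which 32KB block
        int offset = address & ByteBlockPool.BYTE_BLOCK_MASK;        // position inside it
        // Invariant: a pool address decomposes losslessly into (block, offset).
        assert address == bufferIndex * ByteBlockPool.BYTE_BLOCK_SIZE + offset;
      }
    }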
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java?rev=1003790&r1=1003789&r2=1003790&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/DocumentsWriter.java Sat Oct 2 12:44:32 2010
@@ -27,6 +27,8 @@ import java.util.Map;
import java.util.HashSet;
import java.util.List;
import java.util.Map.Entry;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.concurrent.locks.ReentrantLock;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.document.Document;
@@ -41,8 +43,11 @@ import org.apache.lucene.store.Directory
import org.apache.lucene.store.RAMFile;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.Constants;
+import org.apache.lucene.util.RecyclingByteBlockAllocator;
import org.apache.lucene.util.ThreadInterruptedException;
import org.apache.lucene.util.RamUsageEstimator;
+import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_MASK;
+import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
/**
* This class accepts multiple added documents and directly
@@ -113,6 +118,7 @@ import org.apache.lucene.util.RamUsageEs
final class DocumentsWriter {
+ final AtomicLong bytesUsed = new AtomicLong(0);
IndexWriter writer;
Directory directory;
@@ -195,6 +201,7 @@ final class DocumentsWriter {
/**
* RAMFile buffer for DocWriters.
*/
+ @SuppressWarnings("serial")
class PerDocBuffer extends RAMFile {
/**
@@ -257,9 +264,12 @@ final class DocumentsWriter {
final TermsHashConsumer termVectorsWriter = new TermVectorsTermsWriter(documentsWriter);
final TermsHashConsumer freqProxWriter = new FreqProxTermsWriter();
-
- final InvertedDocConsumer termsHash = new TermsHash(documentsWriter, true, freqProxWriter,
- new TermsHash(documentsWriter, false, termVectorsWriter, null));
+ /*
+ * nesting TermsHash instances here to allow the secondary (TermVectors) to share the interned postings
+ * via a shared ByteBlockPool. See TermsHashPerField for details.
+ */
+ final TermsHash termVectorsTermHash = new TermsHash(documentsWriter, false, termVectorsWriter, null);
+ final InvertedDocConsumer termsHash = new TermsHash(documentsWriter, true, freqProxWriter, termVectorsTermHash);
final NormsWriter normsWriter = new NormsWriter();
final DocInverter docInverter = new DocInverter(termsHash, normsWriter);
return new DocFieldProcessor(documentsWriter, docInverter);
@@ -637,7 +647,7 @@ final class DocumentsWriter {
for(int i=0;i<threadStates.length;i++)
threads.add(threadStates[i].consumer);
- final long startNumBytesUsed = numBytesUsed;
+ final long startNumBytesUsed = bytesUsed();
consumer.flush(threads, flushState);
if (infoStream != null) {
@@ -963,7 +973,7 @@ final class DocumentsWriter {
synchronized boolean deletesFull() {
return (ramBufferSize != IndexWriterConfig.DISABLE_AUTO_FLUSH &&
- (deletesInRAM.bytesUsed + deletesFlushed.bytesUsed + numBytesUsed) >= ramBufferSize) ||
+ (deletesInRAM.bytesUsed + deletesFlushed.bytesUsed + bytesUsed()) >= ramBufferSize) ||
(maxBufferedDeleteTerms != IndexWriterConfig.DISABLE_AUTO_FLUSH &&
((deletesInRAM.size() + deletesFlushed.size()) >= maxBufferedDeleteTerms));
}
@@ -1256,11 +1266,9 @@ final class DocumentsWriter {
final SkipDocWriter skipDocWriter = new SkipDocWriter();
long getRAMUsed() {
- return numBytesUsed + deletesInRAM.bytesUsed + deletesFlushed.bytesUsed;
+ return bytesUsed() + deletesInRAM.bytesUsed + deletesFlushed.bytesUsed;
}
- long numBytesUsed;
-
NumberFormat nf = NumberFormat.getInstance();
// Coarse estimates used to measure RAM usage of buffered deletes
@@ -1295,63 +1303,12 @@ final class DocumentsWriter {
/* Initial chunks size of the shared byte[] blocks used to
store postings data */
- final static int BYTE_BLOCK_SHIFT = 15;
- final static int BYTE_BLOCK_SIZE = 1 << BYTE_BLOCK_SHIFT;
- final static int BYTE_BLOCK_MASK = BYTE_BLOCK_SIZE - 1;
final static int BYTE_BLOCK_NOT_MASK = ~BYTE_BLOCK_MASK;
/* if you increase this, you must fix field cache impl for
* getTerms/getTermsIndex requires <= 32768 */
final static int MAX_TERM_LENGTH_UTF8 = BYTE_BLOCK_SIZE-2;
- private class ByteBlockAllocator extends ByteBlockPool.Allocator {
- final int blockSize;
-
- ByteBlockAllocator(int blockSize) {
- this.blockSize = blockSize;
- }
-
- ArrayList<byte[]> freeByteBlocks = new ArrayList<byte[]>();
-
- /* Allocate another byte[] from the shared pool */
- @Override
- byte[] getByteBlock() {
- synchronized(DocumentsWriter.this) {
- final int size = freeByteBlocks.size();
- final byte[] b;
- if (0 == size) {
- b = new byte[blockSize];
- numBytesUsed += blockSize;
- } else
- b = freeByteBlocks.remove(size-1);
- return b;
- }
- }
-
- /* Return byte[]'s to the pool */
-
- @Override
- void recycleByteBlocks(byte[][] blocks, int start, int end) {
- synchronized(DocumentsWriter.this) {
- for(int i=start;i<end;i++) {
- freeByteBlocks.add(blocks[i]);
- blocks[i] = null;
- }
- }
- }
-
- @Override
- void recycleByteBlocks(List<byte[]> blocks) {
- synchronized(DocumentsWriter.this) {
- final int size = blocks.size();
- for(int i=0;i<size;i++) {
- freeByteBlocks.add(blocks.get(i));
- blocks.set(i, null);
- }
- }
- }
- }
-
/* Initial chunks size of the shared int[] blocks used to
store postings data */
final static int INT_BLOCK_SHIFT = 13;
@@ -1366,14 +1323,14 @@ final class DocumentsWriter {
final int[] b;
if (0 == size) {
b = new int[INT_BLOCK_SIZE];
- numBytesUsed += INT_BLOCK_SIZE*INT_NUM_BYTE;
+ bytesUsed.addAndGet(INT_BLOCK_SIZE*INT_NUM_BYTE);
} else
b = freeIntBlocks.remove(size-1);
return b;
}
- synchronized void bytesUsed(long numBytes) {
- numBytesUsed += numBytes;
+ private long bytesUsed() {
+ return bytesUsed.get();
}
/* Return int[]s to the pool */
@@ -1384,11 +1341,11 @@ final class DocumentsWriter {
}
}
- ByteBlockAllocator byteBlockAllocator = new ByteBlockAllocator(BYTE_BLOCK_SIZE);
+ final RecyclingByteBlockAllocator byteBlockAllocator = new RecyclingByteBlockAllocator(BYTE_BLOCK_SIZE, Integer.MAX_VALUE, bytesUsed);
final static int PER_DOC_BLOCK_SIZE = 1024;
- final ByteBlockAllocator perDocAllocator = new ByteBlockAllocator(PER_DOC_BLOCK_SIZE);
+ final RecyclingByteBlockAllocator perDocAllocator = new RecyclingByteBlockAllocator(PER_DOC_BLOCK_SIZE, Integer.MAX_VALUE, bytesUsed);
String toMB(long v) {
return nf.format(v/1024./1024.);
@@ -1415,19 +1372,19 @@ final class DocumentsWriter {
}
deletesRAMUsed = deletesInRAM.bytesUsed+deletesFlushed.bytesUsed;
- doBalance = numBytesUsed+deletesRAMUsed >= ramBufferSize;
+ doBalance = bytesUsed() + deletesRAMUsed >= ramBufferSize;
}
if (doBalance) {
if (infoStream != null)
- message(" RAM: now balance allocations: usedMB=" + toMB(numBytesUsed) +
+ message(" RAM: now balance allocations: usedMB=" + toMB(bytesUsed()) +
" vs trigger=" + toMB(ramBufferSize) +
" deletesMB=" + toMB(deletesRAMUsed) +
- " byteBlockFree=" + toMB(byteBlockAllocator.freeByteBlocks.size()*BYTE_BLOCK_SIZE) +
- " perDocFree=" + toMB(perDocAllocator.freeByteBlocks.size()*PER_DOC_BLOCK_SIZE));
+ " byteBlockFree=" + toMB(byteBlockAllocator.bytesUsed()) +
+ " perDocFree=" + toMB(perDocAllocator.bytesUsed()));
- final long startBytesUsed = numBytesUsed + deletesRAMUsed;
+ final long startBytesUsed = bytesUsed() + deletesRAMUsed;
int iter = 0;
@@ -1437,16 +1394,16 @@ final class DocumentsWriter {
boolean any = true;
- while(numBytesUsed+deletesRAMUsed > freeLevel) {
+ while(bytesUsed()+deletesRAMUsed > freeLevel) {
synchronized(this) {
- if (0 == perDocAllocator.freeByteBlocks.size() &&
- 0 == byteBlockAllocator.freeByteBlocks.size() &&
+ if (0 == perDocAllocator.numBufferedBlocks() &&
+ 0 == byteBlockAllocator.numBufferedBlocks() &&
0 == freeIntBlocks.size() && !any) {
// Nothing else to free -- must flush now.
- bufferIsFull = numBytesUsed+deletesRAMUsed > ramBufferSize;
+ bufferIsFull = bytesUsed()+deletesRAMUsed > ramBufferSize;
if (infoStream != null) {
- if (numBytesUsed+deletesRAMUsed > ramBufferSize)
+ if (bytesUsed()+deletesRAMUsed > ramBufferSize)
message(" nothing to free; now set bufferIsFull");
else
message(" nothing to free");
@@ -1454,25 +1411,15 @@ final class DocumentsWriter {
break;
}
- if ((0 == iter % 4) && byteBlockAllocator.freeByteBlocks.size() > 0) {
- byteBlockAllocator.freeByteBlocks.remove(byteBlockAllocator.freeByteBlocks.size()-1);
- numBytesUsed -= BYTE_BLOCK_SIZE;
+ if ((0 == iter % 4) && byteBlockAllocator.numBufferedBlocks() > 0) {
+ byteBlockAllocator.freeBlocks(1);
}
-
if ((1 == iter % 4) && freeIntBlocks.size() > 0) {
freeIntBlocks.remove(freeIntBlocks.size()-1);
- numBytesUsed -= INT_BLOCK_SIZE * INT_NUM_BYTE;
+ bytesUsed.addAndGet(-INT_BLOCK_SIZE * INT_NUM_BYTE);
}
-
- if ((2 == iter % 4) && perDocAllocator.freeByteBlocks.size() > 0) {
- // Remove upwards of 32 blocks (each block is 1K)
- for (int i = 0; i < 32; ++i) {
- perDocAllocator.freeByteBlocks.remove(perDocAllocator.freeByteBlocks.size() - 1);
- numBytesUsed -= PER_DOC_BLOCK_SIZE;
- if (perDocAllocator.freeByteBlocks.size() == 0) {
- break;
- }
- }
+ if ((2 == iter % 4) && perDocAllocator.numBufferedBlocks() > 0) {
+ perDocAllocator.freeBlocks(32); // Remove upwards of 32 blocks (each block is 1K)
}
}
@@ -1484,7 +1431,7 @@ final class DocumentsWriter {
}
if (infoStream != null)
- message(" after free: freedMB=" + nf.format((startBytesUsed-numBytesUsed-deletesRAMUsed)/1024./1024.) + " usedMB=" + nf.format((numBytesUsed+deletesRAMUsed)/1024./1024.));
+ message(" after free: freedMB=" + nf.format((startBytesUsed-bytesUsed()-deletesRAMUsed)/1024./1024.) + " usedMB=" + nf.format((bytesUsed()+deletesRAMUsed)/1024./1024.));
}
}
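The RAM balancing above now delegates block bookkeeping to the new RecyclingByteBlockAllocator instead of DocumentsWriter's removed numBytesUsed field and free lists. A rough sketch of that contract, using only the methods the diff itself calls (numBufferedBlocks, freeBlocks, bytesUsed, plus the inherited getByteBlock/recycleByteBlocks); the exact accounting across recycle vs. free is an assumption:

    import java.util.concurrent.atomic.AtomicLong;
    import org.apache.lucene.util.ByteBlockPool;
    import org.apache.lucene.util.RecyclingByteBlockAllocator;

    public class AllocatorSketch {
      public static void main(String[] args) {
        AtomicLong bytesUsed = new AtomicLong(0);
        RecyclingByteBlockAllocator alloc = new RecyclingByteBlockAllocator(
            ByteBlockPool.BYTE_BLOCK_SIZE, Integer.MAX_VALUE, bytesUsed);

        byte[] block = alloc.getByteBlock(); // allocation is charged to bytesUsed
        alloc.recycleByteBlocks(new byte[][] { block }, 0, 1); // buffered for reuse
        System.out.println("buffered=" + alloc.numBufferedBlocks()
            + " bytes=" + alloc.bytesUsed());

        alloc.freeBlocks(1); // actually releases one buffered block
        System.out.println("buffered=" + alloc.numBufferedBlocks()
            + " bytes=" + alloc.bytesUsed());
      }
    }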
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java?rev=1003790&r1=1003789&r2=1003790&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxFieldMergeState.java Sat Oct 2 12:44:32 2010
@@ -19,6 +19,8 @@ package org.apache.lucene.index;
import java.io.IOException;
import java.util.Comparator;
+
+import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.index.FreqProxTermsWriterPerField.FreqProxPostingsArray;
@@ -50,7 +52,7 @@ final class FreqProxFieldMergeState {
public FreqProxFieldMergeState(FreqProxTermsWriterPerField field, Comparator<BytesRef> termComp) {
this.field = field;
- this.numPostings = field.termsHashPerField.numPostings;
+ this.numPostings = field.termsHashPerField.bytesHash.size();
this.bytePool = field.perThread.termsHashPerThread.bytePool;
this.termIDs = field.termsHashPerField.sortPostings(termComp);
this.postings = (FreqProxPostingsArray) field.termsHashPerField.postingsArray;
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java?rev=1003790&r1=1003789&r2=1003790&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriter.java Sat Oct 2 12:44:32 2010
@@ -67,7 +67,7 @@ final class FreqProxTermsWriter extends
for (final TermsHashConsumerPerField i : fields) {
final FreqProxTermsWriterPerField perField = (FreqProxTermsWriterPerField) i;
- if (perField.termsHashPerField.numPostings > 0)
+ if (perField.termsHashPerField.bytesHash.size() > 0)
allFields.add(perField);
}
}
@@ -116,7 +116,7 @@ final class FreqProxTermsWriter extends
for(int i=0;i<fields.length;i++) {
TermsHashPerField perField = fields[i].termsHashPerField;
- int numPostings = perField.numPostings;
+ int numPostings = perField.bytesHash.size();
perField.reset();
perField.shrinkHash(numPostings);
fields[i].reset();
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java?rev=1003790&r1=1003789&r2=1003790&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/FreqProxTermsWriterPerField.java Sat Oct 2 12:44:32 2010
@@ -144,7 +144,7 @@ final class FreqProxTermsWriterPerField
}
} else {
if (docState.docID != postings.lastDocIDs[termID]) {
- assert docState.docID > postings.lastDocIDs[termID];
+ assert docState.docID > postings.lastDocIDs[termID] : "id: " + docState.docID + " postings ID: " + postings.lastDocIDs[termID] + " termID: " + termID;
// Term not yet seen in the current doc but previously
// seen in other doc(s) since the last flush
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java?rev=1003790&r1=1003789&r2=1003790&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermVectorsTermsWriterPerField.java Sat Oct 2 12:44:32 2010
@@ -22,6 +22,7 @@ import java.io.IOException;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Fieldable;
import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
final class TermVectorsTermsWriterPerField extends TermsHashConsumerPerField {
@@ -80,7 +81,7 @@ final class TermVectorsTermsWriterPerFie
assert perThread.doc.docID == docState.docID;
- if (termsHashPerField.numPostings != 0) {
+ if (termsHashPerField.bytesHash.size() != 0) {
// Only necessary if previous doc hit a
// non-aborting exception while writing vectors in
// this field:
@@ -106,7 +107,7 @@ final class TermVectorsTermsWriterPerFie
assert docState.testPoint("TermVectorsTermsWriterPerField.finish start");
- final int numPostings = termsHashPerField.numPostings;
+ final int numPostings = termsHashPerField.bytesHash.size();
final BytesRef flushTerm = perThread.flushTerm;
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java?rev=1003790&r1=1003789&r2=1003790&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerField.java Sat Oct 2 12:44:32 2010
@@ -18,15 +18,19 @@ package org.apache.lucene.index;
*/
import java.io.IOException;
-import java.util.Arrays;
import java.util.Comparator;
+import java.util.concurrent.atomic.AtomicLong;
import org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute;
import org.apache.lucene.document.Fieldable;
+import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.RamUsageEstimator;
+import org.apache.lucene.util.BytesRefHash;
+import org.apache.lucene.util.BytesRefHash.BytesStartArray;
+import org.apache.lucene.util.BytesRefHash.MaxBytesLengthExceededException;
final class TermsHashPerField extends InvertedDocConsumerPerField {
+ private static final int HASH_INIT_SIZE = 4;
final TermsHashConsumerPerField consumer;
@@ -46,16 +50,12 @@ final class TermsHashPerField extends In
final FieldInfo fieldInfo;
- boolean postingsCompacted;
- int numPostings;
- private int postingsHashSize = 4;
- private int postingsHashHalfSize = postingsHashSize/2;
- private int postingsHashMask = postingsHashSize-1;
- private int[] postingsHash;
+ // nocommit - how to communicate byte usage to DocumentsWriter?
+ final BytesRefHash bytesHash;
ParallelPostingsArray postingsArray;
- private final BytesRef utf8;
- private Comparator<BytesRef> termComp;
+ private final BytesRef termBytesRef;
+ private final AtomicLong bytesUsed;
public TermsHashPerField(DocInverterPerField docInverterPerField, final TermsHashPerThread perThread, final TermsHashPerThread nextPerThread, final FieldInfo fieldInfo) {
this.perThread = perThread;
@@ -63,18 +63,15 @@ final class TermsHashPerField extends In
bytePool = perThread.bytePool;
termBytePool = perThread.termBytePool;
docState = perThread.docState;
-
- postingsHash = new int[postingsHashSize];
- Arrays.fill(postingsHash, -1);
- bytesUsed(postingsHashSize * RamUsageEstimator.NUM_BYTES_INT);
+ bytesUsed = perThread.termsHash.trackAllocations ? perThread.termsHash.docWriter.bytesUsed : new AtomicLong();
fieldState = docInverterPerField.fieldState;
this.consumer = perThread.consumer.addField(this, fieldInfo);
- initPostingsArray();
-
+ PostingsBytesStartArray byteStarts = new PostingsBytesStartArray(this, bytesUsed);
+ bytesHash = new BytesRefHash(termBytePool, HASH_INIT_SIZE, byteStarts);
streamCount = consumer.getStreamCount();
numPostingInt = 2*streamCount;
- utf8 = perThread.utf8;
+ termBytesRef = perThread.termBytesRef;
this.fieldInfo = fieldInfo;
if (nextPerThread != null)
nextPerField = (TermsHashPerField) nextPerThread.addField(docInverterPerField, fieldInfo);
@@ -82,48 +79,14 @@ final class TermsHashPerField extends In
nextPerField = null;
}
- private void initPostingsArray() {
- postingsArray = consumer.createPostingsArray(2);
- bytesUsed(postingsArray.size * postingsArray.bytesPerPosting());
- }
-
- // sugar: just forwards to DW
- private void bytesUsed(long size) {
- if (perThread.termsHash.trackAllocations) {
- perThread.termsHash.docWriter.bytesUsed(size);
- }
- }
-
void shrinkHash(int targetSize) {
- assert postingsCompacted || numPostings == 0;
-
- final int newSize = 4;
- if (newSize != postingsHash.length) {
- final long previousSize = postingsHash.length;
- postingsHash = new int[newSize];
- bytesUsed((newSize-previousSize)*RamUsageEstimator.NUM_BYTES_INT);
- Arrays.fill(postingsHash, -1);
- postingsHashSize = newSize;
- postingsHashHalfSize = newSize/2;
- postingsHashMask = newSize-1;
- }
-
- // Fully free the postings array on each flush:
- if (postingsArray != null) {
- bytesUsed(-postingsArray.bytesPerPosting() * postingsArray.size);
- postingsArray = null;
- }
+ // Fully free the bytesHash on each flush but keep the pool untouched
+ // bytesHash.clear will clear the BytesStartArray and in turn the ParallelPostingsArray too
+ bytesHash.clear(false);
}
public void reset() {
- if (!postingsCompacted)
- compactPostings();
- assert numPostings <= postingsHash.length;
- if (numPostings > 0) {
- Arrays.fill(postingsHash, 0, numPostings, -1);
- numPostings = 0;
- }
- postingsCompacted = false;
+ bytesHash.clear(false);
if (nextPerField != null)
nextPerField.reset();
}
@@ -134,12 +97,6 @@ final class TermsHashPerField extends In
if (nextPerField != null)
nextPerField.abort();
}
-
- private final void growParallelPostingsArray() {
- int oldSize = postingsArray.size;
- this.postingsArray = this.postingsArray.grow();
- bytesUsed(postingsArray.bytesPerPosting() * (postingsArray.size - oldSize));
- }
public void initReader(ByteSliceReader reader, int termID, int stream) {
assert stream < streamCount;
@@ -151,139 +108,12 @@ final class TermsHashPerField extends In
ints[upto+stream]);
}
- private synchronized void compactPostings() {
- int upto = 0;
- for(int i=0;i<postingsHashSize;i++) {
- if (postingsHash[i] != -1) {
- if (upto < i) {
- postingsHash[upto] = postingsHash[i];
- postingsHash[i] = -1;
- }
- upto++;
- }
- }
-
- assert upto == numPostings;
- postingsCompacted = true;
- }
/** Collapse the hash table & sort in-place. */
public int[] sortPostings(Comparator<BytesRef> termComp) {
- this.termComp = termComp;
- compactPostings();
- quickSort(postingsHash, 0, numPostings-1);
- return postingsHash;
+ return bytesHash.sort(termComp);
}
- void quickSort(int[] termIDs, int lo, int hi) {
- if (lo >= hi)
- return;
- else if (hi == 1+lo) {
- if (comparePostings(termIDs[lo], termIDs[hi]) > 0) {
- final int tmp = termIDs[lo];
- termIDs[lo] = termIDs[hi];
- termIDs[hi] = tmp;
- }
- return;
- }
-
- int mid = (lo + hi) >>> 1;
-
- if (comparePostings(termIDs[lo], termIDs[mid]) > 0) {
- int tmp = termIDs[lo];
- termIDs[lo] = termIDs[mid];
- termIDs[mid] = tmp;
- }
-
- if (comparePostings(termIDs[mid], termIDs[hi]) > 0) {
- int tmp = termIDs[mid];
- termIDs[mid] = termIDs[hi];
- termIDs[hi] = tmp;
-
- if (comparePostings(termIDs[lo], termIDs[mid]) > 0) {
- int tmp2 = termIDs[lo];
- termIDs[lo] = termIDs[mid];
- termIDs[mid] = tmp2;
- }
- }
-
- int left = lo + 1;
- int right = hi - 1;
-
- if (left >= right)
- return;
-
- int partition = termIDs[mid];
-
- for (; ;) {
- while (comparePostings(termIDs[right], partition) > 0)
- --right;
-
- while (left < right && comparePostings(termIDs[left], partition) <= 0)
- ++left;
-
- if (left < right) {
- int tmp = termIDs[left];
- termIDs[left] = termIDs[right];
- termIDs[right] = tmp;
- --right;
- } else {
- break;
- }
- }
-
- quickSort(termIDs, lo, left);
- quickSort(termIDs, left + 1, hi);
- }
-
- /** Compares term text for two Posting instance and
- * returns -1 if p1 < p2; 1 if p1 > p2; else 0. */
- int comparePostings(int term1, int term2) {
-
- if (term1 == term2) {
- // Our quicksort does this, eg during partition
- return 0;
- }
-
- termBytePool.setBytesRef(perThread.tr1, postingsArray.textStarts[term1]);
- termBytePool.setBytesRef(perThread.tr2, postingsArray.textStarts[term2]);
-
- return termComp.compare(perThread.tr1, perThread.tr2);
- }
-
- /** Test whether the text for current RawPostingList p equals
- * current tokenText in utf8. */
- private boolean postingEquals(final int termID) {
- final int textStart = postingsArray.textStarts[termID];
- final byte[] text = termBytePool.buffers[textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT];
- assert text != null;
-
- int pos = textStart & DocumentsWriter.BYTE_BLOCK_MASK;
-
- final int len;
- if ((text[pos] & 0x80) == 0) {
- // length is 1 byte
- len = text[pos];
- pos += 1;
- } else {
- // length is 2 bytes
- len = (text[pos]&0x7f) + ((text[pos+1]&0xff)<<7);
- pos += 2;
- }
-
- if (len == utf8.length) {
- final byte[] utf8Bytes = utf8.bytes;
- for(int tokenPos=0;tokenPos<utf8.length;pos++,tokenPos++) {
- if (utf8Bytes[tokenPos] != text[pos]) {
- return false;
- }
- }
- return true;
- } else {
- return false;
- }
- }
-
private boolean doCall;
private boolean doNextCall;
@@ -299,10 +129,7 @@ final class TermsHashPerField extends In
@Override
boolean start(Fieldable[] fields, int count) throws IOException {
doCall = consumer.start(fields, count);
- if (postingsArray == null) {
- initPostingsArray();
- }
-
+ bytesHash.reinit();
if (nextPerField != null)
doNextCall = nextPerField.start(fields, count);
return doCall || doNextCall;
@@ -312,52 +139,15 @@ final class TermsHashPerField extends In
// because token text has already been "interned" into
// textStart, so we hash by textStart
public void add(int textStart) throws IOException {
- int code = textStart;
-
- int hashPos = code & postingsHashMask;
-
- assert !postingsCompacted;
-
- // Locate RawPostingList in hash
- int termID = postingsHash[hashPos];
-
- if (termID != -1 && postingsArray.textStarts[termID] != textStart) {
- // Conflict: keep searching different locations in
- // the hash table.
- final int inc = ((code>>8)+code)|1;
- do {
- code += inc;
- hashPos = code & postingsHashMask;
- termID = postingsHash[hashPos];
- } while (termID != -1 && postingsArray.textStarts[termID] != textStart);
- }
-
- if (termID == -1) {
-
+ int termID = bytesHash.addByPoolOffset(textStart);
+ if (termID >= 0) { // New posting
// First time we are seeing this token since we last
// flushed the hash.
-
- // New posting
- termID = numPostings++;
- if (termID >= postingsArray.size) {
- growParallelPostingsArray();
- }
-
- assert termID >= 0;
-
- postingsArray.textStarts[termID] = textStart;
-
- assert postingsHash[hashPos] == -1;
- postingsHash[hashPos] = termID;
-
- if (numPostings == postingsHashHalfSize)
- rehashPostings(2*postingsHashSize);
-
// Init stream slices
if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE)
intPool.nextBuffer();
- if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE)
+ if (ByteBlockPool.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE)
bytePool.nextBuffer();
intUptos = intPool.buffer;
@@ -375,6 +165,7 @@ final class TermsHashPerField extends In
consumer.newTerm(termID);
} else {
+ termID = (-termID)-1;
int intStart = postingsArray.intStarts[termID];
intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK;
@@ -386,105 +177,39 @@ final class TermsHashPerField extends In
@Override
void add() throws IOException {
- assert !postingsCompacted;
-
// We are first in the chain so we must "intern" the
// term text into textStart address
-
// Get the text & hash of this term.
- int code = termAtt.toBytesRef(utf8);
-
- int hashPos = code & postingsHashMask;
-
- // Locate RawPostingList in hash
- int termID = postingsHash[hashPos];
-
- if (termID != -1 && !postingEquals(termID)) {
- // Conflict: keep searching different locations in
- // the hash table.
- final int inc = ((code>>8)+code)|1;
- do {
- code += inc;
- hashPos = code & postingsHashMask;
- termID = postingsHash[hashPos];
- } while (termID != -1 && !postingEquals(termID));
- }
-
- if (termID == -1) {
-
- // First time we are seeing this token since we last
- // flushed the hash.
- final int textLen2 = 2+utf8.length;
- if (textLen2 + bytePool.byteUpto > DocumentsWriter.BYTE_BLOCK_SIZE) {
- // Not enough room in current block
-
- if (utf8.length > DocumentsWriter.MAX_TERM_LENGTH_UTF8) {
- // Just skip this term, to remain as robust as
- // possible during indexing. A TokenFilter
- // can be inserted into the analyzer chain if
- // other behavior is wanted (pruning the term
- // to a prefix, throwing an exception, etc).
- if (docState.maxTermPrefix == null) {
- final int saved = utf8.length;
- try {
- utf8.length = Math.min(30, DocumentsWriter.MAX_TERM_LENGTH_UTF8);
- docState.maxTermPrefix = utf8.toString();
- } finally {
- utf8.length = saved;
- }
- }
-
- consumer.skippingLongTerm();
- return;
+ int termID;
+ try {
+ termID = bytesHash.add(termBytesRef, termAtt.toBytesRef(termBytesRef));
+ } catch (MaxBytesLengthExceededException e) {
+ // Not enough room in current block
+ // Just skip this term, to remain as robust as
+ // possible during indexing. A TokenFilter
+ // can be inserted into the analyzer chain if
+ // other behavior is wanted (pruning the term
+ // to a prefix, throwing an exception, etc).
+ if (docState.maxTermPrefix == null) {
+ final int saved = termBytesRef.length;
+ try {
+ termBytesRef.length = Math.min(30, DocumentsWriter.MAX_TERM_LENGTH_UTF8);
+ docState.maxTermPrefix = termBytesRef.toString();
+ } finally {
+ termBytesRef.length = saved;
}
- bytePool.nextBuffer();
- }
-
- // New posting
- termID = numPostings++;
- if (termID >= postingsArray.size) {
- growParallelPostingsArray();
}
-
- assert termID != -1;
- assert postingsHash[hashPos] == -1;
-
- postingsHash[hashPos] = termID;
-
- final byte[] text = bytePool.buffer;
- final int textUpto = bytePool.byteUpto;
- postingsArray.textStarts[termID] = textUpto + bytePool.byteOffset;
-
- // We first encode the length, followed by the UTF8
- // bytes. Length is encoded as vInt, but will consume
- // 1 or 2 bytes at most (we reject too-long terms,
- // above).
-
- // encode length @ start of bytes
- if (utf8.length < 128) {
- // 1 byte to store length
- text[textUpto] = (byte) utf8.length;
- bytePool.byteUpto += utf8.length + 1;
- System.arraycopy(utf8.bytes, 0, text, textUpto+1, utf8.length);
- } else {
- // 2 byte to store length
- text[textUpto] = (byte) (0x80 | (utf8.length & 0x7f));
- text[textUpto+1] = (byte) ((utf8.length>>7) & 0xff);
- bytePool.byteUpto += utf8.length + 2;
- System.arraycopy(utf8.bytes, 0, text, textUpto+2, utf8.length);
- }
-
- if (numPostings == postingsHashHalfSize) {
- rehashPostings(2*postingsHashSize);
- bytesUsed(2*numPostings * RamUsageEstimator.NUM_BYTES_INT);
- }
-
+ consumer.skippingLongTerm();
+ return;
+ }
+ if (termID >= 0) { // New posting
+ bytesHash.byteStart(termID);
// Init stream slices
if (numPostingInt + intPool.intUpto > DocumentsWriter.INT_BLOCK_SIZE) {
intPool.nextBuffer();
}
- if (DocumentsWriter.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) {
+ if (ByteBlockPool.BYTE_BLOCK_SIZE - bytePool.byteUpto < numPostingInt*ByteBlockPool.FIRST_LEVEL_SIZE) {
bytePool.nextBuffer();
}
@@ -503,6 +228,7 @@ final class TermsHashPerField extends In
consumer.newTerm(termID);
} else {
+ termID = (-termID)-1;
final int intStart = postingsArray.intStarts[termID];
intUptos = intPool.buffers[intStart >> DocumentsWriter.INT_BLOCK_SHIFT];
intUptoStart = intStart & DocumentsWriter.INT_BLOCK_MASK;
@@ -518,9 +244,9 @@ final class TermsHashPerField extends In
void writeByte(int stream, byte b) {
int upto = intUptos[intUptoStart+stream];
- byte[] bytes = bytePool.buffers[upto >> DocumentsWriter.BYTE_BLOCK_SHIFT];
+ byte[] bytes = bytePool.buffers[upto >> ByteBlockPool.BYTE_BLOCK_SHIFT];
assert bytes != null;
- int offset = upto & DocumentsWriter.BYTE_BLOCK_MASK;
+ int offset = upto & ByteBlockPool.BYTE_BLOCK_MASK;
if (bytes[offset] != 0) {
// End of slice; allocate a new one
offset = bytePool.allocSlice(bytes, offset);
@@ -553,61 +279,51 @@ final class TermsHashPerField extends In
if (nextPerField != null)
nextPerField.finish();
}
+
+ private static final class PostingsBytesStartArray extends BytesStartArray {
- /** Called when postings hash is too small (> 50%
- * occupied) or too large (< 20% occupied). */
- void rehashPostings(final int newSize) {
-
- final int newMask = newSize-1;
-
- int[] newHash = new int[newSize];
- Arrays.fill(newHash, -1);
- for(int i=0;i<postingsHashSize;i++) {
- int termID = postingsHash[i];
- if (termID != -1) {
- int code;
- if (perThread.primary) {
- final int textStart = postingsArray.textStarts[termID];
- final int start = textStart & DocumentsWriter.BYTE_BLOCK_MASK;
- final byte[] text = bytePool.buffers[textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT];
- code = 0;
-
- final int len;
- int pos;
- if ((text[start] & 0x80) == 0) {
- // length is 1 byte
- len = text[start];
- pos = start+1;
- } else {
- len = (text[start]&0x7f) + ((text[start+1]&0xff)<<7);
- pos = start+2;
- }
-
- final int endPos = pos+len;
- while(pos < endPos) {
- code = (code*31) + text[pos++];
- }
- } else {
- code = postingsArray.textStarts[termID];
- }
+ private final TermsHashPerField perField;
+ private final AtomicLong bytesUsed;
- int hashPos = code & newMask;
- assert hashPos >= 0;
- if (newHash[hashPos] != -1) {
- final int inc = ((code>>8)+code)|1;
- do {
- code += inc;
- hashPos = code & newMask;
- } while (newHash[hashPos] != -1);
- }
- newHash[hashPos] = termID;
+ private PostingsBytesStartArray(
+ TermsHashPerField perField, AtomicLong bytesUsed) {
+ this.perField = perField;
+ this.bytesUsed = bytesUsed;
+ }
+
+ @Override
+ public int[] init() {
+ if(perField.postingsArray == null) {
+ perField.postingsArray = perField.consumer.createPostingsArray(2);
+ bytesUsed.addAndGet(perField.postingsArray.size * perField.postingsArray.bytesPerPosting());
+ }
+ return perField.postingsArray.textStarts;
+ }
+
+ @Override
+ public int[] grow() {
+ ParallelPostingsArray postingsArray = perField.postingsArray;
+ final int oldSize = perField.postingsArray.size;
+ postingsArray = perField.postingsArray = postingsArray.grow();
+ bytesUsed.addAndGet(postingsArray.bytesPerPosting() * (postingsArray.size - oldSize));
+ return postingsArray.textStarts;
+ }
+
+ @Override
+ public int[] clear() {
+ if(perField.postingsArray != null) {
+ bytesUsed.addAndGet(-perField.postingsArray.size * perField.postingsArray.bytesPerPosting());
+ perField.postingsArray = null;
}
+ return null;
}
- postingsHashMask = newMask;
- postingsHash = newHash;
+ @Override
+ public AtomicLong bytesUsed() {
+ return bytesUsed;
+ }
- postingsHashSize = newSize;
- postingsHashHalfSize = newSize >> 1;
}
+
}
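PostingsBytesStartArray above is the glue between BytesRefHash and the consumer-owned ParallelPostingsArray: BytesRefHash calls back into init/grow/clear for its bytesStart offsets, and RAM is charged to the shared AtomicLong. A stripped-down sketch of the same callback pattern with a plain int[] (the initial size and growth policy here are illustrative, not what the patch does):

    import java.util.concurrent.atomic.AtomicLong;
    import org.apache.lucene.util.ArrayUtil;
    import org.apache.lucene.util.BytesRefHash.BytesStartArray;
    import org.apache.lucene.util.RamUsageEstimator;

    final class SimpleBytesStartArray extends BytesStartArray {
      private final AtomicLong bytesUsed = new AtomicLong(0);
      private int[] bytesStart;

      @Override
      public int[] init() { // called lazily before the first add
        bytesStart = new int[16];
        bytesUsed.addAndGet(bytesStart.length * RamUsageEstimator.NUM_BYTES_INT);
        return bytesStart;
      }

      @Override
      public int[] grow() { // called when count reaches the array length
        final int oldLength = bytesStart.length;
        bytesStart = ArrayUtil.grow(bytesStart, oldLength + 1);
        bytesUsed.addAndGet((bytesStart.length - oldLength) * RamUsageEstimator.NUM_BYTES_INT);
        return bytesStart;
      }

      @Override
      public int[] clear() { // called from BytesRefHash.clear()
        if (bytesStart != null) {
          bytesUsed.addAndGet(-bytesStart.length * RamUsageEstimator.NUM_BYTES_INT);
          bytesStart = null;
        }
        return null;
      }

      @Override
      public AtomicLong bytesUsed() { // shared RAM counter
        return bytesUsed;
      }
    }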
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerThread.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerThread.java?rev=1003790&r1=1003789&r2=1003790&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerThread.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/TermsHashPerThread.java Sat Oct 2 12:44:32 2010
@@ -17,8 +17,8 @@ package org.apache.lucene.index;
* limitations under the License.
*/
+import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.UnicodeUtil;
import java.io.IOException;
@@ -26,20 +26,17 @@ final class TermsHashPerThread extends I
final TermsHash termsHash;
final TermsHashConsumerPerThread consumer;
- final TermsHashPerThread nextPerThread;
+ final TermsHashPerThread nextPerThread; // the secondary is currently consumed by TermVectorsWriter
+ // see secondary entry point in TermsHashPerField#add(int)
final IntBlockPool intPool;
final ByteBlockPool bytePool;
final ByteBlockPool termBytePool;
+
final boolean primary;
final DocumentsWriter.DocState docState;
-
- // Used when comparing postings via termRefComp, in TermsHashPerField
- final BytesRef tr1 = new BytesRef();
- final BytesRef tr2 = new BytesRef();
-
- // Used by perField:
- final BytesRef utf8 = new BytesRef(10);
+ // Used by perField to obtain terms from the analysis chain
+ final BytesRef termBytesRef = new BytesRef(10);
public TermsHashPerThread(DocInverterPerThread docInverterPerThread, final TermsHash termsHash, final TermsHash nextTermsHash, final TermsHashPerThread primaryPerThread) {
docState = docInverterPerThread.docState;
@@ -48,21 +45,18 @@ final class TermsHashPerThread extends I
this.consumer = termsHash.consumer.addThread(this);
intPool = new IntBlockPool(termsHash.docWriter);
- bytePool = new ByteBlockPool(termsHash.docWriter.byteBlockAllocator);
-
- if (nextTermsHash != null) {
+ bytePool = new ByteBlockPool(termsHash.docWriter.byteBlockAllocator); // use the allocator from the docWriter which tracks the used bytes
+ primary = nextTermsHash != null;
+ if (primary) {
// We are primary
- primary = true;
termBytePool = bytePool;
+ nextPerThread = nextTermsHash.addThread(docInverterPerThread, this); // this will be the primaryPerThread in the secondary
+ assert nextPerThread != null;
} else {
- primary = false;
- termBytePool = primaryPerThread.bytePool;
- }
-
- if (nextTermsHash != null)
- nextPerThread = nextTermsHash.addThread(docInverterPerThread, this);
- else
+ assert primaryPerThread != null;
+ termBytePool = primaryPerThread.bytePool; // we are secondary and share the byte pool with the primary
nextPerThread = null;
+ }
}
@Override
@@ -74,30 +68,25 @@ final class TermsHashPerThread extends I
synchronized public void abort() {
reset(true);
consumer.abort();
- if (nextPerThread != null)
+ if (primary)
nextPerThread.abort();
}
@Override
public void startDocument() throws IOException {
consumer.startDocument();
- if (nextPerThread != null)
+ if (primary)
nextPerThread.consumer.startDocument();
}
@Override
public DocumentsWriter.DocWriter finishDocument() throws IOException {
final DocumentsWriter.DocWriter doc = consumer.finishDocument();
-
- final DocumentsWriter.DocWriter doc2;
- if (nextPerThread != null)
- doc2 = nextPerThread.consumer.finishDocument();
- else
- doc2 = null;
+ final DocumentsWriter.DocWriter docFromSecondary = primary ? nextPerThread.consumer.finishDocument() : null;
if (doc == null)
- return doc2;
+ return docFromSecondary;
else {
- doc.setNext(doc2);
+ doc.setNext(docFromSecondary);
return doc;
}
}
@@ -106,9 +95,5 @@ final class TermsHashPerThread extends I
void reset(boolean recyclePostings) {
intPool.reset();
bytePool.reset();
-
- if (primary) {
- bytePool.reset();
- }
}
}
Copied: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/ByteBlockPool.java (from r1002577, lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteBlockPool.java)
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/ByteBlockPool.java?p2=lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/ByteBlockPool.java&p1=lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteBlockPool.java&r1=1002577&r2=1003790&rev=1003790&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/ByteBlockPool.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/ByteBlockPool.java Sat Oct 2 12:44:32 2010
@@ -1,4 +1,4 @@
-package org.apache.lucene.index;
+package org.apache.lucene.util;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
@@ -34,26 +34,42 @@ package org.apache.lucene.index;
* hit a non-zero byte. */
import java.util.Arrays;
-import org.apache.lucene.util.BytesRef;
+
+
import java.util.List;
import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_OBJECT_REF;
-import org.apache.lucene.util.ArrayUtil;
-final class ByteBlockPool {
+public final class ByteBlockPool {
+ public final static int BYTE_BLOCK_SHIFT = 15;
+ public final static int BYTE_BLOCK_SIZE = 1 << BYTE_BLOCK_SHIFT;
+ public final static int BYTE_BLOCK_MASK = BYTE_BLOCK_SIZE - 1;
+
+ public abstract static class Allocator {
+ protected final int blockSize;
+
+ public Allocator(int blockSize) {
+ this.blockSize = blockSize;
+ }
- abstract static class Allocator {
- abstract void recycleByteBlocks(byte[][] blocks, int start, int end);
- abstract void recycleByteBlocks(List<byte[]> blocks);
- abstract byte[] getByteBlock();
+ public abstract void recycleByteBlocks(byte[][] blocks, int start, int end);
+
+ public void recycleByteBlocks(List<byte[]> blocks) {
+ final byte[][] b = blocks.toArray(new byte[blocks.size()][]);
+ recycleByteBlocks(b, 0, b.length);
+ }
+
+ public byte[] getByteBlock() {
+ return new byte[blockSize];
+ }
}
public byte[][] buffers = new byte[10][];
int bufferUpto = -1; // Which buffer we are upto
- public int byteUpto = DocumentsWriter.BYTE_BLOCK_SIZE; // Where we are in head buffer
+ public int byteUpto = BYTE_BLOCK_SIZE; // Where we are in head buffer
public byte[] buffer; // Current head buffer
- public int byteOffset = -DocumentsWriter.BYTE_BLOCK_SIZE; // Current head offset
+ public int byteOffset = -BYTE_BLOCK_SIZE; // Current head offset
private final Allocator allocator;
@@ -95,11 +111,11 @@ final class ByteBlockPool {
bufferUpto++;
byteUpto = 0;
- byteOffset += DocumentsWriter.BYTE_BLOCK_SIZE;
+ byteOffset += BYTE_BLOCK_SIZE;
}
public int newSlice(final int size) {
- if (byteUpto > DocumentsWriter.BYTE_BLOCK_SIZE-size)
+ if (byteUpto > BYTE_BLOCK_SIZE-size)
nextBuffer();
final int upto = byteUpto;
byteUpto += size;
@@ -112,9 +128,10 @@ final class ByteBlockPool {
// is just a compact way to encode X+1 with a max. Second
// array is the length of each slice, ie first slice is 5
// bytes, next slice is 14 bytes, etc.
- final static int[] nextLevelArray = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9};
- final static int[] levelSizeArray = {5, 14, 20, 30, 40, 40, 80, 80, 120, 200};
- final static int FIRST_LEVEL_SIZE = levelSizeArray[0];
+
+ public final static int[] nextLevelArray = {1, 2, 3, 4, 5, 6, 7, 8, 9, 9};
+ public final static int[] levelSizeArray = {5, 14, 20, 30, 40, 40, 80, 80, 120, 200};
+ public final static int FIRST_LEVEL_SIZE = levelSizeArray[0];
public int allocSlice(final byte[] slice, final int upto) {
@@ -123,7 +140,7 @@ final class ByteBlockPool {
final int newSize = levelSizeArray[newLevel];
// Maybe allocate another block
- if (byteUpto > DocumentsWriter.BYTE_BLOCK_SIZE-newSize)
+ if (byteUpto > BYTE_BLOCK_SIZE-newSize)
nextBuffer();
final int newUpto = byteUpto;
@@ -150,9 +167,9 @@ final class ByteBlockPool {
// Fill in a BytesRef from term's length & bytes encoded in
// byte block
- final BytesRef setBytesRef(BytesRef term, int textStart) {
- final byte[] bytes = term.bytes = buffers[textStart >> DocumentsWriter.BYTE_BLOCK_SHIFT];
- int pos = textStart & DocumentsWriter.BYTE_BLOCK_MASK;
+ public final BytesRef setBytesRef(BytesRef term, int textStart) {
+ final byte[] bytes = term.bytes = buffers[textStart >> BYTE_BLOCK_SHIFT];
+ int pos = textStart & BYTE_BLOCK_MASK;
if ((bytes[pos] & 0x80) == 0) {
// length is 1 byte
term.length = bytes[pos];
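With ByteBlockPool public, the slice machinery (newSlice/allocSlice plus the level arrays) is reachable from org.apache.lucene.util. A hedged sketch of the write loop that ByteSliceWriter implements above: write until the non-zero sentinel byte is hit, then let allocSlice copy the tail forward into the next, larger slice. The allocator is an illustrative choice:

    import java.util.concurrent.atomic.AtomicLong;
    import org.apache.lucene.util.ByteBlockPool;
    import org.apache.lucene.util.RecyclingByteBlockAllocator;

    public class SliceSketch {
      public static void main(String[] args) {
        ByteBlockPool pool = new ByteBlockPool(new RecyclingByteBlockAllocator(
            ByteBlockPool.BYTE_BLOCK_SIZE, Integer.MAX_VALUE, new AtomicLong()));

        // newSlice allocates the head buffer on demand and returns the slice start.
        int upto = pool.newSlice(ByteBlockPool.FIRST_LEVEL_SIZE);
        byte[] slice = pool.buffer;

        for (byte b = 1; b <= 20; b++) { // write more than one 5-byte slice holds
          if (slice[upto] != 0) {
            // Hit the sentinel: grow into the next slice level.
            upto = pool.allocSlice(slice, upto);
            slice = pool.buffer;
          }
          slice[upto++] = b;
        }
      }
    }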
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRef.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRef.java?rev=1003790&r1=1003789&r2=1003790&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRef.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRef.java Sat Oct 2 12:44:32 2010
@@ -30,6 +30,7 @@ import java.io.IOException;
* @lucene.experimental */
public final class BytesRef implements Comparable<BytesRef>, Externalizable {
+ static final int HASH_PRIME = 31;
public static final byte[] EMPTY_BYTES = new byte[0];
/** The contents of the BytesRef. Should never be {@code null}. */
@@ -182,11 +183,10 @@ public final class BytesRef implements C
*/
@Override
public int hashCode() {
- final int prime = 31;
int result = 0;
final int end = offset + length;
for(int i=offset;i<end;i++) {
- result = prime * result + bytes[i];
+ result = HASH_PRIME * result + bytes[i];
}
return result;
}
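Pinning the multiplier as HASH_PRIME makes the hash function part of the contract: a caller can compute a compatible code once and pass it to BytesRefHash.add(BytesRef, int) below instead of rehashing. A small illustration (the sample bytes are arbitrary):

    import org.apache.lucene.util.BytesRef;

    public class HashCodeSketch {
      // Recomputes what BytesRef.hashCode() produces; must stay in sync with
      // HASH_PRIME (31) for BytesRefHash.add(BytesRef, int) to probe the same slots.
      static int hash(BytesRef ref) {
        int code = 0;
        for (int i = ref.offset; i < ref.offset + ref.length; i++) {
          code = 31 * code + ref.bytes[i];
        }
        return code;
      }

      public static void main(String[] args) {
        BytesRef ref = new BytesRef();
        ref.bytes = new byte[] { 'a', 'b', 'c' };
        ref.length = 3;
        assert hash(ref) == ref.hashCode();
      }
    }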
Added: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRefHash.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRefHash.java?rev=1003790&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRefHash.java (added)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRefHash.java Sat Oct 2 12:44:32 2010
@@ -0,0 +1,574 @@
+package org.apache.lucene.util;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.Arrays;
+import java.util.Comparator;
+import java.util.concurrent.atomic.AtomicLong;
+
+import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_MASK;
+import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SIZE;
+import static org.apache.lucene.util.ByteBlockPool.BYTE_BLOCK_SHIFT;
+
+/**
+ * {@link BytesRefHash} is a special-purpose, hash-map-like data structure
+ * optimized for {@link BytesRef} instances. BytesRefHash maintains mappings
+ * from byte arrays to ordinals (Map<BytesRef,int>), storing the hashed bytes
+ * efficiently in contiguous storage. The mapping to the ordinal is
+ * encapsulated inside {@link BytesRefHash} and is guaranteed to increase
+ * for each added {@link BytesRef}.
+ *
+ * <p>
+ * Note: A {@link BytesRef} instance passed to {@link #add(BytesRef)} must not
+ * be longer than {@link ByteBlockPool#BYTE_BLOCK_SIZE}-2 bytes. The internal
+ * storage is limited to 2GB of total byte storage.
+ * </p>
+ *
+ * @lucene.internal
+ */
+public final class BytesRefHash {
+
+ private final ByteBlockPool pool;
+ private int hashSize;
+ private int hashHalfSize;
+ private int hashMask;
+ private int count;
+ private int lastCount = -1;
+ private int[] ords;
+ private int[] bytesStart;
+ public static final int DEFAULT_CAPACITY = 16;
+ private final BytesStartArray bytesStartArray;
+ private AtomicLong bytesUsed;
+
+ /**
+ * Creates a new {@link BytesRefHash}
+ */
+ public BytesRefHash(ByteBlockPool pool) {
+ this(pool, DEFAULT_CAPACITY, new DirectBytesStartArray(DEFAULT_CAPACITY));
+ }
+
+ /**
+ * Creates a new {@link BytesRefHash}
+ */
+ public BytesRefHash(ByteBlockPool pool, int capacity,
+ BytesStartArray bytesStartArray) {
+ hashSize = capacity;
+ hashHalfSize = hashSize >> 1;
+ hashMask = hashSize - 1;
+ this.pool = pool;
+ ords = new int[hashSize];
+ Arrays.fill(ords, -1);
+ this.bytesStartArray = bytesStartArray;
+ bytesStart = bytesStartArray.init();
+ bytesUsed = bytesStartArray.bytesUsed();
+ bytesUsed.addAndGet(hashSize * RamUsageEstimator.NUM_BYTES_INT);
+ }
+
+ /**
+ * Returns the number of {@link BytesRef} values in this {@link BytesRefHash}.
+ *
+ * @return the number of {@link BytesRef} values in this {@link BytesRefHash}.
+ */
+ public int size() {
+ return count;
+ }
+
+ /**
+ * Returns the {@link BytesRef} value for the given ord.
+ * <p>
+ * Note: the given ord must be a positive integer less than the current size (
+ * {@link #size()})
+ * </p>
+ *
+ * @param ord
+ * the ord
+ *
+ * @return a BytesRef instance for the given ord
+ */
+ public BytesRef get(int ord) {
+ assert bytesStart != null : "bytesStart is null - not initialized";
+ assert ord < bytesStart.length: "ord exceeds bytesStart len: " + bytesStart.length;
+ return pool.setBytesRef(scratch1, bytesStart[ord]);
+ }
+
+ /**
+ * Returns the ords array in arbitrary order. Valid ords occupy indices 0
+ * through {@link #size()} - 1.
+ * <p>
+ * Note: This is a destructive operation. {@link #clear()} must be called in
+ * order to reuse this {@link BytesRefHash} instance.
+ * </p>
+ */
+ public int[] compact() {
+ assert bytesStart != null : "bytesStart is null - not initialized";
+ int upto = 0;
+ for (int i = 0; i < hashSize; i++) {
+ if (ords[i] != -1) {
+ if (upto < i) {
+ ords[upto] = ords[i];
+ ords[i] = -1;
+ }
+ upto++;
+ }
+ }
+
+ assert upto == count;
+ lastCount = count;
+ return ords;
+ }
+
+ /**
+ * Returns the values array sorted by the referenced byte values.
+ * <p>
+ * Note: This is a destructive operation. {@link #clear()} must be called in
+ * order to reuse this {@link BytesRefHash} instance.
+ * </p>
+ *
+ * @param comp
+ * the {@link Comparator} used for sorting
+ */
+ public int[] sort(Comparator<BytesRef> comp) {
+ assert bytesStart != null : "bytesStart is null - not initialized";
+ final int[] compact = compact();
+ quickSort(comp, compact, 0, count - 1);
+ return compact;
+ }
+
+ private void quickSort(Comparator<BytesRef> comp, int[] entries, int lo,
+ int hi) {
+ if (lo >= hi)
+ return;
+ if (hi == 1 + lo) {
+ if (compare(comp, entries[lo], entries[hi]) > 0) {
+ final int tmp = entries[lo];
+ entries[lo] = entries[hi];
+ entries[hi] = tmp;
+ }
+ return;
+ }
+ final int mid = (lo + hi) >>> 1;
+ if (compare(comp, entries[lo], entries[mid]) > 0) {
+ int tmp = entries[lo];
+ entries[lo] = entries[mid];
+ entries[mid] = tmp;
+ }
+
+ if (compare(comp, entries[mid], entries[hi]) > 0) {
+ int tmp = entries[mid];
+ entries[mid] = entries[hi];
+ entries[hi] = tmp;
+
+ if (compare(comp, entries[lo], entries[mid]) > 0) {
+ int tmp2 = entries[lo];
+ entries[lo] = entries[mid];
+ entries[mid] = tmp2;
+ }
+ }
+ int left = lo + 1;
+ int right = hi - 1;
+
+ if (left >= right)
+ return;
+
+ final int partition = entries[mid];
+
+ for (;;) {
+ while (compare(comp, entries[right], partition) > 0)
+ --right;
+
+ while (left < right && compare(comp, entries[left], partition) <= 0)
+ ++left;
+
+ if (left < right) {
+ final int tmp = entries[left];
+ entries[left] = entries[right];
+ entries[right] = tmp;
+ --right;
+ } else {
+ break;
+ }
+ }
+
+ quickSort(comp, entries, lo, left);
+ quickSort(comp, entries, left + 1, hi);
+ }
+
+ private final BytesRef scratch1 = new BytesRef();
+ private final BytesRef scratch2 = new BytesRef();
+
+ private boolean equals(int ord, BytesRef b) {
+ return pool.setBytesRef(scratch1, bytesStart[ord]).bytesEquals(b);
+ }
+
+ private int compare(Comparator<BytesRef> comp, int ord1, int ord2) {
+ assert bytesStart.length > ord1 && bytesStart.length > ord2;
+ return comp.compare(pool.setBytesRef(scratch1, bytesStart[ord1]),
+ pool.setBytesRef(scratch2, bytesStart[ord2]));
+ }
+
+ private boolean shrink(int targetSize) {
+ // Cannot use ArrayUtil.shrink because we require power
+ // of 2:
+ int newSize = hashSize;
+ while (newSize >= 8 && newSize / 4 > targetSize) {
+ newSize /= 2;
+ }
+ if (newSize != hashSize) {
+ bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT
+ * -(hashSize - newSize));
+ hashSize = newSize;
+ ords = new int[hashSize];
+ Arrays.fill(ords, -1);
+ hashHalfSize = newSize / 2;
+ hashMask = newSize - 1;
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ /**
+ * Clears the {@link BytesRefHash}; if <code>resetPool</code> is
+ * <code>true</code> the backing {@link ByteBlockPool} is reset as well.
+ */
+ public void clear(boolean resetPool) {
+ lastCount = count;
+ count = 0;
+ if (resetPool)
+ pool.reset();
+ bytesStart = bytesStartArray.clear();
+ if (lastCount != -1 && shrink(lastCount)) {
+ // shrink clears the hash entries
+ return;
+ }
+ Arrays.fill(ords, -1);
+ }
+
+ public void clear() {
+ clear(true);
+ }
+
+ /**
+ * Adds a new {@link BytesRef}
+ *
+ * @param bytes
+ * the bytes to hash
+ * @return the ord assigned to the given bytes if there was no previous
+ * mapping, otherwise <code>(-(ord)-1)</code>. This guarantees
+ * that the return value will always be >= 0 if the given bytes
+ * haven't been hashed before.
+ *
+ * @throws MaxBytesLengthExceededException
+ * if the given bytes are longer than
+ * {@link ByteBlockPool#BYTE_BLOCK_SIZE}-2
+ */
+ public int add(BytesRef bytes) {
+ return add(bytes, bytes.hashCode());
+ }
+
+  /**
+   * Adds a new {@link BytesRef} with a pre-calculated hash code.
+   *
+   * @param bytes
+   *          the bytes to hash
+   * @param code
+   *          the bytes hash code
+   *
+   *          <p>
+   *          Hashcode is defined as:
+   *
+   *          <pre>
+   *          int hash = 0;
+   *          for (int i = offset; i &lt; offset + length; i++) {
+   *            hash = 31 * hash + bytes[i];
+   *          }
+   *          </pre>
+   *
+   * @return the ord the given bytes are assigned if there was no mapping for
+   *         them, otherwise <code>(-(ord)-1)</code> of the existing entry.
+   *         This guarantees that the return value will always be >= 0 if the
+   *         given bytes haven't been hashed before.
+   *
+   * @throws MaxBytesLengthExceededException
+   *           if the given bytes are longer than
+   *           {@link ByteBlockPool#BYTE_BLOCK_SIZE} - 2
+   */
+ public int add(BytesRef bytes, int code) {
+ assert bytesStart != null : "Bytesstart is null - not initialized";
+ final int length = bytes.length;
+ // final position
+ int hashPos = code & hashMask;
+ int e = ords[hashPos];
+ if (e != -1 && !equals(e, bytes)) {
+ // Conflict: keep searching different locations in
+ // the hash table.
+ final int inc = ((code >> 8) + code) | 1;
+ do {
+ code += inc;
+ hashPos = code & hashMask;
+ e = ords[hashPos];
+ } while (e != -1 && !equals(e, bytes));
+ }
+
+ if (e == -1) {
+ // new entry
+ final int len2 = 2 + bytes.length;
+ if (len2 + pool.byteUpto > BYTE_BLOCK_SIZE) {
+ if (len2 > BYTE_BLOCK_SIZE) {
+ throw new MaxBytesLengthExceededException("bytes can be at most "
+ + (BYTE_BLOCK_SIZE - 2) + " in length; got " + bytes.length);
+ }
+ pool.nextBuffer();
+ }
+ final byte[] buffer = pool.buffer;
+ final int bufferUpto = pool.byteUpto;
+ if (count >= bytesStart.length) {
+ bytesStart = bytesStartArray.grow();
+ assert count < bytesStart.length + 1 : "count: " + count + " len: "
+ + bytesStart.length;
+ }
+ e = count++;
+
+ bytesStart[e] = bufferUpto + pool.byteOffset;
+
+ // We first encode the length, followed by the
+ // bytes. Length is encoded as vInt, but will consume
+ // 1 or 2 bytes at most (we reject too-long terms,
+ // above).
+ if (length < 128) {
+ // 1 byte to store length
+ buffer[bufferUpto] = (byte) length;
+ pool.byteUpto += length + 1;
+ System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 1,
+ length);
+ } else {
+ // 2 bytes to store length
+ buffer[bufferUpto] = (byte) (0x80 | (length & 0x7f));
+ buffer[bufferUpto + 1] = (byte) ((length >> 7) & 0xff);
+ pool.byteUpto += length + 2;
+ System.arraycopy(bytes.bytes, bytes.offset, buffer, bufferUpto + 2,
+ length);
+ }
+ assert ords[hashPos] == -1;
+ ords[hashPos] = e;
+
+ if (count == hashHalfSize) {
+ rehash(2 * hashSize, true);
+ }
+ return e;
+ }
+ return -(e + 1);
+ }
+
+ public int addByPoolOffset(int offset) {
+ assert bytesStart != null : "Bytesstart is null - not initialized";
+ // final position
+ int code = offset;
+ int hashPos = offset & hashMask;
+ int e = ords[hashPos];
+ if (e != -1 && bytesStart[e] != offset) {
+ // Conflict: keep searching different locations in
+ // the hash table.
+ final int inc = ((code >> 8) + code) | 1;
+ do {
+ code += inc;
+ hashPos = code & hashMask;
+ e = ords[hashPos];
+ } while (e != -1 && bytesStart[e] != offset);
+ }
+ if (e == -1) {
+ // new entry
+ if (count >= bytesStart.length) {
+ bytesStart = bytesStartArray.grow();
+ assert count < bytesStart.length + 1 : "count: " + count + " len: "
+ + bytesStart.length;
+ }
+ e = count++;
+ bytesStart[e] = offset;
+ assert ords[hashPos] == -1;
+ ords[hashPos] = e;
+
+ if (count == hashHalfSize) {
+ rehash(2 * hashSize, false);
+ }
+ return e;
+ }
+ return -(e + 1);
+ }
+
+ /**
+ * Called when hash is too small (> 50% occupied) or too large (< 20%
+ * occupied).
+ */
+ private void rehash(final int newSize, boolean hashOnData) {
+ final int newMask = newSize - 1;
+ bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * (newSize));
+ final int[] newHash = new int[newSize];
+ Arrays.fill(newHash, -1);
+ for (int i = 0; i < hashSize; i++) {
+ final int e0 = ords[i];
+ if (e0 != -1) {
+ int code;
+ if (hashOnData) {
+ final int off = bytesStart[e0];
+ final int start = off & BYTE_BLOCK_MASK;
+ final byte[] bytes = pool.buffers[off >> BYTE_BLOCK_SHIFT];
+ code = 0;
+ final int len;
+ int pos;
+ if ((bytes[start] & 0x80) == 0) {
+ // length is 1 byte
+ len = bytes[start];
+ pos = start + 1;
+ } else {
+ len = (bytes[start] & 0x7f) + ((bytes[start + 1] & 0xff) << 7);
+ pos = start + 2;
+ }
+
+ final int endPos = pos + len;
+ while (pos < endPos) {
+ code = BytesRef.HASH_PRIME * code + bytes[pos++];
+ }
+ } else {
+ code = bytesStart[e0];
+ }
+
+ int hashPos = code & newMask;
+ assert hashPos >= 0;
+ if (newHash[hashPos] != -1) {
+ final int inc = ((code >> 8) + code) | 1;
+ do {
+ code += inc;
+ hashPos = code & newMask;
+ } while (newHash[hashPos] != -1);
+ }
+ newHash[hashPos] = e0;
+ }
+ }
+
+ hashMask = newMask;
+ bytesUsed.addAndGet(RamUsageEstimator.NUM_BYTES_INT * (-ords.length));
+ ords = newHash;
+ hashSize = newSize;
+ hashHalfSize = newSize / 2;
+ }
+
+ /**
+ * Reinitializes the {@link BytesRefHash} after a previous {@link #clear()}
+ * call. If {@link #clear()} has not been called previously this method has no
+ * effect.
+ */
+ public void reinit() {
+ if (bytesStart == null)
+ bytesStart = bytesStartArray.init();
+ }
+
+ /**
+ * Returns the bytesStart offset into the internally used
+ * {@link ByteBlockPool} for the given ord
+ *
+ * @param ord
+ * the ord to look up
+ * @return the bytesStart offset into the internally used
+ * {@link ByteBlockPool} for the given ord
+ */
+ public int byteStart(int ord) {
+ assert bytesStart != null : "Bytesstart is null - not initialized";
+ assert ord >= 0 && ord < count : ord;
+ return bytesStart[ord];
+ }
+
+ /**
+ * Thrown if a {@link BytesRef} exceeds the {@link BytesRefHash} limit of
+ * {@link ByteBlockPool#BYTE_BLOCK_SIZE}-2.
+ */
+ @SuppressWarnings("serial")
+ public static class MaxBytesLengthExceededException extends RuntimeException {
+ MaxBytesLengthExceededException(String message) {
+ super(message);
+ }
+ }
+
+ public abstract static class BytesStartArray {
+ /**
+ * Initializes the BytesStartArray. This call will allocate memory
+ *
+ * @return the initialized bytes start array
+ */
+ public abstract int[] init();
+
+ /**
+ * Grows the {@link BytesStartArray}
+ *
+ * @return the grown array
+ */
+ public abstract int[] grow();
+
+ /**
+ * Clears the {@link BytesStartArray} and returns the cleared instance.
+ *
+ * @return the cleared instance, this might be <code>null</code>
+ */
+ public abstract int[] clear();
+
+ /**
+ * An {@link AtomicLong} reference holding the number of bytes used by this
+ * {@link BytesStartArray}. The {@link BytesRefHash} uses this reference to
+ * track its memory usage.
+ *
+ * @return an {@link AtomicLong} reference holding the number of bytes used
+ * by this {@link BytesStartArray}.
+ */
+ public abstract AtomicLong bytesUsed();
+ }
+
+ static class DirectBytesStartArray extends BytesStartArray {
+
+ private final int initSize;
+ private int[] bytesStart;
+ private final AtomicLong bytesUsed = new AtomicLong(0);
+
+ DirectBytesStartArray(int initSize) {
+ this.initSize = initSize;
+ }
+
+ @Override
+ public int[] clear() {
+ return bytesStart = null;
+ }
+
+ @Override
+ public int[] grow() {
+ assert bytesStart != null;
+ return bytesStart = ArrayUtil.grow(bytesStart, bytesStart.length + 1);
+ }
+
+ @Override
+ public int[] init() {
+ return bytesStart = new int[ArrayUtil.oversize(initSize,
+ RamUsageEstimator.NUM_BYTES_INT)];
+ }
+
+ @Override
+ public AtomicLong bytesUsed() {
+ return bytesUsed;
+ }
+
+ }
+}
Propchange: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRefHash.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/BytesRefHash.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
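For readers following the new class, here is a minimal usage sketch of the ord
convention documented on add() above: a first-time add() returns a fresh ord
>= 0, adding the same bytes again returns -(ord)-1, and byteStart() recovers
the pool offset of the length-prefixed copy. The BytesRefHash(ByteBlockPool)
constructor and the readLength helper are assumptions for illustration; only
the methods visible in this patch are confirmed by it.

    import org.apache.lucene.util.ByteBlockPool;
    import org.apache.lucene.util.BytesRef;
    import org.apache.lucene.util.BytesRefHash;
    import org.apache.lucene.util.RecyclingByteBlockAllocator;

    public class BytesRefHashSketch {

      // Mirrors the 1-or-2 byte length prefix written by add(): lengths < 128
      // fit in one byte; longer lengths set the high bit and use a second byte.
      static int readLength(byte[] block, int start) {
        if ((block[start] & 0x80) == 0) {
          return block[start];
        }
        return (block[start] & 0x7f) + ((block[start + 1] & 0xff) << 7);
      }

      public static void main(String[] args) {
        // Pool backed by the recycling allocator added in this commit.
        ByteBlockPool pool = new ByteBlockPool(new RecyclingByteBlockAllocator());
        BytesRefHash hash = new BytesRefHash(pool); // assumed constructor
        BytesRef term = new BytesRef("lucene");

        int ord = hash.add(term); // new entry: ord >= 0
        int dup = hash.add(term); // duplicate: -(ord)-1
        System.out.println("ord=" + ord + " dup=" + dup); // ord=0 dup=-1

        // byteStart() points at the length-prefixed copy inside the pool.
        int offset = hash.byteStart(ord);
        byte[] block = pool.buffers[offset >> ByteBlockPool.BYTE_BLOCK_SHIFT];
        int start = offset & ByteBlockPool.BYTE_BLOCK_MASK;
        System.out.println("stored length=" + readLength(block, start)); // 6
      }
    }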
Added: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/RecyclingByteBlockAllocator.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/RecyclingByteBlockAllocator.java?rev=1003790&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/RecyclingByteBlockAllocator.java (added)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/RecyclingByteBlockAllocator.java Sat Oct 2 12:44:32 2010
@@ -0,0 +1,162 @@
+package org.apache.lucene.util;
+
+import java.util.concurrent.atomic.AtomicLong;
+
+import org.apache.lucene.util.ByteBlockPool.Allocator;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * A {@link ByteBlockPool.Allocator} implementation that recycles unused byte
+ * blocks in a buffer and reuses them in subsequent calls to
+ * {@link #getByteBlock()}.
+ *
+ * @lucene.internal
+ */
+public final class RecyclingByteBlockAllocator extends ByteBlockPool.Allocator {
+ private byte[][] freeByteBlocks;
+ private final int maxBufferedBlocks;
+ private int freeBlocks = 0;
+ private final AtomicLong bytesUsed;
+ public static final int DEFAULT_BUFFERED_BLOCKS = 64;
+
+ /**
+ * Creates a new {@link RecyclingByteBlockAllocator}
+ *
+ * @param blockSize
+ * the block size in bytes
+ * @param maxBufferedBlocks
+ * maximum number of buffered byte blocks
+ * @param bytesUsed
+ * {@link AtomicLong} reference counting internally allocated bytes
+ */
+ public RecyclingByteBlockAllocator(int blockSize, int maxBufferedBlocks,
+ AtomicLong bytesUsed) {
+ super(blockSize);
+ freeByteBlocks = new byte[Math.min(10, maxBufferedBlocks)][];
+ this.maxBufferedBlocks = maxBufferedBlocks;
+ this.bytesUsed = bytesUsed;
+ }
+
+ /**
+ * Creates a new {@link RecyclingByteBlockAllocator} with a private
+ * {@link AtomicLong} instance counting the allocated bytes.
+ *
+ * @param blockSize
+ * the block size in bytes
+ * @param maxBufferedBlocks
+ * maximum number of buffered byte blocks
+ */
+ public RecyclingByteBlockAllocator(int blockSize, int maxBufferedBlocks) {
+ this(blockSize, maxBufferedBlocks, new AtomicLong());
+ }
+
+ /**
+ * Creates a new {@link RecyclingByteBlockAllocator} with a block size of
+ * {@link ByteBlockPool#BYTE_BLOCK_SIZE} (
+ * {@value ByteBlockPool#BYTE_BLOCK_SIZE}), an upper buffered blocks limit of
+ * {@link #DEFAULT_BUFFERED_BLOCKS} ({@value #DEFAULT_BUFFERED_BLOCKS}) and a
+ * private {@link AtomicLong} instance.
+ */
+ public RecyclingByteBlockAllocator() {
+ this(ByteBlockPool.BYTE_BLOCK_SIZE, DEFAULT_BUFFERED_BLOCKS, new AtomicLong());
+ }
+
+ @Override
+ public synchronized byte[] getByteBlock() {
+ if (freeBlocks == 0) {
+ bytesUsed.addAndGet(blockSize);
+ return new byte[blockSize];
+ }
+ final byte[] b = freeByteBlocks[--freeBlocks];
+ freeByteBlocks[freeBlocks] = null;
+ return b;
+ }
+
+ @Override
+ public synchronized void recycleByteBlocks(byte[][] blocks, int start, int end) {
+ final int numBlocks = Math.min(maxBufferedBlocks - freeBlocks, end - start);
+ final int size = freeBlocks + numBlocks;
+ if (size >= freeByteBlocks.length) {
+ final byte[][] newBlocks = new byte[ArrayUtil.oversize(size,
+ RamUsageEstimator.NUM_BYTES_OBJ_REF)][];
+ System.arraycopy(freeByteBlocks, 0, newBlocks, 0, freeBlocks);
+ freeByteBlocks = newBlocks;
+ }
+ final int stop = start + numBlocks;
+ for (int i = start; i < stop; i++) {
+ freeByteBlocks[freeBlocks++] = blocks[i];
+ blocks[i] = null;
+ }
+ for (int i = stop; i < end; i++) {
+ blocks[i] = null;
+ }
+ bytesUsed.addAndGet(-(end - stop) * blockSize);
+ assert bytesUsed.get() >= 0;
+ }
+
+ /**
+ * @return the number of currently buffered blocks
+ */
+ public synchronized int numBufferedBlocks() {
+ return freeBlocks;
+ }
+
+ /**
+ * @return the number of bytes currently allocated by this {@link Allocator}
+ */
+ public long bytesUsed() {
+ return bytesUsed.get();
+ }
+
+ /**
+ * @return the maximum number of buffered byte blocks
+ */
+ public int maxBufferedBlocks() {
+ return maxBufferedBlocks;
+ }
+
+ /**
+ * Removes the given number of byte blocks from the buffer if possible.
+ *
+ * @param num
+ * the number of byte blocks to remove
+ * @return the number of byte blocks actually removed
+ */
+ public synchronized int freeBlocks(int num) {
+ assert num >= 0;
+ final int stop;
+ final int count;
+ if (num > freeBlocks) {
+ stop = 0;
+ count = freeBlocks;
+ } else {
+ stop = freeBlocks - num;
+ count = num;
+ }
+ while (freeBlocks > stop) {
+ freeByteBlocks[--freeBlocks] = null;
+ }
+ bytesUsed.addAndGet(-count*blockSize);
+ assert bytesUsed.get() >= 0;
+ return count;
+ }
+}
\ No newline at end of file
Propchange: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/RecyclingByteBlockAllocator.java
------------------------------------------------------------------------------
svn:eol-style = native
Propchange: lucene/dev/trunk/lucene/src/java/org/apache/lucene/util/RecyclingByteBlockAllocator.java
------------------------------------------------------------------------------
svn:keywords = Date Author Id Revision HeadURL
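And a quick sketch of the recycling behavior above, using only methods shown
in this patch: recycleByteBlocks buffers at most maxBufferedBlocks blocks for
reuse by getByteBlock(), drops the overflow, and subtracts the dropped bytes
from bytesUsed. The class name and block counts are illustrative only.

    import org.apache.lucene.util.ByteBlockPool;
    import org.apache.lucene.util.RecyclingByteBlockAllocator;

    public class RecyclingAllocatorSketch {
      public static void main(String[] args) {
        // Keep at most 4 free blocks around for reuse.
        RecyclingByteBlockAllocator allocator =
            new RecyclingByteBlockAllocator(ByteBlockPool.BYTE_BLOCK_SIZE, 4);

        byte[][] blocks = new byte[8][];
        for (int i = 0; i < blocks.length; i++) {
          blocks[i] = allocator.getByteBlock(); // 8 fresh allocations
        }

        // 4 blocks are buffered for reuse; the other 4 are dropped and
        // their bytes deducted from bytesUsed.
        allocator.recycleByteBlocks(blocks, 0, blocks.length);
        System.out.println(allocator.numBufferedBlocks());                        // 4
        System.out.println(allocator.bytesUsed() / ByteBlockPool.BYTE_BLOCK_SIZE); // 4

        byte[] reused = allocator.getByteBlock(); // served from the buffer
        System.out.println(allocator.numBufferedBlocks());                        // 3
      }
    }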
Modified: lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestByteSlices.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestByteSlices.java?rev=1003790&r1=1003789&r2=1003790&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestByteSlices.java (original)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestByteSlices.java Sat Oct 2 12:44:32 2010
@@ -14,44 +14,16 @@ package org.apache.lucene.index;
* limitations under the License.
*/
-import java.util.ArrayList;
-import java.util.List;
+import java.util.concurrent.locks.ReentrantLock;
+
+import org.apache.lucene.util.ByteBlockPool;
import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.RecyclingByteBlockAllocator;
public class TestByteSlices extends LuceneTestCase {
- private static class ByteBlockAllocator extends ByteBlockPool.Allocator {
- ArrayList<byte[]> freeByteBlocks = new ArrayList<byte[]>();
-
- /* Allocate another byte[] from the shared pool */
- @Override
- synchronized byte[] getByteBlock() {
- final int size = freeByteBlocks.size();
- final byte[] b;
- if (0 == size)
- b = new byte[DocumentsWriter.BYTE_BLOCK_SIZE];
- else
- b = freeByteBlocks.remove(size-1);
- return b;
- }
-
- /* Return a byte[] to the pool */
- @Override
- synchronized void recycleByteBlocks(byte[][] blocks, int start, int end) {
- for(int i=start;i<end;i++)
- freeByteBlocks.add(blocks[i]);
- }
-
- @Override
- synchronized void recycleByteBlocks(List<byte[]> blocks) {
- final int size = blocks.size();
- for(int i=0;i<size;i++)
- freeByteBlocks.add(blocks.get(i));
- }
- }
-
public void testBasic() throws Throwable {
- ByteBlockPool pool = new ByteBlockPool(new ByteBlockAllocator());
+ ByteBlockPool pool = new ByteBlockPool(new RecyclingByteBlockAllocator(ByteBlockPool.BYTE_BLOCK_SIZE, Integer.MAX_VALUE));
final int NUM_STREAM = 100 * RANDOM_MULTIPLIER;
Modified: lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java?rev=1003790&r1=1003789&r2=1003790&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java (original)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestIndexWriter.java Sat Oct 2 12:44:32 2010
@@ -77,6 +77,7 @@ import org.apache.lucene.util._TestUtil;
import org.apache.lucene.util.ThreadInterruptedException;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.Bits;
+import org.junit.Assume;
public class TestIndexWriter extends LuceneTestCase {