You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2010/08/06 20:13:18 UTC
svn commit: r983081 - in /lucene/dev/trunk/lucene: ./
src/java/org/apache/lucene/index/
src/java/org/apache/lucene/index/codecs/standard/
src/test/org/apache/lucene/index/
Author: mikemccand
Date: Fri Aug 6 18:13:17 2010
New Revision: 983081
URL: http://svn.apache.org/viewvc?rev=983081&view=rev
Log:
LUCENE-2588: trim unnecessary suffixes from terms in the terms dict index
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java
lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java
lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestIndexWriterConfig.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=983081&r1=983080&r2=983081&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Fri Aug 6 18:13:17 2010
@@ -209,6 +209,11 @@ Optimizations
efficient copying by sub-classes. Optimized copy is implemented for RAM and FS
streams. (Shai Erera)
+* LUCENE-2588: Don't store unnecessary suffixes when writing the terms
+ index, saving RAM in IndexReader; change default terms index
+ interval from 128 to 32, because the terms index now requires much
+ less RAM. (Robert Muir, Mike McCandless)
+
Documentation
* LUCENE-2579: Fix oal.search's package.html description of abstract
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java?rev=983081&r1=983080&r2=983081&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java Fri Aug 6 18:13:17 2010
@@ -55,7 +55,7 @@ public final class IndexWriterConfig imp
public static enum OpenMode { CREATE, APPEND, CREATE_OR_APPEND }
/** Default value is 32. Change using {@link #setTermIndexInterval(int)}. */
- public static final int DEFAULT_TERM_INDEX_INTERVAL = 128;
+ public static final int DEFAULT_TERM_INDEX_INTERVAL = 32;
/** Denotes a flush trigger is disabled. */
public final static int DISABLE_AUTO_FLUSH = -1;
Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java?rev=983081&r1=983080&r2=983081&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java Fri Aug 6 18:13:17 2010
@@ -80,7 +80,7 @@ public class SimpleStandardTermsIndexWri
final long termsStart;
long packedIndexStart;
long packedOffsetsStart;
- private int numTerms;
+ private long numTerms;
// TODO: we could conceivably make a PackedInts wrapper
// that auto-grows... then we wouldn't force 6 bytes RAM
@@ -90,6 +90,8 @@ public class SimpleStandardTermsIndexWri
private long lastTermsPointer;
private long totTermLength;
+ private final BytesRef lastTerm = new BytesRef();
+
SimpleFieldWriter(FieldInfo fieldInfo) {
this.fieldInfo = fieldInfo;
indexStart = out.getFilePointer();
@@ -103,8 +105,20 @@ public class SimpleStandardTermsIndexWri
// First term is first indexed term:
if (0 == (numTerms++ % termIndexInterval)) {
- // write full bytes
- out.writeBytes(text.bytes, text.offset, text.length);
+ // we can safely strip off the non-distinguishing
+ // suffix to save RAM in the loaded terms index.
+ final int limit = Math.min(lastTerm.length, text.length);
+ int minPrefixDiff = 1+lastTerm.length;
+ for(int byteIdx=0;byteIdx<limit;byteIdx++) {
+ if (lastTerm.bytes[lastTerm.offset+byteIdx] != text.bytes[text.offset+byteIdx]) {
+ minPrefixDiff = byteIdx+1;
+ break;
+ }
+ }
+
+ // write only the min prefix that shows the diff
+ // against prior term
+ out.writeBytes(text.bytes, text.offset, minPrefixDiff);
if (termLengths.length == numIndexTerms) {
termLengths = ArrayUtil.grow(termLengths);
@@ -119,14 +133,19 @@ public class SimpleStandardTermsIndexWri
lastTermsPointer = fp;
// save term length (in bytes)
- assert text.length <= Short.MAX_VALUE;
- termLengths[numIndexTerms] = (short) text.length;
-
- totTermLength += text.length;
+ assert minPrefixDiff <= Short.MAX_VALUE;
+ termLengths[numIndexTerms] = (short) minPrefixDiff;
+ totTermLength += minPrefixDiff;
+ lastTerm.copy(text);
numIndexTerms++;
return true;
} else {
+ if (0 == numTerms % termIndexInterval) {
+ // save last term just before next index term so we
+ // can compute wasted suffix
+ lastTerm.copy(text);
+ }
return false;
}
}
Modified: lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestIndexWriterConfig.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestIndexWriterConfig.java?rev=983081&r1=983080&r2=983081&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestIndexWriterConfig.java (original)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestIndexWriterConfig.java Fri Aug 6 18:13:17 2010
@@ -134,7 +134,7 @@ public class TestIndexWriterConfig exten
public void testConstants() throws Exception {
// Tests that the values of the constants do not change
assertEquals(1000, IndexWriterConfig.WRITE_LOCK_TIMEOUT);
- assertEquals(128, IndexWriterConfig.DEFAULT_TERM_INDEX_INTERVAL);
+ assertEquals(32, IndexWriterConfig.DEFAULT_TERM_INDEX_INTERVAL);
assertEquals(Integer.MAX_VALUE, IndexWriterConfig.UNLIMITED_FIELD_LENGTH);
assertEquals(-1, IndexWriterConfig.DISABLE_AUTO_FLUSH);
assertEquals(IndexWriterConfig.DISABLE_AUTO_FLUSH, IndexWriterConfig.DEFAULT_MAX_BUFFERED_DELETE_TERMS);