You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2010/08/06 20:13:18 UTC

svn commit: r983081 - in /lucene/dev/trunk/lucene: ./ src/java/org/apache/lucene/index/ src/java/org/apache/lucene/index/codecs/standard/ src/test/org/apache/lucene/index/

Author: mikemccand
Date: Fri Aug  6 18:13:17 2010
New Revision: 983081

URL: http://svn.apache.org/viewvc?rev=983081&view=rev
Log:
LUCENE-2588: trim unnecessary suffixes from terms in the terms dict index

Modified:
    lucene/dev/trunk/lucene/CHANGES.txt
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java
    lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java
    lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestIndexWriterConfig.java

Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=983081&r1=983080&r2=983081&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Fri Aug  6 18:13:17 2010
@@ -209,6 +209,11 @@ Optimizations
   efficient copying by sub-classes. Optimized copy is implemented for RAM and FS
   streams. (Shai Erera)
 
+* LUCENE-2588: Don't store unnecessary suffixes when writing the terms
+  index, saving RAM in IndexReader; change default terms index
+  interval from 128 to 32, because the terms index now requires much
+  less RAM.  (Robert Muir, Mike McCandless)
+
 Documentation
 
 * LUCENE-2579: Fix oal.search's package.html description of abstract

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java?rev=983081&r1=983080&r2=983081&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/IndexWriterConfig.java Fri Aug  6 18:13:17 2010
@@ -55,7 +55,7 @@ public final class IndexWriterConfig imp
   public static enum OpenMode { CREATE, APPEND, CREATE_OR_APPEND }
   
-  /** Default value is 128. Change using {@link #setTermIndexInterval(int)}. */
+  /** Default value is 32. Change using {@link #setTermIndexInterval(int)}. */
-  public static final int DEFAULT_TERM_INDEX_INTERVAL = 128;
+  public static final int DEFAULT_TERM_INDEX_INTERVAL = 32;
 
   /** Denotes a flush trigger is disabled. */
   public final static int DISABLE_AUTO_FLUSH = -1;

Modified: lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java?rev=983081&r1=983080&r2=983081&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java (original)
+++ lucene/dev/trunk/lucene/src/java/org/apache/lucene/index/codecs/standard/SimpleStandardTermsIndexWriter.java Fri Aug  6 18:13:17 2010
@@ -80,7 +80,7 @@ public class SimpleStandardTermsIndexWri
     final long termsStart;
     long packedIndexStart;
     long packedOffsetsStart;
-    private int numTerms;
+    private long numTerms;
 
     // TODO: we could conceivably make a PackedInts wrapper
     // that auto-grows... then we wouldn't force 6 bytes RAM
@@ -90,6 +90,8 @@ public class SimpleStandardTermsIndexWri
     private long lastTermsPointer;
     private long totTermLength;
 
+    private final BytesRef lastTerm = new BytesRef();
+
     SimpleFieldWriter(FieldInfo fieldInfo) {
       this.fieldInfo = fieldInfo;
       indexStart = out.getFilePointer();
@@ -103,8 +105,20 @@ public class SimpleStandardTermsIndexWri
       // First term is first indexed term:
       if (0 == (numTerms++ % termIndexInterval)) {
 
-        // write full bytes
-        out.writeBytes(text.bytes, text.offset, text.length);
+        // we can safely strip off the non-distinguishing
+        // suffix to save RAM in the loaded terms index.
+        final int limit = Math.min(lastTerm.length, text.length);
+        int minPrefixDiff = 1+lastTerm.length;
+        for(int byteIdx=0;byteIdx<limit;byteIdx++) {
+          if (lastTerm.bytes[lastTerm.offset+byteIdx] != text.bytes[text.offset+byteIdx]) {
+            minPrefixDiff = byteIdx+1;
+            break;
+          }
+        }
+
+        // write only the min prefix that shows the diff
+        // against prior term
+        out.writeBytes(text.bytes, text.offset, minPrefixDiff);
 
         if (termLengths.length == numIndexTerms) {
           termLengths = ArrayUtil.grow(termLengths);
@@ -119,14 +133,19 @@ public class SimpleStandardTermsIndexWri
         lastTermsPointer = fp;
 
         // save term length (in bytes)
-        assert text.length <= Short.MAX_VALUE;
-        termLengths[numIndexTerms] = (short) text.length;
-
-        totTermLength += text.length;
+        assert minPrefixDiff <= Short.MAX_VALUE;
+        termLengths[numIndexTerms] = (short) minPrefixDiff;
+        totTermLength += minPrefixDiff;
 
+        lastTerm.copy(text);
         numIndexTerms++;
         return true;
       } else {
+        if (0 == numTerms % termIndexInterval) {
+          // save last term just before next index term so we
+          // can compute wasted suffix
+          lastTerm.copy(text);
+        }
         return false;
       }
     }

Modified: lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestIndexWriterConfig.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestIndexWriterConfig.java?rev=983081&r1=983080&r2=983081&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestIndexWriterConfig.java (original)
+++ lucene/dev/trunk/lucene/src/test/org/apache/lucene/index/TestIndexWriterConfig.java Fri Aug  6 18:13:17 2010
@@ -134,7 +134,7 @@ public class TestIndexWriterConfig exten
   public void testConstants() throws Exception {
     // Tests that the values of the constants does not change
     assertEquals(1000, IndexWriterConfig.WRITE_LOCK_TIMEOUT);
-    assertEquals(128, IndexWriterConfig.DEFAULT_TERM_INDEX_INTERVAL);
+    assertEquals(32, IndexWriterConfig.DEFAULT_TERM_INDEX_INTERVAL);
     assertEquals(Integer.MAX_VALUE, IndexWriterConfig.UNLIMITED_FIELD_LENGTH);
     assertEquals(-1, IndexWriterConfig.DISABLE_AUTO_FLUSH);
     assertEquals(IndexWriterConfig.DISABLE_AUTO_FLUSH, IndexWriterConfig.DEFAULT_MAX_BUFFERED_DELETE_TERMS);