You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by mi...@apache.org on 2015/04/13 00:43:15 UTC

svn commit: r1673075 - /lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/AutoPrefixTermsWriter.java

Author: mikemccand
Date: Sun Apr 12 22:43:15 2015
New Revision: 1673075

URL: http://svn.apache.org/r1673075
Log:
LUCENE-5879: fix ob1 that caused OOME in test when min and max auto-prefix terms was 2; attempt to simplify empty string case

Modified:
    lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/AutoPrefixTermsWriter.java

Modified: lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/AutoPrefixTermsWriter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/AutoPrefixTermsWriter.java?rev=1673075&r1=1673074&r2=1673075&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/AutoPrefixTermsWriter.java (original)
+++ lucene/dev/trunk/lucene/core/src/java/org/apache/lucene/codecs/blocktree/AutoPrefixTermsWriter.java Sun Apr 12 22:43:15 2015
@@ -211,12 +211,17 @@ class AutoPrefixTermsWriter {
       }
     }
 
+    // Even though we visited terms in already-sorted order, the prefixes
+    // can be slightly unsorted, e.g. aaaaa will be before aaa, so we
+    // must sort here so our caller can do merge sort into actual terms
+    // when writing.  Probably we should use CollectionUtil.timSort here?
     Collections.sort(prefixes);
   }
 
   /** Pushes the new term to the top of the stack, and writes new blocks. */
   private void pushTerm(BytesRef text) throws IOException {
     int limit = Math.min(lastTerm.length(), text.length);
+    //if (DEBUG) System.out.println("\nterm: " + text.utf8ToString());
 
     // Find common prefix between last term and current term:
     int pos = 0;
@@ -234,10 +239,10 @@ class AutoPrefixTermsWriter {
       int prefixTopSize = pending.size() - prefixStarts[i];
 
       while (prefixTopSize >= minItemsInPrefix) {       
-        //if (DEBUG) System.out.println("pushTerm i=" + i + " prefixTopSize=" + prefixTopSize + " minItemsInBlock=" + minItemsInPrefix);
+        //if (DEBUG) System.out.println("  pop: i=" + i + " prefixTopSize=" + prefixTopSize + " minItemsInBlock=" + minItemsInPrefix);
         savePrefixes(i+1, prefixTopSize);
         //prefixStarts[i] -= prefixTopSize;
-        //System.out.println("    after savePrefixes: " + (pending.size() - prefixStarts[i]) + " pending.size()=" + pending.size() + " start=" + prefixStarts[i]);
+        //if (DEBUG) System.out.println("    after savePrefixes: " + (pending.size() - prefixStarts[i]) + " pending.size()=" + pending.size() + " start=" + prefixStarts[i]);
 
         // For large floor blocks, it's possible we should now re-run on the new prefix terms we just created:
         prefixTopSize = pending.size() - prefixStarts[i];
@@ -267,27 +272,52 @@ class AutoPrefixTermsWriter {
 
     assert count > 0;
 
-    //if (DEBUG2) {
-    //  BytesRef br = new BytesRef(lastTerm.bytes());
-    //  br.length = prefixLength;
-    //  System.out.println("  savePrefixes: seg=" + segment + " " + brToString(br) + " count=" + count + " pending.size()=" + pending.size());
-    //}
+    /*
+    if (DEBUG2) {
+      BytesRef br = new BytesRef(lastTerm.bytes());
+      br.length = prefixLength;
+      //System.out.println("  savePrefixes: seg=" + segment + " " + brToString(br) + " count=" + count + " pending.size()=" + pending.size());
+      System.out.println("  savePrefixes: " + brToString(br) + " count=" + count + " pending.size()=" + pending.size());
+    }
+    */
 
     int lastSuffixLeadLabel = -2;
 
     int start = pending.size()-count;
     assert start >=0;
 
+    // Special case empty-string suffix case: we are being asked to build prefix terms for all aaa* terms, but 
+    // the exact term aaa is here, and we must skip it (it is handled "higher", under the aa* terms):
+    Object o = pending.get(start);
+    boolean skippedEmptyStringSuffix = false;
+    if (o instanceof byte[]) {
+      if (((byte[]) o).length == prefixLength) {
+        start++;
+        count--;
+        //if (DEBUG) System.out.println("  skip empty-string term suffix");
+        skippedEmptyStringSuffix = true;
+      }
+    } else {
+      PrefixTerm prefix = (PrefixTerm) o;
+      if (prefix.term.bytes.length == prefixLength) {
+        start++;
+        count--;
+        //if (DEBUG) System.out.println("  skip empty-string PT suffix");
+        skippedEmptyStringSuffix = true;
+      }
+    }
+
     int end = pending.size();
     int nextBlockStart = start;
     int nextFloorLeadLabel = -1;
     int prefixCount = 0;
-    int pendingCount = 0;
+
     PrefixTerm lastPTEntry = null;
+
     for (int i=start; i<end; i++) {
 
       byte[] termBytes;
-      Object o = pending.get(i);
+      o = pending.get(i);
       PrefixTerm ptEntry;
       if (o instanceof byte[]) {
         ptEntry = null;
@@ -300,23 +330,15 @@ class AutoPrefixTermsWriter {
           ptEntry = null;
         }
       }
-      pendingCount++;
 
-      //if (DEBUG) System.out.println("    check term=" + brToString(new BytesRef(termBytes)));
+      //if (DEBUG) System.out.println("    check term=" + brToString(new BytesRef(termBytes)) + " o=" + o);
 
-      int suffixLeadLabel;
+      // We handled the empty-string suffix case up front:
+      assert termBytes.length > prefixLength;
 
-      if (termBytes.length == prefixLength) {
-        // Suffix is 0, i.e. prefix 'foo' and term is
-        // 'foo' so the term has empty string suffix
-        // in this block
-        assert lastSuffixLeadLabel == -2;
-        suffixLeadLabel = -2;
-      } else {
-        suffixLeadLabel = termBytes[prefixLength] & 0xff;
-      }
+      int suffixLeadLabel = termBytes[prefixLength] & 0xff;
 
-      // if (DEBUG) System.out.println("  i=" + i + " ent=" + ent + " suffixLeadLabel=" + suffixLeadLabel);
+      //if (DEBUG) System.out.println("  i=" + i + " o=" + o + " suffixLeadLabel=" + Integer.toHexString(suffixLeadLabel) + " pendingCount=" + (i - nextBlockStart) + " min=" + minItemsInPrefix);
 
       if (suffixLeadLabel != lastSuffixLeadLabel) {
         // This is a boundary, a chance to make an auto-prefix term if we want:
@@ -327,8 +349,9 @@ class AutoPrefixTermsWriter {
         // than the lead start of the current entry:
         assert suffixLeadLabel > lastSuffixLeadLabel: "suffixLeadLabel=" + suffixLeadLabel + " vs lastSuffixLeadLabel=" + lastSuffixLeadLabel;
 
-        // NOTE: must check nextFloorLeadLabel in case minItemsInPrefix is 2 and prefix is 'a' and we've seen 'a' and then 'aa'
-        if (pendingCount >= minItemsInPrefix && end-nextBlockStart > maxItemsInPrefix && nextFloorLeadLabel != -1) {
+        int itemsInBlock = i - nextBlockStart;
+
+        if (itemsInBlock >= minItemsInPrefix && end-nextBlockStart > maxItemsInPrefix) {
           // The count is too large for one block, so we must break it into "floor" blocks, where we record
           // the leading label of the suffix of the first term in each floor block, so at search time we can
           // jump to the right floor block.  We just use a naive greedy segmenter here: make a new floor
@@ -338,11 +361,10 @@ class AutoPrefixTermsWriter {
           // If the last entry was another prefix term of the same length, then it represents a range of terms, so we must use its ending
           // prefix label as our ending label:
           if (lastPTEntry != null) {
+            //if (DEBUG) System.out.println("  use last");
             lastSuffixLeadLabel = lastPTEntry.floorLeadEnd;
           }
-
           savePrefix(prefixLength, nextFloorLeadLabel, lastSuffixLeadLabel);
-          pendingCount = 0;
 
           prefixCount++;
           nextFloorLeadLabel = suffixLeadLabel;
@@ -356,6 +378,7 @@ class AutoPrefixTermsWriter {
 
         lastSuffixLeadLabel = suffixLeadLabel;
       }
+
       lastPTEntry = ptEntry;
     }
 
@@ -370,6 +393,12 @@ class AutoPrefixTermsWriter {
         if (prefixLength > 0) {
           savePrefix(prefixLength, -2, 0xff);
           prefixCount++;
+          
+          // If we skipped empty string suffix, e.g. term aaa for prefix aaa*, since we
+          // are now writing the full aaa* prefix term, we include it here:
+          if (skippedEmptyStringSuffix) {
+            count++;
+          }
         } else {
           // Don't add a prefix term for all terms in the index!
         }
@@ -384,16 +413,8 @@ class AutoPrefixTermsWriter {
     }
 
     // Remove slice from the top of the pending stack, that we just wrote:
-    int sizeToClear = count;
-    if (prefixCount > 1) {
-      Object o = pending.get(pending.size()-count);
-      if (o instanceof byte[] && ((byte[]) o).length == prefixLength) {
-        // If we were just asked to write all f* terms, but there were too many and so we made floor blocks, the exact term 'f' will remain
-        // as its own item, followed by floor block terms like f[a-m]*, f[n-z]*, so in this case we leave 3 (not 2) items on the pending stack:
-        sizeToClear--;
-      }
-    }
-    pending.subList(pending.size()-sizeToClear, pending.size()).clear();
+
+    pending.subList(pending.size()-count, pending.size()).clear();
 
     // Append prefix terms for each prefix, since these count like real terms that also need to be "rolled up":
     for(int i=0;i<prefixCount;i++) {
@@ -410,6 +431,8 @@ class AutoPrefixTermsWriter {
 
     PrefixTerm pt = new PrefixTerm(prefix, floorLeadStart, floorLeadEnd); 
     //if (DEBUG2) System.out.println("    savePrefix: seg=" + segment + " " + pt + " count=" + count);
+    //if (DEBUG) System.out.println("    savePrefix: " + pt);
+
     prefixes.add(pt);
   }
 }