Posted to commits@lucene.apache.org by ds...@apache.org on 2015/04/22 23:48:12 UTC

svn commit: r1675505 - in /lucene/dev/branches/branch_5x: ./ lucene/ lucene/highlighter/ lucene/highlighter/src/java/org/apache/lucene/search/highlight/

Author: dsmiley
Date: Wed Apr 22 21:48:11 2015
New Revision: 1675505

URL: http://svn.apache.org/r1675505
Log:
LUCENE-6392 Highlighter: add maxStartOffset (and other memory saving efficiencies) to TokenStreamFromTermVector.
Delaying TokenSources changes for LUCENE-6445.
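
For context, a minimal, hedged usage sketch of the new constructor: it assumes the field was indexed with term vectors carrying positions and/or offsets, and the class name, the "body" field, and the 10_000 limit are illustrative only, not part of this commit. Passing -1 disables the limit.

    import java.io.IOException;
    import org.apache.lucene.analysis.TokenStream;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.Terms;
    import org.apache.lucene.search.highlight.TokenStreamFromTermVector;

    class VectorHighlightSupport {  // hypothetical helper, for illustration only
      static TokenStream tokenStreamFor(IndexReader reader, int docId) throws IOException {
        Terms termVector = reader.getTermVector(docId, "body");  // null if no term vector was indexed
        if (termVector == null) {
          return null;  // caller would fall back to re-analyzing the stored text
        }
        // -1 disables the cap; 10_000 (illustrative) drops tokens starting past the first 10k characters
        return new TokenStreamFromTermVector(termVector, 10_000);
      }
    }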

Modified:
    lucene/dev/branches/branch_5x/   (props changed)
    lucene/dev/branches/branch_5x/lucene/   (props changed)
    lucene/dev/branches/branch_5x/lucene/CHANGES.txt   (contents, props changed)
    lucene/dev/branches/branch_5x/lucene/highlighter/   (props changed)
    lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
    lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java

Modified: lucene/dev/branches/branch_5x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/CHANGES.txt?rev=1675505&r1=1675504&r2=1675505&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/lucene/CHANGES.txt Wed Apr 22 21:48:11 2015
@@ -57,6 +57,9 @@ Optimizations
 * LUCENE-6421: Defer reading of positions in MultiPhraseQuery until
   they are needed. (Robert Muir)
 
+* LUCENE-6392: Highlighter: reduce memory of tokens in
+  TokenStreamFromTermVector, and add maxStartOffset limit. (David Smiley)
+
 Bug Fixes
 
 * LUCENE-6378: Fix all RuntimeExceptions to throw the underlying root cause.

Modified: lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java?rev=1675505&r1=1675504&r2=1675505&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (original)
+++ lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java Wed Apr 22 21:48:11 2015
@@ -127,7 +127,7 @@ public class TokenSources {
       // highlighters require offsets, so we insist here.
     }
 
-    return new TokenStreamFromTermVector(tpv);
+    return new TokenStreamFromTermVector(tpv, -1); // TODO propagate maxStartOffset; see LUCENE-6445
   }
 
   /**

Modified: lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java?rev=1675505&r1=1675504&r2=1675505&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java (original)
+++ lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java Wed Apr 22 21:48:11 2015
@@ -32,6 +32,7 @@ import org.apache.lucene.util.AttributeF
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.BytesRefArray;
 import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.CharsRefBuilder;
 import org.apache.lucene.util.Counter;
 import org.apache.lucene.util.UnicodeUtil;
 
@@ -52,8 +53,6 @@ import org.apache.lucene.util.UnicodeUti
  */
 public final class TokenStreamFromTermVector extends TokenStream {
 
-  //TODO add a maxStartOffset filter, which highlighters will find handy
-
   //This attribute factory uses less memory when captureState() is called.
   public static final AttributeFactory ATTRIBUTE_FACTORY =
       AttributeFactory.getStaticImplementation(
@@ -65,9 +64,14 @@ public final class TokenStreamFromTermVe
 
   private final PositionIncrementAttribute positionIncrementAttribute;
 
+  private final int maxStartOffset;
+
   private OffsetAttribute offsetAttribute;//maybe null
 
   private PayloadAttribute payloadAttribute;//maybe null
+
+  private CharsRefBuilder termCharsBuilder;//term data here
+
   private BytesRefArray payloadsBytesRefArray;//only used when payloadAttribute is non-null
   private BytesRefBuilder spareBytesRefBuilder;//only used when payloadAttribute is non-null
 
@@ -78,13 +82,16 @@ public final class TokenStreamFromTermVe
   private boolean initialized = false;//lazy
 
   /**
-   * Constructor.
-   * 
+   * Constructor. The uninversion doesn't happen here; it's delayed till the first call to
+   * {@link #incrementToken}.
+   *
    * @param vector Terms that contains the data for
    *        creating the TokenStream. Must have positions and/or offsets.
+   * @param maxStartOffset if a token's start offset exceeds this then the token is not added. -1 disables the limit.
    */
-  public TokenStreamFromTermVector(Terms vector) throws IOException {
+  public TokenStreamFromTermVector(Terms vector, int maxStartOffset) throws IOException {
     super(ATTRIBUTE_FACTORY);
+    this.maxStartOffset = maxStartOffset < 0 ? Integer.MAX_VALUE : maxStartOffset;
     assert !hasAttribute(PayloadAttribute.class) : "AttributeFactory shouldn't have payloads *yet*";
     if (!vector.hasPositions() && !vector.hasOffsets()) {
       throw new IllegalArgumentException("The term vector needs positions and/or offsets.");
@@ -106,15 +113,22 @@ public final class TokenStreamFromTermVe
   //We delay initialization because we can see which attributes the consumer wants, particularly payloads
   private void init() throws IOException {
     assert !initialized;
+    short dpEnumFlags = PostingsEnum.POSITIONS;
     if (vector.hasOffsets()) {
+      dpEnumFlags |= PostingsEnum.OFFSETS;
       offsetAttribute = addAttribute(OffsetAttribute.class);
     }
     if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
+      dpEnumFlags |= (PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);//must ask for offsets too
       payloadAttribute = getAttribute(PayloadAttribute.class);
       payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
       spareBytesRefBuilder = new BytesRefBuilder();
     }
 
+    // We put term data here
+    termCharsBuilder = new CharsRefBuilder();
+    termCharsBuilder.grow((int) (vector.size() * 7));//7 is over-estimate of average term len
+
     // Step 1: iterate termsEnum and create a token, placing into an array of tokens by position
 
     TokenLL[] positionedTokens = initTokensArray();
@@ -124,14 +138,17 @@ public final class TokenStreamFromTermVe
     final TermsEnum termsEnum = vector.iterator();
     BytesRef termBytesRef;
     PostingsEnum dpEnum = null;
+    CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();//only for UTF8->UTF16 call
     //int sumFreq = 0;
     while ((termBytesRef = termsEnum.next()) != null) {
       //Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
       // note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
-      final char[] termChars = new char[termBytesRef.length];
-      final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, termChars);
+      tempCharsRefBuilder.grow(termBytesRef.length);
+      final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
+      final int termCharsOff = termCharsBuilder.length();
+      termCharsBuilder.append(tempCharsRefBuilder.chars(), 0, termCharsLen);
 
-      dpEnum = termsEnum.postings(null, dpEnum, PostingsEnum.POSITIONS);
+      dpEnum = termsEnum.postings(null, dpEnum, dpEnumFlags);
       assert dpEnum != null; // presumably checked by TokenSources.hasPositions earlier
       dpEnum.nextDoc();
       final int freq = dpEnum.freq();
@@ -139,11 +156,14 @@ public final class TokenStreamFromTermVe
       for (int j = 0; j < freq; j++) {
         int pos = dpEnum.nextPosition();
         TokenLL token = new TokenLL();
-        token.termChars = termChars;
-        token.termCharsLen = termCharsLen;
+        token.termCharsOff = termCharsOff;
+        token.termCharsLen = (short) Math.min(termCharsLen, Short.MAX_VALUE);
         if (offsetAttribute != null) {
           token.startOffset = dpEnum.startOffset();
-          token.endOffset = dpEnum.endOffset();
+          if (token.startOffset > maxStartOffset) {
+            continue;//filter this token out; exceeds threshold
+          }
+          token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE);
           if (pos == -1) {
             pos = token.startOffset >> 3;//divide by 8
           }
@@ -216,8 +236,8 @@ public final class TokenStreamFromTermVe
   }
 
   private TokenLL[] initTokensArray() throws IOException {
-    // Estimate the number of position slots we need. We use some estimation factors taken from Wikipedia
-    //  that reduce the likelihood of needing to expand the array.
+    // Estimate the number of position slots we need from term stats.  We use some estimation factors taken from
+    //  Wikipedia that reduce the likelihood of needing to expand the array.
     int sumTotalTermFreq = (int) vector.getSumTotalTermFreq();
     if (sumTotalTermFreq == -1) {//unfortunately term vectors seem to not have this stat
       int size = (int) vector.size();
@@ -227,7 +247,12 @@ public final class TokenStreamFromTermVe
       sumTotalTermFreq = (int)(size * 2.4);
     }
     final int originalPositionEstimate = (int) (sumTotalTermFreq * 1.5);//less than 1 in 10 docs exceed this
-    return new TokenLL[originalPositionEstimate];
+
+    // This estimate is based on maxStartOffset. Err on the side of this being larger than needed.
+    final int offsetLimitPositionEstimate = (int) (maxStartOffset / 5.0);
+
+    // Take the smaller of the two estimates, but no smaller than 64
+    return new TokenLL[Math.max(64, Math.min(originalPositionEstimate, offsetLimitPositionEstimate))];
   }
 
   @Override
@@ -247,10 +272,10 @@ public final class TokenStreamFromTermVe
       return false;
     }
     clearAttributes();
-    termAttribute.copyBuffer(incrementToken.termChars, 0, incrementToken.termCharsLen);
+    termAttribute.copyBuffer(termCharsBuilder.chars(), incrementToken.termCharsOff, incrementToken.termCharsLen);
     positionIncrementAttribute.setPositionIncrement(incrementToken.positionIncrement);
     if (offsetAttribute != null) {
-      offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.endOffset);
+      offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.startOffset + incrementToken.endOffsetInc);
     }
     if (payloadAttribute != null) {
       if (incrementToken.payloadIndex == -1) {
@@ -263,11 +288,14 @@ public final class TokenStreamFromTermVe
   }
 
   private static class TokenLL {
-    char[] termChars;
-    int termCharsLen;
+    // This class should weigh 32 bytes, including object header
+
+    int termCharsOff; // see termCharsBuilder
+    short termCharsLen;
+
     int positionIncrement;
     int startOffset;
-    int endOffset;
+    short endOffsetInc; // add to startOffset to get endOffset
     int payloadIndex;
 
     TokenLL next;
@@ -297,7 +325,7 @@ public final class TokenStreamFromTermVe
     int compareOffsets(TokenLL tokenB) {
       int cmp = Integer.compare(this.startOffset, tokenB.startOffset);
       if (cmp == 0) {
-        cmp = Integer.compare(this.endOffset, tokenB.endOffset);
+        cmp = Short.compare(this.endOffsetInc, tokenB.endOffsetInc);
       }
       return cmp;
     }
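
The core memory-saving idea above is that TokenLL instances no longer hold a per-token char[] and an int endOffset: term text is appended once to a shared CharsRefBuilder, each token keeps an int offset and a short length into that buffer, and the end offset is stored as a short delta from the start offset (targeting roughly 32 bytes per token object). A standalone sketch of that packing pattern, not the committed code, with a StringBuilder standing in for CharsRefBuilder:

    import java.util.ArrayList;
    import java.util.List;

    final class CompactToken {      // mirrors the TokenLL layout in spirit
      int termCharsOff;             // offset into the shared char buffer
      short termCharsLen;           // term length, capped at Short.MAX_VALUE
      int startOffset;
      short endOffsetInc;           // endOffset == startOffset + endOffsetInc
    }

    final class CompactTokenBuffer {
      private final StringBuilder sharedChars = new StringBuilder();  // stand-in for CharsRefBuilder
      private final List<CompactToken> tokens = new ArrayList<>();

      void add(String term, int startOffset, int endOffset, int maxStartOffset) {
        if (maxStartOffset >= 0 && startOffset > maxStartOffset) {
          return;  // same filtering rule as the commit: tokens starting past the cap are dropped
        }
        CompactToken t = new CompactToken();
        t.termCharsOff = sharedChars.length();
        sharedChars.append(term);   // one shared buffer instead of a char[] per token
        t.termCharsLen = (short) Math.min(term.length(), Short.MAX_VALUE);
        t.startOffset = startOffset;
        t.endOffsetInc = (short) Math.min(endOffset - startOffset, Short.MAX_VALUE);
        tokens.add(t);
      }

      String termOf(CompactToken t) {
        return sharedChars.substring(t.termCharsOff, t.termCharsOff + t.termCharsLen);
      }

      int endOffsetOf(CompactToken t) {
        return t.startOffset + t.endOffsetInc;
      }
    }

The maxStartOffset also bounds the initial position-array estimate in initTokensArray: for example, with maxStartOffset = 10,000 the offset-based estimate is 10,000 / 5 = 2,000 slots, and the smaller of that and the term-frequency-based estimate is used, never going below 64.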