You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ds...@apache.org on 2015/04/22 23:48:12 UTC
svn commit: r1675505 - in /lucene/dev/branches/branch_5x: ./ lucene/
lucene/highlighter/
lucene/highlighter/src/java/org/apache/lucene/search/highlight/
Author: dsmiley
Date: Wed Apr 22 21:48:11 2015
New Revision: 1675505
URL: http://svn.apache.org/r1675505
Log:
LUCENE-6392 Highlighter: add maxStartOffset (and other memory saving efficiencies) to TokenStreamFromTermVector.
Delaying TokenSources changes for LUCENE-6445.
Modified:
lucene/dev/branches/branch_5x/ (props changed)
lucene/dev/branches/branch_5x/lucene/ (props changed)
lucene/dev/branches/branch_5x/lucene/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_5x/lucene/highlighter/ (props changed)
lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java
Modified: lucene/dev/branches/branch_5x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/CHANGES.txt?rev=1675505&r1=1675504&r2=1675505&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/lucene/CHANGES.txt Wed Apr 22 21:48:11 2015
@@ -57,6 +57,9 @@ Optimizations
* LUCENE-6421: Defer reading of positions in MultiPhraseQuery until
they are needed. (Robert Muir)
+* LUCENE-6392: Highlighter: reduce memory of tokens in
+ TokenStreamFromTermVector, and add maxStartOffset limit. (David Smiley)
+
Bug Fixes
* LUCENE-6378: Fix all RuntimeExceptions to throw the underlying root cause.
Modified: lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java?rev=1675505&r1=1675504&r2=1675505&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (original)
+++ lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java Wed Apr 22 21:48:11 2015
@@ -127,7 +127,7 @@ public class TokenSources {
// highlighters require offsets, so we insist here.
}
- return new TokenStreamFromTermVector(tpv);
+ return new TokenStreamFromTermVector(tpv, -1); // TODO propagate maxStartOffset; see LUCENE-6445
}
/**
Modified: lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java?rev=1675505&r1=1675504&r2=1675505&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java (original)
+++ lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenStreamFromTermVector.java Wed Apr 22 21:48:11 2015
@@ -32,6 +32,7 @@ import org.apache.lucene.util.AttributeF
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefArray;
import org.apache.lucene.util.BytesRefBuilder;
+import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.Counter;
import org.apache.lucene.util.UnicodeUtil;
@@ -52,8 +53,6 @@ import org.apache.lucene.util.UnicodeUti
*/
public final class TokenStreamFromTermVector extends TokenStream {
- //TODO add a maxStartOffset filter, which highlighters will find handy
-
//This attribute factory uses less memory when captureState() is called.
public static final AttributeFactory ATTRIBUTE_FACTORY =
AttributeFactory.getStaticImplementation(
@@ -65,9 +64,14 @@ public final class TokenStreamFromTermVe
private final PositionIncrementAttribute positionIncrementAttribute;
+ private final int maxStartOffset;
+
private OffsetAttribute offsetAttribute;//maybe null
private PayloadAttribute payloadAttribute;//maybe null
+
+ private CharsRefBuilder termCharsBuilder;//term data here
+
private BytesRefArray payloadsBytesRefArray;//only used when payloadAttribute is non-null
private BytesRefBuilder spareBytesRefBuilder;//only used when payloadAttribute is non-null
@@ -78,13 +82,16 @@ public final class TokenStreamFromTermVe
private boolean initialized = false;//lazy
/**
- * Constructor.
- *
+ * Constructor. The uninversion doesn't happen here; it's delayed till the first call to
+ * {@link #incrementToken}.
+ *
* @param vector Terms that contains the data for
* creating the TokenStream. Must have positions and/or offsets.
+ * @param maxStartOffset if a token's start offset exceeds this then the token is not added. -1 disables the limit.
*/
- public TokenStreamFromTermVector(Terms vector) throws IOException {
+ public TokenStreamFromTermVector(Terms vector, int maxStartOffset) throws IOException {
super(ATTRIBUTE_FACTORY);
+ this.maxStartOffset = maxStartOffset < 0 ? Integer.MAX_VALUE : maxStartOffset;
assert !hasAttribute(PayloadAttribute.class) : "AttributeFactory shouldn't have payloads *yet*";
if (!vector.hasPositions() && !vector.hasOffsets()) {
throw new IllegalArgumentException("The term vector needs positions and/or offsets.");
@@ -106,15 +113,22 @@ public final class TokenStreamFromTermVe
//We delay initialization because we can see which attributes the consumer wants, particularly payloads
private void init() throws IOException {
assert !initialized;
+ short dpEnumFlags = PostingsEnum.POSITIONS;
if (vector.hasOffsets()) {
+ dpEnumFlags |= PostingsEnum.OFFSETS;
offsetAttribute = addAttribute(OffsetAttribute.class);
}
if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
+ dpEnumFlags |= (PostingsEnum.OFFSETS | PostingsEnum.PAYLOADS);//must ask for offsets too
payloadAttribute = getAttribute(PayloadAttribute.class);
payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
spareBytesRefBuilder = new BytesRefBuilder();
}
+ // We put term data here
+ termCharsBuilder = new CharsRefBuilder();
+ termCharsBuilder.grow((int) (vector.size() * 7));//7 is over-estimate of average term len
+
// Step 1: iterate termsEnum and create a token, placing into an array of tokens by position
TokenLL[] positionedTokens = initTokensArray();
@@ -124,14 +138,17 @@ public final class TokenStreamFromTermVe
final TermsEnum termsEnum = vector.iterator();
BytesRef termBytesRef;
PostingsEnum dpEnum = null;
+ CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();//only for UTF8->UTF16 call
//int sumFreq = 0;
while ((termBytesRef = termsEnum.next()) != null) {
//Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
// note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
- final char[] termChars = new char[termBytesRef.length];
- final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, termChars);
+ tempCharsRefBuilder.grow(termBytesRef.length);
+ final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
+ final int termCharsOff = termCharsBuilder.length();
+ termCharsBuilder.append(tempCharsRefBuilder.chars(), 0, termCharsLen);
- dpEnum = termsEnum.postings(null, dpEnum, PostingsEnum.POSITIONS);
+ dpEnum = termsEnum.postings(null, dpEnum, dpEnumFlags);
assert dpEnum != null; // presumably checked by TokenSources.hasPositions earlier
dpEnum.nextDoc();
final int freq = dpEnum.freq();
@@ -139,11 +156,14 @@ public final class TokenStreamFromTermVe
for (int j = 0; j < freq; j++) {
int pos = dpEnum.nextPosition();
TokenLL token = new TokenLL();
- token.termChars = termChars;
- token.termCharsLen = termCharsLen;
+ token.termCharsOff = termCharsOff;
+ token.termCharsLen = (short) Math.min(termCharsLen, Short.MAX_VALUE);
if (offsetAttribute != null) {
token.startOffset = dpEnum.startOffset();
- token.endOffset = dpEnum.endOffset();
+ if (token.startOffset > maxStartOffset) {
+ continue;//filter this token out; exceeds threshold
+ }
+ token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE);
if (pos == -1) {
pos = token.startOffset >> 3;//divide by 8
}
@@ -216,8 +236,8 @@ public final class TokenStreamFromTermVe
}
private TokenLL[] initTokensArray() throws IOException {
- // Estimate the number of position slots we need. We use some estimation factors taken from Wikipedia
- // that reduce the likelihood of needing to expand the array.
+ // Estimate the number of position slots we need from term stats. We use some estimation factors taken from
+ // Wikipedia that reduce the likelihood of needing to expand the array.
int sumTotalTermFreq = (int) vector.getSumTotalTermFreq();
if (sumTotalTermFreq == -1) {//unfortunately term vectors seem to not have this stat
int size = (int) vector.size();
@@ -227,7 +247,12 @@ public final class TokenStreamFromTermVe
sumTotalTermFreq = (int)(size * 2.4);
}
final int originalPositionEstimate = (int) (sumTotalTermFreq * 1.5);//less than 1 in 10 docs exceed this
- return new TokenLL[originalPositionEstimate];
+
+ // This estimate is based on maxStartOffset. Err on the side of this being larger than needed.
+ final int offsetLimitPositionEstimate = (int) (maxStartOffset / 5.0);
+
+ // Take the smaller of the two estimates, but no smaller than 64
+ return new TokenLL[Math.max(64, Math.min(originalPositionEstimate, offsetLimitPositionEstimate))];
}
@Override
@@ -247,10 +272,10 @@ public final class TokenStreamFromTermVe
return false;
}
clearAttributes();
- termAttribute.copyBuffer(incrementToken.termChars, 0, incrementToken.termCharsLen);
+ termAttribute.copyBuffer(termCharsBuilder.chars(), incrementToken.termCharsOff, incrementToken.termCharsLen);
positionIncrementAttribute.setPositionIncrement(incrementToken.positionIncrement);
if (offsetAttribute != null) {
- offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.endOffset);
+ offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.startOffset + incrementToken.endOffsetInc);
}
if (payloadAttribute != null) {
if (incrementToken.payloadIndex == -1) {
@@ -263,11 +288,14 @@ public final class TokenStreamFromTermVe
}
private static class TokenLL {
- char[] termChars;
- int termCharsLen;
+ // This class should weigh 32 bytes, including object header
+
+ int termCharsOff; // see termCharsBuilder
+ short termCharsLen;
+
int positionIncrement;
int startOffset;
- int endOffset;
+ short endOffsetInc; // add to startOffset to get endOffset
int payloadIndex;
TokenLL next;
@@ -297,7 +325,7 @@ public final class TokenStreamFromTermVe
int compareOffsets(TokenLL tokenB) {
int cmp = Integer.compare(this.startOffset, tokenB.startOffset);
if (cmp == 0) {
- cmp = Integer.compare(this.endOffset, tokenB.endOffset);
+ cmp = Short.compare(this.endOffsetInc, tokenB.endOffsetInc);
}
return cmp;
}