You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ds...@apache.org on 2014/11/30 05:32:03 UTC
svn commit: r1642511 - in /lucene/dev/branches/branch_5x: ./ solr/
solr/CHANGES.txt solr/core/
solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
Author: dsmiley
Date: Sun Nov 30 04:32:02 2014
New Revision: 1642511
URL: http://svn.apache.org/r1642511
Log:
SOLR-6680: DefaultSolrHighlighter: avoid CachingTokenFilter when not needed
Modified:
lucene/dev/branches/branch_5x/ (props changed)
lucene/dev/branches/branch_5x/solr/ (props changed)
lucene/dev/branches/branch_5x/solr/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_5x/solr/core/ (props changed)
lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
Modified: lucene/dev/branches/branch_5x/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/CHANGES.txt?rev=1642511&r1=1642510&r2=1642511&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/solr/CHANGES.txt Sun Nov 30 04:32:02 2014
@@ -281,6 +281,9 @@ Optimizations
* SOLR-6603: LBHttpSolrServer - lazily allocate skipped-zombie-servers list.
(Christine Poerschke via shalin)
+* SOLR-6680: DefaultSolrHighlighter can sometimes avoid CachingTokenFilter to save memory and
+ enable other optimizations. (David Smiley)
+
Other Changes
----------------------
Modified: lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java?rev=1642511&r1=1642510&r2=1642511&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java (original)
+++ lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java Sun Nov 30 04:32:02 2014
@@ -21,6 +21,7 @@ import org.apache.lucene.analysis.TokenF
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.*;
@@ -176,14 +177,13 @@ public class DefaultSolrHighlighter exte
* @param query The current Query
* @param fieldName The name of the field
* @param request The current SolrQueryRequest
- * @param tokenStream document text CachingTokenStream
+ * @param tokenStream document text tokenStream that implements reset() efficiently (e.g. CachingTokenFilter).
+ * If it's used, call reset() first.
* @throws IOException If there is a low-level I/O error.
*/
- protected Highlighter getPhraseHighlighter(Query query, String fieldName, SolrQueryRequest request, CachingTokenFilter tokenStream) throws IOException {
+ protected Highlighter getPhraseHighlighter(Query query, String fieldName, SolrQueryRequest request, TokenStream tokenStream) throws IOException {
SolrParams params = request.getParams();
- Highlighter highlighter = null;
-
- highlighter = new Highlighter(
+ Highlighter highlighter = new Highlighter(
getFormatter(fieldName, params),
getEncoder(fieldName, params),
getSpanQueryScorer(query, fieldName, tokenStream, request));
@@ -212,16 +212,14 @@ public class DefaultSolrHighlighter exte
/**
* Return a {@link org.apache.lucene.search.highlight.QueryScorer} suitable for this Query and field.
* @param query The current query
- * @param tokenStream document text CachingTokenStream
+ * @param tokenStream document text tokenStream that implements reset() efficiently (e.g. CachingTokenFilter).
+ * If it's used, call reset() first.
* @param fieldName The name of the field
* @param request The SolrQueryRequest
*/
private QueryScorer getSpanQueryScorer(Query query, String fieldName, TokenStream tokenStream, SolrQueryRequest request) {
boolean reqFieldMatch = request.getParams().getFieldBool(fieldName, HighlightParams.FIELD_MATCH, false);
- Boolean highlightMultiTerm = request.getParams().getBool(HighlightParams.HIGHLIGHT_MULTI_TERM, true);
- if(highlightMultiTerm == null) {
- highlightMultiTerm = false;
- }
+ boolean highlightMultiTerm = request.getParams().getBool(HighlightParams.HIGHLIGHT_MULTI_TERM, true);
QueryScorer scorer;
if (reqFieldMatch) {
scorer = new QueryScorer(query, fieldName);
@@ -435,6 +433,7 @@ public class DefaultSolrHighlighter exte
return termPosOff;
}
+ @SuppressWarnings("unchecked")
private void doHighlightingByHighlighter( Query query, SolrQueryRequest req, NamedList docSummaries,
int docId, Document doc, String fieldName ) throws IOException {
final SolrIndexSearcher searcher = req.getSearcher();
@@ -444,10 +443,7 @@ public class DefaultSolrHighlighter exte
// so we disable them until fixed (see LUCENE-3080)!
// BEGIN: Hack
final SchemaField schemaField = schema.getFieldOrNull(fieldName);
- if (schemaField != null && (
- (schemaField.getType() instanceof org.apache.solr.schema.TrieField) ||
- (schemaField.getType() instanceof org.apache.solr.schema.TrieDateField)
- )) return;
+ if (schemaField != null && schemaField.getType() instanceof org.apache.solr.schema.TrieField) return;
// END: Hack
SolrParams params = req.getParams();
@@ -456,19 +452,18 @@ public class DefaultSolrHighlighter exte
boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false);
List<IndexableField> allFields = doc.getFields();
- if (allFields != null && allFields.size() == 0) return; // No explicit contract that getFields returns != null,
+ if (allFields == null || allFields.isEmpty()) return; // No explicit contract that getFields returns != null,
// although currently it can't.
- TokenStream tstream = null;
int numFragments = getMaxSnippets(fieldName, params);
boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);
String[] summaries = null;
List<TextFragment> frags = new ArrayList<>();
- TermOffsetsTokenStream tots = null; // to be non-null iff we're using TermOffsets optimization
+ TermOffsetsTokenStream tots = null; // to be non-null iff we're using TermOffsets optimization (multi-valued)
TokenStream tvStream = TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId, fieldName);
- if (tvStream != null) {
+ if (tvStream != null && schemaField.multiValued() && isActuallyMultiValued(allFields, fieldName)) {
tots = new TermOffsetsTokenStream(tvStream);
}
int mvToExamine = Integer.parseInt(req.getParams().get(HighlightParams.MAX_MULTIVALUED_TO_EXAMINE,
@@ -483,10 +478,13 @@ public class DefaultSolrHighlighter exte
--mvToExamine;
String thisText = thisField.stringValue();
- if( tots != null ) {
- // if we're using TermOffsets optimization, then get the next
+ TokenStream tstream;
+ if (tots != null) {
+ // if we're using TermOffsets optimization (multi-valued field with term vectors), then get the next
// field value's TokenStream (i.e. get field j's TokenStream) from tots:
- tstream = tots.getMultiValuedTokenStream( thisText.length() );
+ tstream = tots.getMultiValuedTokenStream(thisText.length());
+ } else if (tvStream != null) {
+ tstream = tvStream; // single-valued with term vectors
} else {
// fall back to analyzer
tstream = createAnalyzerTStream(schema, fieldName, thisText);
@@ -498,17 +496,30 @@ public class DefaultSolrHighlighter exte
Highlighter highlighter;
if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"))) {
- if (maxCharsToAnalyze < 0) {
- tstream = new CachingTokenFilter(tstream);
+ // We're going to call getPhraseHighlighter and it might consume the tokenStream. If it does, the tokenStream
+ // needs to implement reset() efficiently.
+
+ //If the tokenStream is right from the term vectors, then CachingTokenFilter is unnecessary.
+ // It should be okay if OffsetLimit won't get applied in this case.
+ final TokenStream tempTokenStream;
+ if (tstream != tvStream) {
+ if (maxCharsToAnalyze < 0) {
+ tempTokenStream = new CachingTokenFilter(tstream);
+ } else {
+ tempTokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
+ }
} else {
- tstream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
+ tempTokenStream = tstream;
}
-
+
// get highlighter
- highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream);
+ highlighter = getPhraseHighlighter(query, fieldName, req, tempTokenStream);
- // after highlighter initialization, reset tstream since construction of highlighter already used it
- tstream.reset();
+ // if the CachingTokenFilter was consumed then use it going forward.
+ if (tempTokenStream instanceof CachingTokenFilter && ((CachingTokenFilter)tempTokenStream).isCached()) {
+ tstream = tempTokenStream;
+ }
+ //tstream.reset(); not needed; getBestTextFragments will reset it.
}
else {
// use "the old way"
@@ -523,15 +534,15 @@ public class DefaultSolrHighlighter exte
try {
TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tstream, thisText, mergeContiguousFragments, numFragments);
- for (int k = 0; k < bestTextFragments.length; k++) {
+ for (TextFragment bestTextFragment : bestTextFragments) {
if (preserveMulti) {
- if (bestTextFragments[k] != null) {
- frags.add(bestTextFragments[k]);
+ if (bestTextFragment != null) {
+ frags.add(bestTextFragment);
--mvToMatch;
}
} else {
- if ((bestTextFragments[k] != null) && (bestTextFragments[k].getScore() > 0)) {
- frags.add(bestTextFragments[k]);
+ if ((bestTextFragment != null) && (bestTextFragment.getScore() > 0)) {
+ frags.add(bestTextFragment);
--mvToMatch;
}
}
@@ -539,19 +550,20 @@ public class DefaultSolrHighlighter exte
} catch (InvalidTokenOffsetsException e) {
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
}
- }
+ }//end field value loop
+
// sort such that the fragments with the highest score come first
- if(!preserveMulti){
- Collections.sort(frags, new Comparator<TextFragment>() {
- @Override
- public int compare(TextFragment arg0, TextFragment arg1) {
- return Math.round(arg1.getScore() - arg0.getScore());
+ if (!preserveMulti) {
+ Collections.sort(frags, new Comparator<TextFragment>() {
+ @Override
+ public int compare(TextFragment arg0, TextFragment arg1) {
+ return Math.round(arg1.getScore() - arg0.getScore());
}
- });
- }
+ });
+ }
- // convert fragments back into text
- // TODO: we can include score and position information in output as snippet attributes
+ // convert fragments back into text
+ // TODO: we can include score and position information in output as snippet attributes
if (frags.size() > 0) {
ArrayList<String> fragTexts = new ArrayList<>();
for (TextFragment fragment: frags) {
@@ -577,6 +589,22 @@ public class DefaultSolrHighlighter exte
}
}
+ /** Is this field *actually* multi-valued for this document's fields? */
+ private boolean isActuallyMultiValued(List<IndexableField> allFields, String fieldName) {
+ boolean foundFirst = false;
+ for (IndexableField field : allFields) {
+ if (field.name().equals(fieldName)) {
+ if (foundFirst) {
+ return true;//we found another
+ } else {
+ foundFirst = true;
+ }
+ }
+ }
+ return false;//0 or 1 value
+ }
+
+ @SuppressWarnings("unchecked")
private void doHighlightingByFastVectorHighlighter( FastVectorHighlighter highlighter, FieldQuery fieldQuery,
SolrQueryRequest req, NamedList docSummaries, int docId, Document doc,
String fieldName ) throws IOException {
@@ -612,7 +640,7 @@ public class DefaultSolrHighlighter exte
String[] altTexts = listFields.toArray(new String[listFields.size()]);
- if (altTexts != null && altTexts.length > 0){
+ if (altTexts.length > 0){
Encoder encoder = getEncoder(fieldName, params);
int alternateFieldLen = params.getFieldInt(fieldName, HighlightParams.ALTERNATE_FIELD_LENGTH,0);
List<String> altList = new ArrayList<>();
@@ -622,6 +650,7 @@ public class DefaultSolrHighlighter exte
altList.add(encoder.encodeText(altText));
}
else{
+ //note: seemingly redundant new String(...) releases memory to the larger text
altList.add( len + altText.length() > alternateFieldLen ?
encoder.encodeText(new String(altText.substring( 0, alternateFieldLen - len ))) :
encoder.encodeText(altText) );
@@ -635,12 +664,7 @@ public class DefaultSolrHighlighter exte
}
private TokenStream createAnalyzerTStream(IndexSchema schema, String fieldName, String docText) throws IOException {
-
- TokenStream tstream;
- TokenStream ts = schema.getIndexAnalyzer().tokenStream(fieldName, docText);
- ts.reset();
- tstream = new TokenOrderingFilter(ts, 10);
- return tstream;
+ return new TokenOrderingFilter(schema.getIndexAnalyzer().tokenStream(fieldName, docText), 10);
}
}
@@ -651,7 +675,7 @@ public class DefaultSolrHighlighter exte
*/
final class TokenOrderingFilter extends TokenFilter {
private final int windowSize;
- private final LinkedList<OrderedToken> queue = new LinkedList<>();
+ private final LinkedList<OrderedToken> queue = new LinkedList<>(); //TODO replace with Deque, Array impl
private boolean done=false;
private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
@@ -694,10 +718,6 @@ final class TokenOrderingFilter extends
}
}
- @Override
- public void reset() throws IOException {
- // this looks wrong: but its correct.
- }
}
// for TokenOrderingFilter, so it can easily sort by startOffset
@@ -708,19 +728,18 @@ class OrderedToken {
class TermOffsetsTokenStream {
- TokenStream bufferedTokenStream = null;
- OffsetAttribute bufferedOffsetAtt;
+ final TokenStream bufferedTokenStream;
+ final OffsetAttribute bufferedOffsetAtt;
State bufferedToken;
int bufferedStartOffset;
int bufferedEndOffset;
- int startOffset;
+ int startOffset = 0;
int endOffset;
+ boolean bufferedTokenStreamWasReset = false;
public TermOffsetsTokenStream( TokenStream tstream ){
bufferedTokenStream = tstream;
bufferedOffsetAtt = bufferedTokenStream.addAttribute(OffsetAttribute.class);
- startOffset = 0;
- bufferedToken = null;
}
public TokenStream getMultiValuedTokenStream( final int length ){
@@ -730,38 +749,53 @@ class TermOffsetsTokenStream {
final class MultiValuedStream extends TokenStream {
private final int length;
+ private boolean incrementTokenWasCalled = false;
OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
- MultiValuedStream(int length) {
- super(bufferedTokenStream.cloneAttributes());
- this.length = length;
- }
-
- @Override
- public boolean incrementToken() throws IOException {
- while( true ){
- if( bufferedToken == null ) {
- if (!bufferedTokenStream.incrementToken())
- return false;
- bufferedToken = bufferedTokenStream.captureState();
- bufferedStartOffset = bufferedOffsetAtt.startOffset();
- bufferedEndOffset = bufferedOffsetAtt.endOffset();
- }
-
- if( startOffset <= bufferedStartOffset &&
- bufferedEndOffset <= endOffset ){
- restoreState(bufferedToken);
- bufferedToken = null;
- offsetAtt.setOffset( offsetAtt.startOffset() - startOffset, offsetAtt.endOffset() - startOffset );
- return true;
- }
- else if( bufferedEndOffset > endOffset ){
- startOffset += length + 1;
+ MultiValuedStream(int length) {
+ super(bufferedTokenStream.cloneAttributes());//clone so we don't manipulate the buffered offsets
+ this.length = length;
+ }
+
+ @Override
+ public void reset() throws IOException {
+ //this flag allows reset() to be called multiple times up-front without a problem
+ if (incrementTokenWasCalled) {
+ throw new IllegalStateException("This TokenStream does not support being subsequently reset()");
+ }
+ if (!bufferedTokenStreamWasReset) {
+ bufferedTokenStream.reset();
+ bufferedTokenStreamWasReset = true;
+ }
+ super.reset();
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ incrementTokenWasCalled = true;
+ while( true ){
+ if( bufferedToken == null ) {
+ if (!bufferedTokenStream.incrementToken())
return false;
- }
+ bufferedToken = bufferedTokenStream.captureState();
+ bufferedStartOffset = bufferedOffsetAtt.startOffset();
+ bufferedEndOffset = bufferedOffsetAtt.endOffset();
+ }
+
+ if( startOffset <= bufferedStartOffset &&
+ bufferedEndOffset <= endOffset ){
+ restoreState(bufferedToken);
bufferedToken = null;
+ offsetAtt.setOffset( offsetAtt.startOffset() - startOffset, offsetAtt.endOffset() - startOffset );
+ return true;
}
+ else if( bufferedEndOffset > endOffset ){
+ startOffset += length + 1;
+ return false;
+ }
+ bufferedToken = null;
}
+ }
- };
-};
+ }
+}