You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ds...@apache.org on 2015/04/13 15:35:36 UTC
svn commit: r1673183 - in /lucene/dev/trunk/solr: CHANGES.txt
core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
Author: dsmiley
Date: Mon Apr 13 13:35:35 2015
New Revision: 1673183
URL: http://svn.apache.org/r1673183
Log:
SOLR-6692: Made standard highlighter more extensible
Private methods are now protected, a bad constructor was removed, and getFieldValues() and getResponseForFragments() were refactored out for both clarity and subclass extensibility. doHighlightingByHighlighter was also refactored somewhat to clarify its logic. These changes should have ZERO effect on highlights.
Modified:
lucene/dev/trunk/solr/CHANGES.txt
lucene/dev/trunk/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1673183&r1=1673182&r2=1673183&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Mon Apr 13 13:35:35 2015
@@ -128,6 +128,8 @@ Other Changes
* SOLR-7384: Fix spurious failures in FullSolrCloudDistribCmdsTest. (shalin)
+* SOLR-6692: The default highlighter is much more extensible. (David Smiley)
+
================== 5.1.0 ==================
Consult the LUCENE_CHANGES.txt file for additional, low level, changes in this release
Modified: lucene/dev/trunk/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java?rev=1673183&r1=1673182&r2=1673183&view=diff
==============================================================================
--- lucene/dev/trunk/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java (original)
+++ lucene/dev/trunk/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java Mon Apr 13 13:35:35 2015
@@ -16,6 +16,18 @@
*/
package org.apache.solr.highlight;
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.LinkedList;
+import java.util.List;
+import java.util.ListIterator;
+import java.util.Map;
+import java.util.Set;
+
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
@@ -58,18 +70,6 @@ import org.apache.solr.util.plugin.Plugi
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.LinkedList;
-import java.util.List;
-import java.util.ListIterator;
-import java.util.Map;
-import java.util.Set;
-
/**
*
* @since solr 1.3
@@ -79,11 +79,9 @@ public class DefaultSolrHighlighter exte
public static Logger log = LoggerFactory.getLogger(DefaultSolrHighlighter.class);
- private SolrCore solrCore;
-
- public DefaultSolrHighlighter() {
- }
+ protected final SolrCore solrCore;
+ /** Will be invoked via reflection */
public DefaultSolrHighlighter(SolrCore solrCore) {
this.solrCore = solrCore;
}
@@ -207,7 +205,7 @@ public class DefaultSolrHighlighter exte
* @param fieldName The name of the field
* @param request The SolrQueryRequest
*/
- private QueryScorer getSpanQueryScorer(Query query, String fieldName, TokenStream tokenStream, SolrQueryRequest request) {
+ protected QueryScorer getSpanQueryScorer(Query query, String fieldName, TokenStream tokenStream, SolrQueryRequest request) {
QueryScorer scorer = new QueryScorer(query,
request.getParams().getFieldBool(fieldName, HighlightParams.FIELD_MATCH, false) ? fieldName : null);
scorer.setExpandMultiTermQuery(request.getParams().getBool(HighlightParams.HIGHLIGHT_MULTI_TERM, true));
@@ -222,7 +220,7 @@ public class DefaultSolrHighlighter exte
* @param fieldName The name of the field
* @param request The SolrQueryRequest
*/
- private Scorer getQueryScorer(Query query, String fieldName, SolrQueryRequest request) {
+ protected Scorer getQueryScorer(Query query, String fieldName, SolrQueryRequest request) {
boolean reqFieldMatch = request.getParams().getFieldBool(fieldName, HighlightParams.FIELD_MATCH, false);
if (reqFieldMatch) {
return new QueryTermScorer(query, request.getSearcher().getIndexReader(), fieldName);
@@ -264,11 +262,11 @@ public class DefaultSolrHighlighter exte
protected Formatter getFormatter(String fieldName, SolrParams params )
{
String str = params.getFieldParam( fieldName, HighlightParams.FORMATTER );
- SolrFormatter formatter = formatters.get( str );
+ SolrFormatter formatter = formatters.get(str);
if( formatter == null ) {
throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Unknown formatter: "+str );
}
- return formatter.getFormatter( fieldName, params );
+ return formatter.getFormatter(fieldName, params);
}
/**
@@ -286,7 +284,7 @@ public class DefaultSolrHighlighter exte
if( encoder == null ) {
throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Unknown encoder: "+str );
}
- return encoder.getEncoder( fieldName, params );
+ return encoder.getEncoder(fieldName, params);
}
/**
@@ -305,33 +303,33 @@ public class DefaultSolrHighlighter exte
if( frag == null ) {
throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Unknown fragmenter: "+fmt );
}
- return frag.getFragmenter( fieldName, params );
+ return frag.getFragmenter(fieldName, params);
}
protected FragListBuilder getFragListBuilder( String fieldName, SolrParams params ){
String flb = params.getFieldParam( fieldName, HighlightParams.FRAG_LIST_BUILDER );
- SolrFragListBuilder solrFlb = fragListBuilders.get( flb );
+ SolrFragListBuilder solrFlb = fragListBuilders.get(flb);
if( solrFlb == null ){
throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Unknown fragListBuilder: " + flb );
}
- return solrFlb.getFragListBuilder( params );
+ return solrFlb.getFragListBuilder(params);
}
protected FragmentsBuilder getFragmentsBuilder( String fieldName, SolrParams params ){
BoundaryScanner bs = getBoundaryScanner(fieldName, params);
- return getSolrFragmentsBuilder( fieldName, params ).getFragmentsBuilder( params, bs );
+ return getSolrFragmentsBuilder( fieldName, params ).getFragmentsBuilder(params, bs);
}
-
- private SolrFragmentsBuilder getSolrFragmentsBuilder( String fieldName, SolrParams params ){
+
+ protected SolrFragmentsBuilder getSolrFragmentsBuilder( String fieldName, SolrParams params ){
String fb = params.getFieldParam( fieldName, HighlightParams.FRAGMENTS_BUILDER );
- SolrFragmentsBuilder solrFb = fragmentsBuilders.get( fb );
+ SolrFragmentsBuilder solrFb = fragmentsBuilders.get(fb);
if( solrFb == null ){
throw new SolrException( SolrException.ErrorCode.BAD_REQUEST, "Unknown fragmentsBuilder: " + fb );
}
return solrFb;
}
-
- private BoundaryScanner getBoundaryScanner(String fieldName, SolrParams params){
+
+ protected BoundaryScanner getBoundaryScanner(String fieldName, SolrParams params){
String bs = params.getFieldParam(fieldName, HighlightParams.BOUNDARY_SCANNER);
SolrBoundaryScanner solrBs = boundaryScanners.get(bs);
if(solrBs == null){
@@ -406,10 +404,10 @@ public class DefaultSolrHighlighter exte
* If fieldName is undefined, this method returns false, then
* doHighlightingByHighlighter() will do nothing for the field.
*/
- private boolean useFastVectorHighlighter( SolrParams params, IndexSchema schema, String fieldName ){
- SchemaField schemaField = schema.getFieldOrNull( fieldName );
+ protected boolean useFastVectorHighlighter( SolrParams params, IndexSchema schema, String fieldName ){
+ SchemaField schemaField = schema.getFieldOrNull(fieldName);
if( schemaField == null ) return false;
- boolean useFvhParam = params.getFieldBool( fieldName, HighlightParams.USE_FVH, false );
+ boolean useFvhParam = params.getFieldBool(fieldName, HighlightParams.USE_FVH, false);
if( !useFvhParam ) return false;
boolean termPosOff = schemaField.storeTermPositions() && schemaField.storeTermOffsets();
if( !termPosOff ) {
@@ -417,56 +415,86 @@ public class DefaultSolrHighlighter exte
}
return termPosOff;
}
-
+
@SuppressWarnings("unchecked")
- private void doHighlightingByHighlighter( Query query, SolrQueryRequest req, NamedList docSummaries,
+ protected void doHighlightingByFastVectorHighlighter( FastVectorHighlighter highlighter, FieldQuery fieldQuery,
+ SolrQueryRequest req, NamedList docSummaries, int docId, StoredDocument doc,
+ String fieldName ) throws IOException {
+ SolrParams params = req.getParams();
+ SolrFragmentsBuilder solrFb = getSolrFragmentsBuilder(fieldName, params);
+ String[] snippets = highlighter.getBestFragments( fieldQuery, req.getSearcher().getIndexReader(), docId, fieldName,
+ params.getFieldInt( fieldName, HighlightParams.FRAGSIZE, 100 ),
+ params.getFieldInt( fieldName, HighlightParams.SNIPPETS, 1 ),
+ getFragListBuilder( fieldName, params ),
+ getFragmentsBuilder( fieldName, params ),
+ solrFb.getPreTags( params, fieldName ),
+ solrFb.getPostTags( params, fieldName ),
+ getEncoder( fieldName, params ) );
+ if( snippets != null && snippets.length > 0 )
+ docSummaries.add( fieldName, snippets );
+ else
+ alternateField( docSummaries, params, doc, fieldName );
+ }
+
+ @SuppressWarnings("unchecked")
+ protected void doHighlightingByHighlighter( Query query, SolrQueryRequest req, NamedList docSummaries,
int docId, StoredDocument doc, String fieldName ) throws IOException {
final SolrIndexSearcher searcher = req.getSearcher();
final IndexSchema schema = searcher.getSchema();
-
- // TODO: Currently in trunk highlighting numeric fields is broken (Lucene) -
- // so we disable them until fixed (see LUCENE-3080)!
+ final SolrParams params = req.getParams();
+
+ // TODO: highlighting numeric fields is broken (Lucene) - so we disable them until fixed (see LUCENE-3080)!
// BEGIN: Hack
final SchemaField schemaField = schema.getFieldOrNull(fieldName);
if (schemaField != null && schemaField.getType() instanceof org.apache.solr.schema.TrieField) return;
// END: Hack
-
- SolrParams params = req.getParams();
+
+ final int mvToExamine =
+ req.getParams().getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_EXAMINE,
+ schemaField.multiValued() ? Integer.MAX_VALUE : 1);
+ // Technically this is the max *fragments* (snippets), not max values:
+ int mvToMatch =
+ req.getParams().getFieldInt(fieldName, HighlightParams.MAX_MULTIVALUED_TO_MATCH, Integer.MAX_VALUE);
+ if (mvToExamine <= 0 || mvToMatch <= 0) {
+ return;
+ }
+
+ int maxCharsToAnalyze = params.getFieldInt(fieldName,
+ HighlightParams.MAX_CHARS,
+ Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);
+ if (maxCharsToAnalyze < 0) {//e.g. -1
+ maxCharsToAnalyze = Integer.MAX_VALUE;
+ }
+
+ List<String> fieldValues = getFieldValues(req, doc, fieldName, mvToExamine, maxCharsToAnalyze);
+ if (fieldValues.isEmpty()) {
+ return;
+ }
// preserve order of values in a multiValued list
boolean preserveMulti = params.getFieldBool(fieldName, HighlightParams.PRESERVE_MULTI, false);
- List<StorableField> allFields = doc.getFields();
- if (allFields == null || allFields.isEmpty()) return; // No explicit contract that getFields returns != null,
- // although currently it can't.
-
int numFragments = getMaxSnippets(fieldName, params);
boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);
- String[] summaries = null;
List<TextFragment> frags = new ArrayList<>();
//Try term vectors, which is faster
- TokenStream tvStream = TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId, fieldName);
+ final TokenStream tvStream =
+ TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId, fieldName);
+ // We need to wrap in OffsetWindowTokenFilter if multi-valued
final OffsetWindowTokenFilter tvWindowStream;
- if (tvStream != null && schemaField.multiValued() && isActuallyMultiValued(allFields, fieldName)) {
+ if (tvStream != null && fieldValues.size() > 1) {
tvWindowStream = new OffsetWindowTokenFilter(tvStream);
} else {
tvWindowStream = null;
}
- int mvToExamine = Integer.parseInt(req.getParams().get(HighlightParams.MAX_MULTIVALUED_TO_EXAMINE,
- Integer.toString(Integer.MAX_VALUE)));
- int mvToMatch = Integer.parseInt(req.getParams().get(HighlightParams.MAX_MULTIVALUED_TO_MATCH,
- Integer.toString(Integer.MAX_VALUE)));
-
- for (StorableField thisField : allFields) {
- if (mvToExamine <= 0 || mvToMatch <= 0) break;
-
- if (! thisField.name().equals(fieldName)) continue; // Is there a better way to do this?
+ for (String thisText : fieldValues) {
+ if (mvToMatch <= 0 || maxCharsToAnalyze <= 0) {
+ break;
+ }
- --mvToExamine;
- String thisText = thisField.stringValue();
TokenStream tstream;
if (tvWindowStream != null) {
// if we have a multi-valued field with term vectors, then get the next offset window
@@ -477,13 +505,9 @@ public class DefaultSolrHighlighter exte
// fall back to analyzer
tstream = createAnalyzerTStream(schema, fieldName, thisText);
}
-
- int maxCharsToAnalyze = params.getFieldInt(fieldName,
- HighlightParams.MAX_CHARS,
- Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);
-
+
Highlighter highlighter;
- if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"))) {
+ if (req.getParams().getFieldBool(fieldName, HighlightParams.USE_PHRASE_HIGHLIGHTER, true)) {
// We're going to call getPhraseHighlighter and it might consume the tokenStream. If it does, the tokenStream
// needs to implement reset() efficiently.
@@ -491,7 +515,7 @@ public class DefaultSolrHighlighter exte
// It should be okay if OffsetLimit won't get applied in this case.
final TokenStream tempTokenStream;
if (tstream != tvStream) {
- if (maxCharsToAnalyze < 0) {
+ if (maxCharsToAnalyze >= thisText.length()) {
tempTokenStream = new CachingTokenFilter(tstream);
} else {
tempTokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
@@ -502,37 +526,30 @@ public class DefaultSolrHighlighter exte
// get highlighter
highlighter = getPhraseHighlighter(query, fieldName, req, tempTokenStream);
-
+
// if the CachingTokenFilter was consumed then use it going forward.
if (tempTokenStream instanceof CachingTokenFilter && ((CachingTokenFilter)tempTokenStream).isCached()) {
tstream = tempTokenStream;
}
//tstream.reset(); not needed; getBestTextFragments will reset it.
- }
- else {
+ } else {
// use "the old way"
highlighter = getHighlighter(query, fieldName, req);
}
-
- if (maxCharsToAnalyze < 0) {
- highlighter.setMaxDocCharsToAnalyze(thisText.length());
- } else {
- highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
- }
+ highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
+
+ // Highlight!
try {
- TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tstream, thisText, mergeContiguousFragments, numFragments);
+ TextFragment[] bestTextFragments =
+ highlighter.getBestTextFragments(tstream, thisText, mergeContiguousFragments, numFragments);
for (TextFragment bestTextFragment : bestTextFragments) {
- if (preserveMulti) {
- if (bestTextFragment != null) {
- frags.add(bestTextFragment);
- --mvToMatch;
- }
- } else {
- if ((bestTextFragment != null) && (bestTextFragment.getScore() > 0)) {
- frags.add(bestTextFragment);
- --mvToMatch;
- }
+ if (bestTextFragment == null)//can happen via mergeContiguousFragments
+ continue;
+ // normally we want a score (must be highlighted), but if preserveMulti then we return a snippet regardless.
+ if (bestTextFragment.getScore() > 0 || preserveMulti) {
+ frags.add(bestTextFragment);
+ --mvToMatch; // note: limits fragments (for multi-valued fields), not quite the number of values
}
}
} catch (InvalidTokenOffsetsException e) {
@@ -540,80 +557,65 @@ public class DefaultSolrHighlighter exte
}
}//end field value loop
- // sort such that the fragments with the highest score come first
- if (!preserveMulti) {
- Collections.sort(frags, new Comparator<TextFragment>() {
- @Override
- public int compare(TextFragment arg0, TextFragment arg1) {
- return Math.round(arg1.getScore() - arg0.getScore());
- }
- });
- }
-
- // convert fragments back into text
- // TODO: we can include score and position information in output as snippet attributes
+ // Put the fragments onto the Solr response (docSummaries)
if (frags.size() > 0) {
- ArrayList<String> fragTexts = new ArrayList<>();
- for (TextFragment fragment: frags) {
- if (preserveMulti) {
- if (fragment != null) {
- fragTexts.add(fragment.toString());
+ // sort such that the fragments with the highest score come first
+ if (!preserveMulti) {
+ Collections.sort(frags, new Comparator<TextFragment>() {//TODO make TextFragment Comparable
+ @Override
+ public int compare(TextFragment arg0, TextFragment arg1) {
+ return Math.round(arg1.getScore() - arg0.getScore());
}
- } else {
- if ((fragment != null) && (fragment.getScore() > 0)) {
- fragTexts.add(fragment.toString());
- }
- }
+ });
+ }
- if (fragTexts.size() >= numFragments && !preserveMulti) break;
+ // Truncate list to hl.snippets, but not when hl.preserveMulti
+ if (frags.size() > numFragments && !preserveMulti) {
+ frags = frags.subList(0, numFragments);
}
- summaries = fragTexts.toArray(new String[0]);
- if (summaries.length > 0)
- docSummaries.add(fieldName, summaries);
- }
- // no summaries made, copy text from alternate field
- if (summaries == null || summaries.length == 0) {
- alternateField( docSummaries, params, doc, fieldName );
+ docSummaries.add(fieldName, getResponseForFragments(frags, req));
+ } else {
+ // no summaries made, copy text from alternate field
+ alternateField(docSummaries, params, doc, fieldName);
}
}
- /** Is this field *actually* multi-valued for this document's fields? */
- private boolean isActuallyMultiValued(List<StorableField> allFields, String fieldName) {
- boolean foundFirst = false;
- for (StorableField field : allFields) {
- if (field.name().equals(fieldName)) {
- if (foundFirst) {
- return true;//we found another
- } else {
- foundFirst = true;
- }
+ /** Fetches field values to highlight. If the field value should come from an atypical place (or another aliased
+ * field name, then a subclass could override to implement that.
+ */
+ protected List<String> getFieldValues(SolrQueryRequest req, StoredDocument doc, String fieldName,
+ int maxValues, int maxCharsToAnalyze) {
+ // Collect the Fields we will examine (could be more than one if multi-valued)
+ List<String> result = new ArrayList<>();
+ for (StorableField thisField : doc.getFields()) {
+ if (! thisField.name().equals(fieldName)) {
+ continue;
+ }
+ String value = thisField.stringValue();
+ result.add(value);
+
+ maxValues--;
+ if (maxValues <= 0 || maxCharsToAnalyze <= 0) {
+ break;
}
}
- return false;//0 or 1 value
+ return result;
}
- @SuppressWarnings("unchecked")
- private void doHighlightingByFastVectorHighlighter( FastVectorHighlighter highlighter, FieldQuery fieldQuery,
- SolrQueryRequest req, NamedList docSummaries, int docId, StoredDocument doc,
- String fieldName ) throws IOException {
- SolrParams params = req.getParams();
- SolrFragmentsBuilder solrFb = getSolrFragmentsBuilder( fieldName, params );
- String[] snippets = highlighter.getBestFragments( fieldQuery, req.getSearcher().getIndexReader(), docId, fieldName,
- params.getFieldInt( fieldName, HighlightParams.FRAGSIZE, 100 ),
- params.getFieldInt( fieldName, HighlightParams.SNIPPETS, 1 ),
- getFragListBuilder( fieldName, params ),
- getFragmentsBuilder( fieldName, params ),
- solrFb.getPreTags( params, fieldName ),
- solrFb.getPostTags( params, fieldName ),
- getEncoder( fieldName, params ) );
- if( snippets != null && snippets.length > 0 )
- docSummaries.add( fieldName, snippets );
- else
- alternateField( docSummaries, params, doc, fieldName );
+ /** Given the fragments, return the result to be put in the field {@link NamedList}. This is an extension
+ * point to allow adding other metadata like the absolute offsets or scores.
+ */
+ protected Object getResponseForFragments(List<TextFragment> frags, SolrQueryRequest req) {
+ // TODO: we can include score and position information in output as snippet attributes
+ ArrayList<String> fragTexts = new ArrayList<>();
+ for (TextFragment fragment : frags) {
+ fragTexts.add(fragment.toString());
+ }
+ return fragTexts.toArray(new String[fragTexts.size()]);
}
-
+
@SuppressWarnings("unchecked")
- private void alternateField( NamedList docSummaries, SolrParams params, StoredDocument doc, String fieldName ){
+ protected void alternateField( NamedList docSummaries, SolrParams params, StoredDocument doc, String fieldName ){
String alternateField = params.getFieldParam(fieldName, HighlightParams.ALTERNATE_FIELD);
if (alternateField != null && alternateField.length() > 0) {
StorableField[] docFields = doc.getFields(alternateField);
@@ -652,7 +654,7 @@ public class DefaultSolrHighlighter exte
}
}
- private TokenStream createAnalyzerTStream(IndexSchema schema, String fieldName, String docText) throws IOException {
+ protected TokenStream createAnalyzerTStream(IndexSchema schema, String fieldName, String docText) throws IOException {
return new TokenOrderingFilter(schema.getIndexAnalyzer().tokenStream(fieldName, docText), 10);
}
}