You are viewing a plain text version of this content. The canonical link for it is here.
Posted to solr-commits@lucene.apache.org by kl...@apache.org on 2008/07/08 01:52:36 UTC
svn commit: r674677 - in /lucene/solr/trunk: CHANGES.txt
src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
src/test/org/apache/solr/highlight/HighlighterTest.java
Author: klaas
Date: Mon Jul 7 16:52:36 2008
New Revision: 674677
URL: http://svn.apache.org/viewvc?rev=674677&view=rev
Log:
SOLR-556 , SOLR-610
Modified:
lucene/solr/trunk/CHANGES.txt
lucene/solr/trunk/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
lucene/solr/trunk/src/test/org/apache/solr/highlight/HighlighterTest.java
Modified: lucene/solr/trunk/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/CHANGES.txt?rev=674677&r1=674676&r2=674677&view=diff
==============================================================================
--- lucene/solr/trunk/CHANGES.txt (original)
+++ lucene/solr/trunk/CHANGES.txt Mon Jul 7 16:52:36 2008
@@ -309,6 +309,8 @@
58. SOLR-502: Add search timeout support. (Sean Timm via yonik)
59. SOLR-605: Add the ability to register callbacks programmatically (ryan, Noble Paul)
+
+60. SOLR-610: hl.maxAnalyzedChars can be -1 to highlight everything (Lars Kotthoff via klaas)
Changes in runtime behavior
1. SOLR-559: use Lucene updateDocument, deleteDocuments methods. This
@@ -464,6 +466,7 @@
via useMultiPartPost in CommonsHttpSolrServer.
(Lars Kotthoff, Andrew Schurman, ryan, yonik)
+40. SOLR-556: multi-valued fields always highlighted in disparate snippets (Lars Kotthoff via klaas)
Other Changes
1. SOLR-135: Moved common classes to org.apache.solr.common and altered the
Modified: lucene/solr/trunk/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java?rev=674677&r1=674676&r2=674677&view=diff
==============================================================================
--- lucene/solr/trunk/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java (original)
+++ lucene/solr/trunk/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java Mon Jul 7 16:52:36 2008
@@ -20,6 +20,7 @@
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Collections;
+import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
@@ -27,7 +28,6 @@
import java.util.ListIterator;
import java.util.Map;
import java.util.Set;
-import java.util.logging.Logger;
import javax.xml.xpath.XPathConstants;
@@ -108,9 +108,6 @@
highlighter = new Highlighter(getFormatter(fieldName, params), getSpanQueryScorer(query, fieldName, tokenStream, request));
highlighter.setTextFragmenter(getFragmenter(fieldName, params));
- highlighter.setMaxDocBytesToAnalyze(params.getFieldInt(
- fieldName, HighlightParams.MAX_CHARS,
- Highlighter.DEFAULT_MAX_DOC_BYTES_TO_ANALYZE));
return highlighter;
}
@@ -127,9 +124,6 @@
getFormatter(fieldName, params),
getQueryScorer(query, fieldName, request));
highlighter.setTextFragmenter(getFragmenter(fieldName, params));
- highlighter.setMaxDocBytesToAnalyze(params.getFieldInt(
- fieldName, HighlightParams.MAX_CHARS,
- Highlighter.DEFAULT_MAX_DOC_BYTES_TO_ANALYZE));
return highlighter;
}
@@ -272,71 +266,75 @@
if (docTexts == null) continue;
TokenStream tstream = null;
+ int numFragments = getMaxSnippets(fieldName, params);
+ boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);
- // create TokenStream
- if (docTexts.length == 1) {
- // single-valued field
+ String[] summaries = null;
+ List<TextFragment> frags = new ArrayList<TextFragment>();
+ for (int j = 0; j < docTexts.length; j++) {
+ // create TokenStream
try {
// attempt term vectors
tstream = TokenSources.getTokenStream(searcher.getReader(), docId, fieldName);
}
catch (IllegalArgumentException e) {
// fall back to analyzer
- tstream = new TokenOrderingFilter(schema.getAnalyzer().tokenStream(fieldName, new StringReader(docTexts[0])), 10);
+ tstream = new TokenOrderingFilter(schema.getAnalyzer().tokenStream(fieldName, new StringReader(docTexts[j])), 10);
+ }
+
+ Highlighter highlighter;
+ if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER))) {
+ // wrap CachingTokenFilter around TokenStream for reuse
+ tstream = new CachingTokenFilter(tstream);
+
+ // get highlighter
+ highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream);
+
+ // after highlighter initialization, reset tstream since construction of highlighter already used it
+ tstream.reset();
+ }
+ else {
+ // use "the old way"
+ highlighter = getHighlighter(query, fieldName, req);
}
- }
- else {
- // multi-valued field
- tstream = new MultiValueTokenStream(fieldName, docTexts, schema.getAnalyzer(), true);
- }
-
- Highlighter highlighter;
-
- if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER))) {
- // wrap CachingTokenFilter around TokenStream for reuse
- tstream = new CachingTokenFilter(tstream);
- // get highlighter
- highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream);
+ int maxCharsToAnalyze = params.getFieldInt(fieldName,
+ HighlightParams.MAX_CHARS,
+ Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);
+ if (maxCharsToAnalyze < 0) {
+ highlighter.setMaxDocCharsToAnalyze(docTexts[j].length());
+ } else {
+ highlighter.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
+ }
- // after highlighter initialization, reset tstream since construction of highlighter already used it
- tstream.reset();
- }
- else {
- // use "the old way"
- highlighter = getHighlighter(query, fieldName, req);
+ TextFragment[] bestTextFragments = highlighter.getBestTextFragments(tstream, docTexts[j], mergeContiguousFragments, numFragments);
+ for (int k = 0; k < bestTextFragments.length; k++) {
+ if ((bestTextFragments[k] != null) && (bestTextFragments[k].getScore() > 0)) {
+ frags.add(bestTextFragments[k]);
+ }
+ }
}
-
- int numFragments = getMaxSnippets(fieldName, params);
- boolean mergeContiguousFragments = isMergeContiguousFragments(fieldName, params);
-
- String[] summaries = null;
- TextFragment[] frag;
- if (docTexts.length == 1) {
- frag = highlighter.getBestTextFragments(tstream, docTexts[0], mergeContiguousFragments, numFragments);
- }
- else {
- StringBuilder singleValue = new StringBuilder();
-
- for (String txt:docTexts) {
- singleValue.append(txt);
- }
-
- frag = highlighter.getBestTextFragments(tstream, singleValue.toString(), false, numFragments);
- }
+ // sort such that the fragments with the highest score come first
+ Collections.sort(frags, new Comparator<TextFragment>() {
+ public int compare(TextFragment arg0, TextFragment arg1) {
+ return Math.round(arg1.getScore() - arg0.getScore());
+ }
+ });
+
// convert fragments back into text
// TODO: we can include score and position information in output as snippet attributes
- if (frag.length > 0) {
- ArrayList<String> fragTexts = new ArrayList<String>();
- for (int j = 0; j < frag.length; j++) {
- if ((frag[j] != null) && (frag[j].getScore() > 0)) {
- fragTexts.add(frag[j].toString());
- }
+ if (frags.size() > 0) {
+ ArrayList<String> fragTexts = new ArrayList<String>();
+ for (TextFragment fragment: frags) {
+ if ((fragment != null) && (fragment.getScore() > 0)) {
+ fragTexts.add(fragment.toString());
}
- summaries = fragTexts.toArray(new String[0]);
- if (summaries.length > 0)
- docSummaries.add(fieldName, summaries);
- }
+ if (fragTexts.size() >= numFragments) break;
+ }
+ summaries = fragTexts.toArray(new String[0]);
+ if (summaries.length > 0)
+ docSummaries.add(fieldName, summaries);
+ }
// no summaries made, copy text from alternate field
if (summaries == null || summaries.length == 0) {
String alternateField = req.getParams().getFieldParam(fieldName, HighlightParams.ALTERNATE_FIELD);
@@ -370,80 +368,6 @@
}
}
-/**
- * Creates a single TokenStream out multi-value field values.
- */
-class MultiValueTokenStream extends TokenStream {
- private String fieldName;
- private String[] values;
- private Analyzer analyzer;
- private int curIndex; // next index into the values array
- private int curOffset; // offset into concatenated string
- private TokenStream currentStream; // tokenStream currently being iterated
- private boolean orderTokenOffsets;
-
- /** Constructs a TokenStream for consecutively-analyzed field values
- *
- * @param fieldName name of the field
- * @param values array of field data
- * @param analyzer analyzer instance
- */
- public MultiValueTokenStream(String fieldName, String[] values,
- Analyzer analyzer, boolean orderTokenOffsets) {
- this.fieldName = fieldName;
- this.values = values;
- this.analyzer = analyzer;
- curIndex = -1;
- curOffset = 0;
- currentStream = null;
- this.orderTokenOffsets=orderTokenOffsets;
- }
-
- /** Returns the next token in the stream, or null at EOS. */
- @Override
- public Token next() throws IOException {
- int extra = 0;
- if(currentStream == null) {
- curIndex++;
- if(curIndex < values.length) {
- currentStream = analyzer.tokenStream(fieldName,
- new StringReader(values[curIndex]));
- if (orderTokenOffsets) currentStream = new TokenOrderingFilter(currentStream,10);
- // add extra space between multiple values
- if(curIndex > 0)
- extra = analyzer.getPositionIncrementGap(fieldName);
- } else {
- return null;
- }
- }
- Token nextToken = currentStream.next();
- if(nextToken == null) {
- curOffset += values[curIndex].length();
- currentStream = null;
- return next();
- }
- // create an modified token which is the offset into the concatenated
- // string of all values
- Token offsetToken = new Token(nextToken.termText(),
- nextToken.startOffset() + curOffset,
- nextToken.endOffset() + curOffset);
- offsetToken.setPositionIncrement(nextToken.getPositionIncrement() + extra*10);
- return offsetToken;
- }
-
- /**
- * Returns all values as a single String into which the Tokens index with
- * their offsets.
- */
- public String asSingleValue() {
- StringBuilder sb = new StringBuilder();
- for(String str : values)
- sb.append(str);
- return sb.toString();
- }
-}
-
-
/** Orders Tokens in a window first by their startOffset ascending.
* endOffset is currently ignored.
* This is meant to work around fickleness in the highlighter only. It
Modified: lucene/solr/trunk/src/test/org/apache/solr/highlight/HighlighterTest.java
URL: http://svn.apache.org/viewvc/lucene/solr/trunk/src/test/org/apache/solr/highlight/HighlighterTest.java?rev=674677&r1=674676&r2=674677&view=diff
==============================================================================
--- lucene/solr/trunk/src/test/org/apache/solr/highlight/HighlighterTest.java (original)
+++ lucene/solr/trunk/src/test/org/apache/solr/highlight/HighlighterTest.java Mon Jul 7 16:52:36 2008
@@ -185,6 +185,26 @@
);
}
+
+ public void testMultiValueBestFragmentHighlight() {
+ HashMap<String,String> args = new HashMap<String,String>();
+ args.put("hl", "true");
+ args.put("hl.fl", "textgap");
+ args.put("df", "textgap");
+ TestHarness.LocalRequestFactory sumLRF = h.getRequestFactory(
+ "standard", 0, 200, args);
+
+ assertU(adoc("textgap", "first entry has one word foo",
+ "textgap", "second entry has both words foo bar",
+ "id", "1"));
+ assertU(commit());
+ assertU(optimize());
+ assertQ("Best fragment summarization",
+ sumLRF.makeRequest("foo bar"),
+ "//lst[@name='highlighting']/lst[@name='1']",
+ "//lst[@name='1']/arr[@name='textgap']/str[.=\'second entry has both words <em>foo</em> <em>bar</em>\']"
+ );
+ }
public void testDefaultFieldHighlight() {
@@ -361,6 +381,13 @@
"//lst[@name='highlighting']/lst[@name='1']",
"//lst[@name='1'][not(*)]"
);
+ args.put("hl.maxAnalyzedChars", "-1");
+ sumLRF = h.getRequestFactory("standard", 0, 200, args);
+ assertQ("token at start of text",
+ sumLRF.makeRequest("t_text:disjoint"),
+ "//lst[@name='highlighting']/lst[@name='1']",
+ "//lst[@name='1']/arr[count(str)=1]"
+ );
}
public void testRegexFragmenter() {
HashMap<String,String> args = new HashMap<String,String>();