You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ma...@apache.org on 2011/04/20 05:43:27 UTC
svn commit: r1095260 - in /lucene/dev/trunk: lucene/contrib/
lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/
lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/
solr/ solr/src/java/org/apache/solr/highlight/
Author: markrmiller
Date: Wed Apr 20 03:43:27 2011
New Revision: 1095260
URL: http://svn.apache.org/viewvc?rev=1095260&view=rev
Log:
LUCENE-2939: Highlighter should try and use maxDocCharsToAnalyze in WeightedSpanTermExtractor when adding a new field to MemoryIndex as well as when using CachingTokenStream
SOLR-2390: Performance of usePhraseHighlighter is terrible on very large Documents, regardless of hl.maxDocCharsToAnalyze
Added:
lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/OffsetLimitTokenFilter.java (with props)
lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/OffsetLimitTokenFilterTest.java (with props)
Modified:
lucene/dev/trunk/lucene/contrib/CHANGES.txt
lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java
lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
lucene/dev/trunk/solr/CHANGES.txt
lucene/dev/trunk/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
Modified: lucene/dev/trunk/lucene/contrib/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/CHANGES.txt?rev=1095260&r1=1095259&r2=1095260&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/contrib/CHANGES.txt Wed Apr 20 03:43:27 2011
@@ -45,10 +45,15 @@ API Changes
======================= Lucene 3.x (not yet released) =======================
-Bug fixes
+Bug Fixes
* LUCENE-3026: SmartChineseAnalyzer's WordTokenFilter threw NullPointerException
on sentences longer than 32,767 characters. (wangzhenghang via Robert Muir)
+
+ * LUCENE-2939: Highlighter should try and use maxDocCharsToAnalyze in
+ WeightedSpanTermExtractor when adding a new field to MemoryIndex as well as
+ when using CachingTokenStream. This can be a significant performance bug for
+ large documents. (Mark Miller)
New Features
Modified: lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java?rev=1095260&r1=1095259&r2=1095260&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java (original)
+++ lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java Wed Apr 20 03:43:27 2011
@@ -197,6 +197,11 @@ public class Highlighter
tokenStream.reset();
TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size());
+
+ if (fragmentScorer instanceof QueryScorer) {
+ ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
+ }
+
TokenStream newStream = fragmentScorer.init(tokenStream);
if(newStream != null) {
tokenStream = newStream;
Added: lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/OffsetLimitTokenFilter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/OffsetLimitTokenFilter.java?rev=1095260&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/OffsetLimitTokenFilter.java (added)
+++ lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/OffsetLimitTokenFilter.java Wed Apr 20 03:43:27 2011
@@ -0,0 +1 @@
+package org.apache.lucene.search.highlight;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache
.lucene.analysis.tokenattributes.OffsetAttribute;
/**
* This TokenFilter limits the number of tokens while indexing by adding up the
* current offset.
*/
public final class OffsetLimitTokenFilter extends TokenFilter {
private int offsetCount;
private OffsetAttribute offsetAttrib = getAttribute(OffsetAttribute.class);
private int offsetLimit;
public OffsetLimitTokenFilter(TokenStream input, int offsetLimit) {
super(input);
this.offsetLimit = offsetLimit;
}
@Override
public boolean incrementToken() throws IOException {
if (offsetCount < offsetLimit && input.incrementToken()) {
int offsetLength = offsetAttrib.endOffset() - offsetAttrib.startOffset();
offsetCount += offsetLength;
return true;
}
return false;
}
@Override
public void reset() throws IOException {
super.reset();
offsetCount = 0;
}
}
\ No newline at end of file
Modified: lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java?rev=1095260&r1=1095259&r2=1095260&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java (original)
+++ lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java Wed Apr 20 03:43:27 2011
@@ -54,6 +54,7 @@ public class QueryScorer implements Scor
private IndexReader reader;
private boolean skipInitExtractor;
private boolean wrapToCaching = true;
+ private int maxCharsToAnalyze;
/**
* @param query Query to use for highlighting
@@ -209,7 +210,7 @@ public class QueryScorer implements Scor
private TokenStream initExtractor(TokenStream tokenStream) throws IOException {
WeightedSpanTermExtractor qse = defaultField == null ? new WeightedSpanTermExtractor()
: new WeightedSpanTermExtractor(defaultField);
-
+ qse.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
qse.setExpandMultiTermQuery(expandMultiTermQuery);
qse.setWrapIfNotCachingTokenFilter(wrapToCaching);
if (reader == null) {
@@ -265,4 +266,8 @@ public class QueryScorer implements Scor
public void setWrapIfNotCachingTokenFilter(boolean wrap) {
this.wrapToCaching = wrap;
}
+
+ public void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) {
+ this.maxCharsToAnalyze = maxDocCharsToAnalyze;
+ }
}
Modified: lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java?rev=1095260&r1=1095259&r2=1095260&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (original)
+++ lucene/dev/trunk/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java Wed Apr 20 03:43:27 2011
@@ -56,6 +56,7 @@ public class WeightedSpanTermExtractor {
private boolean expandMultiTermQuery;
private boolean cachedTokenStream;
private boolean wrapToCaching = true;
+ private int maxDocCharsToAnalyze;
public WeightedSpanTermExtractor() {
}
@@ -320,13 +321,13 @@ public class WeightedSpanTermExtractor {
private AtomicReaderContext getLeafContextForField(String field) throws IOException {
if(wrapToCaching && !cachedTokenStream && !(tokenStream instanceof CachingTokenFilter)) {
- tokenStream = new CachingTokenFilter(tokenStream);
+ tokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
cachedTokenStream = true;
}
AtomicReaderContext context = readers.get(field);
if (context == null) {
MemoryIndex indexer = new MemoryIndex();
- indexer.addField(field, tokenStream);
+ indexer.addField(field, new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
tokenStream.reset();
IndexSearcher searcher = indexer.createSearcher();
// MEM index has only atomic ctx
@@ -545,4 +546,8 @@ public class WeightedSpanTermExtractor {
public void setWrapIfNotCachingTokenFilter(boolean wrap) {
this.wrapToCaching = wrap;
}
+
+ protected final void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) {
+ this.maxDocCharsToAnalyze = maxDocCharsToAnalyze;
+ }
}
Added: lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/OffsetLimitTokenFilterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/OffsetLimitTokenFilterTest.java?rev=1095260&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/OffsetLimitTokenFilterTest.java (added)
+++ lucene/dev/trunk/lucene/contrib/highlighter/src/test/org/apache/lucene/search/highlight/OffsetLimitTokenFilterTest.java Wed Apr 20 03:43:27 2011
@@ -0,0 +1 @@
+package org.apache.lucene.search.highlight;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseToken
StreamTestCase;
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.TokenStream;
public class OffsetLimitTokenFilterTest extends BaseTokenStreamTestCase {
public void testFilter() throws Exception {
TokenStream stream = new MockTokenizer(new StringReader(
"short toolong evenmuchlongertext a ab toolong foo"),
MockTokenizer.WHITESPACE, false);
OffsetLimitTokenFilter filter = new OffsetLimitTokenFilter(stream, 10);
assertTokenStreamContents(filter, new String[] {"short", "toolong"});
stream = new MockTokenizer(new StringReader(
"short toolong evenmuchlongertext a ab toolong foo"),
MockTokenizer.WHITESPACE, false);
filter = new OffsetLimitTokenFilter(stream, 12);
assertTokenStreamContents(filter, new String[] {"short", "toolong"});
stream = new MockTokenizer(new StringReader(
"short toolong evenmuchlongertext a ab toolong foo"),
MockTokenizer.WHITESPACE, false);
filter = new OffsetLimitTokenFilter(stream, 30);
assertTokenStreamContents(filter, new String[] {"short", "toolong",
"evenmuchlongertext"});
checkOneTermReuse(new Analyzer() {
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
return new OffsetLimitTokenFilter(new MockTokenizer(reader,
MockTokenizer.WHITESPACE, false), 10);
}
}, "llenges", "llenges");
}
}
\ No newline at end of file
Modified: lucene/dev/trunk/solr/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/CHANGES.txt?rev=1095260&r1=1095259&r2=1095260&view=diff
==============================================================================
--- lucene/dev/trunk/solr/CHANGES.txt (original)
+++ lucene/dev/trunk/solr/CHANGES.txt Wed Apr 20 03:43:27 2011
@@ -278,6 +278,9 @@ Bug Fixes
* SOLR-2333: The "rename" core admin action does not persist the new name to solr.xml
(Rasmus Hahn, Paul R. Brown via Mark Miller)
+
+* SOLR-2390: Performance of usePhraseHighlighter is terrible on very large Documents,
+ regardless of hl.maxDocCharsToAnalyze. (Mark Miller)
Other Changes
----------------------
Modified: lucene/dev/trunk/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java?rev=1095260&r1=1095259&r2=1095260&view=diff
==============================================================================
--- lucene/dev/trunk/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java (original)
+++ lucene/dev/trunk/solr/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java Wed Apr 20 03:43:27 2011
@@ -435,12 +435,20 @@ public class DefaultSolrHighlighter exte
// fall back to analyzer
tstream = createAnalyzerTStream(schema, fieldName, docTexts[j]);
}
-
+
+ int maxCharsToAnalyze = params.getFieldInt(fieldName,
+ HighlightParams.MAX_CHARS,
+ Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);
+
Highlighter highlighter;
if (Boolean.valueOf(req.getParams().get(HighlightParams.USE_PHRASE_HIGHLIGHTER, "true"))) {
// TODO: this is not always necessary - eventually we would like to avoid this wrap
// when it is not needed.
- tstream = new CachingTokenFilter(tstream);
+ if (maxCharsToAnalyze < 0) {
+ tstream = new CachingTokenFilter(tstream);
+ } else {
+ tstream = new CachingTokenFilter(new OffsetLimitTokenFilter(tstream, maxCharsToAnalyze));
+ }
// get highlighter
highlighter = getPhraseHighlighter(query, fieldName, req, (CachingTokenFilter) tstream);
@@ -453,9 +461,6 @@ public class DefaultSolrHighlighter exte
highlighter = getHighlighter(query, fieldName, req);
}
- int maxCharsToAnalyze = params.getFieldInt(fieldName,
- HighlightParams.MAX_CHARS,
- Highlighter.DEFAULT_MAX_CHARS_TO_ANALYZE);
if (maxCharsToAnalyze < 0) {
highlighter.setMaxDocCharsToAnalyze(docTexts[j].length());
} else {