You are viewing a plain text version of this content; the original hyperlink to the canonical archived message was lost in the plain-text conversion.
Posted to commits@lucene.apache.org by ds...@apache.org on 2015/04/28 17:53:42 UTC
svn commit: r1676571 - in /lucene/dev/branches/branch_5x: ./ dev-tools/
dev-tools/idea/lucene/highlighter/ lucene/ lucene/benchmark/
lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/
lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/ ...
Author: dsmiley
Date: Tue Apr 28 15:53:41 2015
New Revision: 1676571
URL: http://svn.apache.org/r1676571
Log:
LUCENE-6445: Highlighter TokenSources simplification
Modified:
lucene/dev/branches/branch_5x/ (props changed)
lucene/dev/branches/branch_5x/dev-tools/ (props changed)
lucene/dev/branches/branch_5x/dev-tools/idea/lucene/highlighter/highlighter.iml
lucene/dev/branches/branch_5x/lucene/ (props changed)
lucene/dev/branches/branch_5x/lucene/CHANGES.txt (contents, props changed)
lucene/dev/branches/branch_5x/lucene/benchmark/ (props changed)
lucene/dev/branches/branch_5x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java
lucene/dev/branches/branch_5x/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/CountingHighlighterTestTask.java
lucene/dev/branches/branch_5x/lucene/highlighter/ (props changed)
lucene/dev/branches/branch_5x/lucene/highlighter/build.xml
lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
lucene/dev/branches/branch_5x/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java
lucene/dev/branches/branch_5x/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
lucene/dev/branches/branch_5x/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
lucene/dev/branches/branch_5x/solr/ (props changed)
lucene/dev/branches/branch_5x/solr/core/ (props changed)
lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
Modified: lucene/dev/branches/branch_5x/dev-tools/idea/lucene/highlighter/highlighter.iml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/dev-tools/idea/lucene/highlighter/highlighter.iml?rev=1676571&r1=1676570&r2=1676571&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/dev-tools/idea/lucene/highlighter/highlighter.iml (original)
+++ lucene/dev/branches/branch_5x/dev-tools/idea/lucene/highlighter/highlighter.iml Tue Apr 28 15:53:41 2015
@@ -18,5 +18,6 @@
<orderEntry type="module" module-name="queries" />
<orderEntry type="module" module-name="lucene-core" />
<orderEntry type="module" module-name="join" />
+ <orderEntry type="module" module-name="analysis-common" />
</component>
</module>
Modified: lucene/dev/branches/branch_5x/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/CHANGES.txt?rev=1676571&r1=1676570&r2=1676571&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/CHANGES.txt (original)
+++ lucene/dev/branches/branch_5x/lucene/CHANGES.txt Tue Apr 28 15:53:41 2015
@@ -118,6 +118,9 @@ API Changes
* LUCENE-6446: Simplified Explanation API. (Adrien Grand)
+* LUCENE-6445: Two new methods in Highlighter's TokenSources; the existing
+ methods are now marked deprecated. (David Smiley)
+
Other
* LUCENE-6413: Test runner should report the number of suites completed/
Modified: lucene/dev/branches/branch_5x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java?rev=1676571&r1=1676570&r2=1676571&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java (original)
+++ lucene/dev/branches/branch_5x/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java Tue Apr 28 15:53:41 2015
@@ -17,6 +17,11 @@ package org.apache.lucene.benchmark.byTa
* limitations under the License.
*/
+import java.util.Collection;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.Set;
+
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.benchmark.byTask.PerfRunData;
@@ -29,11 +34,6 @@ import org.apache.lucene.search.highligh
import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.search.highlight.TokenSources;
-import java.util.Set;
-import java.util.Collection;
-import java.util.HashSet;
-import java.util.Collections;
-
/**
* Search and Traverse and Retrieve docs task. Highlight the fields in the retrieved documents.
*
@@ -102,7 +102,8 @@ public class SearchTravRetHighlightTask
@Override
public int doHighlight(IndexReader reader, int doc, String field,
Document document, Analyzer analyzer, String text) throws Exception {
- TokenStream ts = TokenSources.getAnyTokenStream(reader, doc, field, document, analyzer);
+ final int maxStartOffset = highlighter.getMaxDocCharsToAnalyze() - 1;
+ TokenStream ts = TokenSources.getTokenStream(field, reader.getTermVectors(doc), text, analyzer, maxStartOffset);
TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFrags);
return frag != null ? frag.length : 0;
}
Modified: lucene/dev/branches/branch_5x/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/CountingHighlighterTestTask.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/CountingHighlighterTestTask.java?rev=1676571&r1=1676570&r2=1676571&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/CountingHighlighterTestTask.java (original)
+++ lucene/dev/branches/branch_5x/lucene/benchmark/src/test/org/apache/lucene/benchmark/byTask/tasks/CountingHighlighterTestTask.java Tue Apr 28 15:53:41 2015
@@ -17,19 +17,19 @@
package org.apache.lucene.benchmark.byTask.tasks;
-import org.apache.lucene.benchmark.byTask.PerfRunData;
-import org.apache.lucene.analysis.TokenStream;
+import java.io.IOException;
+
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.benchmark.byTask.PerfRunData;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
-import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.search.highlight.QueryScorer;
+import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
+import org.apache.lucene.search.highlight.TextFragment;
import org.apache.lucene.search.highlight.TokenSources;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.index.IndexReader;
-
-import java.io.IOException;
/**
* Test Search task which counts number of searches.
@@ -58,7 +58,8 @@ public class CountingHighlighterTestTask
return new BenchmarkHighlighter() {
@Override
public int doHighlight(IndexReader reader, int doc, String field, Document document, Analyzer analyzer, String text) throws Exception {
- TokenStream ts = TokenSources.getAnyTokenStream(reader, doc, field, document, analyzer);
+ final int maxStartOffset = highlighter.getMaxDocCharsToAnalyze() - 1;
+ TokenStream ts = TokenSources.getTokenStream(field, reader.getTermVectors(doc), text, analyzer, maxStartOffset);
TextFragment[] frag = highlighter.getBestTextFragments(ts, text, mergeContiguous, maxFrags);
numHighlightedResults += frag != null ? frag.length : 0;
return frag != null ? frag.length : 0;
Modified: lucene/dev/branches/branch_5x/lucene/highlighter/build.xml
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/highlighter/build.xml?rev=1676571&r1=1676570&r2=1676571&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/highlighter/build.xml (original)
+++ lucene/dev/branches/branch_5x/lucene/highlighter/build.xml Tue Apr 28 15:53:41 2015
@@ -32,12 +32,12 @@
<pathelement path="${memory.jar}"/>
<pathelement path="${queries.jar}"/>
<pathelement path="${join.jar}"/>
+ <pathelement path="${analyzers-common.jar}"/>
<path refid="base.classpath"/>
</path>
- <target name="init" depends="module-build.init,jar-memory,jar-queries,jar-join"/>
+ <target name="compile-core" depends="jar-memory,jar-queries,jar-join,jar-analyzers-common,common.compile-core" />
- <target name="compile-core" depends="jar-memory, common.compile-core, jar-join" />
<target name="javadocs" depends="javadocs-memory,compile-core,check-javadocs-uptodate"
unless="javadocs-uptodate-${name}">
<invoke-module-javadoc>
Modified: lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java?rev=1676571&r1=1676570&r2=1676571&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (original)
+++ lucene/dev/branches/branch_5x/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java Tue Apr 28 15:53:41 2015
@@ -24,24 +24,82 @@ import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.miscellaneous.LimitTokenOffsetFilter;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Terms;
/**
- * Hides implementation issues associated with obtaining a TokenStream for use
- * with the higlighter - can obtain from TermFreqVectors with offsets and
- * (optionally) positions or from Analyzer class reparsing the stored content.
+ * Convenience methods for obtaining a {@link TokenStream} for use with the {@link Highlighter} - can obtain from
+ * term vectors with offsets and positions or from an Analyzer re-parsing the stored content.
+ *
+ * @see TokenStreamFromTermVector
*/
public class TokenSources {
+
+ private TokenSources() {}
+
+ /**
+ * Get a token stream from either un-inverting a term vector if possible, or by analyzing the text.
+ *
+ * WARNING: Don't call this if there is more than one value for this field. If there are, and if there are term
+ * vectors, then there is a single tokenstream with offsets suggesting all the field values were concatenated.
+ *
+ * @param field The field to either get term vectors from or to analyze the text from.
+ * @param tvFields from {@link IndexReader#getTermVectors(int)}. Possibly null. For performance, this instance should
+ * be re-used for the same document (e.g. when highlighting multiple fields).
+ * @param text the text to analyze, failing term vector un-inversion
+ * @param analyzer the analyzer to analyze {@code text} with, failing term vector un-inversion
+ * @param maxStartOffset Terms with a startOffset greater than this aren't returned. Use -1 for no limit.
+ * Suggest using {@link Highlighter#getMaxDocCharsToAnalyze()} - 1.
+ *
+ * @return a token stream from either term vectors, or from analyzing the text. Never null.
+ */
+ public static TokenStream getTokenStream(String field, Fields tvFields, String text, Analyzer analyzer,
+ int maxStartOffset) throws IOException {
+ TokenStream tokenStream = getTermVectorTokenStreamOrNull(field, tvFields, maxStartOffset);
+ if (tokenStream != null) {
+ return tokenStream;
+ }
+ tokenStream = analyzer.tokenStream(field, text);
+ if (maxStartOffset >= 0 && maxStartOffset < text.length() - 1) {
+ tokenStream = new LimitTokenOffsetFilter(tokenStream, maxStartOffset);
+ }
+ return tokenStream;
+ }
+
+ /**
+ * Get a token stream by un-inverting the term vector. This method returns null if {@code tvFields} is null
+ * or if the field has no term vector, or if the term vector doesn't have offsets. Positions are recommended on the
+ * term vector but it isn't strictly required.
+ *
+ * @param field The field to get term vectors from.
+ * @param tvFields from {@link IndexReader#getTermVectors(int)}. Possibly null. For performance, this instance should
+ * be re-used for the same document (e.g. when highlighting multiple fields).
+ * @param maxStartOffset Terms with a startOffset greater than this aren't returned. Use -1 for no limit.
+ * Suggest using {@link Highlighter#getMaxDocCharsToAnalyze()} - 1
+ * @return a token stream from term vectors. Null if no term vectors with the right options.
+ */
+ public static TokenStream getTermVectorTokenStreamOrNull(String field, Fields tvFields, int maxStartOffset)
+ throws IOException {
+ if (tvFields == null) {
+ return null;
+ }
+ final Terms tvTerms = tvFields.terms(field);
+ if (tvTerms == null || !tvTerms.hasOffsets()) {
+ return null;
+ }
+ return new TokenStreamFromTermVector(tvTerms, maxStartOffset);
+ }
+
/**
* A convenience method that tries to first get a {@link TokenStreamFromTermVector} for the
* specified docId, then, falls back to using the passed in
* {@link org.apache.lucene.document.Document} to retrieve the TokenStream.
* This is useful when you already have the document, but would prefer to use
* the vector first.
- *
+ *
* @param reader The {@link org.apache.lucene.index.IndexReader} to use to try
* and get the vector from
* @param docId The docId to retrieve.
@@ -54,7 +112,7 @@ public class TokenSources {
* {@link org.apache.lucene.document.Document}
* @throws IOException if there was an error loading
*/
-
+ @Deprecated // maintenance reasons LUCENE-6445
public static TokenStream getAnyTokenStream(IndexReader reader, int docId,
String field, Document document, Analyzer analyzer) throws IOException {
TokenStream ts = null;
@@ -83,6 +141,7 @@ public class TokenSources {
* @return null if field not stored correctly
* @throws IOException If there is a low-level I/O error
*/
+ @Deprecated // maintenance reasons LUCENE-6445
public static TokenStream getAnyTokenStream(IndexReader reader, int docId,
String field, Analyzer analyzer) throws IOException {
TokenStream ts = null;
@@ -103,7 +162,7 @@ public class TokenSources {
}
/** Simply calls {@link #getTokenStream(org.apache.lucene.index.Terms)} now. */
- @Deprecated
+ @Deprecated // maintenance reasons LUCENE-6445
public static TokenStream getTokenStream(Terms vector,
boolean tokenPositionsGuaranteedContiguous) throws IOException {
return getTokenStream(vector);
@@ -119,6 +178,7 @@ public class TokenSources {
*
* @throws IllegalArgumentException if no offsets are available
*/
+ @Deprecated // maintenance reasons LUCENE-6445
public static TokenStream getTokenStream(final Terms tpv) throws IOException {
if (!tpv.hasOffsets()) {
@@ -144,6 +204,7 @@ public class TokenSources {
*
* @see #getTokenStream(org.apache.lucene.index.Terms)
*/
+ @Deprecated // maintenance reasons LUCENE-6445
public static TokenStream getTokenStreamWithOffsets(IndexReader reader, int docId,
String field) throws IOException {
@@ -164,13 +225,14 @@ public class TokenSources {
return getTokenStream(vector);
}
- // convenience method
+ @Deprecated // maintenance reasons LUCENE-6445
public static TokenStream getTokenStream(IndexReader reader, int docId,
String field, Analyzer analyzer) throws IOException {
Document doc = reader.document(docId);
return getTokenStream(doc, field, analyzer);
}
-
+
+ @Deprecated // maintenance reasons LUCENE-6445
public static TokenStream getTokenStream(Document doc, String field,
Analyzer analyzer) {
String contents = doc.get(field);
@@ -181,7 +243,7 @@ public class TokenSources {
return getTokenStream(field, contents, analyzer);
}
- // convenience method
+ @Deprecated // maintenance reasons LUCENE-6445
public static TokenStream getTokenStream(String field, String contents,
Analyzer analyzer) {
try {
Modified: lucene/dev/branches/branch_5x/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java?rev=1676571&r1=1676570&r2=1676571&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java (original)
+++ lucene/dev/branches/branch_5x/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterPhraseTest.java Tue Apr 28 15:53:41 2015
@@ -23,19 +23,18 @@ import org.apache.lucene.analysis.MockAn
import org.apache.lucene.analysis.MockTokenizer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
-import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Term;
-import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
@@ -45,11 +44,12 @@ import org.apache.lucene.search.spans.Sp
import org.apache.lucene.search.spans.SpanQuery;
import org.apache.lucene.search.spans.SpanTermQuery;
import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.LuceneTestCase;
import org.apache.lucene.util.FixedBitSet;
+import org.apache.lucene.util.LuceneTestCase;
public class HighlighterPhraseTest extends LuceneTestCase {
private static final String FIELD = "text";
+
public void testConcurrentPhrase() throws IOException, InvalidTokenOffsetsException {
final String TEXT = "the fox jumped";
final Directory directory = newDirectory();
@@ -80,9 +80,8 @@ public class HighlighterPhraseTest exten
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
new QueryScorer(phraseQuery));
- final TokenStream tokenStream = TokenSources
- .getTokenStream(indexReader.getTermVector(
- 0, FIELD));
+ final TokenStream tokenStream =
+ TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
assertEquals(highlighter.getBestFragment(new TokenStreamConcurrent(),
TEXT), highlighter.getBestFragment(tokenStream, TEXT));
} finally {
@@ -147,9 +146,8 @@ public class HighlighterPhraseTest exten
for (int position = bitset.nextSetBit(0); position < maxDoc-1; position = bitset
.nextSetBit(position + 1)) {
assertEquals(0, position);
- final TokenStream tokenStream = TokenSources.getTokenStream(
- indexReader.getTermVector(position,
- FIELD));
+ final TokenStream tokenStream =
+ TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(position), -1);
assertEquals(highlighter.getBestFragment(new TokenStreamConcurrent(),
TEXT), highlighter.getBestFragment(tokenStream, TEXT));
}
@@ -189,9 +187,8 @@ public class HighlighterPhraseTest exten
final Highlighter highlighter = new Highlighter(
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
new QueryScorer(phraseQuery));
- final TokenStream tokenStream = TokenSources
- .getTokenStream(indexReader.getTermVector(
- 0, FIELD));
+ final TokenStream tokenStream =
+ TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
assertEquals(
highlighter.getBestFragment(new TokenStreamSparse(), TEXT),
highlighter.getBestFragment(tokenStream, TEXT));
@@ -230,8 +227,8 @@ public class HighlighterPhraseTest exten
final Highlighter highlighter = new Highlighter(
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
new QueryScorer(phraseQuery));
- final TokenStream tokenStream = TokenSources.getTokenStream(
- indexReader.getTermVector(0, FIELD));
+ final TokenStream tokenStream =
+ TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
assertEquals("the fox <B>did</B> not <B>jump</B>", highlighter
.getBestFragment(tokenStream, TEXT));
} finally {
@@ -269,9 +266,8 @@ public class HighlighterPhraseTest exten
final Highlighter highlighter = new Highlighter(
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
new QueryScorer(phraseQuery));
- final TokenStream tokenStream = TokenSources
- .getTokenStream(indexReader.getTermVector(
- 0, FIELD));
+ final TokenStream tokenStream =
+ TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
assertEquals(
highlighter.getBestFragment(new TokenStreamSparse(), TEXT),
highlighter.getBestFragment(tokenStream, TEXT));
Modified: lucene/dev/branches/branch_5x/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java?rev=1676571&r1=1676570&r2=1676571&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (original)
+++ lucene/dev/branches/branch_5x/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java Tue Apr 28 15:53:41 2015
@@ -17,6 +17,8 @@ package org.apache.lucene.search.highlig
* limitations under the License.
*/
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
@@ -29,9 +31,6 @@ import java.util.List;
import java.util.Map;
import java.util.StringTokenizer;
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CachingTokenFilter;
@@ -131,10 +130,15 @@ public class HighlighterTest extends Bas
"JFK has been shot", "John Kennedy has been shot",
"This text has a typo in referring to Keneddy",
"wordx wordy wordz wordx wordy wordx worda wordb wordy wordc", "y z x y z a b", "lets is a the lets is a the lets is a the lets" };
-
- public void testQueryScorerHits() throws Exception {
- Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
+ // Convenience method for succinct tests; doesn't represent "best practice"
+ private TokenStream getAnyTokenStream(String fieldName, int docId)
+ throws IOException {
+ return TokenSources.getTokenStream(fieldName, searcher.getIndexReader().getTermVectors(docId),
+ searcher.doc(docId).get(fieldName), analyzer, -1);
+ }
+
+ public void testQueryScorerHits() throws Exception {
PhraseQuery phraseQuery = new PhraseQuery();
phraseQuery.add(new Term(FIELD_NAME, "very"));
phraseQuery.add(new Term(FIELD_NAME, "long"));
@@ -148,11 +152,11 @@ public class HighlighterTest extends Bas
for (int i = 0; i < hits.scoreDocs.length; i++) {
- Document doc = searcher.doc(hits.scoreDocs[i].doc);
+ final int docId = hits.scoreDocs[i].doc;
+ Document doc = searcher.doc(docId);
String storedField = doc.get(FIELD_NAME);
- TokenStream stream = TokenSources.getAnyTokenStream(searcher
- .getIndexReader(), hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
+ TokenStream stream = getAnyTokenStream(FIELD_NAME, docId);
Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
@@ -176,21 +180,21 @@ public class HighlighterTest extends Bas
QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
Highlighter highlighter = new Highlighter(scorer);
- Document doc = searcher.doc(hits.scoreDocs[0].doc);
+ final int docId0 = hits.scoreDocs[0].doc;
+ Document doc = searcher.doc(docId0);
String storedField = doc.get(FIELD_NAME);
- TokenStream stream = TokenSources.getAnyTokenStream(searcher
- .getIndexReader(), hits.scoreDocs[0].doc, FIELD_NAME, doc, analyzer);
+ TokenStream stream = getAnyTokenStream(FIELD_NAME, docId0);
Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
highlighter.setTextFragmenter(fragmenter);
String fragment = highlighter.getBestFragment(stream, storedField);
assertEquals("Hello this is a piece of text that is <B>very</B> <B>long</B> and contains too much preamble and the meat is really here which says kennedy has been shot", fragment);
-
- doc = searcher.doc(hits.scoreDocs[1].doc);
+
+ final int docId1 = hits.scoreDocs[1].doc;
+ doc = searcher.doc(docId1);
storedField = doc.get(FIELD_NAME);
- stream = TokenSources.getAnyTokenStream(searcher
- .getIndexReader(), hits.scoreDocs[1].doc, FIELD_NAME, doc, analyzer);
+ stream = getAnyTokenStream(FIELD_NAME, docId1);
highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
fragment = highlighter.getBestFragment(stream, storedField);
assertEquals("This piece of text refers to Kennedy at the beginning then has a longer piece of text that is <B>very</B>", fragment);
@@ -230,21 +234,21 @@ public class HighlighterTest extends Bas
QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
Highlighter highlighter = new Highlighter(scorer);
- Document doc = searcher.doc(hits.scoreDocs[0].doc);
+ final int docId0 = hits.scoreDocs[0].doc;
+ Document doc = searcher.doc(docId0);
String storedField = doc.get(FIELD_NAME);
- TokenStream stream = TokenSources.getAnyTokenStream(searcher
- .getIndexReader(), hits.scoreDocs[0].doc, FIELD_NAME, doc, analyzer);
+ TokenStream stream = getAnyTokenStream(FIELD_NAME, docId0);
Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
highlighter.setTextFragmenter(fragmenter);
String fragment = highlighter.getBestFragment(stream, storedField);
assertEquals("Hello this is a piece of text that is <B>very</B> <B>long</B> and contains too much preamble and the meat is really here which says kennedy has been shot", fragment);
-
- doc = searcher.doc(hits.scoreDocs[1].doc);
+
+ final int docId1 = hits.scoreDocs[1].doc;
+ doc = searcher.doc(docId1);
storedField = doc.get(FIELD_NAME);
- stream = TokenSources.getAnyTokenStream(searcher
- .getIndexReader(), hits.scoreDocs[1].doc, FIELD_NAME, doc, analyzer);
+ stream = getAnyTokenStream(FIELD_NAME, docId1);
highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
fragment = highlighter.getBestFragment(stream, storedField);
assertEquals("This piece of text refers to Kennedy at the beginning then has a longer piece of text that is <B>very</B>", fragment);
@@ -391,9 +395,10 @@ public class HighlighterTest extends Bas
Highlighter highlighter = new Highlighter(this, scorer);
for (int i = 0; i < hits.totalHits; i++) {
- final Document doc = searcher.doc(hits.scoreDocs[i].doc);
+ final int docId = hits.scoreDocs[i].doc;
+ final Document doc = searcher.doc(docId);
String text = doc.get(FIELD_NAME);
- TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
+ TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
@@ -421,9 +426,10 @@ public class HighlighterTest extends Bas
highlighter = new Highlighter(this, scorer);
for (int i = 0; i < hits.totalHits; i++) {
- final Document doc = searcher.doc(hits.scoreDocs[i].doc);
+ final int docId = hits.scoreDocs[i].doc;
+ final Document doc = searcher.doc(docId);
String text = doc.get(FIELD_NAME);
- TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
+ TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
@@ -451,9 +457,10 @@ public class HighlighterTest extends Bas
highlighter = new Highlighter(this, scorer);
for (int i = 0; i < hits.totalHits; i++) {
- final Document doc = searcher.doc(hits.scoreDocs[i].doc);
+ final int docId = hits.scoreDocs[i].doc;
+ final Document doc = searcher.doc(docId);
String text = doc.get(FIELD_NAME);
- TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
+ TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
@@ -477,9 +484,10 @@ public class HighlighterTest extends Bas
Highlighter highlighter = new Highlighter(this, scorer);
for (int i = 0; i < hits.totalHits; i++) {
- final Document doc = searcher.doc(hits.scoreDocs[i].doc);
+ final int docId = hits.scoreDocs[i].doc;
+ final Document doc = searcher.doc(docId);
String text = doc.get(FIELD_NAME);
- TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
+ TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
@@ -502,9 +510,10 @@ public class HighlighterTest extends Bas
Highlighter highlighter = new Highlighter(this, scorer);
for (int i = 0; i < hits.totalHits; i++) {
- final Document doc = searcher.doc(hits.scoreDocs[i].doc);
+ final int docId = hits.scoreDocs[i].doc;
+ final Document doc = searcher.doc(docId);
String text = doc.get(FIELD_NAME);
- TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
+ TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
@@ -527,9 +536,10 @@ public class HighlighterTest extends Bas
Highlighter highlighter = new Highlighter(this, scorer);
for (int i = 0; i < hits.totalHits; i++) {
- final Document doc = searcher.doc(hits.scoreDocs[i].doc);
+ final int docId = hits.scoreDocs[i].doc;
+ final Document doc = searcher.doc(docId);
String text = doc.get(FIELD_NAME);
- TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
+ TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
@@ -613,8 +623,8 @@ public class HighlighterTest extends Bas
for (int i = 0; i < hits.totalHits; i++) {
String text = "parent document";
- Document doc = searcher.doc(hits.scoreDocs[i].doc);
- TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
+ final int docId = hits.scoreDocs[i].doc;
+ TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
@@ -639,9 +649,10 @@ public class HighlighterTest extends Bas
highlighter.setTextFragmenter(new SimpleFragmenter(40));
for (int i = 0; i < hits.totalHits; i++) {
- final Document doc = searcher.doc(hits.scoreDocs[i].doc);
+ final int docId = hits.scoreDocs[i].doc;
+ final Document doc = searcher.doc(docId);
String text = doc.get(FIELD_NAME);
- TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
+ TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
"...");
@@ -662,9 +673,10 @@ public class HighlighterTest extends Bas
int maxNumFragmentsRequired = 2;
for (int i = 0; i < hits.totalHits; i++) {
- final Document doc = searcher.doc(hits.scoreDocs[i].doc);
+ final int docId = hits.scoreDocs[i].doc;
+ final Document doc = searcher.doc(docId);
String text = doc.get(FIELD_NAME);
- TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
+ TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
Highlighter highlighter = new Highlighter(this, scorer);
@@ -693,9 +705,10 @@ public class HighlighterTest extends Bas
Highlighter highlighter = new Highlighter(this, scorer);
for (int i = 0; i < hits.totalHits; i++) {
- final Document doc = searcher.doc(hits.scoreDocs[i].doc);
+ final int docId = hits.scoreDocs[i].doc;
+ final Document doc = searcher.doc(docId);
String text = doc.get(FIELD_NAME);
- TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
+ TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 5));
@@ -748,9 +761,10 @@ public class HighlighterTest extends Bas
Highlighter highlighter = new Highlighter(this,scorer);
for (int i = 0; i < hits.totalHits; i++) {
- final Document doc = searcher.doc(hits.scoreDocs[i].doc);
+ final int docId = hits.scoreDocs[i].doc;
+ final Document doc = searcher.doc(docId);
String text = doc.get(FIELD_NAME);
- TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
+ TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
@@ -820,9 +834,10 @@ public class HighlighterTest extends Bas
highlighter.setTextFragmenter(new SimpleFragmenter(40));
int maxNumFragmentsRequired = 2;
for (int i = 0; i < hits.totalHits; i++) {
- final Document doc = searcher.doc(hits.scoreDocs[i].doc);
+ final int docId = hits.scoreDocs[i].doc;
+ final Document doc = searcher.doc(docId);
String text = doc.get(FIELD_NAME);
- TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
+ TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
"...");
@@ -1015,9 +1030,11 @@ public class HighlighterTest extends Bas
hits = searcher.search(query, 1000);
for (int i = 0; i < hits.totalHits; i++) {
- final Document doc = searcher.doc(hits.scoreDocs[i].doc);
+ final int docId = hits.scoreDocs[i].doc;
+ final Document doc = searcher.doc(docId);
String text = doc.get(FIELD_NAME);
- TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer); int maxNumFragmentsRequired = 2;
+ TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
+ int maxNumFragmentsRequired = 2;
String fragmentSeparator = "...";
QueryScorer scorer = new QueryScorer(query, HighlighterTest.FIELD_NAME);
@@ -1039,9 +1056,10 @@ public class HighlighterTest extends Bas
numHighlights = 0;
for (int i = 0; i < hits.totalHits; i++) {
- final Document doc = searcher.doc(hits.scoreDocs[i].doc);
+ final int docId = hits.scoreDocs[i].doc;
+ final Document doc = searcher.doc(docId);
String text = doc.get(FIELD_NAME);
- TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
+ TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
int maxNumFragmentsRequired = 2;
String fragmentSeparator = "...";
QueryScorer scorer = new QueryScorer(query, null);
@@ -1064,9 +1082,11 @@ public class HighlighterTest extends Bas
numHighlights = 0;
for (int i = 0; i < hits.totalHits; i++) {
- final Document doc = searcher.doc(hits.scoreDocs[i].doc);
+ final int docId = hits.scoreDocs[i].doc;
+ final Document doc = searcher.doc(docId);
String text = doc.get(FIELD_NAME);
- TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer); int maxNumFragmentsRequired = 2;
+ TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
+ int maxNumFragmentsRequired = 2;
String fragmentSeparator = "...";
QueryScorer scorer = new QueryScorer(query, "random_field", HighlighterTest.FIELD_NAME);
@@ -1240,9 +1260,10 @@ public class HighlighterTest extends Bas
doSearching(new TermQuery(new Term(FIELD_NAME, "kennedy")));
numHighlights = 0;
for (int i = 0; i < hits.totalHits; i++) {
- final Document doc = searcher.doc(hits.scoreDocs[i].doc);
+ final int docId = hits.scoreDocs[i].doc;
+ final Document doc = searcher.doc(docId);
String text = doc.get(FIELD_NAME);
- TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
+ TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
Highlighter highlighter = getHighlighter(query, FIELD_NAME,
HighlighterTest.this);
@@ -1255,9 +1276,10 @@ public class HighlighterTest extends Bas
numHighlights = 0;
for (int i = 0; i < hits.totalHits; i++) {
- final Document doc = searcher.doc(hits.scoreDocs[i].doc);
+ final int docId = hits.scoreDocs[i].doc;
+ final Document doc = searcher.doc(docId);
String text = doc.get(FIELD_NAME);
- TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
+ TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
Highlighter highlighter = getHighlighter(query, FIELD_NAME,
HighlighterTest.this);
highlighter.getBestFragment(tokenStream, text);
@@ -1267,9 +1289,10 @@ public class HighlighterTest extends Bas
numHighlights = 0;
for (int i = 0; i < hits.totalHits; i++) {
- final Document doc = searcher.doc(hits.scoreDocs[i].doc);
+ final int docId = hits.scoreDocs[i].doc;
+ final Document doc = searcher.doc(docId);
String text = doc.get(FIELD_NAME);
- TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
+ TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
Highlighter highlighter = getHighlighter(query, FIELD_NAME,
HighlighterTest.this);
@@ -1399,9 +1422,10 @@ public class HighlighterTest extends Bas
doSearching(new TermQuery(new Term(FIELD_NAME, "kennedy")));
for (int i = 0; i < hits.totalHits; i++) {
- final Document doc = searcher.doc(hits.scoreDocs[i].doc);
+ final int docId = hits.scoreDocs[i].doc;
+ final Document doc = searcher.doc(docId);
String text = doc.get(FIELD_NAME);
- TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
+ TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
Highlighter highlighter = getHighlighter(query, FIELD_NAME,
HighlighterTest.this);// new Highlighter(this, new
@@ -1552,9 +1576,10 @@ public class HighlighterTest extends Bas
int maxNumFragmentsRequired = 3;
for (int i = 0; i < hits.totalHits; i++) {
- final Document doc = searcher.doc(hits.scoreDocs[i].doc);
+ final int docId = hits.scoreDocs[i].doc;
+ final Document doc = searcher.doc(docId);
String text = doc.get(FIELD_NAME);
- TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
+ TokenStream tokenStream = getAnyTokenStream(FIELD_NAME, docId);
Highlighter highlighter = getHighlighter(query, FIELD_NAME, HighlighterTest.this, false);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
@@ -2259,13 +2284,15 @@ final class SynonymTokenizer extends Tok
throws Exception {
for (int i = 0; i < hits.totalHits; i++) {
- final Document doc = searcher.doc(hits.scoreDocs[i].doc);
+ final int docId = hits.scoreDocs[i].doc;
+ final Document doc = searcher.doc(docId);
String text = doc.get(HighlighterTest.FIELD_NAME);
int maxNumFragmentsRequired = 2;
String fragmentSeparator = "...";
Scorer scorer = null;
- TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(),
- hits.scoreDocs[i].doc, HighlighterTest.FIELD_NAME, doc, analyzer);
+ TokenStream tokenStream =
+ TokenSources.getTokenStream(HighlighterTest.FIELD_NAME,
+ searcher.getIndexReader().getTermVectors(docId), text, analyzer, -1);
if (mode == QUERY) {
scorer = new QueryScorer(query);
} else if (mode == QUERY_TERM) {
Modified: lucene/dev/branches/branch_5x/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java?rev=1676571&r1=1676570&r2=1676571&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java (original)
+++ lucene/dev/branches/branch_5x/lucene/highlighter/src/test/org/apache/lucene/search/highlight/TokenSourcesTest.java Tue Apr 28 15:53:41 2015
@@ -22,6 +22,7 @@ import java.io.IOException;
import com.carrotsearch.randomizedtesting.annotations.Repeat;
import org.apache.lucene.analysis.BaseTokenStreamTestCase;
import org.apache.lucene.analysis.CannedTokenStream;
+import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
@@ -34,6 +35,7 @@ import org.apache.lucene.document.FieldT
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.BaseTermVectorsFormatTestCase;
import org.apache.lucene.index.DirectoryReader;
+import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.RandomIndexWriter;
@@ -50,6 +52,11 @@ import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.TestUtil;
// LUCENE-2874
+
+/** Tests {@link org.apache.lucene.search.highlight.TokenSources} and
+ * {@link org.apache.lucene.search.highlight.TokenStreamFromTermVector}
+ * indirectly from that.
+ */
public class TokenSourcesTest extends BaseTokenStreamTestCase {
private static final String FIELD = "text";
@@ -100,6 +107,7 @@ public class TokenSourcesTest extends Ba
final Document document = new Document();
FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
customType.setStoreTermVectors(true);
+ // no positions!
customType.setStoreTermVectorOffsets(true);
document.add(new Field(FIELD, new OverlappingTokenStream(), customType));
indexWriter.addDocument(document);
@@ -122,9 +130,8 @@ public class TokenSourcesTest extends Ba
final Highlighter highlighter = new Highlighter(
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
new QueryScorer(query));
- final TokenStream tokenStream = TokenSources
- .getTokenStream(
- indexReader.getTermVector(0, FIELD));
+ final TokenStream tokenStream =
+ TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
assertEquals("<B>the fox</B> did not jump",
highlighter.getBestFragment(tokenStream, TEXT));
} finally {
@@ -166,9 +173,8 @@ public class TokenSourcesTest extends Ba
final Highlighter highlighter = new Highlighter(
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
new QueryScorer(query));
- final TokenStream tokenStream = TokenSources
- .getTokenStream(
- indexReader.getTermVector(0, FIELD));
+ final TokenStream tokenStream =
+ TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
assertEquals("<B>the fox</B> did not jump",
highlighter.getBestFragment(tokenStream, TEXT));
} finally {
@@ -187,6 +193,7 @@ public class TokenSourcesTest extends Ba
final Document document = new Document();
FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
customType.setStoreTermVectors(true);
+ // no positions!
customType.setStoreTermVectorOffsets(true);
document.add(new Field(FIELD, new OverlappingTokenStream(), customType));
indexWriter.addDocument(document);
@@ -209,9 +216,8 @@ public class TokenSourcesTest extends Ba
final Highlighter highlighter = new Highlighter(
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
new QueryScorer(phraseQuery));
- final TokenStream tokenStream = TokenSources
- .getTokenStream(
- indexReader.getTermVector(0, FIELD));
+ final TokenStream tokenStream =
+ TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
assertEquals("<B>the fox</B> did not jump",
highlighter.getBestFragment(tokenStream, TEXT));
} finally {
@@ -230,6 +236,7 @@ public class TokenSourcesTest extends Ba
final Document document = new Document();
FieldType customType = new FieldType(TextField.TYPE_NOT_STORED);
customType.setStoreTermVectors(true);
+ customType.setStoreTermVectorPositions(true);
customType.setStoreTermVectorOffsets(true);
document.add(new Field(FIELD, new OverlappingTokenStream(), customType));
indexWriter.addDocument(document);
@@ -252,9 +259,8 @@ public class TokenSourcesTest extends Ba
final Highlighter highlighter = new Highlighter(
new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
new QueryScorer(phraseQuery));
- final TokenStream tokenStream = TokenSources
- .getTokenStream(
- indexReader.getTermVector(0, FIELD));
+ final TokenStream tokenStream =
+ TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
assertEquals("<B>the fox</B> did not jump",
highlighter.getBestFragment(tokenStream, TEXT));
} finally {
@@ -263,7 +269,7 @@ public class TokenSourcesTest extends Ba
}
}
- public void testTermVectorWithoutOffsetsThrowsException()
+ public void testTermVectorWithoutOffsetsDoesntWork()
throws IOException, InvalidTokenOffsetsException {
final Directory directory = newDirectory();
final IndexWriter indexWriter = new IndexWriter(directory,
@@ -282,12 +288,9 @@ public class TokenSourcesTest extends Ba
final IndexReader indexReader = DirectoryReader.open(directory);
try {
assertEquals(1, indexReader.numDocs());
- TokenSources.getTokenStream(
- indexReader.getTermVector(0, FIELD));
- fail("TokenSources.getTokenStream should throw IllegalArgumentException if term vector has no offsets");
- }
- catch (IllegalArgumentException e) {
- // expected
+ final TokenStream tokenStream =
+ TokenSources.getTermVectorTokenStreamOrNull(FIELD, indexReader.getTermVectors(0), -1);
+ assertNull(tokenStream);
}
finally {
indexReader.close();
@@ -333,7 +336,7 @@ public class TokenSourcesTest extends Ba
writer.close();
assertEquals(1, reader.numDocs());
- TokenStream ts = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"));
+ TokenStream ts = TokenSources.getTermVectorTokenStreamOrNull("field", reader.getTermVectors(0), -1);
CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
PositionIncrementAttribute posIncAtt = ts.getAttribute(PositionIncrementAttribute.class);
@@ -409,7 +412,8 @@ public class TokenSourcesTest extends Ba
writer.close();
assertEquals(1, reader.numDocs());
- TokenStream vectorTokenStream = TokenSources.getTokenStream(reader.getTermVectors(0).terms("field"));
+ TokenStream vectorTokenStream =
+ TokenSources.getTermVectorTokenStreamOrNull("field", reader.getTermVectors(0), -1);
//sometimes check payloads
PayloadAttribute payloadAttribute = null;
@@ -430,4 +434,57 @@ public class TokenSourcesTest extends Ba
reader.close();
dir.close();
}
+
+ public void testMaxStartOffsetConsistency() throws IOException {
+ FieldType tvFieldType = new FieldType(TextField.TYPE_NOT_STORED);
+ tvFieldType.setStoreTermVectors(true);
+ tvFieldType.setStoreTermVectorOffsets(true);
+ tvFieldType.setStoreTermVectorPositions(true);
+
+ Directory dir = newDirectory();
+
+ MockAnalyzer analyzer = new MockAnalyzer(random());
+ analyzer.setEnableChecks(false);//we don't necessarily consume the whole stream because of limiting by startOffset
+ Document doc = new Document();
+ final String TEXT = " f gg h";
+ doc.add(new Field("fld_tv", analyzer.tokenStream("fooFld", TEXT), tvFieldType));
+ doc.add(new TextField("fld_notv", analyzer.tokenStream("barFld", TEXT)));
+
+ IndexReader reader;
+ try (RandomIndexWriter writer = new RandomIndexWriter(random(), dir)) {
+ writer.addDocument(doc);
+ reader = writer.getReader();
+ }
+ try {
+ Fields tvFields = reader.getTermVectors(0);
+ for (int maxStartOffset = -1; maxStartOffset <= TEXT.length(); maxStartOffset++) {
+ TokenStream tvStream = TokenSources.getTokenStream("fld_tv", tvFields, TEXT, analyzer, maxStartOffset);
+ TokenStream anaStream = TokenSources.getTokenStream("fld_notv", tvFields, TEXT, analyzer, maxStartOffset);
+
+ //assert have same tokens, none of which has a start offset > maxStartOffset
+ final OffsetAttribute tvOffAtt = tvStream.addAttribute(OffsetAttribute.class);
+ final OffsetAttribute anaOffAtt = anaStream.addAttribute(OffsetAttribute.class);
+ tvStream.reset();
+ anaStream.reset();
+ while (tvStream.incrementToken()) {
+ assertTrue(anaStream.incrementToken());
+ assertEquals(tvOffAtt.startOffset(), anaOffAtt.startOffset());
+ if (maxStartOffset >= 0)
+ assertTrue(tvOffAtt.startOffset() <= maxStartOffset);
+ }
+ assertTrue(anaStream.incrementToken() == false);
+ tvStream.end();
+ anaStream.end();
+ tvStream.close();
+ anaStream.close();
+ }
+
+ } finally {
+ reader.close();
+ }
+
+
+
+ dir.close();
+ }
}
Modified: lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java
URL: http://svn.apache.org/viewvc/lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java?rev=1676571&r1=1676570&r2=1676571&view=diff
==============================================================================
--- lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java (original)
+++ lucene/dev/branches/branch_5x/solr/core/src/java/org/apache/solr/highlight/DefaultSolrHighlighter.java Tue Apr 28 15:53:41 2015
@@ -34,6 +34,7 @@ import org.apache.lucene.analysis.TokenS
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
+import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Encoder;
@@ -493,8 +494,9 @@ public class DefaultSolrHighlighter exte
List<TextFragment> frags = new ArrayList<>();
//Try term vectors, which is faster
+ final Fields tvFields = searcher.getIndexReader().getTermVectors(docId); // TODO add as param; see SOLR-5855
final TokenStream tvStream =
- TokenSources.getTokenStreamWithOffsets(searcher.getIndexReader(), docId, fieldName);
+ TokenSources.getTermVectorTokenStreamOrNull(fieldName, tvFields, maxCharsToAnalyze - 1);
// We need to wrap in OffsetWindowTokenFilter if multi-valued
final OffsetWindowTokenFilter tvWindowStream;
if (tvStream != null && fieldValues.size() > 1) {