You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ds...@apache.org on 2014/12/05 16:08:58 UTC
svn commit: r1643316 - in /lucene/dev/trunk/lucene: ./
highlighter/src/java/org/apache/lucene/search/highlight/
highlighter/src/test/org/apache/lucene/search/highlight/
Author: dsmiley
Date: Fri Dec 5 15:08:57 2014
New Revision: 1643316
URL: http://svn.apache.org/r1643316
Log:
LUCENE-6034: Highlighter QueryScorer/WeightedSpanTermExtractor shouldn't re-invert a term vector based TokenStream. It can now highlight payload-sensitive queries.
Added:
lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java (with props)
Modified:
lucene/dev/trunk/lucene/CHANGES.txt
lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java
lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
Modified: lucene/dev/trunk/lucene/CHANGES.txt
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/CHANGES.txt?rev=1643316&r1=1643315&r2=1643316&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/CHANGES.txt (original)
+++ lucene/dev/trunk/lucene/CHANGES.txt Fri Dec 5 15:08:57 2014
@@ -119,6 +119,10 @@ New Features
* LUCENE-6088: TermsFilter implements Accountable. (Adrien Grand)
+* LUCENE-6034: The default highlighter when used with QueryScorer will highlight payload-sensitive
+ queries provided that term vectors with positions, offsets, and payloads are present. This is the
+ only highlighter that can highlight such queries accurately. (David Smiley)
+
Optimizations
* LUCENE-5960: Use a more efficient bitset, not a Set<Integer>, to
@@ -161,6 +165,10 @@ Optimizations
* LUCENE-6089, LUCENE-6090: Tune CompressionMode.HIGH_COMPRESSION for
better compression and less cpu usage. (Adrien Grand, Robert Muir)
+* LUCENE-6034: QueryScorer, used by the default highlighter, needn't re-index the provided
+ TokenStream with MemoryIndex when it comes from TokenSources (term vectors) with offsets and
+ positions. (David Smiley)
+
API Changes
* LUCENE-5900: Deprecated more constructors taking Version in *InfixSuggester and
Modified: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java?rev=1643316&r1=1643315&r2=1643316&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java Fri Dec 5 15:08:57 2014
@@ -265,7 +265,8 @@ public class QueryScorer implements Scor
* {@link CachingTokenFilter} are wrapped in a {@link CachingTokenFilter} to
* ensure an efficient reset - if you are already using a different caching
* {@link TokenStream} impl and you don't want it to be wrapped, set this to
- * false.
+ * false. Note that term-vector-based TokenStreams are detected and won't be
+ * wrapped either.
*/
public void setWrapIfNotCachingTokenFilter(boolean wrap) {
this.wrapToCaching = wrap;
Added: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java?rev=1643316&view=auto
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java (added)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TermVectorLeafReader.java Fri Dec 5 15:08:57 2014
@@ -0,0 +1,176 @@
+package org.apache.lucene.search.highlight;
+
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Collections;
+import java.util.Iterator;
+
+import org.apache.lucene.index.BinaryDocValues;
+import org.apache.lucene.index.DocValuesType;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.NumericDocValues;
+import org.apache.lucene.index.SortedDocValues;
+import org.apache.lucene.index.SortedNumericDocValues;
+import org.apache.lucene.index.SortedSetDocValues;
+import org.apache.lucene.index.StoredFieldVisitor;
+import org.apache.lucene.index.Terms;
+import org.apache.lucene.util.Bits;
+
+/**
+ * Wraps a Terms with a {@link org.apache.lucene.index.LeafReader}, typically from term vectors.
+ *
+ * @lucene.experimental
+ */
+public class TermVectorLeafReader extends LeafReader {
+
+ private final Fields fields;
+ private final FieldInfos fieldInfos;
+
+ public TermVectorLeafReader(String field, Terms terms) {
+ fields = new Fields() {
+ @Override
+ public Iterator<String> iterator() {
+ return Collections.singletonList(field).iterator();
+ }
+
+ @Override
+ public Terms terms(String fld) throws IOException {
+ if (!field.equals(fld)) {
+ return null;
+ }
+ return terms;
+ }
+
+ @Override
+ public int size() {
+ return 1;
+ }
+ };
+
+ IndexOptions indexOptions;
+ if (!terms.hasFreqs()) {
+ indexOptions = IndexOptions.DOCS;
+ } else if (!terms.hasPositions()) {
+ indexOptions = IndexOptions.DOCS_AND_FREQS;
+ } else if (!terms.hasOffsets()) {
+ indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
+ } else {
+ indexOptions = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS;
+ }
+ FieldInfo fieldInfo = new FieldInfo(field, 0,
+ true, true, terms.hasPayloads(),
+ indexOptions, DocValuesType.NONE, -1, null);
+ fieldInfos = new FieldInfos(new FieldInfo[]{fieldInfo});
+ }
+
+ @Override
+ public void addCoreClosedListener(CoreClosedListener listener) {
+ addCoreClosedListenerAsReaderClosedListener(this, listener);
+ }
+
+ @Override
+ public void removeCoreClosedListener(CoreClosedListener listener) {
+ removeCoreClosedListenerAsReaderClosedListener(this, listener);
+ }
+
+ @Override
+ protected void doClose() throws IOException {
+ }
+
+ @Override
+ public Fields fields() throws IOException {
+ return fields;
+ }
+
+ @Override
+ public NumericDocValues getNumericDocValues(String field) throws IOException {
+ return null;
+ }
+
+ @Override
+ public BinaryDocValues getBinaryDocValues(String field) throws IOException {
+ return null;
+ }
+
+ @Override
+ public SortedDocValues getSortedDocValues(String field) throws IOException {
+ return null;
+ }
+
+ @Override
+ public SortedNumericDocValues getSortedNumericDocValues(String field) throws IOException {
+ return null;
+ }
+
+ @Override
+ public SortedSetDocValues getSortedSetDocValues(String field) throws IOException {
+ return null;
+ }
+
+ @Override
+ public Bits getDocsWithField(String field) throws IOException {
+ return null;
+ }
+
+ @Override
+ public NumericDocValues getNormValues(String field) throws IOException {
+ return null;//Is this needed? See MemoryIndex for a way to do it.
+ }
+
+ @Override
+ public FieldInfos getFieldInfos() {
+ return fieldInfos;
+ }
+
+ @Override
+ public Bits getLiveDocs() {
+ return null;
+ }
+
+ @Override
+ public void checkIntegrity() throws IOException {
+ }
+
+ @Override
+ public Fields getTermVectors(int docID) throws IOException {
+ if (docID != 0) {
+ return null;
+ }
+ return fields();
+ }
+
+ @Override
+ public int numDocs() {
+ return 1;
+ }
+
+ @Override
+ public int maxDoc() {
+ return 1;
+ }
+
+ @Override
+ public void document(int docID, StoredFieldVisitor visitor) throws IOException {
+ }
+
+}
Modified: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java?rev=1643316&r1=1643315&r2=1643316&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/TokenSources.java Fri Dec 5 15:08:57 2014
@@ -36,7 +36,7 @@ import org.apache.lucene.index.Terms;
*/
public class TokenSources {
/**
- * A convenience method that tries to first get a TermPositionVector for the
+ * A convenience method that tries to first get a {@link TokenStreamFromTermVector} for the
* specified docId, then, falls back to using the passed in
* {@link org.apache.lucene.document.Document} to retrieve the TokenStream.
* This is useful when you already have the document, but would prefer to use
Modified: lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java?rev=1643316&r1=1643315&r2=1643316&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/java/org/apache/lucene/search/highlight/WeightedSpanTermExtractor.java Fri Dec 5 15:08:57 2014
@@ -16,6 +16,7 @@ package org.apache.lucene.search.highlig
* See the License for the specific language governing permissions and
* limitations under the License.
*/
+
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
@@ -29,13 +30,13 @@ import java.util.TreeSet;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.index.FilterLeafReader;
-import org.apache.lucene.index.LeafReader;
-import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.BinaryDocValues;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.FilterLeafReader;
import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.NumericDocValues;
import org.apache.lucene.index.SortedDocValues;
import org.apache.lucene.index.Term;
@@ -43,7 +44,18 @@ import org.apache.lucene.index.TermConte
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.queries.CommonTermsQuery;
-import org.apache.lucene.search.*;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.ConstantScoreQuery;
+import org.apache.lucene.search.DisjunctionMaxQuery;
+import org.apache.lucene.search.FilteredQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MatchAllDocsQuery;
+import org.apache.lucene.search.MultiPhraseQuery;
+import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.join.ToChildBlockJoinQuery;
import org.apache.lucene.search.join.ToParentBlockJoinQuery;
import org.apache.lucene.search.spans.FieldMaskingSpanQuery;
@@ -65,7 +77,7 @@ import org.apache.lucene.util.IOUtils;
public class WeightedSpanTermExtractor {
private String fieldName;
- private TokenStream tokenStream;
+ private TokenStream tokenStream;//set subsequent to getWeightedSpanTerms* methods
private String defaultField;
private boolean expandMultiTermQuery;
private boolean cachedTokenStream;
@@ -209,6 +221,8 @@ public class WeightedSpanTermExtractor {
sp.setBoost(query.getBoost());
extractWeightedSpanTerms(terms, sp);
}
+ } else if (query instanceof MatchAllDocsQuery) {
+ //nothing
} else {
Query origQuery = query;
if (query instanceof MultiTermQuery) {
@@ -357,18 +371,39 @@ public class WeightedSpanTermExtractor {
protected LeafReaderContext getLeafContext() throws IOException {
if (internalReader == null) {
- if(wrapToCaching && !(tokenStream instanceof CachingTokenFilter)) {
- assert !cachedTokenStream;
- tokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
- cachedTokenStream = true;
- }
- final MemoryIndex indexer = new MemoryIndex(true);
- indexer.addField(DelegatingLeafReader.FIELD_NAME, tokenStream);
- tokenStream.reset();
- final IndexSearcher searcher = indexer.createSearcher();
- // MEM index has only atomic ctx
- internalReader = new DelegatingLeafReader(((LeafReaderContext)searcher.getTopReaderContext()).reader());
+ boolean cacheIt = wrapToCaching && !(tokenStream instanceof CachingTokenFilter);
+
+ // If it's from term vectors, simply wrap the underlying Terms in a reader
+ if (tokenStream instanceof TokenStreamFromTermVector) {
+ cacheIt = false;
+ Terms termVectorTerms = ((TokenStreamFromTermVector) tokenStream).getTermVectorTerms();
+ if (termVectorTerms.hasPositions() && termVectorTerms.hasOffsets()) {
+ internalReader = new TermVectorLeafReader(DelegatingLeafReader.FIELD_NAME, termVectorTerms);
+ }
+ }
+
+ // Use MemoryIndex (index/invert this tokenStream now)
+ if (internalReader == null) {
+ final MemoryIndex indexer = new MemoryIndex(true);
+ if (cacheIt) {
+ assert !cachedTokenStream;
+ tokenStream = new CachingTokenFilter(new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
+ cachedTokenStream = true;
+ indexer.addField(DelegatingLeafReader.FIELD_NAME, tokenStream);
+ } else {
+ indexer.addField(DelegatingLeafReader.FIELD_NAME,
+ new OffsetLimitTokenFilter(tokenStream, maxDocCharsToAnalyze));
+ }
+ tokenStream.reset();//reset to beginning when we return
+ final IndexSearcher searcher = indexer.createSearcher();
+ // MEM index has only atomic ctx
+ internalReader = ((LeafReaderContext) searcher.getTopReaderContext()).reader();
+ }
+
+ //Now wrap it so we always use a common field.
+ this.internalReader = new DelegatingLeafReader(internalReader);
}
+
return internalReader.getContext();
}
@@ -532,7 +567,7 @@ public class WeightedSpanTermExtractor {
return terms;
}
-
+
protected void collectSpanQueryFields(SpanQuery spanQuery, Set<String> fieldNames) {
if (spanQuery instanceof FieldMaskingSpanQuery) {
collectSpanQueryFields(((FieldMaskingSpanQuery)spanQuery).getMaskedQuery(), fieldNames);
@@ -622,8 +657,11 @@ public class WeightedSpanTermExtractor {
public boolean isCachedTokenStream() {
return cachedTokenStream;
}
-
+
+ /** Returns the tokenStream which may have been wrapped in a CachingTokenFilter.
+ * getWeightedSpanTerms* sets the tokenStream, so don't call this beforehand. */
public TokenStream getTokenStream() {
+ assert tokenStream != null;
return tokenStream;
}
@@ -632,12 +670,16 @@ public class WeightedSpanTermExtractor {
* {@link CachingTokenFilter} are wrapped in a {@link CachingTokenFilter} to
* ensure an efficient reset - if you are already using a different caching
* {@link TokenStream} impl and you don't want it to be wrapped, set this to
- * false.
+ * false. This setting is ignored when a term vector based TokenStream is supplied,
+ * since it can be reset efficiently.
*/
public void setWrapIfNotCachingTokenFilter(boolean wrap) {
this.wrapToCaching = wrap;
}
+ /** A threshold of the number of characters to analyze. When a TokenStream based on
+ * term vectors with offsets and positions is supplied, this setting
+ * does not apply. */
protected final void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) {
this.maxDocCharsToAnalyze = maxDocCharsToAnalyze;
}
Modified: lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java
URL: http://svn.apache.org/viewvc/lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java?rev=1643316&r1=1643315&r2=1643316&view=diff
==============================================================================
--- lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java (original)
+++ lucene/dev/trunk/lucene/highlighter/src/test/org/apache/lucene/search/highlight/HighlighterTest.java Fri Dec 5 15:08:57 2014
@@ -22,6 +22,7 @@ import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
+import java.util.Collections;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
@@ -36,6 +37,7 @@ import org.apache.lucene.analysis.tokena
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
+import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.StoredField;
import org.apache.lucene.document.TextField;
@@ -78,8 +80,20 @@ public class HighlighterTest extends Bas
Directory ramDir;
public IndexSearcher searcher = null;
int numHighlights = 0;
- Analyzer analyzer;
+ MockAnalyzer analyzer;
TopDocs hits;
+ FieldType fieldType;//see doc()
+
+ final FieldType FIELD_TYPE_TV;
+ {
+ FieldType fieldType = new FieldType(TextField.TYPE_STORED);
+ fieldType.setStoreTermVectors(true);
+ fieldType.setStoreTermVectorPositions(true);
+ fieldType.setStoreTermVectorPayloads(true);
+ fieldType.setStoreTermVectorOffsets(true);
+ fieldType.freeze();
+ FIELD_TYPE_TV = fieldType;
+ }
String[] texts = {
"Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot",
@@ -121,9 +135,8 @@ public class HighlighterTest extends Bas
}
public void testHighlightingCommonTermsQuery() throws Exception {
- Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
CommonTermsQuery query = new CommonTermsQuery(Occur.MUST, Occur.SHOULD, 3);
- query.add(new Term(FIELD_NAME, "this"));
+ query.add(new Term(FIELD_NAME, "this"));//stop-word
query.add(new Term(FIELD_NAME, "long"));
query.add(new Term(FIELD_NAME, "very"));
@@ -141,7 +154,7 @@ public class HighlighterTest extends Bas
Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
highlighter.setTextFragmenter(fragmenter);
String fragment = highlighter.getBestFragment(stream, storedField);
- assertEquals("Hello <B>this</B> is a piece of text that is <B>very</B> <B>long</B> and contains too much preamble and the meat is really here which says kennedy has been shot", fragment);
+ assertEquals("Hello this is a piece of text that is <B>very</B> <B>long</B> and contains too much preamble and the meat is really here which says kennedy has been shot", fragment);
doc = searcher.doc(hits.scoreDocs[1].doc);
storedField = doc.get(FIELD_NAME);
@@ -150,7 +163,7 @@ public class HighlighterTest extends Bas
.getIndexReader(), hits.scoreDocs[1].doc, FIELD_NAME, doc, analyzer);
highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
fragment = highlighter.getBestFragment(stream, storedField);
- assertEquals("<B>This</B> piece of text refers to Kennedy at the beginning then has a longer piece of text that is <B>very</B>", fragment);
+ assertEquals("This piece of text refers to Kennedy at the beginning then has a longer piece of text that is <B>very</B>", fragment);
}
public void testHighlightUnknowQueryAfterRewrite() throws IOException, InvalidTokenOffsetsException {
@@ -159,7 +172,7 @@ public class HighlighterTest extends Bas
@Override
public Query rewrite(IndexReader reader) throws IOException {
CommonTermsQuery query = new CommonTermsQuery(Occur.MUST, Occur.SHOULD, 3);
- query.add(new Term(FIELD_NAME, "this"));
+ query.add(new Term(FIELD_NAME, "this"));//stop-word
query.add(new Term(FIELD_NAME, "long"));
query.add(new Term(FIELD_NAME, "very"));
return query;
@@ -180,9 +193,7 @@ public class HighlighterTest extends Bas
return super.equals(obj);
}
};
-
- Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true);
-
+
searcher = newSearcher(reader);
TopDocs hits = searcher.search(query, 10);
assertEquals(2, hits.totalHits);
@@ -197,7 +208,7 @@ public class HighlighterTest extends Bas
Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
highlighter.setTextFragmenter(fragmenter);
String fragment = highlighter.getBestFragment(stream, storedField);
- assertEquals("Hello <B>this</B> is a piece of text that is <B>very</B> <B>long</B> and contains too much preamble and the meat is really here which says kennedy has been shot", fragment);
+ assertEquals("Hello this is a piece of text that is <B>very</B> <B>long</B> and contains too much preamble and the meat is really here which says kennedy has been shot", fragment);
doc = searcher.doc(hits.scoreDocs[1].doc);
storedField = doc.get(FIELD_NAME);
@@ -206,7 +217,7 @@ public class HighlighterTest extends Bas
.getIndexReader(), hits.scoreDocs[1].doc, FIELD_NAME, doc, analyzer);
highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer));
fragment = highlighter.getBestFragment(stream, storedField);
- assertEquals("<B>This</B> piece of text refers to Kennedy at the beginning then has a longer piece of text that is <B>very</B>", fragment);
+ assertEquals("This piece of text refers to Kennedy at the beginning then has a longer piece of text that is <B>very</B>", fragment);
}
@@ -252,8 +263,7 @@ public class HighlighterTest extends Bas
*/
private String highlightField(Query query, String fieldName, String text)
throws IOException, InvalidTokenOffsetsException {
- TokenStream tokenStream = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)
- .tokenStream(fieldName, text);
+ TokenStream tokenStream = analyzer.tokenStream(fieldName, text);
// Assuming "<B>", "</B>" used to highlight
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
QueryScorer scorer = new QueryScorer(query, fieldName, FIELD_NAME);
@@ -351,8 +361,9 @@ public class HighlighterTest extends Bas
Highlighter highlighter = new Highlighter(this, scorer);
for (int i = 0; i < hits.totalHits; i++) {
- String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
- TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+ final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+ String text = doc.get(FIELD_NAME);
+ TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
@@ -380,8 +391,9 @@ public class HighlighterTest extends Bas
highlighter = new Highlighter(this, scorer);
for (int i = 0; i < hits.totalHits; i++) {
- String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
- TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+ final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+ String text = doc.get(FIELD_NAME);
+ TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
@@ -409,8 +421,9 @@ public class HighlighterTest extends Bas
highlighter = new Highlighter(this, scorer);
for (int i = 0; i < hits.totalHits; i++) {
- String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
- TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+ final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+ String text = doc.get(FIELD_NAME);
+ TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
@@ -434,8 +447,9 @@ public class HighlighterTest extends Bas
Highlighter highlighter = new Highlighter(this, scorer);
for (int i = 0; i < hits.totalHits; i++) {
- String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
- TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+ final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+ String text = doc.get(FIELD_NAME);
+ TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
@@ -458,8 +472,9 @@ public class HighlighterTest extends Bas
Highlighter highlighter = new Highlighter(this, scorer);
for (int i = 0; i < hits.totalHits; i++) {
- String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
- TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+ final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+ String text = doc.get(FIELD_NAME);
+ TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
@@ -482,8 +497,9 @@ public class HighlighterTest extends Bas
Highlighter highlighter = new Highlighter(this, scorer);
for (int i = 0; i < hits.totalHits; i++) {
- String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
- TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+ final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+ String text = doc.get(FIELD_NAME);
+ TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
@@ -567,7 +583,8 @@ public class HighlighterTest extends Bas
for (int i = 0; i < hits.totalHits; i++) {
String text = "parent document";
- TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+ StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+ TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
@@ -592,8 +609,9 @@ public class HighlighterTest extends Bas
highlighter.setTextFragmenter(new SimpleFragmenter(40));
for (int i = 0; i < hits.totalHits; i++) {
- String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
- TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+ final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+ String text = doc.get(FIELD_NAME);
+ TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
"...");
@@ -614,8 +632,9 @@ public class HighlighterTest extends Bas
int maxNumFragmentsRequired = 2;
for (int i = 0; i < hits.totalHits; i++) {
- String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
- TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+ final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+ String text = doc.get(FIELD_NAME);
+ TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
Highlighter highlighter = new Highlighter(this, scorer);
@@ -644,8 +663,9 @@ public class HighlighterTest extends Bas
Highlighter highlighter = new Highlighter(this, scorer);
for (int i = 0; i < hits.totalHits; i++) {
- String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
- TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+ final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+ String text = doc.get(FIELD_NAME);
+ TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 5));
@@ -698,8 +718,9 @@ public class HighlighterTest extends Bas
Highlighter highlighter = new Highlighter(this,scorer);
for (int i = 0; i < hits.totalHits; i++) {
- String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
- TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+ final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+ String text = doc.get(FIELD_NAME);
+ TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
@@ -769,8 +790,9 @@ public class HighlighterTest extends Bas
highlighter.setTextFragmenter(new SimpleFragmenter(40));
int maxNumFragmentsRequired = 2;
for (int i = 0; i < hits.totalHits; i++) {
- String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
- TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+ final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+ String text = doc.get(FIELD_NAME);
+ TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
"...");
@@ -963,11 +985,11 @@ public class HighlighterTest extends Bas
hits = searcher.search(query, null, 1000);
for (int i = 0; i < hits.totalHits; i++) {
- String text = searcher.doc(hits.scoreDocs[i].doc).get(HighlighterTest.FIELD_NAME);
- int maxNumFragmentsRequired = 2;
+ final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+ String text = doc.get(FIELD_NAME);
+ TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer); int maxNumFragmentsRequired = 2;
String fragmentSeparator = "...";
QueryScorer scorer = new QueryScorer(query, HighlighterTest.FIELD_NAME);
- TokenStream tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, text);
Highlighter highlighter = new Highlighter(this, scorer);
@@ -987,11 +1009,12 @@ public class HighlighterTest extends Bas
numHighlights = 0;
for (int i = 0; i < hits.totalHits; i++) {
- String text = searcher.doc(hits.scoreDocs[i].doc).get(HighlighterTest.FIELD_NAME);
+ final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+ String text = doc.get(FIELD_NAME);
+ TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
int maxNumFragmentsRequired = 2;
String fragmentSeparator = "...";
QueryScorer scorer = new QueryScorer(query, null);
- TokenStream tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, text);
Highlighter highlighter = new Highlighter(this, scorer);
@@ -1011,11 +1034,11 @@ public class HighlighterTest extends Bas
numHighlights = 0;
for (int i = 0; i < hits.totalHits; i++) {
- String text = searcher.doc(hits.scoreDocs[i].doc).get(HighlighterTest.FIELD_NAME);
- int maxNumFragmentsRequired = 2;
+ final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+ String text = doc.get(FIELD_NAME);
+ TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer); int maxNumFragmentsRequired = 2;
String fragmentSeparator = "...";
QueryScorer scorer = new QueryScorer(query, "random_field", HighlighterTest.FIELD_NAME);
- TokenStream tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, text);
Highlighter highlighter = new Highlighter(this, scorer);
@@ -1185,8 +1208,9 @@ public class HighlighterTest extends Bas
doSearching(new TermQuery(new Term(FIELD_NAME, "kennedy")));
numHighlights = 0;
for (int i = 0; i < hits.totalHits; i++) {
- String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
- TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+ final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+ String text = doc.get(FIELD_NAME);
+ TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
Highlighter highlighter = getHighlighter(query, FIELD_NAME,
HighlighterTest.this);
@@ -1199,21 +1223,25 @@ public class HighlighterTest extends Bas
numHighlights = 0;
for (int i = 0; i < hits.totalHits; i++) {
- String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
+ final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+ String text = doc.get(FIELD_NAME);
+ TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
Highlighter highlighter = getHighlighter(query, FIELD_NAME,
HighlighterTest.this);
- highlighter.getBestFragment(analyzer, FIELD_NAME, text);
+ highlighter.getBestFragment(tokenStream, text);
}
assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
numHighlights == 4);
numHighlights = 0;
for (int i = 0; i < hits.totalHits; i++) {
- String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
+ final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+ String text = doc.get(FIELD_NAME);
+ TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
Highlighter highlighter = getHighlighter(query, FIELD_NAME,
HighlighterTest.this);
- highlighter.getBestFragments(analyzer, FIELD_NAME, text, 10);
+ highlighter.getBestFragments(tokenStream, text, 10);
}
assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
numHighlights == 4);
@@ -1339,8 +1367,9 @@ public class HighlighterTest extends Bas
doSearching(new TermQuery(new Term(FIELD_NAME, "kennedy")));
for (int i = 0; i < hits.totalHits; i++) {
- String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
- TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+ final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+ String text = doc.get(FIELD_NAME);
+ TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
Highlighter highlighter = getHighlighter(query, FIELD_NAME,
HighlighterTest.this);// new Highlighter(this, new
@@ -1368,9 +1397,10 @@ public class HighlighterTest extends Bas
}
public void testMaxSizeHighlight() throws Exception {
- final MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
- // we disable MockTokenizer checks because we will forcefully limit the
+ // we disable MockTokenizer checks because we will forcefully limit the
// tokenstream and call end() before incrementToken() returns false.
+ // But we first need to clear the re-used tokenstream components that have enableChecks.
+ analyzer.getReuseStrategy().setReusableComponents(analyzer, FIELD_NAME, null);
analyzer.setEnableChecks(false);
TestHighlightRunner helper = new TestHighlightRunner() {
@@ -1471,8 +1501,7 @@ public class HighlighterTest extends Bas
numHighlights = 0;
// test to show how rewritten query can still be used
searcher = newSearcher(reader);
- Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
-
+
BooleanQuery query = new BooleanQuery();
query.add(new WildcardQuery(new Term(FIELD_NAME, "jf?")), Occur.SHOULD);
query.add(new WildcardQuery(new Term(FIELD_NAME, "kenned*")), Occur.SHOULD);
@@ -1491,8 +1520,9 @@ public class HighlighterTest extends Bas
int maxNumFragmentsRequired = 3;
for (int i = 0; i < hits.totalHits; i++) {
- String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
- TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, text);
+ final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+ String text = doc.get(FIELD_NAME);
+ TokenStream tokenStream = TokenSources.getAnyTokenStream(reader, hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
Highlighter highlighter = getHighlighter(query, FIELD_NAME, HighlighterTest.this, false);
highlighter.setTextFragmenter(new SimpleFragmenter(40));
@@ -1823,12 +1853,6 @@ public class HighlighterTest extends Bas
searchIndex();
}
- private Document doc( String f, String v ){
- Document doc = new Document();
- doc.add( new TextField( f, v, Field.Store.YES));
- return doc;
- }
-
private void makeIndex() throws IOException {
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false)));
writer.addDocument( doc( "t_text1", "random words for highlighting tests del" ) );
@@ -1867,6 +1891,34 @@ public class HighlighterTest extends Bas
reader.close();
}
+ /** If we have term vectors, we can highlight based on payloads */
+ public void testPayloadQuery() throws IOException, InvalidTokenOffsetsException {
+ final String text = "random words and words";//"words" at positions 1 & 4
+
+ Analyzer analyzer = new MockPayloadAnalyzer();//sets payload to "pos: X" (where X is position #)
+ try (IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
+ writer.deleteAll();
+ Document doc = new Document();
+
+ doc.add(new Field(FIELD_NAME, text, FIELD_TYPE_TV));
+ writer.addDocument(doc);
+ writer.commit();
+ }
+ try (IndexReader reader = DirectoryReader.open(dir)) {
+ Query query = new SpanPayloadCheckQuery(new SpanTermQuery(new Term(FIELD_NAME, "words")),
+ Collections.singleton("pos: 1".getBytes("UTF-8")));//just match the first "words" occurrence
+ IndexSearcher searcher = newSearcher(reader);
+ Scorer scorer = new QueryScorer(query, searcher.getIndexReader(), FIELD_NAME);
+ Highlighter h = new Highlighter(scorer);
+
+ TopDocs hits = searcher.search(query, null, 10);
+ assertEquals(1, hits.scoreDocs.length);
+ TokenStream stream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), 0, FIELD_NAME, analyzer);
+ String result = h.getBestFragment(stream, text);
+ assertEquals("random <B>words</B> and words", result);//only highlight the first "words" occurrence
+ }
+ }
+
/*
*
* public void testBigramAnalyzer() throws IOException, ParseException {
@@ -1934,14 +1986,21 @@ public class HighlighterTest extends Bas
public void setUp() throws Exception {
super.setUp();
+ //Not many use this setup:
a = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
- analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
dir = newDirectory();
+
+ //Most tests use this setup:
+ analyzer = new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET);
ramDir = newDirectory();
- IndexWriter writer = new IndexWriter(ramDir, newIndexWriterConfig(new MockAnalyzer(random(), MockTokenizer.SIMPLE, true, MockTokenFilter.ENGLISH_STOPSET)));
+ fieldType = random().nextBoolean() ? FIELD_TYPE_TV : TextField.TYPE_STORED;
+ IndexWriter writer = new IndexWriter(ramDir, newIndexWriterConfig(analyzer));
+
for (String text : texts) {
- addDoc(writer, text);
+ writer.addDocument(doc(FIELD_NAME, text));
}
+
+ // a few tests need other docs...:
Document doc = new Document();
doc.add(new IntField(NUMERIC_FIELD_NAME, 1, Field.Store.NO));
doc.add(new StoredField(NUMERIC_FIELD_NAME, 1));
@@ -1969,6 +2028,8 @@ public class HighlighterTest extends Bas
writer.forceMerge(1);
writer.close();
reader = DirectoryReader.open(ramDir);
+
+ //Misc:
numHighlights = 0;
}
@@ -1979,13 +2040,11 @@ public class HighlighterTest extends Bas
ramDir.close();
super.tearDown();
}
- private void addDoc(IndexWriter writer, String text) throws IOException {
- Document d = new Document();
-
- Field f = new TextField(FIELD_NAME, text, Field.Store.YES);
- d.add(f);
- writer.addDocument(d);
+ private Document doc(String name, String value) {
+ Document d = new Document();
+ d.add(new Field(name, value, fieldType));//fieldType is randomly chosen for term vectors in setUp
+ return d;
}
private static Token createToken(String term, int start, int offset)
@@ -2164,11 +2223,13 @@ final class SynonymTokenizer extends Tok
throws Exception {
for (int i = 0; i < hits.totalHits; i++) {
- String text = searcher.doc(hits.scoreDocs[i].doc).get(HighlighterTest.FIELD_NAME);
+ final StoredDocument doc = searcher.doc(hits.scoreDocs[i].doc);
+ String text = doc.get(HighlighterTest.FIELD_NAME);
int maxNumFragmentsRequired = 2;
String fragmentSeparator = "...";
Scorer scorer = null;
- TokenStream tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, text);
+ TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(),
+ hits.scoreDocs[i].doc, HighlighterTest.FIELD_NAME, doc, analyzer);
if (mode == QUERY) {
scorer = new QueryScorer(query);
} else if (mode == QUERY_TERM) {
@@ -2176,7 +2237,6 @@ final class SynonymTokenizer extends Tok
}
Highlighter highlighter = new Highlighter(formatter, scorer);
highlighter.setTextFragmenter(frag);
-
String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
fragmentSeparator);
if (LuceneTestCase.VERBOSE) System.out.println("\t" + result);