Posted to commits@lucene.apache.org by ds...@apache.org on 2016/11/15 21:17:04 UTC
[1/2] lucene-solr:master: LUCENE-7526: UnifiedHighlighter: enhance
MTQ passage relevancy. TokenStreamFromTermVector isn't used by the UH
anymore. Refactor AnalysisOffsetStrategy into TokenStream and MemoryIndex
strategies, and related refactorings from that.
Repository: lucene-solr
Updated Branches:
refs/heads/master 280cbfd8f -> 7af454ad7
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
index 5f09d84..ac5f0f6 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
@@ -117,6 +117,8 @@ public class UnifiedHighlighter {
private boolean defaultHighlightPhrasesStrictly = true; // AKA "accuracy" or "query debugging"
+ private boolean defaultPassageRelevancyOverSpeed = true; //For analysis, prefer MemoryIndexOffsetStrategy
+
// private boolean defaultRequireFieldMatch = true; TODO
private int maxLength = DEFAULT_MAX_LENGTH;
@@ -213,6 +215,12 @@ public class UnifiedHighlighter {
return defaultHighlightPhrasesStrictly;
}
+
+ protected boolean shouldPreferPassageRelevancyOverSpeed(String field) {
+ return defaultPassageRelevancyOverSpeed;
+ }
+
+
/**
* The maximum content size to process. Content will be truncated to this size before highlighting. Typically
* snippets closer to the beginning of the document better summarize its content.
@@ -716,8 +724,13 @@ public class UnifiedHighlighter {
}
protected FieldHighlighter getFieldHighlighter(String field, Query query, SortedSet<Term> allTerms, int maxPassages) {
+ BytesRef[] terms = filterExtractedTerms(field, allTerms);
+ Set<HighlightFlag> highlightFlags = getFlags(field);
+ PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
+ CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
+ OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata);
return new FieldHighlighter(field,
- getOffsetStrategy(field, query, allTerms),
+ getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags),
new SplittingBreakIterator(getBreakIterator(field), UnifiedHighlighter.MULTIVAL_SEP_CHAR),
getScorer(field),
maxPassages,
@@ -725,41 +738,7 @@ public class UnifiedHighlighter {
getFormatter(field));
}
- protected FieldOffsetStrategy getOffsetStrategy(String field, Query query, SortedSet<Term> allTerms) {
- EnumSet<HighlightFlag> highlightFlags = getFlags(field);
- BytesRef[] terms = filterExtractedTerms(field, allTerms);
- PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
- CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
- OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata);
- switch (offsetSource) {
- case ANALYSIS:
- return new AnalysisOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer(),
- this::preMultiTermQueryRewrite);
- case NONE_NEEDED:
- return NoOpOffsetStrategy.INSTANCE;
- case TERM_VECTORS:
- return new TermVectorOffsetStrategy(field, terms, phraseHelper, automata);
- case POSTINGS:
- return new PostingsOffsetStrategy(field, terms, phraseHelper, automata);
- case POSTINGS_WITH_TERM_VECTORS:
- return new PostingsWithTermVectorsOffsetStrategy(field, terms, phraseHelper, automata);
- default:
- throw new IllegalArgumentException("Unrecognized offset source " + offsetSource);
- }
- }
-
- protected EnumSet<HighlightFlag> getFlags(String field) {
- EnumSet<HighlightFlag> highlightFlags = EnumSet.noneOf(HighlightFlag.class);
- if (shouldHandleMultiTermQuery(field)) {
- highlightFlags.add(HighlightFlag.MULTI_TERM_QUERY);
- }
- if (shouldHighlightPhrasesStrictly(field)) {
- highlightFlags.add(HighlightFlag.PHRASES);
- }
- return highlightFlags;
- }
-
- protected BytesRef[] filterExtractedTerms(String field, SortedSet<Term> queryTerms) {
+ protected static BytesRef[] filterExtractedTerms(String field, SortedSet<Term> queryTerms) {
// TODO consider requireFieldMatch
Term floor = new Term(field, "");
Term ceiling = new Term(field, UnicodeUtil.BIG_TERM);
@@ -774,7 +753,21 @@ public class UnifiedHighlighter {
return terms;
}
- protected PhraseHelper getPhraseHelper(String field, Query query, EnumSet<HighlightFlag> highlightFlags) {
+ protected Set<HighlightFlag> getFlags(String field) {
+ Set<HighlightFlag> highlightFlags = EnumSet.noneOf(HighlightFlag.class);
+ if (shouldHandleMultiTermQuery(field)) {
+ highlightFlags.add(HighlightFlag.MULTI_TERM_QUERY);
+ }
+ if (shouldHighlightPhrasesStrictly(field)) {
+ highlightFlags.add(HighlightFlag.PHRASES);
+ }
+ if (shouldPreferPassageRelevancyOverSpeed(field)) {
+ highlightFlags.add(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED);
+ }
+ return highlightFlags;
+ }
+
+ protected PhraseHelper getPhraseHelper(String field, Query query, Set<HighlightFlag> highlightFlags) {
boolean highlightPhrasesStrictly = highlightFlags.contains(HighlightFlag.PHRASES);
boolean handleMultiTermQuery = highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY);
return highlightPhrasesStrictly ?
@@ -782,7 +775,7 @@ public class UnifiedHighlighter {
PhraseHelper.NONE;
}
- protected CharacterRunAutomaton[] getAutomata(String field, Query query, EnumSet<HighlightFlag> highlightFlags) {
+ protected CharacterRunAutomaton[] getAutomata(String field, Query query, Set<HighlightFlag> highlightFlags) {
return highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY)
? MultiTermHighlighting.extractAutomata(query, field, !highlightFlags.contains(HighlightFlag.PHRASES),
this::preMultiTermQueryRewrite)
@@ -790,11 +783,12 @@ public class UnifiedHighlighter {
}
protected OffsetSource getOptimizedOffsetSource(String field, BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata) {
+ OffsetSource offsetSource = getOffsetSource(field);
+
if (terms.length == 0 && automata.length == 0 && !phraseHelper.willRewrite()) {
return OffsetSource.NONE_NEEDED; //nothing to highlight
}
- OffsetSource offsetSource = getOffsetSource(field);
switch (offsetSource) {
case POSTINGS:
if (phraseHelper.willRewrite()) {
@@ -822,6 +816,32 @@ public class UnifiedHighlighter {
return offsetSource;
}
+ protected FieldOffsetStrategy getOffsetStrategy(OffsetSource offsetSource, String field, BytesRef[] terms,
+ PhraseHelper phraseHelper, CharacterRunAutomaton[] automata,
+ Set<HighlightFlag> highlightFlags) {
+ switch (offsetSource) {
+ case ANALYSIS:
+ if (!phraseHelper.hasPositionSensitivity() &&
+ !highlightFlags.contains(HighlightFlag.PASSAGE_RELEVANCY_OVER_SPEED)) {
+ //skip using a memory index since it's pure term filtering
+ return new TokenStreamOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer());
+ } else {
+ return new MemoryIndexOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer(),
+ this::preMultiTermQueryRewrite);
+ }
+ case NONE_NEEDED:
+ return NoOpOffsetStrategy.INSTANCE;
+ case TERM_VECTORS:
+ return new TermVectorOffsetStrategy(field, terms, phraseHelper, automata);
+ case POSTINGS:
+ return new PostingsOffsetStrategy(field, terms, phraseHelper, automata);
+ case POSTINGS_WITH_TERM_VECTORS:
+ return new PostingsWithTermVectorsOffsetStrategy(field, terms, phraseHelper, automata);
+ default:
+ throw new IllegalArgumentException("Unrecognized offset source " + offsetSource);
+ }
+ }
+
/**
* When highlighting phrases accurately, we need to know which {@link SpanQuery}'s need to have
* {@link Query#rewrite(IndexReader)} called on them. It helps performance to avoid it if it's not needed.
@@ -1041,10 +1061,9 @@ public class UnifiedHighlighter {
*/
public enum HighlightFlag {
PHRASES,
- MULTI_TERM_QUERY
+ MULTI_TERM_QUERY,
+ PASSAGE_RELEVANCY_OVER_SPEED
// TODO: ignoreQueryFields
// TODO: useQueryBoosts
- // TODO: avoidMemoryIndexIfPossible
- // TODO: preferMemoryIndexForStats
}
}
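For context, a minimal usage sketch of the result (not part of this patch; it assumes an IndexSearcher "searcher" and the index-time Analyzer "analyzer" are in scope, plus a "body" field the highlighter can obtain offsets for):

  Query query = new PrefixQuery(new Term("body", "bra"));
  TopDocs topDocs = searcher.search(query, 10);
  UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer);
  // multi-term (wildcard/prefix) matches now participate in passage relevancy
  String[] snippets = highlighter.highlight("body", query, topDocs);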
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java
index ddc9507..be0ff1b 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterMTQ.java
@@ -773,7 +773,40 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
ir.close();
}
- public void testTokenStreamIsClosed() throws IOException {
+ public void testWithMaxLenAndMultipleWildcardMatches() throws IOException {
+ RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer);
+
+ Field body = new Field("body", "", fieldType);
+ Document doc = new Document();
+ doc.add(body);
+
+ //tests interleaving of multiple wildcard matches with the CompositeOffsetsPostingsEnum
+ //In this case the CompositeOffsetsPostingsEnum will have an underlying PostingsEnum that jumps from pos 1 to 9 for bravo
+ //and a second with position 2 for Bravado
+ body.setStringValue("Alpha Bravo Bravado foo foo foo. Foo foo Alpha Bravo");
+ iw.addDocument(doc);
+
+ IndexReader ir = iw.getReader();
+ iw.close();
+
+ IndexSearcher searcher = newSearcher(ir);
+ UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer);
+ highlighter.setMaxLength(32);//a little past first sentence
+
+ BooleanQuery query = new BooleanQuery.Builder()
+ .add(new TermQuery(new Term("body", "alpha")), BooleanClause.Occur.MUST)
+ .add(new PrefixQuery(new Term("body", "bra")), BooleanClause.Occur.MUST)
+ .build();
+ TopDocs topDocs = searcher.search(query, 10, Sort.INDEXORDER);
+ String snippets[] = highlighter.highlight("body", query, topDocs, 2);//ask for 2 but we'll only get 1
+ assertArrayEquals(
+ new String[]{"<b>Alpha</b> <b>Bravo</b> <b>Bravado</b> foo foo foo."}, snippets
+ );
+
+ ir.close();
+ }
+
+ public void testTokenStreamIsClosed() throws Exception {
// note: test is a derivative of testWithMaxLen()
RandomIndexWriter iw = new RandomIndexWriter(random(), dir, indexAnalyzer);
@@ -828,8 +861,8 @@ public class TestUnifiedHighlighterMTQ extends LuceneTestCase {
if (fieldType == UHTestHelper.reanalysisType) {
fail("Expecting EXPECTED IOException");
}
- } catch (IOException e) {
- if (!e.getMessage().equals("EXPECTED")) {
+ } catch (Exception e) {
+ if (!e.getMessage().contains("EXPECTED")) {
throw e;
}
}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterRanking.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterRanking.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterRanking.java
index bc2a14d..64570ae 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterRanking.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/TestUnifiedHighlighterRanking.java
@@ -50,9 +50,8 @@ public class TestUnifiedHighlighterRanking extends LuceneTestCase {
Analyzer indexAnalyzer;
- // note: don't choose reanalysis because it doesn't always know the term frequency, which is a statistic used
- // in passage ranking. Sometimes it does (e.g. when it builds a MemoryIndex) but not necessarily.
- final FieldType fieldType = UHTestHelper.randomFieldType(random(), UHTestHelper.postingsType, UHTestHelper.tvType);
+ // note: all offset sources, by default, use term freq, so it shouldn't matter which we choose.
+ final FieldType fieldType = UHTestHelper.randomFieldType(random());
/**
* indexes a bunch of gibberish, and then highlights top(n).
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java
index 641a835..d150940 100644
--- a/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java
+++ b/lucene/highlighter/src/test/org/apache/lucene/search/uhighlight/visibility/TestUnifiedHighlighterExtensibility.java
@@ -22,11 +22,13 @@ import java.text.BreakIterator;
import java.util.Collections;
import java.util.List;
import java.util.Map;
+import java.util.Set;
import java.util.SortedSet;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.search.IndexSearcher;
@@ -68,6 +70,11 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase {
return Collections.emptyList();
}
+ @Override
+ protected List<OffsetsEnum> createOffsetsEnumsFromReader(LeafReader leafReader, int doc) throws IOException {
+ return super.createOffsetsEnumsFromReader(leafReader, doc);
+ }
+
};
assertEquals(offsetSource, strategy.getOffsetSource());
}
@@ -142,8 +149,8 @@ public class TestUnifiedHighlighterExtensibility extends LuceneTestCase {
}
@Override
- protected FieldOffsetStrategy getOffsetStrategy(String field, Query query, SortedSet<Term> allTerms) {
- return super.getOffsetStrategy(field, query, allTerms);
+ protected FieldOffsetStrategy getOffsetStrategy(OffsetSource offsetSource, String field, BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata, Set<HighlightFlag> highlightFlags) {
+ return super.getOffsetStrategy(offsetSource, field, terms, phraseHelper, automata, highlightFlags);
}
@Override
[2/2] lucene-solr:master: LUCENE-7526: UnifiedHighlighter: enhance
MTQ passage relevancy. TokenStreamFromTermVector isn't used by the UH
anymore. Refactor AnalysisOffsetStrategy into TokenStream and MemoryIndex
strategies, and related refactorings from that.
Posted by ds...@apache.org.
LUCENE-7526: UnifiedHighlighter: enhance MTQ passage relevancy. TokenStreamFromTermVector isn't used by the UH anymore. Refactor AnalysisOffsetStrategy into TokenStream and MemoryIndex strategies, and related refactorings from that.
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/7af454ad
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/7af454ad
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/7af454ad
Branch: refs/heads/master
Commit: 7af454ad767c3a0364757d6fcf55bff9f063febe
Parents: 280cbfd
Author: David Smiley <ds...@apache.org>
Authored: Tue Nov 15 16:16:46 2016 -0500
Committer: David Smiley <ds...@apache.org>
Committed: Tue Nov 15 16:16:46 2016 -0500
----------------------------------------------------------------------
lucene/CHANGES.txt | 5 +
lucene/benchmark/conf/highlighters-postings.alg | 4 +-
lucene/benchmark/conf/highlighters-tv.alg | 2 +-
.../uhighlight/AnalysisOffsetStrategy.java | 261 ++++++------
.../CompositeOffsetsPostingsEnum.java | 145 +++++++
.../search/uhighlight/FieldOffsetStrategy.java | 115 ++++--
.../uhighlight/MemoryIndexOffsetStrategy.java | 129 ++++++
.../uhighlight/MultiTermHighlighting.java | 190 ---------
.../uhighlight/MultiValueTokenStream.java | 148 -------
.../lucene/search/uhighlight/OffsetsEnum.java | 1 +
.../lucene/search/uhighlight/Passage.java | 2 +-
.../lucene/search/uhighlight/PhraseHelper.java | 2 +-
.../uhighlight/PostingsOffsetStrategy.java | 3 +-
.../PostingsWithTermVectorsOffsetStrategy.java | 8 +-
.../uhighlight/TermVectorOffsetStrategy.java | 15 +-
.../uhighlight/TokenStreamFromTermVector.java | 395 -------------------
.../uhighlight/TokenStreamOffsetStrategy.java | 173 ++++++++
.../search/uhighlight/UnifiedHighlighter.java | 103 +++--
.../uhighlight/TestUnifiedHighlighterMTQ.java | 39 +-
.../TestUnifiedHighlighterRanking.java | 5 +-
.../TestUnifiedHighlighterExtensibility.java | 11 +-
21 files changed, 763 insertions(+), 993 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 5182276..a6c6dbe 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -85,6 +85,11 @@ Improvements
* LUCENE-7524: Added more detailed explanation of how IDF is computed in
ClassicSimilarity and BM25Similarity. (Adrien Grand)
+* LUCENE-7526: Enhanced UnifiedHighlighter's passage relevancy for queries with
+ wildcards and sometimes just terms. Added shouldPreferPassageRelevancyOverSpeed()
+ which can be overridden to return false to eke out more speed in some cases.
+ (Timothy M. Rodriguez, David Smiley)
+
Other
* LUCENE-7546: Fixed references to benchmark wikipedia data and the Jenkins line-docs file
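To opt out per field, a subclass can override the new hook; a minimal sketch (assuming "searcher" and "indexAnalyzer" are in scope):

  UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, indexAnalyzer) {
    @Override
    protected boolean shouldPreferPassageRelevancyOverSpeed(String field) {
      // skips the MemoryIndex (uses TokenStreamOffsetStrategy) when the query
      // isn't position-sensitive, trading some MTQ passage relevancy for speed
      return false;
    }
  };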
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/benchmark/conf/highlighters-postings.alg
----------------------------------------------------------------------
diff --git a/lucene/benchmark/conf/highlighters-postings.alg b/lucene/benchmark/conf/highlighters-postings.alg
index cf9df11..610908f 100644
--- a/lucene/benchmark/conf/highlighters-postings.alg
+++ b/lucene/benchmark/conf/highlighters-postings.alg
@@ -34,7 +34,7 @@ content.source=org.apache.lucene.benchmark.byTask.feeds.EnwikiContentSource
docs.file=temp/enwiki-20070527-pages-articles.xml.bz2
query.maker=org.apache.lucene.benchmark.byTask.feeds.FileBasedQueryMaker
-file.query.maker.file=conf/query-phrases.txt
+file.query.maker.file=conf/query-terms.txt
log.queries=false
log.step.SearchTravRetHighlight=-1
@@ -55,7 +55,7 @@ highlighter=HlImpl:NONE:SH_A:UH_A:PH_P:UH_P:UH_PV
{ "Warm" SearchTravRetHighlight > : 1000
- { "HL" SearchTravRetHighlight > : 500
+ { "HL" SearchTravRetHighlight > : 2000
CloseReader
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/benchmark/conf/highlighters-tv.alg
----------------------------------------------------------------------
diff --git a/lucene/benchmark/conf/highlighters-tv.alg b/lucene/benchmark/conf/highlighters-tv.alg
index 1e51018..26b64a3 100644
--- a/lucene/benchmark/conf/highlighters-tv.alg
+++ b/lucene/benchmark/conf/highlighters-tv.alg
@@ -54,7 +54,7 @@ highlighter=HlImpl:NONE:SH_V:FVH_V:UH_V
{ "Warm" SearchTravRetHighlight > : 1000
- { "HL" SearchTravRetHighlight > : 500
+ { "HL" SearchTravRetHighlight > : 2000
CloseReader
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java
index 6b4cc74..e9db77c 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/AnalysisOffsetStrategy.java
@@ -17,181 +17,154 @@
package org.apache.lucene.search.uhighlight;
import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.Collections;
-import java.util.List;
-import java.util.function.Function;
import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.FilteringTokenFilter;
+import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.LeafReader;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.memory.MemoryIndex;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
-
/**
- * Uses an {@link Analyzer} on content to get offsets. It may use a {@link MemoryIndex} too.
+ * Provides a base class for analysis based offset strategies to extend from.
+ * Requires an Analyzer and provides an override-able method for altering how
+ * the TokenStream is created.
*
* @lucene.internal
*/
-public class AnalysisOffsetStrategy extends FieldOffsetStrategy {
-
- //TODO: Consider splitting this highlighter into a MemoryIndexFieldHighlighter and a TokenStreamFieldHighlighter
- private static final BytesRef[] ZERO_LEN_BYTES_REF_ARRAY = new BytesRef[0];
- private final Analyzer analyzer;
- private final MemoryIndex memoryIndex;
- private final LeafReader leafReader;
- private final CharacterRunAutomaton preMemIndexFilterAutomaton;
-
- public AnalysisOffsetStrategy(String field, BytesRef[] extractedTerms, PhraseHelper phraseHelper,
- CharacterRunAutomaton[] automata, Analyzer analyzer,
- Function<Query, Collection<Query>> multiTermQueryRewrite) {
- super(field, extractedTerms, phraseHelper, automata);
- this.analyzer = analyzer;
- // Automata (Wildcards / MultiTermQuery):
- this.automata = automata;
+public abstract class AnalysisOffsetStrategy extends FieldOffsetStrategy {
- if (terms.length > 0 && !strictPhrases.hasPositionSensitivity()) {
- this.automata = convertTermsToAutomata(terms, automata);
- // clear the terms array now that we've moved them to be expressed as automata
- terms = ZERO_LEN_BYTES_REF_ARRAY;
- }
+ protected final Analyzer analyzer;
- if (terms.length > 0 || strictPhrases.willRewrite()) { //needs MemoryIndex
- // init MemoryIndex
- boolean storePayloads = strictPhrases.hasPositionSensitivity(); // might be needed
- memoryIndex = new MemoryIndex(true, storePayloads);//true==store offsets
- leafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader();
- // preFilter for MemoryIndex
- preMemIndexFilterAutomaton = buildCombinedAutomaton(field, terms, this.automata, strictPhrases,
- multiTermQueryRewrite);
- } else {
- memoryIndex = null;
- leafReader = null;
- preMemIndexFilterAutomaton = null;
+ public AnalysisOffsetStrategy(String field, BytesRef[] queryTerms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata, Analyzer analyzer) {
+ super(field, queryTerms, phraseHelper, automata);
+ this.analyzer = analyzer;
+ if (analyzer.getOffsetGap(field) != 1) { // note: 1 is the default. It is RARELY changed.
+ throw new IllegalArgumentException(
+ "offset gap of the provided analyzer should be 1 (field " + field + ")");
}
-
}
@Override
- public UnifiedHighlighter.OffsetSource getOffsetSource() {
+ public final UnifiedHighlighter.OffsetSource getOffsetSource() {
return UnifiedHighlighter.OffsetSource.ANALYSIS;
}
- @Override
- public List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException {
- // note: don't need LimitTokenOffsetFilter since content is already truncated to maxLength
- TokenStream tokenStream = tokenStream(content);
-
- if (memoryIndex != null) { // also handles automata.length > 0
- // We use a MemoryIndex and index the tokenStream so that later we have the PostingsEnum with offsets.
-
- // note: An *alternative* strategy is to get PostingsEnums without offsets from the main index
- // and then marry this up with a fake PostingsEnum backed by a TokenStream (which has the offsets) and
- // can use that to filter applicable tokens? It would have the advantage of being able to exit
- // early and save some re-analysis. This would be an additional method/offset-source approach
- // since it's still useful to highlight without any index (so we build MemoryIndex).
-
- // note: probably unwise to re-use TermsEnum on reset mem index so we don't. But we do re-use the
- // leaf reader, which is a bit more top level than in the guts.
- memoryIndex.reset();
-
- // Filter the tokenStream to applicable terms
- if (preMemIndexFilterAutomaton != null) {
- tokenStream = newKeepWordFilter(tokenStream, preMemIndexFilterAutomaton);
- }
- memoryIndex.addField(field, tokenStream);//note: calls tokenStream.reset() & close()
- tokenStream = null; // it's consumed; done.
- docId = 0;
-
- if (automata.length > 0) {
- Terms foundTerms = leafReader.terms(field);
- if (foundTerms == null) {
- return Collections.emptyList(); //No offsets for this field.
- }
- // Un-invert for the automata. Much more compact than a CachingTokenStream
- tokenStream = MultiTermHighlighting.uninvertAndFilterTerms(foundTerms, 0, automata, content.length());
- }
-
- }
-
- return createOffsetsEnums(leafReader, docId, tokenStream);
- }
-
protected TokenStream tokenStream(String content) throws IOException {
- return MultiValueTokenStream.wrap(field, analyzer, content, UnifiedHighlighter.MULTIVAL_SEP_CHAR);
- }
-
- private static CharacterRunAutomaton[] convertTermsToAutomata(BytesRef[] terms, CharacterRunAutomaton[] automata) {
- CharacterRunAutomaton[] newAutomata = new CharacterRunAutomaton[terms.length + automata.length];
- for (int i = 0; i < terms.length; i++) {
- newAutomata[i] = MultiTermHighlighting.makeStringMatchAutomata(terms[i]);
+ // If there is no splitChar in content then we needn't wrap:
+ int splitCharIdx = content.indexOf(UnifiedHighlighter.MULTIVAL_SEP_CHAR);
+ if (splitCharIdx == -1) {
+ return analyzer.tokenStream(field, content);
}
- // Append existing automata (that which is used for MTQs)
- System.arraycopy(automata, 0, newAutomata, terms.length, automata.length);
- return newAutomata;
- }
- private static FilteringTokenFilter newKeepWordFilter(final TokenStream tokenStream,
- final CharacterRunAutomaton charRunAutomaton) {
- // it'd be nice to use KeepWordFilter but it demands a CharArraySet. TODO File JIRA? Need a new interface?
- return new FilteringTokenFilter(tokenStream) {
- final CharTermAttribute charAtt = addAttribute(CharTermAttribute.class);
+ TokenStream subTokenStream = analyzer.tokenStream(field, content.substring(0, splitCharIdx));
- @Override
- protected boolean accept() throws IOException {
- return charRunAutomaton.run(charAtt.buffer(), 0, charAtt.length());
- }
- };
+ return new MultiValueTokenStream(subTokenStream, field, analyzer, content, UnifiedHighlighter.MULTIVAL_SEP_CHAR, splitCharIdx);
}
-
/**
- * Build one {@link CharacterRunAutomaton} matching any term the query might match.
+ * Wraps an {@link Analyzer} and string text that represents multiple values delimited by a specified character. This
+ * exposes a TokenStream that matches what would get indexed considering the
+ * {@link Analyzer#getPositionIncrementGap(String)}. Currently this assumes {@link Analyzer#getOffsetGap(String)} is
+ * 1; an exception will be thrown if it isn't.
+ * <br />
+ * It would be more orthogonal for this to be an Analyzer since we're wrapping an Analyzer but doing so seems like
+ * more work. The underlying components see a Reader not a String -- and the String is easy to
+ * split up without redundant buffering.
+ *
+ * @lucene.internal
*/
- private static CharacterRunAutomaton buildCombinedAutomaton(String field, BytesRef[] terms,
- CharacterRunAutomaton[] automata,
- PhraseHelper strictPhrases,
- Function<Query, Collection<Query>> multiTermQueryRewrite) {
- List<CharacterRunAutomaton> allAutomata = new ArrayList<>();
- if (terms.length > 0) {
- allAutomata.add(new CharacterRunAutomaton(Automata.makeStringUnion(Arrays.asList(terms))));
- }
- Collections.addAll(allAutomata, automata);
- for (SpanQuery spanQuery : strictPhrases.getSpanQueries()) {
- Collections.addAll(allAutomata,
- MultiTermHighlighting.extractAutomata(spanQuery, field, true, multiTermQueryRewrite));//true==lookInSpan
+ private static final class MultiValueTokenStream extends TokenFilter {
+
+ private final String fieldName;
+ private final Analyzer indexAnalyzer;
+ private final String content;
+ private final char splitChar;
+
+ private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+
+ private int startValIdx = 0;
+ private int endValIdx;
+ private int remainingPosInc = 0;
+
+ private MultiValueTokenStream(TokenStream subTokenStream, String fieldName, Analyzer indexAnalyzer,
+ String content, char splitChar, int splitCharIdx) {
+ super(subTokenStream); // subTokenStream is already initialized to operate on the first value
+ this.fieldName = fieldName;
+ this.indexAnalyzer = indexAnalyzer;
+ this.content = content;
+ this.splitChar = splitChar;
+ this.endValIdx = splitCharIdx;
}
- if (allAutomata.size() == 1) {
- return allAutomata.get(0);
+ @Override
+ public void reset() throws IOException {
+ if (startValIdx != 0) {
+ throw new IllegalStateException("This TokenStream wasn't developed to be re-used.");
+ // ... although we could if a need for it arises.
+ }
+ super.reset();
}
- //TODO it'd be nice if we could get at the underlying Automaton in CharacterRunAutomaton so that we
- // could union them all. But it's not exposed, and note TermRangeQuery isn't modelled as an Automaton
- // by MultiTermHighlighting.
-
- // Return an aggregate CharacterRunAutomaton of others
- return new CharacterRunAutomaton(Automata.makeEmpty()) {// the makeEmpty() is bogus; won't be used
- @Override
- public boolean run(char[] chars, int offset, int length) {
- for (int i = 0; i < allAutomata.size(); i++) {// don't use foreach to avoid Iterator allocation
- if (allAutomata.get(i).run(chars, offset, length)) {
- return true;
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ while (true) {
+
+ if (input.incrementToken()) {
+ // Position tracking:
+ if (remainingPosInc > 0) {//usually true for the first token of additional values (not the first value)
+ posIncAtt.setPositionIncrement(remainingPosInc + posIncAtt.getPositionIncrement());
+ remainingPosInc = 0;//reset
}
+ // Offset tracking:
+ offsetAtt.setOffset(
+ startValIdx + offsetAtt.startOffset(),
+ startValIdx + offsetAtt.endOffset()
+ );
+ return true;
+ }
+
+ if (endValIdx == content.length()) {//no more
+ return false;
}
- return false;
- }
- };
- }
+ input.end(); // might adjust position increment
+ remainingPosInc += posIncAtt.getPositionIncrement();
+ input.close();
+ remainingPosInc += indexAnalyzer.getPositionIncrementGap(fieldName);
+
+ // Get new tokenStream based on next segment divided by the splitChar
+ startValIdx = endValIdx + 1;
+ endValIdx = content.indexOf(splitChar, startValIdx);
+ if (endValIdx == -1) {//EOF
+ endValIdx = content.length();
+ }
+ TokenStream tokenStream = indexAnalyzer.tokenStream(fieldName, content.substring(startValIdx, endValIdx));
+ if (tokenStream != input) {// (input is defined in TokenFilter set in the constructor)
+ // This is a grand trick we do -- knowing that the analyzer's re-use strategy is going to produce the
+ // very same tokenStream instance and thus have the same AttributeSource as this wrapping TokenStream
+ // since we used it as our input in the constructor.
+ // Were this not the case, we'd have to copy every attribute of interest since we can't alter the
+ // AttributeSource of this wrapping TokenStream post-construction (it's all private/final).
+ // If this is a problem, we could do that instead; maybe with a custom CharTermAttribute that allows
+ // us to easily set the char[] reference without literally copying char by char.
+ throw new IllegalStateException("Require TokenStream re-use. Unsupported re-use strategy?: " +
+ indexAnalyzer.getReuseStrategy());
+ }
+ tokenStream.reset();
+ } // while loop to increment token of this new value
+ }
+
+ @Override
+ public void end() throws IOException {
+ super.end();
+ // Offset tracking:
+ offsetAtt.setOffset(
+ startValIdx + offsetAtt.startOffset(),
+ startValIdx + offsetAtt.endOffset());
+ }
+
+ }
}
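The offset arithmetic above can be seen with a plain token stream; a self-contained sketch (assuming StandardAnalyzer is on the classpath; not part of this patch) printing the per-token offsets that MultiValueTokenStream shifts by startValIdx for each value after the first:

  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.analysis.TokenStream;
  import org.apache.lucene.analysis.standard.StandardAnalyzer;
  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

  public class TokenOffsetsDemo {
    public static void main(String[] args) throws Exception {
      Analyzer analyzer = new StandardAnalyzer();
      // One value shown; MultiValueTokenStream splices a stream like this per value,
      // adding the position increment gap between values.
      try (TokenStream ts = analyzer.tokenStream("body", "Alpha Bravo")) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
        ts.reset();
        while (ts.incrementToken()) { // prints alpha [0-5), bravo [6-11)
          System.out.println(termAtt + " [" + offsetAtt.startOffset() + "-" + offsetAtt.endOffset() + ")");
        }
        ts.end();
      }
    }
  }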
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/CompositeOffsetsPostingsEnum.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/CompositeOffsetsPostingsEnum.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/CompositeOffsetsPostingsEnum.java
new file mode 100644
index 0000000..356f553
--- /dev/null
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/CompositeOffsetsPostingsEnum.java
@@ -0,0 +1,145 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.uhighlight;
+
+import java.io.IOException;
+import java.util.List;
+
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.PriorityQueue;
+
+/**
+ * Provides a view over several underlying PostingsEnums for the iteration of offsets on the current document only.
+ * It's not general purpose; the position returned is always -1 and it doesn't iterate the documents.
+ */
+final class CompositeOffsetsPostingsEnum extends PostingsEnum {
+
+ private final int docId;
+ private final int freq;
+ private final PriorityQueue<BoundsCheckingPostingsEnum> queue;
+ private boolean firstPositionConsumed = false;
+
+ /**
+ * This class is used to ensure we don't over iterate the underlying
+ * postings enum by keeping track of the position relative to the
+ * frequency.
+ * Ideally this would've been an implementation of a PostingsEnum
+ * but it would have to delegate most methods and it seemed easier
+ * to just wrap the tweaked method.
+ */
+ private static final class BoundsCheckingPostingsEnum {
+
+ private final PostingsEnum postingsEnum;
+ private int remainingPositions;
+
+ BoundsCheckingPostingsEnum(PostingsEnum postingsEnum) throws IOException {
+ this.postingsEnum = postingsEnum;
+ this.remainingPositions = postingsEnum.freq();
+ nextPosition();
+ }
+
+ /** Advances to the next position and returns true, or returns false if it can't. */
+ private boolean nextPosition() throws IOException {
+ if (remainingPositions-- > 0) {
+ postingsEnum.nextPosition(); // ignore the actual position; we don't care.
+ return true;
+ } else {
+ return false;
+ }
+ }
+
+ }
+
+ /** The provided {@link PostingsEnum}s must all be positioned to the same document, and must have offsets. */
+ CompositeOffsetsPostingsEnum(List<PostingsEnum> postingsEnums) throws IOException {
+ queue = new PriorityQueue<BoundsCheckingPostingsEnum>(postingsEnums.size()) {
+ @Override
+ protected boolean lessThan(BoundsCheckingPostingsEnum a, BoundsCheckingPostingsEnum b) {
+ try {
+ return a.postingsEnum.startOffset() < b.postingsEnum.startOffset();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+ };
+
+ int freqAdd = 0;
+ for (PostingsEnum postingsEnum : postingsEnums) {
+ queue.add(new BoundsCheckingPostingsEnum(postingsEnum));
+ freqAdd += postingsEnum.freq();
+ }
+ freq = freqAdd;
+ this.docId = queue.top().postingsEnum.docID();
+ }
+
+ @Override
+ public int freq() throws IOException {
+ return freq;
+ }
+
+ /** Advances to the next position. Always returns -1; for highlighting purposes the caller is assumed not to care about the actual position. */
+ @Override
+ public int nextPosition() throws IOException {
+ if (!firstPositionConsumed) {
+ firstPositionConsumed = true;
+ } else if (queue.size() == 0) {
+ throw new IllegalStateException("nextPosition called too many times");
+ } else if (queue.top().nextPosition()) { // advance head
+ queue.updateTop(); //the new position may be behind another postingsEnum in the queue
+ } else {
+ queue.pop(); //this postingsEnum is consumed; get rid of it. Another will take its place.
+ }
+ assert queue.size() > 0;
+ return -1;
+ }
+
+ @Override
+ public int startOffset() throws IOException {
+ return queue.top().postingsEnum.startOffset();
+ }
+
+ @Override
+ public int endOffset() throws IOException {
+ return queue.top().postingsEnum.endOffset();
+ }
+
+ @Override
+ public BytesRef getPayload() throws IOException {
+ return queue.top().postingsEnum.getPayload();
+ }
+
+ @Override
+ public int docID() {
+ return docId;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ return NO_MORE_DOCS;
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ return NO_MORE_DOCS;
+ }
+
+ @Override
+ public long cost() {
+ return 1L; //at most 1 doc is returned
+ }
+}
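The queue discipline (smallest pending startOffset on top; advance-and-updateTop, or pop when exhausted) can be illustrated with a toy merge over plain int offsets, mirroring the bravo (positions 1 and 9) / bravado (position 2) interleaving in the new test; "Cursor" here is a hypothetical stand-in for BoundsCheckingPostingsEnum:

  import org.apache.lucene.util.PriorityQueue;

  public class OffsetMergeDemo {
    static class Cursor {
      final int[] offsets; int i = 0;
      Cursor(int... offsets) { this.offsets = offsets; }
      int current() { return offsets[i]; }
      boolean advance() { return ++i < offsets.length; }
    }

    public static void main(String[] args) {
      PriorityQueue<Cursor> queue = new PriorityQueue<Cursor>(2) {
        @Override
        protected boolean lessThan(Cursor a, Cursor b) {
          return a.current() < b.current();
        }
      };
      queue.add(new Cursor(1, 9)); // e.g. "bravo"
      queue.add(new Cursor(2));    // e.g. "bravado"
      while (queue.size() > 0) {
        System.out.println(queue.top().current()); // prints 1, 2, 9 in order
        if (queue.top().advance()) {
          queue.updateTop(); // the new value may now be behind the other cursor
        } else {
          queue.pop();       // this cursor is exhausted
        }
      }
    }
  }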
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java
index 04df31e..155f0a7 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/FieldOffsetStrategy.java
@@ -14,16 +14,14 @@
* See the License for the specific language governing permissions and
* limitations under the License.
*/
-
package org.apache.lucene.search.uhighlight;
-import java.io.Closeable;
import java.io.IOException;
import java.util.ArrayList;
+import java.util.Collections;
import java.util.List;
import java.util.Map;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.PostingsEnum;
@@ -31,6 +29,7 @@ import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.spans.Spans;
import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.automaton.CharacterRunAutomaton;
/**
@@ -42,14 +41,14 @@ import org.apache.lucene.util.automaton.CharacterRunAutomaton;
public abstract class FieldOffsetStrategy {
protected final String field;
- protected BytesRef[] terms; // Query: free-standing terms
- protected PhraseHelper strictPhrases; // Query: position-sensitive information TODO: rename
- protected CharacterRunAutomaton[] automata; // Query: free-standing wildcards (multi-term query)
+ protected final PhraseHelper phraseHelper; // Query: position-sensitive information TODO: rename
+ protected final BytesRef[] terms; // Query: free-standing terms
+ protected final CharacterRunAutomaton[] automata; // Query: free-standing wildcards (multi-term query)
public FieldOffsetStrategy(String field, BytesRef[] queryTerms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata) {
this.field = field;
this.terms = queryTerms;
- this.strictPhrases = phraseHelper;
+ this.phraseHelper = phraseHelper;
this.automata = automata;
}
@@ -65,58 +64,90 @@ public abstract class FieldOffsetStrategy {
*/
public abstract List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException;
- protected List<OffsetsEnum> createOffsetsEnums(LeafReader leafReader, int doc, TokenStream tokenStream) throws IOException {
- List<OffsetsEnum> offsetsEnums = createOffsetsEnumsFromReader(leafReader, doc);
- if (automata.length > 0) {
- offsetsEnums.add(createOffsetsEnumFromTokenStream(doc, tokenStream));
+ protected List<OffsetsEnum> createOffsetsEnumsFromReader(LeafReader leafReader, int doc) throws IOException {
+ final Terms termsIndex = leafReader.terms(field);
+ if (termsIndex == null) {
+ return Collections.emptyList();
}
- return offsetsEnums;
- }
- protected List<OffsetsEnum> createOffsetsEnumsFromReader(LeafReader atomicReader, int doc) throws IOException {
// For strict positions, get a Map of term to Spans:
// note: ScriptPhraseHelper.NONE does the right thing for these method calls
final Map<BytesRef, Spans> strictPhrasesTermToSpans =
- strictPhrases.getTermToSpans(atomicReader, doc);
+ phraseHelper.getTermToSpans(leafReader, doc);
// Usually simply wraps terms in a List; but if willRewrite() then can be expanded
final List<BytesRef> sourceTerms =
- strictPhrases.expandTermsIfRewrite(terms, strictPhrasesTermToSpans);
+ phraseHelper.expandTermsIfRewrite(terms, strictPhrasesTermToSpans);
- final List<OffsetsEnum> offsetsEnums = new ArrayList<>(sourceTerms.size() + 1);
+ final List<OffsetsEnum> offsetsEnums = new ArrayList<>(sourceTerms.size() + automata.length);
- Terms termsIndex = atomicReader == null || sourceTerms.isEmpty() ? null : atomicReader.terms(field);
- if (termsIndex != null) {
+ // Handle sourceTerms:
+ if (!sourceTerms.isEmpty()) {
TermsEnum termsEnum = termsIndex.iterator();//does not return null
for (BytesRef term : sourceTerms) {
- if (!termsEnum.seekExact(term)) {
- continue; // term not found
- }
- PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
- if (postingsEnum == null) {
- // no offsets or positions available
- throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
- }
- if (doc != postingsEnum.advance(doc)) { // now it's positioned, although may be exhausted
- continue;
+ if (termsEnum.seekExact(term)) {
+ PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
+
+ if (postingsEnum == null) {
+ // no offsets or positions available
+ throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
+ }
+
+ if (doc == postingsEnum.advance(doc)) { // now it's positioned, although may be exhausted
+ postingsEnum = phraseHelper.filterPostings(term, postingsEnum, strictPhrasesTermToSpans.get(term));
+ if (postingsEnum != null) {
+ offsetsEnums.add(new OffsetsEnum(term, postingsEnum));
+ }
+ }
}
- postingsEnum = strictPhrases.filterPostings(term, postingsEnum, strictPhrasesTermToSpans.get(term));
- if (postingsEnum == null) {
- continue;// completely filtered out
+ }
+ }
+
+ // Handle automata
+ if (automata.length > 0) {
+ offsetsEnums.addAll(createAutomataOffsetsFromTerms(termsIndex, doc));
+ }
+
+ return offsetsEnums;
+ }
+
+ protected List<OffsetsEnum> createAutomataOffsetsFromTerms(Terms termsIndex, int doc) throws IOException {
+ List<List<PostingsEnum>> automataPostings = new ArrayList<>(automata.length);
+ for (int i = 0; i < automata.length; i++) {
+ automataPostings.add(new ArrayList<>());
+ }
+
+ TermsEnum termsEnum = termsIndex.iterator();
+ BytesRef term;
+ CharsRefBuilder refBuilder = new CharsRefBuilder();
+ while ((term = termsEnum.next()) != null) {
+ for (int i = 0; i < automata.length; i++) {
+ CharacterRunAutomaton automaton = automata[i];
+ refBuilder.copyUTF8Bytes(term);
+ if (automaton.run(refBuilder.chars(), 0, refBuilder.length())) {
+ PostingsEnum postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
+ if (doc == postings.advance(doc)) {
+ automataPostings.get(i).add(postings);
+ }
}
+ }
+ }
- offsetsEnums.add(new OffsetsEnum(term, postingsEnum));
+ List<OffsetsEnum> offsetsEnums = new ArrayList<>(automata.length); //will be at most this long
+ for (int i = 0; i < automata.length; i++) {
+ CharacterRunAutomaton automaton = automata[i];
+ List<PostingsEnum> postingsEnums = automataPostings.get(i);
+ int size = postingsEnums.size();
+ if (size > 0) { //only add if we have offsets
+ BytesRef wildcardTerm = new BytesRef(automaton.toString());
+ if (size == 1) { //don't wrap in a composite if there's only one OffsetsEnum
+ offsetsEnums.add(new OffsetsEnum(wildcardTerm, postingsEnums.get(0)));
+ } else {
+ offsetsEnums.add(new OffsetsEnum(wildcardTerm, new CompositeOffsetsPostingsEnum(postingsEnums)));
+ }
}
}
+
return offsetsEnums;
}
- protected OffsetsEnum createOffsetsEnumFromTokenStream(int doc, TokenStream tokenStream) throws IOException {
- // if there are automata (MTQ), we have to initialize the "fake" enum wrapping them.
- assert tokenStream != null;
- // TODO Opt: we sometimes evaluate the automata twice when this TS isn't the original; can we avoid?
- PostingsEnum mtqPostingsEnum = MultiTermHighlighting.getDocsEnum(tokenStream, automata);
- assert mtqPostingsEnum instanceof Closeable; // FYI we propagate close() later.
- mtqPostingsEnum.advance(doc);
- return new OffsetsEnum(null, mtqPostingsEnum);
- }
}
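The automaton matching in createAutomataOffsetsFromTerms amounts to running each indexed term through each CharacterRunAutomaton; a sketch (not part of this patch) with a hand-built "bra*" prefix automaton:

  import java.util.Arrays;

  import org.apache.lucene.util.automaton.Automata;
  import org.apache.lucene.util.automaton.CharacterRunAutomaton;
  import org.apache.lucene.util.automaton.Operations;

  public class AutomatonMatchDemo {
    public static void main(String[] args) {
      CharacterRunAutomaton prefix = new CharacterRunAutomaton(
          Operations.concatenate(Automata.makeString("bra"), Automata.makeAnyString()));
      for (String term : Arrays.asList("bravo", "bravado", "alpha")) {
        // bravo -> true, bravado -> true, alpha -> false
        System.out.println(term + " -> " + prefix.run(term));
      }
    }
  }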
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java
new file mode 100644
index 0000000..4028912
--- /dev/null
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MemoryIndexOffsetStrategy.java
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.uhighlight;
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.Collections;
+import java.util.List;
+import java.util.function.Function;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.FilteringTokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.memory.MemoryIndex;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.Automata;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+
+
+/**
+ * Uses an {@link Analyzer} on content to get offsets and then populates a {@link MemoryIndex}.
+ *
+ * @lucene.internal
+ */
+public class MemoryIndexOffsetStrategy extends AnalysisOffsetStrategy {
+
+ private final MemoryIndex memoryIndex;
+ private final LeafReader leafReader;
+ private final CharacterRunAutomaton preMemIndexFilterAutomaton;
+
+ public MemoryIndexOffsetStrategy(String field, BytesRef[] extractedTerms, PhraseHelper phraseHelper,
+ CharacterRunAutomaton[] automata, Analyzer analyzer,
+ Function<Query, Collection<Query>> multiTermQueryRewrite) {
+ super(field, extractedTerms, phraseHelper, automata, analyzer);
+ boolean storePayloads = phraseHelper.hasPositionSensitivity(); // might be needed
+ memoryIndex = new MemoryIndex(true, storePayloads);//true==store offsets
+ leafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader(); // appears to be re-usable
+ // preFilter for MemoryIndex
+ preMemIndexFilterAutomaton = buildCombinedAutomaton(field, terms, this.automata, phraseHelper, multiTermQueryRewrite);
+ }
+
+ /**
+ * Build one {@link CharacterRunAutomaton} matching any term the query might match.
+ */
+ private static CharacterRunAutomaton buildCombinedAutomaton(String field, BytesRef[] terms,
+ CharacterRunAutomaton[] automata,
+ PhraseHelper strictPhrases,
+ Function<Query, Collection<Query>> multiTermQueryRewrite) {
+ List<CharacterRunAutomaton> allAutomata = new ArrayList<>();
+ if (terms.length > 0) {
+ allAutomata.add(new CharacterRunAutomaton(Automata.makeStringUnion(Arrays.asList(terms))));
+ }
+ Collections.addAll(allAutomata, automata);
+ for (SpanQuery spanQuery : strictPhrases.getSpanQueries()) {
+ Collections.addAll(allAutomata,
+ MultiTermHighlighting.extractAutomata(spanQuery, field, true, multiTermQueryRewrite));//true==lookInSpan
+ }
+
+ if (allAutomata.size() == 1) {
+ return allAutomata.get(0);
+ }
+ //TODO it'd be nice if we could get at the underlying Automaton in CharacterRunAutomaton so that we
+ // could union them all. But it's not exposed, and note TermRangeQuery isn't modelled as an Automaton
+ // by MultiTermHighlighting.
+
+ // Return an aggregate CharacterRunAutomaton of others
+ return new CharacterRunAutomaton(Automata.makeEmpty()) {// the makeEmpty() is bogus; won't be used
+ @Override
+ public boolean run(char[] chars, int offset, int length) {
+ for (int i = 0; i < allAutomata.size(); i++) {// don't use foreach to avoid Iterator allocation
+ if (allAutomata.get(i).run(chars, offset, length)) {
+ return true;
+ }
+ }
+ return false;
+ }
+ };
+ }
+
+ @Override
+ public List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException {
+ // note: don't need LimitTokenOffsetFilter since content is already truncated to maxLength
+ TokenStream tokenStream = tokenStream(content);
+
+ // Filter the tokenStream to applicable terms
+ tokenStream = newKeepWordFilter(tokenStream, preMemIndexFilterAutomaton);
+ memoryIndex.reset();
+ memoryIndex.addField(field, tokenStream);//note: calls tokenStream.reset() & close()
+ docId = 0;
+
+ return createOffsetsEnumsFromReader(leafReader, docId);
+ }
+
+
+ private static FilteringTokenFilter newKeepWordFilter(final TokenStream tokenStream,
+ final CharacterRunAutomaton charRunAutomaton) {
+ // it'd be nice to use KeepWordFilter but it demands a CharArraySet. TODO File JIRA? Need a new interface?
+ return new FilteringTokenFilter(tokenStream) {
+ final CharTermAttribute charAtt = addAttribute(CharTermAttribute.class);
+
+ @Override
+ protected boolean accept() throws IOException {
+ return charRunAutomaton.run(charAtt.buffer(), 0, charAtt.length());
+ }
+ };
+ }
+
+}
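A self-contained sketch (assuming StandardAnalyzer; not part of this patch) of the MemoryIndex round-trip this strategy performs per document: analyze the content into a one-document index that stores offsets, then read offset-bearing postings back from its LeafReader:

  import org.apache.lucene.analysis.Analyzer;
  import org.apache.lucene.analysis.standard.StandardAnalyzer;
  import org.apache.lucene.index.LeafReader;
  import org.apache.lucene.index.PostingsEnum;
  import org.apache.lucene.index.TermsEnum;
  import org.apache.lucene.index.memory.MemoryIndex;
  import org.apache.lucene.util.BytesRef;

  public class MemoryIndexDemo {
    public static void main(String[] args) throws Exception {
      Analyzer analyzer = new StandardAnalyzer();
      MemoryIndex memoryIndex = new MemoryIndex(true); // true == store offsets
      memoryIndex.addField("body", "Alpha Bravo Bravado", analyzer);
      LeafReader leafReader = (LeafReader) memoryIndex.createSearcher().getIndexReader();
      TermsEnum termsEnum = leafReader.terms("body").iterator();
      if (termsEnum.seekExact(new BytesRef("bravo"))) {
        PostingsEnum postings = termsEnum.postings(null, PostingsEnum.OFFSETS);
        postings.advance(0);     // the single document is docId 0
        postings.nextPosition(); // offsets are valid after nextPosition()
        System.out.println("[" + postings.startOffset() + "-" + postings.endOffset() + ")"); // [6-11)
      }
    }
  }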
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java
index e85fa3b..fd6a26a 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiTermHighlighting.java
@@ -16,8 +16,6 @@
*/
package org.apache.lucene.search.uhighlight;
-import java.io.Closeable;
-import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
@@ -25,15 +23,7 @@ import java.util.Comparator;
import java.util.List;
import java.util.function.Function;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.index.FilterLeafReader;
-import org.apache.lucene.index.FilteredTermsEnum;
-import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Term;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanQuery;
@@ -48,9 +38,7 @@ import org.apache.lucene.search.spans.SpanNearQuery;
import org.apache.lucene.search.spans.SpanNotQuery;
import org.apache.lucene.search.spans.SpanOrQuery;
import org.apache.lucene.search.spans.SpanPositionCheckQuery;
-import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
-import org.apache.lucene.util.CharsRefBuilder;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.automaton.Automata;
import org.apache.lucene.util.automaton.Automaton;
@@ -210,182 +198,4 @@ class MultiTermHighlighting {
return list.toArray(new CharacterRunAutomaton[list.size()]);
}
- /**
- * Returns a "fake" DocsAndPositionsEnum over the tokenstream, returning offsets where {@code matchers}
- * matches tokens.
- * <p>
- * This is solely used internally by PostingsHighlighter: <b>DO NOT USE THIS METHOD!</b>
- */
- public static PostingsEnum getDocsEnum(final TokenStream ts, final CharacterRunAutomaton[] matchers) throws IOException {
- return new TokenStreamPostingsEnum(ts, matchers);
- }
-
- // TODO: we could use CachingWrapperFilter, (or consume twice) to allow us to have a true freq()
- // but this would have a performance cost for likely little gain in the user experience, it
- // would only serve to make this method less bogus.
- // instead, we always return freq() = Integer.MAX_VALUE and let the highlighter terminate based on offset...
- // TODO: DWS perhaps instead OffsetsEnum could become abstract and this would be an impl?
- private static class TokenStreamPostingsEnum extends PostingsEnum implements Closeable {
- TokenStream stream; // becomes null when closed
- final CharacterRunAutomaton[] matchers;
- final CharTermAttribute charTermAtt;
- final OffsetAttribute offsetAtt;
-
- int currentDoc = -1;
- int currentMatch = -1;
- int currentStartOffset = -1;
-
- int currentEndOffset = -1;
-
- final BytesRef matchDescriptions[];
-
- TokenStreamPostingsEnum(TokenStream ts, CharacterRunAutomaton[] matchers) throws IOException {
- this.stream = ts;
- this.matchers = matchers;
- matchDescriptions = new BytesRef[matchers.length];
- charTermAtt = ts.addAttribute(CharTermAttribute.class);
- offsetAtt = ts.addAttribute(OffsetAttribute.class);
- ts.reset();
- }
-
- @Override
- public int nextPosition() throws IOException {
- if (stream != null) {
- while (stream.incrementToken()) {
- for (int i = 0; i < matchers.length; i++) {
- if (matchers[i].run(charTermAtt.buffer(), 0, charTermAtt.length())) {
- currentStartOffset = offsetAtt.startOffset();
- currentEndOffset = offsetAtt.endOffset();
- currentMatch = i;
- return 0;
- }
- }
- }
- stream.end();
- close();
- }
- // exhausted
- currentStartOffset = currentEndOffset = Integer.MAX_VALUE;
- return Integer.MAX_VALUE;
- }
-
- @Override
- public int freq() throws IOException {
- return Integer.MAX_VALUE; // lie
- }
-
- @Override
- public int startOffset() throws IOException {
- assert currentStartOffset >= 0;
- return currentStartOffset;
- }
-
- @Override
- public int endOffset() throws IOException {
- assert currentEndOffset >= 0;
- return currentEndOffset;
- }
-
- @Override
- public BytesRef getPayload() throws IOException {
- if (matchDescriptions[currentMatch] == null) {
- matchDescriptions[currentMatch] = new BytesRef(matchers[currentMatch].toString());
- }
- return matchDescriptions[currentMatch];
- }
-
- @Override
- public int docID() {
- return currentDoc;
- }
-
- @Override
- public int nextDoc() throws IOException {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public int advance(int target) throws IOException {
- return currentDoc = target;
- }
-
- @Override
- public long cost() {
- return 0;
- }
-
- @Override
- public void close() throws IOException {
- if (stream != null) {
- stream.close();
- stream = null;
- }
- }
- }
-
- /**
- * Return a TokenStream un-inverted from the provided Terms, but filtered based on the automata. The
- * Terms must have exactly one doc count (e.g. term vector or MemoryIndex).
- */
- //TODO: Alternatively, produce a list of OffsetsEnums from the Terms that match the automata.
- public static TokenStream uninvertAndFilterTerms(Terms termsIndex,
- int doc,
- final CharacterRunAutomaton[] automata,
- int offsetLength)
- throws IOException {
- assert automata.length > 0;
- //Note: if automata were plain Automaton (not CharacterRunAutomaton), we might instead use
- // TermsEnum.intersect(compiledAutomaton). But probably won't help due to O(N) TV impl so whatever.
- FilterLeafReader.FilterTerms filteredTermsIndex = new FilterLeafReader.FilterTerms(termsIndex) {
- @Override
- public TermsEnum iterator() throws IOException {
- return new FilteredTermsEnum(super.iterator(), false) {//false == no seek
- CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();//reuse only for UTF8->UTF16 call
-
- @Override
- protected AcceptStatus accept(BytesRef termBytesRef) throws IOException {
- //Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
- tempCharsRefBuilder.grow(termBytesRef.length);
- final int charLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
- for (CharacterRunAutomaton runAutomaton : automata) {
- if (runAutomaton.run(tempCharsRefBuilder.chars(), 0, charLen)) {
- return AcceptStatus.YES;
- }
- }
- return AcceptStatus.NO;
- }
- };
- }
-
- @Override
- public long size() throws IOException {
- return -1; // unknown
- }
-
- @Override
- public long getSumTotalTermFreq() throws IOException {
- return -1; // unknown
- }
-
- @Override
- public long getSumDocFreq() throws IOException {
- return -1; // unknown
- }
- };
- float loadFactor = 1f / 64f;
- return new TokenStreamFromTermVector(filteredTermsIndex, doc, offsetLength, loadFactor);
- }
-
- /**
- * Returns a simple automata that matches the specified term.
- */
- public static CharacterRunAutomaton makeStringMatchAutomata(BytesRef term) {
- String termString = term.utf8ToString();
- return new CharacterRunAutomaton(Automata.makeString(termString)) {
- @Override
- public String toString() {
- return termString;
- }
- };
- }
}
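The term-to-automaton conversion removed above survives in TokenStreamOffsetStrategy (added later in this patch). A minimal sketch of the mechanism, using only the calls visible in the removed code; variable names and the literal term are illustrative:

    // Build a matcher for one query term, as makeStringMatchAutomata did.
    CharacterRunAutomaton matcher =
        new CharacterRunAutomaton(Automata.makeString("foo"));
    // Token text is tested char-by-char, with no String allocation per token.
    char[] tokenChars = "foo".toCharArray();
    boolean hit = matcher.run(tokenChars, 0, tokenChars.length); // true

The overridden toString() in the real code additionally carries the term back out as the match "description" payload.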
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiValueTokenStream.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiValueTokenStream.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiValueTokenStream.java
deleted file mode 100644
index 4cbf754..0000000
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/MultiValueTokenStream.java
+++ /dev/null
@@ -1,148 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.search.uhighlight;
-
-import java.io.IOException;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-
-/**
- * Wraps an {@link Analyzer} and string text that represents multiple values delimited by a specified character. This
- * exposes a TokenStream that matches what would get indexed considering the
- * {@link Analyzer#getPositionIncrementGap(String)}. Currently this assumes {@link Analyzer#getOffsetGap(String)} is
- * 1; an exception will be thrown if it isn't.
- * <br />
- * It would be more orthogonal for this to be an Analyzer since we're wrapping an Analyzer but doing so seems like
- * more work. The underlying components see a Reader not a String -- and the String is easy to
- * split up without redundant buffering.
- *
- * @lucene.internal
- */
-final class MultiValueTokenStream extends TokenFilter {
-
- private final String fieldName;
- private final Analyzer indexAnalyzer;
- private final String content;
- private final char splitChar;
-
- private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
- private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-
- private int startValIdx = 0;
- private int endValIdx;
- private int remainingPosInc = 0;
-
- /** note: The caller must remember to close the TokenStream eventually. */
- static TokenStream wrap(String fieldName, Analyzer indexAnalyzer, String content, char splitChar)
- throws IOException {
- if (indexAnalyzer.getOffsetGap(fieldName) != 1) { // note: 1 is the default. It is RARELY changed.
- throw new IllegalArgumentException(
- "offset gap of the provided analyzer should be 1 (field " + fieldName + ")");
- }
- // If there is no splitChar in content then we needn't wrap:
- int splitCharIdx = content.indexOf(splitChar);
- if (splitCharIdx == -1) {
- return indexAnalyzer.tokenStream(fieldName, content);
- }
-
- TokenStream subTokenStream = indexAnalyzer.tokenStream(fieldName, content.substring(0, splitCharIdx));
-
- return new MultiValueTokenStream(subTokenStream, fieldName, indexAnalyzer, content, splitChar, splitCharIdx);
- }
-
- private MultiValueTokenStream(TokenStream subTokenStream, String fieldName, Analyzer indexAnalyzer,
- String content, char splitChar, int splitCharIdx) {
- super(subTokenStream); // subTokenStream is already initialized to operate on the first value
- this.fieldName = fieldName;
- this.indexAnalyzer = indexAnalyzer;
- this.content = content;
- this.splitChar = splitChar;
- this.endValIdx = splitCharIdx;
- }
-
- @Override
- public void reset() throws IOException {
- if (startValIdx != 0) {
- throw new IllegalStateException("This TokenStream wasn't developed to be re-used.");
- // ... although we could if a need for it arises.
- }
- super.reset();
- }
-
- @Override
- public boolean incrementToken() throws IOException {
- while (true) {
-
- if (input.incrementToken()) {
- // Position tracking:
- if (remainingPosInc > 0) {//usually true first token of additional values (not first val)
- posIncAtt.setPositionIncrement(remainingPosInc + posIncAtt.getPositionIncrement());
- remainingPosInc = 0;//reset
- }
- // Offset tracking:
- offsetAtt.setOffset(
- startValIdx + offsetAtt.startOffset(),
- startValIdx + offsetAtt.endOffset()
- );
- return true;
- }
-
- if (endValIdx == content.length()) {//no more
- return false;
- }
-
- input.end(); // might adjust position increment
- remainingPosInc += posIncAtt.getPositionIncrement();
- input.close();
- remainingPosInc += indexAnalyzer.getPositionIncrementGap(fieldName);
-
- // Get new tokenStream based on next segment divided by the splitChar
- startValIdx = endValIdx + 1;
- endValIdx = content.indexOf(splitChar, startValIdx);
- if (endValIdx == -1) {//EOF
- endValIdx = content.length();
- }
- TokenStream tokenStream = indexAnalyzer.tokenStream(fieldName, content.substring(startValIdx, endValIdx));
- if (tokenStream != input) {// (input is defined in TokenFilter set in the constructor)
- // This is a grand trick we do -- knowing that the analyzer's re-use strategy is going to produce the
- // very same tokenStream instance and thus have the same AttributeSource as this wrapping TokenStream
- // since we used it as our input in the constructor.
- // Were this not the case, we'd have to copy every attribute of interest since we can't alter the
- // AttributeSource of this wrapping TokenStream post-construction (it's all private/final).
- // If this is a problem, we could do that instead; maybe with a custom CharTermAttribute that allows
- // us to easily set the char[] reference without literally copying char by char.
- throw new IllegalStateException("Require TokenStream re-use. Unsupported re-use strategy?: " +
- indexAnalyzer.getReuseStrategy());
- }
- tokenStream.reset();
- } // while loop to increment token of this new value
- }
-
- @Override
- public void end() throws IOException {
- super.end();
- // Offset tracking:
- offsetAtt.setOffset(
- startValIdx + offsetAtt.startOffset(),
- startValIdx + offsetAtt.endOffset());
- }
-
-}
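The deleted stream re-analyzed each delimiter-separated value and shifted the sub-stream's offsets by the value's start index within the joined string. A worked sketch of that arithmetic (separator value and content are hypothetical; the UH joins values with UnifiedHighlighter.MULTIVAL_SEP_CHAR):

    char sep = '\u2029'; // assumed separator value; use MULTIVAL_SEP_CHAR in practice
    String content = "red fish" + sep + "blue fish";
    int startValIdx = content.indexOf(sep) + 1; // second value starts at index 9
    // A token at local offsets [0,4) within "blue fish" maps to the joined content:
    int globalStart = startValIdx;      // 9
    int globalEnd   = startValIdx + 4;  // 13, i.e. "blue"

Position increments accrue similarly: the analyzer's getPositionIncrementGap(fieldName) is added onto the first token of each subsequent value.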
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java
index af29ef1..cbaeb90 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/OffsetsEnum.java
@@ -76,6 +76,7 @@ public class OffsetsEnum implements Comparable<OffsetsEnum>, Closeable {
}
void nextPosition() throws IOException {
+ assert hasMorePositions();
pos++;
postingsEnum.nextPosition();
}
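The added assert makes the iteration contract explicit. A sketch of the consuming loop that contract implies (an OffsetsEnum `oe` is assumed in scope; accessor names are from this class's API):

    while (oe.hasMorePositions()) {
      oe.nextPosition();
      // consume oe.startOffset() / oe.endOffset() into the current passage
    }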
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java
index f4caaa0..de37d5d 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/Passage.java
@@ -40,7 +40,7 @@ public final class Passage {
BytesRef matchTerms[] = new BytesRef[8];
int numMatches = 0;
- void addMatch(int startOffset, int endOffset, BytesRef term) {
+ public void addMatch(int startOffset, int endOffset, BytesRef term) {
assert startOffset >= this.startOffset && startOffset <= this.endOffset;
if (numMatches == matchStarts.length) {
int newLength = ArrayUtil.oversize(numMatches+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java
index 95d51c9..cde17ba 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PhraseHelper.java
@@ -266,7 +266,7 @@ public class PhraseHelper {
}
/**
- * Returns terms as a List, but expanded to any terms in strictPhrases' keySet if present. That can only
+ * Returns terms as a List, but expanded to any terms in phraseHelper's keySet if present. That can only

* happen if willRewrite() is true.
*/
List<BytesRef> expandTermsIfRewrite(BytesRef[] terms, Map<BytesRef, Spans> strictPhrasesTermToSpans) {
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java
index 4666906..975d3a1 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsOffsetStrategy.java
@@ -41,7 +41,7 @@ public class PostingsOffsetStrategy extends FieldOffsetStrategy {
@Override
public List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException {
- LeafReader leafReader;
+ final LeafReader leafReader;
if (reader instanceof LeafReader) {
leafReader = (LeafReader) reader;
} else {
@@ -54,6 +54,7 @@ public class PostingsOffsetStrategy extends FieldOffsetStrategy {
return createOffsetsEnumsFromReader(leafReader, docId);
}
+
@Override
public UnifiedHighlighter.OffsetSource getOffsetSource() {
return UnifiedHighlighter.OffsetSource.POSTINGS;
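For context, a sketch of the lines this hunk elides, as far as they can be inferred: the reader is narrowed to the leaf containing docId, and docId is re-based to that leaf (ReaderUtil is org.apache.lucene.index.ReaderUtil):

    final LeafReader leafReader;
    if (reader instanceof LeafReader) {
      leafReader = (LeafReader) reader;
    } else {
      List<LeafReaderContext> leaves = reader.leaves();
      LeafReaderContext ctx = leaves.get(ReaderUtil.subIndex(docId, leaves));
      leafReader = ctx.reader();
      docId -= ctx.docBase; // leaf-relative doc id
    }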
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java
index 81de379..b9086a7 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/PostingsWithTermVectorsOffsetStrategy.java
@@ -20,7 +20,6 @@ import java.io.IOException;
import java.util.Collections;
import java.util.List;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
@@ -58,14 +57,11 @@ public class PostingsWithTermVectorsOffsetStrategy extends FieldOffsetStrategy {
}
leafReader = new TermVectorFilteredLeafReader(leafReader, docTerms);
- TokenStream tokenStream = automata.length > 0 ? MultiTermHighlighting
- .uninvertAndFilterTerms(leafReader.terms(field), docId, this.automata, content.length()) : null;
-
- return createOffsetsEnums(leafReader, docId, tokenStream);
+ return createOffsetsEnumsFromReader(leafReader, docId);
}
@Override
public UnifiedHighlighter.OffsetSource getOffsetSource() {
- return UnifiedHighlighter.OffsetSource.POSTINGS;
+ return UnifiedHighlighter.OffsetSource.POSTINGS_WITH_TERM_VECTORS;
}
}
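This strategy now reports its own offset source rather than POSTINGS. For orientation, an illustrative (not the UH's literal) decision ladder for picking a source; the boolean flags here are hypothetical stand-ins for what FieldInfo exposes:

    OffsetSource source;
    if (offsetsInPostings && hasTermVectors) {
      source = OffsetSource.POSTINGS_WITH_TERM_VECTORS; // vector narrows work to one doc
    } else if (offsetsInPostings) {
      source = OffsetSource.POSTINGS;
    } else if (hasTermVectors) {
      source = OffsetSource.TERM_VECTORS;
    } else {
      source = OffsetSource.ANALYSIS; // re-analyze the content string
    }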
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java
index 204679b..f6eedc4 100644
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TermVectorOffsetStrategy.java
@@ -20,7 +20,6 @@ import java.io.IOException;
import java.util.Collections;
import java.util.List;
-import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
@@ -51,18 +50,10 @@ public class TermVectorOffsetStrategy extends FieldOffsetStrategy {
return Collections.emptyList();
}
- LeafReader leafReader = null;
- if ((terms.length > 0) || strictPhrases.willRewrite()) {
- leafReader = new TermVectorLeafReader(field, tvTerms);
- docId = 0;
- }
-
- TokenStream tokenStream = null;
- if (automata.length > 0) {
- tokenStream = MultiTermHighlighting.uninvertAndFilterTerms(tvTerms, 0, automata, content.length());
- }
+ LeafReader leafReader = new TermVectorLeafReader(field, tvTerms);
+ docId = 0;
- return createOffsetsEnums(leafReader, docId, tokenStream);
+ return createOffsetsEnumsFromReader(leafReader, docId);
}
}
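With the un-inversion path gone, MTQ automata are handled inside createOffsetsEnumsFromReader like everything else. A sketch of this method's preceding (elided) lines, as far as they can be inferred from the context shown:

    Terms tvTerms = reader.getTermVector(docId, field);
    if (tvTerms == null) {
      return Collections.emptyList(); // no term vector for this doc/field
    }
    // The vector covers exactly one document, so expose it as a one-doc leaf
    // (TermVectorLeafReader) and re-base docId to 0, as the hunk above does.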
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamFromTermVector.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamFromTermVector.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamFromTermVector.java
deleted file mode 100644
index 980c566..0000000
--- a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamFromTermVector.java
+++ /dev/null
@@ -1,395 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.search.uhighlight;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.index.PostingsEnum;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.BytesRefArray;
-import org.apache.lucene.util.BytesRefBuilder;
-import org.apache.lucene.util.CharsRefBuilder;
-import org.apache.lucene.util.Counter;
-import org.apache.lucene.util.UnicodeUtil;
-
-/**
- * TokenStream created from a term vector field. The term vector requires positions and/or offsets (either). If you
- * want payloads add PayloadAttributeImpl (as you would normally) but don't assume the attribute is already added just
- * because you know the term vector has payloads, since the first call to incrementToken() will observe if you asked
- * for them and if not then won't get them. This TokenStream supports an efficient {@link #reset()}, so there's
- * no need to wrap with a caching impl.
- *
- * @lucene.internal
- */
-final class TokenStreamFromTermVector extends TokenStream {
- // note: differs from similar class in the standard highlighter. This one is optimized for sparse cases.
-
- /**
- * content length divided by distinct positions; an average of dense text.
- */
- private static final double AVG_CHARS_PER_POSITION = 6;
-
- private static final int INSERTION_SORT_THRESHOLD = 16;
-
- private final Terms vector;
-
- private final int filteredDocId;
-
- private final CharTermAttribute termAttribute;
-
- private final PositionIncrementAttribute positionIncrementAttribute;
-
- private final int offsetLength;
-
- private final float loadFactor;
-
- private OffsetAttribute offsetAttribute;//maybe null
-
- private PayloadAttribute payloadAttribute;//maybe null
-
- private CharsRefBuilder termCharsBuilder;//term data here
-
- private BytesRefArray payloadsBytesRefArray;//only used when payloadAttribute is non-null
- private BytesRefBuilder spareBytesRefBuilder;//only used when payloadAttribute is non-null
-
- private TokenLL firstToken = null; // the head of a linked-list
-
- private TokenLL incrementToken = null;
-
- private boolean initialized = false;//lazy
-
- public TokenStreamFromTermVector(Terms vector, int offsetLength) throws IOException {
- this(vector, 0, offsetLength, 1f);
- }
-
- /**
- * Constructor.
- *
- * @param vector Terms that contains the data for
- * creating the TokenStream. Must have positions and/or offsets.
- * @param filteredDocId The docID we will process.
- * @param offsetLength Supply the character length of the text being uninverted, or a lower value if you don't want
- * to invert text beyond an offset (in so doing this will act as a filter). If you don't
- * know the length, pass -1. In conjunction with {@code loadFactor}, it's used to
- * determine how many buckets to create during uninversion.
- * It's also used to filter out tokens with a start offset exceeding this value.
- * @param loadFactor The percent of tokens from the original terms (by position count) that are
- * expected to be inverted. If they are filtered (e.g.
- * {@link org.apache.lucene.index.FilterLeafReader.FilterTerms})
- * then consider using less than 1.0 to avoid wasting space.
- * 1.0 means all, 1/64th would suggest 1/64th of all tokens coming from vector.
- */
- TokenStreamFromTermVector(Terms vector, int filteredDocId, int offsetLength, float loadFactor) throws IOException {
- super();
- this.filteredDocId = filteredDocId;
- this.offsetLength = offsetLength == Integer.MAX_VALUE ? -1 : offsetLength;
- if (loadFactor <= 0f || loadFactor > 1f) {
- throw new IllegalArgumentException("loadFactor should be > 0 and <= 1");
- }
- this.loadFactor = loadFactor;
- assert !hasAttribute(PayloadAttribute.class) : "AttributeFactory shouldn't have payloads *yet*";
- if (!vector.hasPositions() && !vector.hasOffsets()) {
- throw new IllegalArgumentException("The term vector needs positions and/or offsets.");
- }
- assert vector.hasFreqs();
- this.vector = vector;
- termAttribute = addAttribute(CharTermAttribute.class);
- positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
- }
-
- public Terms getTermVectorTerms() {
- return vector;
- }
-
- @Override
- public void reset() throws IOException {
- incrementToken = null;
- super.reset();
- }
-
- //We delay initialization because we can see which attributes the consumer wants, particularly payloads
- private void init() throws IOException {
- assert !initialized;
- int dpEnumFlags = 0;
- if (vector.hasOffsets()) {
- offsetAttribute = addAttribute(OffsetAttribute.class);
- dpEnumFlags |= PostingsEnum.OFFSETS;
- }
- if (vector.hasPayloads() && hasAttribute(PayloadAttribute.class)) {
- payloadAttribute = getAttribute(PayloadAttribute.class);
- payloadsBytesRefArray = new BytesRefArray(Counter.newCounter());
- spareBytesRefBuilder = new BytesRefBuilder();
- dpEnumFlags |= PostingsEnum.PAYLOADS;
- }
-
- // We put term data here
- termCharsBuilder = new CharsRefBuilder();
- termCharsBuilder.grow(initTotalTermCharLen());
-
- // Step 1: iterate termsEnum and create a token, placing into a bucketed array (given a load factor)
-
- final TokenLL[] tokenBuckets = initTokenBucketsArray();
- final double OFFSET_TO_BUCKET_IDX = loadFactor / AVG_CHARS_PER_POSITION;
- final double POSITION_TO_BUCKET_IDX = loadFactor;
-
- final TermsEnum termsEnum = vector.iterator();
- BytesRef termBytesRef;
- PostingsEnum dpEnum = null;
- final CharsRefBuilder tempCharsRefBuilder = new CharsRefBuilder();//only for UTF8->UTF16 call
-
- TERM_LOOP:
- while ((termBytesRef = termsEnum.next()) != null) {
- //Grab the term (in same way as BytesRef.utf8ToString() but we don't want a String obj)
- // note: if term vectors supported seek by ord then we might just keep an int and seek by ord on-demand
- tempCharsRefBuilder.grow(termBytesRef.length);
- final int termCharsLen = UnicodeUtil.UTF8toUTF16(termBytesRef, tempCharsRefBuilder.chars());
- final int termCharsOff = termCharsBuilder.length();
- termCharsBuilder.append(tempCharsRefBuilder.chars(), 0, termCharsLen);
- dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);
- assert dpEnum != null; // presumably checked by TokenSources.hasPositions earlier
- int currentDocId = dpEnum.advance(filteredDocId);
- if (currentDocId != filteredDocId) {
- continue; //Not expected
- }
- final int freq = dpEnum.freq();
- for (int j = 0; j < freq; j++) {
- TokenLL token = new TokenLL();
- token.position = dpEnum.nextPosition(); // can be -1 if not in the TV
- token.termCharsOff = termCharsOff;
- token.termCharsLen = (short) Math.min(termCharsLen, Short.MAX_VALUE);
- // copy offset (if it's there) and compute bucketIdx
- int bucketIdx;
- if (offsetAttribute != null) {
- token.startOffset = dpEnum.startOffset();
- if (offsetLength >= 0 && token.startOffset > offsetLength) {
- continue TERM_LOOP;//filter this token out; exceeds threshold
- }
- token.endOffsetInc = (short) Math.min(dpEnum.endOffset() - token.startOffset, Short.MAX_VALUE);
- bucketIdx = (int) (token.startOffset * OFFSET_TO_BUCKET_IDX);
- } else {
- bucketIdx = (int) (token.position * POSITION_TO_BUCKET_IDX);
- }
- if (bucketIdx >= tokenBuckets.length) {
- bucketIdx = tokenBuckets.length - 1;
- }
-
- if (payloadAttribute != null) {
- final BytesRef payload = dpEnum.getPayload();
- token.payloadIndex = payload == null ? -1 : payloadsBytesRefArray.append(payload);
- }
-
- //Add token to the head of the bucket linked list
- token.next = tokenBuckets[bucketIdx];
- tokenBuckets[bucketIdx] = token;
- }
- }
-
- // Step 2: Link all Tokens into a linked-list and sort all tokens at the same position
-
- firstToken = initLinkAndSortTokens(tokenBuckets);
-
- // If the term vector didn't have positions, synthesize them
- if (!vector.hasPositions() && firstToken != null) {
- TokenLL prevToken = firstToken;
- prevToken.position = 0;
- for (TokenLL token = prevToken.next; token != null; prevToken = token, token = token.next) {
- if (prevToken.startOffset == token.startOffset) {
- token.position = prevToken.position;
- } else {
- token.position = prevToken.position + 1;
- }
- }
- }
-
- initialized = true;
- }
-
- private static TokenLL initLinkAndSortTokens(TokenLL[] tokenBuckets) {
- TokenLL firstToken = null;
- List<TokenLL> scratchTokenArray = new ArrayList<>(); // declare here for re-use. TODO use native array
- TokenLL prevToken = null;
- for (TokenLL tokenHead : tokenBuckets) {
- if (tokenHead == null) {
- continue;
- }
- //sort tokens at this position and link them; return the first
- TokenLL tokenTail;
- // just one token
- if (tokenHead.next == null) {
- tokenTail = tokenHead;
- } else {
- // add the linked list to a temporary array
- for (TokenLL cur = tokenHead; cur != null; cur = cur.next) {
- scratchTokenArray.add(cur);
- }
- // sort; and set tokenHead & tokenTail
- if (scratchTokenArray.size() < INSERTION_SORT_THRESHOLD) {
- // insertion sort by creating a linked list (leave scratchTokenArray alone)
- tokenHead = tokenTail = scratchTokenArray.get(0);
- tokenHead.next = null;
- for (int i = 1; i < scratchTokenArray.size(); i++) {
- TokenLL insertToken = scratchTokenArray.get(i);
- if (insertToken.compareTo(tokenHead) <= 0) {
- // takes the place of tokenHead
- insertToken.next = tokenHead;
- tokenHead = insertToken;
- } else {
- // goes somewhere after tokenHead
- for (TokenLL prev = tokenHead; true; prev = prev.next) {
- if (prev.next == null || insertToken.compareTo(prev.next) <= 0) {
- if (prev.next == null) {
- tokenTail = insertToken;
- }
- insertToken.next = prev.next;
- prev.next = insertToken;
- break;
- }
- }
- }
- }
- } else {
- Collections.sort(scratchTokenArray);
- // take back out and create a linked list
- TokenLL prev = tokenHead = scratchTokenArray.get(0);
- for (int i = 1; i < scratchTokenArray.size(); i++) {
- prev.next = scratchTokenArray.get(i);
- prev = prev.next;
- }
- tokenTail = prev;
- tokenTail.next = null;
- }
- scratchTokenArray.clear();//too bad ArrayList nulls it out; we don't actually need that
- }
-
- //link to previous
- if (prevToken != null) {
- assert prevToken.next == null;
- prevToken.next = tokenHead; //concatenate linked-list
- assert prevToken.compareTo(tokenHead) < 0 : "wrong offset / position ordering expectations";
- } else {
- assert firstToken == null;
- firstToken = tokenHead;
- }
-
- prevToken = tokenTail;
- }
- return firstToken;
- }
-
- private int initTotalTermCharLen() throws IOException {
- int guessNumTerms;
- if (vector.size() != -1) {
- guessNumTerms = (int) vector.size();
- } else if (offsetLength != -1) {
- guessNumTerms = (int) (offsetLength * 0.33);//guess 1/3rd
- } else {
- return 128;
- }
- return Math.max(64, (int) (guessNumTerms * loadFactor * 7.0));//7 is over-estimate of average term len
- }
-
- private TokenLL[] initTokenBucketsArray() throws IOException {
- // Estimate the number of non-empty positions (number of tokens, excluding same-position synonyms).
- int positionsEstimate;
- if (offsetLength == -1) { // no clue what the char length is.
- // Estimate the number of position slots we need from term stats based on Wikipedia.
- int sumTotalTermFreq = (int) vector.getSumTotalTermFreq();
- if (sumTotalTermFreq == -1) {//unfortunately term vectors seem to not have this stat
- int size = (int) vector.size();
- if (size == -1) {//doesn't happen with term vectors, it seems, but pick a default any way
- size = 128;
- }
- sumTotalTermFreq = (int) (size * 2.4);
- }
- positionsEstimate = (int) (sumTotalTermFreq * 1.5);//less than 1 in 10 docs exceed this
- } else {
- // guess number of token positions by this factor.
- positionsEstimate = (int) (offsetLength / AVG_CHARS_PER_POSITION);
- }
- // apply the load factor.
- return new TokenLL[Math.max(1, (int) (positionsEstimate * loadFactor))];
- }
-
- @Override
- public boolean incrementToken() throws IOException {
- int posInc;
- if (incrementToken == null) {
- if (!initialized) {
- init();
- assert initialized;
- }
- incrementToken = firstToken;
- if (incrementToken == null) {
- return false;
- }
- posInc = incrementToken.position + 1;//first token normally has pos 0; add 1 to get posInc
- } else if (incrementToken.next != null) {
- int lastPosition = incrementToken.position;
- incrementToken = incrementToken.next;
- posInc = incrementToken.position - lastPosition;
- } else {
- return false;
- }
- clearAttributes();
- termAttribute.copyBuffer(termCharsBuilder.chars(), incrementToken.termCharsOff, incrementToken.termCharsLen);
-
- positionIncrementAttribute.setPositionIncrement(posInc);
- if (offsetAttribute != null) {
- offsetAttribute.setOffset(incrementToken.startOffset, incrementToken.startOffset + incrementToken.endOffsetInc);
- }
- if (payloadAttribute != null && incrementToken.payloadIndex >= 0) {
- payloadAttribute.setPayload(payloadsBytesRefArray.get(spareBytesRefBuilder, incrementToken.payloadIndex));
- }
- return true;
- }
-
- private static class TokenLL implements Comparable<TokenLL> {
- // This class should weigh 32 bytes, including object header
-
- int termCharsOff; // see termCharsBuilder
- short termCharsLen;
-
- int position;
- int startOffset;
- short endOffsetInc; // add to startOffset to get endOffset
- int payloadIndex;
-
- TokenLL next;
-
- @Override
- public int compareTo(TokenLL tokenB) {
- int cmp = Integer.compare(this.position, tokenB.position);
- if (cmp == 0) {
- cmp = Integer.compare(this.startOffset, tokenB.startOffset);
- if (cmp == 0) {
- cmp = Short.compare(this.endOffsetInc, tokenB.endOffsetInc);
- }
- }
- return cmp;
- }
- }
-}
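The deleted uninversion code bucketed tokens by start offset (or position) so that only same-bucket tokens need sorting. A worked instance of its bucket arithmetic, using the constants visible above and the 1/64 load factor its one remaining caller passed:

    float loadFactor = 1f / 64f;      // as uninvertAndFilterTerms supplied
    double avgCharsPerPosition = 6;   // AVG_CHARS_PER_POSITION above
    double offsetToBucketIdx = loadFactor / avgCharsPerPosition;
    int bucketIdx = (int) (3840 * offsetToBucketIdx); // offset 3840 -> bucket 10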
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/7af454ad/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java
new file mode 100644
index 0000000..966eeef
--- /dev/null
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/TokenStreamOffsetStrategy.java
@@ -0,0 +1,173 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.uhighlight;
+
+import java.io.Closeable;
+import java.io.IOException;
+import java.util.Collections;
+import java.util.List;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.PostingsEnum;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.automaton.Automata;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+
+/**
+ * Analyzes the text, producing a single {@link OffsetsEnum} wrapping the {@link TokenStream} filtered to terms
+ * in the query, including wildcards. It can't handle position-sensitive queries (phrases). Passage accuracy suffers
+ * because the freq() is unknown -- it's always {@link Integer#MAX_VALUE} instead.
+ */
+public class TokenStreamOffsetStrategy extends AnalysisOffsetStrategy {
+
+ private static final BytesRef[] ZERO_LEN_BYTES_REF_ARRAY = new BytesRef[0];
+
+ public TokenStreamOffsetStrategy(String field, BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata, Analyzer indexAnalyzer) {
+ super(field, ZERO_LEN_BYTES_REF_ARRAY, phraseHelper, convertTermsToAutomata(terms, automata), indexAnalyzer);
+ assert phraseHelper.hasPositionSensitivity() == false;
+ }
+
+ private static CharacterRunAutomaton[] convertTermsToAutomata(BytesRef[] terms, CharacterRunAutomaton[] automata) {
+ CharacterRunAutomaton[] newAutomata = new CharacterRunAutomaton[terms.length + automata.length];
+ for (int i = 0; i < terms.length; i++) {
+ String termString = terms[i].utf8ToString();
+ newAutomata[i] = new CharacterRunAutomaton(Automata.makeString(termString)) {
+ @Override
+ public String toString() {
+ return termString;
+ }
+ };
+ }
+ // Append the existing automata (those used for MTQs)
+ System.arraycopy(automata, 0, newAutomata, terms.length, automata.length);
+ return newAutomata;
+ }
+
+ @Override
+ public List<OffsetsEnum> getOffsetsEnums(IndexReader reader, int docId, String content) throws IOException {
+ TokenStream tokenStream = tokenStream(content);
+ PostingsEnum mtqPostingsEnum = new TokenStreamPostingsEnum(tokenStream, automata);
+ mtqPostingsEnum.advance(docId);
+ return Collections.singletonList(new OffsetsEnum(null, mtqPostingsEnum));
+ }
+
+ // TODO: we could use CachingWrapperFilter, (or consume twice) to allow us to have a true freq()
+ // but this would have a performance cost for likely little gain in the user experience, it
+ // would only serve to make this method less bogus.
+ // instead, we always return freq() = Integer.MAX_VALUE and let the highlighter terminate based on offset...
+ // TODO: DWS perhaps instead OffsetsEnum could become abstract and this would be an impl?
+ private static class TokenStreamPostingsEnum extends PostingsEnum implements Closeable {
+ TokenStream stream; // becomes null when closed
+ final CharacterRunAutomaton[] matchers;
+ final CharTermAttribute charTermAtt;
+ final OffsetAttribute offsetAtt;
+
+ int currentDoc = -1;
+ int currentMatch = -1;
+ int currentStartOffset = -1;
+
+ int currentEndOffset = -1;
+
+ final BytesRef matchDescriptions[];
+
+ TokenStreamPostingsEnum(TokenStream ts, CharacterRunAutomaton[] matchers) throws IOException {
+ this.stream = ts;
+ this.matchers = matchers;
+ matchDescriptions = new BytesRef[matchers.length];
+ charTermAtt = ts.addAttribute(CharTermAttribute.class);
+ offsetAtt = ts.addAttribute(OffsetAttribute.class);
+ ts.reset();
+ }
+
+ @Override
+ public int nextPosition() throws IOException {
+ if (stream != null) {
+ while (stream.incrementToken()) {
+ for (int i = 0; i < matchers.length; i++) {
+ if (matchers[i].run(charTermAtt.buffer(), 0, charTermAtt.length())) {
+ currentStartOffset = offsetAtt.startOffset();
+ currentEndOffset = offsetAtt.endOffset();
+ currentMatch = i;
+ return 0;
+ }
+ }
+ }
+ stream.end();
+ close();
+ }
+ // exhausted
+ currentStartOffset = currentEndOffset = Integer.MAX_VALUE;
+ return Integer.MAX_VALUE;
+ }
+
+ @Override
+ public int freq() throws IOException {
+ return Integer.MAX_VALUE; // lie
+ }
+
+ @Override
+ public int startOffset() throws IOException {
+ assert currentStartOffset >= 0;
+ return currentStartOffset;
+ }
+
+ @Override
+ public int endOffset() throws IOException {
+ assert currentEndOffset >= 0;
+ return currentEndOffset;
+ }
+
+ @Override
+ public BytesRef getPayload() throws IOException {
+ if (matchDescriptions[currentMatch] == null) {
+ matchDescriptions[currentMatch] = new BytesRef(matchers[currentMatch].toString());
+ }
+ return matchDescriptions[currentMatch];
+ }
+
+ @Override
+ public int docID() {
+ return currentDoc;
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ return currentDoc = target;
+ }
+
+ @Override
+ public long cost() {
+ return 0;
+ }
+
+ @Override
+ public void close() throws IOException {
+ if (stream != null) {
+ stream.close();
+ stream = null;
+ }
+ }
+ }
+}
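End-to-end usage sketch (index contents, the "body" field, and the analyzer are hypothetical): when the field indexes neither offsets nor term vectors, the UH falls back to analysis, and wildcard (MTQ) terms are located by the automata in this new strategy:

    UnifiedHighlighter highlighter = new UnifiedHighlighter(searcher, analyzer);
    Query query = new WildcardQuery(new Term("body", "high*"));
    TopDocs topDocs = searcher.search(query, 10);
    // One snippet per hit; matches carry the automaton's toString() as description.
    String[] snippets = highlighter.highlight("body", query, topDocs);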