You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by cp...@apache.org on 2017/05/25 17:55:21 UTC
[12/44] lucene-solr:jira/solr-8668: LUCENE-7815: Removed the
PostingsHighlighter
LUCENE-7815: Removed the PostingsHighlighter
Project: http://git-wip-us.apache.org/repos/asf/lucene-solr/repo
Commit: http://git-wip-us.apache.org/repos/asf/lucene-solr/commit/0d3c73ea
Tree: http://git-wip-us.apache.org/repos/asf/lucene-solr/tree/0d3c73ea
Diff: http://git-wip-us.apache.org/repos/asf/lucene-solr/diff/0d3c73ea
Branch: refs/heads/jira/solr-8668
Commit: 0d3c73eaa2dd26af73461fd6ec3494bc12edbe8a
Parents: 14320a5
Author: David Smiley <ds...@apache.org>
Authored: Tue May 23 14:39:51 2017 -0400
Committer: David Smiley <ds...@apache.org>
Committed: Tue May 23 14:39:51 2017 -0400
----------------------------------------------------------------------
lucene/CHANGES.txt | 4 +
lucene/benchmark/conf/highlighters-postings.alg | 4 +-
.../tasks/SearchTravRetHighlightTask.java | 30 -
.../DefaultPassageFormatter.java | 137 --
.../MultiTermHighlighting.java | 282 -----
.../search/postingshighlight/Passage.java | 159 ---
.../postingshighlight/PassageFormatter.java | 40 -
.../search/postingshighlight/PassageScorer.java | 104 --
.../postingshighlight/PostingsHighlighter.java | 820 ------------
.../search/postingshighlight/package-info.java | 21 -
.../CustomSeparatorBreakIterator.java | 150 +++
.../search/uhighlight/WholeBreakIterator.java | 116 ++
.../search/postingshighlight/CambridgeMA.utf8 | 1 -
.../TestMultiTermHighlighting.java | 884 -------------
.../TestPostingsHighlighter.java | 1185 ------------------
.../TestPostingsHighlighterRanking.java | 324 -----
.../uhighlight/LengthGoalBreakIteratorTest.java | 1 -
.../TestCustomSeparatorBreakIterator.java | 114 ++
.../uhighlight/TestUnifiedHighlighter.java | 1 -
.../uhighlight/TestWholeBreakIterator.java | 134 ++
.../solr/highlight/UnifiedSolrHighlighter.java | 4 +-
21 files changed, 522 insertions(+), 3993 deletions(-)
----------------------------------------------------------------------
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0d3c73ea/lucene/CHANGES.txt
----------------------------------------------------------------------
diff --git a/lucene/CHANGES.txt b/lucene/CHANGES.txt
index 3951cea..dadba8b 100644
--- a/lucene/CHANGES.txt
+++ b/lucene/CHANGES.txt
@@ -53,6 +53,10 @@ API Changes
* LUCENE-7741: DoubleValuesSource now has an explain() method (Alan Woodward,
Adrien Grand)
+* LUCENE-7815: Removed the PostingsHighlighter; you should use the UnifiedHighlighter
+ instead, which was derived from the PH. WholeBreakIterator and
+ CustomSeparatorBreakIterator were moved to UH's package. (David Smiley)
+
Bug Fixes
* LUCENE-7626: IndexWriter will no longer accept broken token offsets
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0d3c73ea/lucene/benchmark/conf/highlighters-postings.alg
----------------------------------------------------------------------
diff --git a/lucene/benchmark/conf/highlighters-postings.alg b/lucene/benchmark/conf/highlighters-postings.alg
index 610908f..2560dad 100644
--- a/lucene/benchmark/conf/highlighters-postings.alg
+++ b/lucene/benchmark/conf/highlighters-postings.alg
@@ -38,7 +38,7 @@ file.query.maker.file=conf/query-terms.txt
log.queries=false
log.step.SearchTravRetHighlight=-1
-highlighter=HlImpl:NONE:SH_A:UH_A:PH_P:UH_P:UH_PV
+highlighter=HlImpl:NONE:SH_A:UH_A:UH_P:UH_PV
{ "Populate"
CreateIndex
@@ -60,6 +60,6 @@ highlighter=HlImpl:NONE:SH_A:UH_A:PH_P:UH_P:UH_PV
CloseReader
NewRound
-} : 6
+} : 5
RepSumByPrefRound HL
\ No newline at end of file
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0d3c73ea/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java
----------------------------------------------------------------------
diff --git a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java
index f36854d..d90d3a7 100644
--- a/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java
+++ b/lucene/benchmark/src/java/org/apache/lucene/benchmark/byTask/tasks/SearchTravRetHighlightTask.java
@@ -42,7 +42,6 @@ import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.TokenSources;
-import org.apache.lucene.search.postingshighlight.PostingsHighlighter;
import org.apache.lucene.search.uhighlight.UnifiedHighlighter;
import org.apache.lucene.search.vectorhighlight.BoundaryScanner;
import org.apache.lucene.search.vectorhighlight.BreakIteratorBoundaryScanner;
@@ -133,8 +132,6 @@ public class SearchTravRetHighlightTask extends SearchTravTask {
case "UH_P": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.POSTINGS); break;
case "UH_PV": hlImpl = new UnifiedHLImpl(UnifiedHighlighter.OffsetSource.POSTINGS_WITH_TERM_VECTORS); break;
- case "PH_P": hlImpl = new PostingsHLImpl(); break;
-
default: throw new Exception("unrecognized highlighter type: " + type + " (try 'UH')");
}
}
@@ -224,33 +221,6 @@ public class SearchTravRetHighlightTask extends SearchTravTask {
return clone;
}
- private class PostingsHLImpl implements HLImpl {
- PostingsHighlighter highlighter;
- String[] fields = hlFields.toArray(new String[hlFields.size()]);
- int[] maxPassages;
- PostingsHLImpl() {
- highlighter = new PostingsHighlighter(maxDocCharsToAnalyze) {
- @Override
- protected Analyzer getIndexAnalyzer(String field) { // thus support wildcards
- return analyzer;
- }
-
- @Override
- protected BreakIterator getBreakIterator(String field) {
- return BreakIterator.getSentenceInstance(Locale.ENGLISH);
- }
- };
- maxPassages = new int[hlFields.size()];
- Arrays.fill(maxPassages, maxFrags);
- }
-
- @Override
- public void withTopDocs(IndexSearcher searcher, Query q, TopDocs hits) throws Exception {
- Map<String, String[]> result = highlighter.highlightFields(fields, q, searcher, hits, maxPassages);
- preventOptimizeAway = result.size();
- }
- }
-
private class UnifiedHLImpl implements HLImpl {
UnifiedHighlighter highlighter;
IndexSearcher lastSearcher;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0d3c73ea/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/DefaultPassageFormatter.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/DefaultPassageFormatter.java b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/DefaultPassageFormatter.java
deleted file mode 100644
index 73822c8..0000000
--- a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/DefaultPassageFormatter.java
+++ /dev/null
@@ -1,137 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.search.postingshighlight;
-
-/**
- * Creates a formatted snippet from the top passages.
- * <p>
- * The default implementation marks the query terms as bold, and places
- * ellipses between unconnected passages.
- */
-public class DefaultPassageFormatter extends PassageFormatter {
- /** text that will appear before highlighted terms */
- protected final String preTag;
- /** text that will appear after highlighted terms */
- protected final String postTag;
- /** text that will appear between two unconnected passages */
- protected final String ellipsis;
- /** true if we should escape for html */
- protected final boolean escape;
-
- /**
- * Creates a new DefaultPassageFormatter with the default tags.
- */
- public DefaultPassageFormatter() {
- this("<b>", "</b>", "... ", false);
- }
-
- /**
- * Creates a new DefaultPassageFormatter with custom tags.
- * @param preTag text which should appear before a highlighted term.
- * @param postTag text which should appear after a highlighted term.
- * @param ellipsis text which should be used to connect two unconnected passages.
- * @param escape true if text should be html-escaped
- */
- public DefaultPassageFormatter(String preTag, String postTag, String ellipsis, boolean escape) {
- if (preTag == null || postTag == null || ellipsis == null) {
- throw new NullPointerException();
- }
- this.preTag = preTag;
- this.postTag = postTag;
- this.ellipsis = ellipsis;
- this.escape = escape;
- }
-
- @Override
- public String format(Passage passages[], String content) {
- StringBuilder sb = new StringBuilder();
- int pos = 0;
- for (Passage passage : passages) {
- // don't add ellipsis if it's the first one, or if it's connected.
- if (passage.startOffset > pos && pos > 0) {
- sb.append(ellipsis);
- }
- pos = passage.startOffset;
- for (int i = 0; i < passage.numMatches; i++) {
- int start = passage.matchStarts[i];
- int end = passage.matchEnds[i];
- // it's possible to have overlapping terms
- if (start > pos) {
- append(sb, content, pos, start);
- }
- if (end > pos) {
- sb.append(preTag);
- append(sb, content, Math.max(pos, start), end);
- sb.append(postTag);
- pos = end;
- }
- }
- // it's possible a "term" from the analyzer could span a sentence boundary.
- append(sb, content, pos, Math.max(pos, passage.endOffset));
- pos = passage.endOffset;
- }
- return sb.toString();
- }
-
- /**
- * Appends original text to the response.
- * @param dest resulting text, possibly transformed or encoded
- * @param content original text content
- * @param start index of the first character in content
- * @param end index of the character following the last character in content
- */
- protected void append(StringBuilder dest, String content, int start, int end) {
- if (escape) {
- // note: these are the rules from owasp.org
- for (int i = start; i < end; i++) {
- char ch = content.charAt(i);
- switch(ch) {
- case '&':
- dest.append("&amp;");
- break;
- case '<':
- dest.append("&lt;");
- break;
- case '>':
- dest.append("&gt;");
- break;
- case '"':
- dest.append("&quot;");
- break;
- case '\'':
- dest.append("&#x27;");
- break;
- case '/':
- dest.append("&#x2F;");
- break;
- default:
- if (ch >= 0x30 && ch <= 0x39 || ch >= 0x41 && ch <= 0x5A || ch >= 0x61 && ch <= 0x7A) {
- dest.append(ch);
- } else if (ch < 0xff) {
- dest.append("&#");
- dest.append((int)ch);
- dest.append(";");
- } else {
- dest.append(ch);
- }
- }
- }
- } else {
- dest.append(content, start, end);
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0d3c73ea/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/MultiTermHighlighting.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/MultiTermHighlighting.java b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/MultiTermHighlighting.java
deleted file mode 100644
index c9733d3..0000000
--- a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/MultiTermHighlighting.java
+++ /dev/null
@@ -1,282 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.search.postingshighlight;
-
-import java.io.IOException;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Comparator;
-import java.util.List;
-
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.index.PostingsEnum;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.search.AutomatonQuery;
-import org.apache.lucene.search.BooleanClause;
-import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.ConstantScoreQuery;
-import org.apache.lucene.search.DisjunctionMaxQuery;
-import org.apache.lucene.search.FuzzyQuery;
-import org.apache.lucene.search.PrefixQuery;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.TermRangeQuery;
-import org.apache.lucene.search.spans.SpanMultiTermQueryWrapper;
-import org.apache.lucene.search.spans.SpanNearQuery;
-import org.apache.lucene.search.spans.SpanNotQuery;
-import org.apache.lucene.search.spans.SpanOrQuery;
-import org.apache.lucene.search.spans.SpanPositionCheckQuery;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.CharsRef;
-import org.apache.lucene.util.UnicodeUtil;
-import org.apache.lucene.util.automaton.Automata;
-import org.apache.lucene.util.automaton.Automaton;
-import org.apache.lucene.util.automaton.CharacterRunAutomaton;
-import org.apache.lucene.util.automaton.LevenshteinAutomata;
-import org.apache.lucene.util.automaton.Operations;
-
-/**
- * Support for highlighting multiterm queries in PostingsHighlighter.
- */
-class MultiTermHighlighting {
-
- /**
- * Extracts all MultiTermQueries for {@code field}, and returns equivalent
- * automata that will match terms.
- */
- static CharacterRunAutomaton[] extractAutomata(Query query, String field) {
- List<CharacterRunAutomaton> list = new ArrayList<>();
- if (query instanceof BooleanQuery) {
- for (BooleanClause clause : (BooleanQuery) query) {
- if (!clause.isProhibited()) {
- list.addAll(Arrays.asList(extractAutomata(clause.getQuery(), field)));
- }
- }
- } else if (query instanceof ConstantScoreQuery) {
- list.addAll(Arrays.asList(extractAutomata(((ConstantScoreQuery) query).getQuery(), field)));
- } else if (query instanceof DisjunctionMaxQuery) {
- for (Query sub : ((DisjunctionMaxQuery) query).getDisjuncts()) {
- list.addAll(Arrays.asList(extractAutomata(sub, field)));
- }
- } else if (query instanceof SpanOrQuery) {
- for (Query sub : ((SpanOrQuery) query).getClauses()) {
- list.addAll(Arrays.asList(extractAutomata(sub, field)));
- }
- } else if (query instanceof SpanNearQuery) {
- for (Query sub : ((SpanNearQuery) query).getClauses()) {
- list.addAll(Arrays.asList(extractAutomata(sub, field)));
- }
- } else if (query instanceof SpanNotQuery) {
- list.addAll(Arrays.asList(extractAutomata(((SpanNotQuery) query).getInclude(), field)));
- } else if (query instanceof SpanPositionCheckQuery) {
- list.addAll(Arrays.asList(extractAutomata(((SpanPositionCheckQuery) query).getMatch(), field)));
- } else if (query instanceof SpanMultiTermQueryWrapper) {
- list.addAll(Arrays.asList(extractAutomata(((SpanMultiTermQueryWrapper<?>) query).getWrappedQuery(), field)));
- } else if (query instanceof PrefixQuery) {
- final PrefixQuery pq = (PrefixQuery) query;
- Term prefix = pq.getPrefix();
- if (prefix.field().equals(field)) {
- list.add(new CharacterRunAutomaton(Operations.concatenate(Automata.makeString(prefix.text()),
- Automata.makeAnyString())) {
- @Override
- public String toString() {
- return pq.toString();
- }
- });
- }
- } else if (query instanceof FuzzyQuery) {
- final FuzzyQuery fq = (FuzzyQuery) query;
- if (fq.getField().equals(field)) {
- String utf16 = fq.getTerm().text();
- int termText[] = new int[utf16.codePointCount(0, utf16.length())];
- for (int cp, i = 0, j = 0; i < utf16.length(); i += Character.charCount(cp)) {
- termText[j++] = cp = utf16.codePointAt(i);
- }
- int termLength = termText.length;
- int prefixLength = Math.min(fq.getPrefixLength(), termLength);
- String suffix = UnicodeUtil.newString(termText, prefixLength, termText.length - prefixLength);
- LevenshteinAutomata builder = new LevenshteinAutomata(suffix, fq.getTranspositions());
- String prefix = UnicodeUtil.newString(termText, 0, prefixLength);
- Automaton automaton = builder.toAutomaton(fq.getMaxEdits(), prefix);
- list.add(new CharacterRunAutomaton(automaton) {
- @Override
- public String toString() {
- return fq.toString();
- }
- });
- }
- } else if (query instanceof TermRangeQuery) {
- final TermRangeQuery tq = (TermRangeQuery) query;
- if (tq.getField().equals(field)) {
- final CharsRef lowerBound;
- if (tq.getLowerTerm() == null) {
- lowerBound = null;
- } else {
- lowerBound = new CharsRef(tq.getLowerTerm().utf8ToString());
- }
-
- final CharsRef upperBound;
- if (tq.getUpperTerm() == null) {
- upperBound = null;
- } else {
- upperBound = new CharsRef(tq.getUpperTerm().utf8ToString());
- }
-
- final boolean includeLower = tq.includesLower();
- final boolean includeUpper = tq.includesUpper();
- final CharsRef scratch = new CharsRef();
- final Comparator<CharsRef> comparator = CharsRef.getUTF16SortedAsUTF8Comparator();
-
- // this is *not* an automaton, but it's very simple
- list.add(new CharacterRunAutomaton(Automata.makeEmpty()) {
- @Override
- public boolean run(char[] s, int offset, int length) {
- scratch.chars = s;
- scratch.offset = offset;
- scratch.length = length;
-
- if (lowerBound != null) {
- int cmp = comparator.compare(scratch, lowerBound);
- if (cmp < 0 || (!includeLower && cmp == 0)) {
- return false;
- }
- }
-
- if (upperBound != null) {
- int cmp = comparator.compare(scratch, upperBound);
- if (cmp > 0 || (!includeUpper && cmp == 0)) {
- return false;
- }
- }
- return true;
- }
-
- @Override
- public String toString() {
- return tq.toString();
- }
- });
- }
- } else if (query instanceof AutomatonQuery) {
- final AutomatonQuery aq = (AutomatonQuery) query;
- if (aq.getField().equals(field)) {
- list.add(new CharacterRunAutomaton(aq.getAutomaton()) {
- @Override
- public String toString() {
- return aq.toString();
- }
- });
- }
- }
- return list.toArray(new CharacterRunAutomaton[list.size()]);
- }
-
- /**
- * Returns a "fake" DocsAndPositionsEnum over the tokenstream, returning offsets where {@code matchers}
- * matches tokens.
- * <p>
- * This is solely used internally by PostingsHighlighter: <b>DO NOT USE THIS METHOD!</b>
- */
- static PostingsEnum getDocsEnum(final TokenStream ts, final CharacterRunAutomaton[] matchers) throws IOException {
- final CharTermAttribute charTermAtt = ts.addAttribute(CharTermAttribute.class);
- final OffsetAttribute offsetAtt = ts.addAttribute(OffsetAttribute.class);
- ts.reset();
-
- // TODO: we could use CachingWrapperFilter, (or consume twice) to allow us to have a true freq()
- // but this would have a performance cost for likely little gain in the user experience, it
- // would only serve to make this method less bogus.
- // instead, we always return freq() = Integer.MAX_VALUE and let PH terminate based on offset...
-
- return new PostingsEnum() {
- int currentDoc = -1;
- int currentMatch = -1;
- int currentStartOffset = -1;
- int currentEndOffset = -1;
- TokenStream stream = ts;
-
- final BytesRef matchDescriptions[] = new BytesRef[matchers.length];
-
- @Override
- public int nextPosition() throws IOException {
- if (stream != null) {
- while (stream.incrementToken()) {
- for (int i = 0; i < matchers.length; i++) {
- if (matchers[i].run(charTermAtt.buffer(), 0, charTermAtt.length())) {
- currentStartOffset = offsetAtt.startOffset();
- currentEndOffset = offsetAtt.endOffset();
- currentMatch = i;
- return 0;
- }
- }
- }
- stream.end();
- stream.close();
- stream = null;
- }
- // exhausted
- currentStartOffset = currentEndOffset = Integer.MAX_VALUE;
- return Integer.MAX_VALUE;
- }
-
- @Override
- public int freq() throws IOException {
- return Integer.MAX_VALUE; // lie
- }
-
- @Override
- public int startOffset() throws IOException {
- assert currentStartOffset >= 0;
- return currentStartOffset;
- }
-
- @Override
- public int endOffset() throws IOException {
- assert currentEndOffset >= 0;
- return currentEndOffset;
- }
-
- @Override
- public BytesRef getPayload() throws IOException {
- if (matchDescriptions[currentMatch] == null) {
- matchDescriptions[currentMatch] = new BytesRef(matchers[currentMatch].toString());
- }
- return matchDescriptions[currentMatch];
- }
-
- @Override
- public int docID() {
- return currentDoc;
- }
-
- @Override
- public int nextDoc() throws IOException {
- throw new UnsupportedOperationException();
- }
-
- @Override
- public int advance(int target) throws IOException {
- return currentDoc = target;
- }
-
- @Override
- public long cost() {
- return 0;
- }
- };
- }
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0d3c73ea/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/Passage.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/Passage.java b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/Passage.java
deleted file mode 100644
index 50aebea..0000000
--- a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/Passage.java
+++ /dev/null
@@ -1,159 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.search.postingshighlight;
-
-import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.InPlaceMergeSorter;
-import org.apache.lucene.util.RamUsageEstimator;
-
-/**
- * Represents a passage (typically a sentence of the document).
- * <p>
- * A passage contains {@link #getNumMatches} highlights from the query,
- * and the offsets and query terms that correspond with each match.
- * @lucene.experimental
- */
-public final class Passage {
- int startOffset = -1;
- int endOffset = -1;
- float score = 0.0f;
-
- int matchStarts[] = new int[8];
- int matchEnds[] = new int[8];
- BytesRef matchTerms[] = new BytesRef[8];
- int numMatches = 0;
-
- void addMatch(int startOffset, int endOffset, BytesRef term) {
- assert startOffset >= this.startOffset && startOffset <= this.endOffset;
- if (numMatches == matchStarts.length) {
- int newLength = ArrayUtil.oversize(numMatches+1, RamUsageEstimator.NUM_BYTES_OBJECT_REF);
- int newMatchStarts[] = new int[newLength];
- int newMatchEnds[] = new int[newLength];
- BytesRef newMatchTerms[] = new BytesRef[newLength];
- System.arraycopy(matchStarts, 0, newMatchStarts, 0, numMatches);
- System.arraycopy(matchEnds, 0, newMatchEnds, 0, numMatches);
- System.arraycopy(matchTerms, 0, newMatchTerms, 0, numMatches);
- matchStarts = newMatchStarts;
- matchEnds = newMatchEnds;
- matchTerms = newMatchTerms;
- }
- assert matchStarts.length == matchEnds.length && matchEnds.length == matchTerms.length;
- matchStarts[numMatches] = startOffset;
- matchEnds[numMatches] = endOffset;
- matchTerms[numMatches] = term;
- numMatches++;
- }
-
- void sort() {
- final int starts[] = matchStarts;
- final int ends[] = matchEnds;
- final BytesRef terms[] = matchTerms;
- new InPlaceMergeSorter() {
- @Override
- protected void swap(int i, int j) {
- int temp = starts[i];
- starts[i] = starts[j];
- starts[j] = temp;
-
- temp = ends[i];
- ends[i] = ends[j];
- ends[j] = temp;
-
- BytesRef tempTerm = terms[i];
- terms[i] = terms[j];
- terms[j] = tempTerm;
- }
-
- @Override
- protected int compare(int i, int j) {
- return Integer.compare(starts[i], starts[j]);
- }
-
- }.sort(0, numMatches);
- }
-
- void reset() {
- startOffset = endOffset = -1;
- score = 0.0f;
- numMatches = 0;
- }
-
- /**
- * Start offset of this passage.
- * @return start index (inclusive) of the passage in the
- * original content: always >= 0.
- */
- public int getStartOffset() {
- return startOffset;
- }
-
- /**
- * End offset of this passage.
- * @return end index (exclusive) of the passage in the
- * original content: always >= {@link #getStartOffset()}
- */
- public int getEndOffset() {
- return endOffset;
- }
-
- /**
- * Passage's score.
- */
- public float getScore() {
- return score;
- }
-
- /**
- * Number of term matches available in
- * {@link #getMatchStarts}, {@link #getMatchEnds},
- * {@link #getMatchTerms}
- */
- public int getNumMatches() {
- return numMatches;
- }
-
- /**
- * Start offsets of the term matches, in increasing order.
- * <p>
- * Only {@link #getNumMatches} are valid. Note that these
- * offsets are absolute (not relative to {@link #getStartOffset()}).
- */
- public int[] getMatchStarts() {
- return matchStarts;
- }
-
- /**
- * End offsets of the term matches, corresponding with {@link #getMatchStarts}.
- * <p>
- * Only {@link #getNumMatches} are valid. Note that it's possible that an end offset
- * could exceed beyond the bounds of the passage ({@link #getEndOffset()}), if the
- * Analyzer produced a term which spans a passage boundary.
- */
- public int[] getMatchEnds() {
- return matchEnds;
- }
-
- /**
- * BytesRef (term text) of the matches, corresponding with {@link #getMatchStarts()}.
- * <p>
- * Only {@link #getNumMatches()} are valid.
- */
- public BytesRef[] getMatchTerms() {
- return matchTerms;
- }
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0d3c73ea/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java
deleted file mode 100644
index f1596c1..0000000
--- a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageFormatter.java
+++ /dev/null
@@ -1,40 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.search.postingshighlight;
-
-/**
- * Creates a formatted snippet from the top passages.
- *
- * @lucene.experimental
- */
-public abstract class PassageFormatter {
-
- /**
- * Formats the top <code>passages</code> from <code>content</code>
- * into a human-readable text snippet.
- *
- * @param passages top-N passages for the field. Note these are sorted in
- * the order that they appear in the document for convenience.
- * @param content content for the field.
- * @return formatted highlight. Note that for the
- * non-expert APIs in {@link PostingsHighlighter} that
- * return String, the toString method on the Object
- * returned by this method is used to compute the string.
- */
- public abstract Object format(Passage passages[], String content);
-
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0d3c73ea/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageScorer.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageScorer.java b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageScorer.java
deleted file mode 100644
index 1f74f7a..0000000
--- a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PassageScorer.java
+++ /dev/null
@@ -1,104 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.search.postingshighlight;
-
-/**
- * Ranks passages found by {@link PostingsHighlighter}.
- * <p>
- * Each passage is scored as a miniature document within the document.
- * The final score is computed as {@link #norm} * ∑ ({@link #weight} * {@link #tf}).
- * The default implementation is {@link #norm} * BM25.
- * @lucene.experimental
- */
-public class PassageScorer {
-
- // TODO: this formula is completely made up. It might not provide relevant snippets!
-
- /** BM25 k1 parameter, controls term frequency normalization */
- final float k1;
- /** BM25 b parameter, controls length normalization. */
- final float b;
- /** A pivot used for length normalization. */
- final float pivot;
-
- /**
- * Creates PassageScorer with these default values:
- * <ul>
- * <li>{@code k1 = 1.2},
- * <li>{@code b = 0.75}.
- * <li>{@code pivot = 87}
- * </ul>
- */
- public PassageScorer() {
- // 1.2 and 0.75 are well-known bm25 defaults (but maybe not the best here) ?
- // 87 is typical average english sentence length.
- this(1.2f, 0.75f, 87f);
- }
-
- /**
- * Creates PassageScorer with specified scoring parameters
- * @param k1 Controls non-linear term frequency normalization (saturation).
- * @param b Controls to what degree passage length normalizes tf values.
- * @param pivot Pivot value for length normalization (some rough idea of average sentence length in characters).
- */
- public PassageScorer(float k1, float b, float pivot) {
- this.k1 = k1;
- this.b = b;
- this.pivot = pivot;
- }
-
- /**
- * Computes term importance, given its in-document statistics.
- *
- * @param contentLength length of document in characters
- * @param totalTermFreq number of time term occurs in document
- * @return term importance
- */
- public float weight(int contentLength, int totalTermFreq) {
- // approximate #docs from content length
- float numDocs = 1 + contentLength / pivot;
- // numDocs not numDocs - docFreq (ala DFR), since we approximate numDocs
- return (k1 + 1) * (float) Math.log(1 + (numDocs + 0.5D)/(totalTermFreq + 0.5D));
- }
-
- /**
- * Computes term weight, given the frequency within the passage
- * and the passage's length.
- *
- * @param freq number of occurrences of within this passage
- * @param passageLen length of the passage in characters.
- * @return term weight
- */
- public float tf(int freq, int passageLen) {
- float norm = k1 * ((1 - b) + b * (passageLen / pivot));
- return freq / (freq + norm);
- }
-
- /**
- * Normalize a passage according to its position in the document.
- * <p>
- * Typically passages towards the beginning of the document are
- * more useful for summarizing the contents.
- * <p>
- * The default implementation is <code>1 + 1/log(pivot + passageStart)</code>
- * @param passageStart start offset of the passage
- * @return a boost value multiplied into the passage's core.
- */
- public float norm(int passageStart) {
- return 1 + 1/(float)Math.log(pivot + passageStart);
- }
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0d3c73ea/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
deleted file mode 100644
index e4d3667..0000000
--- a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/PostingsHighlighter.java
+++ /dev/null
@@ -1,820 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.lucene.search.postingshighlight;
-
-import java.io.IOException;
-import java.nio.charset.StandardCharsets;
-import java.text.BreakIterator;
-import java.util.ArrayList;
-import java.util.Arrays;
-import java.util.Comparator;
-import java.util.HashMap;
-import java.util.List;
-import java.util.Locale;
-import java.util.Map;
-import java.util.PriorityQueue;
-import java.util.SortedSet;
-import java.util.TreeSet;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.index.FieldInfo;
-import org.apache.lucene.index.IndexOptions;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexReaderContext;
-import org.apache.lucene.index.LeafReader;
-import org.apache.lucene.index.LeafReaderContext;
-import org.apache.lucene.index.MultiReader;
-import org.apache.lucene.index.PostingsEnum;
-import org.apache.lucene.index.ReaderUtil;
-import org.apache.lucene.index.StoredFieldVisitor;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.Terms;
-import org.apache.lucene.index.TermsEnum;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.ScoreDoc;
-import org.apache.lucene.search.TopDocs;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.InPlaceMergeSorter;
-import org.apache.lucene.util.UnicodeUtil;
-import org.apache.lucene.util.automaton.CharacterRunAutomaton;
-
-/**
- * Simple highlighter that does not analyze fields nor use
- * term vectors. Instead it requires
- * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}.
- * <p>
- * PostingsHighlighter treats the single original document as the whole corpus, and then scores individual
- * passages as if they were documents in this corpus. It uses a {@link BreakIterator} to find
- * passages in the text; by default it breaks using {@link BreakIterator#getSentenceInstance(Locale)
- * getSentenceInstance(Locale.ROOT)}. It then iterates in parallel (merge sorting by offset) through
- * the positions of all terms from the query, coalescing those hits that occur in a single passage
- * into a {@link Passage}, and then scores each Passage using a separate {@link PassageScorer}.
- * Passages are finally formatted into highlighted snippets with a {@link PassageFormatter}.
- * <p>
- * You can customize the behavior by subclassing this highlighter, some important hooks:
- * <ul>
- * <li>{@link #getBreakIterator(String)}: Customize how the text is divided into passages.
- * <li>{@link #getScorer(String)}: Customize how passages are ranked.
- * <li>{@link #getFormatter(String)}: Customize how snippets are formatted.
- * <li>{@link #getIndexAnalyzer(String)}: Enable highlighting of MultiTermQuerys such as {@code WildcardQuery}.
- * </ul>
- * <p>
- * <b>WARNING</b>: The code is very new and probably still has some exciting bugs!
- * <p>
- * Example usage:
- * <pre class="prettyprint">
- * // configure field with offsets at index time
- * FieldType offsetsType = new FieldType(TextField.TYPE_STORED);
- * offsetsType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
- * Field body = new Field("body", "foobar", offsetsType);
- *
- * // retrieve highlights at query time
- * PostingsHighlighter highlighter = new PostingsHighlighter();
- * Query query = new TermQuery(new Term("body", "highlighting"));
- * TopDocs topDocs = searcher.search(query, n);
- * String highlights[] = highlighter.highlight("body", query, searcher, topDocs);
- * </pre>
- * <p>
- * This is thread-safe, and can be used across different readers.
- * @lucene.experimental
- */
-public class PostingsHighlighter {
-
- // TODO: maybe allow re-analysis for tiny fields? currently we require offsets,
- // but if the analyzer is really fast and the field is tiny, this might really be
- // unnecessary.
-
- /** for rewriting: we don't want slow processing from MTQs */
- private static final IndexSearcher EMPTY_INDEXSEARCHER;
- static {
- try {
- IndexReader emptyReader = new MultiReader();
- EMPTY_INDEXSEARCHER = new IndexSearcher(emptyReader);
- EMPTY_INDEXSEARCHER.setQueryCache(null);
- } catch (IOException bogus) {
- throw new RuntimeException(bogus);
- }
- }
-
- /** Default maximum content size to process. Typically snippets
- * closer to the beginning of the document better summarize its content */
- public static final int DEFAULT_MAX_LENGTH = 10000;
-
- private final int maxLength;
-
- /** Set the first time {@link #getFormatter} is called,
- * and then reused. */
- private PassageFormatter defaultFormatter;
-
- /** Set the first time {@link #getScorer} is called,
- * and then reused. */
- private PassageScorer defaultScorer;
-
- /**
- * Creates a new highlighter with {@link #DEFAULT_MAX_LENGTH}.
- */
- public PostingsHighlighter() {
- this(DEFAULT_MAX_LENGTH);
- }
-
- /**
- * Creates a new highlighter, specifying maximum content length.
- * @param maxLength maximum content size to process.
- * @throws IllegalArgumentException if <code>maxLength</code> is negative or <code>Integer.MAX_VALUE</code>
- */
- public PostingsHighlighter(int maxLength) {
- if (maxLength < 0 || maxLength == Integer.MAX_VALUE) {
- // two reasons: no overflow problems in BreakIterator.preceding(offset+1),
- // our sentinel in the offsets queue uses this value to terminate.
- throw new IllegalArgumentException("maxLength must be < Integer.MAX_VALUE");
- }
- this.maxLength = maxLength;
- }
-
- /** Returns the {@link BreakIterator} to use for
- * dividing text into passages. This returns
- * {@link BreakIterator#getSentenceInstance(Locale)} by default;
- * subclasses can override to customize. */
- protected BreakIterator getBreakIterator(String field) {
- return BreakIterator.getSentenceInstance(Locale.ROOT);
- }
-
- /** Returns the {@link PassageFormatter} to use for
- * formatting passages into highlighted snippets. This
- * returns a new {@code PassageFormatter} by default;
- * subclasses can override to customize. */
- protected PassageFormatter getFormatter(String field) {
- if (defaultFormatter == null) {
- defaultFormatter = new DefaultPassageFormatter();
- }
- return defaultFormatter;
- }
-
- /** Returns the {@link PassageScorer} to use for
- * ranking passages. This
- * returns a new {@code PassageScorer} by default;
- * subclasses can override to customize. */
- protected PassageScorer getScorer(String field) {
- if (defaultScorer == null) {
- defaultScorer = new PassageScorer();
- }
- return defaultScorer;
- }
-
- /**
- * Highlights the top passages from a single field.
- *
- * @param field field name to highlight.
- * Must have a stored string value and also be indexed with offsets.
- * @param query query to highlight.
- * @param searcher searcher that was previously used to execute the query.
- * @param topDocs TopDocs containing the summary result documents to highlight.
- * @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>.
- * If no highlights were found for a document, the
- * first sentence for the field will be returned.
- * @throws IOException if an I/O error occurred during processing
- * @throws IllegalArgumentException if <code>field</code> was indexed without
- * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
- */
- public String[] highlight(String field, Query query, IndexSearcher searcher, TopDocs topDocs) throws IOException {
- return highlight(field, query, searcher, topDocs, 1);
- }
-
- /**
- * Highlights the top-N passages from a single field.
- *
- * @param field field name to highlight.
- * Must have a stored string value and also be indexed with offsets.
- * @param query query to highlight.
- * @param searcher searcher that was previously used to execute the query.
- * @param topDocs TopDocs containing the summary result documents to highlight.
- * @param maxPassages The maximum number of top-N ranked passages used to
- * form the highlighted snippets.
- * @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>.
- * If no highlights were found for a document, the
- * first {@code maxPassages} sentences from the
- * field will be returned.
- * @throws IOException if an I/O error occurred during processing
- * @throws IllegalArgumentException if <code>field</code> was indexed without
- * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
- */
- public String[] highlight(String field, Query query, IndexSearcher searcher, TopDocs topDocs, int maxPassages) throws IOException {
- Map<String,String[]> res = highlightFields(new String[] { field }, query, searcher, topDocs, new int[] { maxPassages });
- return res.get(field);
- }
-
- /**
- * Highlights the top passages from multiple fields.
- * <p>
- * Conceptually, this behaves as a more efficient form of:
- * <pre class="prettyprint">
- * Map m = new HashMap();
- * for (String field : fields) {
- * m.put(field, highlight(field, query, searcher, topDocs));
- * }
- * return m;
- * </pre>
- *
- * @param fields field names to highlight.
- * Must have a stored string value and also be indexed with offsets.
- * @param query query to highlight.
- * @param searcher searcher that was previously used to execute the query.
- * @param topDocs TopDocs containing the summary result documents to highlight.
- * @return Map keyed on field name, containing the array of formatted snippets
- * corresponding to the documents in <code>topDocs</code>.
- * If no highlights were found for a document, the
- * first sentence from the field will be returned.
- * @throws IOException if an I/O error occurred during processing
- * @throws IllegalArgumentException if <code>field</code> was indexed without
- * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
- */
- public Map<String,String[]> highlightFields(String fields[], Query query, IndexSearcher searcher, TopDocs topDocs) throws IOException {
- int maxPassages[] = new int[fields.length];
- Arrays.fill(maxPassages, 1);
- return highlightFields(fields, query, searcher, topDocs, maxPassages);
- }
-
- /**
- * Highlights the top-N passages from multiple fields.
- * <p>
- * Conceptually, this behaves as a more efficient form of:
- * <pre class="prettyprint">
- * Map m = new HashMap();
- * for (String field : fields) {
- * m.put(field, highlight(field, query, searcher, topDocs, maxPassages));
- * }
- * return m;
- * </pre>
- *
- * @param fields field names to highlight.
- * Must have a stored string value and also be indexed with offsets.
- * @param query query to highlight.
- * @param searcher searcher that was previously used to execute the query.
- * @param topDocs TopDocs containing the summary result documents to highlight.
- * @param maxPassages The maximum number of top-N ranked passages per-field used to
- * form the highlighted snippets.
- * @return Map keyed on field name, containing the array of formatted snippets
- * corresponding to the documents in <code>topDocs</code>.
- * If no highlights were found for a document, the
- * first {@code maxPassages} sentences from the
- * field will be returned.
- * @throws IOException if an I/O error occurred during processing
- * @throws IllegalArgumentException if <code>field</code> was indexed without
- * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
- */
- public Map<String,String[]> highlightFields(String fields[], Query query, IndexSearcher searcher, TopDocs topDocs, int maxPassages[]) throws IOException {
- final ScoreDoc scoreDocs[] = topDocs.scoreDocs;
- int docids[] = new int[scoreDocs.length];
- for (int i = 0; i < docids.length; i++) {
- docids[i] = scoreDocs[i].doc;
- }
-
- return highlightFields(fields, query, searcher, docids, maxPassages);
- }
-
- /**
- * Highlights the top-N passages from multiple fields,
- * for the provided int[] docids.
- *
- * @param fieldsIn field names to highlight.
- * Must have a stored string value and also be indexed with offsets.
- * @param query query to highlight.
- * @param searcher searcher that was previously used to execute the query.
- * @param docidsIn containing the document IDs to highlight.
- * @param maxPassagesIn The maximum number of top-N ranked passages per-field used to
- * form the highlighted snippets.
- * @return Map keyed on field name, containing the array of formatted snippets
- * corresponding to the documents in <code>docidsIn</code>.
- * If no highlights were found for a document, the
- * first {@code maxPassages} sentences from the field will
- * be returned.
- * @throws IOException if an I/O error occurred during processing
- * @throws IllegalArgumentException if <code>field</code> was indexed without
- * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
- */
- public Map<String,String[]> highlightFields(String fieldsIn[], Query query, IndexSearcher searcher, int[] docidsIn, int maxPassagesIn[]) throws IOException {
- Map<String,String[]> snippets = new HashMap<>();
- for(Map.Entry<String,Object[]> ent : highlightFieldsAsObjects(fieldsIn, query, searcher, docidsIn, maxPassagesIn).entrySet()) {
- Object[] snippetObjects = ent.getValue();
- String[] snippetStrings = new String[snippetObjects.length];
- snippets.put(ent.getKey(), snippetStrings);
- for(int i=0;i<snippetObjects.length;i++) {
- Object snippet = snippetObjects[i];
- if (snippet != null) {
- snippetStrings[i] = snippet.toString();
- }
- }
- }
-
- return snippets;
- }
-
- /**
- * Expert: highlights the top-N passages from multiple fields,
- * for the provided int[] docids, to custom Object as
- * returned by the {@link PassageFormatter}. Use
- * this API to render to something other than String.
- *
- * @param fieldsIn field names to highlight.
- * Must have a stored string value and also be indexed with offsets.
- * @param query query to highlight.
- * @param searcher searcher that was previously used to execute the query.
- * @param docidsIn containing the document IDs to highlight.
- * @param maxPassagesIn The maximum number of top-N ranked passages per-field used to
- * form the highlighted snippets.
- * @return Map keyed on field name, containing the array of formatted snippets
- * corresponding to the documents in <code>docidsIn</code>.
- * If no highlights were found for a document, the
- * first {@code maxPassages} sentences from the field will
- * be returned.
- * @throws IOException if an I/O error occurred during processing
- * @throws IllegalArgumentException if <code>field</code> was indexed without
- * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
- */
- protected Map<String,Object[]> highlightFieldsAsObjects(String fieldsIn[], Query query, IndexSearcher searcher, int[] docidsIn, int maxPassagesIn[]) throws IOException {
- if (fieldsIn.length < 1) {
- throw new IllegalArgumentException("fieldsIn must not be empty");
- }
- if (fieldsIn.length != maxPassagesIn.length) {
- throw new IllegalArgumentException("invalid number of maxPassagesIn");
- }
- SortedSet<Term> queryTerms = new TreeSet<>();
- EMPTY_INDEXSEARCHER.createNormalizedWeight(query, false).extractTerms(queryTerms);
-
- IndexReaderContext readerContext = searcher.getIndexReader().getContext();
- List<LeafReaderContext> leaves = readerContext.leaves();
-
- // Make our own copies because we sort in-place:
- int[] docids = new int[docidsIn.length];
- System.arraycopy(docidsIn, 0, docids, 0, docidsIn.length);
- final String fields[] = new String[fieldsIn.length];
- System.arraycopy(fieldsIn, 0, fields, 0, fieldsIn.length);
- final int maxPassages[] = new int[maxPassagesIn.length];
- System.arraycopy(maxPassagesIn, 0, maxPassages, 0, maxPassagesIn.length);
-
- // sort for sequential io
- Arrays.sort(docids);
- new InPlaceMergeSorter() {
-
- @Override
- protected void swap(int i, int j) {
- String tmp = fields[i];
- fields[i] = fields[j];
- fields[j] = tmp;
- int tmp2 = maxPassages[i];
- maxPassages[i] = maxPassages[j];
- maxPassages[j] = tmp2;
- }
-
- @Override
- protected int compare(int i, int j) {
- return fields[i].compareTo(fields[j]);
- }
-
- }.sort(0, fields.length);
-
- // pull stored data:
- String[][] contents = loadFieldValues(searcher, fields, docids, maxLength);
-
- Map<String,Object[]> highlights = new HashMap<>();
- for (int i = 0; i < fields.length; i++) {
- String field = fields[i];
- int numPassages = maxPassages[i];
- Term floor = new Term(field, "");
- Term ceiling = new Term(field, UnicodeUtil.BIG_TERM);
- SortedSet<Term> fieldTerms = queryTerms.subSet(floor, ceiling);
- // TODO: should we have some reasonable defaults for term pruning? (e.g. stopwords)
-
- // Strip off the redundant field:
- BytesRef terms[] = new BytesRef[fieldTerms.size()];
- int termUpto = 0;
- for(Term term : fieldTerms) {
- terms[termUpto++] = term.bytes();
- }
- Map<Integer,Object> fieldHighlights = highlightField(field, contents[i], getBreakIterator(field), terms, docids, leaves, numPassages, query);
-
- Object[] result = new Object[docids.length];
- for (int j = 0; j < docidsIn.length; j++) {
- result[j] = fieldHighlights.get(docidsIn[j]);
- }
- highlights.put(field, result);
- }
- return highlights;
- }
-
- /** Loads the String values for each field X docID to be
- * highlighted. By default this loads from stored
- * fields, but a subclass can change the source. This
- * method should allocate the String[fields.length][docids.length]
- * and fill all values. The returned Strings must be
- * identical to what was indexed. */
- protected String[][] loadFieldValues(IndexSearcher searcher, String[] fields, int[] docids, int maxLength) throws IOException {
- String contents[][] = new String[fields.length][docids.length];
- char valueSeparators[] = new char[fields.length];
- for (int i = 0; i < fields.length; i++) {
- valueSeparators[i] = getMultiValuedSeparator(fields[i]);
- }
- LimitedStoredFieldVisitor visitor = new LimitedStoredFieldVisitor(fields, valueSeparators, maxLength);
- for (int i = 0; i < docids.length; i++) {
- searcher.doc(docids[i], visitor);
- for (int j = 0; j < fields.length; j++) {
- contents[j][i] = visitor.getValue(j).toString();
- }
- visitor.reset();
- }
- return contents;
- }
-
- /**
- * Returns the logical separator between values for multi-valued fields.
- * The default value is a space character, which means passages can span across values,
- * but a subclass can override, for example with {@code U+2029 PARAGRAPH SEPARATOR (PS)}
- * if each value holds a discrete passage for highlighting.
- */
- protected char getMultiValuedSeparator(String field) {
- return ' ';
- }
-
- /**
- * Returns the analyzer originally used to index the content for {@code field}.
- * <p>
- * This is used to highlight some MultiTermQueries.
- * @return Analyzer or null (the default, meaning no special multi-term processing)
- */
- protected Analyzer getIndexAnalyzer(String field) {
- return null;
- }
-
- private Map<Integer,Object> highlightField(String field, String contents[], BreakIterator bi, BytesRef terms[], int[] docids, List<LeafReaderContext> leaves, int maxPassages, Query query) throws IOException {
- Map<Integer,Object> highlights = new HashMap<>();
-
- PassageFormatter fieldFormatter = getFormatter(field);
- if (fieldFormatter == null) {
- throw new NullPointerException("PassageFormatter must not be null");
- }
-
- // check if we should do any multiterm processing
- Analyzer analyzer = getIndexAnalyzer(field);
- CharacterRunAutomaton automata[] = new CharacterRunAutomaton[0];
- if (analyzer != null) {
- automata = MultiTermHighlighting.extractAutomata(query, field);
- }
-
- // resize 'terms', where the last term is the multiterm matcher
- if (automata.length > 0) {
- BytesRef newTerms[] = new BytesRef[terms.length + 1];
- System.arraycopy(terms, 0, newTerms, 0, terms.length);
- terms = newTerms;
- }
-
- // we are processing in increasing docid order, so we only need to reinitialize stuff on segment changes
- // otherwise, we will just advance() existing enums to the new document in the same segment.
- PostingsEnum postings[] = null;
- TermsEnum termsEnum = null;
- int lastLeaf = -1;
-
- for (int i = 0; i < docids.length; i++) {
- String content = contents[i];
- if (content.length() == 0) {
- continue; // nothing to do
- }
- bi.setText(content);
- int doc = docids[i];
- int leaf = ReaderUtil.subIndex(doc, leaves);
- LeafReaderContext subContext = leaves.get(leaf);
- LeafReader r = subContext.reader();
-
- assert leaf >= lastLeaf; // increasing order
-
- // if the segment has changed, we must initialize new enums.
- if (leaf != lastLeaf) {
- Terms t = r.terms(field);
- if (t != null) {
- if (!t.hasOffsets()) {
- // no offsets available
- throw new IllegalArgumentException("field '" + field + "' was indexed without offsets, cannot highlight");
- }
- termsEnum = t.iterator();
- postings = new PostingsEnum[terms.length];
- } else {
- termsEnum = null;
- }
- }
- if (termsEnum == null) {
- continue; // no terms for this field, nothing to do
- }
-
- // if there are multi-term matches, we have to initialize the "fake" enum for each document
- if (automata.length > 0) {
- PostingsEnum dp = MultiTermHighlighting.getDocsEnum(analyzer.tokenStream(field, content), automata);
- dp.advance(doc - subContext.docBase);
- postings[terms.length-1] = dp; // last term is the multiterm matcher
- }
-
- Passage passages[] = highlightDoc(field, terms, content.length(), bi, doc - subContext.docBase, termsEnum, postings, maxPassages);
-
- if (passages.length == 0) {
- // no passages were returned, so ask for a default summary
- passages = getEmptyHighlight(field, bi, maxPassages);
- }
-
- if (passages.length > 0) {
- highlights.put(doc, fieldFormatter.format(passages, content));
- }
-
- lastLeaf = leaf;
- }
-
- return highlights;
- }
-
- // algorithm: treat sentence snippets as miniature documents
- // we can intersect these with the postings lists via BreakIterator.preceding(offset),
- // score each sentence as norm(sentenceStartOffset) * sum(weight * tf(freq))
- private Passage[] highlightDoc(String field, BytesRef terms[], int contentLength, BreakIterator bi, int doc,
- TermsEnum termsEnum, PostingsEnum[] postings, int n) throws IOException {
- PassageScorer scorer = getScorer(field);
- if (scorer == null) {
- throw new NullPointerException("PassageScorer must not be null");
- }
- PriorityQueue<OffsetsEnum> pq = new PriorityQueue<>();
- float weights[] = new float[terms.length];
- // initialize postings
- for (int i = 0; i < terms.length; i++) {
- PostingsEnum de = postings[i];
- int pDoc;
- if (de == EMPTY) {
- continue;
- } else if (de == null) {
- postings[i] = EMPTY; // initially
- if (!termsEnum.seekExact(terms[i])) {
- continue; // term not found
- }
- de = postings[i] = termsEnum.postings(null, PostingsEnum.OFFSETS);
- assert de != null;
- pDoc = de.advance(doc);
- } else {
- pDoc = de.docID();
- if (pDoc < doc) {
- pDoc = de.advance(doc);
- }
- }
-
- if (doc == pDoc) {
- weights[i] = scorer.weight(contentLength, de.freq());
- de.nextPosition();
- pq.add(new OffsetsEnum(de, i));
- }
- }
-
- pq.add(new OffsetsEnum(EMPTY, Integer.MAX_VALUE)); // a sentinel for termination
-
- PriorityQueue<Passage> passageQueue = new PriorityQueue<>(n, new Comparator<Passage>() {
- @Override
- public int compare(Passage left, Passage right) {
- if (left.score < right.score) {
- return -1;
- } else if (left.score > right.score) {
- return 1;
- } else {
- return left.startOffset - right.startOffset;
- }
- }
- });
- Passage current = new Passage();
-
- OffsetsEnum off;
- while ((off = pq.poll()) != null) {
- final PostingsEnum dp = off.dp;
- int start = dp.startOffset();
- assert start >= 0;
- int end = dp.endOffset();
- // LUCENE-5166: this hit would span the content limit... however more valid
- // hits may exist (they are sorted by start). so we pretend like we never
- // saw this term, it won't cause a passage to be added to passageQueue or anything.
- assert EMPTY.startOffset() == Integer.MAX_VALUE;
- if (start < contentLength && end > contentLength) {
- continue;
- }
- if (start >= current.endOffset) {
- if (current.startOffset >= 0) {
- // finalize current
- current.score *= scorer.norm(current.startOffset);
- // new sentence: first add 'current' to queue
- if (passageQueue.size() == n && current.score < passageQueue.peek().score) {
- current.reset(); // can't compete, just reset it
- } else {
- passageQueue.offer(current);
- if (passageQueue.size() > n) {
- current = passageQueue.poll();
- current.reset();
- } else {
- current = new Passage();
- }
- }
- }
- // if we exceed limit, we are done
- if (start >= contentLength) {
- Passage passages[] = new Passage[passageQueue.size()];
- passageQueue.toArray(passages);
- for (Passage p : passages) {
- p.sort();
- }
- // sort in ascending order
- Arrays.sort(passages, new Comparator<Passage>() {
- @Override
- public int compare(Passage left, Passage right) {
- return left.startOffset - right.startOffset;
- }
- });
- return passages;
- }
- // advance breakiterator
- assert BreakIterator.DONE < 0;
- current.startOffset = Math.max(bi.preceding(start+1), 0);
- current.endOffset = Math.min(bi.next(), contentLength);
- }
- int tf = 0;
- while (true) {
- tf++;
- BytesRef term = terms[off.id];
- if (term == null) {
- // multitermquery match, pull from payload
- term = off.dp.getPayload();
- assert term != null;
- }
- current.addMatch(start, end, term);
- if (off.pos == dp.freq()) {
- break; // removed from pq
- } else {
- off.pos++;
- dp.nextPosition();
- start = dp.startOffset();
- end = dp.endOffset();
- }
- if (start >= current.endOffset || end > contentLength) {
- pq.offer(off);
- break;
- }
- }
- current.score += weights[off.id] * scorer.tf(tf, current.endOffset - current.startOffset);
- }
-
- // Dead code but compiler disagrees:
- assert false;
- return null;
- }
-
- /** Called to summarize a document when no hits were
- * found. By default this just returns the first
- * {@code maxPassages} sentences; subclasses can override
- * to customize. */
- protected Passage[] getEmptyHighlight(String fieldName, BreakIterator bi, int maxPassages) {
- // BreakIterator should be un-next'd:
- List<Passage> passages = new ArrayList<>();
- int pos = bi.current();
- assert pos == 0;
- while (passages.size() < maxPassages) {
- int next = bi.next();
- if (next == BreakIterator.DONE) {
- break;
- }
- Passage passage = new Passage();
- passage.score = Float.NaN;
- passage.startOffset = pos;
- passage.endOffset = next;
- passages.add(passage);
- pos = next;
- }
-
- return passages.toArray(new Passage[passages.size()]);
- }
-
- private static class OffsetsEnum implements Comparable<OffsetsEnum> {
- PostingsEnum dp;
- int pos;
- int id;
-
- OffsetsEnum(PostingsEnum dp, int id) throws IOException {
- this.dp = dp;
- this.id = id;
- this.pos = 1;
- }
-
- @Override
- public int compareTo(OffsetsEnum other) {
- try {
- int off = dp.startOffset();
- int otherOff = other.dp.startOffset();
- if (off == otherOff) {
- return id - other.id;
- } else {
- return Integer.compare(off, otherOff);
- }
- } catch (IOException e) {
- throw new RuntimeException(e);
- }
- }
- }
-
- private static final PostingsEnum EMPTY = new PostingsEnum() {
-
- @Override
- public int nextPosition() throws IOException { return -1; }
-
- @Override
- public int startOffset() throws IOException { return Integer.MAX_VALUE; }
-
- @Override
- public int endOffset() throws IOException { return Integer.MAX_VALUE; }
-
- @Override
- public BytesRef getPayload() throws IOException { return null; }
-
- @Override
- public int freq() throws IOException { return 0; }
-
- @Override
- public int docID() { return NO_MORE_DOCS; }
-
- @Override
- public int nextDoc() throws IOException { return NO_MORE_DOCS; }
-
- @Override
- public int advance(int target) throws IOException { return NO_MORE_DOCS; }
-
- @Override
- public long cost() { return 0; }
- };
-
- private static class LimitedStoredFieldVisitor extends StoredFieldVisitor {
- private final String fields[];
- private final char valueSeparators[];
- private final int maxLength;
- private final StringBuilder builders[];
- private int currentField = -1;
-
- public LimitedStoredFieldVisitor(String fields[], char valueSeparators[], int maxLength) {
- assert fields.length == valueSeparators.length;
- this.fields = fields;
- this.valueSeparators = valueSeparators;
- this.maxLength = maxLength;
- builders = new StringBuilder[fields.length];
- for (int i = 0; i < builders.length; i++) {
- builders[i] = new StringBuilder();
- }
- }
-
- @Override
- public void stringField(FieldInfo fieldInfo, byte[] bytes) throws IOException {
- String value = new String(bytes, StandardCharsets.UTF_8);
- assert currentField >= 0;
- StringBuilder builder = builders[currentField];
- if (builder.length() > 0 && builder.length() < maxLength) {
- builder.append(valueSeparators[currentField]);
- }
- if (builder.length() + value.length() > maxLength) {
- builder.append(value, 0, maxLength - builder.length());
- } else {
- builder.append(value);
- }
- }
-
- @Override
- public Status needsField(FieldInfo fieldInfo) throws IOException {
- currentField = Arrays.binarySearch(fields, fieldInfo.name);
- if (currentField < 0) {
- return Status.NO;
- } else if (builders[currentField].length() > maxLength) {
- return fields.length == 1 ? Status.STOP : Status.NO;
- }
- return Status.YES;
- }
-
- String getValue(int i) {
- return builders[i].toString();
- }
-
- void reset() {
- currentField = -1;
- for (int i = 0; i < fields.length; i++) {
- builders[i].setLength(0);
- }
- }
- }
-}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0d3c73ea/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/package-info.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/package-info.java b/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/package-info.java
deleted file mode 100644
index 10013c2..0000000
--- a/lucene/highlighter/src/java/org/apache/lucene/search/postingshighlight/package-info.java
+++ /dev/null
@@ -1,21 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/**
- * Highlighter implementation that uses offsets from postings lists.
- */
-package org.apache.lucene.search.postingshighlight;
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0d3c73ea/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/CustomSeparatorBreakIterator.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/CustomSeparatorBreakIterator.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/CustomSeparatorBreakIterator.java
new file mode 100644
index 0000000..7119aed
--- /dev/null
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/CustomSeparatorBreakIterator.java
@@ -0,0 +1,200 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.uhighlight;
+
+import java.text.BreakIterator;
+import java.text.CharacterIterator;
+
+/**
+ * A {@link BreakIterator} that breaks the text whenever a certain separator, provided as a
+ * constructor argument, is found.
+ *
+ * <p>The boundaries are the begin index of the text, the index just past every occurrence of the
+ * separator, and the end index of the text. The text must be supplied through {@link #setText}
+ * before any of the navigation methods is used.
+ */
+public final class CustomSeparatorBreakIterator extends BreakIterator {
+
+  /** Character after which a boundary is placed. */
+  private final char separator;
+
+  /** Text being scanned; its index equals {@link #current} after every operation. */
+  private CharacterIterator text;
+
+  /** Boundary most recently moved to. */
+  private int current;
+
+  /**
+   * Creates an iterator that places a boundary after each occurrence of {@code separator}.
+   *
+   * @param separator the character that ends a fragment
+   */
+  public CustomSeparatorBreakIterator(char separator) {
+    this.separator = separator;
+  }
+
+  @Override
+  public int current() {
+    return current;
+  }
+
+  @Override
+  public int first() {
+    return moveTo(text.getBeginIndex());
+  }
+
+  @Override
+  public int last() {
+    return moveTo(text.getEndIndex());
+  }
+
+  @Override
+  public int next() {
+    // Test the tracked boundary, not the text's index: when the text ends with the separator the
+    // two used to differ, which made the end boundary get reported twice before DONE.
+    if (current == text.getEndIndex()) {
+      return DONE;
+    }
+    return moveTo(boundaryAfter(current));
+  }
+
+  @Override
+  public int following(int pos) {
+    if (pos < text.getBeginIndex() || pos > text.getEndIndex()) {
+      throw new IllegalArgumentException("offset out of bounds");
+    } else if (pos == text.getEndIndex()) {
+      // this conflicts with the javadocs, but matches actual JDK behavior:
+      // https://bugs.openjdk.java.net/browse/JDK-8015110
+      moveTo(pos);
+      return DONE;
+    } else {
+      return moveTo(boundaryAfter(pos));
+    }
+  }
+
+  @Override
+  public int previous() {
+    if (current == text.getBeginIndex()) {
+      return DONE;
+    }
+    return moveTo(boundaryBefore(current));
+  }
+
+  @Override
+  public int preceding(int pos) {
+    if (pos < text.getBeginIndex() || pos > text.getEndIndex()) {
+      throw new IllegalArgumentException("offset out of bounds");
+    } else if (pos == text.getBeginIndex()) {
+      // this conflicts with the javadocs, but matches actual JDK behavior:
+      // https://bugs.openjdk.java.net/browse/JDK-8015110
+      moveTo(pos);
+      return DONE;
+    } else {
+      return moveTo(boundaryBefore(pos));
+    }
+  }
+
+  /**
+   * Moves {@code n} boundaries forward, or {@code -n} boundaries backward when {@code n} is
+   * negative.
+   *
+   * @return the boundary reached, or {@link #DONE} when the first or last boundary was hit before
+   *     all steps could be taken; {@code next(0)} returns the current boundary
+   */
+  @Override
+  public int next(int n) {
+    int result = current;
+    if (n < 0) {
+      for (int i = n; i < 0; i++) {
+        result = previous();
+        if (result == DONE) {
+          break;
+        }
+      }
+    } else {
+      for (int i = 0; i < n; i++) {
+        result = next();
+        if (result == DONE) {
+          break;
+        }
+      }
+    }
+    return result;
+  }
+
+  @Override
+  public CharacterIterator getText() {
+    return text;
+  }
+
+  @Override
+  public void setText(CharacterIterator newText) {
+    text = newText;
+    moveTo(text.getBeginIndex());
+  }
+
+  /** Positions both the text and this iterator on {@code boundary} and returns it. */
+  private int moveTo(int boundary) {
+    text.setIndex(boundary);
+    current = boundary;
+    return boundary;
+  }
+
+  /**
+   * Finds the first boundary strictly after {@code pos}.
+   *
+   * @param pos an index in {@code [beginIndex, endIndex)}
+   * @return the index just past the first separator at or after {@code pos}, or the end index
+   *     when there is none
+   */
+  private int boundaryAfter(int pos) {
+    final int end = text.getEndIndex();
+    // The character at pos is inspected too: a separator there puts a boundary at pos + 1.
+    char c = text.setIndex(pos);
+    for (int i = pos; i < end; i++) {
+      if (c == separator) {
+        return i + 1;
+      }
+      c = text.next();
+    }
+    return end;
+  }
+
+  /**
+   * Finds the last boundary strictly before {@code pos}.
+   *
+   * @param pos an index in {@code (beginIndex, endIndex]}
+   * @return the index just past the last separator at or before {@code pos - 2}, or the begin
+   *     index when there is none
+   */
+  private int boundaryBefore(int pos) {
+    final int begin = text.getBeginIndex();
+    // A boundary s + 1 lies strictly before pos only if the separator index s is <= pos - 2.
+    final int from = pos - 2;
+    if (from < begin) {
+      return begin;
+    }
+    char c = text.setIndex(from);
+    for (int i = from; i >= begin; i--) {
+      if (c == separator) {
+        return i + 1;
+      }
+      c = text.previous();
+    }
+    return begin;
+  }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/0d3c73ea/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/WholeBreakIterator.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/WholeBreakIterator.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/WholeBreakIterator.java
new file mode 100644
index 0000000..37f48aa
--- /dev/null
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/WholeBreakIterator.java
@@ -0,0 +1,136 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.uhighlight;
+
+import java.text.BreakIterator;
+import java.text.CharacterIterator;
+
+/**
+ * Just produces one single fragment for the entire text: the only boundaries are the begin and
+ * end index of the text passed to {@link #setText}.
+ */
+public final class WholeBreakIterator extends BreakIterator {
+  private CharacterIterator text;
+  /** Begin index of the text; the first boundary. */
+  private int start;
+  /** End index of the text; the last boundary. */
+  private int end;
+  /** Boundary the iterator is currently on. */
+  private int current;
+
+  @Override
+  public int current() {
+    return current;
+  }
+
+  @Override
+  public int first() {
+    return (current = start);
+  }
+
+  @Override
+  public int following(int pos) {
+    if (pos < start || pos > end) {
+      throw new IllegalArgumentException("offset out of bounds");
+    } else if (pos == end) {
+      // this conflicts with the javadocs, but matches actual JDK behavior:
+      // https://bugs.openjdk.java.net/browse/JDK-8015110
+      current = end;
+      return DONE;
+    } else {
+      return last();
+    }
+  }
+
+  @Override
+  public CharacterIterator getText() {
+    return text;
+  }
+
+  @Override
+  public int last() {
+    return (current = end);
+  }
+
+  @Override
+  public int next() {
+    if (current == end) {
+      return DONE;
+    } else {
+      return last();
+    }
+  }
+
+  /**
+   * Moves {@code n} boundaries forward, or {@code -n} boundaries backward when {@code n} is
+   * negative.
+   *
+   * @return the boundary reached, or {@link #DONE} when the first or last boundary was hit before
+   *     all steps could be taken; {@code next(0)} returns the current boundary
+   */
+  @Override
+  public int next(int n) {
+    int result = current;
+    if (n < 0) {
+      for (int i = n; i < 0; i++) {
+        result = previous();
+        if (result == DONE) {
+          break;
+        }
+      }
+    } else {
+      for (int i = 0; i < n; i++) {
+        result = next();
+        if (result == DONE) {
+          break;
+        }
+      }
+    }
+    return result;
+  }
+
+  @Override
+  public int preceding(int pos) {
+    if (pos < start || pos > end) {
+      throw new IllegalArgumentException("offset out of bounds");
+    } else if (pos == start) {
+      // this conflicts with the javadocs, but matches actual JDK behavior:
+      // https://bugs.openjdk.java.net/browse/JDK-8015110
+      current = start;
+      return DONE;
+    } else {
+      return first();
+    }
+  }
+
+  @Override
+  public int previous() {
+    if (current == start) {
+      return DONE;
+    } else {
+      return first();
+    }
+  }
+
+  @Override
+  public void setText(CharacterIterator newText) {
+    start = newText.getBeginIndex();
+    end = newText.getEndIndex();
+    text = newText;
+    current = start;
+  }
+}