You are viewing a plain text version of this content. The canonical link for it is here.
Posted to commits@lucene.apache.org by ds...@apache.org on 2016/10/05 03:13:02 UTC
[4/6] lucene-solr:branch_6x: LUCENE-7438: New UnifiedHighlighter
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/4b679436/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
new file mode 100644
index 0000000..72be180
--- /dev/null
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/UnifiedHighlighter.java
@@ -0,0 +1,1021 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.uhighlight;
+
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
+import java.text.BreakIterator;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.EnumSet;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Locale;
+import java.util.Map;
+import java.util.Objects;
+import java.util.Set;
+import java.util.SortedSet;
+import java.util.TreeSet;
+import java.util.function.Supplier;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.document.FieldType;
+import org.apache.lucene.index.BaseCompositeReader;
+import org.apache.lucene.index.FieldInfo;
+import org.apache.lucene.index.FieldInfos;
+import org.apache.lucene.index.Fields;
+import org.apache.lucene.index.FilterLeafReader;
+import org.apache.lucene.index.IndexOptions;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.LeafReader;
+import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.MultiFields;
+import org.apache.lucene.index.MultiReader;
+import org.apache.lucene.index.StoredFieldVisitor;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.DocIdSetIterator;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.Weight;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.InPlaceMergeSorter;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util.automaton.CharacterRunAutomaton;
+
+/**
+ * A Highlighter that can get offsets from either
+ * postings ({@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}),
+ * term vectors ({@link FieldType#setStoreTermVectorOffsets(boolean)}),
+ * or via re-analyzing text.
+ * <p>
+ * This highlighter treats the single original document as the whole corpus, and then scores individual
+ * passages as if they were documents in this corpus. It uses a {@link BreakIterator} to find
+ * passages in the text; by default it breaks using {@link BreakIterator#getSentenceInstance(Locale)
+ * getSentenceInstance(Locale.ROOT)}. It then iterates in parallel (merge sorting by offset) through
+ * the positions of all terms from the query, coalescing those hits that occur in a single passage
+ * into a {@link Passage}, and then scores each Passage using a separate {@link PassageScorer}.
+ * Passages are finally formatted into highlighted snippets with a {@link PassageFormatter}.
+ * <p>
+ * You can customize the behavior by calling some of the setters, or by subclassing and overriding some methods.
+ * Some important hooks:
+ * <ul>
+ * <li>{@link #getBreakIterator(String)}: Customize how the text is divided into passages.
+ * <li>{@link #getScorer(String)}: Customize how passages are ranked.
+ * <li>{@link #getFormatter(String)}: Customize how snippets are formatted.
+ * </ul>
+ * <p>
+ * This is thread-safe.
+ *
+ * @lucene.experimental
+ */
+public class UnifiedHighlighter {
+
+ protected static final char MULTIVAL_SEP_CHAR = (char) 0;
+
+ public static final int DEFAULT_MAX_LENGTH = 10000;
+
+ public static final int DEFAULT_CACHE_CHARS_THRESHOLD = 524288; // ~ 1 MB (2 byte chars)
+
+ static final IndexSearcher EMPTY_INDEXSEARCHER;
+
+ static {
+ try {
+ IndexReader emptyReader = new MultiReader();
+ EMPTY_INDEXSEARCHER = new IndexSearcher(emptyReader);
+ EMPTY_INDEXSEARCHER.setQueryCache(null);
+ } catch (IOException bogus) {
+ throw new RuntimeException(bogus);
+ }
+ }
+
+ protected static final CharacterRunAutomaton[] ZERO_LEN_AUTOMATA_ARRAY = new CharacterRunAutomaton[0];
+
+ protected final IndexSearcher searcher; // if null, can only use highlightWithoutSearcher
+
+ protected final Analyzer indexAnalyzer;
+
+ private boolean defaultHandleMtq = true; // e.g. wildcards
+
+ private boolean defaultHighlightPhrasesStrictly = true; // AKA "accuracy" or "query debugging"
+
+ // private boolean defaultRequireFieldMatch = true; TODO
+
+ private int maxLength = DEFAULT_MAX_LENGTH;
+
+ // BreakIterator is stateful so we use a Supplier factory method
+ private Supplier<BreakIterator> defaultBreakIterator = () -> BreakIterator.getSentenceInstance(Locale.ROOT);
+
+ private PassageScorer defaultScorer = new PassageScorer();
+
+ private PassageFormatter defaultFormatter = new DefaultPassageFormatter();
+
+ private int defaultMaxNoHighlightPassages = -1;
+
+ // lazy initialized with double-check locking; protected so subclass can init
+ protected volatile FieldInfos fieldInfos;
+
+ private int cacheFieldValCharsThreshold = DEFAULT_CACHE_CHARS_THRESHOLD;
+
+ /**
+ * Calls {@link Weight#extractTerms(Set)} on an empty index for the query.
+ */
+ protected static SortedSet<Term> extractTerms(Query query) throws IOException {
+ SortedSet<Term> queryTerms = new TreeSet<>();
+ EMPTY_INDEXSEARCHER.createNormalizedWeight(query, false).extractTerms(queryTerms);
+ return queryTerms;
+ }
+
+ /**
+ * Constructs the highlighter with the given index searcher and analyzer.
+ *
+ * @param indexSearcher Usually required, unless {@link #highlightWithoutSearcher(String, Query, String, int)} is
+ * used, in which case this needs to be null.
+ * @param indexAnalyzer Required, even if in some circumstances it isn't used.
+ */
+ public UnifiedHighlighter(IndexSearcher indexSearcher, Analyzer indexAnalyzer) {
+ this.searcher = indexSearcher; //TODO: make non nullable
+ this.indexAnalyzer = Objects.requireNonNull(indexAnalyzer,
+ "indexAnalyzer is required"
+ + " (even if in some circumstances it isn't used)");
+ }
+
+ public void setHandleMultiTermQuery(boolean handleMtq) {
+ this.defaultHandleMtq = handleMtq;
+ }
+
+ public void setHighlightPhrasesStrictly(boolean highlightPhrasesStrictly) {
+ this.defaultHighlightPhrasesStrictly = highlightPhrasesStrictly;
+ }
+
+ public void setMaxLength(int maxLength) {
+ if (maxLength < 0 || maxLength == Integer.MAX_VALUE) {
+ // two reasons: no overflow problems in BreakIterator.preceding(offset+1),
+ // our sentinel in the offsets queue uses this value to terminate.
+ throw new IllegalArgumentException("maxLength must be < Integer.MAX_VALUE");
+ }
+ this.maxLength = maxLength;
+ }
+
+ public void setBreakIterator(Supplier<BreakIterator> breakIterator) {
+ this.defaultBreakIterator = breakIterator;
+ }
+
+ public void setScorer(PassageScorer scorer) {
+ this.defaultScorer = scorer;
+ }
+
+ public void setFormatter(PassageFormatter formatter) {
+ this.defaultFormatter = formatter;
+ }
+
+ public void setMaxNoHighlightPassages(int defaultMaxNoHighlightPassages) {
+ this.defaultMaxNoHighlightPassages = defaultMaxNoHighlightPassages;
+ }
+
+ public void setCacheFieldValCharsThreshold(int cacheFieldValCharsThreshold) {
+ this.cacheFieldValCharsThreshold = cacheFieldValCharsThreshold;
+ }
+
+ /**
+ * Returns whether {@link MultiTermQuery} derivatives will be highlighted. By default it's enabled. MTQ
+ * highlighting can be expensive, particularly when using offsets in postings.
+ */
+ protected boolean shouldHandleMultiTermQuery(String field) {
+ return defaultHandleMtq;
+ }
+
+ /**
+ * Returns whether position sensitive queries (e.g. phrases and {@link SpanQuery}ies)
+ * should be highlighted strictly based on query matches (slower)
+ * versus any/all occurrences of the underlying terms. By default it's enabled, but there's no overhead if such
+ * queries aren't used.
+ */
+ protected boolean shouldHighlightPhrasesStrictly(String field) {
+ return defaultHighlightPhrasesStrictly;
+ }
+
+ /**
+ * The maximum content size to process. Content will be truncated to this size before highlighting. Typically
+ * snippets closer to the beginning of the document better summarize its content.
+ */
+ public int getMaxLength() {
+ return maxLength;
+ }
+
+ /**
+ * Returns the {@link BreakIterator} to use for
+ * dividing text into passages. This returns
+ * {@link BreakIterator#getSentenceInstance(Locale)} by default;
+ * subclasses can override to customize.
+ * <p>
+ * Note: this highlighter will call
+ * {@link BreakIterator#preceding(int)} and {@link BreakIterator#next()} many times on it.
+ * The default generic JDK implementation of {@code preceding} performs poorly.
+ */
+ protected BreakIterator getBreakIterator(String field) {
+ return defaultBreakIterator.get();
+ }
+
+ /**
+ * Returns the {@link PassageScorer} to use for
+ * ranking passages. This
+ * returns a new {@code PassageScorer} by default;
+ * subclasses can override to customize.
+ */
+ protected PassageScorer getScorer(String field) {
+ return defaultScorer;
+ }
+
+ /**
+ * Returns the {@link PassageFormatter} to use for
+ * formatting passages into highlighted snippets. This
+ * returns a new {@code PassageFormatter} by default;
+ * subclasses can override to customize.
+ */
+ protected PassageFormatter getFormatter(String field) {
+ return defaultFormatter;
+ }
+
+ /**
+ * Returns the number of leading passages (as delineated by the {@link BreakIterator}) when no
+ * highlights could be found. If it's less than 0 (the default) then this defaults to the {@code maxPassages}
+ * parameter given for each request. If this is 0 then the resulting highlight is null (not formatted).
+ */
+ protected int getMaxNoHighlightPassages(String field) {
+ return defaultMaxNoHighlightPassages;
+ }
+
+ /**
+ * Limits the amount of field value pre-fetching until this threshold is passed. The highlighter
+ * internally highlights in batches of documents sized on the sum field value length (in chars) of the fields
+ * to be highlighted (bounded by {@link #getMaxLength()} for each field). By setting this to 0, you can force
+ * documents to be fetched and highlighted one at a time, which you usually shouldn't do.
+ * The default is 524288 chars which translates to about a megabyte. However, note
+ * that the highlighter sometimes ignores this and highlights one document at a time (without caching a
+ * bunch of documents in advance) when it can detect there's no point in it -- such as when all fields will be
+ * highlighted via re-analysis as one example.
+ */
+ public int getCacheFieldValCharsThreshold() { // question: should we size by bytes instead?
+ return cacheFieldValCharsThreshold;
+ }
+
+ /**
+ * ... as passed in from constructor.
+ */
+ public IndexSearcher getIndexSearcher() {
+ return searcher;
+ }
+
+ /**
+ * ... as passed in from constructor.
+ */
+ public Analyzer getIndexAnalyzer() {
+ return indexAnalyzer;
+ }
+
+ /**
+ * Source of term offsets; essential for highlighting.
+ */
+ public enum OffsetSource {
+ POSTINGS, TERM_VECTORS, ANALYSIS, POSTINGS_WITH_TERM_VECTORS, NONE_NEEDED
+ }
+
+ /**
+ * Determine the offset source for the specified field. The default algorithm is as follows:
+ * <ol>
+ * <li>This calls {@link #getFieldInfo(String)}. Note this returns null if there is no searcher or if the
+ * field isn't found there.</li>
+ * <li> If there's a field info it has
+ * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS} then {@link OffsetSource#POSTINGS} is
+ * returned.</li>
+ * <li>If there's a field info and {@link FieldInfo#hasVectors()} then {@link OffsetSource#TERM_VECTORS} is
+ * returned (note we can't check here if the TV has offsets; if there isn't then an exception will get thrown
+ * down the line).</li>
+ * <li>Fall-back: {@link OffsetSource#ANALYSIS} is returned.</li>
+ * </ol>
+ * <p>
+ * Note that the highlighter sometimes switches to something else based on the query, such as if you have
+ * {@link OffsetSource#POSTINGS_WITH_TERM_VECTORS} but in fact don't need term vectors.
+ */
+ protected OffsetSource getOffsetSource(String field) {
+ FieldInfo fieldInfo = getFieldInfo(field);
+ if (fieldInfo != null) {
+ if (fieldInfo.getIndexOptions() == IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) {
+ return fieldInfo.hasVectors() ? OffsetSource.POSTINGS_WITH_TERM_VECTORS : OffsetSource.POSTINGS;
+ }
+ if (fieldInfo.hasVectors()) { // unfortunately we can't also check if the TV has offsets
+ return OffsetSource.TERM_VECTORS;
+ }
+ }
+ return OffsetSource.ANALYSIS;
+ }
+
+ /**
+ * Called by the default implementation of {@link #getOffsetSource(String)}.
+ * If there is no searcher then we simply always return null.
+ */
+ protected FieldInfo getFieldInfo(String field) {
+ if (searcher == null) {
+ return null;
+ }
+ // Need thread-safety for lazy-init but lets avoid 'synchronized' by using double-check locking idiom
+ FieldInfos fieldInfos = this.fieldInfos; // note: it's volatile; read once
+ if (fieldInfos == null) {
+ synchronized (this) {
+ fieldInfos = this.fieldInfos;
+ if (fieldInfos == null) {
+ fieldInfos = MultiFields.getMergedFieldInfos(searcher.getIndexReader());
+ this.fieldInfos = fieldInfos;
+ }
+
+ }
+
+ }
+ return fieldInfos.fieldInfo(field);
+ }
+
+ /**
+ * Highlights the top passages from a single field.
+ *
+ * @param field field name to highlight.
+ * Must have a stored string value and also be indexed with offsets.
+ * @param query query to highlight.
+ * @param topDocs TopDocs containing the summary result documents to highlight.
+ * @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>.
+ * If no highlights were found for a document, the
+ * first sentence for the field will be returned.
+ * @throws IOException if an I/O error occurred during processing
+ * @throws IllegalArgumentException if <code>field</code> was indexed without
+ * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
+ */
+ public String[] highlight(String field, Query query, TopDocs topDocs) throws IOException {
+ return highlight(field, query, topDocs, 1);
+ }
+
+ /**
+ * Highlights the top-N passages from a single field.
+ *
+ * @param field field name to highlight. Must have a stored string value.
+ * @param query query to highlight.
+ * @param topDocs TopDocs containing the summary result documents to highlight.
+ * @param maxPassages The maximum number of top-N ranked passages used to
+ * form the highlighted snippets.
+ * @return Array of formatted snippets corresponding to the documents in <code>topDocs</code>.
+ * If no highlights were found for a document, the
+ * first {@code maxPassages} sentences from the
+ * field will be returned.
+ * @throws IOException if an I/O error occurred during processing
+ * @throws IllegalArgumentException if <code>field</code> was indexed without
+ * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
+ */
+ public String[] highlight(String field, Query query, TopDocs topDocs, int maxPassages) throws IOException {
+ Map<String, String[]> res = highlightFields(new String[]{field}, query, topDocs, new int[]{maxPassages});
+ return res.get(field);
+ }
+
+ /**
+ * Highlights the top passages from multiple fields.
+ * <p>
+ * Conceptually, this behaves as a more efficient form of:
+ * <pre class="prettyprint">
+ * Map m = new HashMap();
+ * for (String field : fields) {
+ * m.put(field, highlight(field, query, topDocs));
+ * }
+ * return m;
+ * </pre>
+ *
+ * @param fields field names to highlight. Must have a stored string value.
+ * @param query query to highlight.
+ * @param topDocs TopDocs containing the summary result documents to highlight.
+ * @return Map keyed on field name, containing the array of formatted snippets
+ * corresponding to the documents in <code>topDocs</code>.
+ * If no highlights were found for a document, the
+ * first sentence from the field will be returned.
+ * @throws IOException if an I/O error occurred during processing
+ * @throws IllegalArgumentException if <code>field</code> was indexed without
+ * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
+ */
+ public Map<String, String[]> highlightFields(String[] fields, Query query, TopDocs topDocs) throws IOException {
+ int maxPassages[] = new int[fields.length];
+ Arrays.fill(maxPassages, 1);
+ return highlightFields(fields, query, topDocs, maxPassages);
+ }
+
+ /**
+ * Highlights the top-N passages from multiple fields.
+ * <p>
+ * Conceptually, this behaves as a more efficient form of:
+ * <pre class="prettyprint">
+ * Map m = new HashMap();
+ * for (String field : fields) {
+ * m.put(field, highlight(field, query, topDocs, maxPassages));
+ * }
+ * return m;
+ * </pre>
+ *
+ * @param fields field names to highlight. Must have a stored string value.
+ * @param query query to highlight.
+ * @param topDocs TopDocs containing the summary result documents to highlight.
+ * @param maxPassages The maximum number of top-N ranked passages per-field used to
+ * form the highlighted snippets.
+ * @return Map keyed on field name, containing the array of formatted snippets
+ * corresponding to the documents in <code>topDocs</code>.
+ * If no highlights were found for a document, the
+ * first {@code maxPassages} sentences from the
+ * field will be returned.
+ * @throws IOException if an I/O error occurred during processing
+ * @throws IllegalArgumentException if <code>field</code> was indexed without
+ * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
+ */
+ public Map<String, String[]> highlightFields(String[] fields, Query query, TopDocs topDocs, int[] maxPassages)
+ throws IOException {
+ final ScoreDoc scoreDocs[] = topDocs.scoreDocs;
+ int docids[] = new int[scoreDocs.length];
+ for (int i = 0; i < docids.length; i++) {
+ docids[i] = scoreDocs[i].doc;
+ }
+
+ return highlightFields(fields, query, docids, maxPassages);
+ }
+
+ /**
+ * Highlights the top-N passages from multiple fields,
+ * for the provided int[] docids.
+ *
+ * @param fieldsIn field names to highlight. Must have a stored string value.
+ * @param query query to highlight.
+ * @param docidsIn containing the document IDs to highlight.
+ * @param maxPassagesIn The maximum number of top-N ranked passages per-field used to
+ * form the highlighted snippets.
+ * @return Map keyed on field name, containing the array of formatted snippets
+ * corresponding to the documents in <code>docidsIn</code>.
+ * If no highlights were found for a document, the
+ * first {@code maxPassages} from the field will
+ * be returned.
+ * @throws IOException if an I/O error occurred during processing
+ * @throws IllegalArgumentException if <code>field</code> was indexed without
+ * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
+ */
+ public Map<String, String[]> highlightFields(String[] fieldsIn, Query query, int[] docidsIn, int[] maxPassagesIn)
+ throws IOException {
+ Map<String, String[]> snippets = new HashMap<>();
+ for (Map.Entry<String, Object[]> ent : highlightFieldsAsObjects(fieldsIn, query, docidsIn, maxPassagesIn).entrySet()) {
+ Object[] snippetObjects = ent.getValue();
+ String[] snippetStrings = new String[snippetObjects.length];
+ snippets.put(ent.getKey(), snippetStrings);
+ for (int i = 0; i < snippetObjects.length; i++) {
+ Object snippet = snippetObjects[i];
+ if (snippet != null) {
+ snippetStrings[i] = snippet.toString();
+ }
+ }
+ }
+
+ return snippets;
+ }
+
+ /**
+ * Expert: highlights the top-N passages from multiple fields,
+ * for the provided int[] docids, to custom Object as
+ * returned by the {@link PassageFormatter}. Use
+ * this API to render to something other than String.
+ *
+ * @param fieldsIn field names to highlight. Must have a stored string value.
+ * @param query query to highlight.
+ * @param docIdsIn containing the document IDs to highlight.
+ * @param maxPassagesIn The maximum number of top-N ranked passages per-field used to
+ * form the highlighted snippets.
+ * @return Map keyed on field name, containing the array of formatted snippets
+ * corresponding to the documents in <code>docIdsIn</code>.
+ * If no highlights were found for a document, the
+ * first {@code maxPassages} from the field will
+ * be returned.
+ * @throws IOException if an I/O error occurred during processing
+ * @throws IllegalArgumentException if <code>field</code> was indexed without
+ * {@link IndexOptions#DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS}
+ */
+ protected Map<String, Object[]> highlightFieldsAsObjects(String[] fieldsIn, Query query, int[] docIdsIn,
+ int[] maxPassagesIn) throws IOException {
+ if (fieldsIn.length < 1) {
+ throw new IllegalArgumentException("fieldsIn must not be empty");
+ }
+ if (fieldsIn.length != maxPassagesIn.length) {
+ throw new IllegalArgumentException("invalid number of maxPassagesIn");
+ }
+ if (searcher == null) {
+ throw new IllegalStateException("This method requires that an indexSearcher was passed in the "
+ + "constructor. Perhaps you mean to call highlightWithoutSearcher?");
+ }
+
+ // Sort docs & fields for sequential i/o
+
+ // Sort doc IDs w/ index to original order: (copy input arrays since we sort in-place)
+ int[] docIds = new int[docIdsIn.length];
+ int[] docInIndexes = new int[docIds.length]; // fill in ascending order; points into docIdsIn[]
+ copyAndSortDocIdsWithIndex(docIdsIn, docIds, docInIndexes); // latter 2 are "out" params
+
+ // Sort fields w/ maxPassages pair: (copy input arrays since we sort in-place)
+ final String fields[] = new String[fieldsIn.length];
+ final int maxPassages[] = new int[maxPassagesIn.length];
+ copyAndSortFieldsWithMaxPassages(fieldsIn, maxPassagesIn, fields, maxPassages); // latter 2 are "out" params
+
+ // Init field highlighters (where most of the highlight logic lives, and on a per field basis)
+ SortedSet<Term> queryTerms = extractTerms(query);
+ FieldHighlighter[] fieldHighlighters = new FieldHighlighter[fields.length];
+ int numTermVectors = 0;
+ int numPostings = 0;
+ for (int f = 0; f < fields.length; f++) {
+ FieldHighlighter fieldHighlighter = getFieldHighlighter(fields[f], query, queryTerms, maxPassages[f]);
+ fieldHighlighters[f] = fieldHighlighter;
+
+ switch (fieldHighlighter.getOffsetSource()) {
+ case TERM_VECTORS:
+ numTermVectors++;
+ break;
+ case POSTINGS:
+ numPostings++;
+ break;
+ case POSTINGS_WITH_TERM_VECTORS:
+ numTermVectors++;
+ numPostings++;
+ break;
+ case ANALYSIS:
+ case NONE_NEEDED:
+ default:
+ //do nothing
+ break;
+ }
+ }
+
+ int cacheCharsThreshold = calculateOptimalCacheCharsThreshold(numTermVectors, numPostings);
+
+ IndexReader indexReaderWithTermVecCache =
+ (numTermVectors >= 2) ? TermVectorReusingLeafReader.wrap(searcher.getIndexReader()) : null;
+
+ // [fieldIdx][docIdInIndex] of highlightDoc result
+ Object[][] highlightDocsInByField = new Object[fields.length][docIds.length];
+ // Highlight in doc batches determined by loadFieldValues (consumes from docIdIter)
+ DocIdSetIterator docIdIter = asDocIdSetIterator(docIds);
+ for (int batchDocIdx = 0; batchDocIdx < docIds.length; ) {
+ // Load the field values of the first batch of document(s) (note: commonly all docs are in this batch)
+ List<CharSequence[]> fieldValsByDoc =
+ loadFieldValues(fields, docIdIter, cacheCharsThreshold);
+ // the size of the above list is the size of the batch (num of docs in the batch)
+
+ // Highlight in per-field order first, then by doc (better I/O pattern)
+ for (int fieldIdx = 0; fieldIdx < fields.length; fieldIdx++) {
+ Object[] resultByDocIn = highlightDocsInByField[fieldIdx];//parallel to docIdsIn
+ FieldHighlighter fieldHighlighter = fieldHighlighters[fieldIdx];
+ for (int docIdx = batchDocIdx; docIdx - batchDocIdx < fieldValsByDoc.size(); docIdx++) {
+ int docId = docIds[docIdx];//sorted order
+ CharSequence content = fieldValsByDoc.get(docIdx - batchDocIdx)[fieldIdx];
+ if (content == null) {
+ continue;
+ }
+ IndexReader indexReader =
+ (fieldHighlighter.getOffsetSource() == OffsetSource.TERM_VECTORS
+ && indexReaderWithTermVecCache != null)
+ ? indexReaderWithTermVecCache
+ : searcher.getIndexReader();
+ int docInIndex = docInIndexes[docIdx];//original input order
+ assert resultByDocIn[docInIndex] == null;
+ resultByDocIn[docInIndex] =
+ fieldHighlighter
+ .highlightFieldForDoc(indexReader, docId, content.toString());
+ }
+
+ }
+
+ batchDocIdx += fieldValsByDoc.size();
+ }
+ assert docIdIter.docID() == DocIdSetIterator.NO_MORE_DOCS
+ || docIdIter.nextDoc() == DocIdSetIterator.NO_MORE_DOCS;
+
+ // TODO reconsider the return type; since this is an "advanced" method, lets not return a Map? Notice the only
+ // caller simply iterates it to build another structure.
+
+ // field -> object highlights parallel to docIdsIn
+ Map<String, Object[]> resultMap = new HashMap<>(fields.length);
+ for (int f = 0; f < fields.length; f++) {
+ resultMap.put(fields[f], highlightDocsInByField[f]);
+ }
+ return resultMap;
+ }
+
+ /**
+ * When cacheCharsThreshold is 0, loadFieldValues() only fetches one document at a time. We override it to be 0
+ * in two circumstances:
+ */
+ private int calculateOptimalCacheCharsThreshold(int numTermVectors, int numPostings) {
+ if (numPostings == 0 && numTermVectors == 0) {
+ // (1) When all fields are ANALYSIS there's no point in caching a batch of documents
+ // because no other info on disk is needed to highlight it.
+ return 0;
+ } else if (numTermVectors >= 2) {
+ // (2) When two or more fields have term vectors, given the field-then-doc algorithm, the underlying term
+ // vectors will be fetched in a terrible access pattern unless we highlight a doc at a time and use a special
+ // current-doc TV cache. So we do that. Hopefully one day TVs will be improved to make this pointless.
+ return 0;
+ } else {
+ return getCacheFieldValCharsThreshold();
+ }
+ }
+
+ private void copyAndSortFieldsWithMaxPassages(String[] fieldsIn, int[] maxPassagesIn, final String[] fields,
+ final int[] maxPassages) {
+ System.arraycopy(fieldsIn, 0, fields, 0, fieldsIn.length);
+ System.arraycopy(maxPassagesIn, 0, maxPassages, 0, maxPassagesIn.length);
+ new InPlaceMergeSorter() {
+ @Override
+ protected void swap(int i, int j) {
+ String tmp = fields[i];
+ fields[i] = fields[j];
+ fields[j] = tmp;
+ int tmp2 = maxPassages[i];
+ maxPassages[i] = maxPassages[j];
+ maxPassages[j] = tmp2;
+ }
+
+ @Override
+ protected int compare(int i, int j) {
+ return fields[i].compareTo(fields[j]);
+ }
+
+ }.sort(0, fields.length);
+ }
+
+ private void copyAndSortDocIdsWithIndex(int[] docIdsIn, final int[] docIds, final int[] docInIndexes) {
+ System.arraycopy(docIdsIn, 0, docIds, 0, docIdsIn.length);
+ for (int i = 0; i < docInIndexes.length; i++) {
+ docInIndexes[i] = i;
+ }
+ new InPlaceMergeSorter() {
+ @Override
+ protected void swap(int i, int j) {
+ int tmp = docIds[i];
+ docIds[i] = docIds[j];
+ docIds[j] = tmp;
+ tmp = docInIndexes[i];
+ docInIndexes[i] = docInIndexes[j];
+ docInIndexes[j] = tmp;
+ }
+
+ @Override
+ protected int compare(int i, int j) {
+ return Integer.compare(docIds[i], docIds[j]);
+ }
+ }.sort(0, docIds.length);
+ }
+
+ /**
+ * Highlights text passed as a parameter. This requires the {@link IndexSearcher} provided to this highlighter is
+ * null. This use-case is more rare. Naturally, the mode of operation will be {@link OffsetSource#ANALYSIS}.
+ * The result of this method is whatever the {@link PassageFormatter} returns. For the {@link
+ * DefaultPassageFormatter} and assuming {@code content} has non-zero length, the result will be a non-null
+ * string -- so it's safe to call {@link Object#toString()} on it in that case.
+ *
+ * @param field field name to highlight (as found in the query).
+ * @param query query to highlight.
+ * @param content text to highlight.
+ * @param maxPassages The maximum number of top-N ranked passages used to
+ * form the highlighted snippets.
+ * @return result of the {@link PassageFormatter} -- probably a String. Might be null.
+ * @throws IOException if an I/O error occurred during processing
+ */
+ //TODO make content a List? and return a List? and ensure getEmptyHighlight is never invoked multiple times?
+ public Object highlightWithoutSearcher(String field, Query query, String content, int maxPassages)
+ throws IOException {
+ if (this.searcher != null) {
+ throw new IllegalStateException("highlightWithoutSearcher should only be called on a " +
+ getClass().getSimpleName() + " without an IndexSearcher.");
+ }
+ Objects.requireNonNull(content, "content is required");
+ SortedSet<Term> queryTerms = extractTerms(query);
+ return getFieldHighlighter(field, query, queryTerms, maxPassages)
+ .highlightFieldForDoc(null, -1, content);
+ }
+
+ protected FieldHighlighter getFieldHighlighter(String field, Query query, SortedSet<Term> allTerms, int maxPassages) {
+ return new FieldHighlighter(field,
+ getOffsetStrategy(field, query, allTerms),
+ new SplittingBreakIterator(getBreakIterator(field), UnifiedHighlighter.MULTIVAL_SEP_CHAR),
+ getScorer(field),
+ maxPassages,
+ getMaxNoHighlightPassages(field),
+ getFormatter(field));
+ }
+
+ protected FieldOffsetStrategy getOffsetStrategy(String field, Query query, SortedSet<Term> allTerms) {
+ EnumSet<HighlightFlag> highlightFlags = getFlags(field);
+ BytesRef[] terms = filterExtractedTerms(field, allTerms);
+ PhraseHelper phraseHelper = getPhraseHelper(field, query, highlightFlags);
+ CharacterRunAutomaton[] automata = getAutomata(field, query, highlightFlags);
+ OffsetSource offsetSource = getOptimizedOffsetSource(field, terms, phraseHelper, automata);
+ switch (offsetSource) {
+ case ANALYSIS:
+ return new AnalysisOffsetStrategy(field, terms, phraseHelper, automata, getIndexAnalyzer());
+ case NONE_NEEDED:
+ return NoOpOffsetStrategy.INSTANCE;
+ case TERM_VECTORS:
+ return new TermVectorOffsetStrategy(field, terms, phraseHelper, automata);
+ case POSTINGS:
+ return new PostingsOffsetStrategy(field, terms, phraseHelper, automata);
+ case POSTINGS_WITH_TERM_VECTORS:
+ return new PostingsWithTermVectorsOffsetStrategy(field, terms, phraseHelper, automata);
+ default:
+ throw new IllegalArgumentException("Unrecognized offset source " + offsetSource);
+ }
+ }
+
+ protected EnumSet<HighlightFlag> getFlags(String field) {
+ EnumSet<HighlightFlag> highlightFlags = EnumSet.noneOf(HighlightFlag.class);
+ if (shouldHandleMultiTermQuery(field)) {
+ highlightFlags.add(HighlightFlag.MULTI_TERM_QUERY);
+ }
+ if (shouldHighlightPhrasesStrictly(field)) {
+ highlightFlags.add(HighlightFlag.PHRASES);
+ }
+ return highlightFlags;
+ }
+
+ protected BytesRef[] filterExtractedTerms(String field, SortedSet<Term> queryTerms) {
+ // TODO consider requireFieldMatch
+ Term floor = new Term(field, "");
+ Term ceiling = new Term(field, UnicodeUtil.BIG_TERM);
+ SortedSet<Term> fieldTerms = queryTerms.subSet(floor, ceiling);
+
+ // Strip off the redundant field:
+ BytesRef[] terms = new BytesRef[fieldTerms.size()];
+ int termUpto = 0;
+ for (Term term : fieldTerms) {
+ terms[termUpto++] = term.bytes();
+ }
+ return terms;
+ }
+
+ protected PhraseHelper getPhraseHelper(String field, Query query, EnumSet<HighlightFlag> highlightFlags) {
+ boolean highlightPhrasesStrictly = highlightFlags.contains(HighlightFlag.PHRASES);
+ boolean handleMultiTermQuery = highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY);
+ return highlightPhrasesStrictly ?
+ new PhraseHelper(query, field, this::requiresRewrite, !handleMultiTermQuery) :
+ PhraseHelper.NONE;
+ }
+
+ protected CharacterRunAutomaton[] getAutomata(String field, Query query, EnumSet<HighlightFlag> highlightFlags) {
+ return highlightFlags.contains(HighlightFlag.MULTI_TERM_QUERY)
+ ? MultiTermHighlighting.extractAutomata(query, field, !highlightFlags.contains(HighlightFlag.PHRASES))
+ : ZERO_LEN_AUTOMATA_ARRAY;
+ }
+
+ protected OffsetSource getOptimizedOffsetSource(String field, BytesRef[] terms, PhraseHelper phraseHelper, CharacterRunAutomaton[] automata) {
+ if (terms.length == 0 && automata.length == 0 && !phraseHelper.willRewrite()) {
+ return OffsetSource.NONE_NEEDED; //nothing to highlight
+ }
+
+ OffsetSource offsetSource = getOffsetSource(field);
+ switch (offsetSource) {
+ case POSTINGS:
+ if (phraseHelper.willRewrite()) {
+ // We can't choose the postings offset source when there is "rewriting" in the strict phrase
+ // processing (rare but possible). Postings requires knowing all the terms (except wildcards)
+ // up front.
+ return OffsetSource.ANALYSIS;
+ } else if (automata.length > 0) {
+ return OffsetSource.ANALYSIS;
+ }
+ break;
+ case POSTINGS_WITH_TERM_VECTORS:
+ if (!phraseHelper.willRewrite() && automata.length == 0) {
+ return OffsetSource.POSTINGS; //We don't need term vectors
+ }
+ break;
+ case ANALYSIS:
+ case TERM_VECTORS:
+ case NONE_NEEDED:
+ default:
+ //stick with the original offset source
+ break;
+ }
+
+ return offsetSource;
+ }
+
+ /**
+ * When highlighting phrases accurately, we need to know which {@link SpanQuery}'s need to have
+ * {@link Query#rewrite(IndexReader)} called on them. It helps performance to avoid it if it's not needed.
+ * This method will be invoked on all SpanQuery instances recursively. If you have custom SpanQuery queries then
+ * override this to check instanceof and provide a definitive answer. If the query isn't your custom one, simply
+ * return null to have the default rules apply, which govern the ones included in Lucene.
+ */
+ protected Boolean requiresRewrite(SpanQuery spanQuery) {
+ return null;
+ }
+
+ private DocIdSetIterator asDocIdSetIterator(int[] sortedDocIds) {
+ return new DocIdSetIterator() {
+ int idx = -1;
+
+ @Override
+ public int docID() {
+ if (idx < 0 || idx >= sortedDocIds.length) {
+ return NO_MORE_DOCS;
+ }
+ return sortedDocIds[idx];
+ }
+
+ @Override
+ public int nextDoc() throws IOException {
+ idx++;
+ return docID();
+ }
+
+ @Override
+ public int advance(int target) throws IOException {
+ return super.slowAdvance(target); // won't be called, so whatever
+ }
+
+ @Override
+ public long cost() {
+ return Math.max(0, sortedDocIds.length - (idx + 1)); // remaining docs
+ }
+ };
+ }
+
+ /**
+ * Loads the String values for each docId by field to be highlighted. By default this loads from stored fields
+ * by the same name as given, but a subclass can change the source. The returned Strings must be identical to
+ * what was indexed (at least for postings or term-vectors offset sources).
+ * This method must load fields for at least one document from the given {@link DocIdSetIterator}
+ * but need not return all of them; by default the character lengths are summed and this method will return early
+ * when {@code cacheCharsThreshold} is exceeded. Specifically if that number is 0, then only one document is
+ * fetched no matter what. Values in the array of {@link CharSequence} will be null if no value was found.
+ */
+ protected List<CharSequence[]> loadFieldValues(String[] fields,
+ DocIdSetIterator docIter, int cacheCharsThreshold)
+ throws IOException {
+ List<CharSequence[]> docListOfFields =
+ new ArrayList<>(cacheCharsThreshold == 0 ? 1 : (int) Math.min(64, docIter.cost()));
+
+ LimitedStoredFieldVisitor visitor = newLimitedStoredFieldsVisitor(fields);
+ int sumChars = 0;
+ do {
+ int docId = docIter.nextDoc();
+ if (docId == DocIdSetIterator.NO_MORE_DOCS) {
+ break;
+ }
+ visitor.init();
+ searcher.doc(docId, visitor);
+ CharSequence[] valuesByField = visitor.getValuesByField();
+ docListOfFields.add(valuesByField);
+ for (CharSequence val : valuesByField) {
+ sumChars += (val == null ? 0 : val.length());
+ }
+ } while (sumChars <= cacheCharsThreshold && cacheCharsThreshold != 0);
+ return docListOfFields;
+ }
+
+ /**
+ * @lucene.internal
+ */
+ protected LimitedStoredFieldVisitor newLimitedStoredFieldsVisitor(String[] fields) {
+ return new LimitedStoredFieldVisitor(fields, MULTIVAL_SEP_CHAR, getMaxLength());
+ }
+
+ /**
+ * Fetches stored fields for highlighting. Uses a multi-val separator char and honors a max length to retrieve.
+ * @lucene.internal
+ */
+ protected static class LimitedStoredFieldVisitor extends StoredFieldVisitor {
+ protected final String[] fields;
+ protected final char valueSeparator;
+ protected final int maxLength;
+ protected CharSequence[] values;// starts off as String; may become StringBuilder.
+ protected int currentField;
+
+ public LimitedStoredFieldVisitor(String[] fields, char valueSeparator, int maxLength) {
+ this.fields = fields;
+ this.valueSeparator = valueSeparator;
+ this.maxLength = maxLength;
+ }
+
+ void init() {
+ values = new CharSequence[fields.length];
+ currentField = -1;
+ }
+
+ @Override
+ public void stringField(FieldInfo fieldInfo, byte[] byteValue) throws IOException {
+ String value = new String(byteValue, StandardCharsets.UTF_8);
+ assert currentField >= 0;
+ CharSequence curValue = values[currentField];
+ if (curValue == null) {
+ //question: if truncate due to maxLength, should we try and avoid keeping the other chars in-memory on
+ // the backing char[]?
+ values[currentField] = value.substring(0, Math.min(maxLength, value.length()));//note: may return 'this'
+ return;
+ }
+ final int lengthBudget = maxLength - curValue.length();
+ if (lengthBudget <= 0) {
+ return;
+ }
+ StringBuilder curValueBuilder;
+ if (curValue instanceof StringBuilder) {
+ curValueBuilder = (StringBuilder) curValue;
+ } else {
+ // upgrade String to StringBuilder. Choose a good initial size.
+ curValueBuilder = new StringBuilder(curValue.length() + Math.min(lengthBudget, value.length() + 256));
+ curValueBuilder.append(curValue);
+ }
+ curValueBuilder.append(valueSeparator);
+ curValueBuilder.append(value.substring(0, Math.min(lengthBudget - 1, value.length())));
+ values[currentField] = curValueBuilder;
+ }
+
+ @Override
+ public Status needsField(FieldInfo fieldInfo) throws IOException {
+ currentField = Arrays.binarySearch(fields, fieldInfo.name);
+ if (currentField < 0) {
+ return Status.NO;
+ }
+ CharSequence curVal = values[currentField];
+ if (curVal != null && curVal.length() >= maxLength) {
+ return fields.length == 1 ? Status.STOP : Status.NO;
+ }
+ return Status.YES;
+ }
+
+ CharSequence[] getValuesByField() {
+ return this.values;
+ }
+
+ }
+
+ /**
+ * Wraps an IndexReader that remembers/caches the last call to {@link LeafReader#getTermVectors(int)} so that
+ * if the next call has the same ID, then it is reused. If TV's were column-stride (like doc-values), there would
+ * be no need for this.
+ */
+ private static class TermVectorReusingLeafReader extends FilterLeafReader {
+
+ static IndexReader wrap(IndexReader reader) throws IOException {
+ LeafReader[] leafReaders = reader.leaves().stream()
+ .map(LeafReaderContext::reader)
+ .map(TermVectorReusingLeafReader::new)
+ .toArray(LeafReader[]::new);
+ return new BaseCompositeReader<IndexReader>(leafReaders) {
+ @Override
+ protected void doClose() throws IOException {
+ reader.close();
+ }
+ };
+ }
+
+ private int lastDocId = -1;
+ private Fields tvFields;
+
+ TermVectorReusingLeafReader(LeafReader in) {
+ super(in);
+ }
+
+ @Override
+ public Fields getTermVectors(int docID) throws IOException {
+ if (docID != lastDocId) {
+ lastDocId = docID;
+ tvFields = in.getTermVectors(docID);
+ }
+ return tvFields;
+ }
+
+ }
+
+ /**
+ * Flags for controlling highlighting behavior.
+ */
+ public enum HighlightFlag {
+ PHRASES,
+ MULTI_TERM_QUERY
+ // TODO: ignoreQueryFields
+ // TODO: useQueryBoosts
+ // TODO: avoidMemoryIndexIfPossible
+ // TODO: preferMemoryIndexForStats
+ }
+}
http://git-wip-us.apache.org/repos/asf/lucene-solr/blob/4b679436/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/package-info.java
----------------------------------------------------------------------
diff --git a/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/package-info.java b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/package-info.java
new file mode 100644
index 0000000..b0acd36
--- /dev/null
+++ b/lucene/highlighter/src/java/org/apache/lucene/search/uhighlight/package-info.java
@@ -0,0 +1,22 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * The UnifiedHighlighter -- a flexible highlighter that can get offsets from postings, term vectors, or analysis.
+ * @see org.apache.lucene.search.uhighlight.UnifiedHighlighter
+ */
+package org.apache.lucene.search.uhighlight;